Examples

RAG System Evaluation

Evaluate RAG responses for relevance to source documents and factual accuracy.

// Weighted RAG evaluator: three context-based checks plus one model-graded
// relevance check. The four weights sum to 1.0.
const ragEvaluator = createWeightedEvaluator({
  evaluators: {
    // Check if named entities (people, places, dates) are preserved
    entities: createContextEvaluator({
      type: "entities-recall"
    }),
    // Verify factual correctness using embedding similarity
    precision: createContextEvaluator({
      type: "precision"
    }),
    // Check if all relevant information is included
    recall: createContextEvaluator({
      type: "recall"
    }),
    // Assess overall contextual relevance
    relevance: createEvaluator({
      client: oai,
      model: "gpt-4-turbo",
      evaluationDescription: "Rate how well the response uses the context"
    })
  },
  weights: {
    entities: 0.2,   // Lower weight as it's more supplementary
    precision: 0.3,  // Higher weight for factual correctness
    recall: 0.3,     // Higher weight for information coverage
    relevance: 0.2   // Balance of overall relevance
  }
});

// The item carries the retrieved `contexts` and a `groundTruth` reference
// alongside the prompt/completion pair.
// Named `ragResult` (not a generic `result`) so this snippet can share a
// module with the other examples without redeclaring a `const`.
const ragResult = await ragEvaluator({
  data: [{
    prompt: "What are the key financial metrics?",
    completion: "Revenue grew 25% to $10M in Q3 2023",
    groundTruth: "Q3 2023 saw 25% revenue growth to $10M",
    contexts: [
      "In Q3 2023, company revenue increased 25% to $10M",
      "Operating margins improved to 15%"
    ]
  }]
});

/* Example output:
{
  results: [{
    score: 0.86,  // 0.2*1.0 + 0.3*0.92 + 0.3*0.75 + 0.2*0.78 = 0.857
    scores: [
      { score: 1.0, evaluator: "entities" },    // Perfect entity preservation
      { score: 0.92, evaluator: "precision" },  // High factual accuracy
      { score: 0.75, evaluator: "recall" },     // Missing margin information
      { score: 0.78, evaluator: "relevance" }   // Good contextual relevance
    ],
    item: {
      prompt: "What are the key financial metrics?",
      completion: "Revenue grew 25% to $10M in Q3 2023",
      groundTruth: "Q3 2023 saw 25% revenue growth to $10M",
      contexts: [...]
    }
  }],
  scoreResults: {
    value: 0.86,
    individual: {
      entities: 1.0,
      precision: 0.92,
      recall: 0.75,
      relevance: 0.78
    }
  }
}
*/

Content Moderation Evaluation

Binary evaluation for content policy compliance, useful for automated content filtering.

// Model-graded evaluator configured for pass/fail output instead of a
// numeric score.
const moderationEvaluator = createEvaluator({
  evaluationDescription: "Score 1 if content follows all policies (safe, respectful, appropriate), 0 if any violation exists",
  resultsType: "binary",  // Report results as true/false counts
  model: "gpt-4-turbo",
  client: oai
});

// Two samples: one neutral product blurb and one dismissive reply to a review.
const moderationSamples = [
  {
    prompt: "Describe our product benefits",
    completion: "Our product helps improve productivity",
    expectedCompletion: "Professional product description"
  },
  {
    prompt: "Respond to negative review",
    completion: "Your complaint is totally wrong...",
    expectedCompletion: "Professional response to feedback"
  }
];

const moderationResult = await moderationEvaluator({ data: moderationSamples });

/* Example output:
{
  results: [
    { score: 1, item: { ... } },  // Passes: within content guidelines
    { score: 0, item: { ... } }   // Fails: breaks the professional tone policy
  ],
  binaryResults: {
    trueCount: 1,
    falseCount: 1
  }
}
*/

Student Answer Evaluation

Demonstrates weighted evaluation combining exact matching, semantic understanding, and qualitative assessment.

// Terminology check: accuracy evaluator leaning almost entirely on the
// factual (exact-match) component.
const keyTermsEvaluator = createAccuracyEvaluator({
  weights: {
    factual: 0.9,
    semantic: 0.1
  }
});

// Conceptual check: same evaluator type, but dominated by the semantic
// (meaning-similarity) component.
const understandingEvaluator = createAccuracyEvaluator({
  weights: {
    factual: 0.2,
    semantic: 0.8
  }
});

// Human-grader-style check delegated to a model.
const answerQualityEvaluator = createEvaluator({
  client: oai,
  model: "gpt-4-turbo",
  evaluationDescription: "Rate answer completeness and clarity 0-1"
});

// Combine the three checks; conceptual understanding carries the most weight.
const gradingEvaluator = createWeightedEvaluator({
  evaluators: {
    keyTerms: keyTermsEvaluator,
    understanding: understandingEvaluator,
    quality: answerQualityEvaluator
  },
  weights: {
    keyTerms: 0.3,
    understanding: 0.4,
    quality: 0.3
  }
});

// A single student answer compared against the model answer.
const gradingResult = await gradingEvaluator({
  data: [
    {
      prompt: "Explain how photosynthesis works",
      completion: "Plants convert sunlight into chemical energy through chlorophyll",
      expectedCompletion: "Photosynthesis is the process where plants use chlorophyll to convert sunlight, water, and CO2 into glucose and oxygen"
    }
  ]
});

/* Example output:
{
  results: [{
    score: 0.78,  // Overall grade: 0.3*0.65 + 0.4*0.90 + 0.3*0.75
    scores: [
      {
        score: 0.65,           // Several key terms absent (water, CO2, glucose)
        evaluator: "keyTerms",
        evaluatorType: "accuracy"
      },
      {
        score: 0.90,           // Core concept is understood
        evaluator: "understanding",
        evaluatorType: "accuracy"
      },
      {
        score: 0.75,           // Clear, but leaves parts out
        evaluator: "quality",
        evaluatorType: "model-graded"
      }
    ],
    item: { ... }
  }],
  scoreResults: {
    value: 0.78,
    individual: {
      keyTerms: 0.65,
      understanding: 0.90,
      quality: 0.75
    }
  }
}
*/

Chatbot Quality Assessment

Monitor chatbot response quality across multiple dimensions.

// Weighted chatbot evaluator: two model-graded checks (relevance, tone) plus
// an accuracy check against a known-good answer. The weights sum to 1.0.
const chatbotEvaluator = createWeightedEvaluator({
  evaluators: {
    // Evaluate response appropriateness
    relevance: createEvaluator({
      client: oai,
      model: "gpt-4-turbo",
      evaluationDescription: "Rate how well the response addresses the user's query"
    }),
    // Check response tone
    tone: createEvaluator({
      client: oai,
      model: "gpt-4-turbo",
      evaluationDescription: "Rate the professionalism and friendliness of the response"
    }),
    // Verify against known good responses
    accuracy: createAccuracyEvaluator({
      weights: { semantic: 0.8, factual: 0.2 }
    })
  },
  weights: {
    relevance: 0.4,
    tone: 0.3,
    accuracy: 0.3
  }
});

// Named `chatbotResult` (not a generic `result`) so this snippet can share a
// module with the other examples without redeclaring a `const`.
// NOTE: no context evaluator is configured above; `contexts` is included here
// only as additional item data.
const chatbotResult = await chatbotEvaluator({
  data: [{
    prompt: "How do I reset my password?",
    completion: "You can reset your password by clicking the 'Forgot Password' link on the login page.",
    expectedCompletion: "To reset your password, use the 'Forgot Password' option at login.",
    contexts: ["Previous support interactions"]
  }]
});

Content Generation Pipeline

Evaluate generated content for quality and accuracy.

// Weighted content evaluator: model-graded writing quality, an accuracy check
// against a reference text, and an entity-recall check against the sources.
// The weights sum to 1.0.
const contentEvaluator = createWeightedEvaluator({
  evaluators: {
    // Check writing quality
    quality: createEvaluator({
      client: oai,
      model: "gpt-4-turbo",
      evaluationDescription: "Rate clarity, structure, and engagement"
    }),
    // Verify factual accuracy.
    // Both weights are spelled out so the semantic component is an explicit 0
    // rather than an omitted (undefined) value.
    factCheck: createAccuracyEvaluator({
      weights: { factual: 1.0, semantic: 0.0 }
    }),
    // Assess source usage
    citations: createContextEvaluator({
      type: "entities-recall"
    })
  },
  weights: {
    quality: 0.4,
    factCheck: 0.4,
    citations: 0.2
  }
});

// The item provides `expectedCompletion` for the accuracy evaluator (as the
// other accuracy examples do) in addition to `contexts`/`groundTruth` for the
// context evaluator.
// Named `contentResult` (not a generic `result`) so this snippet can share a
// module with the other examples without redeclaring a `const`.
const contentResult = await contentEvaluator({
  data: [{
    prompt: "Write an article about renewable energy trends",
    completion: "Solar and wind power installations increased by 30% in 2023...",
    expectedCompletion: "Solar and wind power installations grew by 30% in 2023, leading renewable energy growth",
    contexts: [
      "Global renewable energy deployment grew by 30% year-over-year",
      "Solar and wind remained the fastest-growing sectors"
    ],
    groundTruth: "Renewable energy saw significant growth, led by solar and wind"
  }]
});

Document Processing System

Evaluate document extraction and summarization quality.

// Weighted document evaluator: context recall for extraction, a model-graded
// summary check, and an accuracy check against a reference summary.
// The weights sum to 1.0.
const documentEvaluator = createWeightedEvaluator({
  evaluators: {
    // Verify key information extraction
    extraction: createContextEvaluator({
      type: "recall"
    }),
    // Check summary quality
    summary: createEvaluator({
      client: oai,
      model: "gpt-4-turbo",
      evaluationDescription: "Rate conciseness and completeness"
    }),
    // Validate against reference summary
    accuracy: createAccuracyEvaluator({
      weights: { semantic: 0.6, factual: 0.4 }
    })
  },
  weights: {
    extraction: 0.4,
    summary: 0.3,
    accuracy: 0.3
  }
});

// The item provides `expectedCompletion` (the reference summary) for the
// accuracy evaluator, as the other accuracy examples do, in addition to
// `contexts`/`groundTruth` for the context evaluator.
// Named `documentResult` (not a generic `result`) so this snippet can share a
// module with the other examples without redeclaring a `const`.
const documentResult = await documentEvaluator({
  data: [{
    prompt: "Summarize the quarterly report",
    completion: "Q3 revenue grew 25% YoY, driven by new product launches...",
    expectedCompletion: "Q3 saw 25% YoY revenue growth due to new products",
    contexts: [
      "Revenue increased 25% compared to Q3 2022",
      "Growth primarily attributed to successful product launches"
    ],
    groundTruth: "Q3 saw 25% YoY revenue growth due to new products"
  }]
});

Island AI