Examples
RAG System Evaluation
Evaluate RAG responses for relevance to source documents and factual accuracy.
// RAG evaluator: blends three context-based checks with one model-graded
// relevance check. The four weights below add up to 1.0.
const ragEvaluator = createWeightedEvaluator({
  evaluators: {
    // Are named entities (people, places, dates) carried over?
    entities: createContextEvaluator({ type: "entities-recall" }),

    // Factual correctness, using embedding similarity
    precision: createContextEvaluator({ type: "precision" }),

    // Is all of the relevant information included?
    recall: createContextEvaluator({ type: "recall" }),

    // Overall contextual relevance
    relevance: createEvaluator({
      client: oai,
      model: "gpt-4-turbo",
      evaluationDescription: "Rate how well the response uses the context",
    }),
  },
  weights: {
    entities: 0.2, // supplementary signal, so weighted lower
    precision: 0.3, // factual correctness is weighted higher
    recall: 0.3, // information coverage is weighted higher
    relevance: 0.2, // overall relevance
  },
});
// Run the RAG evaluator on one sample. The item carries the prompt, the model
// completion, the ground truth, and the retrieved contexts.
// Named `ragResult` rather than `result` so this snippet can share a module
// scope with the other examples without redeclaring a `const`.
const ragResult = await ragEvaluator({
  data: [
    {
      prompt: "What are the key financial metrics?",
      completion: "Revenue grew 25% to $10M in Q3 2023",
      groundTruth: "Q3 2023 saw 25% revenue growth to $10M",
      contexts: [
        "In Q3 2023, company revenue increased 25% to $10M",
        "Operating margins improved to 15%",
      ],
    },
  ],
});
/* Example output:
{
  results: [{
    // Overall score; agrees with the weighted sum of the four scores below:
    // 0.2*1.0 + 0.3*0.92 + 0.3*0.75 + 0.2*0.78 = 0.857
    score: 0.86,
    scores: [
      { score: 1.0, evaluator: "entities" },    // Perfect entity preservation
      { score: 0.92, evaluator: "precision" },  // High factual accuracy
      { score: 0.75, evaluator: "recall" },     // Missing margin information
      { score: 0.78, evaluator: "relevance" }   // Good contextual relevance
    ],
    // The evaluated input item is echoed back unchanged
    item: {
      prompt: "What are the key financial metrics?",
      completion: "Revenue grew 25% to $10M in Q3 2023",
      groundTruth: "Q3 2023 saw 25% revenue growth to $10M",
      contexts: [...]
    }
  }],
  scoreResults: {
    value: 0.86,
    individual: {
      entities: 1.0,
      precision: 0.92,
      recall: 0.75,
      relevance: 0.78
    }
  }
}
*/
Content Moderation Evaluation
Binary evaluation for content policy compliance, useful for automated content filtering.
// Model-graded moderation check configured for binary (pass/fail) results.
const moderationEvaluator = createEvaluator({
  client: oai,
  model: "gpt-4-turbo",
  // "binary" switches the output to true/false counts
  resultsType: "binary",
  evaluationDescription:
    "Score 1 if content follows all policies (safe, respectful, appropriate), 0 if any violation exists",
});
// Moderate two samples: a neutral product description and a dismissive reply
// to a customer review.
const moderationResult = await moderationEvaluator({
  data: [
    {
      prompt: "Describe our product benefits",
      completion: "Our product helps improve productivity",
      expectedCompletion: "Professional product description",
    },
    {
      prompt: "Respond to negative review",
      completion: "Your complaint is totally wrong...",
      expectedCompletion: "Professional response to feedback",
    },
  ],
});
/* Example output:
{
  results: [
    { score: 1, item: { ... } },  // Meets content guidelines
    { score: 0, item: { ... } }   // Violates professional tone policy
  ],
  // Tally of passing (1) and failing (0) items
  binaryResults: {
    trueCount: 1,
    falseCount: 1
  }
}
*/
Student Answer Evaluation
Demonstrates weighted evaluation combining exact matching, semantic understanding, and qualitative assessment.
// Grading evaluator: two accuracy evaluators with opposite factual/semantic
// emphasis, plus one model-graded quality check. Outer weights add up to 1.0.
const gradingEvaluator = createWeightedEvaluator({
  evaluators: {
    // Required terminology: favour exact matches over similar terms
    keyTerms: createAccuracyEvaluator({
      weights: {
        factual: 0.9, // exact matches dominate
        semantic: 0.1, // similar terms count only a little
      },
    }),

    // Conceptual understanding: favour meaning over exact wording
    understanding: createAccuracyEvaluator({
      weights: {
        factual: 0.2, // exact matches count only a little
        semantic: 0.8, // similarity of meaning dominates
      },
    }),

    // Answer quality, judged the way a human grader would
    quality: createEvaluator({
      client: oai,
      model: "gpt-4-turbo",
      evaluationDescription: "Rate answer completeness and clarity 0-1",
    }),
  },
  weights: {
    keyTerms: 0.3, // terminology requirements
    understanding: 0.4, // conceptual grasp is emphasised
    quality: 0.3, // overall presentation
  },
});
// Grade one student answer against the reference answer.
const gradingResult = await gradingEvaluator({
  data: [
    {
      prompt: "Explain how photosynthesis works",
      completion: "Plants convert sunlight into chemical energy through chlorophyll",
      expectedCompletion: "Photosynthesis is the process where plants use chlorophyll to convert sunlight, water, and CO2 into glucose and oxygen",
    },
  ],
});
/* Example output:
{
  results: [{
    score: 0.78,  // Overall grade (78%)
    scores: [
      {
        score: 0.65,  // Key terms missing (water, CO2, glucose)
        evaluator: "keyTerms",
        evaluatorType: "accuracy"
      },
      {
        score: 0.90,  // Good conceptual understanding
        evaluator: "understanding",
        evaluatorType: "accuracy"
      },
      {
        score: 0.75,  // Clear, but not comprehensive
        evaluator: "quality",
        evaluatorType: "model-graded"
      }
    ],
    item: { ... }
  }],
  scoreResults: {
    value: 0.78,
    individual: {
      keyTerms: 0.65,
      understanding: 0.90,
      quality: 0.75
    }
  }
}
*/
Chatbot Quality Assessment
Monitor chatbot response quality across multiple dimensions.
// Chatbot evaluator: two model-graded checks (relevance, tone) plus an accuracy
// check against a known-good response. Weights add up to 1.0.
const chatbotEvaluator = createWeightedEvaluator({
  evaluators: {
    // How appropriate is the response to the query?
    relevance: createEvaluator({
      client: oai,
      model: "gpt-4-turbo",
      evaluationDescription: "Rate how well the response addresses the user's query",
    }),

    // Tone of the response
    tone: createEvaluator({
      client: oai,
      model: "gpt-4-turbo",
      evaluationDescription: "Rate the professionalism and friendliness of the response",
    }),

    // Comparison with known good responses
    accuracy: createAccuracyEvaluator({
      weights: {
        semantic: 0.8,
        factual: 0.2,
      },
    }),
  },
  weights: {
    relevance: 0.4,
    tone: 0.3,
    accuracy: 0.3,
  },
});
// Evaluate one support-chat exchange against a known-good reference answer.
// Named `chatbotResult` rather than `result` so this snippet can share a module
// scope with the other examples without redeclaring a `const`.
const chatbotResult = await chatbotEvaluator({
  data: [
    {
      prompt: "How do I reset my password?",
      completion: "You can reset your password by clicking the 'Forgot Password' link on the login page.",
      expectedCompletion: "To reset your password, use the 'Forgot Password' option at login.",
      // NOTE(review): none of chatbotEvaluator's evaluators is a context
      // evaluator, so this field is presumably informational only; confirm.
      contexts: ["Previous support interactions"],
    },
  ],
});
Content Generation Pipeline
Evaluate generated content for quality and accuracy.
// Content evaluator: model-graded writing quality, a purely factual accuracy
// check, and an entity-recall context check. Weights add up to 1.0.
const contentEvaluator = createWeightedEvaluator({
  evaluators: {
    // Writing quality
    quality: createEvaluator({
      client: oai,
      model: "gpt-4-turbo",
      evaluationDescription: "Rate clarity, structure, and engagement",
    }),

    // Factual accuracy
    factCheck: createAccuracyEvaluator({
      weights: {
        factual: 1.0,
      },
    }),

    // How well the sources are used
    citations: createContextEvaluator({ type: "entities-recall" }),
  },
  weights: {
    quality: 0.4,
    factCheck: 0.4,
    citations: 0.2,
  },
});
// Evaluate one generated article against its source contexts and ground truth.
// Named `contentResult` rather than `result` so this snippet can share a module
// scope with the other examples without redeclaring a `const`.
const contentResult = await contentEvaluator({
  data: [
    {
      prompt: "Write an article about renewable energy trends",
      completion: "Solar and wind power installations increased by 30% in 2023...",
      contexts: [
        "Global renewable energy deployment grew by 30% year-over-year",
        "Solar and wind remained the fastest-growing sectors",
      ],
      groundTruth: "Renewable energy saw significant growth, led by solar and wind",
      // NOTE(review): the other examples that use createAccuracyEvaluator pass
      // `expectedCompletion`; it is supplied here for the `factCheck` evaluator
      // on that basis. Confirm against the library's data contract.
      expectedCompletion: "Renewable energy saw significant growth, led by solar and wind",
    },
  ],
});
Document Processing System
Evaluate document extraction and summarization quality.
// Document evaluator: context recall for extraction, a model-graded summary
// check, and an accuracy check against a reference summary. Weights add up to 1.0.
const documentEvaluator = createWeightedEvaluator({
  evaluators: {
    // Was the key information extracted?
    extraction: createContextEvaluator({ type: "recall" }),

    // Quality of the summary
    summary: createEvaluator({
      client: oai,
      model: "gpt-4-turbo",
      evaluationDescription: "Rate conciseness and completeness",
    }),

    // Comparison with the reference summary
    accuracy: createAccuracyEvaluator({
      weights: {
        semantic: 0.6,
        factual: 0.4,
      },
    }),
  },
  weights: {
    extraction: 0.4,
    summary: 0.3,
    accuracy: 0.3,
  },
});
// Evaluate one report summary against its source contexts and ground truth.
// Named `documentResult` rather than `result` so this snippet can share a module
// scope with the other examples without redeclaring a `const`.
const documentResult = await documentEvaluator({
  data: [
    {
      prompt: "Summarize the quarterly report",
      completion: "Q3 revenue grew 25% YoY, driven by new product launches...",
      contexts: [
        "Revenue increased 25% compared to Q3 2022",
        "Growth primarily attributed to successful product launches",
      ],
      groundTruth: "Q3 saw 25% YoY revenue growth due to new products",
      // NOTE(review): the other examples that use createAccuracyEvaluator pass
      // `expectedCompletion`; it is supplied here for the `accuracy` evaluator
      // on that basis. Confirm against the library's data contract.
      expectedCompletion: "Q3 saw 25% YoY revenue growth due to new products",
    },
  ],
});