Implementation:Run llama Llama index EvaluationResult Analysis
Overview
EvaluationResult_Analysis documents the EvaluationResult data model and practical patterns for analyzing, aggregating, and reporting on evaluation results from LlamaIndex's evaluation pipeline. The EvaluationResult is the standard output type returned by all evaluators and the BatchEvalRunner, providing a uniform interface for pass/fail verdicts, numeric scores, textual feedback, and error handling.
Principle:Run_llama_Llama_index_Evaluation_Result_Analysis
RAG Evaluation Metrics Analysis LlamaIndex API
Source File
llama-index-core/llama_index/core/evaluation/base.py, Lines 12–43
Import Statement
from llama_index.core.evaluation import EvaluationResult
EvaluationResult Fields
The EvaluationResult class extends BaseModel (Pydantic) with the following fields:
| Field | Type | Default | Description |
|---|---|---|---|
| query | Optional[str] | None | The query string that was evaluated |
| contexts | Optional[Sequence[str]] | None | The context strings used during evaluation |
| response | Optional[str] | None | The response string that was evaluated |
| passing | Optional[bool] | None | Whether the response passed the evaluation criteria |
| feedback | Optional[str] | None | Textual explanation from the judge LLM justifying the verdict |
| score | Optional[float] | None | Numeric quality score (primarily used by CorrectnessEvaluator, typically 1.0–5.0) |
| pairwise_source | Optional[str] | None | Source identifier for pairwise comparison evaluations |
| invalid_result | bool | False | Whether the evaluation itself failed (unparseable output, timeout, etc.) |
| invalid_reason | Optional[str] | None | Explanation of why the evaluation result is invalid |
Accessing Individual Results
from llama_index.core.evaluation import FaithfulnessEvaluator
from llama_index.llms.openai import OpenAI
# Build the judge LLM and hand it to the evaluator.
# NOTE(review): temperature=0.0 is presumably chosen to keep verdicts as
# repeatable as possible between runs — confirm against the provider's docs.
judge_llm = OpenAI(model="gpt-4", temperature=0.0)
evaluator = FaithfulnessEvaluator(llm=judge_llm)
# NOTE: a bare top-level `await` only works where an event loop is already
# running (for example a notebook cell). In a plain script, place this call
# inside an `async def` and drive it with `asyncio.run(...)`.
result = await evaluator.aevaluate(
query="What is the capital of France?",
response="The capital of France is Paris.",
contexts=["Paris is the capital and largest city of France."],
)
# Access all fields of the result
# Per the field table above, every field except `invalid_result` defaults to
# None, so any of these may print "None" if the evaluator did not set it.
print(f"Query: {result.query}")
print(f"Response: {result.response}")
print(f"Passing: {result.passing}")
print(f"Score: {result.score}")
print(f"Feedback: {result.feedback}")
print(f"Contexts: {result.contexts}")
print(f"Invalid: {result.invalid_result}")
print(f"Invalid reason: {result.invalid_reason}")
Computing Pass Rates
def compute_pass_rate(results: list) -> float:
    """Return the fraction of valid results whose ``passing`` flag is truthy.

    Results flagged with ``invalid_result`` are excluded from both the
    numerator and the denominator.

    Args:
        results: EvaluationResult-like objects exposing ``invalid_result``
            and ``passing``.

    Returns:
        Passing count divided by valid count, or 0.0 when no valid result
        is present.
    """
    considered = 0
    passed = 0
    for item in results:
        if item.invalid_result:
            continue
        considered += 1
        if item.passing:
            passed += 1

    if considered == 0:
        return 0.0
    return passed / considered
# `results` is the output of a batch evaluation run:
# Dict[str, List[EvaluationResult]], keyed by metric name.
for metric_name in results:
    rate = compute_pass_rate(results[metric_name])
    print(f"{metric_name}: {rate:.1%} pass rate")
Computing Score Statistics
import statistics
def compute_score_stats(results: list) -> dict:
    """Summarize the numeric scores carried by a list of evaluation results.

    Only results with a non-None ``score`` that are not flagged
    ``invalid_result`` contribute to the statistics.

    Args:
        results: EvaluationResult-like objects exposing ``score`` and
            ``invalid_result``.

    Returns:
        A dict with the keys "mean", "median", "stdev", "min" and "max".
        Every value is None when no usable score exists; "stdev" is 0.0
        when exactly one usable score exists.
    """
    usable = []
    for item in results:
        if item.score is None or item.invalid_result:
            continue
        usable.append(item.score)

    if len(usable) == 0:
        return dict.fromkeys(("mean", "median", "stdev", "min", "max"))

    # statistics.stdev needs at least two data points.
    spread = 0.0 if len(usable) <= 1 else statistics.stdev(usable)
    return {
        "mean": statistics.mean(usable),
        "median": statistics.median(usable),
        "stdev": spread,
        "min": min(usable),
        "max": max(usable),
    }
# Analyze correctness scores
if "correctness" in results:
    stats = compute_score_stats(results["correctness"])
    if stats["mean"] is None:
        # compute_score_stats returns None for every statistic when no valid
        # result carries a score; formatting None with ":.2f" would raise
        # TypeError, so report the situation explicitly instead.
        print("Correctness - no valid scores to summarize")
    else:
        print(f"Correctness - Mean: {stats['mean']:.2f}, "
              f"Median: {stats['median']:.2f}, "
              f"Min: {stats['min']:.1f}, Max: {stats['max']:.1f}")
Identifying Failing Queries
def get_failures(results: list) -> list:
    """Collect a summary dict for every valid result that did not pass.

    A result is reported when it is not flagged ``invalid_result`` and its
    ``passing`` value is falsy.

    Args:
        results: EvaluationResult-like objects.

    Returns:
        A list of dicts with the keys "query", "response", "feedback" and
        "score", in the same order as the input.
    """
    failures = []
    for item in results:
        if item.invalid_result or item.passing:
            continue
        failures.append(
            {
                "query": item.query,
                "response": item.response,
                "feedback": item.feedback,
                "score": item.score,
            }
        )
    return failures
# Find faithfulness failures (hallucinations)
faithfulness_failures = get_failures(results["faithfulness"])
print(f"\n{len(faithfulness_failures)} faithfulness failures:")
for failure in faithfulness_failures:
    # `response` is Optional[str]; fall back to an empty string so that
    # slicing cannot raise TypeError when the response is None.
    response_text = failure["response"] or ""
    print(f" Q: {failure['query']}")
    print(f" R: {response_text[:100]}...")
    print(f" Feedback: {failure['feedback']}")
    print()
Tracking Invalid Results
def get_invalid_results(results: list) -> list:
    """Collect the query and invalidity reason of every invalid result.

    Args:
        results: EvaluationResult-like objects exposing ``invalid_result``,
            ``query`` and ``invalid_reason``.

    Returns:
        A list of dicts with the keys "query" and "reason", one per result
        whose ``invalid_result`` is truthy, in input order.
    """
    flagged = []
    for item in results:
        if not item.invalid_result:
            continue
        flagged.append({"query": item.query, "reason": item.invalid_reason})
    return flagged
# For each metric, report how many evaluations came back invalid and list
# the affected query together with the recorded reason.
for metric_name in results:
    metric_results = results[metric_name]
    total = len(metric_results)
    invalid = get_invalid_results(metric_results)
    print(f"{metric_name}: {len(invalid)}/{total} invalid results")
    for inv in invalid:
        print(f" Query: {inv['query']}")
        print(f" Reason: {inv['reason']}")
Cross-Metric Failure Analysis
def cross_metric_analysis(results: dict, queries: list) -> None:
    """Print a likely failure category for each query from its metric verdicts.

    For the i-th query, the i-th faithfulness, relevancy and (when present)
    correctness results are compared:

    * faithfulness fails, relevancy passes -> "GENERATION ISSUE"
    * faithfulness passes, relevancy fails -> "RETRIEVAL ISSUE"
    * both pass, correctness is False      -> "SYNTHESIS ISSUE"

    Any other combination prints nothing. Queries whose faithfulness or
    relevancy evaluation is flagged ``invalid_result`` are skipped, because
    an invalid evaluation carries no usable verdict. An invalid correctness
    evaluation is treated as "no verdict".

    Args:
        results: Mapping of metric name to a list of evaluation results,
            index-aligned with ``queries``. Must contain "faithfulness" and
            "relevancy"; "correctness" is optional.
        queries: The evaluated query strings.
    """
    # Correctness is optional: look it up once rather than building a
    # placeholder list on every iteration.
    correctness = results.get("correctness")

    for i, query in enumerate(queries):
        faith = results["faithfulness"][i]
        relev = results["relevancy"][i]

        # Without this guard a None verdict from a failed evaluation would be
        # read as "did not pass" and misreported as a pipeline issue.
        if faith.invalid_result or relev.invalid_result:
            continue

        faith_pass = faith.passing
        relev_pass = relev.passing

        correct = correctness[i] if correctness is not None else None
        if correct is None or correct.invalid_result:
            correct_pass = None
        else:
            correct_pass = correct.passing

        # Identify failure pattern
        if not faith_pass and relev_pass:
            print(f"GENERATION ISSUE - Q: {query}")
            print(f" Faithfulness feedback: {faith.feedback}")
        elif faith_pass and not relev_pass:
            print(f"RETRIEVAL ISSUE - Q: {query}")
            print(f" Relevancy feedback: {relev.feedback}")
        elif faith_pass and relev_pass and correct_pass is False:
            print(f"SYNTHESIS ISSUE - Q: {query}")
            print(f" Correctness score: {correct.score}")
            print(f" Correctness feedback: {correct.feedback}")
# NOTE(review): `eval_questions` is not defined anywhere in this snippet —
# presumably it is the list of queries, in the same order, that produced
# `results`; confirm before running.
cross_metric_analysis(results, eval_questions)
Building a Summary Report
def build_evaluation_report(results: dict) -> dict:
    """Build a per-metric summary of counts, pass rate and mean score.

    Args:
        results: Mapping of metric name to a list of EvaluationResult-like
            objects.

    Returns:
        A dict keyed by metric name. Each value is a dict with "total",
        "valid", "invalid", "passing", "failing", "pass_rate" (0.0 when no
        valid result exists) and "mean_score" (None when no valid result
        carries a score).
    """

    def summarize(metric_results):
        # Valid results are those not flagged invalid; everything else in the
        # list is counted as invalid.
        usable = [item for item in metric_results if not item.invalid_result]
        n_total = len(metric_results)
        n_valid = len(usable)
        n_pass = sum(1 for item in usable if item.passing)
        scored = [item.score for item in usable if item.score is not None]
        return {
            "total": n_total,
            "valid": n_valid,
            "invalid": n_total - n_valid,
            "passing": n_pass,
            "failing": n_valid - n_pass,
            "pass_rate": n_pass / n_valid if n_valid else 0.0,
            "mean_score": sum(scored) / len(scored) if scored else None,
        }

    return {name: summarize(items) for name, items in results.items()}
# Print a short, human-readable summary for every metric in the report.
report = build_evaluation_report(results)
for metric in report:
    stats = report[metric]
    print(f"\n=== {metric.upper()} ===")
    print(
        f" Pass rate: {stats['pass_rate']:.1%} "
        f"({stats['passing']}/{stats['valid']})"
    )
    # mean_score is None when no valid result carried a score.
    if stats["mean_score"] is not None:
        print(f" Mean score: {stats['mean_score']:.2f}")
    # Only mention invalid results when there are any.
    if stats["invalid"] > 0:
        print(f" Invalid: {stats['invalid']}/{stats['total']}")
Converting Results to DataFrame
import pandas as pd
def results_to_dataframe(results: dict) -> pd.DataFrame:
    """Convert batch evaluation results to a pandas DataFrame.

    One row is produced per query. The "query" and "response" columns are
    taken from the first metric's results; every metric then contributes
    ``<metric>_passing``, ``<metric>_score``, ``<metric>_feedback`` and
    ``<metric>_invalid`` columns.

    Args:
        results: Mapping of metric name to a list of EvaluationResult-like
            objects. All lists are expected to be index-aligned and at
            least as long as the first metric's list.

    Returns:
        The assembled DataFrame, or an empty DataFrame when ``results`` is
        empty.
    """
    if not results:
        # next(iter({})) would raise StopIteration; an empty mapping simply
        # yields an empty frame.
        return pd.DataFrame()

    # The first metric determines the number of rows and supplies the
    # query/response text.
    reference = next(iter(results.values()))

    rows = []
    for i, ref in enumerate(reference):
        row = {
            "query": ref.query,
            "response": ref.response,
        }
        for metric_name, metric_results in results.items():
            r = metric_results[i]
            row[f"{metric_name}_passing"] = r.passing
            row[f"{metric_name}_score"] = r.score
            row[f"{metric_name}_feedback"] = r.feedback
            row[f"{metric_name}_invalid"] = r.invalid_result
        rows.append(row)
    return pd.DataFrame(rows)
df = results_to_dataframe(results)
# Not every run evaluates every metric (correctness is treated as optional
# elsewhere in this guide), so select only the columns that actually exist
# instead of raising KeyError on a missing one.
wanted_columns = ["query", "faithfulness_passing",
                  "relevancy_passing", "correctness_score"]
print(df[[col for col in wanted_columns if col in df.columns]].to_string())