Why Evaluate AI Agents?

Unlike traditional software with deterministic outputs, AI agents produce variable responses. Proper evaluation ensures:

  • Quality assurance: Verify agents meet accuracy requirements
  • Regression detection: Catch degradation from prompt/model changes
  • Comparison: Choose between different approaches
  • Optimization: Guide improvements with measurable metrics

Evaluation Metrics

Common Metrics

Accuracy

Percentage of correct answers. Good for factual Q&A with known answers.

Relevance

How relevant is the response to the query? Often LLM-judged.

Faithfulness

Is the response grounded in the retrieved context (RAG)? A low score signals hallucination.

Latency

Response time. Critical for user experience.

RAG-Specific Metrics

# RAG Evaluation Metrics

1. Context Precision
   - How much retrieved context is relevant?
   - Relevant chunks retrieved / Total chunks retrieved

2. Context Recall
   - Did we retrieve all relevant information?
   - Relevant chunks retrieved / Total relevant chunks available

3. Answer Faithfulness
   - Is the answer grounded in context?
   - Claims supported by context / Total claims in answer

4. Answer Relevancy
   - Does the answer address the question?
   - LLM-judged score 0-1

Evaluation Frameworks

RAGAS (RAG Assessment)

from ragas import evaluate
from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    context_precision,
    context_recall
)
from datasets import Dataset

# Evaluation samples in column form: position i of every list belongs to sample i.
eval_data = dict(
    question=[
        "What is the return policy?",
        "How do I contact support?",
    ],
    answer=[
        "You can return items within 30 days.",
        "Contact support at help@example.com",
    ],
    # One list of context strings per question.
    contexts=[
        ["Our return policy allows returns within 30 days of purchase."],
        ["For support, email help@example.com or call 1-800-XXX"],
    ],
    ground_truth=[
        "30-day return policy",
        "help@example.com",
    ],
)

dataset = Dataset.from_dict(eval_data)

# Score the dataset with the four metrics imported above.
results = evaluate(
    dataset,
    metrics=[faithfulness, answer_relevancy, context_precision, context_recall],
)

print(results)
# Example output: {'faithfulness': 0.95, 'answer_relevancy': 0.92, ...}

DeepEval

from deepeval import evaluate
from deepeval.metrics import (
    GEval,
    FaithfulnessMetric,
    AnswerRelevancyMetric
)
# LLMTestCaseParams is referenced by the GEval metric below. The snippet
# previously imported only LLMTestCase, so it stopped with a NameError.
from deepeval.test_case import LLMTestCase, LLMTestCaseParams

# Create test cases
test_cases = [
    LLMTestCase(
        input="What is machine learning?",
        actual_output="Machine learning is a subset of AI...",
        expected_output="ML is AI that learns from data",
        retrieval_context=["ML definition from textbook..."]
    )
]

# Define metrics (each is constructed with a threshold of 0.7)
faithfulness = FaithfulnessMetric(threshold=0.7)
relevancy = AnswerRelevancyMetric(threshold=0.7)

# Custom metric with G-Eval: the criteria text is judged against the
# test case's input and actual output only.
helpfulness = GEval(
    name="Helpfulness",
    criteria="Determine if the response is helpful and actionable",
    evaluation_params=[
        LLMTestCaseParams.INPUT,
        LLMTestCaseParams.ACTUAL_OUTPUT
    ]
)

# Evaluate every test case against all three metrics
results = evaluate(test_cases, [faithfulness, relevancy, helpfulness])

LLM-as-Judge

Use LLMs to evaluate other LLM outputs:

from openai import OpenAI

# Module-level client used by the judge functions in this article.
# NOTE(review): constructed without arguments, so credentials presumably
# come from the environment (e.g. OPENAI_API_KEY) — confirm.
client = OpenAI()

def llm_judge(question: str, answer: str, criteria: str, model: str = "gpt-4o") -> dict:
    """Ask a chat model to grade ``answer`` against ``criteria``.

    Args:
        question: The question that was put to the system under evaluation.
        answer: The answer to be graded.
        criteria: Free-text description of what the judge should assess.
        model: Chat model used as the judge. It must accept
            ``response_format={"type": "json_object"}``; the base ``"gpt-4"``
            model that was hard-coded here before is not among the models
            OpenAI lists as supporting JSON mode.

    Returns:
        The judge's reply parsed from JSON. The prompt requests the keys
        ``score`` (1-5), ``reasoning`` and ``suggestions``, but the reply is
        not validated against that shape.

    Raises:
        ValueError: If the model returns no message content.
        json.JSONDecodeError: If the reply is not valid JSON.
    """
    # Imported locally: the file's ``import json`` appears only further down,
    # after this function has already been called.
    import json

    prompt = f"""Evaluate the following answer based on the criteria.

Question: {question}
Answer: {answer}
Criteria: {criteria}

Provide your evaluation as JSON:
{{
    "score": <1-5>,
    "reasoning": "",
    "suggestions": ""
}}"""

    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        response_format={"type": "json_object"}
    )

    content = response.choices[0].message.content
    if content is None:
        # json.loads(None) would raise an unhelpful TypeError.
        raise ValueError("Judge model returned an empty response")

    return json.loads(content)

# Usage: grade a single answer and show the judge's score and reasoning.
result = llm_judge(
    "Explain quantum computing",
    "Quantum computing uses qubits...",
    "Accuracy, clarity, and completeness",
)

print(
    f"Score: {result['score']}/5",
    f"Reasoning: {result['reasoning']}",
    sep="\n",
)

Building an Evaluation Pipeline

import json
import statistics
import time
from dataclasses import dataclass
from typing import Callable, List, Optional

@dataclass
class TestCase:
    """A single evaluation example.

    Only ``input`` is required. The other fields default to ``None`` and are
    available to metric functions, which receive the whole ``TestCase``.
    """

    input: str  # passed to the agent callable as its only argument
    expected_output: Optional[str] = None  # reference answer, when one exists
    context: Optional[List[str]] = None  # not read by AgentEvaluator itself
    metadata: Optional[dict] = None  # not read by AgentEvaluator itself


@dataclass
class EvalResult:
    """Outcome of running the agent on one ``TestCase``."""

    test_case: TestCase
    actual_output: str  # whatever the agent callable returned
    scores: dict  # metric name -> value returned by that metric
    latency_ms: float  # duration of the agent call in milliseconds


class AgentEvaluator:
    """Run an agent over test cases and summarise metric scores and latency.

    Metrics are registered with ``add_metric`` and are called as
    ``metric(test_case, output)``. They must return numbers (``bool`` also
    works) so that mean/stdev/min/max can be computed.
    """

    def __init__(self, agent_fn: Callable):
        """Store the agent callable; it is invoked as ``agent_fn(tc.input)``."""
        self.agent = agent_fn
        # Registry of metric name -> callable. This dict was never created
        # before, so the first add_metric() call raised AttributeError.
        self.metrics: dict = {}
        self.results: List[EvalResult] = []

    def add_metric(self, name: str, metric_fn: Callable) -> None:
        """Register ``metric_fn`` under ``name``, replacing any previous entry."""
        self.metrics[name] = metric_fn

    def evaluate(self, test_cases: List[TestCase]) -> dict:
        """Run the agent on every test case and return the aggregate summary.

        Results are appended to ``self.results`` and never cleared, so calling
        this method repeatedly summarises all runs made on this instance.
        """
        for tc in test_cases:
            # perf_counter is monotonic, so the measured latency cannot be
            # skewed by system clock adjustments.
            start = time.perf_counter()
            output = self.agent(tc.input)
            latency = (time.perf_counter() - start) * 1000

            scores = {
                name: metric(tc, output)
                for name, metric in self.metrics.items()
            }

            self.results.append(EvalResult(
                test_case=tc,
                actual_output=output,
                scores=scores,
                latency_ms=latency
            ))

        return self.aggregate_results()

    def aggregate_results(self) -> dict:
        """Summarise all recorded results.

        Returns:
            A dict with ``total_cases``, one ``{mean, std, min, max}`` entry
            per registered metric, and a ``latency`` entry holding ``mean_ms``
            and ``p95_ms``. When nothing has been recorded yet, only
            ``total_cases`` (0) is returned.
        """
        summary = {"total_cases": len(self.results)}
        if not self.results:
            # statistics.mean() raises on empty data.
            return summary

        for metric_name in self.metrics:
            # Results recorded before this metric was registered have no
            # score for it, so they are left out of its statistics.
            scores = [
                r.scores[metric_name]
                for r in self.results
                if metric_name in r.scores
            ]
            if not scores:
                continue
            summary[metric_name] = {
                "mean": statistics.mean(scores),
                "std": statistics.stdev(scores) if len(scores) > 1 else 0,
                "min": min(scores),
                "max": max(scores)
            }

        latencies = sorted(r.latency_ms for r in self.results)
        summary["latency"] = {
            "mean_ms": statistics.mean(latencies),
            "p95_ms": latencies[self._nearest_rank_index(len(latencies), 95)]
        }

        return summary

    @staticmethod
    def _nearest_rank_index(count: int, percentile: int) -> int:
        """Return the 0-based index of the nearest-rank ``percentile`` value.

        Computes ``ceil(count * percentile / 100) - 1`` in integer arithmetic.
        The previous ``int(count * 0.95)`` pointed one rank too high (for 100
        samples it selected the 96th value).
        """
        return max(0, -(-count * percentile // 100) - 1)

# Usage
evaluator = AgentEvaluator(my_rag_agent)
# "exact_match" is a case-insensitive containment check. A test case without
# an expected output scores 0.0 instead of raising AttributeError on None,
# and the result is a float so it aggregates and serialises as a number.
evaluator.add_metric(
    "exact_match",
    lambda tc, out: float(
        tc.expected_output is not None
        and tc.expected_output.lower() in out.lower()
    ),
)
# Output length relative to 500 characters, capped at 1.0.
evaluator.add_metric("length", lambda tc, out: min(len(out) / 500, 1.0))

results = evaluator.evaluate(test_cases)
print(json.dumps(results, indent=2))

A/B Testing Agents

import random
from collections import defaultdict

class AgentABTest:
    """Pairwise comparison of two agents, scored by a judge callable.

    ``judge_fn`` is called as ``judge_fn(query, first_output, second_output)``
    and is expected to return ``"first"``, ``"second"`` or ``"tie"``. Any other
    return value is counted as a tie.
    """

    def __init__(self, agent_a, agent_b, judge_fn):
        """Store the two agent callables and the judge; start with an empty tally."""
        self.agent_a = agent_a
        self.agent_b = agent_b
        self.judge = judge_fn
        # Running tally keyed by "A", "B" or "tie".
        self.results = defaultdict(int)

    def run_comparison(self, query: str) -> dict:
        """Run both agents on ``query``, record the judge's preference.

        Returns:
            A plain-dict snapshot of the cumulative tally so far. (The return
            annotation previously said ``str`` although a dict was returned.)
        """
        output_a = self.agent_a(query)
        output_b = self.agent_b(query)

        # Randomize presentation order to avoid position bias in the judge.
        outputs = [("A", output_a), ("B", output_b)]
        if random.random() <= 0.5:
            outputs.reverse()
        (first_label, first_output), (second_label, second_output) = outputs

        winner = self.judge(query, first_output, second_output)

        # Map the positional verdict back to the agent shown in that position.
        if winner == "first":
            self.results[first_label] += 1
        elif winner == "second":
            self.results[second_label] += 1
        else:
            self.results["tie"] += 1

        return dict(self.results)

    def get_winner(self) -> str:
        """Return ``"Agent A"``, ``"Agent B"`` or ``"Tie"`` from the win counts.

        Ties recorded by the judge do not affect the outcome; only the "A" and
        "B" counts are compared.
        """
        # .get() reads without inserting; indexing the defaultdict would add
        # zero-count "A"/"B" keys to the tally as a side effect.
        wins_a = self.results.get("A", 0)
        wins_b = self.results.get("B", 0)

        if wins_a > wins_b:
            return "Agent A"
        if wins_b > wins_a:
            return "Agent B"
        return "Tie"

def llm_preference_judge(query, output1, output2):
    """Ask the chat model which of two responses better answers ``query``.

    Args:
        query: The user query both responses were generated for.
        output1: Response shown to the model as "Response 1".
        output2: Response shown to the model as "Response 2".

    Returns:
        ``"first"``, ``"second"`` or ``"tie"``. Replies that cannot be
        recognised as one of these are reported as ``"tie"``.
    """
    # Use LLM to pick preferred response
    prompt = f"""Compare these two responses to the query.

Query: {query}
Response 1: {output1}
Response 2: {output2}

Which is better? Reply with only: "first", "second", or "tie"."""

    response = client.chat.completions.create(
        model="gpt-4",
        messages=[{"role": "user", "content": prompt}]
    )

    # The prompt shows the options in double quotes, so the model may echo
    # the quotes or add a trailing period. Strip that decoration before
    # matching, and tolerate a missing message body.
    raw = response.choices[0].message.content or ""
    verdict = raw.strip().strip("\"'`.!").strip().lower()
    return verdict if verdict in ("first", "second", "tie") else "tie"

Continuous Evaluation

# Integrate evaluation into CI/CD

# pytest test file
import pytest
from my_agent import RAGAgent

@pytest.fixture
def agent():
    """Fixture: build the ``RAGAgent`` instance under test."""
    rag_agent = RAGAgent()
    return rag_agent

@pytest.fixture
def test_cases():
    """Fixture: load the evaluation cases from ``eval_dataset.json``."""
    dataset_path = "eval_dataset.json"
    return load_test_cases(dataset_path)

def test_accuracy(agent, test_cases):
    """Fail when fewer than 80% of answers contain the expected string.

    Each case is read as a mapping with ``"question"`` and ``"expected"``
    keys; a case counts as correct when ``expected`` is a substring of the
    agent's answer.
    """
    # An empty dataset would otherwise surface as a ZeroDivisionError below.
    assert test_cases, "No test cases loaded; cannot compute accuracy"

    correct = sum(
        1 for tc in test_cases
        if tc["expected"] in agent.query(tc["question"])
    )

    accuracy = correct / len(test_cases)
    assert accuracy >= 0.8, f"Accuracy {accuracy} below threshold 0.8"

def test_latency(agent, test_cases):
    """Fail when the P95 query latency over the first 10 cases reaches 2 seconds."""
    # Imported locally: this test file's imports do not include ``time``.
    import time

    sample = test_cases[:10]  # Sample
    # An empty sample would otherwise surface as an IndexError below.
    assert sample, "No test cases loaded; cannot measure latency"

    latencies = []
    for tc in sample:
        # perf_counter is monotonic, unlike time.time().
        start = time.perf_counter()
        agent.query(tc["question"])
        latencies.append(time.perf_counter() - start)

    latencies.sort()
    # Nearest-rank percentile: 1-based rank ceil(0.95 * n), in integer math.
    rank = -(-len(latencies) * 95 // 100)
    p95 = latencies[rank - 1]
    assert p95 < 2.0, f"P95 latency {p95}s exceeds 2s threshold"

def test_no_hallucination(agent, test_cases):
    """Assert that every answer passes ``check_grounded`` against its case's context."""
    for case in test_cases:
        answer = agent.query(case["question"])
        # The answer must be grounded in the context stored with the case.
        grounded = check_grounded(answer, case["context"])
        assert grounded

Best Practices

  • Diverse test sets: Cover edge cases, different topics, adversarial inputs
  • Multiple metrics: No single metric captures all aspects of quality
  • Human baseline: Include human judgments for calibration
  • Version datasets: Track changes to evaluation data
  • Automate: Run evaluations in CI/CD pipeline
  • Monitor production: Sample and evaluate live traffic

Master AI Quality Assurance

Our Agentic AI program covers evaluation and testing in depth. Learn to build reliable, high-quality AI systems.

Explore Agentic AI Program

Related Articles