Use when evaluating AI agent systems — trajectory evaluation, pass@k testing, tool call correctness, non-deterministic behavior testing, and building eval infrastructure specific to multi-step agentic workflows.
Evaluating an agent is fundamentally different from evaluating a single LLM call. The unit of evaluation is a trajectory — the full sequence of plan, tool calls, observations, and final response. A correct final answer via a bad trajectory is still a bad agent.
| Dimension | LLM Eval | Agentic Eval |
|---|---|---|
| Unit | 1 prompt → 1 response | Multi-step: plan → tool calls → synthesis |
| Evaluation | Compare output to expected | Evaluate trajectory + output |
| Failure modes | Hallucination, off-topic | Wrong tool, wrong params, infinite loop, tool hallucination |
| Non-determinism | Minor variation | Compounds across steps — run multiple times |
| Test design | Golden dataset | Scenario-based with sandbox tools |
Key implication: an agent can produce the correct final answer via a completely wrong path (e.g., lucky guess after repeated failures). Trajectory evaluation catches this; output-only evaluation does not.
Layer 1: Final response evaluation ("Did it work?")
```python
def eval_final_response(actual_output, expected_output, criteria):
    score = llm_judge(
        prompt=f"Does this response satisfy the criteria?\nCriteria: {criteria}\nResponse: {actual_output}",
        expected=expected_output
    )
    return score
```
Use this as a quick pass/fail gate before deeper analysis. Necessary but not sufficient.
Layer 2: Trajectory evaluation ("Where did it go wrong?")
```python
def eval_trajectory(actual_steps, expected_steps, mode="any_order"):
    if mode == "exact":
        # Strict: same tools, same order
        return [s.tool for s in actual_steps] == [s.tool for s in expected_steps]
    elif mode == "any_order":
        # Valid if same tools called, order flexible
        return {s.tool for s in actual_steps} == {s.tool for s in expected_steps}
    elif mode == "subset":
        # Valid if all expected tools called (may call extras)
        expected_tools = {s.tool for s in expected_steps}
        actual_tools = {s.tool for s in actual_steps}
        return expected_tools.issubset(actual_tools)
    raise ValueError(f"Unknown trajectory mode: {mode}")
```
Prefer `any_order` or `subset` mode — valid agents often find multiple correct paths. Reserve `exact` mode for security-critical flows where order matters.
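To make the difference between modes concrete, here is a small self-contained sketch. It restates the mode logic with a minimal `Step` type (a hypothetical stand-in for whatever step objects your trace format uses):

```python
from collections import namedtuple

Step = namedtuple("Step", ["tool"])

def eval_trajectory(actual_steps, expected_steps, mode="any_order"):
    if mode == "exact":
        return [s.tool for s in actual_steps] == [s.tool for s in expected_steps]
    if mode == "any_order":
        return {s.tool for s in actual_steps} == {s.tool for s in expected_steps}
    if mode == "subset":
        return {s.tool for s in expected_steps} <= {s.tool for s in actual_steps}
    raise ValueError(f"Unknown trajectory mode: {mode}")

expected = [Step("get_order"), Step("get_customer"), Step("process_refund")]
# Agent checked the customer before the order: different order, same tools
actual = [Step("get_customer"), Step("get_order"), Step("process_refund")]

print(eval_trajectory(actual, expected, mode="exact"))      # False
print(eval_trajectory(actual, expected, mode="any_order"))  # True
```

The same trajectory fails `exact` but passes `any_order`, which is exactly the kind of valid-but-reordered path the lenient modes are designed to accept.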
Layer 3: Step-level evaluation ("Why did it fail?")
```python
def eval_step(step, expected_step):
    return {
        "tool_correct": step.tool == expected_step.tool,
        "params_correct": jaccard_similarity(step.params, expected_step.params) > 0.8,
        "reasoning_sound": llm_judge(step.reasoning, "Is this reasoning valid for the goal?"),
        "output_useful": llm_judge(step.output, "Did this tool output advance the goal?")
    }
```
Step-level breakdown exposes whether failures are systemic (always wrong tool) or situational (wrong params in edge cases).
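One way to surface that systemic/situational distinction is to aggregate the per-step boolean checks into failure rates per criterion. A minimal sketch (the `failure_profile` helper and the sample results are hypothetical):

```python
from collections import Counter

def failure_profile(step_results):
    """Given per-step dicts of boolean checks, return the failure rate per criterion."""
    fails = Counter()
    for result in step_results:
        for criterion, passed in result.items():
            if not passed:
                fails[criterion] += 1
    return {criterion: count / len(step_results) for criterion, count in fails.items()}

# Hypothetical results from three evaluated steps
results = [
    {"tool_correct": False, "params_correct": True},
    {"tool_correct": False, "params_correct": True},
    {"tool_correct": False, "params_correct": False},
]
print(failure_profile(results))
```

A criterion failing at or near 100% (here `tool_correct`) points to a systemic problem such as a bad tool description; a criterion failing only occasionally (here `params_correct`) points to edge cases worth individual inspection.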
Tool hallucination = agent describes calling a tool in its reasoning but never actually invokes it. This is always high severity — the agent is lying about its actions.
```python
def detect_tool_hallucination(trace):
    # Extract tool names mentioned in the agent's reasoning text
    claimed_tools = extract_tool_names_from_reasoning(trace.reasoning)
    # Compare to actual logged tool invocations
    actual_tool_calls = [call.tool for call in trace.tool_calls]
    hallucinated = set(claimed_tools) - set(actual_tool_calls)
    if hallucinated:
        return HallucinationResult(
            detected=True,
            hallucinated_tools=hallucinated,
            severity="high"  # Always high — agent fabricated its own actions
        )
    return HallucinationResult(detected=False)
```
CI rule: tool hallucination rate must be 0.0. Any hallucination is a blocker. An agent that fabricates its own tool calls cannot be trusted in production.
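A sketch of how that zero-tolerance gate might look in an eval harness, assuming detection results shaped like the `HallucinationResult` above (reduced here to the `detected` flag for a standalone example):

```python
from collections import namedtuple

HallucinationResult = namedtuple("HallucinationResult", ["detected"])

def hallucination_gate(results):
    """CI gate: any detected hallucination fails the build (rate must be exactly 0.0)."""
    n_flagged = sum(1 for r in results if r.detected)
    rate = n_flagged / len(results)
    if rate > 0.0:
        raise AssertionError(f"Tool hallucination in {n_flagged}/{len(results)} traces")
    return rate

clean = [HallucinationResult(False)] * 10
print(hallucination_gate(clean))  # 0.0
```

Note the gate raises on any nonzero rate rather than comparing against a threshold; there is deliberately no "acceptable" hallucination budget.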
Agents are non-deterministic. A single test run that passes or fails tells you little. Pass@k measures the probability that at least 1 of k sampled runs succeeds.
```python
from math import comb

def pass_at_k(n_trials, n_successes, k):
    """Probability that at least 1 of k samples succeeds."""
    if n_trials - n_successes < k:
        return 1.0
    return 1.0 - comb(n_trials - n_successes, k) / comb(n_trials, k)

# Run each test case 5-10 times, report pass@1, pass@3, pass@5
results = run_agent_n_times(test_case, n=10)
n_success = sum(results)
p1 = pass_at_k(10, n_success, 1)  # strict: must work on any given run
p3 = pass_at_k(10, n_success, 3)  # lenient: works at least once in 3 tries
p5 = pass_at_k(10, n_success, 5)  # very lenient: works at least once in 5
```
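Plugging hypothetical trial counts through the estimator makes the metric's behavior tangible (the definition is repeated so the snippet runs standalone):

```python
from math import comb

def pass_at_k(n_trials, n_successes, k):
    if n_trials - n_successes < k:
        return 1.0
    return 1.0 - comb(n_trials - n_successes, k) / comb(n_trials, k)

# Hypothetical: 4 successes in 10 trials
print(round(pass_at_k(10, 4, 1), 3))  # 0.4
print(round(pass_at_k(10, 4, 3), 3))  # 0.833
# Edge case: fewer failures (5) than k (6), so some sample of 6 must succeed
print(pass_at_k(10, 5, 6))  # 1.0
```

Even a 40% per-run success rate yields roughly 83% pass@3, which is why both numbers need to be reported together.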
Target thresholds by use case:
| Use case | Recommended target |
|---|---|
| Production customer-facing agent | pass@1 > 0.85 |
| Internal tooling agent | pass@3 > 0.90 |
| Research prototype | pass@5 > 0.70 |
Always report pass@1 AND pass@3 — an agent with pass@1=0.40 and pass@3=0.95 is fine for a retry-based system but unacceptable for single-shot production flows.
Test agents with mocked tools to avoid hitting real APIs — keeps eval fast, deterministic, and cheap.
```python
from dataclasses import dataclass
from datetime import datetime

@dataclass
class ToolCall:
    name: str
    params: dict
    timestamp: datetime

class ToolNotFoundError(Exception):
    pass

class SandboxToolEnv:
    def __init__(self, scenario):
        self.tools = {
            "search": lambda q: scenario.get_search_results(q),
            "write_file": lambda path, content: scenario.record_write(path, content),
            "send_email": lambda to, body: scenario.record_email(to, body),
            "query_db": lambda sql: scenario.get_db_result(sql),
        }
        self.call_log = []

    def call_tool(self, name, params):
        if name not in self.tools:
            raise ToolNotFoundError(f"Tool '{name}' not available")
        self.call_log.append(ToolCall(name, params, timestamp=datetime.now()))
        return self.tools[name](**params)

    def assert_tool_called(self, tool_name, params_match=None):
        calls = [c for c in self.call_log if c.name == tool_name]
        assert len(calls) > 0, f"Tool '{tool_name}' was never called"
        if params_match:
            assert any(params_match(c.params) for c in calls), \
                f"Tool '{tool_name}' called but no call matched params filter"

    def assert_tool_not_called(self, tool_name):
        calls = [c for c in self.call_log if c.name == tool_name]
        assert len(calls) == 0, f"Tool '{tool_name}' was called unexpectedly ({len(calls)} times)"
```
Scenarios define what mock tools return for each test situation. This makes tests reproducible — same input always produces same tool responses, exposing agent logic flaws rather than API flakiness.
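A minimal scenario stub might look like the following. The class name and canned data are hypothetical; the point is that every tool response comes from fixed fixtures, so repeated runs see identical tool behavior:

```python
class Scenario:
    """Hypothetical scenario stub: canned, deterministic tool responses."""
    def __init__(self, search_results, db_rows):
        self._search = search_results   # query -> fixed result list
        self._db = db_rows              # fixed rows for any query
        self.writes, self.emails = [], []

    def get_search_results(self, q):
        return self._search.get(q, [])

    def get_db_result(self, sql):
        return self._db

    def record_write(self, path, content):
        self.writes.append((path, content))

    def record_email(self, to, body):
        self.emails.append((to, body))

s = Scenario(search_results={"refund policy": ["30-day window"]}, db_rows=[{"id": 456}])
print(s.get_search_results("refund policy"))  # ['30-day window']
print(s.get_search_results("refund policy"))  # same result every run: deterministic
```

Side-effecting tools (writes, emails) record into lists instead of touching real systems, which also gives the assertions above something concrete to inspect.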
Agent tests are scenarios, not golden examples. Each scenario defines: initial context, available mock tools, expected trajectory pattern, and pass criteria.
```python
SCENARIOS = [
    {
        "name": "customer_refund_happy_path",
        "input": "Customer 123 wants a refund for order 456",
        "mock_tools": {
            "get_order": {"id": 456, "status": "delivered", "amount": 49.99},
            "get_customer": {"id": 123, "tier": "premium", "refunds_this_year": 0},
            "process_refund": {"success": True, "refund_id": "R789"}
        },
        "expected_tools": ["get_order", "get_customer", "process_refund"],
        "trajectory_mode": "any_order",
        "pass_criteria": "Refund approved and confirmation provided"
    },
    {
        "name": "customer_refund_fraud_signal",
        "input": "Customer 999 wants a refund for order 101",
        "mock_tools": {
            "get_order": {"id": 101, "status": "delivered", "amount": 299.99},
            "get_customer": {"id": 999, "tier": "standard", "refunds_this_year": 4},
        },
        "expected_tools": ["get_order", "get_customer"],
        "trajectory_mode": "subset",
        "pass_criteria": "Refund escalated to human review, not auto-approved"
    }
]
```
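A harness can then score a run against its scenario's declared `trajectory_mode`. A sketch (the `check_scenario_trajectory` helper and the `escalate_to_human` tool name are hypothetical):

```python
def check_scenario_trajectory(scenario, called_tools):
    """Check the tools an agent actually called against the scenario's
    expected_tools, using the scenario's declared trajectory_mode."""
    expected = scenario["expected_tools"]
    mode = scenario.get("trajectory_mode", "any_order")
    if mode == "exact":
        return called_tools == expected
    if mode == "any_order":
        return set(called_tools) == set(expected)
    if mode == "subset":
        return set(expected) <= set(called_tools)
    raise ValueError(f"Unknown trajectory mode: {mode}")

scenario = {
    "name": "customer_refund_fraud_signal",
    "expected_tools": ["get_order", "get_customer"],
    "trajectory_mode": "subset",
}
# Extra escalation tool is allowed under subset mode
print(check_scenario_trajectory(scenario, ["get_order", "get_customer", "escalate_to_human"]))  # True
```

Because the fraud scenario uses `subset` mode, an agent that calls an additional escalation tool still passes the trajectory check, while the final-response layer verifies the "escalated, not auto-approved" pass criteria.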
# .github/workflows/agent-eval.yml