From 80d769b3c4ae68629d9c0622196adfb241fcbbbf Mon Sep 17 00:00:00 2001 From: Abel Song <2730343900@qq.com> Date: Tue, 30 Jun 2026 21:26:37 +0800 Subject: [PATCH 1/2] feat(examples): add reproducible eval+optimize closed-loop example --- .../eval_optimize_loop/.gitignore | 4 + .../optimization/eval_optimize_loop/README.md | 71 ++ .../eval_optimize_loop/case_meta.json | 9 + .../optimization_report.json | 649 ++++++++++++++ .../eval_optimize_loop/optimization_report.md | 36 + .../eval_optimize_loop/optimizer.json | 45 + .../prompts/baseline_system.md | 7 + .../optimization/eval_optimize_loop/run.py | 836 ++++++++++++++++++ .../runs/latest/baseline_prompt.md | 7 + .../runs/latest/candidate_prompt.md | 11 + .../eval_optimize_loop/train.evalset.json | 75 ++ .../eval_optimize_loop/val.evalset.json | 83 ++ 12 files changed, 1833 insertions(+) create mode 100644 examples/optimization/eval_optimize_loop/.gitignore create mode 100644 examples/optimization/eval_optimize_loop/README.md create mode 100644 examples/optimization/eval_optimize_loop/case_meta.json create mode 100644 examples/optimization/eval_optimize_loop/optimization_report.json create mode 100644 examples/optimization/eval_optimize_loop/optimization_report.md create mode 100644 examples/optimization/eval_optimize_loop/optimizer.json create mode 100644 examples/optimization/eval_optimize_loop/prompts/baseline_system.md create mode 100644 examples/optimization/eval_optimize_loop/run.py create mode 100644 examples/optimization/eval_optimize_loop/runs/latest/baseline_prompt.md create mode 100644 examples/optimization/eval_optimize_loop/runs/latest/candidate_prompt.md create mode 100644 examples/optimization/eval_optimize_loop/train.evalset.json create mode 100644 examples/optimization/eval_optimize_loop/val.evalset.json diff --git a/examples/optimization/eval_optimize_loop/.gitignore b/examples/optimization/eval_optimize_loop/.gitignore new file mode 100644 index 0000000..3e35535 --- /dev/null +++ 
b/examples/optimization/eval_optimize_loop/.gitignore @@ -0,0 +1,4 @@ +# Runtime side-effects (regenerated on every run); the audited report and the +# runs/latest prompt snapshots are kept in VCS as example deliverables. +__pycache__/ +_sdk_eval_metrics.json diff --git a/examples/optimization/eval_optimize_loop/README.md b/examples/optimization/eval_optimize_loop/README.md new file mode 100644 index 0000000..1c69e29 --- /dev/null +++ b/examples/optimization/eval_optimize_loop/README.md @@ -0,0 +1,71 @@ +# eval_optimize_loop + +> Part of the `examples/optimization` series. Where [`quickstart`](../quickstart) +> drives a live `AgentOptimizer` (GEPA) against a real agent, this example focuses +> on the **closed loop around** optimization — baseline evaluation, failure +> attribution, candidate validation, an acceptance gate, and an auditable report — +> and runs fully reproducibly in fake/trace mode **without an API key**. + +This example implements a reproducible Evaluation + Optimization closed loop: + +```text +baseline evaluation + -> failure attribution + -> prompt candidate generation + -> validation regression + -> acceptance gate + -> auditable JSON/Markdown report +``` + +Run without API keys: + +```bash +# First run may spend time on a one-off `uv sync`; the loop itself takes only a few seconds. +uv run python examples/optimization/eval_optimize_loop/run.py +``` + +Set `YUN_LOG_LEVEL=DEBUG` for more verbose logs (default `INFO`). + +Inputs: + +- `prompts/baseline_system.md` — target prompt being optimized. +- `train.evalset.json` / `val.evalset.json` — SDK-clean evalsets (trace mode). +- `case_meta.json` — per-case `key` / `rubric` / `tool_intent` (kept out of the + evalset so `EvalSet` stays schema-clean). +- `optimizer.json` — metric weights, scripted candidate patch, and gate thresholds. 
+ +Outputs: + +- `optimization_report.json` / `optimization_report.md` +- `runs/latest/baseline_prompt.md` / `runs/latest/candidate_prompt.md` + +The sample has 6 cases: + +- train: two optimizable failures and one optimization-ineffective format case. +- validation: one new pass, one hard regression, and one soft degradation (partial rubric credit, but at 0.55 it still falls below the 0.6 hard-fail threshold, so the gate reports it as a new hard fail too). + +## 设计说明(四支柱) + +**失败归因(阶段 2)。** 归因完全基于结构化评测信号,不依赖 case 命名。每条 case +记录三项 metric 子分(final_response / tool_trajectory / rubric)与关键轨迹(query、 +expected/actual 工具与回复)。`classify_tool_failure` 据「期望轨迹 vs 实际轨迹」判类: +期望调用权威检索工具却没调用 → `knowledge_recall_insufficient`;调全了期望工具又多调 +→ `spurious_tool_call`;首工具名对但参数不同 → `parameter_error`;否则 `tool_call_error`。 +rubric 维度由 `case_meta.json` 显式声明(`json_format`/`no_tool`/`single_tool`),失败时 +映射为 `format_error` 或 `llm_rubric_not_met`。归因只统计 baseline 失败。 + +**接受策略(阶段 5)。** Gate 以验证集为先,五项可配置约束全过才 ACCEPT:① 验证集均分 +提升 ≥ `min_val_score_gain`;② 无新增 hard fail;③ 无「关键 case」退化(关键性由 +`case_meta.json` 的 `key=true` 标记,而非把所有验证 case 一概视为关键);④ 非过拟合 +(训练涨而验证跌);⑤ 优化成本 ≤ `max_cost_usd`。各检查相互独立,便于定位拒绝原因。 + +**防过拟合。** 第 ④ 项专门拦截「训练大幅提升、验证退化」:本例候选给 baseline 注入 +激进检索行为,训练集 +0.53 但验证集回落,gate 据此 REJECT。关键 case 退化(③)与新增 +hard fail(②)提供正交的二次保险,即便总分变化很小也能拦住有害候选。 + +**产物审计(阶段 6)。** `optimization_report.json` 持久化 baseline/candidate 逐 case 分数与 +轨迹、逐 case delta、失败归因、gate 各检查、决策理由、成本/耗时/seed、prompt 的 SHA-256 与 +config 快照;`runs/latest/` 留存 baseline 与候选 prompt 全文。`.md` 顶部由 `narrate_report` +依据 gate/delta 数据生成「人话总结」,确定性、无需模型,换输入也不会失真。SDK 桥接通过 +`EvalSet.model_validate_json` 校验评测集,并用 trace-only `AgentEvaluator` 跑一次冒烟,证明 +管线确实接到真实 SDK 评测器;fake/trace 模式仅在评分/优化处用确定性替身以保证无 key 可复现。 diff --git a/examples/optimization/eval_optimize_loop/case_meta.json b/examples/optimization/eval_optimize_loop/case_meta.json new file mode 100644 index 0000000..c31fc5e --- /dev/null +++ b/examples/optimization/eval_optimize_loop/case_meta.json @@ -0,0 +1,9 @@ +{ + "_comment": "Per-case metadata kept out of the SDK evalset so EvalSet stays schema-clean. 
'key' marks cases that must not regress (gate uses it for critical-regression). 'rubric' selects the rubric dimension scored by run.score_case. 'tool_intent' is a trace-level attribution hint: 'authoritative_search' means the expected trajectory relies on the authoritative search tool, so a wrong tool is attributed to knowledge-recall insufficiency rather than a generic tool error.", + "train_ip_lookup_optimizable": {"key": false, "rubric": "none"}, + "train_calendar_optimizable": {"key": false, "rubric": "none"}, + "train_strict_json_ineffective": {"key": false, "rubric": "json_format"}, + "val_search_fallback_new_pass": {"key": false, "rubric": "none", "tool_intent": "authoritative_search"}, + "val_smalltalk_no_tool_regression": {"key": true, "rubric": "no_tool"}, + "val_weather_soft_degradation": {"key": true, "rubric": "single_tool"} +} diff --git a/examples/optimization/eval_optimize_loop/optimization_report.json b/examples/optimization/eval_optimize_loop/optimization_report.json new file mode 100644 index 0000000..109143e --- /dev/null +++ b/examples/optimization/eval_optimize_loop/optimization_report.json @@ -0,0 +1,649 @@ +{ + "run": { + "timestamp": "2026-06-30T02:04:57.404739+00:00", + "mode": "fake_trace", + "seed": 42, + "sdk_bridge": { + "evalset_validated_with_trpc_sdk": true, + "agent_evaluator_available": true, + "agent_optimizer_available": true, + "agent_evaluator_trace_smoke": { + "status": "FAILED_EXPECTED", + "reason": "[\n {\n \"evalSetId\": \"eval_optimize_loop_train\",\n \"summary\": {\n \"agentName\": \"trace-only\",\n \"evalSetId\": \"eval_optimize_loop_train\",\n \"overallStatus\": \"failed\",\n \"runs\": 1,\n \"evalCases\": [\n {\n \"evalCaseId\": \"train_calendar_optimizable\",\n \"overallStatus\": \"failed\",\n \"metricResults\": [\n {\n \"metricName\": \"final_response_avg_score\",\n \"score\": 0.0,\n \"threshold\": 0.1,\n \"evalSt", + "has_result": true, + "metrics_file": "_sdk_eval_metrics.json" + } + }, + "repro": { + 
"train_evalset": "train.evalset.json", + "val_evalset": "val.evalset.json", + "case_meta": "case_meta.json", + "optimizer_config": "optimizer.json", + "prompt_source": "prompts/baseline_system.md" + } + }, + "prompt_audit": { + "target": { + "name": "life_assistant_system", + "path": "prompts/baseline_system.md", + "kind": "system_prompt" + }, + "baseline_sha256": "0f876750c9acd5d6ded115427aa96d3bcfe9148e90e449a0be41804c02838f72", + "candidate_sha256": "19a1743c5b1694b2b0860ecedc2697ce2247e3dea1f45169db7c853d6f5959bc", + "baseline_snapshot": "runs/latest/baseline_prompt.md", + "candidate_snapshot": "runs/latest/candidate_prompt.md" + }, + "baseline": { + "train": { + "eval_set_id": "eval_optimize_loop_train", + "mean_score": 0.25, + "pass_rate": 0.0, + "cases": { + "train_ip_lookup_optimizable": { + "case_id": "train_ip_lookup_optimizable", + "score": 0.2, + "passed": false, + "hard_fail": true, + "key": false, + "metrics": { + "final_response": 0.0, + "tool_trajectory": 0.0, + "rubric": 1.0 + }, + "failure_types": [ + "final_response_mismatch", + "tool_call_error" + ], + "reason": "final_response_mismatch; tool_call_error", + "trace": { + "query": "查询我的公网 IP", + "expected_text": "203.0.113.10", + "actual_text": "我无法确定你的公网 IP。", + "expected_tools": [ + { + "id": "tool-1", + "name": "get_my_public_ip", + "args": { + "source": "commercial" + } + } + ], + "actual_tools": [] + } + }, + "train_calendar_optimizable": { + "case_id": "train_calendar_optimizable", + "score": 0.2, + "passed": false, + "hard_fail": true, + "key": false, + "metrics": { + "final_response": 0.0, + "tool_trajectory": 0.0, + "rubric": 1.0 + }, + "failure_types": [ + "final_response_mismatch", + "tool_call_error" + ], + "reason": "final_response_mismatch; tool_call_error", + "trace": { + "query": "2026-10-01 是不是休息日?", + "expected_text": "休息日", + "actual_text": "这天大概率是节日,但我没有查询。", + "expected_tools": [ + { + "id": "tool-1", + "name": "query_holiday_calendar", + "args": { + "date": "2026-10-01", + 
"holiday_type": "legal" + } + } + ], + "actual_tools": [] + } + }, + "train_strict_json_ineffective": { + "case_id": "train_strict_json_ineffective", + "score": 0.35, + "passed": false, + "hard_fail": true, + "key": false, + "metrics": { + "final_response": 0.0, + "tool_trajectory": 1.0, + "rubric": 0.0 + }, + "failure_types": [ + "final_response_mismatch", + "format_error" + ], + "reason": "final_response_mismatch; format_error", + "trace": { + "query": "只返回 JSON:status ok", + "expected_text": "{\"status\":\"ok\"}", + "actual_text": "status ok", + "expected_tools": [], + "actual_tools": [] + } + } + } + }, + "val": { + "eval_set_id": "eval_optimize_loop_val", + "mean_score": 0.7333, + "pass_rate": 0.6667, + "cases": { + "val_search_fallback_new_pass": { + "case_id": "val_search_fallback_new_pass", + "score": 0.2, + "passed": false, + "hard_fail": true, + "key": false, + "metrics": { + "final_response": 0.0, + "tool_trajectory": 0.0, + "rubric": 1.0 + }, + "failure_types": [ + "final_response_mismatch", + "knowledge_recall_insufficient" + ], + "reason": "final_response_mismatch; knowledge_recall_insufficient", + "trace": { + "query": "默认搜索不够时,查一下 Go 最新版本", + "expected_text": "Go 1.26", + "actual_text": "Go 有新版本,但我没有足够信息确认。", + "expected_tools": [ + { + "id": "tool-1", + "name": "uapi_search", + "args": { + "query": "Go 最新版本" + } + } + ], + "actual_tools": [ + { + "name": "websearch", + "args": { + "query": "Go 最新版本" + } + } + ] + } + }, + "val_smalltalk_no_tool_regression": { + "case_id": "val_smalltalk_no_tool_regression", + "score": 1.0, + "passed": true, + "hard_fail": false, + "key": true, + "metrics": { + "final_response": 1.0, + "tool_trajectory": 1.0, + "rubric": 1.0 + }, + "failure_types": [], + "reason": "pass", + "trace": { + "query": "在吗", + "expected_text": "在", + "actual_text": "在。", + "expected_tools": [], + "actual_tools": [] + } + }, + "val_weather_soft_degradation": { + "case_id": "val_weather_soft_degradation", + "score": 1.0, + "passed": true, + 
"hard_fail": false, + "key": true, + "metrics": { + "final_response": 1.0, + "tool_trajectory": 1.0, + "rubric": 1.0 + }, + "failure_types": [], + "reason": "pass", + "trace": { + "query": "北京天气怎么样?", + "expected_text": "北京", + "actual_text": "北京今天有天气信息。", + "expected_tools": [ + { + "id": "tool-1", + "name": "get_current_weather", + "args": { + "city": "北京" + } + } + ], + "actual_tools": [ + { + "name": "get_current_weather", + "args": { + "city": "北京" + } + } + ] + } + } + } + } + }, + "candidate": { + "train": { + "eval_set_id": "eval_optimize_loop_train", + "mean_score": 0.7833, + "pass_rate": 0.6667, + "cases": { + "train_ip_lookup_optimizable": { + "case_id": "train_ip_lookup_optimizable", + "score": 1.0, + "passed": true, + "hard_fail": false, + "key": false, + "metrics": { + "final_response": 1.0, + "tool_trajectory": 1.0, + "rubric": 1.0 + }, + "failure_types": [], + "reason": "pass", + "trace": { + "query": "查询我的公网 IP", + "expected_text": "203.0.113.10", + "actual_text": "你的公网 IP 是 203.0.113.10。", + "expected_tools": [ + { + "id": "tool-1", + "name": "get_my_public_ip", + "args": { + "source": "commercial" + } + } + ], + "actual_tools": [ + { + "name": "get_my_public_ip", + "args": { + "source": "commercial" + } + } + ] + } + }, + "train_calendar_optimizable": { + "case_id": "train_calendar_optimizable", + "score": 1.0, + "passed": true, + "hard_fail": false, + "key": false, + "metrics": { + "final_response": 1.0, + "tool_trajectory": 1.0, + "rubric": 1.0 + }, + "failure_types": [], + "reason": "pass", + "trace": { + "query": "2026-10-01 是不是休息日?", + "expected_text": "休息日", + "actual_text": "2026-10-01 是法定休息日。", + "expected_tools": [ + { + "id": "tool-1", + "name": "query_holiday_calendar", + "args": { + "date": "2026-10-01", + "holiday_type": "legal" + } + } + ], + "actual_tools": [ + { + "name": "query_holiday_calendar", + "args": { + "date": "2026-10-01", + "holiday_type": "legal" + } + } + ] + } + }, + "train_strict_json_ineffective": { + "case_id": 
"train_strict_json_ineffective", + "score": 0.35, + "passed": false, + "hard_fail": true, + "key": false, + "metrics": { + "final_response": 0.0, + "tool_trajectory": 1.0, + "rubric": 0.0 + }, + "failure_types": [ + "final_response_mismatch", + "format_error" + ], + "reason": "final_response_mismatch; format_error", + "trace": { + "query": "只返回 JSON:status ok", + "expected_text": "{\"status\":\"ok\"}", + "actual_text": "status ok", + "expected_tools": [], + "actual_tools": [] + } + } + } + }, + "val": { + "eval_set_id": "eval_optimize_loop_val", + "mean_score": 0.6667, + "pass_rate": 0.3333, + "cases": { + "val_search_fallback_new_pass": { + "case_id": "val_search_fallback_new_pass", + "score": 1.0, + "passed": true, + "hard_fail": false, + "key": false, + "metrics": { + "final_response": 1.0, + "tool_trajectory": 1.0, + "rubric": 1.0 + }, + "failure_types": [], + "reason": "pass", + "trace": { + "query": "默认搜索不够时,查一下 Go 最新版本", + "expected_text": "Go 1.26", + "actual_text": "Go 1.26 是当前查询到的最新版本。", + "expected_tools": [ + { + "id": "tool-1", + "name": "uapi_search", + "args": { + "query": "Go 最新版本" + } + } + ], + "actual_tools": [ + { + "name": "uapi_search", + "args": { + "query": "Go 最新版本" + } + } + ] + } + }, + "val_smalltalk_no_tool_regression": { + "case_id": "val_smalltalk_no_tool_regression", + "score": 0.45, + "passed": false, + "hard_fail": true, + "key": true, + "metrics": { + "final_response": 1.0, + "tool_trajectory": 0.0, + "rubric": 0.0 + }, + "failure_types": [ + "spurious_tool_call", + "llm_rubric_not_met" + ], + "reason": "spurious_tool_call; llm_rubric_not_met", + "trace": { + "query": "在吗", + "expected_text": "在", + "actual_text": "我查了一下网页:在。", + "expected_tools": [], + "actual_tools": [ + { + "name": "uapi_search", + "args": { + "query": "在吗" + } + } + ] + } + }, + "val_weather_soft_degradation": { + "case_id": "val_weather_soft_degradation", + "score": 0.55, + "passed": false, + "hard_fail": true, + "key": true, + "metrics": { + 
"final_response": 1.0, + "tool_trajectory": 0.0, + "rubric": 0.5 + }, + "failure_types": [ + "spurious_tool_call", + "llm_rubric_not_met" + ], + "reason": "spurious_tool_call; llm_rubric_not_met", + "trace": { + "query": "北京天气怎么样?", + "expected_text": "北京", + "actual_text": "北京今天有天气信息。", + "expected_tools": [ + { + "id": "tool-1", + "name": "get_current_weather", + "args": { + "city": "北京" + } + } + ], + "actual_tools": [ + { + "name": "get_current_weather", + "args": { + "city": "北京" + } + }, + { + "name": "uapi_search", + "args": { + "query": "北京天气" + } + } + ] + } + } + } + } + }, + "delta": { + "train": { + "train_ip_lookup_optimizable": { + "kind": "new_pass", + "baseline_score": 0.2, + "candidate_score": 1.0, + "delta": 0.8, + "baseline_passed": false, + "candidate_passed": true + }, + "train_calendar_optimizable": { + "kind": "new_pass", + "baseline_score": 0.2, + "candidate_score": 1.0, + "delta": 0.8, + "baseline_passed": false, + "candidate_passed": true + }, + "train_strict_json_ineffective": { + "kind": "same", + "baseline_score": 0.35, + "candidate_score": 0.35, + "delta": 0.0, + "baseline_passed": false, + "candidate_passed": false + } + }, + "val": { + "val_search_fallback_new_pass": { + "kind": "new_pass", + "baseline_score": 0.2, + "candidate_score": 1.0, + "delta": 0.8, + "baseline_passed": false, + "candidate_passed": true + }, + "val_smalltalk_no_tool_regression": { + "kind": "new_fail", + "baseline_score": 1.0, + "candidate_score": 0.45, + "delta": -0.55, + "baseline_passed": true, + "candidate_passed": false + }, + "val_weather_soft_degradation": { + "kind": "new_fail", + "baseline_score": 1.0, + "candidate_score": 0.55, + "delta": -0.45, + "baseline_passed": true, + "candidate_passed": false + } + } + }, + "failure_attribution": { + "counts": { + "final_response_mismatch": 4, + "tool_call_error": 2, + "format_error": 1, + "knowledge_recall_insufficient": 1 + }, + "by_case": { + "train_ip_lookup_optimizable": [ + "final_response_mismatch", + 
"tool_call_error" + ], + "train_calendar_optimizable": [ + "final_response_mismatch", + "tool_call_error" + ], + "train_strict_json_ineffective": [ + "final_response_mismatch", + "format_error" + ], + "val_search_fallback_new_pass": [ + "final_response_mismatch", + "knowledge_recall_insufficient" + ] + } + }, + "optimizer": { + "status": "SCRIPTED_CANDIDATE", + "algorithm": "scripted_agent_optimizer_bridge", + "agent_optimizer_available": true, + "agent_optimizer_invoked": false, + "note": "fake/trace mode applies a deterministic patch; see examples/optimization/quickstart for a live GEPA run.", + "candidate_prompt_path": "runs/latest/candidate_prompt.md", + "cost_usd": 0.0, + "tokens": 0 + }, + "gate": { + "accepted": false, + "decision": "REJECT", + "reason": "validation_gain_threshold; no_new_hard_fail; no_critical_regression; not_overfit_train_up_val_down", + "train_gain": 0.5333, + "val_gain": -0.0666, + "checks": [ + { + "name": "validation_gain_threshold", + "passed": false, + "detail": "val_gain=-0.0666, required>=+0.1000" + }, + { + "name": "no_new_hard_fail", + "passed": false, + "detail": "new_hard_fails=['val_smalltalk_no_tool_regression', 'val_weather_soft_degradation']" + }, + { + "name": "no_critical_regression", + "passed": false, + "detail": "critical_regressions=['val_smalltalk_no_tool_regression', 'val_weather_soft_degradation']" + }, + { + "name": "not_overfit_train_up_val_down", + "passed": false, + "detail": "train_gain=+0.5333, val_gain=-0.0666" + }, + { + "name": "cost_budget", + "passed": true, + "detail": "cost_usd=0.0000, budget=0.0100" + } + ] + }, + "audit": { + "duration_seconds": 0.0125, + "cost_usd": 0.0, + "tokens": 0, + "config_snapshot": { + "mode": "fake_trace", + "seed": 42, + "target_prompt": { + "name": "life_assistant_system", + "path": "prompts/baseline_system.md", + "kind": "system_prompt" + }, + "case_meta": "case_meta.json", + "evaluate": { + "fake_model": true, + "fake_judge": true, + "trace_mode": true, + "metrics": [ + 
{ + "name": "final_response", + "weight": 0.45, + "threshold": 0.8 + }, + { + "name": "tool_trajectory", + "weight": 0.35, + "threshold": 0.8 + }, + { + "name": "rubric", + "weight": 0.2, + "threshold": 0.8 + } + ] + }, + "optimize": { + "algorithm": "scripted_agent_optimizer_bridge", + "note": "fake/trace mode applies a deterministic prompt patch instead of invoking AgentOptimizer, so the example stays reproducible without an API key. For a live AgentOptimizer (GEPA) run see examples/optimization/quickstart.", + "max_rounds": 1, + "update_source": false, + "candidate_patch": [ + "", + "Optimization candidate:", + "- USE_UAPI_TOOLS: use get_my_public_ip, uapi_search, and query_holiday_calendar for IP/search/calendar questions.", + "- AGGRESSIVE_SEARCH: prefer uapi_search whenever a query looks underspecified." + ] + }, + "gate": { + "min_val_score_gain": 0.1, + "reject_on_new_hard_fail": true, + "hard_fail_threshold": 0.6, + "reject_on_critical_regression": true, + "reject_overfit_train_up_val_down": true, + "max_cost_usd": 0.01 + }, + "audit": { + "output_dir": ".", + "record_case_traces": true, + "record_prompt_snapshots": true + } + } + } +} \ No newline at end of file diff --git a/examples/optimization/eval_optimize_loop/optimization_report.md b/examples/optimization/eval_optimize_loop/optimization_report.md new file mode 100644 index 0000000..33ba971 --- /dev/null +++ b/examples/optimization/eval_optimize_loop/optimization_report.md @@ -0,0 +1,36 @@ +# Optimization Report + +## 人话总结 + +本次(fake_trace 模式)决定**拒绝**候选 prompt。训练集均分 0.25→0.7833(+0.5333),验证集 0.7333→0.6667(-0.0666)。训练涨但验证跌,呈现过拟合特征。验证集新增通过:val_search_fallback_new_pass。⚠️ 验证集新增失败:val_smalltalk_no_tool_regression、val_weather_soft_degradation。被以下 gate 拦截:validation_gain_threshold、no_new_hard_fail、no_critical_regression、not_overfit_train_up_val_down。baseline 失败归因:final_response_mismatch×4、tool_call_error×2、format_error×1、knowledge_recall_insufficient×1。 + +- Mode: `fake_trace` +- Decision: **REJECT** +- 
Reason: validation_gain_threshold; no_new_hard_fail; no_critical_regression; not_overfit_train_up_val_down +- Baseline train score: 0.25 +- Candidate train score: 0.7833 +- Baseline val score: 0.7333 +- Candidate val score: 0.6667 +- Train gain: +0.5333 +- Val gain: -0.0666 + +## Failure Attribution + +- final_response_mismatch: 4 +- tool_call_error: 2 +- format_error: 1 +- knowledge_recall_insufficient: 1 + +## Validation Delta + +- `val_search_fallback_new_pass`: new_pass (0.2 -> 1.0, delta +0.8000) +- `val_smalltalk_no_tool_regression`: new_fail (1.0 -> 0.45, delta -0.5500) +- `val_weather_soft_degradation`: new_fail (1.0 -> 0.55, delta -0.4500) + +## Gate Checks + +- FAIL `validation_gain_threshold`: val_gain=-0.0666, required>=+0.1000 +- FAIL `no_new_hard_fail`: new_hard_fails=['val_smalltalk_no_tool_regression', 'val_weather_soft_degradation'] +- FAIL `no_critical_regression`: critical_regressions=['val_smalltalk_no_tool_regression', 'val_weather_soft_degradation'] +- FAIL `not_overfit_train_up_val_down`: train_gain=+0.5333, val_gain=-0.0666 +- PASS `cost_budget`: cost_usd=0.0000, budget=0.0100 diff --git a/examples/optimization/eval_optimize_loop/optimizer.json b/examples/optimization/eval_optimize_loop/optimizer.json new file mode 100644 index 0000000..7bf12b8 --- /dev/null +++ b/examples/optimization/eval_optimize_loop/optimizer.json @@ -0,0 +1,45 @@ +{ + "mode": "fake_trace", + "seed": 42, + "target_prompt": { + "name": "life_assistant_system", + "path": "prompts/baseline_system.md", + "kind": "system_prompt" + }, + "case_meta": "case_meta.json", + "evaluate": { + "fake_model": true, + "fake_judge": true, + "trace_mode": true, + "metrics": [ + {"name": "final_response", "weight": 0.45, "threshold": 0.8}, + {"name": "tool_trajectory", "weight": 0.35, "threshold": 0.8}, + {"name": "rubric", "weight": 0.20, "threshold": 0.8} + ] + }, + "optimize": { + "algorithm": "scripted_agent_optimizer_bridge", + "note": "fake/trace mode applies a deterministic prompt 
patch instead of invoking AgentOptimizer, so the example stays reproducible without an API key. For a live AgentOptimizer (GEPA) run see examples/optimization/quickstart.", + "max_rounds": 1, + "update_source": false, + "candidate_patch": [ + "", + "Optimization candidate:", + "- USE_UAPI_TOOLS: use get_my_public_ip, uapi_search, and query_holiday_calendar for IP/search/calendar questions.", + "- AGGRESSIVE_SEARCH: prefer uapi_search whenever a query looks underspecified." + ] + }, + "gate": { + "min_val_score_gain": 0.1, + "reject_on_new_hard_fail": true, + "hard_fail_threshold": 0.6, + "reject_on_critical_regression": true, + "reject_overfit_train_up_val_down": true, + "max_cost_usd": 0.01 + }, + "audit": { + "output_dir": ".", + "record_case_traces": true, + "record_prompt_snapshots": true + } +} diff --git a/examples/optimization/eval_optimize_loop/prompts/baseline_system.md b/examples/optimization/eval_optimize_loop/prompts/baseline_system.md new file mode 100644 index 0000000..b462b93 --- /dev/null +++ b/examples/optimization/eval_optimize_loop/prompts/baseline_system.md @@ -0,0 +1,7 @@ +You are a concise life assistant. + +Rules: +- Answer directly when possible. +- Use the weather tool for weather questions. +- Do not invent data. +- Keep responses short. diff --git a/examples/optimization/eval_optimize_loop/run.py b/examples/optimization/eval_optimize_loop/run.py new file mode 100644 index 0000000..774195f --- /dev/null +++ b/examples/optimization/eval_optimize_loop/run.py @@ -0,0 +1,836 @@ +"""Reproducible Evaluation + Optimization closed-loop example. + +The pipeline runs six auditable phases over a single system prompt: + + 1. baseline evaluation (train + val, per-case metrics/pass-fail/trace) + 2. failure attribution (rule based over structured trace + case metadata) + 3. optimization (scripted AgentOptimizer bridge in fake/trace mode) + 4. candidate validation (full re-run + case-by-case diff vs baseline) + 5. 
acceptance gate (validation-first, configurable, multi-constraint) + 6. audit persistence (JSON + Markdown report, prompt snapshots, repro info) + +Default mode is fake/trace and requires no API key. The first invocation may +spend time on a one-off ``uv sync``; once dependencies are installed the loop +itself completes in a few seconds:: + + uv run python examples/optimization/eval_optimize_loop/run.py + +Log verbosity is controlled by the ``YUN_LOG_LEVEL`` environment variable +(default ``INFO``). +""" + +from __future__ import annotations + +import argparse +import asyncio +import hashlib +import json +import logging +import os +import time +from collections import Counter +from dataclasses import dataclass, field +from datetime import datetime, timezone +from pathlib import Path +from typing import Any + +try: + from trpc_agent_sdk.evaluation import AgentEvaluator, AgentOptimizer, EvalSet, TargetPrompt +except Exception: # pragma: no cover - keeps fake mode runnable if the SDK changes. + AgentEvaluator = AgentOptimizer = TargetPrompt = None + EvalSet = None + +HERE = Path(__file__).resolve().parent +logger = logging.getLogger("eval_optimize_loop") + +# Tool name treated as the "authoritative" search backend. When a case declares +# ``tool_intent == "authoritative_search"`` and the agent fails to call this +# tool, the trajectory miss is attributed to weak knowledge recall rather than a +# generic tool error. +AUTHORITATIVE_SEARCH_TOOL = "uapi_search" + + +@dataclass +class CaseResult: + """Scored outcome of a single evaluation case. + + Attributes: + case_id: The ``eval_id`` of the case. + score: Weighted aggregate score in ``[0, 1]``. + passed: Whether ``score`` meets the pass threshold. + hard_fail: Whether ``score`` falls below the gate hard-fail threshold. + key: Whether the case is marked critical (must not regress). + metrics: Per-metric sub-scores (final_response / tool_trajectory / rubric). 
+ failure_types: Attributed failure categories (empty when passed). + reason: Human-readable summary (``"pass"`` or joined failure types). + trace: Key trajectory fields used for attribution and auditing. + """ + + case_id: str + score: float + passed: bool + hard_fail: bool + key: bool + metrics: dict[str, float] + failure_types: list[str] + reason: str + trace: dict[str, Any] = field(default_factory=dict) + + +def load_json(path: Path) -> dict[str, Any]: + """Load a JSON document, raising a readable error on malformed input. + + Args: + path: Path to the JSON file. + + Returns: + The parsed JSON object. + + Raises: + SystemExit: If the file cannot be read or parsed. + """ + try: + return json.loads(path.read_text(encoding="utf-8")) + except OSError as exc: + raise SystemExit(f"无法读取 JSON 文件 {path}: {exc}") from exc + except json.JSONDecodeError as exc: + raise SystemExit(f"JSON 解析失败 {path}: {exc}") from exc + + +def write_json(path: Path, data: dict[str, Any]) -> None: + """Serialize ``data`` to ``path`` as UTF-8 JSON with stable indentation.""" + path.write_text(json.dumps(data, ensure_ascii=False, indent=2), encoding="utf-8") + + +def sha256_text(text: str) -> str: + """Return the hex SHA-256 digest of ``text`` (used for prompt audit).""" + return hashlib.sha256(text.encode("utf-8")).hexdigest() + + +def validate_evalset(path: Path) -> dict[str, Any]: + """Load an evalset and validate it against the SDK schema when available. + + JSON/IO errors abort with a readable message; an SDK schema mismatch is a + non-fatal warning (the fake evaluator only needs the documented fields), but + an evalset with no ``eval_cases`` is treated as fatal. + + Args: + path: Path to the ``*.evalset.json`` file. + + Returns: + The parsed evalset object. + + Raises: + SystemExit: On IO/JSON errors or an empty/invalid evalset. 
+ """ + try: + raw = path.read_text(encoding="utf-8") + except OSError as exc: + raise SystemExit(f"无法读取评测集 {path}: {exc}") from exc + try: + data = json.loads(raw) + except json.JSONDecodeError as exc: + raise SystemExit(f"评测集 JSON 解析失败 {path}: {exc}") from exc + if EvalSet is not None: + try: + EvalSet.model_validate_json(raw) + except Exception as exc: # pragma: no cover - schema drift is non-fatal here. + logger.warning("EvalSet schema 校验未通过 %s: %s", path.name, str(exc)[:300]) + if not data.get("eval_cases"): + raise SystemExit(f"评测集缺少 eval_cases 或为空: {path}") + return data + + +def sdk_trace_smoke(evalset_path: Path) -> dict[str, Any]: + """Run a trace-only ``AgentEvaluator`` smoke check against an evalset. + + This proves the example is wired to the real SDK evaluator without needing a + model: the metric uses a deterministic ``final_response contains`` criterion. + A threshold miss is expected and reported as ``FAILED_EXPECTED`` rather than + an error, so the smoke never blocks the pipeline. + + Args: + evalset_path: Evalset to feed the SDK evaluator. + + Returns: + A status dict describing the smoke outcome. 
+ """ + metrics_path = HERE / "_sdk_eval_metrics.json" + write_json( + metrics_path, + { + "metrics": [ + { + "metric_name": "final_response_avg_score", + "threshold": 0.1, + "criterion": { + "final_response": { + "text": {"match": "contains", "case_insensitive": True} + } + }, + } + ] + }, + ) + if AgentEvaluator is None: + logger.warning("AgentEvaluator 不可导入,跳过 SDK 冒烟。") + return {"status": "SKIPPED", "reason": "AgentEvaluator is not importable"} + + async def _run() -> dict[str, Any]: + cwd = Path.cwd() + os.chdir(HERE) + try: + executer = AgentEvaluator.get_executer( + evalset_path.name, + eval_metrics_file_path_or_dir=metrics_path.name, + print_detailed_results=False, + print_summary_report=False, + ) + try: + await executer.evaluate() + status = "PASSED" + reason = "trace-only AgentEvaluator smoke completed" + except AssertionError as exc: + status = "FAILED_EXPECTED" + reason = str(exc)[:500] + except Exception as exc: # pragma: no cover - defensive: SDK runtime error. + status = "FAILED_SDK_SMOKE" + reason = f"{type(exc).__name__}: {str(exc)[:500]}" + return { + "status": status, + "reason": reason, + "has_result": executer.get_result() is not None, + "metrics_file": metrics_path.name, + } + finally: + os.chdir(cwd) + + return asyncio.run(_run()) + + +def invocation_text(invocation: dict[str, Any], field_name: str) -> str: + """Concatenate the text parts of a conversation field (``user_content`` etc.).""" + content = invocation[field_name] + return "".join(part.get("text", "") for part in content.get("parts", [])) + + +def expected_tools(invocation: dict[str, Any]) -> list[dict[str, Any]]: + """Return the expected ``tool_uses`` list from an expected invocation.""" + data = invocation.get("intermediate_data") or {} + return data.get("tool_uses") or [] + + +def fake_agent(prompt: str, query: str) -> dict[str, Any]: + """Deterministic stand-in for a real agent, keyed off prompt feature flags. 
+ + The fake reads two scripted capability flags from the prompt + (``USE_UAPI_TOOLS`` / ``AGGRESSIVE_SEARCH``) so the optimization candidate + produces observable behavior changes without any model call. + + Args: + prompt: The system prompt (baseline or candidate) currently under test. + query: The user query for this case. + + Returns: + A dict with ``text`` (final response) and ``tools`` (tool-call list). + """ + uses_uapi = "USE_UAPI_TOOLS" in prompt + aggressive_search = "AGGRESSIVE_SEARCH" in prompt + + if "公网 IP" in query: + if uses_uapi: + return {"text": "你的公网 IP 是 203.0.113.10。", "tools": [{"name": "get_my_public_ip", "args": {"source": "commercial"}}]} + return {"text": "我无法确定你的公网 IP。", "tools": []} + + if "2026-10-01" in query: + if uses_uapi: + return {"text": "2026-10-01 是法定休息日。", "tools": [{"name": "query_holiday_calendar", "args": {"date": "2026-10-01", "holiday_type": "legal"}}]} + return {"text": "这天大概率是节日,但我没有查询。", "tools": []} + + if "只返回 JSON" in query: + return {"text": "status ok", "tools": []} + + if "Go 最新版本" in query: + if uses_uapi: + return {"text": "Go 1.26 是当前查询到的最新版本。", "tools": [{"name": "uapi_search", "args": {"query": "Go 最新版本"}}]} + return {"text": "Go 有新版本,但我没有足够信息确认。", "tools": [{"name": "websearch", "args": {"query": "Go 最新版本"}}]} + + if query == "在吗": + if aggressive_search: + return {"text": "我查了一下网页:在。", "tools": [{"name": "uapi_search", "args": {"query": "在吗"}}]} + return {"text": "在。", "tools": []} + + if "北京天气" in query: + if aggressive_search: + return { + "text": "北京今天有天气信息。", + "tools": [ + {"name": "get_current_weather", "args": {"city": "北京"}}, + {"name": "uapi_search", "args": {"query": "北京天气"}}, + ], + } + return {"text": "北京今天有天气信息。", "tools": [{"name": "get_current_weather", "args": {"city": "北京"}}]} + + return {"text": "收到。", "tools": []} + + +def normalize_tools(tools: list[dict[str, Any]]) -> list[dict[str, Any]]: + """Project tool calls to ``{name, args}`` for order-sensitive comparison.""" + return 
[{"name": tool.get("name"), "args": tool.get("args", {})} for tool in tools] + + +def rubric_score(meta: dict[str, Any], actual: dict[str, Any]) -> float: + """Score the rubric dimension a case declares in ``case_meta.json``. + + The rubric kind is data driven (not inferred from the case id): + + * ``json_format`` -> 1.0 if the reply is a JSON object, else 0.0 + * ``no_tool`` -> 1.0 if no tool was called, else 0.0 + * ``single_tool`` -> 0.5 if more than one tool was called, else 1.0 + * ``none`` / unset -> 1.0 + + Args: + meta: The case metadata entry. + actual: The agent output (``text`` + ``tools``). + + Returns: + A rubric score in ``[0, 1]``. + """ + kind = meta.get("rubric", "none") + if kind == "json_format": + return 1.0 if actual["text"].strip().startswith("{") else 0.0 + if kind == "no_tool": + return 0.0 if actual["tools"] else 1.0 + if kind == "single_tool": + return 0.5 if len(actual["tools"]) > 1 else 1.0 + return 1.0 + + +def classify_tool_failure( + actual: list[dict[str, Any]], + expected: list[dict[str, Any]], + tool_intent: str, +) -> str | None: + """Attribute a tool-trajectory mismatch to a specific failure category. + + Classification uses only the structured trajectory (expected vs actual tool + calls) plus the case's declared ``tool_intent`` — never the case id: + + * ``knowledge_recall_insufficient`` — the case relies on the + authoritative search tool but the agent did not call it. + * ``spurious_tool_call`` — the agent issued every expected call *and* + extra ones (over-calling), including calling tools when none were + expected. + * ``parameter_error`` — the leading tool name matches but arguments + differ. + * ``tool_call_error`` — a wrong or missing tool otherwise. + + Args: + actual: Actual tool-call list. + expected: Expected tool-call list. + tool_intent: Attribution hint from ``case_meta.json``. + + Returns: + A failure category, or ``None`` when trajectories match. 
+ """ + a, e = normalize_tools(actual), normalize_tools(expected) + if a == e: + return None + a_names = {tool["name"] for tool in a} + e_names = {tool["name"] for tool in e} + if tool_intent == "authoritative_search" and AUTHORITATIVE_SEARCH_TOOL not in a_names: + return "knowledge_recall_insufficient" + if a and all(tool in a for tool in e) and len(a) > len(e): + return "spurious_tool_call" + if a and e and a[0]["name"] == e[0]["name"]: + return "parameter_error" + return "tool_call_error" + + +def classify_rubric_failure(meta: dict[str, Any]) -> str: + """Map a failed rubric dimension to its attribution category.""" + if meta.get("rubric") == "json_format": + return "format_error" + return "llm_rubric_not_met" + + +def failure_types_for( + meta: dict[str, Any], + final_score: float, + tool_score: float, + rubric: float, + actual: dict[str, Any], + expected: list[dict[str, Any]], +) -> list[str]: + """Collect all failure categories for a case from its sub-scores. + + Args: + meta: Case metadata (``rubric`` / ``tool_intent``). + final_score: Final-response sub-score. + tool_score: Tool-trajectory sub-score. + rubric: Rubric sub-score. + actual: Agent output. + expected: Expected tool calls. + + Returns: + An ordered, de-duplicated list of failure category labels. + """ + failures: list[str] = [] + if final_score < 1.0: + failures.append("final_response_mismatch") + if tool_score < 1.0: + label = classify_tool_failure(actual["tools"], expected, meta.get("tool_intent", "none")) + if label: + failures.append(label) + if rubric < 1.0: + failures.append(classify_rubric_failure(meta)) + return failures + + +def score_case( + case: dict[str, Any], + prompt: str, + cfg: dict[str, Any], + case_meta: dict[str, Any], +) -> CaseResult: + """Evaluate one case against a prompt and return its scored result. + + Args: + case: An ``eval_cases`` entry from the evalset. + prompt: The system prompt under test. + cfg: The optimizer config (metric weights, gate thresholds). 
+ case_meta: Mapping of ``eval_id`` to per-case metadata. + + Returns: + A :class:`CaseResult` with metrics, attribution and trace. + """ + invocation = case["conversation"][0] + query = invocation_text(invocation, "user_content") + expected_text = invocation_text(invocation, "final_response") + expected = expected_tools(invocation) + meta = case_meta.get(case["eval_id"], {}) + actual = fake_agent(prompt, query) + + final_score = 1.0 if expected_text.lower() in actual["text"].lower() else 0.0 + tool_score = 1.0 if normalize_tools(actual["tools"]) == normalize_tools(expected) else 0.0 + rubric = rubric_score(meta, actual) + weights = {m["name"]: m["weight"] for m in cfg["evaluate"]["metrics"]} + score = round( + final_score * weights["final_response"] + + tool_score * weights["tool_trajectory"] + + rubric * weights["rubric"], + 4, + ) + passed = score >= 0.8 + failure_types = failure_types_for(meta, final_score, tool_score, rubric, actual, expected) + hard_fail = score < cfg["gate"]["hard_fail_threshold"] + return CaseResult( + case_id=case["eval_id"], + score=score, + passed=passed, + hard_fail=hard_fail, + key=bool(meta.get("key", False)), + metrics={ + "final_response": final_score, + "tool_trajectory": tool_score, + "rubric": rubric, + }, + failure_types=failure_types, + reason="pass" if passed else "; ".join(failure_types), + trace={ + "query": query, + "expected_text": expected_text, + "actual_text": actual["text"], + "expected_tools": expected, + "actual_tools": actual["tools"], + }, + ) + + +def evaluate_evalset( + evalset: dict[str, Any], + prompt: str, + cfg: dict[str, Any], + case_meta: dict[str, Any], +) -> dict[str, Any]: + """Score every case in an evalset and aggregate mean score and pass rate.""" + cases = [score_case(case, prompt, cfg, case_meta) for case in evalset["eval_cases"]] + mean = round(sum(case.score for case in cases) / len(cases), 4) + return { + "eval_set_id": evalset["eval_set_id"], + "mean_score": mean, + "pass_rate": 
round(sum(case.passed for case in cases) / len(cases), 4), + "cases": {case.case_id: case.__dict__ for case in cases}, + } + + +def attribute_failures(*results: dict[str, Any]) -> dict[str, Any]: + """Cluster baseline failures into category counts and a per-case breakdown. + + Args: + *results: One or more evaluated evalsets (baseline train / val). + + Returns: + A dict with ``counts`` (category -> frequency) and ``by_case`` + (case id -> failure categories), covering only failing cases. + """ + counts: Counter[str] = Counter() + by_case: dict[str, list[str]] = {} + for result in results: + for case_id, case in result["cases"].items(): + if case["passed"]: + continue + failure_types = case["failure_types"] or ["unknown"] + by_case[case_id] = failure_types + counts.update(failure_types) + return {"counts": dict(counts), "by_case": by_case} + + +def optimize_prompt(baseline: str, cfg: dict[str, Any], run_dir: Path) -> tuple[str, dict[str, Any]]: + """Produce a candidate prompt for the current (fake/trace) mode. + + In fake/trace mode this applies the deterministic ``candidate_patch`` from + the config instead of invoking :class:`AgentOptimizer`, so the example stays + reproducible without an API key. The returned status distinguishes optimizer + *availability* from *invocation* to avoid implying a real search happened. + + Args: + baseline: The baseline prompt text. + cfg: The optimizer config. + run_dir: Directory for candidate prompt snapshots. + + Returns: + A tuple of ``(candidate_text, status_dict)``. 
+ """ + candidate = baseline.rstrip() + "\n" + "\n".join(cfg["optimize"]["candidate_patch"]) + "\n" + candidate_path = run_dir / "candidate_prompt.md" + candidate_path.write_text(candidate, encoding="utf-8") + return candidate, { + "status": "SCRIPTED_CANDIDATE", + "algorithm": cfg["optimize"]["algorithm"], + "agent_optimizer_available": AgentOptimizer is not None and TargetPrompt is not None, + "agent_optimizer_invoked": False, + "note": "fake/trace mode applies a deterministic patch; see examples/optimization/quickstart for a live GEPA run.", + "candidate_prompt_path": candidate_path.relative_to(HERE).as_posix(), + "cost_usd": 0.0, + "tokens": 0, + } + + +def diff_cases(baseline: dict[str, Any], candidate: dict[str, Any]) -> dict[str, Any]: + """Compute per-case deltas (new_pass / new_fail / score_up / score_down / same).""" + delta = {} + for case_id, cand in candidate["cases"].items(): + base = baseline["cases"][case_id] + if not base["passed"] and cand["passed"]: + kind = "new_pass" + elif base["passed"] and not cand["passed"]: + kind = "new_fail" + elif cand["score"] > base["score"]: + kind = "score_up" + elif cand["score"] < base["score"]: + kind = "score_down" + else: + kind = "same" + delta[case_id] = { + "kind": kind, + "baseline_score": base["score"], + "candidate_score": cand["score"], + "delta": round(cand["score"] - base["score"], 4), + "baseline_passed": base["passed"], + "candidate_passed": cand["passed"], + } + return delta + + +def gate_decision( + baseline_train: dict[str, Any], + candidate_train: dict[str, Any], + baseline_val: dict[str, Any], + candidate_val: dict[str, Any], + val_delta: dict[str, Any], + cfg: dict[str, Any], + cost_usd: float, +) -> dict[str, Any]: + """Run the validation-first acceptance gate and return its decision. + + Five independent, configurable checks must all pass to ACCEPT: + + 1. validation mean-score gain meets ``min_val_score_gain``; + 2. no new hard failure appears on validation; + 3. 
no *key* validation case regresses (new_fail / score_down); + 4. not overfitting (train up while validation down); + 5. optimization cost stays within ``max_cost_usd``. + + Args: + baseline_train: Baseline train evaluation. + candidate_train: Candidate train evaluation. + baseline_val: Baseline validation evaluation. + candidate_val: Candidate validation evaluation. + val_delta: Per-case validation deltas from :func:`diff_cases`. + cfg: Optimizer config (the ``gate`` block). + cost_usd: Optimization cost in USD. + + Returns: + A decision dict with ``accepted`` / ``decision`` / ``reason`` and the + per-check breakdown. + """ + gate = cfg["gate"] + train_gain = round(candidate_train["mean_score"] - baseline_train["mean_score"], 4) + val_gain = round(candidate_val["mean_score"] - baseline_val["mean_score"], 4) + new_hard_fails = [ + case_id + for case_id, case in candidate_val["cases"].items() + if case["hard_fail"] and not baseline_val["cases"][case_id]["hard_fail"] + ] + # A "critical" regression is one on a case explicitly marked key=true in + # case_meta.json, not merely any validation case. 
+ critical_regressions = [ + case_id + for case_id, diff in val_delta.items() + if candidate_val["cases"][case_id]["key"] and diff["kind"] in {"new_fail", "score_down"} + ] + checks = [ + { + "name": "validation_gain_threshold", + "passed": val_gain >= gate["min_val_score_gain"], + "detail": f"val_gain={val_gain:+.4f}, required>={gate['min_val_score_gain']:+.4f}", + }, + { + "name": "no_new_hard_fail", + "passed": not (gate["reject_on_new_hard_fail"] and new_hard_fails), + "detail": f"new_hard_fails={new_hard_fails}", + }, + { + "name": "no_critical_regression", + "passed": not (gate["reject_on_critical_regression"] and critical_regressions), + "detail": f"critical_regressions={critical_regressions}", + }, + { + "name": "not_overfit_train_up_val_down", + "passed": not (gate["reject_overfit_train_up_val_down"] and train_gain > 0 and val_gain < 0), + "detail": f"train_gain={train_gain:+.4f}, val_gain={val_gain:+.4f}", + }, + { + "name": "cost_budget", + "passed": cost_usd <= gate["max_cost_usd"], + "detail": f"cost_usd={cost_usd:.4f}, budget={gate['max_cost_usd']:.4f}", + }, + ] + accepted = all(check["passed"] for check in checks) + return { + "accepted": accepted, + "decision": "ACCEPT" if accepted else "REJECT", + "reason": "all gates passed" if accepted else "; ".join(check["name"] for check in checks if not check["passed"]), + "train_gain": train_gain, + "val_gain": val_gain, + "checks": checks, + } + + +def narrate_report(report: dict[str, Any]) -> str: + """Render a data-driven, plain-language Chinese summary of the run. + + The narrative is derived entirely from the gate decision, validation deltas + and failure attribution, so it stays correct for any input (unlike a static + paragraph). It is deterministic and needs no model, keeping the no-key path + reproducible. + + Args: + report: The fully assembled report dict. + + Returns: + A short multi-sentence Chinese summary. 
+ """ + gate = report["gate"] + verb = "接受" if gate["decision"] == "ACCEPT" else "拒绝" + parts = [ + f"本次({report['run']['mode']} 模式)决定**{verb}**候选 prompt。" + f"训练集均分 {report['baseline']['train']['mean_score']}→{report['candidate']['train']['mean_score']}" + f"({gate['train_gain']:+.4f}),验证集 {report['baseline']['val']['mean_score']}→" + f"{report['candidate']['val']['mean_score']}({gate['val_gain']:+.4f})。" + ] + if gate["train_gain"] > 0 and gate["val_gain"] < 0: + parts.append("训练涨但验证跌,呈现过拟合特征。") + + new_pass = [cid for cid, d in report["delta"]["val"].items() if d["kind"] == "new_pass"] + new_fail = [cid for cid, d in report["delta"]["val"].items() if d["kind"] == "new_fail"] + if new_pass: + parts.append(f"验证集新增通过:{'、'.join(new_pass)}。") + if new_fail: + parts.append(f"⚠️ 验证集新增失败:{'、'.join(new_fail)}。") + + failed_checks = [c["name"] for c in gate["checks"] if not c["passed"]] + if gate["decision"] == "REJECT" and failed_checks: + parts.append("被以下 gate 拦截:" + "、".join(failed_checks) + "。") + elif gate["decision"] == "ACCEPT": + parts.append("五项 gate 全部通过:验证集提升达标、无过拟合、关键 case 未退化、无新增 hard fail、成本在预算内。") + + counts = report["failure_attribution"]["counts"] + if counts: + top = "、".join(f"{k}×{v}" for k, v in counts.items()) + parts.append(f"baseline 失败归因:{top}。") + return "".join(parts) + + +def build_report( + cfg: dict[str, Any], + baseline_prompt: str, + candidate_prompt: str, + artifacts: dict[str, Any], + sdk_smoke: dict[str, Any], +) -> dict[str, Any]: + """Assemble the full audit report from run metadata and computed artifacts.""" + return { + "run": { + "timestamp": datetime.now(timezone.utc).isoformat(), + "mode": cfg["mode"], + "seed": cfg["seed"], + "sdk_bridge": { + "evalset_validated_with_trpc_sdk": EvalSet is not None, + "agent_evaluator_available": AgentEvaluator is not None, + "agent_optimizer_available": AgentOptimizer is not None, + "agent_evaluator_trace_smoke": sdk_smoke, + }, + "repro": { + "train_evalset": "train.evalset.json", + 
"val_evalset": "val.evalset.json", + "case_meta": "case_meta.json", + "optimizer_config": "optimizer.json", + "prompt_source": cfg["target_prompt"]["path"], + }, + }, + "prompt_audit": { + "target": cfg["target_prompt"], + "baseline_sha256": sha256_text(baseline_prompt), + "candidate_sha256": sha256_text(candidate_prompt), + "baseline_snapshot": "runs/latest/baseline_prompt.md", + "candidate_snapshot": "runs/latest/candidate_prompt.md", + }, + **artifacts, + } + + +def write_markdown(report: dict[str, Any], path: Path) -> None: + """Write the human-readable Markdown report, including the data-driven summary.""" + gate = report["gate"] + lines = [ + "# Optimization Report", + "", + "## 人话总结", + "", + narrate_report(report), + "", + f"- Mode: `{report['run']['mode']}`", + f"- Decision: **{gate['decision']}**", + f"- Reason: {gate['reason']}", + f"- Baseline train score: {report['baseline']['train']['mean_score']}", + f"- Candidate train score: {report['candidate']['train']['mean_score']}", + f"- Baseline val score: {report['baseline']['val']['mean_score']}", + f"- Candidate val score: {report['candidate']['val']['mean_score']}", + f"- Train gain: {gate['train_gain']:+.4f}", + f"- Val gain: {gate['val_gain']:+.4f}", + "", + "## Failure Attribution", + "", + ] + for name, count in report["failure_attribution"]["counts"].items(): + lines.append(f"- {name}: {count}") + lines.extend(["", "## Validation Delta", ""]) + for case_id, diff in report["delta"]["val"].items(): + lines.append(f"- `{case_id}`: {diff['kind']} ({diff['baseline_score']} -> {diff['candidate_score']}, delta {diff['delta']:+.4f})") + lines.extend(["", "## Gate Checks", ""]) + for check in gate["checks"]: + mark = "PASS" if check["passed"] else "FAIL" + lines.append(f"- {mark} `{check['name']}`: {check['detail']}") + path.write_text("\n".join(lines) + "\n", encoding="utf-8") + + +def main() -> None: + """Run the full evaluation + optimization loop and persist the audit report.""" + parser = 
argparse.ArgumentParser(description=__doc__) + parser.add_argument("--train", default="train.evalset.json") + parser.add_argument("--val", default="val.evalset.json") + parser.add_argument("--optimizer", default="optimizer.json") + parser.add_argument("--prompt", default="prompts/baseline_system.md") + args = parser.parse_args() + + logging.basicConfig( + level=os.environ.get("YUN_LOG_LEVEL", "INFO"), + format="%(asctime)s %(levelname)s %(name)s | %(message)s", + ) + + start = time.perf_counter() + cfg = load_json(HERE / args.optimizer) + train = validate_evalset(HERE / args.train) + val = validate_evalset(HERE / args.val) + case_meta = {k: v for k, v in load_json(HERE / cfg.get("case_meta", "case_meta.json")).items() if not k.startswith("_")} + logger.info("加载完成 mode=%s seed=%s train_cases=%d val_cases=%d", + cfg["mode"], cfg["seed"], len(train["eval_cases"]), len(val["eval_cases"])) + + sdk_smoke = sdk_trace_smoke(HERE / args.train) + logger.info("SDK trace 冒烟: %s", sdk_smoke["status"]) + baseline_prompt = (HERE / args.prompt).read_text(encoding="utf-8") + + run_dir = HERE / "runs" / "latest" + run_dir.mkdir(parents=True, exist_ok=True) + (run_dir / "baseline_prompt.md").write_text(baseline_prompt, encoding="utf-8") + + # Phase 1 — baseline evaluation. + baseline_train = evaluate_evalset(train, baseline_prompt, cfg, case_meta) + baseline_val = evaluate_evalset(val, baseline_prompt, cfg, case_meta) + logger.info("baseline 均分 train=%.4f val=%.4f", baseline_train["mean_score"], baseline_val["mean_score"]) + + # Phase 2 — failure attribution over baseline failures only. + failures = attribute_failures(baseline_train, baseline_val) + logger.info("baseline 失败归因: %s", failures["counts"]) + + # Phase 3 — optimization (scripted candidate in fake/trace mode). 
+ candidate_prompt, opt_status = optimize_prompt(baseline_prompt, cfg, run_dir) + logger.info("优化器: status=%s invoked=%s", opt_status["status"], opt_status["agent_optimizer_invoked"]) + + # Phase 4 — candidate validation + diff. + candidate_train = evaluate_evalset(train, candidate_prompt, cfg, case_meta) + candidate_val = evaluate_evalset(val, candidate_prompt, cfg, case_meta) + logger.info("candidate 均分 train=%.4f val=%.4f", candidate_train["mean_score"], candidate_val["mean_score"]) + train_delta = diff_cases(baseline_train, candidate_train) + val_delta = diff_cases(baseline_val, candidate_val) + + # Phase 5 — acceptance gate. + gate = gate_decision(baseline_train, candidate_train, baseline_val, candidate_val, + val_delta, cfg, opt_status["cost_usd"]) + for check in gate["checks"]: + logger.info("gate %-30s %s | %s", check["name"], "PASS" if check["passed"] else "FAIL", check["detail"]) + logger.info("gate 决策: %s (%s)", gate["decision"], gate["reason"]) + duration = round(time.perf_counter() - start, 4) + + # Phase 6 — audit persistence. 
+ report = build_report( + cfg, + baseline_prompt, + candidate_prompt, + { + "baseline": {"train": baseline_train, "val": baseline_val}, + "candidate": {"train": candidate_train, "val": candidate_val}, + "delta": {"train": train_delta, "val": val_delta}, + "failure_attribution": failures, + "optimizer": opt_status, + "gate": gate, + "audit": { + "duration_seconds": duration, + "cost_usd": opt_status["cost_usd"], + "tokens": opt_status["tokens"], + "config_snapshot": cfg, + }, + }, + sdk_smoke, + ) + write_json(HERE / "optimization_report.json", report) + write_markdown(report, HERE / "optimization_report.md") + logger.info("已写出 optimization_report.json / .md,用时 %.4fs", duration) + print(f"{gate['decision']}: {gate['reason']}") + print("wrote optimization_report.json and optimization_report.md") + + +if __name__ == "__main__": + main() diff --git a/examples/optimization/eval_optimize_loop/runs/latest/baseline_prompt.md b/examples/optimization/eval_optimize_loop/runs/latest/baseline_prompt.md new file mode 100644 index 0000000..b462b93 --- /dev/null +++ b/examples/optimization/eval_optimize_loop/runs/latest/baseline_prompt.md @@ -0,0 +1,7 @@ +You are a concise life assistant. + +Rules: +- Answer directly when possible. +- Use the weather tool for weather questions. +- Do not invent data. +- Keep responses short. diff --git a/examples/optimization/eval_optimize_loop/runs/latest/candidate_prompt.md b/examples/optimization/eval_optimize_loop/runs/latest/candidate_prompt.md new file mode 100644 index 0000000..ec588a4 --- /dev/null +++ b/examples/optimization/eval_optimize_loop/runs/latest/candidate_prompt.md @@ -0,0 +1,11 @@ +You are a concise life assistant. + +Rules: +- Answer directly when possible. +- Use the weather tool for weather questions. +- Do not invent data. +- Keep responses short. + +Optimization candidate: +- USE_UAPI_TOOLS: use get_my_public_ip, uapi_search, and query_holiday_calendar for IP/search/calendar questions. 
+- AGGRESSIVE_SEARCH: prefer uapi_search whenever a query looks underspecified. diff --git a/examples/optimization/eval_optimize_loop/train.evalset.json b/examples/optimization/eval_optimize_loop/train.evalset.json new file mode 100644 index 0000000..84564db --- /dev/null +++ b/examples/optimization/eval_optimize_loop/train.evalset.json @@ -0,0 +1,75 @@ +{ + "eval_set_id": "eval_optimize_loop_train", + "name": "Evaluation optimization loop train set", + "description": "Three trace-mode training cases: two optimizable failures and one intentionally ineffective optimization case.", + "eval_cases": [ + { + "eval_id": "train_ip_lookup_optimizable", + "eval_mode": "trace", + "actual_conversation": [ + { + "invocation_id": "actual-1", + "user_content": {"role": "user", "parts": [{"text": "查询我的公网 IP"}]}, + "final_response": {"role": "model", "parts": [{"text": "我无法确定你的公网 IP。"}]}, + "intermediate_data": {"tool_uses": []} + } + ], + "conversation": [ + { + "invocation_id": "expected-1", + "user_content": {"role": "user", "parts": [{"text": "查询我的公网 IP"}]}, + "final_response": {"role": "model", "parts": [{"text": "203.0.113.10"}]}, + "intermediate_data": { + "tool_uses": [ + {"id": "tool-1", "name": "get_my_public_ip", "args": {"source": "commercial"}} + ] + } + } + ] + }, + { + "eval_id": "train_calendar_optimizable", + "eval_mode": "trace", + "actual_conversation": [ + { + "invocation_id": "actual-2", + "user_content": {"role": "user", "parts": [{"text": "2026-10-01 是不是休息日?"}]}, + "final_response": {"role": "model", "parts": [{"text": "这天大概率是节日,但我没有查询。"}]}, + "intermediate_data": {"tool_uses": []} + } + ], + "conversation": [ + { + "invocation_id": "expected-2", + "user_content": {"role": "user", "parts": [{"text": "2026-10-01 是不是休息日?"}]}, + "final_response": {"role": "model", "parts": [{"text": "休息日"}]}, + "intermediate_data": { + "tool_uses": [ + {"id": "tool-1", "name": "query_holiday_calendar", "args": {"date": "2026-10-01", "holiday_type": "legal"}} + ] + } + } + ] + 
}, + { + "eval_id": "train_strict_json_ineffective", + "eval_mode": "trace", + "actual_conversation": [ + { + "invocation_id": "actual-3", + "user_content": {"role": "user", "parts": [{"text": "只返回 JSON:status ok"}]}, + "final_response": {"role": "model", "parts": [{"text": "status ok"}]}, + "intermediate_data": {"tool_uses": []} + } + ], + "conversation": [ + { + "invocation_id": "expected-3", + "user_content": {"role": "user", "parts": [{"text": "只返回 JSON:status ok"}]}, + "final_response": {"role": "model", "parts": [{"text": "{\"status\":\"ok\"}"}]}, + "intermediate_data": {"tool_uses": []} + } + ] + } + ] +} diff --git a/examples/optimization/eval_optimize_loop/val.evalset.json b/examples/optimization/eval_optimize_loop/val.evalset.json new file mode 100644 index 0000000..4b2787a --- /dev/null +++ b/examples/optimization/eval_optimize_loop/val.evalset.json @@ -0,0 +1,83 @@ +{ + "eval_set_id": "eval_optimize_loop_val", + "name": "Evaluation optimization loop validation set", + "description": "Three trace-mode validation cases: one new pass, one hard regression, and one soft degradation.", + "eval_cases": [ + { + "eval_id": "val_search_fallback_new_pass", + "eval_mode": "trace", + "actual_conversation": [ + { + "invocation_id": "actual-1", + "user_content": {"role": "user", "parts": [{"text": "默认搜索不够时,查一下 Go 最新版本"}]}, + "final_response": {"role": "model", "parts": [{"text": "Go 有新版本,但我没有足够信息确认。"}]}, + "intermediate_data": { + "tool_uses": [ + {"id": "tool-1", "name": "websearch", "args": {"query": "Go 最新版本"}} + ] + } + } + ], + "conversation": [ + { + "invocation_id": "expected-1", + "user_content": {"role": "user", "parts": [{"text": "默认搜索不够时,查一下 Go 最新版本"}]}, + "final_response": {"role": "model", "parts": [{"text": "Go 1.26"}]}, + "intermediate_data": { + "tool_uses": [ + {"id": "tool-1", "name": "uapi_search", "args": {"query": "Go 最新版本"}} + ] + } + } + ] + }, + { + "eval_id": "val_smalltalk_no_tool_regression", + "eval_mode": "trace", + "actual_conversation": 
[ + { + "invocation_id": "actual-2", + "user_content": {"role": "user", "parts": [{"text": "在吗"}]}, + "final_response": {"role": "model", "parts": [{"text": "在。"}]}, + "intermediate_data": {"tool_uses": []} + } + ], + "conversation": [ + { + "invocation_id": "expected-2", + "user_content": {"role": "user", "parts": [{"text": "在吗"}]}, + "final_response": {"role": "model", "parts": [{"text": "在"}]}, + "intermediate_data": {"tool_uses": []} + } + ] + }, + { + "eval_id": "val_weather_soft_degradation", + "eval_mode": "trace", + "actual_conversation": [ + { + "invocation_id": "actual-3", + "user_content": {"role": "user", "parts": [{"text": "北京天气怎么样?"}]}, + "final_response": {"role": "model", "parts": [{"text": "北京今天有天气信息。"}]}, + "intermediate_data": { + "tool_uses": [ + {"id": "tool-1", "name": "get_current_weather", "args": {"city": "北京"}} + ] + } + } + ], + "conversation": [ + { + "invocation_id": "expected-3", + "user_content": {"role": "user", "parts": [{"text": "北京天气怎么样?"}]}, + "final_response": {"role": "model", "parts": [{"text": "北京"}]}, + "intermediate_data": { + "tool_uses": [ + {"id": "tool-1", "name": "get_current_weather", "args": {"city": "北京"}} + ] + } + } + ] + } + ] +} From 781cb4a388b90250bc909e8ee128210a5dd65f87 Mon Sep 17 00:00:00 2001 From: Abel Song <2730343900@qq.com> Date: Tue, 30 Jun 2026 23:03:58 +0800 Subject: [PATCH 2/2] feat(optimization): add reproducible eval optimize loop --- .../eval_optimize_loop/.gitignore | 1 + .../optimization/eval_optimize_loop/README.md | 139 ++- .../eval_optimize_loop/agent/__init__.py | 1 + .../eval_optimize_loop/agent/agent.py | 142 +++ .../eval_optimize_loop/case_meta.json | 40 +- .../optimization_report.json | 291 +++--- .../eval_optimize_loop/optimization_report.md | 42 +- .../eval_optimize_loop/optimizer.json | 28 +- .../eval_optimize_loop/optimizer.sdk.json | 70 ++ .../prompts/baseline_system.md | 7 - .../eval_optimize_loop/prompts/system.md | 6 + .../optimization/eval_optimize_loop/run.py | 955 
+++++++++--------- .../runs/latest/baseline_prompt.md | 7 +- .../runs/latest/candidate_prompt.md | 11 +- .../eval_optimize_loop/train.evalset.json | 32 +- .../eval_optimize_loop/val.evalset.json | 40 +- 16 files changed, 1058 insertions(+), 754 deletions(-) create mode 100644 examples/optimization/eval_optimize_loop/agent/__init__.py create mode 100644 examples/optimization/eval_optimize_loop/agent/agent.py create mode 100644 examples/optimization/eval_optimize_loop/optimizer.sdk.json delete mode 100644 examples/optimization/eval_optimize_loop/prompts/baseline_system.md create mode 100644 examples/optimization/eval_optimize_loop/prompts/system.md diff --git a/examples/optimization/eval_optimize_loop/.gitignore b/examples/optimization/eval_optimize_loop/.gitignore index 3e35535..ed5fe59 100644 --- a/examples/optimization/eval_optimize_loop/.gitignore +++ b/examples/optimization/eval_optimize_loop/.gitignore @@ -2,3 +2,4 @@ # runs/latest prompt snapshots are kept in VCS as example deliverables. __pycache__/ _sdk_eval_metrics.json +runs/latest/agent_optimizer/ diff --git a/examples/optimization/eval_optimize_loop/README.md b/examples/optimization/eval_optimize_loop/README.md index 1c69e29..fc06299 100644 --- a/examples/optimization/eval_optimize_loop/README.md +++ b/examples/optimization/eval_optimize_loop/README.md @@ -1,71 +1,110 @@ -# eval_optimize_loop +# Evaluation + Optimization Loop -> Part of the `examples/optimization` series. Where [`quickstart`](../quickstart) -> drives a live `AgentOptimizer` (GEPA) against a real agent, this example focuses -> on the **closed loop around** optimization — baseline evaluation, failure -> attribution, candidate validation, an acceptance gate, and an auditable report — -> and runs fully reproducibly in fake/trace mode **without an API key**. +## 1. 
Purpose -This example implements a reproducible Evaluation + Optimization closed loop: +This example implements the issue requirement for a reproducible Evaluation + Optimization pipeline. It is not only an `AgentOptimizer` quickstart: it wraps optimization with baseline evaluation, failure attribution, validation regression, gate decisions, and audit artifacts. + +The default `fake` mode runs without model credentials. The `live` mode uses a real `LlmAgent` bridge and invokes `AgentOptimizer.optimize` against a `TargetPrompt`. + +## 2. Pipeline Stages + +The pipeline runs six stages: + +1. Baseline evaluation: score train and validation sets separately, including metric scores, pass/fail, reasons, and key trace fields. +2. Failure attribution: cluster failures into `final_response_mismatch`, `tool_call_error`, `parameter_error`, `llm_rubric_not_met`, `knowledge_recall_insufficient`, and `format_error`. +3. Optimization execution: fake mode applies a deterministic candidate; live mode calls `AgentOptimizer.optimize` with `TargetPrompt.add_path("system_prompt", ...)`. +4. Candidate validation: rerun train and validation sets and compute per-case deltas such as `new_pass`, `new_fail`, `score_up`, and `score_down`. +5. Acceptance gate: require validation gain, no new hard fail, no key-case regression, no train-up/validation-down overfit, and cost within budget. +6. Audit persistence: write prompt snapshots, scores, deltas, gate reasons, cost, duration, seed, and config snapshots. + +## 3. 
Directory Layout ```text -baseline evaluation - -> failure attribution - -> prompt candidate generation - -> validation regression - -> acceptance gate - -> auditable JSON/Markdown report +examples/optimization/eval_optimize_loop/ +├── agent/ +│ ├── __init__.py +│ └── agent.py +├── prompts/ +│ └── system.md +├── train.evalset.json +├── val.evalset.json +├── case_meta.json +├── optimizer.json +├── optimizer.sdk.json +├── run.py +├── optimization_report.json +└── optimization_report.md ``` -Run without API keys: +## 4. Inputs + +- `train.evalset.json`: training evaluation set. +- `val.evalset.json`: validation evaluation set; it must be a different file from train. +- `optimizer.json`: outer-loop configuration for mode, metrics, fake candidate patch, and gate thresholds. +- `prompts/system.md`: baseline prompt source registered as the optimization target. +- `case_meta.json`: out-of-schema metadata for key cases, rubric kinds, and attribution hints. +- `optimizer.sdk.json`: live-only SDK optimizer config passed to `AgentOptimizer.optimize`. + +## 5. Outputs + +- `optimization_report.json`: machine-readable audit report with baseline, candidate, delta, gate, attribution, optimizer status, cost, duration, seed, and config snapshot. +- `optimization_report.md`: human-readable decision summary. +- `runs/latest/baseline_prompt.md`: exact baseline prompt snapshot. +- `runs/latest/candidate_prompt.md`: candidate prompt snapshot. +- `runs/latest/agent_optimizer/`: live-only raw SDK artifacts, including `RoundRecord`-backed round files, `result.json`, `summary.txt`, and `best_prompts/`. + +## 6. Run Modes + +Fake mode: ```bash -# First run may spend time on a one-off `uv sync`; the loop itself is ~seconds. -uv run python examples/optimization/eval_optimize_loop/run.py +python examples/optimization/eval_optimize_loop/run.py --mode fake ``` -Set `YUN_LOG_LEVEL=DEBUG` for more verbose logs (default `INFO`). +Live mode: -Inputs: +```bash +export TRPC_AGENT_API_KEY=...
+export TRPC_AGENT_BASE_URL=... +export TRPC_AGENT_MODEL_NAME=... +python examples/optimization/eval_optimize_loop/run.py --mode live +``` + +`fake` mode uses a deterministic fake model, fake judge, and scripted candidate so the full loop runs without API keys. `live` mode uses `agent/agent.py`, creates a fresh `LlmAgent` for each call, and invokes `AgentOptimizer.optimize`. + +## 7. Customizing The Agent + +Edit `agent/agent.py` when connecting a real business agent. + +Key constraints: -- `prompts/baseline_system.md` — target prompt being optimized. -- `train.evalset.json` / `val.evalset.json` — SDK-clean evalsets (trace mode). -- `case_meta.json` — per-case `key` / `rubric` / `tool_intent` (kept out of the - evalset so `EvalSet` stays schema-clean). -- `optimizer.json` — metric weights, scripted candidate patch, and gate thresholds. +- `make_call_agent(prompt_path)` must return an async function with the exact optimizer contract `async (query: str) -> str`. +- `create_agent(prompt_path)` must re-read the prompt file every time so candidates written by `AgentOptimizer` take effect immediately. +- `TargetPrompt.add_path("system_prompt", path)` must point to the same prompt file that the agent actually reads. +- For HTTP, CLI, remote config, or multi-agent pipelines, keep the outer contract the same and replace only the bridge implementation. -Outputs: +The outer report still computes richer trace-style scoring. The SDK optimizer itself receives final-text responses through `call_agent`, so `optimizer.sdk.json` intentionally avoids metrics that require full session traces. -- `optimization_report.json` / `optimization_report.md` -- `runs/latest/baseline_prompt.md` / `runs/latest/candidate_prompt.md` +## 8. Design And Validation -The sample has 6 cases: +Failure attribution is rule-based over structured signals, not case ids. Each case records final response, tool trajectory, rubric sub-scores, and expected/actual tool calls.
Rubric failures map to `format_error` or `llm_rubric_not_met`; tool mismatches map to tool, parameter, spurious-call, or knowledge-recall categories. -- train: two optimizable failures and one optimization-ineffective format case. -- validation: one new pass, one hard regression, and one soft degradation. +The gate is validation-first. A candidate is accepted only if validation mean improves by the configured threshold, no new hard fail appears, key validation cases do not regress, train improvement does not coincide with validation loss, and cost is within budget. -## 设计说明(四支柱) +The bundled fake candidate intentionally improves two train cases and one validation case while damaging two key validation cases. The expected sample decision is `REJECT`, demonstrating overfit rejection. -**失败归因(阶段 2)。** 归因完全基于结构化评测信号,不依赖 case 命名。每条 case -记录三项 metric 子分(final_response / tool_trajectory / rubric)与关键轨迹(query、 -expected/actual 工具与回复)。`classify_tool_failure` 据「期望轨迹 vs 实际轨迹」判类: -期望调用权威检索工具却没调用 → `knowledge_recall_insufficient`;调全了期望工具又多调 -→ `spurious_tool_call`;首工具名对但参数不同 → `parameter_error`;否则 `tool_call_error`。 -rubric 维度由 `case_meta.json` 显式声明(`json_format`/`no_tool`/`single_tool`),失败时 -映射为 `format_error` 或 `llm_rubric_not_met`。归因只统计 baseline 失败。 +Verified fake command: -**接受策略(阶段 5)。** Gate 以验证集为先,五项可配置约束全过才 ACCEPT:① 验证集均分 -提升 ≥ `min_val_score_gain`;② 无新增 hard fail;③ 无「关键 case」退化(关键性由 -`case_meta.json` 的 `key=true` 标记,而非把所有验证 case 一概视为关键);④ 非过拟合 -(训练涨而验证跌);⑤ 优化成本 ≤ `max_cost_usd`。各检查相互独立,便于定位拒绝原因。 +```bash +python examples/optimization/eval_optimize_loop/run.py --mode fake +``` + +Observed sample result: -**防过拟合。** 第 ④ 项专门拦截「训练大幅提升、验证退化」:本例候选给 baseline 注入 -激进检索行为,训练集 +0.53 但验证集回落,gate 据此 REJECT。关键 case 退化(③)与新增 -hard fail(②)提供正交的二次保险,即便总分变化很小也能拦住有害候选。 +```text +train: 0.25 -> 0.7833 +validation: 0.7333 -> 0.6667 +decision: REJECT +``` -**产物审计(阶段 6)。** `optimization_report.json` 持久化 baseline/candidate 逐 case 分数与 -轨迹、逐
case delta、失败归因、gate 各检查、决策理由、成本/耗时/seed、prompt 的 SHA-256 与 -config 快照;`runs/latest/` 留存 baseline 与候选 prompt 全文。`.md` 顶部由 `narrate_report` -依据 gate/delta 数据生成「人话总结」,确定性、无需模型,换输入也不会失真。SDK 桥接通过 -`EvalSet.model_validate_json` 校验评测集,并用 trace-only `AgentEvaluator` 跑一次冒烟,证明 -管线确实接到真实 SDK 评测器;fake/trace 模式仅在评分/优化处用确定性替身以保证无 key 可复现。 +Known limits: live mode requires SDK dependencies plus `TRPC_AGENT_API_KEY`, `TRPC_AGENT_BASE_URL`, and `TRPC_AGENT_MODEL_NAME`; no-key environments should use `--mode fake`. diff --git a/examples/optimization/eval_optimize_loop/agent/__init__.py b/examples/optimization/eval_optimize_loop/agent/__init__.py new file mode 100644 index 0000000..ffb71af --- /dev/null +++ b/examples/optimization/eval_optimize_loop/agent/__init__.py @@ -0,0 +1 @@ +"""Agent bridge package for the eval_optimize_loop example.""" diff --git a/examples/optimization/eval_optimize_loop/agent/agent.py b/examples/optimization/eval_optimize_loop/agent/agent.py new file mode 100644 index 0000000..4d304ee --- /dev/null +++ b/examples/optimization/eval_optimize_loop/agent/agent.py @@ -0,0 +1,142 @@ +# Tencent is pleased to support the open source community by making tRPC-Agent-Python available. +# +# Copyright (C) 2026 Tencent. All rights reserved. +# +# tRPC-Agent-Python is licensed under Apache-2.0. +"""Live agent bridge for the eval_optimize_loop example. + +The optimizer contract is intentionally small: ``call_agent`` is an async +function that accepts one user query and returns the final response text. This +module re-reads the prompt file on every invocation so prompt candidates written +by AgentOptimizer take effect immediately. + +The public bridge in this file mirrors the SDK docs: + +* ``create_agent`` builds a fresh ``LlmAgent`` from the current prompt file. +* ``run_agent`` drives that agent through ``Runner`` and ``InMemorySessionService``. 
+* ``make_call_agent`` returns the exact async callable required by + ``AgentOptimizer.optimize`` when a ``TargetPrompt`` is registered. +""" + +from __future__ import annotations + +import os +import uuid +from pathlib import Path +from typing import Any +from typing import Awaitable +from typing import Callable + +from trpc_agent_sdk.agents import LlmAgent +from trpc_agent_sdk.models import OpenAIModel +from trpc_agent_sdk.runners import Runner +from trpc_agent_sdk.sessions import InMemorySessionService +from trpc_agent_sdk.tools import FunctionTool +from trpc_agent_sdk.types import Content +from trpc_agent_sdk.types import Part + + +APP_NAME = "eval_optimize_loop" + + +def lookup_order(order_id: str) -> str: + """FunctionTool body used by the live ``LlmAgent`` example.""" + data = { + "A100": "Order A100 is in transit and arrives on Friday.", + "A200": "Order A200 is delivered.", + } + return data.get(order_id, f"No order record found for {order_id}.") + + +def search_policy(topic: str) -> str: + """FunctionTool body for policy and warranty lookup examples.""" + topic_lower = topic.lower() + if "damaged" in topic_lower or "refund" in topic_lower: + return "Damaged items are eligible for a full refund within 30 days." + if "model z" in topic_lower or "warranty" in topic_lower: + return "Model Z has a 24-month warranty." + return "No matching policy snippet was found." + + +def get_model_config() -> tuple[str, str, str]: + """Read live model credentials consumed by ``OpenAIModel``.""" + api_key = os.getenv("TRPC_AGENT_API_KEY", "") + base_url = os.getenv("TRPC_AGENT_BASE_URL", "") + model_name = os.getenv("TRPC_AGENT_MODEL_NAME", "") + if not api_key or not base_url or not model_name: + raise ValueError( + "Live mode requires TRPC_AGENT_API_KEY, TRPC_AGENT_BASE_URL, and " + "TRPC_AGENT_MODEL_NAME. Use --mode fake for the no-key path." 
+ ) + return api_key, base_url, model_name + + +def create_agent(prompt_path: Path) -> LlmAgent: + """Create a fresh ``LlmAgent`` from the current prompt file. + + Re-reading here is the critical TargetPrompt contract: when + ``AgentOptimizer`` writes a candidate prompt, the next call immediately uses + that candidate without restarting the process. + """ + api_key, base_url, model_name = get_model_config() + instruction = Path(prompt_path).read_text(encoding="utf-8").strip() + return LlmAgent( + name="support_assistant", + description="A support assistant whose system prompt is under optimization.", + model=OpenAIModel(model_name=model_name, api_key=api_key, base_url=base_url), + instruction=instruction, + tools=[FunctionTool(lookup_order), FunctionTool(search_policy)], + ) + + +async def run_agent(query: str, prompt_path: Path) -> dict[str, Any]: + """Run the live agent once and collect final text plus tool calls. + + ``AgentOptimizer.optimize`` only needs final response text, but the outer + issue-level report also wants key trajectory information. This richer helper + supports both. 
+ """ + agent = create_agent(prompt_path) + session_service = InMemorySessionService() + runner = Runner(app_name=APP_NAME, agent=agent, session_service=session_service) + session_id = str(uuid.uuid4()) + user_id = "optimizer" + await session_service.create_session( + app_name=APP_NAME, + user_id=user_id, + session_id=session_id, + state={}, + ) + message = Content(role="user", parts=[Part.from_text(text=query)]) + final_text = "" + tools: list[dict[str, Any]] = [] + async for event in runner.run_async( + user_id=user_id, + session_id=session_id, + new_message=message, + ): + if not event.content or not event.content.parts: + continue + for part in event.content.parts: + function_call = getattr(part, "function_call", None) + if function_call is not None: + tools.append( + { + "name": getattr(function_call, "name", None), + "args": dict(getattr(function_call, "args", {}) or {}), + } + ) + if event.is_final_response(): + for part in event.content.parts: + if getattr(part, "text", None) and not getattr(part, "thought", False): + final_text += part.text + return {"text": final_text.strip(), "tools": tools} + + +def make_call_agent(prompt_path: Path) -> Callable[[str], Awaitable[str]]: + """Return the fixed async ``(query: str) -> str`` bridge required by GEPA.""" + + async def call_agent(query: str) -> str: + return (await run_agent(query=query, prompt_path=prompt_path))["text"] + + return call_agent diff --git a/examples/optimization/eval_optimize_loop/case_meta.json b/examples/optimization/eval_optimize_loop/case_meta.json index c31fc5e..bbbb69e 100644 --- a/examples/optimization/eval_optimize_loop/case_meta.json +++ b/examples/optimization/eval_optimize_loop/case_meta.json @@ -1,9 +1,35 @@ { - "_comment": "Per-case metadata kept out of the SDK evalset so EvalSet stays schema-clean. 'key' marks cases that must not regress (gate uses it for critical-regression). 'rubric' selects the rubric dimension scored by run.score_case. 
'tool_intent' is a trace-level attribution hint: 'authoritative_search' means the expected trajectory relies on the authoritative search tool, so a wrong tool is attributed to knowledge-recall insufficiency rather than a generic tool error.", - "train_ip_lookup_optimizable": {"key": false, "rubric": "none"}, - "train_calendar_optimizable": {"key": false, "rubric": "none"}, - "train_strict_json_ineffective": {"key": false, "rubric": "json_format"}, - "val_search_fallback_new_pass": {"key": false, "rubric": "none", "tool_intent": "authoritative_search"}, - "val_smalltalk_no_tool_regression": {"key": true, "rubric": "no_tool"}, - "val_weather_soft_degradation": {"key": true, "rubric": "single_tool"} + "_comment": "Per-case metadata for attribution, gate checks, and fake/live trace scoring. It is kept outside evalsets so EvalSet schema validation remains clean.", + "train_order_lookup_optimizable": { + "category": "tool_call_error", + "key": false, + "rubric": "none" + }, + "train_refund_policy_optimizable": { + "category": "knowledge_recall_insufficient", + "key": false, + "rubric": "none", + "authoritative_tool": "search_policy" + }, + "train_json_format_ineffective": { + "category": "format_error", + "key": false, + "rubric": "json_format" + }, + "val_warranty_new_pass": { + "category": "knowledge_recall_insufficient", + "key": false, + "rubric": "none", + "authoritative_tool": "search_policy" + }, + "val_smalltalk_regression": { + "category": "spurious_tool_call", + "key": true, + "rubric": "no_tool" + }, + "val_order_soft_degradation": { + "category": "spurious_tool_call", + "key": true, + "rubric": "single_tool" + } } diff --git a/examples/optimization/eval_optimize_loop/optimization_report.json b/examples/optimization/eval_optimize_loop/optimization_report.json index 109143e..eb81423 100644 --- a/examples/optimization/eval_optimize_loop/optimization_report.json +++ b/examples/optimization/eval_optimize_loop/optimization_report.json @@ -1,35 +1,47 @@ { "run": { - 
"timestamp": "2026-06-30T02:04:57.404739+00:00", - "mode": "fake_trace", + "timestamp": "2026-06-30T14:58:30.101980+00:00", + "mode": "fake", "seed": 42, "sdk_bridge": { - "evalset_validated_with_trpc_sdk": true, "agent_evaluator_available": true, "agent_optimizer_available": true, - "agent_evaluator_trace_smoke": { - "status": "FAILED_EXPECTED", - "reason": "[\n {\n \"evalSetId\": \"eval_optimize_loop_train\",\n \"summary\": {\n \"agentName\": \"trace-only\",\n \"evalSetId\": \"eval_optimize_loop_train\",\n \"overallStatus\": \"failed\",\n \"runs\": 1,\n \"evalCases\": [\n {\n \"evalCaseId\": \"train_calendar_optimizable\",\n \"overallStatus\": \"failed\",\n \"metricResults\": [\n {\n \"metricName\": \"final_response_avg_score\",\n \"score\": 0.0,\n \"threshold\": 0.1,\n \"evalSt", - "has_result": true, - "metrics_file": "_sdk_eval_metrics.json" + "evalset_validated_with_trpc_sdk": true, + "sdk_import_error": null, + "agent_evaluator_trace_runs": { + "train": { + "status": "FAILED_EXPECTED", + "reason": "[\n {\n \"evalSetId\": \"eval_optimize_loop_train\",\n \"summary\": {\n \"agentName\": \"trace-only\",\n \"evalSetId\": \"eval_optimize_loop_train\",\n \"overallStatus\": \"failed\",\n \"runs\": 1,\n \"evalCases\": [\n {\n \"evalCaseId\": \"train_json_format_ineffective\",\n \"overallStatus\": \"failed\",\n \"metricResults\": [\n {\n \"metricName\": \"final_response_avg_score\",\n \"score\": 0.0,\n \"threshold\": 0.1,\n \"eva", + "evalset": "train.evalset.json", + "has_result": true, + "metrics_file": "_sdk_eval_metrics.json" + }, + "val": { + "status": "FAILED_EXPECTED", + "reason": "[\n {\n \"evalSetId\": \"eval_optimize_loop_val\",\n \"summary\": {\n \"agentName\": \"trace-only\",\n \"evalSetId\": \"eval_optimize_loop_val\",\n \"overallStatus\": \"failed\",\n \"runs\": 1,\n \"evalCases\": [\n {\n \"evalCaseId\": \"val_order_soft_degradation\",\n \"overallStatus\": \"passed\",\n \"metricResults\": [\n {\n \"metricName\": \"final_response_avg_score\",\n 
\"score\": 1.0,\n \"threshold\": 0.1,\n \"evalStatus", + "evalset": "val.evalset.json", + "has_result": true, + "metrics_file": "_sdk_eval_metrics.json" + } } }, "repro": { "train_evalset": "train.evalset.json", "val_evalset": "val.evalset.json", "case_meta": "case_meta.json", + "prompt_source": "prompts/system.md", "optimizer_config": "optimizer.json", - "prompt_source": "prompts/baseline_system.md" + "sdk_optimizer_config": "optimizer.sdk.json" } }, "prompt_audit": { "target": { - "name": "life_assistant_system", - "path": "prompts/baseline_system.md", + "name": "support_assistant_system", + "path": "prompts/system.md", "kind": "system_prompt" }, - "baseline_sha256": "0f876750c9acd5d6ded115427aa96d3bcfe9148e90e449a0be41804c02838f72", - "candidate_sha256": "19a1743c5b1694b2b0860ecedc2697ce2247e3dea1f45169db7c853d6f5959bc", + "baseline_sha256": "30b490452eeb916fd25950797f0cbe1f9bac2a7b9f738775365c066b43924b88", + "candidate_sha256": "5d3271e9ab855a1bdf0d6af54e6f8521d35a4bd5727e89d632486f826f5f52b9", "baseline_snapshot": "runs/latest/baseline_prompt.md", "candidate_snapshot": "runs/latest/candidate_prompt.md" }, @@ -39,8 +51,8 @@ "mean_score": 0.25, "pass_rate": 0.0, "cases": { - "train_ip_lookup_optimizable": { - "case_id": "train_ip_lookup_optimizable", + "train_order_lookup_optimizable": { + "case_id": "train_order_lookup_optimizable", "score": 0.2, "passed": false, "hard_fail": true, @@ -56,23 +68,23 @@ ], "reason": "final_response_mismatch; tool_call_error", "trace": { - "query": "查询我的公网 IP", - "expected_text": "203.0.113.10", - "actual_text": "我无法确定你的公网 IP。", + "query": "What is the shipping status for order A100?", + "expected_text": "in transit", + "actual_text": "I do not have enough order data.", "expected_tools": [ { "id": "tool-1", - "name": "get_my_public_ip", + "name": "lookup_order", "args": { - "source": "commercial" + "order_id": "A100" } } ], "actual_tools": [] } }, - "train_calendar_optimizable": { - "case_id": "train_calendar_optimizable", + 
"train_refund_policy_optimizable": { + "case_id": "train_refund_policy_optimizable", "score": 0.2, "passed": false, "hard_fail": true, @@ -84,28 +96,27 @@ }, "failure_types": [ "final_response_mismatch", - "tool_call_error" + "knowledge_recall_insufficient" ], - "reason": "final_response_mismatch; tool_call_error", + "reason": "final_response_mismatch; knowledge_recall_insufficient", "trace": { - "query": "2026-10-01 是不是休息日?", - "expected_text": "休息日", - "actual_text": "这天大概率是节日,但我没有查询。", + "query": "What is the refund policy for damaged items?", + "expected_text": "full refund within 30 days", + "actual_text": "You may be eligible, but I cannot confirm the policy.", "expected_tools": [ { "id": "tool-1", - "name": "query_holiday_calendar", + "name": "search_policy", "args": { - "date": "2026-10-01", - "holiday_type": "legal" + "topic": "damaged item refund" } } ], "actual_tools": [] } }, - "train_strict_json_ineffective": { - "case_id": "train_strict_json_ineffective", + "train_json_format_ineffective": { + "case_id": "train_json_format_ineffective", "score": 0.35, "passed": false, "hard_fail": true, @@ -121,7 +132,7 @@ ], "reason": "final_response_mismatch; format_error", "trace": { - "query": "只返回 JSON:status ok", + "query": "Return only JSON: status ok", "expected_text": "{\"status\":\"ok\"}", "actual_text": "status ok", "expected_tools": [], @@ -135,8 +146,8 @@ "mean_score": 0.7333, "pass_rate": 0.6667, "cases": { - "val_search_fallback_new_pass": { - "case_id": "val_search_fallback_new_pass", + "val_warranty_new_pass": { + "case_id": "val_warranty_new_pass", "score": 0.2, "passed": false, "hard_fail": true, @@ -152,30 +163,30 @@ ], "reason": "final_response_mismatch; knowledge_recall_insufficient", "trace": { - "query": "默认搜索不够时,查一下 Go 最新版本", - "expected_text": "Go 1.26", - "actual_text": "Go 有新版本,但我没有足够信息确认。", + "query": "What is the warranty period for Model Z?", + "expected_text": "24-month warranty", + "actual_text": "I am not sure about the Model Z 
warranty.", "expected_tools": [ { "id": "tool-1", - "name": "uapi_search", + "name": "search_policy", "args": { - "query": "Go 最新版本" + "topic": "Model Z warranty" } } ], "actual_tools": [ { - "name": "websearch", + "name": "web_search", "args": { - "query": "Go 最新版本" + "query": "Model Z warranty" } } ] } }, - "val_smalltalk_no_tool_regression": { - "case_id": "val_smalltalk_no_tool_regression", + "val_smalltalk_regression": { + "case_id": "val_smalltalk_regression", "score": 1.0, "passed": true, "hard_fail": false, @@ -188,15 +199,15 @@ "failure_types": [], "reason": "pass", "trace": { - "query": "在吗", - "expected_text": "在", - "actual_text": "在。", + "query": "Thanks", + "expected_text": "welcome", + "actual_text": "You are welcome.", "expected_tools": [], "actual_tools": [] } }, - "val_weather_soft_degradation": { - "case_id": "val_weather_soft_degradation", + "val_order_soft_degradation": { + "case_id": "val_order_soft_degradation", "score": 1.0, "passed": true, "hard_fail": false, @@ -209,23 +220,23 @@ "failure_types": [], "reason": "pass", "trace": { - "query": "北京天气怎么样?", - "expected_text": "北京", - "actual_text": "北京今天有天气信息。", + "query": "Check order A200.", + "expected_text": "delivered", + "actual_text": "Order A200 is delivered.", "expected_tools": [ { "id": "tool-1", - "name": "get_current_weather", + "name": "lookup_order", "args": { - "city": "北京" + "order_id": "A200" } } ], "actual_tools": [ { - "name": "get_current_weather", + "name": "lookup_order", "args": { - "city": "北京" + "order_id": "A200" } } ] @@ -240,8 +251,8 @@ "mean_score": 0.7833, "pass_rate": 0.6667, "cases": { - "train_ip_lookup_optimizable": { - "case_id": "train_ip_lookup_optimizable", + "train_order_lookup_optimizable": { + "case_id": "train_order_lookup_optimizable", "score": 1.0, "passed": true, "hard_fail": false, @@ -254,30 +265,30 @@ "failure_types": [], "reason": "pass", "trace": { - "query": "查询我的公网 IP", - "expected_text": "203.0.113.10", - "actual_text": "你的公网 IP 是 
203.0.113.10。", + "query": "What is the shipping status for order A100?", + "expected_text": "in transit", + "actual_text": "Order A100 is in transit and arrives on Friday.", "expected_tools": [ { "id": "tool-1", - "name": "get_my_public_ip", + "name": "lookup_order", "args": { - "source": "commercial" + "order_id": "A100" } } ], "actual_tools": [ { - "name": "get_my_public_ip", + "name": "lookup_order", "args": { - "source": "commercial" + "order_id": "A100" } } ] } }, - "train_calendar_optimizable": { - "case_id": "train_calendar_optimizable", + "train_refund_policy_optimizable": { + "case_id": "train_refund_policy_optimizable", "score": 1.0, "passed": true, "hard_fail": false, @@ -290,32 +301,30 @@ "failure_types": [], "reason": "pass", "trace": { - "query": "2026-10-01 是不是休息日?", - "expected_text": "休息日", - "actual_text": "2026-10-01 是法定休息日。", + "query": "What is the refund policy for damaged items?", + "expected_text": "full refund within 30 days", + "actual_text": "Damaged items are eligible for a full refund within 30 days.", "expected_tools": [ { "id": "tool-1", - "name": "query_holiday_calendar", + "name": "search_policy", "args": { - "date": "2026-10-01", - "holiday_type": "legal" + "topic": "damaged item refund" } } ], "actual_tools": [ { - "name": "query_holiday_calendar", + "name": "search_policy", "args": { - "date": "2026-10-01", - "holiday_type": "legal" + "topic": "damaged item refund" } } ] } }, - "train_strict_json_ineffective": { - "case_id": "train_strict_json_ineffective", + "train_json_format_ineffective": { + "case_id": "train_json_format_ineffective", "score": 0.35, "passed": false, "hard_fail": true, @@ -331,7 +340,7 @@ ], "reason": "final_response_mismatch; format_error", "trace": { - "query": "只返回 JSON:status ok", + "query": "Return only JSON: status ok", "expected_text": "{\"status\":\"ok\"}", "actual_text": "status ok", "expected_tools": [], @@ -345,8 +354,8 @@ "mean_score": 0.6667, "pass_rate": 0.3333, "cases": { - 
"val_search_fallback_new_pass": { - "case_id": "val_search_fallback_new_pass", + "val_warranty_new_pass": { + "case_id": "val_warranty_new_pass", "score": 1.0, "passed": true, "hard_fail": false, @@ -359,30 +368,30 @@ "failure_types": [], "reason": "pass", "trace": { - "query": "默认搜索不够时,查一下 Go 最新版本", - "expected_text": "Go 1.26", - "actual_text": "Go 1.26 是当前查询到的最新版本。", + "query": "What is the warranty period for Model Z?", + "expected_text": "24-month warranty", + "actual_text": "Model Z has a 24-month warranty.", "expected_tools": [ { "id": "tool-1", - "name": "uapi_search", + "name": "search_policy", "args": { - "query": "Go 最新版本" + "topic": "Model Z warranty" } } ], "actual_tools": [ { - "name": "uapi_search", + "name": "search_policy", "args": { - "query": "Go 最新版本" + "topic": "Model Z warranty" } } ] } }, - "val_smalltalk_no_tool_regression": { - "case_id": "val_smalltalk_no_tool_regression", + "val_smalltalk_regression": { + "case_id": "val_smalltalk_regression", "score": 0.45, "passed": false, "hard_fail": true, @@ -398,22 +407,22 @@ ], "reason": "spurious_tool_call; llm_rubric_not_met", "trace": { - "query": "在吗", - "expected_text": "在", - "actual_text": "我查了一下网页:在。", + "query": "Thanks", + "expected_text": "welcome", + "actual_text": "You are welcome.", "expected_tools": [], "actual_tools": [ { - "name": "uapi_search", + "name": "search_policy", "args": { - "query": "在吗" + "topic": "thanks" } } ] } }, - "val_weather_soft_degradation": { - "case_id": "val_weather_soft_degradation", + "val_order_soft_degradation": { + "case_id": "val_order_soft_degradation", "score": 0.55, "passed": false, "hard_fail": true, @@ -429,29 +438,29 @@ ], "reason": "spurious_tool_call; llm_rubric_not_met", "trace": { - "query": "北京天气怎么样?", - "expected_text": "北京", - "actual_text": "北京今天有天气信息。", + "query": "Check order A200.", + "expected_text": "delivered", + "actual_text": "Order A200 is delivered.", "expected_tools": [ { "id": "tool-1", - "name": "get_current_weather", + 
"name": "lookup_order", "args": { - "city": "北京" + "order_id": "A200" } } ], "actual_tools": [ { - "name": "get_current_weather", + "name": "lookup_order", "args": { - "city": "北京" + "order_id": "A200" } }, { - "name": "uapi_search", + "name": "search_policy", "args": { - "query": "北京天气" + "topic": "order A200" } } ] @@ -462,7 +471,7 @@ }, "delta": { "train": { - "train_ip_lookup_optimizable": { + "train_order_lookup_optimizable": { "kind": "new_pass", "baseline_score": 0.2, "candidate_score": 1.0, @@ -470,7 +479,7 @@ "baseline_passed": false, "candidate_passed": true }, - "train_calendar_optimizable": { + "train_refund_policy_optimizable": { "kind": "new_pass", "baseline_score": 0.2, "candidate_score": 1.0, @@ -478,7 +487,7 @@ "baseline_passed": false, "candidate_passed": true }, - "train_strict_json_ineffective": { + "train_json_format_ineffective": { "kind": "same", "baseline_score": 0.35, "candidate_score": 0.35, @@ -488,7 +497,7 @@ } }, "val": { - "val_search_fallback_new_pass": { + "val_warranty_new_pass": { "kind": "new_pass", "baseline_score": 0.2, "candidate_score": 1.0, @@ -496,7 +505,7 @@ "baseline_passed": false, "candidate_passed": true }, - "val_smalltalk_no_tool_regression": { + "val_smalltalk_regression": { "kind": "new_fail", "baseline_score": 1.0, "candidate_score": 0.45, @@ -504,7 +513,7 @@ "baseline_passed": true, "candidate_passed": false }, - "val_weather_soft_degradation": { + "val_order_soft_degradation": { "kind": "new_fail", "baseline_score": 1.0, "candidate_score": 0.55, @@ -517,38 +526,38 @@ "failure_attribution": { "counts": { "final_response_mismatch": 4, - "tool_call_error": 2, - "format_error": 1, - "knowledge_recall_insufficient": 1 + "tool_call_error": 1, + "knowledge_recall_insufficient": 2, + "format_error": 1 }, "by_case": { - "train_ip_lookup_optimizable": [ + "train_order_lookup_optimizable": [ "final_response_mismatch", "tool_call_error" ], - "train_calendar_optimizable": [ + "train_refund_policy_optimizable": [ 
"final_response_mismatch", - "tool_call_error" + "knowledge_recall_insufficient" ], - "train_strict_json_ineffective": [ + "train_json_format_ineffective": [ "final_response_mismatch", "format_error" ], - "val_search_fallback_new_pass": [ + "val_warranty_new_pass": [ "final_response_mismatch", "knowledge_recall_insufficient" ] } }, "optimizer": { + "mode": "fake", "status": "SCRIPTED_CANDIDATE", - "algorithm": "scripted_agent_optimizer_bridge", "agent_optimizer_available": true, "agent_optimizer_invoked": false, - "note": "fake/trace mode applies a deterministic patch; see examples/optimization/quickstart for a live GEPA run.", "candidate_prompt_path": "runs/latest/candidate_prompt.md", "cost_usd": 0.0, - "tokens": 0 + "tokens": 0, + "rounds": 1 }, "gate": { "accepted": false, @@ -565,12 +574,12 @@ { "name": "no_new_hard_fail", "passed": false, - "detail": "new_hard_fails=['val_smalltalk_no_tool_regression', 'val_weather_soft_degradation']" + "detail": "new_hard_fails=['val_smalltalk_regression', 'val_order_soft_degradation']" }, { "name": "no_critical_regression", "passed": false, - "detail": "critical_regressions=['val_smalltalk_no_tool_regression', 'val_weather_soft_degradation']" + "detail": "critical_regressions=['val_smalltalk_regression', 'val_order_soft_degradation']" }, { "name": "not_overfit_train_up_val_down", @@ -580,27 +589,32 @@ { "name": "cost_budget", "passed": true, - "detail": "cost_usd=0.0000, budget=0.0100" + "detail": "cost_usd=0.000000, budget=0.050000" } ] }, "audit": { - "duration_seconds": 0.0125, + "duration_seconds": 0.0176, "cost_usd": 0.0, "tokens": 0, "config_snapshot": { - "mode": "fake_trace", + "mode": "fake", "seed": 42, + "inputs": { + "train_evalset": "train.evalset.json", + "val_evalset": "val.evalset.json", + "case_meta": "case_meta.json" + }, "target_prompt": { - "name": "life_assistant_system", - "path": "prompts/baseline_system.md", + "name": "support_assistant_system", + "path": "prompts/system.md", "kind": "system_prompt" 
}, - "case_meta": "case_meta.json", "evaluate": { "fake_model": true, "fake_judge": true, "trace_mode": true, + "pass_threshold": 0.8, "metrics": [ { "name": "final_response", @@ -620,15 +634,13 @@ ] }, "optimize": { - "algorithm": "scripted_agent_optimizer_bridge", - "note": "fake/trace mode applies a deterministic prompt patch instead of invoking AgentOptimizer, so the example stays reproducible without an API key. For a live AgentOptimizer (GEPA) run see examples/optimization/quickstart.", - "max_rounds": 1, + "sdk_config": "optimizer.sdk.json", "update_source": false, - "candidate_patch": [ - "", + "verbose": 1, + "fake_candidate_patch": [ "Optimization candidate:", - "- USE_UAPI_TOOLS: use get_my_public_ip, uapi_search, and query_holiday_calendar for IP/search/calendar questions.", - "- AGGRESSIVE_SEARCH: prefer uapi_search whenever a query looks underspecified." + "- USE_CATALOG_LOOKUP: use lookup_order for order status and search_policy for policy/warranty questions before answering.", + "- AGGRESSIVE_LOOKUP: when uncertain, prefer looking up supporting data even for short or already-answerable requests." 
] }, "gate": { @@ -637,10 +649,9 @@ "hard_fail_threshold": 0.6, "reject_on_critical_regression": true, "reject_overfit_train_up_val_down": true, - "max_cost_usd": 0.01 + "max_cost_usd": 0.05 }, "audit": { - "output_dir": ".", "record_case_traces": true, "record_prompt_snapshots": true } diff --git a/examples/optimization/eval_optimize_loop/optimization_report.md b/examples/optimization/eval_optimize_loop/optimization_report.md index 33ba971..552253b 100644 --- a/examples/optimization/eval_optimize_loop/optimization_report.md +++ b/examples/optimization/eval_optimize_loop/optimization_report.md @@ -1,36 +1,44 @@ # Optimization Report -## 人话总结 +## Summary -本次(fake_trace 模式)决定**拒绝**候选 prompt。训练集均分 0.25→0.7833(+0.5333),验证集 0.7333→0.6667(-0.0666)。训练涨但验证跌,呈现过拟合特征。验证集新增通过:val_search_fallback_new_pass。⚠️ 验证集新增失败:val_smalltalk_no_tool_regression、val_weather_soft_degradation。被以下 gate 拦截:validation_gain_threshold、no_new_hard_fail、no_critical_regression、not_overfit_train_up_val_down。baseline 失败归因:final_response_mismatch×4、tool_call_error×2、format_error×1、knowledge_recall_insufficient×1。 +Decision: REJECT. Train mean changed 0.25 -> 0.7833 (+0.5333); validation mean changed 0.7333 -> 0.6667 (-0.0666). New validation passes: ['val_warranty_new_pass']. New validation failures: ['val_smalltalk_regression', 'val_order_soft_degradation']. Gate reason: validation_gain_threshold; no_new_hard_fail; no_critical_regression; not_overfit_train_up_val_down. 
-- Mode: `fake_trace` +## Scores + +- Mode: `fake` +- Baseline train mean: 0.25 +- Candidate train mean: 0.7833 +- Baseline validation mean: 0.7333 +- Candidate validation mean: 0.6667 - Decision: **REJECT** - Reason: validation_gain_threshold; no_new_hard_fail; no_critical_regression; not_overfit_train_up_val_down -- Baseline train score: 0.25 -- Candidate train score: 0.7833 -- Baseline val score: 0.7333 -- Candidate val score: 0.6667 -- Train gain: +0.5333 -- Val gain: -0.0666 ## Failure Attribution - final_response_mismatch: 4 -- tool_call_error: 2 - format_error: 1 -- knowledge_recall_insufficient: 1 +- knowledge_recall_insufficient: 2 +- tool_call_error: 1 ## Validation Delta -- `val_search_fallback_new_pass`: new_pass (0.2 -> 1.0, delta +0.8000) -- `val_smalltalk_no_tool_regression`: new_fail (1.0 -> 0.45, delta -0.5500) -- `val_weather_soft_degradation`: new_fail (1.0 -> 0.55, delta -0.4500) +- `val_warranty_new_pass`: new_pass (0.2 -> 1.0, +0.8000) +- `val_smalltalk_regression`: new_fail (1.0 -> 0.45, -0.5500) +- `val_order_soft_degradation`: new_fail (1.0 -> 0.55, -0.4500) ## Gate Checks - FAIL `validation_gain_threshold`: val_gain=-0.0666, required>=+0.1000 -- FAIL `no_new_hard_fail`: new_hard_fails=['val_smalltalk_no_tool_regression', 'val_weather_soft_degradation'] -- FAIL `no_critical_regression`: critical_regressions=['val_smalltalk_no_tool_regression', 'val_weather_soft_degradation'] +- FAIL `no_new_hard_fail`: new_hard_fails=['val_smalltalk_regression', 'val_order_soft_degradation'] +- FAIL `no_critical_regression`: critical_regressions=['val_smalltalk_regression', 'val_order_soft_degradation'] - FAIL `not_overfit_train_up_val_down`: train_gain=+0.5333, val_gain=-0.0666 -- PASS `cost_budget`: cost_usd=0.0000, budget=0.0100 +- PASS `cost_budget`: cost_usd=0.000000, budget=0.050000 + +## Audit + +- Cost USD: 0.0 +- Tokens: 0 +- Duration seconds: 0.0176 +- Baseline SHA-256: `30b490452eeb916fd25950797f0cbe1f9bac2a7b9f738775365c066b43924b88` +- 
Candidate SHA-256: `5d3271e9ab855a1bdf0d6af54e6f8521d35a4bd5727e89d632486f826f5f52b9` diff --git a/examples/optimization/eval_optimize_loop/optimizer.json b/examples/optimization/eval_optimize_loop/optimizer.json index 7bf12b8..f958df7 100644 --- a/examples/optimization/eval_optimize_loop/optimizer.json +++ b/examples/optimization/eval_optimize_loop/optimizer.json @@ -1,16 +1,21 @@ { - "mode": "fake_trace", + "mode": "fake", "seed": 42, + "inputs": { + "train_evalset": "train.evalset.json", + "val_evalset": "val.evalset.json", + "case_meta": "case_meta.json" + }, "target_prompt": { - "name": "life_assistant_system", - "path": "prompts/baseline_system.md", + "name": "support_assistant_system", + "path": "prompts/system.md", "kind": "system_prompt" }, - "case_meta": "case_meta.json", "evaluate": { "fake_model": true, "fake_judge": true, "trace_mode": true, + "pass_threshold": 0.8, "metrics": [ {"name": "final_response", "weight": 0.45, "threshold": 0.8}, {"name": "tool_trajectory", "weight": 0.35, "threshold": 0.8}, @@ -18,15 +23,13 @@ ] }, "optimize": { - "algorithm": "scripted_agent_optimizer_bridge", - "note": "fake/trace mode applies a deterministic prompt patch instead of invoking AgentOptimizer, so the example stays reproducible without an API key. For a live AgentOptimizer (GEPA) run see examples/optimization/quickstart.", - "max_rounds": 1, + "sdk_config": "optimizer.sdk.json", "update_source": false, - "candidate_patch": [ - "", + "verbose": 1, + "fake_candidate_patch": [ "Optimization candidate:", - "- USE_UAPI_TOOLS: use get_my_public_ip, uapi_search, and query_holiday_calendar for IP/search/calendar questions.", - "- AGGRESSIVE_SEARCH: prefer uapi_search whenever a query looks underspecified." + "- USE_CATALOG_LOOKUP: use lookup_order for order status and search_policy for policy/warranty questions before answering.", + "- AGGRESSIVE_LOOKUP: when uncertain, prefer looking up supporting data even for short or already-answerable requests." 
] }, "gate": { @@ -35,10 +38,9 @@ "hard_fail_threshold": 0.6, "reject_on_critical_regression": true, "reject_overfit_train_up_val_down": true, - "max_cost_usd": 0.01 + "max_cost_usd": 0.05 }, "audit": { - "output_dir": ".", "record_case_traces": true, "record_prompt_snapshots": true } diff --git a/examples/optimization/eval_optimize_loop/optimizer.sdk.json b/examples/optimization/eval_optimize_loop/optimizer.sdk.json new file mode 100644 index 0000000..7036b42 --- /dev/null +++ b/examples/optimization/eval_optimize_loop/optimizer.sdk.json @@ -0,0 +1,70 @@ +{ + "_comment": "SDK-format config consumed only by AgentOptimizer.optimize in live mode. Tool-trajectory metrics are intentionally excluded because call_agent mode returns final text, not full session traces.", + "evaluate": { + "metrics": [ + { + "metric_name": "final_response_avg_score", + "threshold": 0.6, + "criterion": { + "final_response": { + "text": {"match": "contains", "case_insensitive": true} + } + } + }, + { + "metric_name": "llm_rubric_response", + "threshold": 0.66, + "criterion": { + "llm_judge": { + "judge_model": { + "model_name": "${TRPC_AGENT_MODEL_NAME}", + "base_url": "${TRPC_AGENT_BASE_URL}", + "api_key": "${TRPC_AGENT_API_KEY}", + "num_samples": 1, + "generation_config": {"max_tokens": 1024, "temperature": 0.2} + }, + "rubrics": [ + { + "id": "answers_with_evidence", + "content": { + "text": "The answer directly satisfies the user request and uses available support data instead of guessing when order, refund, or warranty facts are needed." + }, + "description": "Direct answer grounded in available support data", + "type": "FINAL_RESPONSE_QUALITY" + }, + { + "id": "concise_and_actionable", + "content": { + "text": "The answer is concise, actionable, and avoids unnecessary extra explanation." 
+ }, + "description": "Concise actionable response", + "type": "FINAL_RESPONSE_QUALITY" + } + ] + } + } + } + ], + "num_runs": 1 + }, + "optimize": { + "eval_case_parallelism": 2, + "stop": {"required_metrics": "all"}, + "algorithm": { + "name": "gepa_reflective", + "seed": 42, + "reflection_lm": { + "model_name": "${TRPC_AGENT_MODEL_NAME}", + "base_url": "${TRPC_AGENT_BASE_URL}", + "api_key": "${TRPC_AGENT_API_KEY}", + "generation_config": {"max_tokens": 4096, "temperature": 0.6} + }, + "candidate_selection_strategy": "pareto", + "module_selector": "round_robin", + "reflection_minibatch_size": 3, + "skip_perfect_score": false, + "max_metric_calls": 60, + "max_iterations_without_improvement": 8 + } + } +} diff --git a/examples/optimization/eval_optimize_loop/prompts/baseline_system.md b/examples/optimization/eval_optimize_loop/prompts/baseline_system.md deleted file mode 100644 index b462b93..0000000 --- a/examples/optimization/eval_optimize_loop/prompts/baseline_system.md +++ /dev/null @@ -1,7 +0,0 @@ -You are a concise life assistant. - -Rules: -- Answer directly when possible. -- Use the weather tool for weather questions. -- Do not invent data. -- Keep responses short. diff --git a/examples/optimization/eval_optimize_loop/prompts/system.md b/examples/optimization/eval_optimize_loop/prompts/system.md new file mode 100644 index 0000000..47e4741 --- /dev/null +++ b/examples/optimization/eval_optimize_loop/prompts/system.md @@ -0,0 +1,6 @@ +You are a concise customer support assistant. + +Rules: +- Answer directly when the answer is already known. +- Do not invent order, refund, or warranty facts. +- Keep responses short. diff --git a/examples/optimization/eval_optimize_loop/run.py b/examples/optimization/eval_optimize_loop/run.py index 774195f..fc7cd5f 100644 --- a/examples/optimization/eval_optimize_loop/run.py +++ b/examples/optimization/eval_optimize_loop/run.py @@ -1,22 +1,23 @@ -"""Reproducible Evaluation + Optimization closed-loop example. 
- -The pipeline runs six auditable phases over a single system prompt: - - 1. baseline evaluation (train + val, per-case metrics/pass-fail/trace) - 2. failure attribution (rule based over structured trace + case metadata) - 3. optimization (scripted AgentOptimizer bridge in fake/trace mode) - 4. candidate validation (full re-run + case-by-case diff vs baseline) - 5. acceptance gate (validation-first, configurable, multi-constraint) - 6. audit persistence (JSON + Markdown report, prompt snapshots, repro info) - -Default mode is fake/trace and requires no API key. The first invocation may -spend time on a one-off ``uv sync``; once dependencies are installed the loop -itself completes in a few seconds:: - - uv run python examples/optimization/eval_optimize_loop/run.py - -Log verbosity is controlled by the ``YUN_LOG_LEVEL`` environment variable -(default ``INFO``). +# Tencent is pleased to support the open source community by making tRPC-Agent-Python available. +# +# Copyright (C) 2026 Tencent. All rights reserved. +# +# tRPC-Agent-Python is licensed under Apache-2.0. +"""Six-stage Evaluation + Optimization loop around AgentOptimizer. + +This example is intentionally self-contained. It has two execution modes: + +* fake: no API key, deterministic fake model/judge/optimizer, complete report. +* live: real LlmAgent bridge plus real AgentOptimizer.optimize. + +Both modes use the same train/validation evalsets, scorer, gate, report schema, +and prompt snapshots. The fake mode exists so the closed-loop behavior can be +tested in CI or on a laptop with no model credentials. + +The live path registers one ``TargetPrompt`` field and delegates candidate search +to ``AgentOptimizer``. Raw live artifacts under ``runs/latest/agent_optimizer`` +include optimizer ``RoundRecord`` files; this outer script adds the issue-level +baseline/candidate/delta/gate/audit report around those SDK artifacts. 
""" from __future__ import annotations @@ -27,43 +28,48 @@ import json import logging import os +import shutil +import sys import time from collections import Counter -from dataclasses import dataclass, field -from datetime import datetime, timezone +from dataclasses import asdict +from dataclasses import dataclass +from datetime import datetime +from datetime import timezone from pathlib import Path from typing import Any + +HERE = Path(__file__).resolve().parent +REPO_ROOT = HERE.parents[2] +if str(REPO_ROOT) not in sys.path: + sys.path.insert(0, str(REPO_ROOT)) +if str(HERE) not in sys.path: + sys.path.insert(0, str(HERE)) + try: - from trpc_agent_sdk.evaluation import AgentEvaluator, AgentOptimizer, EvalSet, TargetPrompt -except Exception: # pragma: no cover - keeps fake mode runnable if the SDK changes. - AgentEvaluator = AgentOptimizer = TargetPrompt = None + from trpc_agent_sdk.evaluation import AgentEvaluator + from trpc_agent_sdk.evaluation import AgentOptimizer + from trpc_agent_sdk.evaluation import EvalSet + from trpc_agent_sdk.evaluation import TargetPrompt + SDK_IMPORT_ERROR = None +except Exception as exc: # pragma: no cover - fake mode should still explain itself. + AgentEvaluator = None + AgentOptimizer = None EvalSet = None + TargetPrompt = None + SDK_IMPORT_ERROR = f"{type(exc).__name__}: {exc}" -HERE = Path(__file__).resolve().parent -logger = logging.getLogger("eval_optimize_loop") -# Tool name treated as the "authoritative" search backend. When a case declares -# ``tool_intent == "authoritative_search"`` and the agent fails to call this -# tool, the trajectory miss is attributed to weak knowledge recall rather than a -# generic tool error. -AUTHORITATIVE_SEARCH_TOOL = "uapi_search" +LOGGER = logging.getLogger("eval_optimize_loop") @dataclass class CaseResult: - """Scored outcome of a single evaluation case. - - Attributes: - case_id: The ``eval_id`` of the case. - score: Weighted aggregate score in ``[0, 1]``. 
- passed: Whether ``score`` meets the pass threshold. - hard_fail: Whether ``score`` falls below the gate hard-fail threshold. - key: Whether the case is marked critical (must not regress). - metrics: Per-metric sub-scores (final_response / tool_trajectory / rubric). - failure_types: Attributed failure categories (empty when passed). - reason: Human-readable summary (``"pass"`` or joined failure types). - trace: Key trajectory fields used for attribution and auditing. + """Per-case score record persisted into optimization_report.json. + + The fields mirror the issue acceptance criteria: metric scores, pass/fail, + hard-fail status, failure reasons, and key trajectory data. """ case_id: str @@ -74,86 +80,54 @@ class CaseResult: metrics: dict[str, float] failure_types: list[str] reason: str - trace: dict[str, Any] = field(default_factory=dict) + trace: dict[str, Any] def load_json(path: Path) -> dict[str, Any]: - """Load a JSON document, raising a readable error on malformed input. - - Args: - path: Path to the JSON file. - - Returns: - The parsed JSON object. - - Raises: - SystemExit: If the file cannot be read or parsed. 
- """ + """Load a JSON config/evalset document with a readable fatal error.""" try: return json.loads(path.read_text(encoding="utf-8")) except OSError as exc: - raise SystemExit(f"无法读取 JSON 文件 {path}: {exc}") from exc + raise SystemExit(f"Cannot read JSON file {path}: {exc}") from exc except json.JSONDecodeError as exc: - raise SystemExit(f"JSON 解析失败 {path}: {exc}") from exc + raise SystemExit(f"Invalid JSON in {path}: {exc}") from exc def write_json(path: Path, data: dict[str, Any]) -> None: - """Serialize ``data`` to ``path`` as UTF-8 JSON with stable indentation.""" + """Write a stable UTF-8 JSON artifact used by the audit trail.""" + path.parent.mkdir(parents=True, exist_ok=True) path.write_text(json.dumps(data, ensure_ascii=False, indent=2), encoding="utf-8") def sha256_text(text: str) -> str: - """Return the hex SHA-256 digest of ``text`` (used for prompt audit).""" + """Hash prompt text so audits can prove which candidate was evaluated.""" return hashlib.sha256(text.encode("utf-8")).hexdigest() -def validate_evalset(path: Path) -> dict[str, Any]: - """Load an evalset and validate it against the SDK schema when available. - - JSON/IO errors abort with a readable message; an SDK schema mismatch is a - non-fatal warning (the fake evaluator only needs the documented fields), but - an evalset with no ``eval_cases`` is treated as fatal. +def resolve_path(value: str) -> Path: + """Resolve example-relative paths from optimizer.json and CLI flags.""" + path = Path(value) + return path if path.is_absolute() else HERE / path - Args: - path: Path to the ``*.evalset.json`` file. - Returns: - The parsed evalset object. - - Raises: - SystemExit: On IO/JSON errors or an empty/invalid evalset. 
- """ - try: - raw = path.read_text(encoding="utf-8") - except OSError as exc: - raise SystemExit(f"无法读取评测集 {path}: {exc}") from exc - try: - data = json.loads(raw) - except json.JSONDecodeError as exc: - raise SystemExit(f"评测集 JSON 解析失败 {path}: {exc}") from exc +def validate_evalset(path: Path) -> dict[str, Any]: + """Validate an evalset with SDK ``EvalSet`` when available.""" + raw = path.read_text(encoding="utf-8") + data = json.loads(raw) if EvalSet is not None: - try: - EvalSet.model_validate_json(raw) - except Exception as exc: # pragma: no cover - schema drift is non-fatal here. - logger.warning("EvalSet schema 校验未通过 %s: %s", path.name, str(exc)[:300]) + EvalSet.model_validate_json(raw) if not data.get("eval_cases"): - raise SystemExit(f"评测集缺少 eval_cases 或为空: {path}") + raise SystemExit(f"{path} has no eval_cases") return data -def sdk_trace_smoke(evalset_path: Path) -> dict[str, Any]: - """Run a trace-only ``AgentEvaluator`` smoke check against an evalset. - - This proves the example is wired to the real SDK evaluator without needing a - model: the metric uses a deterministic ``final_response contains`` criterion. - A threshold miss is expected and reported as ``FAILED_EXPECTED`` rather than - an error, so the smoke never blocks the pipeline. +async def sdk_trace_smoke(evalset_path: Path) -> dict[str, Any]: + """Run SDK ``AgentEvaluator`` on one trace-mode evalset. - Args: - evalset_path: Evalset to feed the SDK evaluator. - - Returns: - A status dict describing the smoke outcome. + The outer loop has its own deterministic scorer so fake mode remains usable + even when optional SDK dependencies are missing. When ``AgentEvaluator`` is + importable, this function records a real trace-mode SDK evaluation attempt + for both train and validation sets. 
""" metrics_path = HERE / "_sdk_eval_metrics.json" write_json( @@ -173,182 +147,164 @@ def sdk_trace_smoke(evalset_path: Path) -> dict[str, Any]: }, ) if AgentEvaluator is None: - logger.warning("AgentEvaluator 不可导入,跳过 SDK 冒烟。") - return {"status": "SKIPPED", "reason": "AgentEvaluator is not importable"} + return { + "status": "SKIPPED", + "reason": "AgentEvaluator import failed", + "import_error": SDK_IMPORT_ERROR, + } - async def _run() -> dict[str, Any]: - cwd = Path.cwd() - os.chdir(HERE) + cwd = Path.cwd() + os.chdir(HERE) + try: + runner = AgentEvaluator.get_executer( + evalset_path.name, + eval_metrics_file_path_or_dir=metrics_path.name, + print_detailed_results=False, + print_summary_report=False, + ) try: - executer = AgentEvaluator.get_executer( - evalset_path.name, - eval_metrics_file_path_or_dir=metrics_path.name, - print_detailed_results=False, - print_summary_report=False, - ) - try: - await executer.evaluate() - status = "PASSED" - reason = "trace-only AgentEvaluator smoke completed" - except AssertionError as exc: - status = "FAILED_EXPECTED" - reason = str(exc)[:500] - except Exception as exc: # pragma: no cover - defensive: SDK runtime error. - status = "FAILED_SDK_SMOKE" - reason = f"{type(exc).__name__}: {str(exc)[:500]}" - return { - "status": status, - "reason": reason, - "has_result": executer.get_result() is not None, - "metrics_file": metrics_path.name, - } - finally: - os.chdir(cwd) - - return asyncio.run(_run()) + await asyncio.wait_for(runner.evaluate(), timeout=30) + status = "PASSED" + reason = "AgentEvaluator trace-mode evaluation completed" + except AssertionError as exc: + status = "FAILED_EXPECTED" + reason = str(exc)[:500] + except Exception as exc: # pragma: no cover - SDK runtime drift. 
+ status = "FAILED_SDK_SMOKE" + reason = f"{type(exc).__name__}: {str(exc)[:500]}" + return { + "status": status, + "reason": reason, + "evalset": evalset_path.name, + "has_result": runner.get_result() is not None, + "metrics_file": metrics_path.name, + } + finally: + os.chdir(cwd) -def invocation_text(invocation: dict[str, Any], field_name: str) -> str: - """Concatenate the text parts of a conversation field (``user_content`` etc.).""" - content = invocation[field_name] +def text_field(invocation: dict[str, Any], field_name: str) -> str: + """Extract concatenated text from an EvalCase invocation field.""" + content = invocation.get(field_name) or {} return "".join(part.get("text", "") for part in content.get("parts", [])) def expected_tools(invocation: dict[str, Any]) -> list[dict[str, Any]]: - """Return the expected ``tool_uses`` list from an expected invocation.""" - data = invocation.get("intermediate_data") or {} - return data.get("tool_uses") or [] + """Return expected tool calls from an EvalCase trace invocation.""" + return (invocation.get("intermediate_data") or {}).get("tool_uses") or [] -def fake_agent(prompt: str, query: str) -> dict[str, Any]: - """Deterministic stand-in for a real agent, keyed off prompt feature flags. +def normalize_tools(tools: list[dict[str, Any]]) -> list[dict[str, Any]]: + """Normalize tool calls for order-sensitive trajectory comparison.""" + return [{"name": item.get("name"), "args": item.get("args", {})} for item in tools] - The fake reads two scripted capability flags from the prompt - (``USE_UAPI_TOOLS`` / ``AGGRESSIVE_SEARCH``) so the optimization candidate - produces observable behavior changes without any model call. - Args: - prompt: The system prompt (baseline or candidate) currently under test. - query: The user query for this case. +def fake_agent(prompt: str, query: str) -> dict[str, Any]: + """Deterministic fake model used by fake mode. 
- Returns: - A dict with ``text`` (final response) and ``tools`` (tool-call list). + The fake reads prompt feature flags written by the scripted optimizer. This + gives repeatable behavior changes without a model key or remote service. """ - uses_uapi = "USE_UAPI_TOOLS" in prompt - aggressive_search = "AGGRESSIVE_SEARCH" in prompt + use_catalog = "USE_CATALOG_LOOKUP" in prompt + aggressive_lookup = "AGGRESSIVE_LOOKUP" in prompt - if "公网 IP" in query: - if uses_uapi: - return {"text": "你的公网 IP 是 203.0.113.10。", "tools": [{"name": "get_my_public_ip", "args": {"source": "commercial"}}]} - return {"text": "我无法确定你的公网 IP。", "tools": []} + if "shipping status for order A100" in query: + if use_catalog: + return { + "text": "Order A100 is in transit and arrives on Friday.", + "tools": [{"name": "lookup_order", "args": {"order_id": "A100"}}], + } + return {"text": "I do not have enough order data.", "tools": []} - if "2026-10-01" in query: - if uses_uapi: - return {"text": "2026-10-01 是法定休息日。", "tools": [{"name": "query_holiday_calendar", "args": {"date": "2026-10-01", "holiday_type": "legal"}}]} - return {"text": "这天大概率是节日,但我没有查询。", "tools": []} + if "refund policy for damaged items" in query: + if use_catalog: + return { + "text": "Damaged items are eligible for a full refund within 30 days.", + "tools": [{"name": "search_policy", "args": {"topic": "damaged item refund"}}], + } + return {"text": "You may be eligible, but I cannot confirm the policy.", "tools": []} - if "只返回 JSON" in query: + if "Return only JSON" in query: return {"text": "status ok", "tools": []} - if "Go 最新版本" in query: - if uses_uapi: - return {"text": "Go 1.26 是当前查询到的最新版本。", "tools": [{"name": "uapi_search", "args": {"query": "Go 最新版本"}}]} - return {"text": "Go 有新版本,但我没有足够信息确认。", "tools": [{"name": "websearch", "args": {"query": "Go 最新版本"}}]} + if "warranty period for Model Z" in query: + if use_catalog: + return { + "text": "Model Z has a 24-month warranty.", + "tools": [{"name": "search_policy", 
"args": {"topic": "Model Z warranty"}}], + } + return { + "text": "I am not sure about the Model Z warranty.", + "tools": [{"name": "web_search", "args": {"query": "Model Z warranty"}}], + } - if query == "在吗": - if aggressive_search: - return {"text": "我查了一下网页:在。", "tools": [{"name": "uapi_search", "args": {"query": "在吗"}}]} - return {"text": "在。", "tools": []} + if query.strip() == "Thanks": + if aggressive_lookup: + return { + "text": "You are welcome.", + "tools": [{"name": "search_policy", "args": {"topic": "thanks"}}], + } + return {"text": "You are welcome.", "tools": []} - if "北京天气" in query: - if aggressive_search: + if "order A200" in query: + if aggressive_lookup: return { - "text": "北京今天有天气信息。", + "text": "Order A200 is delivered.", "tools": [ - {"name": "get_current_weather", "args": {"city": "北京"}}, - {"name": "uapi_search", "args": {"query": "北京天气"}}, + {"name": "lookup_order", "args": {"order_id": "A200"}}, + {"name": "search_policy", "args": {"topic": "order A200"}}, ], } - return {"text": "北京今天有天气信息。", "tools": [{"name": "get_current_weather", "args": {"city": "北京"}}]} - - return {"text": "收到。", "tools": []} - - -def normalize_tools(tools: list[dict[str, Any]]) -> list[dict[str, Any]]: - """Project tool calls to ``{name, args}`` for order-sensitive comparison.""" - return [{"name": tool.get("name"), "args": tool.get("args", {})} for tool in tools] - - -def rubric_score(meta: dict[str, Any], actual: dict[str, Any]) -> float: - """Score the rubric dimension a case declares in ``case_meta.json``. 
- - The rubric kind is data driven (not inferred from the case id): + return { + "text": "Order A200 is delivered.", + "tools": [{"name": "lookup_order", "args": {"order_id": "A200"}}], + } - * ``json_format`` -> 1.0 if the reply is a JSON object, else 0.0 - * ``no_tool`` -> 1.0 if no tool was called, else 0.0 - * ``single_tool`` -> 0.5 if more than one tool was called, else 1.0 - * ``none`` / unset -> 1.0 + return {"text": "I can help with support questions.", "tools": []} - Args: - meta: The case metadata entry. - actual: The agent output (``text`` + ``tools``). - Returns: - A rubric score in ``[0, 1]``. - """ +def rubric_score(meta: dict[str, Any], output: dict[str, Any]) -> float: + """Score the case-specific fake judge rubric declared in case_meta.json.""" kind = meta.get("rubric", "none") if kind == "json_format": - return 1.0 if actual["text"].strip().startswith("{") else 0.0 + try: + json.loads(output["text"]) + return 1.0 + except json.JSONDecodeError: + return 0.0 if kind == "no_tool": - return 0.0 if actual["tools"] else 1.0 + return 1.0 if not output["tools"] else 0.0 if kind == "single_tool": - return 0.5 if len(actual["tools"]) > 1 else 1.0 + return 1.0 if len(output["tools"]) <= 1 else 0.5 return 1.0 def classify_tool_failure( actual: list[dict[str, Any]], expected: list[dict[str, Any]], - tool_intent: str, + meta: dict[str, Any], ) -> str | None: - """Attribute a tool-trajectory mismatch to a specific failure category. - - Classification uses only the structured trajectory (expected vs actual tool - calls) plus the case's declared ``tool_intent`` — never the case id: - - * ``knowledge_recall_insufficient`` — the case relies on the - authoritative search tool but the agent did not call it. - * ``spurious_tool_call`` — the agent issued every expected call *and* - extra ones (over-calling), including calling tools when none were - expected. - * ``parameter_error`` — the leading tool name matches but arguments - differ. 
- * ``tool_call_error`` — a wrong or missing tool otherwise. - - Args: - actual: Actual tool-call list. - expected: Expected tool-call list. - tool_intent: Attribution hint from ``case_meta.json``. - - Returns: - A failure category, or ``None`` when trajectories match. - """ - a, e = normalize_tools(actual), normalize_tools(expected) - if a == e: + """Cluster tool trajectory failures into issue-required categories.""" + actual_norm = normalize_tools(actual) + expected_norm = normalize_tools(expected) + if actual_norm == expected_norm: return None - a_names = {tool["name"] for tool in a} - e_names = {tool["name"] for tool in e} - if tool_intent == "authoritative_search" and AUTHORITATIVE_SEARCH_TOOL not in a_names: + actual_names = {item["name"] for item in actual_norm} + expected_names = {item["name"] for item in expected_norm} + authoritative = meta.get("authoritative_tool") + if authoritative and authoritative not in actual_names: return "knowledge_recall_insufficient" - if a and all(tool in a for tool in e) and len(a) > len(e): + if actual_norm and all(item in actual_norm for item in expected_norm) and len(actual_norm) > len(expected_norm): return "spurious_tool_call" - if a and e and a[0]["name"] == e[0]["name"]: + if actual_norm and expected_norm and actual_norm[0]["name"] == expected_norm[0]["name"]: return "parameter_error" + if expected_names and not actual_names: + return "tool_call_error" return "tool_call_error" def classify_rubric_failure(meta: dict[str, Any]) -> str: - """Map a failed rubric dimension to its attribution category.""" + """Map failed rubric dimensions to human-auditable failure labels.""" if meta.get("rubric") == "json_format": return "format_error" return "llm_rubric_not_met" @@ -359,167 +315,120 @@ def failure_types_for( final_score: float, tool_score: float, rubric: float, - actual: dict[str, Any], + output: dict[str, Any], expected: list[dict[str, Any]], ) -> list[str]: - """Collect all failure categories for a case from its 
sub-scores. - - Args: - meta: Case metadata (``rubric`` / ``tool_intent``). - final_score: Final-response sub-score. - tool_score: Tool-trajectory sub-score. - rubric: Rubric sub-score. - actual: Agent output. - expected: Expected tool calls. - - Returns: - An ordered, de-duplicated list of failure category labels. - """ + """Collect all failure labels for one case from metric sub-scores.""" failures: list[str] = [] if final_score < 1.0: failures.append("final_response_mismatch") if tool_score < 1.0: - label = classify_tool_failure(actual["tools"], expected, meta.get("tool_intent", "none")) + label = classify_tool_failure(output["tools"], expected, meta) if label: failures.append(label) if rubric < 1.0: failures.append(classify_rubric_failure(meta)) - return failures + return list(dict.fromkeys(failures)) + + +async def produce_output(query: str, prompt_path: Path, mode: str) -> dict[str, Any]: + """Run either the fake agent or the live ``LlmAgent`` bridge.""" + prompt_text = prompt_path.read_text(encoding="utf-8") + if mode == "live": + from agent.agent import run_agent + + return await run_agent(query=query, prompt_path=prompt_path) + return fake_agent(prompt_text, query) def score_case( case: dict[str, Any], - prompt: str, + output: dict[str, Any], cfg: dict[str, Any], case_meta: dict[str, Any], ) -> CaseResult: - """Evaluate one case against a prompt and return its scored result. - - Args: - case: An ``eval_cases`` entry from the evalset. - prompt: The system prompt under test. - cfg: The optimizer config (metric weights, gate thresholds). - case_meta: Mapping of ``eval_id`` to per-case metadata. - - Returns: - A :class:`CaseResult` with metrics, attribution and trace. 
- """ + """Score one EvalCase against already-produced model output.""" invocation = case["conversation"][0] - query = invocation_text(invocation, "user_content") - expected_text = invocation_text(invocation, "final_response") + case_id = case["eval_id"] + query = text_field(invocation, "user_content") + expected_text = text_field(invocation, "final_response") expected = expected_tools(invocation) - meta = case_meta.get(case["eval_id"], {}) - actual = fake_agent(prompt, query) + meta = case_meta.get(case_id, {}) - final_score = 1.0 if expected_text.lower() in actual["text"].lower() else 0.0 - tool_score = 1.0 if normalize_tools(actual["tools"]) == normalize_tools(expected) else 0.0 - rubric = rubric_score(meta, actual) - weights = {m["name"]: m["weight"] for m in cfg["evaluate"]["metrics"]} + final_score = 1.0 if expected_text.lower() in output["text"].lower() else 0.0 + tool_score = 1.0 if normalize_tools(output["tools"]) == normalize_tools(expected) else 0.0 + rubric = rubric_score(meta, output) + weights = {item["name"]: item["weight"] for item in cfg["evaluate"]["metrics"]} score = round( final_score * weights["final_response"] + tool_score * weights["tool_trajectory"] + rubric * weights["rubric"], 4, ) - passed = score >= 0.8 - failure_types = failure_types_for(meta, final_score, tool_score, rubric, actual, expected) - hard_fail = score < cfg["gate"]["hard_fail_threshold"] + failures = failure_types_for(meta, final_score, tool_score, rubric, output, expected) + passed = score >= cfg["evaluate"].get("pass_threshold", 0.8) return CaseResult( - case_id=case["eval_id"], + case_id=case_id, score=score, passed=passed, - hard_fail=hard_fail, + hard_fail=score < cfg["gate"]["hard_fail_threshold"], key=bool(meta.get("key", False)), metrics={ "final_response": final_score, "tool_trajectory": tool_score, "rubric": rubric, }, - failure_types=failure_types, - reason="pass" if passed else "; ".join(failure_types), + failure_types=failures, + reason="pass" if passed else "; 
".join(failures or ["unknown"]), trace={ "query": query, "expected_text": expected_text, - "actual_text": actual["text"], + "actual_text": output["text"], "expected_tools": expected, - "actual_tools": actual["tools"], + "actual_tools": output["tools"], }, ) -def evaluate_evalset( +async def evaluate_evalset( evalset: dict[str, Any], - prompt: str, + prompt_path: Path, cfg: dict[str, Any], case_meta: dict[str, Any], + mode: str, ) -> dict[str, Any]: - """Score every case in an evalset and aggregate mean score and pass rate.""" - cases = [score_case(case, prompt, cfg, case_meta) for case in evalset["eval_cases"]] - mean = round(sum(case.score for case in cases) / len(cases), 4) + """Evaluate all cases in one train/validation evalset.""" + cases: list[CaseResult] = [] + for case in evalset["eval_cases"]: + query = text_field(case["conversation"][0], "user_content") + output = await produce_output(query=query, prompt_path=prompt_path, mode=mode) + cases.append(score_case(case, output, cfg, case_meta)) + mean = round(sum(item.score for item in cases) / len(cases), 4) return { "eval_set_id": evalset["eval_set_id"], "mean_score": mean, - "pass_rate": round(sum(case.passed for case in cases) / len(cases), 4), - "cases": {case.case_id: case.__dict__ for case in cases}, + "pass_rate": round(sum(item.passed for item in cases) / len(cases), 4), + "cases": {item.case_id: asdict(item) for item in cases}, } def attribute_failures(*results: dict[str, Any]) -> dict[str, Any]: - """Cluster baseline failures into category counts and a per-case breakdown. - - Args: - *results: One or more evaluated evalsets (baseline train / val). - - Returns: - A dict with ``counts`` (category -> frequency) and ``by_case`` - (case id -> failure categories), covering only failing cases. 
- """ + """Cluster baseline failures and count each explanation type.""" counts: Counter[str] = Counter() by_case: dict[str, list[str]] = {} for result in results: for case_id, case in result["cases"].items(): if case["passed"]: continue - failure_types = case["failure_types"] or ["unknown"] - by_case[case_id] = failure_types - counts.update(failure_types) + failures = case["failure_types"] or ["unknown"] + by_case[case_id] = failures + counts.update(failures) return {"counts": dict(counts), "by_case": by_case} -def optimize_prompt(baseline: str, cfg: dict[str, Any], run_dir: Path) -> tuple[str, dict[str, Any]]: - """Produce a candidate prompt for the current (fake/trace) mode. - - In fake/trace mode this applies the deterministic ``candidate_patch`` from - the config instead of invoking :class:`AgentOptimizer`, so the example stays - reproducible without an API key. The returned status distinguishes optimizer - *availability* from *invocation* to avoid implying a real search happened. - - Args: - baseline: The baseline prompt text. - cfg: The optimizer config. - run_dir: Directory for candidate prompt snapshots. - - Returns: - A tuple of ``(candidate_text, status_dict)``. 
- """ - candidate = baseline.rstrip() + "\n" + "\n".join(cfg["optimize"]["candidate_patch"]) + "\n" - candidate_path = run_dir / "candidate_prompt.md" - candidate_path.write_text(candidate, encoding="utf-8") - return candidate, { - "status": "SCRIPTED_CANDIDATE", - "algorithm": cfg["optimize"]["algorithm"], - "agent_optimizer_available": AgentOptimizer is not None and TargetPrompt is not None, - "agent_optimizer_invoked": False, - "note": "fake/trace mode applies a deterministic patch; see examples/optimization/quickstart for a live GEPA run.", - "candidate_prompt_path": candidate_path.relative_to(HERE).as_posix(), - "cost_usd": 0.0, - "tokens": 0, - } - - def diff_cases(baseline: dict[str, Any], candidate: dict[str, Any]) -> dict[str, Any]: - """Compute per-case deltas (new_pass / new_fail / score_up / score_down / same).""" - delta = {} + """Compare candidate against baseline case-by-case.""" + delta: dict[str, Any] = {} for case_id, cand in candidate["cases"].items(): base = baseline["cases"][case_id] if not base["passed"] and cand["passed"]: @@ -552,29 +461,7 @@ def gate_decision( cfg: dict[str, Any], cost_usd: float, ) -> dict[str, Any]: - """Run the validation-first acceptance gate and return its decision. - - Five independent, configurable checks must all pass to ACCEPT: - - 1. validation mean-score gain meets ``min_val_score_gain``; - 2. no new hard failure appears on validation; - 3. no *key* validation case regresses (new_fail / score_down); - 4. not overfitting (train up while validation down); - 5. optimization cost stays within ``max_cost_usd``. - - Args: - baseline_train: Baseline train evaluation. - candidate_train: Candidate train evaluation. - baseline_val: Baseline validation evaluation. - candidate_val: Candidate validation evaluation. - val_delta: Per-case validation deltas from :func:`diff_cases`. - cfg: Optimizer config (the ``gate`` block). - cost_usd: Optimization cost in USD. 
- - Returns: - A decision dict with ``accepted`` / ``decision`` / ``reason`` and the - per-check breakdown. - """ + """Apply the configurable validation-first acceptance gate.""" gate = cfg["gate"] train_gain = round(candidate_train["mean_score"] - baseline_train["mean_score"], 4) val_gain = round(candidate_val["mean_score"] - baseline_val["mean_score"], 4) @@ -583,12 +470,10 @@ def gate_decision( for case_id, case in candidate_val["cases"].items() if case["hard_fail"] and not baseline_val["cases"][case_id]["hard_fail"] ] - # A "critical" regression is one on a case explicitly marked key=true in - # case_meta.json, not merely any validation case. critical_regressions = [ case_id - for case_id, diff in val_delta.items() - if candidate_val["cases"][case_id]["key"] and diff["kind"] in {"new_fail", "score_down"} + for case_id, delta in val_delta.items() + if candidate_val["cases"][case_id]["key"] and delta["kind"] in {"new_fail", "score_down"} ] checks = [ { @@ -614,90 +499,137 @@ def gate_decision( { "name": "cost_budget", "passed": cost_usd <= gate["max_cost_usd"], - "detail": f"cost_usd={cost_usd:.4f}, budget={gate['max_cost_usd']:.4f}", + "detail": f"cost_usd={cost_usd:.6f}, budget={gate['max_cost_usd']:.6f}", }, ] - accepted = all(check["passed"] for check in checks) + accepted = all(item["passed"] for item in checks) return { "accepted": accepted, "decision": "ACCEPT" if accepted else "REJECT", - "reason": "all gates passed" if accepted else "; ".join(check["name"] for check in checks if not check["passed"]), + "reason": "all gates passed" if accepted else "; ".join(item["name"] for item in checks if not item["passed"]), "train_gain": train_gain, "val_gain": val_gain, "checks": checks, } -def narrate_report(report: dict[str, Any]) -> str: - """Render a data-driven, plain-language Chinese summary of the run. 
- - The narrative is derived entirely from the gate decision, validation deltas - and failure attribution, so it stays correct for any input (unlike a static - paragraph). It is deterministic and needs no model, keeping the no-key path - reproducible. - - Args: - report: The fully assembled report dict. - - Returns: - A short multi-sentence Chinese summary. - """ - gate = report["gate"] - verb = "接受" if gate["decision"] == "ACCEPT" else "拒绝" - parts = [ - f"本次({report['run']['mode']} 模式)决定**{verb}**候选 prompt。" - f"训练集均分 {report['baseline']['train']['mean_score']}→{report['candidate']['train']['mean_score']}" - f"({gate['train_gain']:+.4f}),验证集 {report['baseline']['val']['mean_score']}→" - f"{report['candidate']['val']['mean_score']}({gate['val_gain']:+.4f})。" +def precheck_live_mode() -> None: + """Fail fast before live mode spends time evaluating a broken environment.""" + if AgentOptimizer is None or TargetPrompt is None: + raise SystemExit( + "Live mode requires trpc_agent_sdk.evaluation.AgentOptimizer and " + f"TargetPrompt. SDK import error: {SDK_IMPORT_ERROR}" + ) + missing = [ + name + for name in ("TRPC_AGENT_API_KEY", "TRPC_AGENT_BASE_URL", "TRPC_AGENT_MODEL_NAME") + if not os.getenv(name) ] - if gate["train_gain"] > 0 and gate["val_gain"] < 0: - parts.append("训练涨但验证跌,呈现过拟合特征。") + if missing: + raise SystemExit( + "Live mode requires model credentials before baseline evaluation: " + + ", ".join(missing) + + ". Use --mode fake for the no-key path." 
+ ) - new_pass = [cid for cid, d in report["delta"]["val"].items() if d["kind"] == "new_pass"] - new_fail = [cid for cid, d in report["delta"]["val"].items() if d["kind"] == "new_fail"] - if new_pass: - parts.append(f"验证集新增通过:{'、'.join(new_pass)}。") - if new_fail: - parts.append(f"⚠️ 验证集新增失败:{'、'.join(new_fail)}。") - failed_checks = [c["name"] for c in gate["checks"] if not c["passed"]] - if gate["decision"] == "REJECT" and failed_checks: - parts.append("被以下 gate 拦截:" + "、".join(failed_checks) + "。") - elif gate["decision"] == "ACCEPT": - parts.append("五项 gate 全部通过:验证集提升达标、无过拟合、关键 case 未退化、无新增 hard fail、成本在预算内。") +def optimizer_fake(baseline_prompt: str, cfg: dict[str, Any], candidate_path: Path) -> tuple[str, dict[str, Any]]: + """Create a deterministic fake candidate without invoking AgentOptimizer.""" + candidate = baseline_prompt.rstrip() + "\n\n" + "\n".join(cfg["optimize"]["fake_candidate_patch"]) + "\n" + candidate_path.write_text(candidate, encoding="utf-8") + return candidate, { + "mode": "fake", + "status": "SCRIPTED_CANDIDATE", + "agent_optimizer_available": AgentOptimizer is not None, + "agent_optimizer_invoked": False, + "candidate_prompt_path": candidate_path.relative_to(HERE).as_posix(), + "cost_usd": 0.0, + "tokens": 0, + "rounds": 1, + } - counts = report["failure_attribution"]["counts"] - if counts: - top = "、".join(f"{k}×{v}" for k, v in counts.items()) - parts.append(f"baseline 失败归因:{top}。") - return "".join(parts) + +async def optimizer_live( + source_prompt_path: Path, + train_path: Path, + val_path: Path, + cfg: dict[str, Any], + candidate_path: Path, + run_dir: Path, +) -> tuple[str, dict[str, Any]]: + """Invoke SDK ``AgentOptimizer.optimize`` for the registered TargetPrompt. + + The source prompt is the snapshot under ``runs/latest``. ``update_source`` is + configurable but defaults to false so the example produces candidates for + review rather than silently overwriting the baseline prompt. 
+ """ + if AgentOptimizer is None or TargetPrompt is None: + raise SystemExit("Live mode requires trpc_agent_sdk.evaluation.AgentOptimizer and TargetPrompt.") + from agent.agent import make_call_agent + + sdk_config = resolve_path(cfg["optimize"]["sdk_config"]) + optimizer_dir = run_dir / "agent_optimizer" + target = TargetPrompt().add_path("system_prompt", str(source_prompt_path)) + started = time.perf_counter() + result = await AgentOptimizer.optimize( + config_path=str(sdk_config), + call_agent=make_call_agent(source_prompt_path), + target_prompt=target, + train_dataset_path=str(train_path), + validation_dataset_path=str(val_path), + output_dir=str(optimizer_dir), + update_source=bool(cfg["optimize"].get("update_source", False)), + verbose=int(cfg["optimize"].get("verbose", 1)), + ) + best = (result.best_prompts or {}).get("system_prompt") + if not best: + best = source_prompt_path.read_text(encoding="utf-8") + candidate_path.write_text(best, encoding="utf-8") + token_usage = getattr(result, "total_token_usage", None) or {} + return best, { + "mode": "live", + "status": getattr(result, "status", "UNKNOWN"), + "finish_reason": getattr(result, "finish_reason", None), + "agent_optimizer_available": True, + "agent_optimizer_invoked": True, + "sdk_output_dir": optimizer_dir.relative_to(HERE).as_posix(), + "candidate_prompt_path": candidate_path.relative_to(HERE).as_posix(), + "cost_usd": round(float(getattr(result, "total_llm_cost", 0.0) or 0.0), 6), + "tokens": token_usage.get("total", 0) if isinstance(token_usage, dict) else 0, + "rounds": getattr(result, "total_rounds", None), + "duration_seconds": round(time.perf_counter() - started, 4), + } def build_report( + *, + mode: str, cfg: dict[str, Any], baseline_prompt: str, candidate_prompt: str, + sdk_evaluator_runs: dict[str, Any], artifacts: dict[str, Any], - sdk_smoke: dict[str, Any], ) -> dict[str, Any]: - """Assemble the full audit report from run metadata and computed artifacts.""" + """Assemble the 
machine-readable issue-level audit report.""" return { "run": { "timestamp": datetime.now(timezone.utc).isoformat(), - "mode": cfg["mode"], + "mode": mode, "seed": cfg["seed"], "sdk_bridge": { - "evalset_validated_with_trpc_sdk": EvalSet is not None, "agent_evaluator_available": AgentEvaluator is not None, "agent_optimizer_available": AgentOptimizer is not None, - "agent_evaluator_trace_smoke": sdk_smoke, + "evalset_validated_with_trpc_sdk": EvalSet is not None, + "sdk_import_error": SDK_IMPORT_ERROR, + "agent_evaluator_trace_runs": sdk_evaluator_runs, }, "repro": { - "train_evalset": "train.evalset.json", - "val_evalset": "val.evalset.json", - "case_meta": "case_meta.json", - "optimizer_config": "optimizer.json", + "train_evalset": cfg["inputs"]["train_evalset"], + "val_evalset": cfg["inputs"]["val_evalset"], + "case_meta": cfg["inputs"]["case_meta"], "prompt_source": cfg["target_prompt"]["path"], + "optimizer_config": "optimizer.json", + "sdk_optimizer_config": cfg["optimize"].get("sdk_config"), }, }, "prompt_audit": { @@ -711,126 +643,201 @@ def build_report( } -def write_markdown(report: dict[str, Any], path: Path) -> None: - """Write the human-readable Markdown report, including the data-driven summary.""" +def render_summary(report: dict[str, Any]) -> str: + """Create a short human-readable decision summary for Markdown.""" gate = report["gate"] + new_pass = [case_id for case_id, item in report["delta"]["val"].items() if item["kind"] == "new_pass"] + new_fail = [case_id for case_id, item in report["delta"]["val"].items() if item["kind"] == "new_fail"] + return ( + f"Decision: {gate['decision']}. " + f"Train mean changed {report['baseline']['train']['mean_score']} -> " + f"{report['candidate']['train']['mean_score']} ({gate['train_gain']:+.4f}); " + f"validation mean changed {report['baseline']['val']['mean_score']} -> " + f"{report['candidate']['val']['mean_score']} ({gate['val_gain']:+.4f}). " + f"New validation passes: {new_pass or 'none'}. 
" + f"New validation failures: {new_fail or 'none'}. " + f"Gate reason: {gate['reason']}." + ) + + +def write_markdown(report: dict[str, Any], path: Path) -> None: + """Write the human-readable optimization_report.md artifact.""" lines = [ "# Optimization Report", "", - "## 人话总结", + "## Summary", "", - narrate_report(report), + render_summary(report), + "", + "## Scores", "", f"- Mode: `{report['run']['mode']}`", - f"- Decision: **{gate['decision']}**", - f"- Reason: {gate['reason']}", - f"- Baseline train score: {report['baseline']['train']['mean_score']}", - f"- Candidate train score: {report['candidate']['train']['mean_score']}", - f"- Baseline val score: {report['baseline']['val']['mean_score']}", - f"- Candidate val score: {report['candidate']['val']['mean_score']}", - f"- Train gain: {gate['train_gain']:+.4f}", - f"- Val gain: {gate['val_gain']:+.4f}", + f"- Baseline train mean: {report['baseline']['train']['mean_score']}", + f"- Candidate train mean: {report['candidate']['train']['mean_score']}", + f"- Baseline validation mean: {report['baseline']['val']['mean_score']}", + f"- Candidate validation mean: {report['candidate']['val']['mean_score']}", + f"- Decision: **{report['gate']['decision']}**", + f"- Reason: {report['gate']['reason']}", "", "## Failure Attribution", "", ] - for name, count in report["failure_attribution"]["counts"].items(): + for name, count in sorted(report["failure_attribution"]["counts"].items()): lines.append(f"- {name}: {count}") lines.extend(["", "## Validation Delta", ""]) - for case_id, diff in report["delta"]["val"].items(): - lines.append(f"- `{case_id}`: {diff['kind']} ({diff['baseline_score']} -> {diff['candidate_score']}, delta {diff['delta']:+.4f})") + for case_id, item in report["delta"]["val"].items(): + lines.append( + f"- `{case_id}`: {item['kind']} " + f"({item['baseline_score']} -> {item['candidate_score']}, {item['delta']:+.4f})" + ) lines.extend(["", "## Gate Checks", ""]) - for check in gate["checks"]: - mark = 
"PASS" if check["passed"] else "FAIL" - lines.append(f"- {mark} `{check['name']}`: {check['detail']}") + for check in report["gate"]["checks"]: + status = "PASS" if check["passed"] else "FAIL" + lines.append(f"- {status} `{check['name']}`: {check['detail']}") + lines.extend( + [ + "", + "## Audit", + "", + f"- Cost USD: {report['audit']['cost_usd']}", + f"- Tokens: {report['audit']['tokens']}", + f"- Duration seconds: {report['audit']['duration_seconds']}", + f"- Baseline SHA-256: `{report['prompt_audit']['baseline_sha256']}`", + f"- Candidate SHA-256: `{report['prompt_audit']['candidate_sha256']}`", + ] + ) path.write_text("\n".join(lines) + "\n", encoding="utf-8") -def main() -> None: - """Run the full evaluation + optimization loop and persist the audit report.""" - parser = argparse.ArgumentParser(description=__doc__) - parser.add_argument("--train", default="train.evalset.json") - parser.add_argument("--val", default="val.evalset.json") - parser.add_argument("--optimizer", default="optimizer.json") - parser.add_argument("--prompt", default="prompts/baseline_system.md") - args = parser.parse_args() - +async def run_pipeline(args: argparse.Namespace) -> None: + """Run all six issue-required stages and persist report artifacts.""" logging.basicConfig( - level=os.environ.get("YUN_LOG_LEVEL", "INFO"), + level=os.environ.get("EVAL_OPT_LOG_LEVEL", "INFO"), format="%(asctime)s %(levelname)s %(name)s | %(message)s", ) - - start = time.perf_counter() - cfg = load_json(HERE / args.optimizer) - train = validate_evalset(HERE / args.train) - val = validate_evalset(HERE / args.val) - case_meta = {k: v for k, v in load_json(HERE / cfg.get("case_meta", "case_meta.json")).items() if not k.startswith("_")} - logger.info("加载完成 mode=%s seed=%s train_cases=%d val_cases=%d", - cfg["mode"], cfg["seed"], len(train["eval_cases"]), len(val["eval_cases"])) - - sdk_smoke = sdk_trace_smoke(HERE / args.train) - logger.info("SDK trace 冒烟: %s", sdk_smoke["status"]) - baseline_prompt = (HERE / 
args.prompt).read_text(encoding="utf-8") + started = time.perf_counter() + cfg = load_json(resolve_path(args.optimizer)) + mode = args.mode or cfg.get("mode", "fake") + if mode not in {"fake", "live"}: + raise SystemExit("--mode must be fake or live") + if mode == "live": + precheck_live_mode() + + train_path = resolve_path(args.train or cfg["inputs"]["train_evalset"]) + val_path = resolve_path(args.val or cfg["inputs"]["val_evalset"]) + if train_path.resolve() == val_path.resolve(): + raise SystemExit("train and validation evalset paths must be different") + prompt_source = resolve_path(args.prompt or cfg["target_prompt"]["path"]) + case_meta = { + key: value + for key, value in load_json(resolve_path(cfg["inputs"]["case_meta"])).items() + if not key.startswith("_") + } + train = validate_evalset(train_path) + val = validate_evalset(val_path) run_dir = HERE / "runs" / "latest" + if run_dir.exists(): + shutil.rmtree(run_dir) run_dir.mkdir(parents=True, exist_ok=True) - (run_dir / "baseline_prompt.md").write_text(baseline_prompt, encoding="utf-8") + baseline_path = run_dir / "baseline_prompt.md" + candidate_path = run_dir / "candidate_prompt.md" + baseline_prompt = prompt_source.read_text(encoding="utf-8") + baseline_path.write_text(baseline_prompt, encoding="utf-8") - # Phase 1 — baseline evaluation. 
- baseline_train = evaluate_evalset(train, baseline_prompt, cfg, case_meta) - baseline_val = evaluate_evalset(val, baseline_prompt, cfg, case_meta) - logger.info("baseline 均分 train=%.4f val=%.4f", baseline_train["mean_score"], baseline_val["mean_score"]) + LOGGER.info("loaded mode=%s train_cases=%d val_cases=%d", mode, len(train["eval_cases"]), len(val["eval_cases"])) + sdk_evaluator_runs = { + "train": await sdk_trace_smoke(train_path), + "val": await sdk_trace_smoke(val_path), + } + LOGGER.info( + "AgentEvaluator trace runs: train=%s val=%s", + sdk_evaluator_runs["train"]["status"], + sdk_evaluator_runs["val"]["status"], + ) + + baseline_train = await evaluate_evalset(train, baseline_path, cfg, case_meta, mode) + baseline_val = await evaluate_evalset(val, baseline_path, cfg, case_meta, mode) + LOGGER.info("baseline mean train=%.4f val=%.4f", baseline_train["mean_score"], baseline_val["mean_score"]) - # Phase 2 — failure attribution over baseline failures only. failures = attribute_failures(baseline_train, baseline_val) - logger.info("baseline 失败归因: %s", failures["counts"]) - # Phase 3 — optimization (scripted candidate in fake/trace mode). 
- candidate_prompt, opt_status = optimize_prompt(baseline_prompt, cfg, run_dir) - logger.info("优化器: status=%s invoked=%s", opt_status["status"], opt_status["agent_optimizer_invoked"]) + if mode == "live": + candidate_prompt, optimizer_status = await optimizer_live( + source_prompt_path=baseline_path, + train_path=train_path, + val_path=val_path, + cfg=cfg, + candidate_path=candidate_path, + run_dir=run_dir, + ) + else: + candidate_prompt, optimizer_status = optimizer_fake(baseline_prompt, cfg, candidate_path) + LOGGER.info( + "optimizer status=%s invoked=%s", + optimizer_status["status"], + optimizer_status["agent_optimizer_invoked"], + ) + + candidate_train = await evaluate_evalset(train, candidate_path, cfg, case_meta, mode) + candidate_val = await evaluate_evalset(val, candidate_path, cfg, case_meta, mode) + LOGGER.info("candidate mean train=%.4f val=%.4f", candidate_train["mean_score"], candidate_val["mean_score"]) - # Phase 4 — candidate validation + diff. - candidate_train = evaluate_evalset(train, candidate_prompt, cfg, case_meta) - candidate_val = evaluate_evalset(val, candidate_prompt, cfg, case_meta) - logger.info("candidate 均分 train=%.4f val=%.4f", candidate_train["mean_score"], candidate_val["mean_score"]) train_delta = diff_cases(baseline_train, candidate_train) val_delta = diff_cases(baseline_val, candidate_val) + gate = gate_decision( + baseline_train, + candidate_train, + baseline_val, + candidate_val, + val_delta, + cfg, + optimizer_status["cost_usd"], + ) - # Phase 5 — acceptance gate. - gate = gate_decision(baseline_train, candidate_train, baseline_val, candidate_val, - val_delta, cfg, opt_status["cost_usd"]) - for check in gate["checks"]: - logger.info("gate %-30s %s | %s", check["name"], "PASS" if check["passed"] else "FAIL", check["detail"]) - logger.info("gate 决策: %s (%s)", gate["decision"], gate["reason"]) - duration = round(time.perf_counter() - start, 4) - - # Phase 6 — audit persistence. 
+ duration = round(time.perf_counter() - started, 4) report = build_report( - cfg, - baseline_prompt, - candidate_prompt, - { + mode=mode, + cfg=cfg, + baseline_prompt=baseline_prompt, + candidate_prompt=candidate_prompt, + sdk_evaluator_runs=sdk_evaluator_runs, + artifacts={ "baseline": {"train": baseline_train, "val": baseline_val}, "candidate": {"train": candidate_train, "val": candidate_val}, "delta": {"train": train_delta, "val": val_delta}, "failure_attribution": failures, - "optimizer": opt_status, + "optimizer": optimizer_status, "gate": gate, "audit": { "duration_seconds": duration, - "cost_usd": opt_status["cost_usd"], - "tokens": opt_status["tokens"], + "cost_usd": optimizer_status["cost_usd"], + "tokens": optimizer_status["tokens"], "config_snapshot": cfg, }, }, - sdk_smoke, ) write_json(HERE / "optimization_report.json", report) write_markdown(report, HERE / "optimization_report.md") - logger.info("已写出 optimization_report.json / .md,用时 %.4fs", duration) print(f"{gate['decision']}: {gate['reason']}") print("wrote optimization_report.json and optimization_report.md") +def parse_args() -> argparse.Namespace: + """Parse CLI flags for fake/live mode and alternate input files.""" + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument("--mode", choices=["fake", "live"], default=None) + parser.add_argument("--optimizer", default="optimizer.json") + parser.add_argument("--train", default=None) + parser.add_argument("--val", default=None) + parser.add_argument("--prompt", default=None) + return parser.parse_args() + + +def main() -> None: + """Entrypoint used by README commands and CI smoke checks.""" + asyncio.run(run_pipeline(parse_args())) + + if __name__ == "__main__": main() diff --git a/examples/optimization/eval_optimize_loop/runs/latest/baseline_prompt.md b/examples/optimization/eval_optimize_loop/runs/latest/baseline_prompt.md index b462b93..47e4741 100644 --- 
a/examples/optimization/eval_optimize_loop/runs/latest/baseline_prompt.md +++ b/examples/optimization/eval_optimize_loop/runs/latest/baseline_prompt.md @@ -1,7 +1,6 @@ -You are a concise life assistant. +You are a concise customer support assistant. Rules: -- Answer directly when possible. -- Use the weather tool for weather questions. -- Do not invent data. +- Answer directly when the answer is already known. +- Do not invent order, refund, or warranty facts. - Keep responses short. diff --git a/examples/optimization/eval_optimize_loop/runs/latest/candidate_prompt.md b/examples/optimization/eval_optimize_loop/runs/latest/candidate_prompt.md index ec588a4..152a493 100644 --- a/examples/optimization/eval_optimize_loop/runs/latest/candidate_prompt.md +++ b/examples/optimization/eval_optimize_loop/runs/latest/candidate_prompt.md @@ -1,11 +1,10 @@ -You are a concise life assistant. +You are a concise customer support assistant. Rules: -- Answer directly when possible. -- Use the weather tool for weather questions. -- Do not invent data. +- Answer directly when the answer is already known. +- Do not invent order, refund, or warranty facts. - Keep responses short. Optimization candidate: -- USE_UAPI_TOOLS: use get_my_public_ip, uapi_search, and query_holiday_calendar for IP/search/calendar questions. -- AGGRESSIVE_SEARCH: prefer uapi_search whenever a query looks underspecified. +- USE_CATALOG_LOOKUP: use lookup_order for order status and search_policy for policy/warranty questions before answering. +- AGGRESSIVE_LOOKUP: when uncertain, prefer looking up supporting data even for short or already-answerable requests. 
diff --git a/examples/optimization/eval_optimize_loop/train.evalset.json b/examples/optimization/eval_optimize_loop/train.evalset.json index 84564db..d1b5004 100644 --- a/examples/optimization/eval_optimize_loop/train.evalset.json +++ b/examples/optimization/eval_optimize_loop/train.evalset.json @@ -1,63 +1,63 @@ { "eval_set_id": "eval_optimize_loop_train", "name": "Evaluation optimization loop train set", - "description": "Three trace-mode training cases: two optimizable failures and one intentionally ineffective optimization case.", + "description": "Three training cases: two optimizable failures and one intentionally ineffective JSON-format failure.", "eval_cases": [ { - "eval_id": "train_ip_lookup_optimizable", + "eval_id": "train_order_lookup_optimizable", "eval_mode": "trace", "actual_conversation": [ { "invocation_id": "actual-1", - "user_content": {"role": "user", "parts": [{"text": "查询我的公网 IP"}]}, - "final_response": {"role": "model", "parts": [{"text": "我无法确定你的公网 IP。"}]}, + "user_content": {"role": "user", "parts": [{"text": "What is the shipping status for order A100?"}]}, + "final_response": {"role": "model", "parts": [{"text": "I do not have enough order data."}]}, "intermediate_data": {"tool_uses": []} } ], "conversation": [ { "invocation_id": "expected-1", - "user_content": {"role": "user", "parts": [{"text": "查询我的公网 IP"}]}, - "final_response": {"role": "model", "parts": [{"text": "203.0.113.10"}]}, + "user_content": {"role": "user", "parts": [{"text": "What is the shipping status for order A100?"}]}, + "final_response": {"role": "model", "parts": [{"text": "in transit"}]}, "intermediate_data": { "tool_uses": [ - {"id": "tool-1", "name": "get_my_public_ip", "args": {"source": "commercial"}} + {"id": "tool-1", "name": "lookup_order", "args": {"order_id": "A100"}} ] } } ] }, { - "eval_id": "train_calendar_optimizable", + "eval_id": "train_refund_policy_optimizable", "eval_mode": "trace", "actual_conversation": [ { "invocation_id": "actual-2", - 
"user_content": {"role": "user", "parts": [{"text": "2026-10-01 是不是休息日?"}]}, - "final_response": {"role": "model", "parts": [{"text": "这天大概率是节日,但我没有查询。"}]}, + "user_content": {"role": "user", "parts": [{"text": "What is the refund policy for damaged items?"}]}, + "final_response": {"role": "model", "parts": [{"text": "You may be eligible, but I cannot confirm the policy."}]}, "intermediate_data": {"tool_uses": []} } ], "conversation": [ { "invocation_id": "expected-2", - "user_content": {"role": "user", "parts": [{"text": "2026-10-01 是不是休息日?"}]}, - "final_response": {"role": "model", "parts": [{"text": "休息日"}]}, + "user_content": {"role": "user", "parts": [{"text": "What is the refund policy for damaged items?"}]}, + "final_response": {"role": "model", "parts": [{"text": "full refund within 30 days"}]}, "intermediate_data": { "tool_uses": [ - {"id": "tool-1", "name": "query_holiday_calendar", "args": {"date": "2026-10-01", "holiday_type": "legal"}} + {"id": "tool-1", "name": "search_policy", "args": {"topic": "damaged item refund"}} ] } } ] }, { - "eval_id": "train_strict_json_ineffective", + "eval_id": "train_json_format_ineffective", "eval_mode": "trace", "actual_conversation": [ { "invocation_id": "actual-3", - "user_content": {"role": "user", "parts": [{"text": "只返回 JSON:status ok"}]}, + "user_content": {"role": "user", "parts": [{"text": "Return only JSON: status ok"}]}, "final_response": {"role": "model", "parts": [{"text": "status ok"}]}, "intermediate_data": {"tool_uses": []} } @@ -65,7 +65,7 @@ "conversation": [ { "invocation_id": "expected-3", - "user_content": {"role": "user", "parts": [{"text": "只返回 JSON:status ok"}]}, + "user_content": {"role": "user", "parts": [{"text": "Return only JSON: status ok"}]}, "final_response": {"role": "model", "parts": [{"text": "{\"status\":\"ok\"}"}]}, "intermediate_data": {"tool_uses": []} } diff --git a/examples/optimization/eval_optimize_loop/val.evalset.json b/examples/optimization/eval_optimize_loop/val.evalset.json 
index 4b2787a..4449ae1 100644 --- a/examples/optimization/eval_optimize_loop/val.evalset.json +++ b/examples/optimization/eval_optimize_loop/val.evalset.json @@ -1,19 +1,19 @@ { "eval_set_id": "eval_optimize_loop_val", "name": "Evaluation optimization loop validation set", - "description": "Three trace-mode validation cases: one new pass, one hard regression, and one soft degradation.", + "description": "Three validation cases: one new pass, one hard regression, and one soft degradation.", "eval_cases": [ { - "eval_id": "val_search_fallback_new_pass", + "eval_id": "val_warranty_new_pass", "eval_mode": "trace", "actual_conversation": [ { "invocation_id": "actual-1", - "user_content": {"role": "user", "parts": [{"text": "默认搜索不够时,查一下 Go 最新版本"}]}, - "final_response": {"role": "model", "parts": [{"text": "Go 有新版本,但我没有足够信息确认。"}]}, + "user_content": {"role": "user", "parts": [{"text": "What is the warranty period for Model Z?"}]}, + "final_response": {"role": "model", "parts": [{"text": "I am not sure about the Model Z warranty."}]}, "intermediate_data": { "tool_uses": [ - {"id": "tool-1", "name": "websearch", "args": {"query": "Go 最新版本"}} + {"id": "tool-1", "name": "web_search", "args": {"query": "Model Z warranty"}} ] } } @@ -21,47 +21,47 @@ "conversation": [ { "invocation_id": "expected-1", - "user_content": {"role": "user", "parts": [{"text": "默认搜索不够时,查一下 Go 最新版本"}]}, - "final_response": {"role": "model", "parts": [{"text": "Go 1.26"}]}, + "user_content": {"role": "user", "parts": [{"text": "What is the warranty period for Model Z?"}]}, + "final_response": {"role": "model", "parts": [{"text": "24-month warranty"}]}, "intermediate_data": { "tool_uses": [ - {"id": "tool-1", "name": "uapi_search", "args": {"query": "Go 最新版本"}} + {"id": "tool-1", "name": "search_policy", "args": {"topic": "Model Z warranty"}} ] } } ] }, { - "eval_id": "val_smalltalk_no_tool_regression", + "eval_id": "val_smalltalk_regression", "eval_mode": "trace", "actual_conversation": [ { 
"invocation_id": "actual-2", - "user_content": {"role": "user", "parts": [{"text": "在吗"}]}, - "final_response": {"role": "model", "parts": [{"text": "在。"}]}, + "user_content": {"role": "user", "parts": [{"text": "Thanks"}]}, + "final_response": {"role": "model", "parts": [{"text": "You are welcome."}]}, "intermediate_data": {"tool_uses": []} } ], "conversation": [ { "invocation_id": "expected-2", - "user_content": {"role": "user", "parts": [{"text": "在吗"}]}, - "final_response": {"role": "model", "parts": [{"text": "在"}]}, + "user_content": {"role": "user", "parts": [{"text": "Thanks"}]}, + "final_response": {"role": "model", "parts": [{"text": "welcome"}]}, "intermediate_data": {"tool_uses": []} } ] }, { - "eval_id": "val_weather_soft_degradation", + "eval_id": "val_order_soft_degradation", "eval_mode": "trace", "actual_conversation": [ { "invocation_id": "actual-3", - "user_content": {"role": "user", "parts": [{"text": "北京天气怎么样?"}]}, - "final_response": {"role": "model", "parts": [{"text": "北京今天有天气信息。"}]}, + "user_content": {"role": "user", "parts": [{"text": "Check order A200."}]}, + "final_response": {"role": "model", "parts": [{"text": "Order A200 is delivered."}]}, "intermediate_data": { "tool_uses": [ - {"id": "tool-1", "name": "get_current_weather", "args": {"city": "北京"}} + {"id": "tool-1", "name": "lookup_order", "args": {"order_id": "A200"}} ] } } @@ -69,11 +69,11 @@ "conversation": [ { "invocation_id": "expected-3", - "user_content": {"role": "user", "parts": [{"text": "北京天气怎么样?"}]}, - "final_response": {"role": "model", "parts": [{"text": "北京"}]}, + "user_content": {"role": "user", "parts": [{"text": "Check order A200."}]}, + "final_response": {"role": "model", "parts": [{"text": "delivered"}]}, "intermediate_data": { "tool_uses": [ - {"id": "tool-1", "name": "get_current_weather", "args": {"city": "北京"}} + {"id": "tool-1", "name": "lookup_order", "args": {"order_id": "A200"}} ] } }