From b54d0263cb0a5a48809a713e6f4f7b6de6df1f0c Mon Sep 17 00:00:00 2001 From: han12580 <3418728804@qq.com> Date: Wed, 1 Jul 2026 20:21:33 +0800 Subject: [PATCH] examples: add evaluation + optimization closed-loop pipeline Add a reproducible Evaluation + Optimization pipeline under examples/optimization/eval_optimize_loop that wires AgentEvaluator and AgentOptimizer into a single auditable loop: baseline evaluation, failure attribution, prompt optimization over a three-field TargetPrompt, candidate validation with per-case delta, a configurable acceptance gate, and audit artifacts. The pipeline records per-case metric scores, pass/fail, failure reasons and key trajectory during baseline evaluation, clusters failures into six categories, re-runs the validation set on the candidate to distinguish newly passed, newly failed, improved and regressed cases, and rejects overfitting candidates that improve on train but regress on validation. It emits optimization_report.json and optimization_report.md, and persists per-round candidate prompts, cost, duration and the reproducibility config. A default offline fake backend runs the whole loop deterministically without an API key in a few seconds, while a real backend drives a live multi-agent setup and the real GEPA optimizer when model credentials are provided. Six sample cases cover the optimizable, ineffective and regressing scenarios. Fixes #91 RELEASE NOTES: Add an evaluation + optimization closed-loop example under examples/optimization/eval_optimize_loop. 
Co-Authored-By: Claude Opus 4.8 --- .../eval_optimize_loop/.gitignore | 3 + .../optimization/eval_optimize_loop/DESIGN.md | 35 ++ .../optimization/eval_optimize_loop/README.md | 120 +++++ .../eval_optimize_loop/agent/__init__.py | 29 ++ .../eval_optimize_loop/agent/config.py | 36 ++ .../eval_optimize_loop/agent/fake_backend.py | 130 ++++++ .../eval_optimize_loop/agent/orchestrator.py | 99 ++++ .../agent/prompts/router.md | 9 + .../eval_optimize_loop/agent/prompts/skill.md | 10 + .../agent/prompts/system.md | 12 + .../eval_optimize_loop/config.json | 9 + .../data/train.evalset.json | 58 +++ .../eval_optimize_loop/data/val.evalset.json | 58 +++ .../eval_optimize_loop/eval_metrics.json | 14 + .../optimization_report.json | 431 ++++++++++++++++++ .../eval_optimize_loop/optimization_report.md | 63 +++ .../eval_optimize_loop/optimizer.json | 38 ++ .../eval_optimize_loop/pipeline/__init__.py | 6 + .../pipeline/attribution.py | 160 +++++++ .../eval_optimize_loop/pipeline/evaluate.py | 171 +++++++ .../eval_optimize_loop/pipeline/gate.py | 146 ++++++ .../eval_optimize_loop/pipeline/optimize.py | 172 +++++++ .../eval_optimize_loop/pipeline/report.py | 279 ++++++++++++ .../eval_optimize_loop/run_pipeline.py | 171 +++++++ .../real_sample.optimization_report.json | 397 ++++++++++++++++ .../real_sample.optimization_report.md | 56 +++ 26 files changed, 2712 insertions(+) create mode 100644 examples/optimization/eval_optimize_loop/.gitignore create mode 100644 examples/optimization/eval_optimize_loop/DESIGN.md create mode 100644 examples/optimization/eval_optimize_loop/README.md create mode 100644 examples/optimization/eval_optimize_loop/agent/__init__.py create mode 100644 examples/optimization/eval_optimize_loop/agent/config.py create mode 100644 examples/optimization/eval_optimize_loop/agent/fake_backend.py create mode 100644 examples/optimization/eval_optimize_loop/agent/orchestrator.py create mode 100644 examples/optimization/eval_optimize_loop/agent/prompts/router.md create 
mode 100644 examples/optimization/eval_optimize_loop/agent/prompts/skill.md create mode 100644 examples/optimization/eval_optimize_loop/agent/prompts/system.md create mode 100644 examples/optimization/eval_optimize_loop/config.json create mode 100644 examples/optimization/eval_optimize_loop/data/train.evalset.json create mode 100644 examples/optimization/eval_optimize_loop/data/val.evalset.json create mode 100644 examples/optimization/eval_optimize_loop/eval_metrics.json create mode 100644 examples/optimization/eval_optimize_loop/optimization_report.json create mode 100644 examples/optimization/eval_optimize_loop/optimization_report.md create mode 100644 examples/optimization/eval_optimize_loop/optimizer.json create mode 100644 examples/optimization/eval_optimize_loop/pipeline/__init__.py create mode 100644 examples/optimization/eval_optimize_loop/pipeline/attribution.py create mode 100644 examples/optimization/eval_optimize_loop/pipeline/evaluate.py create mode 100644 examples/optimization/eval_optimize_loop/pipeline/gate.py create mode 100644 examples/optimization/eval_optimize_loop/pipeline/optimize.py create mode 100644 examples/optimization/eval_optimize_loop/pipeline/report.py create mode 100644 examples/optimization/eval_optimize_loop/run_pipeline.py create mode 100644 examples/optimization/eval_optimize_loop/samples/real_sample.optimization_report.json create mode 100644 examples/optimization/eval_optimize_loop/samples/real_sample.optimization_report.md diff --git a/examples/optimization/eval_optimize_loop/.gitignore b/examples/optimization/eval_optimize_loop/.gitignore new file mode 100644 index 0000000..1f08172 --- /dev/null +++ b/examples/optimization/eval_optimize_loop/.gitignore @@ -0,0 +1,3 @@ +runs/ +__pycache__/ +*.pyc diff --git a/examples/optimization/eval_optimize_loop/DESIGN.md b/examples/optimization/eval_optimize_loop/DESIGN.md new file mode 100644 index 0000000..c0c899f --- /dev/null +++ b/examples/optimization/eval_optimize_loop/DESIGN.md @@ 
-0,0 +1,35 @@ +# 方案设计说明 + +本示例把 `AgentEvaluator` 与 `AgentOptimizer` 串成「评测 → 失败归因 → prompt 优化 +→ 回归验证 → 产物审计」的自动闭环,目标是判断一次优化是否**真的值得进生产**, +而非仅仅让训练分变高。 + +## 失败归因方法 + +Baseline 评测后,对每条未达标 case 做单标签归因,落入六类之一:最终回复不匹配、 +工具调用错误、参数错误、LLM rubric 不达标、知识召回不足、格式不符合要求。real 模式 +用一个 LLM 裁判读『题面 / 期望答案 / 实际答复 / 运行错误』输出 `{category, reason}`; +fake 模式用确定性规则(拒答→知识召回不足;数值不符→参数错误;数值对但串不匹配→ +格式不符;运行报错→工具调用错误)。两种后端输出结构一致,并聚类成类别计数, +指导优化器聚焦真正的缺陷,且每条失败都附一句可解释原因。 + +## 接受策略 + +优化产出候选后,在**验证集**上逐 case 与 baseline 对比(新增通过 / 新增失败 / +分升 / 分降 / 不变),再过一道可配置 gate:验证集均分提升需 ≥ 阈值、不得新增 +hard fail(原通过转失败)、关键 case 不得退化、成本不超预算。任一规则不过即拒绝。 +拒绝是有效的负决策(退出码 2),不是流程错误。 + +## 防过拟合策略 + +严格 train/val 隔离:优化器只在训练集反思,验证集仅用于接受判定,从不参与改写。 +候选会在训练集上重跑一次,报告显式给出 `overfitting_signal`——「训练涨而验证不涨」 +即高亮。示例内置过拟合候选:它学会乘法(训练 +0.33)却带入「大数默认按乘法」副作用, +使一条大数加法验证题由通过转失败,`forbid_new_hard_fail` 据此拒绝。 + +## 产物审计方式 + +每次运行落到独立 `runs/<run_id>/`,记录 baseline/candidate 逐 case 评测、失败归因、 +逐 case delta、gate 决策与逐规则理由、候选 prompt 全文、成本、耗时、随机种子与数据 +路径,输出 `optimization_report.json` 与 `optimization_report.md`,使「为何接受/拒绝」 +完全可复现、可审计。 diff --git a/examples/optimization/eval_optimize_loop/README.md b/examples/optimization/eval_optimize_loop/README.md new file mode 100644 index 0000000..7c05fb7 --- /dev/null +++ b/examples/optimization/eval_optimize_loop/README.md @@ -0,0 +1,120 @@ +# eval_optimize_loop · 评测 + 优化自动闭环 + +把 `AgentEvaluator`(评测)与 `AgentOptimizer`(优化)串成一条可复现、可审计的闭环: + +``` +Baseline 评测 → 失败归因 → prompt 优化 → 候选验证(逐 case delta) → 接受门控 → 审计落盘 +``` + +它回答的不是"能不能跑一次优化",而是"这次优化**是否真的值得接受**"——是否提升、 +是否牺牲其他指标、是否过拟合、是否值得回写源 prompt。 + +## 快速开始 + +```bash +# 离线 fake 模式:无需 API Key,确定性、可复现,秒级完成(默认) +python run_pipeline.py + +# 真实模式:需要 TRPC_AGENT_API_KEY / TRPC_AGENT_BASE_URL / TRPC_AGENT_MODEL_NAME +python run_pipeline.py --mode real +``` + +产物写入 `runs/<run_id>/`,含 `optimization_report.json` 与 `optimization_report.md`。 +退出码:**0 = 接受候选**,**2 = 拒绝候选**(拒绝是有效负决策,便于 CI 判定)。 + +仓库已提交两份示例输出: +- 规范样例(fake 模式,确定性):[optimization_report.json](optimization_report.json) / +
[optimization_report.md](optimization_report.md) —— 完整复现"成功/无效/退化"三场景与过拟合拒绝。 +- 真实链路样例(real 模式,实跑于一个 OpenAI 兼容端点): + [samples/real_sample.optimization_report.md](samples/real_sample.optimization_report.md)。 + 该次真实模型在验证集 baseline 已 3/3 满分(真实 LLM 不受 fake 的 `@cap` 能力标记约束, + 乘法本就会做),故 GEPA 0 轮无可优化,门控据此以"无提升"正确拒绝——这也是一种有效负决策。 + +## 双后端设计(为什么有 fake) + +issue 要求 "没有真实 API Key 时也能跑通核心流程" 且 "fake 模式 ≤ 3 分钟"。因此 +pipeline 是**双后端**:编排、评测、门控、报告四层两档**完全共用真实代码**,fake 只替换 +两个要花钱/联网的点。 + +| 阶段 | fake(默认·离线) | real(配 Key) | +|---|---|---| +| Agent 推理 | `agent/fake_backend.py` 确定性求解 | `agent/orchestrator.py` 真实多 agent + `LlmAgent` | +| 评测打分 | 真实 `AgentEvaluator` + text-contains | 同左 | +| 失败归因 | 确定性规则桩 | 纯 LLM 裁判 | +| 优化 | 脚本化候选 | 真实 `AgentOptimizer`(GEPA) | + +fake 后端从 prompt 文件里的 `@cap:` 能力标记决定行为,于是"改 prompt"被 +映射成"改能力集合",让每条 case 的 pass/fail 随候选确定性翻转——这正是稳定复现三类 +场景所需的可控信号。 + +**无 Key 跑通的三种途径**(issue 要求 fake judge / fake model / trace mode): +- **fake model**:默认档,`call_agent` 换成确定性求解器(本示例采用)。 +- **fake judge**:评测 metric 默认用 `final_response` 文本匹配,不调用任何裁判模型; + 若改用 LLM-rubric 指标,把判分入口替换为规则桩即可(`pipeline/attribution.py` 的 + `classify_fake` 即是失败归因的 fake judge 实现)。 +- **trace mode**:evalset case 可携带预录 `intermediate_data`,`AgentEvaluator` 以 + trace 直接评分而不驱动 agent(SDK 原生支持,无需模型)。 + +## 优化目标(三字段 TargetPrompt) + +对应 `agent/prompts/` 三个文件,`round_robin` 每轮只改一个便于归因: + +- `router` — [router.md](agent/prompts/router.md):题型分流 +- `system_prompt` — [system.md](agent/prompts/system.md):输出格式约束 +- `skill` — [skill.md](agent/prompts/skill.md):解题题型能力 + +## 样例 case 与三类场景 + +6 条 case(3 训练 / 3 验证,见 [data/](data/)),覆盖 issue 要求的三类: + +| 场景 | case | baseline → candidate | +|---|---|---| +| 可优化成功 | `train_mul_car` / `val_mul_box` | FAIL → PASS(学会乘法) | +| 优化无效 | `train_discount_shirt` | FAIL → FAIL(折扣仍不会) | +| 优化后退化 | `val_add_class` | PASS → FAIL(大数加法被过拟合规则误算) | + +候选在训练集 +0.33 却在验证集出现新增失败 → **门控拒绝**,正是"训练涨、验证退"的 +过拟合必须挡下的情形。 + +## 失败归因(六类) + +最终回复不匹配 / 工具调用错误 / 参数错误 / LLM rubric 不达标 / 知识召回不足 /
+格式不符合要求。每条失败至少给出一条可解释原因,并聚类成类别计数(见报告第 2 节)。 + +## 接受门控(可配置,[config.json](config.json)) + +```json +"gate": { + "min_val_score_delta": 0.05, // 验证集均分提升需 ≥ 此值 + "forbid_new_hard_fail": true, // 不得新增 hard fail(原通过转失败) + "key_case_ids": ["val_add_class"],// 关键 case 不得退化 + "cost_budget_usd": 1.0 // 成本预算 +} +``` + +任一规则不过即拒绝。 + +## 目录结构 + +``` +eval_optimize_loop/ +├── run_pipeline.py # 入口:六阶段编排 +├── config.json # gate + seed +├── optimizer.json # AgentOptimizer(GEPA) 配置(real 模式) +├── eval_metrics.json # 共享评测 metric(final_response contains) +├── DESIGN.md # 300–500 字方案说明 +├── optimization_report.json # 示例输出(fake 模式) +├── optimization_report.md +├── data/ # train.evalset.json / val.evalset.json(6 条) +├── agent/ +│ ├── orchestrator.py # real 档:router→solver(system+skill) +│ ├── fake_backend.py # fake 档:确定性求解器 +│ ├── config.py # real 档模型配置(读环境变量) +│ └── prompts/ # router.md / system.md / skill.md +└── pipeline/ + ├── evaluate.py # AgentEvaluator → 结构化逐 case + ├── attribution.py # 六类失败归因(LLM / 规则) + ├── optimize.py # AgentOptimizer 包装 + 脚本化候选 + ├── gate.py # 逐 case delta + 接受门控 + └── report.py # optimization_report.{json,md} +``` diff --git a/examples/optimization/eval_optimize_loop/agent/__init__.py b/examples/optimization/eval_optimize_loop/agent/__init__.py new file mode 100644 index 0000000..cd2de93 --- /dev/null +++ b/examples/optimization/eval_optimize_loop/agent/__init__.py @@ -0,0 +1,29 @@ +# Tencent is pleased to support the open source community by making tRPC-Agent-Python available. +# +# Copyright (C) 2026 Tencent. All rights reserved. +# +# tRPC-Agent-Python is licensed under Apache-2.0. 
+"""被优化的 agent:三个 prompt 字段(router/system/skill) + 双后端(real/fake)。""" + +from __future__ import annotations + +from .orchestrator import ( + ROUTER_PROMPT_PATH, + SKILL_PROMPT_PATH, + SYSTEM_PROMPT_PATH, +) + +# TargetPrompt 字段名 -> prompt 文件路径。pipeline 各阶段共用这一份映射: +# real 模式喂给 AgentOptimizer 的 TargetPrompt,fake 模式喂给确定性求解器。 +PROMPT_PATHS = { + "router": ROUTER_PROMPT_PATH, + "system_prompt": SYSTEM_PROMPT_PATH, + "skill": SKILL_PROMPT_PATH, +} + +__all__ = [ + "PROMPT_PATHS", + "ROUTER_PROMPT_PATH", + "SYSTEM_PROMPT_PATH", + "SKILL_PROMPT_PATH", +] diff --git a/examples/optimization/eval_optimize_loop/agent/config.py b/examples/optimization/eval_optimize_loop/agent/config.py new file mode 100644 index 0000000..89c44aa --- /dev/null +++ b/examples/optimization/eval_optimize_loop/agent/config.py @@ -0,0 +1,36 @@ +# Tencent is pleased to support the open source community by making tRPC-Agent-Python available. +# +# Copyright (C) 2026 Tencent. All rights reserved. +# +# tRPC-Agent-Python is licensed under Apache-2.0. 
+"""真实模式的模型配置:从环境变量读取,供 orchestrator 与 LLM judge 共用。""" + +from __future__ import annotations + +import os + + +def get_model_config() -> tuple[str, str, str]: + """返回 (api_key, base_url, model_name),缺失时抛出清晰错误。 + + 仅在 real 模式下被调用;fake 模式不会触碰模型配置,因此无 key 也能跑。 + """ + api_key = os.getenv("TRPC_AGENT_API_KEY", "") + base_url = os.getenv("TRPC_AGENT_BASE_URL", "") + model_name = os.getenv("TRPC_AGENT_MODEL_NAME", "") + missing = [ + name + for name, val in ( + ("TRPC_AGENT_API_KEY", api_key), + ("TRPC_AGENT_BASE_URL", base_url), + ("TRPC_AGENT_MODEL_NAME", model_name), + ) + if not val + ] + if missing: + raise RuntimeError( + "real 模式需要以下环境变量: " + + ", ".join(missing) + + "。若无 API Key,请改用 fake 模式:python run_pipeline.py --mode fake" + ) + return api_key, base_url, model_name diff --git a/examples/optimization/eval_optimize_loop/agent/fake_backend.py b/examples/optimization/eval_optimize_loop/agent/fake_backend.py new file mode 100644 index 0000000..b76ff9a --- /dev/null +++ b/examples/optimization/eval_optimize_loop/agent/fake_backend.py @@ -0,0 +1,130 @@ +# Tencent is pleased to support the open source community by making tRPC-Agent-Python available. +# +# Copyright (C) 2026 Tencent. All rights reserved. +# +# tRPC-Agent-Python is licensed under Apache-2.0. 
+"""确定性 fake agent 后端:无 API Key 也能跑通完整 pipeline。 + +设计目标 +-------- +issue 要求 fake / trace 模式下 3 分钟内跑通闭环,并且能稳定复现三类场景: +可优化成功、优化无效、优化后退化(过拟合)。为此本模块提供一个**不依赖 +任何真实 LLM** 的求解器:它只从当前 prompt 文件里解析「能力标记」(``@cap:`` +注释行),据此决定这次能不能解题、格式对不对。 + +于是"改 prompt"这个动作被映射成"改能力集合",从而让每条 case 的 pass/fail +可以随 prompt 候选确定性翻转——这正是演示评测→优化闭环所需要的可控信号, +同时又完全离线、可复现(固定 seed 无关,无随机)。 + +能力标记(写在 prompts/*.md 里的 ````) +------------------------------------------------------ +- ``op-add`` / ``op-mul`` / ``op-discount`` : 求解器掌握的运算(一般放在 skill.md) +- ``fmt-answer-prefix`` : 最终答复以「答案:」开头(system.md) +- ``fmt-unit-suffix`` : 数字后带单位(system.md) +- ``route-ok`` : 路由器能正确分流(router.md) +- ``assume-mul-default`` : **过拟合副作用**——对含大操作数(>=10) + 的加法题过度使用乘法,故意制造回归 + +真实模式请改用 :mod:`agent.orchestrator`(真正的多 agent + LlmAgent)。 +""" + +from __future__ import annotations + +import re +from pathlib import Path + + +_CAP_RE = re.compile(r"@cap:\s*([a-z0-9\-]+)", re.IGNORECASE) +_NUM_RE = re.compile(r"\d+(?:\.\d+)?") + +REFUSAL = "抱歉,我暂时无法解答这道题。" + + +def read_caps(*prompt_paths: Path) -> set[str]: + """从若干 prompt 文件里解析全部能力标记,合成一个能力集合。""" + caps: set[str] = set() + for path in prompt_paths: + try: + text = Path(path).read_text(encoding="utf-8") + except FileNotFoundError: + continue + caps.update(m.group(1).lower() for m in _CAP_RE.finditer(text)) + return caps + + +def _detect_operation(query: str) -> str: + """从题面关键词判断运算类型:discount / mul / add。""" + if "折" in query: + return "discount" + if "每" in query: # “每小时”“每盒” 这类单价/速率题 → 乘法 + return "mul" + return "add" + + +def _detect_unit(query: str) -> str: + """从题面关键词判断单位。顺序敏感:先匹配更具体的单位。""" + if "公里" in query: + return "公里" + if "元" in query: + return "元" + if any(k in query for k in ("人", "男生", "女生", "名")): + return "人" + return "个" + + +def _format_number(value: float) -> str: + """整数值去掉小数尾巴:150.0 -> '150'。""" + if value == int(value): + return str(int(value)) + return str(value) + + +def solve(query: str, caps: set[str]) -> str: + """确定性求解:根据能力集合返回最终答复文本。 + + 这是 fake agent 
的全部"智能"。改动 prompt(即改动 ``caps``)会确定性地 + 改变返回值,从而让评测分数随候选 prompt 翻转。 + """ + operation = _detect_operation(query) + unit = _detect_unit(query) + numbers = [float(n) for n in _NUM_RE.findall(query)] + + # 1) 能力缺失 → 如实拒答(映射到"知识召回不足"类失败) + required_cap = {"add": "op-add", "mul": "op-mul", "discount": "op-discount"}[operation] + if required_cap not in caps: + return REFUSAL + if len(numbers) < 2: + return REFUSAL + + a, b = numbers[0], numbers[1] + + # 2) 计算数值 + if operation == "add": + # 过拟合副作用:assume-mul-default 让求解器对含大操作数的加法题 + # 过度使用乘法 → 大数加法题被算错(制造验证集回归)。 + if "assume-mul-default" in caps and (a >= 10 or b >= 10): + result = a * b + else: + result = a + b + elif operation == "mul": + result = a * b + else: # discount:原价 * 折数/10 + result = a * (b / 10.0) + + # 3) 套用格式 + body = _format_number(result) + if "fmt-unit-suffix" in caps: + body = f"{body} {unit}" + if "fmt-answer-prefix" in caps: + body = f"答案:{body}" + return body + + +async def call_agent_fake(query: str, prompt_paths: dict[str, Path]) -> str: + """框架回调(fake 版):读当前 prompt 能力集合 → 确定性求解。 + + 与真实 ``call_agent`` 保持同签名(``query -> str``),由 pipeline 通过 + ``functools.partial`` 绑定 ``prompt_paths`` 后传入 AgentEvaluator。 + """ + caps = read_caps(*prompt_paths.values()) + return solve(query, caps) diff --git a/examples/optimization/eval_optimize_loop/agent/orchestrator.py b/examples/optimization/eval_optimize_loop/agent/orchestrator.py new file mode 100644 index 0000000..152ca3a --- /dev/null +++ b/examples/optimization/eval_optimize_loop/agent/orchestrator.py @@ -0,0 +1,99 @@ +# Tencent is pleased to support the open source community by making tRPC-Agent-Python available. +# +# Copyright (C) 2026 Tencent. All rights reserved. +# +# tRPC-Agent-Python is licensed under Apache-2.0. 
+"""真实模式的多 agent 链路:router → solver(system + skill) → 最终答复。 + +链路形态:: + + 用户问题 → router(分流) → solver(用 system.md 定格式 + skill.md 定题型) → 答复 + +这是 real 模式(配了 TRPC_AGENT_* 时)真正驱动的 agent。三个 prompt 文件正好 +对应 TargetPrompt 的三个优化字段:router / system_prompt / skill。 + +prompt 热加载:每次 invoke 都重读 prompt 文件——优化器写入候选后下一次调用 +即生效,无需重启进程。 + +fake 模式不会用到本文件,改用 :mod:`agent.fake_backend` 的确定性求解器。 +""" + +from __future__ import annotations + +import uuid +from pathlib import Path + +from trpc_agent_sdk.agents import LlmAgent +from trpc_agent_sdk.models import OpenAIModel +from trpc_agent_sdk.runners import Runner +from trpc_agent_sdk.sessions import InMemorySessionService +from trpc_agent_sdk.types import Content +from trpc_agent_sdk.types import GenerateContentConfig +from trpc_agent_sdk.types import Part + +from .config import get_model_config + + +_PROMPTS_DIR = Path(__file__).parent / "prompts" +ROUTER_PROMPT_PATH = _PROMPTS_DIR / "router.md" +SYSTEM_PROMPT_PATH = _PROMPTS_DIR / "system.md" +SKILL_PROMPT_PATH = _PROMPTS_DIR / "skill.md" + +APP_NAME = "eval_optimize_loop" + + +def _create_agent(name: str, instruction: str) -> LlmAgent: + """构造一个 LlmAgent,instruction 由调用方现读现拼。""" + api_key, base_url, model_name = get_model_config() + return LlmAgent( + name=name, + description=f"eval_optimize_loop {name}", + model=OpenAIModel(model_name=model_name, api_key=api_key, base_url=base_url), + instruction=instruction, + generate_content_config=GenerateContentConfig( + temperature=0.2, top_p=0.9, max_output_tokens=1024, + ), + ) + + +async def _run_one(agent: LlmAgent, user_text: str) -> str: + """跑一个 agent 拿最终回答;每次独立 Runner/Session 保证评测隔离。""" + session_service = InMemorySessionService() + runner = Runner(app_name=APP_NAME, agent=agent, session_service=session_service) + session_id = str(uuid.uuid4()) + user_id = "pipeline" + await session_service.create_session( + app_name=APP_NAME, user_id=user_id, session_id=session_id, state={}, + ) + user_content = Content(role="user", 
parts=[Part.from_text(text=user_text)]) + + final_text = "" + async for event in runner.run_async( + user_id=user_id, session_id=session_id, new_message=user_content, + ): + if not event.is_final_response(): + continue + if not event.content or not event.content.parts: + continue + for part in event.content.parts: + if part.thought: + continue + if part.text: + final_text += part.text + return final_text.strip() + + +async def call_agent_real(query: str) -> str: + """框架回调(real 版):把 query 跑过整条链路,返回最终答复。""" + # 1. router:判定题型(每次重读 router.md) + router = _create_agent("router", ROUTER_PROMPT_PATH.read_text(encoding="utf-8").strip()) + await _run_one(router, f"用户问题:{query}\n\n请判断这是加法、乘法还是折扣问题。") + + # 2. solver:system.md(格式约束)+ skill.md(题型能力)拼成 instruction + solver_instruction = ( + SYSTEM_PROMPT_PATH.read_text(encoding="utf-8").strip() + + "\n\n## 解题技能\n" + + SKILL_PROMPT_PATH.read_text(encoding="utf-8").strip() + ) + solver = _create_agent("solver", solver_instruction) + return await _run_one(solver, query) diff --git a/examples/optimization/eval_optimize_loop/agent/prompts/router.md b/examples/optimization/eval_optimize_loop/agent/prompts/router.md new file mode 100644 index 0000000..4a9a410 --- /dev/null +++ b/examples/optimization/eval_optimize_loop/agent/prompts/router.md @@ -0,0 +1,9 @@ +# 路由器 Prompt + +你是算术应用题助手的路由器。判断用户问题属于哪类运算,并把问题转交给解题 agent。 + +- 认真读题,识别是加法、乘法还是折扣问题。 +- 把控制权交给下游解题 agent。 + + + diff --git a/examples/optimization/eval_optimize_loop/agent/prompts/skill.md b/examples/optimization/eval_optimize_loop/agent/prompts/skill.md new file mode 100644 index 0000000..657eec3 --- /dev/null +++ b/examples/optimization/eval_optimize_loop/agent/prompts/skill.md @@ -0,0 +1,10 @@ +# 解题技能 · Skill Prompt + +这里描述你会做哪些类型的算术题。只有列出的题型你才能解答;遇到未掌握的题型, +如实回答「抱歉,我暂时无法解答这道题。」,不要瞎猜。 + +## 已掌握题型 +- 加法:把题目中的两个数量相加。 + + + diff --git a/examples/optimization/eval_optimize_loop/agent/prompts/system.md b/examples/optimization/eval_optimize_loop/agent/prompts/system.md new file 
mode 100644 index 0000000..3439801 --- /dev/null +++ b/examples/optimization/eval_optimize_loop/agent/prompts/system.md @@ -0,0 +1,12 @@ +# 解题 Agent · System Prompt + +你是一名严谨的小学算术老师,负责把解题结果整理成最终答复。 + +## 输出格式要求 +- 最终答复必须以「答案:」开头,后面紧跟计算得到的数字。 +- 数字后必须带上正确的单位(个 / 公里 / 元 / 人 等)。 +- 示例:`答案:11 个` + + + + diff --git a/examples/optimization/eval_optimize_loop/config.json b/examples/optimization/eval_optimize_loop/config.json new file mode 100644 index 0000000..c699a18 --- /dev/null +++ b/examples/optimization/eval_optimize_loop/config.json @@ -0,0 +1,9 @@ +{ + "seed": 42, + "gate": { + "min_val_score_delta": 0.05, + "forbid_new_hard_fail": true, + "key_case_ids": ["val_add_class"], + "cost_budget_usd": 1.0 + } +} diff --git a/examples/optimization/eval_optimize_loop/data/train.evalset.json b/examples/optimization/eval_optimize_loop/data/train.evalset.json new file mode 100644 index 0000000..9dbf1c8 --- /dev/null +++ b/examples/optimization/eval_optimize_loop/data/train.evalset.json @@ -0,0 +1,58 @@ +{ + "eval_set_id": "eol_train", + "name": "评测-优化闭环 · 训练集", + "description": "3 道小学算术应用题,用于反思归因与优化。final_response 中的『答案:X 单位』既作为 contains 匹配的参考答案,也作为失败归因时的期望答案。三条覆盖:baseline 已通过(加法) / 可优化成功(乘法) / 优化无效(折扣)。", + "eval_cases": [ + { + "eval_id": "train_add_apple", + "conversation": [ + { + "invocation_id": "tr1", + "user_content": { + "parts": [{"text": "小明早上买了 4 个苹果,下午又买了 7 个苹果,他一共有多少个苹果?"}], + "role": "user" + }, + "final_response": { + "parts": [{"text": "答案:11 个"}], + "role": "model" + } + } + ], + "session_input": {"app_name": "eval_optimize_loop", "user_id": "trainer", "state": {}} + }, + { + "eval_id": "train_mul_car", + "conversation": [ + { + "invocation_id": "tr2", + "user_content": { + "parts": [{"text": "一辆汽车以每小时 60 公里的速度行驶了 2.5 小时,一共行驶了多少公里?"}], + "role": "user" + }, + "final_response": { + "parts": [{"text": "答案:150 公里"}], + "role": "model" + } + } + ], + "session_input": {"app_name": "eval_optimize_loop", "user_id": "trainer", "state": {}} + }, + { + "eval_id": 
"train_discount_shirt", + "conversation": [ + { + "invocation_id": "tr3", + "user_content": { + "parts": [{"text": "一件衣服原价 200 元,现在打 8 折出售,折后价是多少元?"}], + "role": "user" + }, + "final_response": { + "parts": [{"text": "答案:160 元"}], + "role": "model" + } + } + ], + "session_input": {"app_name": "eval_optimize_loop", "user_id": "trainer", "state": {}} + } + ] +} diff --git a/examples/optimization/eval_optimize_loop/data/val.evalset.json b/examples/optimization/eval_optimize_loop/data/val.evalset.json new file mode 100644 index 0000000..94d1d3f --- /dev/null +++ b/examples/optimization/eval_optimize_loop/data/val.evalset.json @@ -0,0 +1,58 @@ +{ + "eval_set_id": "eol_val", + "name": "评测-优化闭环 · 验证集", + "description": "3 道与训练集同分布的算术应用题,用于候选回归与防过拟合门控。三条覆盖:稳定通过(小数加法) / 泛化提升(乘法) / 优化后退化(大数加法被过拟合规则误算)。", + "eval_cases": [ + { + "eval_id": "val_add_orange", + "conversation": [ + { + "invocation_id": "va1", + "user_content": { + "parts": [{"text": "篮子里有 3 个橙子,妈妈又放进 5 个橙子,现在篮子里一共有多少个橙子?"}], + "role": "user" + }, + "final_response": { + "parts": [{"text": "答案:8 个"}], + "role": "model" + } + } + ], + "session_input": {"app_name": "eval_optimize_loop", "user_id": "validator", "state": {}} + }, + { + "eval_id": "val_mul_box", + "conversation": [ + { + "invocation_id": "va2", + "user_content": { + "parts": [{"text": "每盒装 12 个鸡蛋,一共有 5 盒,请问总共有多少个鸡蛋?"}], + "role": "user" + }, + "final_response": { + "parts": [{"text": "答案:60 个"}], + "role": "model" + } + } + ], + "session_input": {"app_name": "eval_optimize_loop", "user_id": "validator", "state": {}} + }, + { + "eval_id": "val_add_class", + "conversation": [ + { + "invocation_id": "va3", + "user_content": { + "parts": [{"text": "三年级二班有 20 名男生和 15 名女生,这个班一共有多少人?"}], + "role": "user" + }, + "final_response": { + "parts": [{"text": "答案:35 人"}], + "role": "model" + } + } + ], + "session_input": {"app_name": "eval_optimize_loop", "user_id": "validator", "state": {}} + } + ] +} diff --git 
a/examples/optimization/eval_optimize_loop/eval_metrics.json b/examples/optimization/eval_optimize_loop/eval_metrics.json new file mode 100644 index 0000000..19684db --- /dev/null +++ b/examples/optimization/eval_optimize_loop/eval_metrics.json @@ -0,0 +1,14 @@ +{ + "metrics": [ + { + "metric_name": "final_response_avg_score", + "threshold": 1.0, + "criterion": { + "final_response": { + "text": {"match": "contains", "case_insensitive": true} + } + } + } + ], + "num_runs": 1 +} diff --git a/examples/optimization/eval_optimize_loop/optimization_report.json b/examples/optimization/eval_optimize_loop/optimization_report.json new file mode 100644 index 0000000..3dc052b --- /dev/null +++ b/examples/optimization/eval_optimize_loop/optimization_report.json @@ -0,0 +1,431 @@ +{ + "schema_version": "eol-v1", + "run": { + "mode": "fake", + "seed": 42, + "started_at": "2026-07-01T20:17:51", + "finished_at": "2026-07-01T20:17:51", + "elapsed_seconds": 0.01, + "train_dataset": "data/train.evalset.json", + "val_dataset": "data/val.evalset.json", + "target_fields": [ + "router", + "system_prompt", + "skill" + ], + "gate_config": { + "min_val_score_delta": 0.05, + "forbid_new_hard_fail": true, + "key_case_ids": [ + "val_add_class" + ], + "cost_budget_usd": 1.0 + } + }, + "baseline": { + "train": { + "set_id": "train.evalset", + "pass_count": 1, + "total": 3, + "avg_score": 0.3333, + "cases": { + "train_add_apple": { + "passed": true, + "score": 1.0, + "expected": "答案:11 个", + "actual": "答案:11 个", + "error": "", + "metrics": [ + { + "name": "final_response_avg_score", + "score": 1.0, + "passed": true, + "threshold": 1.0, + "reason": "" + } + ], + "trajectory": [ + "final_response:答案:11 个" + ] + }, + "train_mul_car": { + "passed": false, + "score": 0.0, + "expected": "答案:150 公里", + "actual": "抱歉,我暂时无法解答这道题。", + "error": "", + "metrics": [ + { + "name": "final_response_avg_score", + "score": 0.0, + "passed": false, + "threshold": 1.0, + "reason": "" + } + ], + "trajectory": [ + 
"final_response:抱歉,我暂时无法解答这道题。" + ] + }, + "train_discount_shirt": { + "passed": false, + "score": 0.0, + "expected": "答案:160 元", + "actual": "抱歉,我暂时无法解答这道题。", + "error": "", + "metrics": [ + { + "name": "final_response_avg_score", + "score": 0.0, + "passed": false, + "threshold": 1.0, + "reason": "" + } + ], + "trajectory": [ + "final_response:抱歉,我暂时无法解答这道题。" + ] + } + } + }, + "val": { + "set_id": "val.evalset", + "pass_count": 2, + "total": 3, + "avg_score": 0.6667, + "cases": { + "val_mul_box": { + "passed": false, + "score": 0.0, + "expected": "答案:60 个", + "actual": "抱歉,我暂时无法解答这道题。", + "error": "", + "metrics": [ + { + "name": "final_response_avg_score", + "score": 0.0, + "passed": false, + "threshold": 1.0, + "reason": "" + } + ], + "trajectory": [ + "final_response:抱歉,我暂时无法解答这道题。" + ] + }, + "val_add_class": { + "passed": true, + "score": 1.0, + "expected": "答案:35 人", + "actual": "答案:35 人", + "error": "", + "metrics": [ + { + "name": "final_response_avg_score", + "score": 1.0, + "passed": true, + "threshold": 1.0, + "reason": "" + } + ], + "trajectory": [ + "final_response:答案:35 人" + ] + }, + "val_add_orange": { + "passed": true, + "score": 1.0, + "expected": "答案:8 个", + "actual": "答案:8 个", + "error": "", + "metrics": [ + { + "name": "final_response_avg_score", + "score": 1.0, + "passed": true, + "threshold": 1.0, + "reason": "" + } + ], + "trajectory": [ + "final_response:答案:8 个" + ] + } + } + } + }, + "failure_attribution": { + "train": { + "clusters": { + "知识召回不足": 2 + }, + "cases": { + "train_mul_car": { + "category": "knowledge_gap", + "category_label": "知识召回不足", + "reason": "agent 声明无法解答,说明缺少对应题型的解题能力(技能/知识缺口)。", + "source": "fake" + }, + "train_discount_shirt": { + "category": "knowledge_gap", + "category_label": "知识召回不足", + "reason": "agent 声明无法解答,说明缺少对应题型的解题能力(技能/知识缺口)。", + "source": "fake" + } + } + }, + "val": { + "clusters": { + "知识召回不足": 1 + }, + "cases": { + "val_mul_box": { + "category": "knowledge_gap", + "category_label": "知识召回不足", + 
"reason": "agent 声明无法解答,说明缺少对应题型的解题能力(技能/知识缺口)。", + "source": "fake" + } + } + } + }, + "candidate": { + "status": "SUCCEEDED", + "stop_reason": "fake_scripted_candidate", + "optimized_fields": [ + "skill" + ], + "rounds": 1, + "cost_usd": 0.0, + "duration_seconds": 0.0001, + "meta": { + "backend": "fake", + "note": "scripted overfitting candidate for offline demo" + }, + "prompts": { + "router": "# 路由器 Prompt\n\n你是算术应用题助手的路由器。判断用户问题属于哪类运算,并把问题转交给解题 agent。\n\n- 认真读题,识别是加法、乘法还是折扣问题。\n- 把控制权交给下游解题 agent。\n\n\n\n", + "system_prompt": "# 解题 Agent · System Prompt\n\n你是一名严谨的小学算术老师,负责把解题结果整理成最终答复。\n\n## 输出格式要求\n- 最终答复必须以「答案:」开头,后面紧跟计算得到的数字。\n- 数字后必须带上正确的单位(个 / 公里 / 元 / 人 等)。\n- 示例:`答案:11 个`\n\n\n\n\n", + "skill": "# 解题技能 · Skill Prompt(候选)\n\n这里描述你会做哪些类型的算术题。只有列出的题型你才能解答;遇到未掌握的题型,\n如实回答「抱歉,我暂时无法解答这道题。」,不要瞎猜。\n\n## 已掌握题型\n- 加法:把题目中的两个数量相加。\n- 乘法:把题目中的两个数量相乘(新增)。\n\n## 快捷启发式(反思器新增,实为过拟合)\n- 数字较大的题目倾向于按乘法处理,往往更快得到答案。\n\n\n\n\n" + }, + "rounds_detail": [ + { + "round": 1, + "optimized_fields": [ + "skill" + ], + "candidate_prompts": { + "router": "# 路由器 Prompt\n\n你是算术应用题助手的路由器。判断用户问题属于哪类运算,并把问题转交给解题 agent。\n\n- 认真读题,识别是加法、乘法还是折扣问题。\n- 把控制权交给下游解题 agent。\n\n\n\n", + "system_prompt": "# 解题 Agent · System Prompt\n\n你是一名严谨的小学算术老师,负责把解题结果整理成最终答复。\n\n## 输出格式要求\n- 最终答复必须以「答案:」开头,后面紧跟计算得到的数字。\n- 数字后必须带上正确的单位(个 / 公里 / 元 / 人 等)。\n- 示例:`答案:11 个`\n\n\n\n\n", + "skill": "# 解题技能 · Skill Prompt(候选)\n\n这里描述你会做哪些类型的算术题。只有列出的题型你才能解答;遇到未掌握的题型,\n如实回答「抱歉,我暂时无法解答这道题。」,不要瞎猜。\n\n## 已掌握题型\n- 加法:把题目中的两个数量相加。\n- 乘法:把题目中的两个数量相乘(新增)。\n\n## 快捷启发式(反思器新增,实为过拟合)\n- 数字较大的题目倾向于按乘法处理,往往更快得到答案。\n\n\n\n\n" + }, + "note": "脚本化过拟合候选:新增乘法能力 + assume-mul-default 副作用" + } + ] + }, + "candidate_train": { + "set_id": "train.evalset", + "pass_count": 2, + "total": 3, + "avg_score": 0.6667, + "cases": { + "train_discount_shirt": { + "passed": false, + "score": 0.0, + "expected": "答案:160 元", + "actual": "抱歉,我暂时无法解答这道题。", + "error": "", + "metrics": [ + { + "name": "final_response_avg_score", + "score": 0.0, + "passed": 
false, + "threshold": 1.0, + "reason": "" + } + ], + "trajectory": [ + "final_response:抱歉,我暂时无法解答这道题。" + ] + }, + "train_add_apple": { + "passed": true, + "score": 1.0, + "expected": "答案:11 个", + "actual": "答案:11 个", + "error": "", + "metrics": [ + { + "name": "final_response_avg_score", + "score": 1.0, + "passed": true, + "threshold": 1.0, + "reason": "" + } + ], + "trajectory": [ + "final_response:答案:11 个" + ] + }, + "train_mul_car": { + "passed": true, + "score": 1.0, + "expected": "答案:150 公里", + "actual": "答案:150 公里", + "error": "", + "metrics": [ + { + "name": "final_response_avg_score", + "score": 1.0, + "passed": true, + "threshold": 1.0, + "reason": "" + } + ], + "trajectory": [ + "final_response:答案:150 公里" + ] + } + } + }, + "candidate_val": { + "set_id": "val.evalset", + "pass_count": 2, + "total": 3, + "avg_score": 0.6667, + "cases": { + "val_add_class": { + "passed": false, + "score": 0.0, + "expected": "答案:35 人", + "actual": "答案:300 人", + "error": "", + "metrics": [ + { + "name": "final_response_avg_score", + "score": 0.0, + "passed": false, + "threshold": 1.0, + "reason": "" + } + ], + "trajectory": [ + "final_response:答案:300 人" + ] + }, + "val_add_orange": { + "passed": true, + "score": 1.0, + "expected": "答案:8 个", + "actual": "答案:8 个", + "error": "", + "metrics": [ + { + "name": "final_response_avg_score", + "score": 1.0, + "passed": true, + "threshold": 1.0, + "reason": "" + } + ], + "trajectory": [ + "final_response:答案:8 个" + ] + }, + "val_mul_box": { + "passed": true, + "score": 1.0, + "expected": "答案:60 个", + "actual": "答案:60 个", + "error": "", + "metrics": [ + { + "name": "final_response_avg_score", + "score": 1.0, + "passed": true, + "threshold": 1.0, + "reason": "" + } + ], + "trajectory": [ + "final_response:答案:60 个" + ] + } + } + }, + "overfitting_signal": { + "train_score_delta": 0.3333, + "val_score_delta": 0.0, + "train_up_val_down": true + }, + "delta": { + "val_score_delta": 0.0, + "train_score_delta": 0.3333, + "baseline_val_pass": 
"2/3", + "candidate_val_pass": "2/3", + "baseline_train_pass": "1/3", + "candidate_train_pass": "2/3", + "per_case": [ + { + "eval_id": "val_mul_box", + "baseline_passed": false, + "candidate_passed": true, + "baseline_score": 0.0, + "candidate_score": 1.0, + "score_delta": 1.0, + "status": "newly_passed" + }, + { + "eval_id": "val_add_class", + "baseline_passed": true, + "candidate_passed": false, + "baseline_score": 1.0, + "candidate_score": 0.0, + "score_delta": -1.0, + "status": "newly_failed" + }, + { + "eval_id": "val_add_orange", + "baseline_passed": true, + "candidate_passed": true, + "baseline_score": 1.0, + "candidate_score": 1.0, + "score_delta": 0.0, + "status": "unchanged" + } + ], + "newly_passed": [ + "val_mul_box" + ], + "newly_failed": [ + "val_add_class" + ], + "regressed": [ + "val_add_class" + ] + }, + "gate_decision": { + "accepted": false, + "summary": "拒绝候选:命中规则 ['min_val_score_delta', 'forbid_new_hard_fail', 'key_cases_no_regression']。疑似过拟合(验证集出现退化/新增失败)。", + "rules": [ + { + "name": "min_val_score_delta", + "passed": false, + "detail": "验证集平均分 delta=+0.0000,阈值 ≥ +0.0500" + }, + { + "name": "forbid_new_hard_fail", + "passed": false, + "detail": "新增失败 case=['val_add_class']" + }, + { + "name": "key_cases_no_regression", + "passed": false, + "detail": "关键 case=['val_add_class'],其中退化=['val_add_class']" + }, + { + "name": "cost_within_budget", + "passed": true, + "detail": "候选成本=$0.0000,预算 ≤ $1.0000" + } + ] + } +} \ No newline at end of file diff --git a/examples/optimization/eval_optimize_loop/optimization_report.md b/examples/optimization/eval_optimize_loop/optimization_report.md new file mode 100644 index 0000000..13ad235 --- /dev/null +++ b/examples/optimization/eval_optimize_loop/optimization_report.md @@ -0,0 +1,63 @@ +# Evaluation + Optimization 闭环报告 + +- **决策**:❌ 拒绝 (REJECT) +- **结论**:拒绝候选:命中规则 ['min_val_score_delta', 'forbid_new_hard_fail', 'key_cases_no_regression']。疑似过拟合(验证集出现退化/新增失败)。 +- 运行模式:`fake` | seed:`42` | 耗时:0.01s +- 
时间:2026-07-01T20:17:51 → 2026-07-01T20:17:51 + +## 1. 分数总览 + +| 数据集 | baseline 通过 | baseline 均分 | candidate 通过 | candidate 均分 | Δ均分 | +|---|---|---|---|---|---| +| 训练集 | 1/3 | 0.333 | 2/3 | 0.667 | +0.333 | +| 验证集 | 2/3 | 0.667 | 2/3 | 0.667 | +0.000 | + +> ⚠️ **过拟合信号**:训练集提升 +0.333,验证集却未提升(+0.000)——候选在训练分布上过度特化。 + +## 2. Baseline 失败归因 + +**训练集** 失败聚类:{'知识召回不足': 2} +- `train_mul_car` → **知识召回不足**(fake):agent 声明无法解答,说明缺少对应题型的解题能力(技能/知识缺口)。 + - 关键轨迹:final_response:抱歉,我暂时无法解答这道题。 +- `train_discount_shirt` → **知识召回不足**(fake):agent 声明无法解答,说明缺少对应题型的解题能力(技能/知识缺口)。 + - 关键轨迹:final_response:抱歉,我暂时无法解答这道题。 + +**验证集** 失败聚类:{'知识召回不足': 1} +- `val_mul_box` → **知识召回不足**(fake):agent 声明无法解答,说明缺少对应题型的解题能力(技能/知识缺口)。 + - 关键轨迹:final_response:抱歉,我暂时无法解答这道题。 + +## 3. 候选验证 · 逐 case delta + +| case | baseline | candidate | Δ分 | 状态 | +|---|---|---|---|---| +| `val_mul_box` | FAIL | PASS | +1.000 | 🟢 新增通过 | +| `val_add_class` | PASS | FAIL | -1.000 | 🔴 新增失败 | +| `val_add_orange` | PASS | PASS | +0.000 | ⚪ 不变 | + +- 新增通过:['val_mul_box'] +- 新增失败:['val_add_class'] + +## 4. 门控决策明细 + +| 规则 | 结果 | 说明 | +|---|---|---| +| `min_val_score_delta` | ❌ | 验证集平均分 delta=+0.0000,阈值 ≥ +0.0500 | +| `forbid_new_hard_fail` | ❌ | 新增失败 case=['val_add_class'] | +| `key_cases_no_regression` | ❌ | 关键 case=['val_add_class'],其中退化=['val_add_class'] | +| `cost_within_budget` | ✅ | 候选成本=$0.0000,预算 ≤ $1.0000 | + +## 5. 每轮候选审计 + +- **Round 1**:改写字段 ['skill'] + - 脚本化过拟合候选:新增乘法能力 + assume-mul-default 副作用 + +> 每轮候选 prompt 全文见 `optimization_report.json` 的 `candidate.rounds_detail`。 + +## 6. 
候选与成本审计 + +- 优化状态:`SUCCEEDED` | stop_reason:`fake_scripted_candidate` +- 被改写字段:['skill'] | 轮数:1 +- 成本:$0.000000 | 优化耗时:0.0001s +- 后端:`fake` + +> 候选 prompt 全文与逐 case 明细见 `optimization_report.json`。 diff --git a/examples/optimization/eval_optimize_loop/optimizer.json b/examples/optimization/eval_optimize_loop/optimizer.json new file mode 100644 index 0000000..1551824 --- /dev/null +++ b/examples/optimization/eval_optimize_loop/optimizer.json @@ -0,0 +1,38 @@ +{ + "evaluate": { + "metrics": [ + { + "metric_name": "final_response_avg_score", + "threshold": 1.0, + "criterion": { + "final_response": { + "text": {"match": "contains", "case_insensitive": true} + } + } + } + ], + "num_runs": 1 + }, + "optimize": { + "eval_case_parallelism": 2, + "stop": {"required_metrics": "all"}, + "algorithm": { + "name": "gepa_reflective", + "seed": 42, + "reflection_lm": { + "model_name": "${TRPC_AGENT_MODEL_NAME}", + "base_url": "${TRPC_AGENT_BASE_URL}", + "api_key": "${TRPC_AGENT_API_KEY}", + "generation_config": {"max_tokens": 4096, "temperature": 0.6} + }, + "candidate_selection_strategy": "pareto", + "module_selector": "round_robin", + "reflection_minibatch_size": 3, + "reflection_history_top_k": 3, + "skip_perfect_score": false, + "use_merge": true, + "max_metric_calls": 60, + "max_iterations_without_improvement": 6 + } + } +} diff --git a/examples/optimization/eval_optimize_loop/pipeline/__init__.py b/examples/optimization/eval_optimize_loop/pipeline/__init__.py new file mode 100644 index 0000000..9de8f4e --- /dev/null +++ b/examples/optimization/eval_optimize_loop/pipeline/__init__.py @@ -0,0 +1,6 @@ +# Tencent is pleased to support the open source community by making tRPC-Agent-Python available. +# +# Copyright (C) 2026 Tencent. All rights reserved. +# +# tRPC-Agent-Python is licensed under Apache-2.0. 
"""Failure attribution stage: assign every failed case to one of six categories.

The six categories (aligned with the originating issue) are::

    final response mismatch / tool call error / parameter error /
    LLM rubric not met / insufficient knowledge recall / format violation

Two backends
------------
- real: an ``LlmAgent`` judge reads the question, the expected answer and the
  agent's actual answer, and replies with a JSON classification.
- fake: a deterministic offline stub that classifies by rules over the
  expected text, the actual text and the run error, so attribution is stable
  and reproducible without an API key.

Both backends produce ``Attribution`` records; ``attribute_failures`` also
aggregates them into per-category counts.
"""

from __future__ import annotations

import json
import re
import uuid
from collections import Counter
from dataclasses import dataclass
from decimal import Decimal
from typing import TYPE_CHECKING, Optional

if TYPE_CHECKING:
    # Only needed for annotations, which are lazy under the __future__ import.
    from .evaluate import CaseEval


# The six failure categories (value = Chinese label shown in the report).
CATEGORIES = {
    "final_response_mismatch": "最终回复不匹配",
    "tool_call_error": "工具调用错误",
    "param_error": "参数错误",
    "llm_rubric_fail": "LLM rubric 不达标",
    "knowledge_gap": "知识召回不足",
    "format_error": "格式不符合要求",
}

_DEFAULT_CATEGORY = "final_response_mismatch"
_REFUSAL_MARK = "无法解答"
_NUM_RE = re.compile(r"-?\d+(?:\.\d+)?")


@dataclass
class Attribution:
    """Attribution verdict for a single failed case."""

    eval_id: str
    category: str  # a key of CATEGORIES
    category_label: str  # Chinese label of the category
    reason: str  # human-readable explanation (at least one per failure)
    source: str  # "fake" | "llm"


def _first_number(text: str) -> Optional[str]:
    """Return the first decimal number found in *text*, or ``None``."""
    match = _NUM_RE.search(text or "")
    return match.group(0) if match else None


def classify_fake(case: CaseEval) -> Attribution:
    """Classify a failed case deterministically from expected/actual/error.

    Rules are tried in order: run error, empty answer, refusal marker,
    numeric mismatch, then textual (format) mismatch.

    Args:
        case: The failed case; ``eval_id``, ``actual_text``, ``expected_text``
            and ``error`` are read.

    Returns:
        An ``Attribution`` with ``source="fake"``.
    """
    actual = case.actual_text or ""
    expected = case.expected_text or ""

    if case.error:
        cat, reason = "tool_call_error", f"运行期报错,链路未产出答复:{case.error[:80]}"
    elif not actual:
        cat, reason = "final_response_mismatch", "agent 未产出任何最终答复文本。"
    elif _REFUSAL_MARK in actual:
        cat, reason = "knowledge_gap", "agent 声明无法解答,说明缺少对应题型的解题能力(技能/知识缺口)。"
    else:
        exp_num, act_num = _first_number(expected), _first_number(actual)
        # Compare by value, not by spelling: "8" and "8.0" are the same number,
        # so such a case is a format problem rather than a computation error.
        if exp_num is not None and act_num is not None and Decimal(exp_num) != Decimal(act_num):
            cat, reason = "param_error", f"计算结果数值错误:期望 {exp_num},实际 {act_num}(运算或取数有误)。"
        elif expected and expected not in actual:
            # Numbers agree but the full string does not match: most likely a
            # format issue (missing answer prefix or unit).
            cat, reason = "format_error", f"数值正确但格式不符:期望包含『{expected}』,实际输出『{actual}』。"
        else:
            cat, reason = "final_response_mismatch", f"最终答复与期望不一致:期望『{expected}』,实际『{actual}』。"

    return Attribution(case.eval_id, cat, CATEGORIES[cat], reason, source="fake")


_JUDGE_INSTRUCTION = (
    "你是评测失败归因裁判。给定一道题的『题面/期望答案/agent 实际答复』,"
    "把这次失败归入且仅归入以下六类之一,并给出一句可解释原因。\n"
    "类别(用括号里的英文 key):最终回复不匹配(final_response_mismatch)、"
    "工具调用错误(tool_call_error)、参数错误(param_error)、"
    "LLM rubric 不达标(llm_rubric_fail)、知识召回不足(knowledge_gap)、"
    "格式不符合要求(format_error)。\n"
    "只输出 JSON:{\"category\": \"<英文 key>\", \"reason\": \"<一句话>\"},不要多余文字。"
)


def _parse_judge_output(out: str) -> tuple[str, str]:
    """Extract ``(category, reason)`` from the judge's raw reply.

    Scans for the first position where a JSON object can be decoded, so prose
    around the object or trailing braces do not break parsing.

    Raises:
        ValueError: If the reply contains no decodable JSON object.
    """
    decoder = json.JSONDecoder()
    pos = out.find("{")
    while pos != -1:
        try:
            parsed, _ = decoder.raw_decode(out, pos)
        except json.JSONDecodeError:
            parsed = None
        if isinstance(parsed, dict):
            cat = parsed.get("category", _DEFAULT_CATEGORY)
            if not isinstance(cat, str) or cat not in CATEGORIES:
                cat = _DEFAULT_CATEGORY
            reason = str(parsed.get("reason") or "").strip() or "(裁判未给出原因)"
            return cat, reason
        pos = out.find("{", pos + 1)
    raise ValueError("judge reply contains no JSON object")


async def classify_llm(case: CaseEval) -> Attribution:
    """Classify a failed case with an LLM judge (real mode).

    Any failure (missing SDK, model error, unparsable reply) falls back to
    ``classify_fake`` so the pipeline is never interrupted by attribution.
    """
    try:
        from trpc_agent_sdk.agents import LlmAgent
        from trpc_agent_sdk.models import OpenAIModel
        from trpc_agent_sdk.runners import Runner
        from trpc_agent_sdk.sessions import InMemorySessionService
        from trpc_agent_sdk.types import Content, GenerateContentConfig, Part

        from agent.config import get_model_config

        api_key, base_url, model_name = get_model_config()
        judge = LlmAgent(
            name="attribution_judge",
            description="failure attribution judge",
            model=OpenAIModel(model_name=model_name, api_key=api_key, base_url=base_url),
            instruction=_JUDGE_INSTRUCTION,
            generate_content_config=GenerateContentConfig(temperature=0.0, max_output_tokens=256),
        )
        session_service = InMemorySessionService()
        runner = Runner(app_name="eol_attribution", agent=judge, session_service=session_service)
        session_id, user_id = str(uuid.uuid4()), "judge"
        await session_service.create_session(
            app_name="eol_attribution", user_id=user_id, session_id=session_id, state={},
        )
        prompt = (
            f"题面:{case.query}\n期望答案:{case.expected_text}\n"
            f"agent 实际答复:{case.actual_text or '(空)'}\n运行错误:{case.error or '无'}"
        )
        content = Content(role="user", parts=[Part.from_text(text=prompt)])
        chunks: list[str] = []
        async for event in runner.run_async(user_id=user_id, session_id=session_id, new_message=content):
            if event.is_final_response() and event.content and event.content.parts:
                # Keep only visible answer text; skip parts flagged as thoughts.
                chunks.extend(
                    part.text for part in event.content.parts if part.text and not part.thought
                )
        cat, reason = _parse_judge_output("".join(chunks))
        return Attribution(case.eval_id, cat, CATEGORIES[cat], reason, source="llm")
    except Exception as exc:  # noqa: BLE001 - attribution failure must not break the loop
        fallback = classify_fake(case)
        fallback.reason = f"[LLM 裁判失败回退规则] {fallback.reason}(原因:{exc.__class__.__name__})"
        return fallback


async def attribute_failures(set_eval, mode: str) -> dict:
    """Attribute every failed case of a set and count failures per category.

    Args:
        set_eval: Object exposing ``cases`` (mapping eval_id -> case).
        mode: ``"real"`` uses the LLM judge; anything else uses the rules.

    Returns:
        ``{"attributions": {eval_id: Attribution}, "clusters": {label: count}}``.
    """
    attributions: dict[str, Attribution] = {}
    for eval_id, case in set_eval.cases.items():
        if case.passed:
            continue
        if mode == "real":
            attributions[eval_id] = await classify_llm(case)
        else:
            attributions[eval_id] = classify_fake(case)

    clusters = dict(Counter(attr.category_label for attr in attributions.values()))
    return {"attributions": attributions, "clusters": clusters}
"""Evaluation stage: score one evalset with the real AgentEvaluator.

The real and fake modes share this exact evaluation code; they differ only in
the ``call_agent`` callable passed in. Evaluator, metrics and pass/fail rules
are therefore identical, which keeps fake and real scores comparable.
"""

from __future__ import annotations

import json
from contextlib import suppress
from dataclasses import dataclass, field
from pathlib import Path
from typing import Awaitable, Callable

from trpc_agent_sdk.evaluation import AgentEvaluator
from trpc_agent_sdk.evaluation import EvalStatus
from trpc_agent_sdk.evaluation import get_all_tool_calls


CallAgent = Callable[[str], Awaitable[str]]


@dataclass
class MetricScore:
    """Result of a single metric on a single case."""

    name: str
    score: float
    passed: bool
    threshold: float
    reason: str = ""


@dataclass
class CaseEval:
    """Evaluation result of a single case, aggregated across metrics."""

    eval_id: str
    passed: bool
    score: float  # score of the primary (first) metric
    metrics: list[MetricScore] = field(default_factory=list)
    query: str = ""
    expected_text: str = ""
    actual_text: str = ""
    error: str = ""
    trajectory: list[str] = field(default_factory=list)  # tool calls + final response


@dataclass
class SetEval:
    """Evaluation result of a whole evalset."""

    set_id: str
    cases: dict[str, CaseEval]

    @property
    def pass_count(self) -> int:
        """Number of cases that passed."""
        return sum(1 for c in self.cases.values() if c.passed)

    @property
    def total(self) -> int:
        """Number of cases in the set."""
        return len(self.cases)

    @property
    def avg_score(self) -> float:
        """Mean case score; ``0.0`` for an empty set."""
        if not self.cases:
            return 0.0
        return sum(c.score for c in self.cases.values()) / len(self.cases)


def _content_text(content) -> str:
    """Concatenate the text of all parts of *content*; ``""`` when absent."""
    if content is None or not getattr(content, "parts", None):
        return ""
    return "".join(p.text for p in content.parts if getattr(p, "text", None)).strip()


def _parts_text(container) -> str:
    """Join the ``text`` of every part in an evalset content dict (may be null)."""
    parts = (container or {}).get("parts") or []
    return "".join(p.get("text") or "" for p in parts).strip()


def _load_expected(dataset_path: Path) -> dict[str, tuple[str, str]]:
    """Read ``(query, expected_text)`` of each case from the evalset file.

    Only the first conversation turn is used. Cases without a conversation or
    without an ``eval_id`` are skipped; JSON ``null`` contents yield ``""``.
    """
    data = json.loads(Path(dataset_path).read_text(encoding="utf-8"))
    out: dict[str, tuple[str, str]] = {}
    for case in data.get("eval_cases") or []:
        conv = case.get("conversation") or []
        eval_id = case.get("eval_id")
        if not conv or eval_id is None:
            continue
        first = conv[0]
        out[eval_id] = (
            _parts_text(first.get("user_content")),
            _parts_text(first.get("final_response")),
        )
    return out


def _threshold_of(metric_result) -> float:
    """Return the metric threshold, defaulting to ``1.0`` only when it is absent.

    A configured threshold of ``0.0`` is a valid value and is kept as is.
    """
    raw = getattr(metric_result, "threshold", None)
    return 1.0 if raw is None else float(raw)


def _case_from_run(eval_id: str, run, query: str, expected: str) -> CaseEval:
    """Convert one evaluator case result into a structured ``CaseEval``."""
    metric_scores = [
        MetricScore(
            name=m.metric_name,
            score=float(m.score) if m.score is not None else 0.0,
            passed=m.eval_status == EvalStatus.PASSED,
            threshold=_threshold_of(m),
            reason=(m.details.reason if m.details and m.details.reason else ""),
        )
        for m in run.overall_eval_metric_results
    ]

    # Actual output text and key trajectory come from the first invocation.
    actual_text = ""
    trajectory: list[str] = []
    if run.eval_metric_result_per_invocation:
        actual_inv = run.eval_metric_result_per_invocation[0].actual_invocation
        actual_text = _content_text(actual_inv.final_response)
        # Record each tool call (name + args) first, then the final response.
        for call in get_all_tool_calls(actual_inv.intermediate_data):
            args = getattr(call, "args", None) or {}
            trajectory.append(f"tool_call:{getattr(call, 'name', '?')}({args})")
        if actual_text:
            trajectory.append(f"final_response:{actual_text}")

    return CaseEval(
        eval_id=eval_id,
        passed=run.final_eval_status == EvalStatus.PASSED,
        score=metric_scores[0].score if metric_scores else 0.0,
        metrics=metric_scores,
        query=query,
        expected_text=expected,
        actual_text=actual_text,
        error=run.error_message or "",
        trajectory=trajectory,
    )


async def evaluate_set(
    dataset_path: Path,
    call_agent: CallAgent,
    metrics_path: Path,
    output_dir: Path,
) -> SetEval:
    """Run the real AgentEvaluator on one evalset and return a ``SetEval``.

    Args:
        dataset_path: Path of the evalset JSON file.
        call_agent: Async callable mapping a query to the agent's answer.
        metrics_path: Metrics config file or directory.
        output_dir: Directory where the evaluator writes its raw results.

    Returns:
        One ``CaseEval`` per case of the evalset; cases the evaluator did not
        report are recorded as failed with score ``0.0``.
    """
    executer = AgentEvaluator.get_executer(
        str(dataset_path),
        call_agent=call_agent,
        num_runs=1,
        print_detailed_results=False,
        eval_metrics_file_path_or_dir=str(metrics_path),
        eval_result_output_dir=str(output_dir),
    )
    # The evaluator raises AssertionError when any case misses its threshold,
    # after the full result is already stored on the executer. Failures are the
    # regression signal this pipeline wants, so the assertion is suppressed and
    # the result is read as usual.
    with suppress(AssertionError):
        await executer.evaluate()
    result = executer.get_result()

    expected_map = _load_expected(dataset_path)
    cases: dict[str, CaseEval] = {}

    for agg in (result.results_by_eval_set_id if result else {}).values():
        for eval_id, case_runs in agg.eval_results_by_eval_id.items():
            if not case_runs:
                continue  # no run recorded; the fallback below marks it failed
            query, expected = expected_map.get(eval_id, ("", ""))
            cases[eval_id] = _case_from_run(eval_id, case_runs[0], query, expected)  # num_runs=1

    # Fallback: a case absent from the results still gets a failed record.
    for eval_id, (query, expected) in expected_map.items():
        cases.setdefault(
            eval_id,
            CaseEval(eval_id=eval_id, passed=False, score=0.0, query=query, expected_text=expected),
        )

    return SetEval(set_id=Path(dataset_path).stem, cases=cases)
"""Gate stage: per-case delta plus a configurable acceptance policy.

The baseline validation result is compared case by case with the candidate's,
classifying each case as newly passed / newly failed / improved / regressed /
unchanged. The gate rules from the config then decide accept or reject, with a
pass flag and explanation for each rule.

Overfitting protection relies on ``forbid_new_hard_fail`` and ``key_case_ids``:
any case that passed before and fails after optimization rejects the candidate,
even when the training score rises and the validation average is flat.
"""

from __future__ import annotations

from dataclasses import dataclass, field
from typing import TYPE_CHECKING

if TYPE_CHECKING:
    # Only needed for annotations, which are lazy under the __future__ import.
    from .evaluate import SetEval


_REGRESSION_STATUSES = ("newly_failed", "regressed")


@dataclass
class CaseDelta:
    """Baseline-versus-candidate comparison for one case."""

    eval_id: str
    baseline_passed: bool
    candidate_passed: bool
    baseline_score: float
    candidate_score: float
    status: str  # newly_passed | newly_failed | improved | regressed | unchanged

    @property
    def score_delta(self) -> float:
        """Candidate score minus baseline score, rounded to 4 decimals."""
        return round(self.candidate_score - self.baseline_score, 4)


def _status_of(bp: bool, cp: bool, bs: float, cs: float) -> str:
    """Classify one case from its pass flags and scores."""
    if not bp and cp:
        return "newly_passed"
    if bp and not cp:
        return "newly_failed"
    if cs > bs:
        return "improved"
    if cs < bs:
        return "regressed"
    return "unchanged"


def compute_delta(baseline: SetEval, candidate: SetEval) -> list[CaseDelta]:
    """Compare baseline and candidate results case by case.

    Every baseline case yields a delta. A case the candidate result does not
    contain is treated as failed with score ``0.0`` (fail closed), so a case
    that got lost cannot slip past the regression rules.
    """
    deltas: list[CaseDelta] = []
    for eval_id, base_case in baseline.cases.items():
        cand_case = candidate.cases.get(eval_id)
        bp, bs = base_case.passed, base_case.score
        if cand_case is None:
            cp, cs = False, 0.0
        else:
            cp, cs = cand_case.passed, cand_case.score
        deltas.append(CaseDelta(eval_id, bp, cp, bs, cs, _status_of(bp, cp, bs, cs)))
    return deltas


@dataclass
class RuleResult:
    """Outcome of one gate rule."""

    name: str
    passed: bool
    detail: str


@dataclass
class GateDecision:
    """Overall gate verdict with the per-rule breakdown."""

    accepted: bool
    rules: list[RuleResult] = field(default_factory=list)
    summary: str = ""
    val_score_delta: float = 0.0
    newly_passed: list[str] = field(default_factory=list)
    newly_failed: list[str] = field(default_factory=list)
    regressed: list[str] = field(default_factory=list)


def evaluate_gate(
    baseline_val: SetEval,
    candidate_val: SetEval,
    deltas: list[CaseDelta],
    candidate_cost: float,
    gate_config: dict,
) -> GateDecision:
    """Decide accept/reject from the gate config; any failed rule rejects.

    Args:
        baseline_val: Baseline validation result (``avg_score`` is read).
        candidate_val: Candidate validation result (``avg_score`` is read).
        deltas: Per-case deltas, normally from ``compute_delta``.
        candidate_cost: Cost of producing the candidate, in USD.
        gate_config: Optional keys ``min_val_score_delta`` (default ``0.0``),
            ``forbid_new_hard_fail`` (default ``True``), ``key_case_ids``
            (default empty) and ``cost_budget_usd`` (default unlimited).

    Returns:
        The decision, including each rule's result and the regression lists.
    """
    min_delta = float(gate_config.get("min_val_score_delta", 0.0))
    forbid_new_hard_fail = bool(gate_config.get("forbid_new_hard_fail", True))
    key_case_ids = set(gate_config.get("key_case_ids", []))
    cost_budget = float(gate_config.get("cost_budget_usd", float("inf")))

    val_delta = round(candidate_val.avg_score - baseline_val.avg_score, 4)
    newly_failed = [d.eval_id for d in deltas if d.status == "newly_failed"]
    newly_passed = [d.eval_id for d in deltas if d.status == "newly_passed"]
    regressed = [d.eval_id for d in deltas if d.status in _REGRESSION_STATUSES]

    # A key case that was never evaluated cannot be shown to be regression-free,
    # so it fails the key-case rule instead of passing silently (e.g. a typo in
    # the configured id).
    evaluated_ids = {d.eval_id for d in deltas}
    key_missing = sorted(key_case_ids - evaluated_ids)
    key_regressed = [
        d.eval_id for d in deltas
        if d.eval_id in key_case_ids and d.status in _REGRESSION_STATUSES
    ]
    key_detail = f"关键 case={sorted(key_case_ids) or '未指定'},其中退化={key_regressed or '无'}"
    if key_missing:
        key_detail += f",未评测={key_missing}"

    rules = [
        # R1: validation average must improve by at least the threshold.
        RuleResult(
            "min_val_score_delta",
            val_delta >= min_delta,
            f"验证集平均分 delta={val_delta:+.4f},阈值 ≥ {min_delta:+.4f}",
        ),
        # R2: no new hard failure (passed before, fails now) - main overfitting guard.
        RuleResult(
            "forbid_new_hard_fail",
            (not forbid_new_hard_fail) or not newly_failed,
            f"新增失败 case={newly_failed or '无'}",
        ),
        # R3: key cases must not regress.
        RuleResult(
            "key_cases_no_regression",
            not key_regressed and not key_missing,
            key_detail,
        ),
        # R4: cost budget.
        RuleResult(
            "cost_within_budget",
            candidate_cost <= cost_budget,
            f"候选成本=${candidate_cost:.4f},预算 ≤ ${cost_budget:.4f}",
        ),
    ]

    failed_rules = [r.name for r in rules if not r.passed]
    accepted = not failed_rules
    if accepted:
        summary = f"接受候选:验证集提升 {val_delta:+.4f} 且未触发任何拒绝规则。"
    else:
        summary = (
            f"拒绝候选:命中规则 {failed_rules}。"
            + ("疑似过拟合(验证集出现退化/新增失败)。" if newly_failed else "")
        )

    return GateDecision(
        accepted=accepted,
        rules=rules,
        summary=summary,
        val_score_delta=val_delta,
        newly_passed=newly_passed,
        newly_failed=newly_failed,
        regressed=regressed,
    )
"""Optimization stage: produce one candidate prompt set (three fields).

Two backends
------------
- real: calls ``AgentOptimizer.optimize`` with ``update_source=False`` and
  takes the best candidate from ``OptimizeResult.best_prompts``; cost and
  duration come from the real run.
- fake: a scripted candidate, deliberately built to improve on the training
  set while regressing on validation, so the gate's rejection of overfitting
  can be demonstrated offline and deterministically.

Both return a :class:`CandidateResult`. Applying and restoring a candidate is
done by :func:`apply_candidate` / :func:`restore_prompts`: validation writes
the candidate into the source prompt files temporarily and restores them
afterwards; both backends share this mechanism.
"""

from __future__ import annotations

import time
from dataclasses import dataclass, field
from pathlib import Path
from typing import Awaitable, Callable

from agent import PROMPT_PATHS


CallAgent = Callable[[str], Awaitable[str]]


@dataclass
class CandidateResult:
    """A candidate prompt set plus the audit data of how it was produced."""

    prompts: dict[str, str]  # field name -> full candidate prompt text
    status: str = "SUCCEEDED"
    stop_reason: str = ""
    cost_usd: float = 0.0
    duration_seconds: float = 0.0
    rounds: int = 0
    optimized_fields: list[str] = field(default_factory=list)
    rounds_detail: list[dict] = field(default_factory=list)  # per-round audit records
    meta: dict = field(default_factory=dict)


def _write_prompts(texts: dict[str, str]) -> None:
    """Write each known field of *texts* to its prompt file; ignore unknown names."""
    for name, text in texts.items():
        if name in PROMPT_PATHS:
            Path(PROMPT_PATHS[name]).write_text(text, encoding="utf-8")


def read_baseline_prompts() -> dict[str, str]:
    """Read the current source prompt files, keyed by TargetPrompt field name."""
    return {name: Path(path).read_text(encoding="utf-8") for name, path in PROMPT_PATHS.items()}


def apply_candidate(candidate: dict[str, str]) -> dict[str, str]:
    """Write the candidate into the source prompt files.

    Returns:
        A snapshot of the files as they were before, for ``restore_prompts``.

    Raises:
        Whatever the write raised; in that case the files are first rolled
        back to the snapshot, because the caller never receives it and could
        not undo a partially applied candidate.
    """
    snapshot = read_baseline_prompts()
    try:
        _write_prompts(candidate)
    except BaseException:
        _write_prompts(snapshot)
        raise
    return snapshot


def restore_prompts(snapshot: dict[str, str]) -> None:
    """Restore the source prompt files to the given snapshot."""
    _write_prompts(snapshot)


# --------------------------------------------------------------------------
# fake backend: scripted overfitting candidate
# --------------------------------------------------------------------------
_SKILL_CANDIDATE = """# 解题技能 · Skill Prompt(候选)

这里描述你会做哪些类型的算术题。只有列出的题型你才能解答;遇到未掌握的题型,
如实回答「抱歉,我暂时无法解答这道题。」,不要瞎猜。

## 已掌握题型
- 加法:把题目中的两个数量相加。
- 乘法:把题目中的两个数量相乘(新增)。

## 快捷启发式(反思器新增,实为过拟合)
- 数字较大的题目倾向于按乘法处理,往往更快得到答案。




"""


async def optimize_fake(train_path: Path, val_path: Path) -> CandidateResult:
    """Return the scripted candidate: only the ``skill`` field is rewritten.

    ``train_path`` and ``val_path`` are accepted for signature parity with the
    real backend and are not read.
    """
    start = time.perf_counter()
    candidate = read_baseline_prompts()
    candidate["skill"] = _SKILL_CANDIDATE  # router/system_prompt stay at baseline
    rounds_detail = [{
        "round": 1,
        "optimized_fields": ["skill"],
        # Independent copy: the audit record must not change if the caller
        # later mutates ``CandidateResult.prompts``.
        "candidate_prompts": dict(candidate),
        "note": "脚本化过拟合候选:新增乘法能力 + assume-mul-default 副作用",
    }]
    return CandidateResult(
        prompts=candidate,
        status="SUCCEEDED",
        stop_reason="fake_scripted_candidate",
        cost_usd=0.0,
        duration_seconds=round(time.perf_counter() - start, 4),
        rounds=1,
        optimized_fields=["skill"],
        rounds_detail=rounds_detail,
        meta={"backend": "fake", "note": "scripted overfitting candidate for offline demo"},
    )


# --------------------------------------------------------------------------
# real backend: AgentOptimizer (GEPA)
# --------------------------------------------------------------------------
def _round_to_dict(r) -> dict:
    """Flatten one optimizer round into a JSON-friendly audit record."""
    return {
        "round": r.round,
        "optimized_fields": list(r.optimized_field_names),
        "candidate_prompts": dict(r.candidate_prompts),
        "validation_pass_rate": r.validation_pass_rate,
        "metric_breakdown": dict(r.metric_breakdown),
        "accepted": r.accepted,
        "acceptance_reason": r.acceptance_reason,
        "cost_usd": r.round_llm_cost,
        "duration_seconds": r.duration_seconds,
    }


async def optimize_real(
    config_path: Path,
    call_agent: CallAgent,
    train_path: Path,
    val_path: Path,
    output_dir: Path,
) -> CandidateResult:
    """Run the real AgentOptimizer and wrap its result.

    ``update_source=False`` is passed so the candidate is not persisted to the
    source files here; the pipeline gate decides whether to accept it.
    """
    from trpc_agent_sdk.evaluation import AgentOptimizer, TargetPrompt

    target = TargetPrompt()
    for name, path in PROMPT_PATHS.items():
        target.add_path(name, str(path))

    result = await AgentOptimizer.optimize(
        config_path=str(config_path),
        call_agent=call_agent,
        target_prompt=target,
        train_dataset_path=str(train_path),
        validation_dataset_path=str(val_path),
        output_dir=str(output_dir),
        update_source=False,
        verbose=1,
    )
    return CandidateResult(
        prompts=dict(result.best_prompts),
        status=str(result.status),
        stop_reason=str(result.stop_reason or ""),
        cost_usd=float(result.total_llm_cost),
        duration_seconds=float(result.duration_seconds),
        rounds=len(result.rounds),
        optimized_fields=sorted(
            {f for r in result.rounds for f in r.optimized_field_names}
        ),
        rounds_detail=[_round_to_dict(r) for r in result.rounds],
        meta={
            "backend": "real",
            "algorithm": result.algorithm,
            "baseline_pass_rate": result.baseline_pass_rate,
            "best_pass_rate": result.best_pass_rate,
        },
    )
"""Audit stage: persist the whole loop as structured JSON and readable Markdown.

Artifacts
---------
- ``optimization_report.json``: baseline / candidate / per-case delta / gate
  decision / failure attribution / cost / duration / reproducibility config.
- ``optimization_report.md``: a human-readable account of whether the
  candidate is worth accepting.
"""

from __future__ import annotations

import json
from pathlib import Path
from typing import TYPE_CHECKING

if TYPE_CHECKING:
    # Only needed for annotations, which are lazy under the __future__ import.
    from .evaluate import SetEval
    from .gate import CaseDelta, GateDecision
    from .optimize import CandidateResult


def _set_to_dict(s: SetEval) -> dict:
    """Serialize a SetEval, including per-case metric details and trajectory."""
    return {
        "set_id": s.set_id,
        "pass_count": s.pass_count,
        "total": s.total,
        "avg_score": round(s.avg_score, 4),
        "cases": {
            eid: {
                "passed": c.passed,
                "score": round(c.score, 4),
                "expected": c.expected_text,
                "actual": c.actual_text,
                "error": c.error,
                # Per-metric detail: score / threshold / pass-fail / reason.
                "metrics": [
                    {
                        "name": m.name,
                        "score": round(m.score, 4),
                        "passed": m.passed,
                        "threshold": m.threshold,
                        "reason": m.reason,
                    }
                    for m in c.metrics
                ],
                "trajectory": c.trajectory,
            }
            for eid, c in s.cases.items()
        },
    }


def _attrib_to_dict(attrib: dict) -> dict:
    """Serialize the ``{"attributions", "clusters"}`` attribution result."""
    return {
        "clusters": attrib["clusters"],
        "cases": {
            eid: {
                "category": a.category,
                "category_label": a.category_label,
                "reason": a.reason,
                "source": a.source,
            }
            for eid, a in attrib["attributions"].items()
        },
    }


def _deltas_to_list(deltas: list[CaseDelta]) -> list[dict]:
    """Serialize per-case deltas."""
    return [
        {
            "eval_id": d.eval_id,
            "baseline_passed": d.baseline_passed,
            "candidate_passed": d.candidate_passed,
            "baseline_score": round(d.baseline_score, 4),
            "candidate_score": round(d.candidate_score, 4),
            "score_delta": d.score_delta,
            "status": d.status,
        }
        for d in deltas
    ]


def build_report(
    *,
    run_meta: dict,
    baseline_train: SetEval,
    baseline_val: SetEval,
    train_attrib: dict,
    val_attrib: dict,
    candidate: CandidateResult,
    candidate_train: SetEval,
    candidate_val: SetEval,
    deltas: list[CaseDelta],
    train_deltas: list[CaseDelta],
    gate: GateDecision,
) -> dict:
    """Assemble the report dictionary, ready for ``json.dump``.

    ``deltas`` are the validation-set deltas (``delta.per_case``);
    ``train_deltas`` are the training-set deltas (``delta.train_per_case``).
    """
    train_score_delta = round(candidate_train.avg_score - baseline_train.avg_score, 4)
    return {
        "schema_version": "eol-v1",
        "run": run_meta,
        "baseline": {
            "train": _set_to_dict(baseline_train),
            "val": _set_to_dict(baseline_val),
        },
        "failure_attribution": {
            "train": _attrib_to_dict(train_attrib),
            "val": _attrib_to_dict(val_attrib),
        },
        "candidate": {
            "status": candidate.status,
            "stop_reason": candidate.stop_reason,
            "optimized_fields": candidate.optimized_fields,
            "rounds": candidate.rounds,
            "cost_usd": round(candidate.cost_usd, 6),
            "duration_seconds": round(candidate.duration_seconds, 4),
            "meta": candidate.meta,
            "prompts": candidate.prompts,
            "rounds_detail": candidate.rounds_detail,  # per-round candidate audit
        },
        "candidate_train": _set_to_dict(candidate_train),
        "candidate_val": _set_to_dict(candidate_val),
        "overfitting_signal": {
            "train_score_delta": train_score_delta,
            "val_score_delta": gate.val_score_delta,
            # True when training improved while validation did not improve.
            "train_up_val_down": (
                candidate_train.avg_score > baseline_train.avg_score
                and gate.val_score_delta <= 0
            ),
        },
        "delta": {
            "val_score_delta": gate.val_score_delta,
            "train_score_delta": train_score_delta,
            "baseline_val_pass": f"{baseline_val.pass_count}/{baseline_val.total}",
            "candidate_val_pass": f"{candidate_val.pass_count}/{candidate_val.total}",
            "baseline_train_pass": f"{baseline_train.pass_count}/{baseline_train.total}",
            "candidate_train_pass": f"{candidate_train.pass_count}/{candidate_train.total}",
            "per_case": _deltas_to_list(deltas),
            "train_per_case": _deltas_to_list(train_deltas),
            "newly_passed": gate.newly_passed,
            "newly_failed": gate.newly_failed,
            "regressed": gate.regressed,
        },
        "gate_decision": {
            "accepted": gate.accepted,
            "summary": gate.summary,
            "rules": [
                {"name": r.name, "passed": r.passed, "detail": r.detail}
                for r in gate.rules
            ],
        },
    }


def save_json(report: dict, path: Path) -> None:
    """Write the report as UTF-8, indented, non-ASCII-preserving JSON."""
    Path(path).write_text(json.dumps(report, ensure_ascii=False, indent=2), encoding="utf-8")


_STATUS_LABELS = {
    "newly_passed": "🟢 新增通过",
    "newly_failed": "🔴 新增失败",
    "improved": "🔼 分数提升",
    "regressed": "🔻 分数下降",
    "unchanged": "⚪ 不变",
}


def _status_emoji(status: str) -> str:
    """Map a delta status to its display label; unknown statuses pass through."""
    return _STATUS_LABELS.get(status, status)


def _md_cell(value) -> str:
    """Make *value* safe inside a Markdown table cell.

    A raw ``|`` would start a new column and a newline would end the row.
    """
    return str(value).replace("|", "\\|").replace("\n", " ")


def render_markdown(report: dict) -> str:
    """Render the report dictionary built by ``build_report`` as Markdown."""
    r = report
    run = r["run"]
    gate = r["gate_decision"]
    delta = r["delta"]
    decision = "✅ 接受 (ACCEPT)" if gate["accepted"] else "❌ 拒绝 (REJECT)"

    lines: list[str] = []
    lines.append("# Evaluation + Optimization 闭环报告")
    lines.append("")
    lines.append(f"- **决策**:{decision}")
    lines.append(f"- **结论**:{gate['summary']}")
    lines.append(f"- 运行模式:`{run['mode']}` | seed:`{run['seed']}` | 耗时:{run['elapsed_seconds']}s")
    lines.append(f"- 时间:{run['started_at']} → {run['finished_at']}")
    lines.append("")

    # Score overview.
    bt, bv = r["baseline"]["train"], r["baseline"]["val"]
    ct, cv = r["candidate_train"], r["candidate_val"]
    ovf = r["overfitting_signal"]
    lines.append("## 1. 分数总览")
    lines.append("")
    lines.append("| 数据集 | baseline 通过 | baseline 均分 | candidate 通过 | candidate 均分 | Δ均分 |")
    lines.append("|---|---|---|---|---|---|")
    lines.append(
        f"| 训练集 | {bt['pass_count']}/{bt['total']} | {bt['avg_score']:.3f} "
        f"| {ct['pass_count']}/{ct['total']} | {ct['avg_score']:.3f} | {ovf['train_score_delta']:+.3f} |"
    )
    lines.append(
        f"| 验证集 | {bv['pass_count']}/{bv['total']} | {bv['avg_score']:.3f} "
        f"| {cv['pass_count']}/{cv['total']} | {cv['avg_score']:.3f} | {ovf['val_score_delta']:+.3f} |"
    )
    lines.append("")
    if ovf["train_up_val_down"]:
        lines.append(
            f"> ⚠️ **过拟合信号**:训练集提升 {ovf['train_score_delta']:+.3f},"
            f"验证集却未提升({ovf['val_score_delta']:+.3f})——候选在训练分布上过度特化。"
        )
        lines.append("")

    # Baseline failure attribution.
    lines.append("## 2. Baseline 失败归因")
    lines.append("")
    for split in ("train", "val"):
        fa = r["failure_attribution"][split]
        base_cases = r["baseline"][split]["cases"]
        lines.append(f"**{'训练集' if split == 'train' else '验证集'}** 失败聚类:{fa['clusters'] or '无失败'}")
        for eid, a in fa["cases"].items():
            lines.append(f"- `{eid}` → **{a['category_label']}**({a['source']}):{a['reason']}")
            traj = base_cases.get(eid, {}).get("trajectory", [])
            if traj:
                lines.append(f"  - 关键轨迹:{' → '.join(traj)}")
        lines.append("")

    # Per-case delta on the validation set.
    lines.append("## 3. 候选验证 · 逐 case delta")
    lines.append("")
    lines.append("| case | baseline | candidate | Δ分 | 状态 |")
    lines.append("|---|---|---|---|---|")
    for d in delta["per_case"]:
        lines.append(
            f"| `{_md_cell(d['eval_id'])}` | {'PASS' if d['baseline_passed'] else 'FAIL'} "
            f"| {'PASS' if d['candidate_passed'] else 'FAIL'} "
            f"| {d['score_delta']:+.3f} | {_status_emoji(d['status'])} |"
        )
    lines.append("")
    lines.append(f"- 新增通过:{delta['newly_passed'] or '无'}")
    lines.append(f"- 新增失败:{delta['newly_failed'] or '无'}")
    lines.append("")

    # Gate rule breakdown.
    lines.append("## 4. 门控决策明细")
    lines.append("")
    lines.append("| 规则 | 结果 | 说明 |")
    lines.append("|---|---|---|")
    for rule in gate["rules"]:
        lines.append(
            f"| `{_md_cell(rule['name'])}` | {'✅' if rule['passed'] else '❌'} "
            f"| {_md_cell(rule['detail'])} |"
        )
    lines.append("")

    # Per-round candidate audit.
    cand = r["candidate"]
    lines.append("## 5. 每轮候选审计")
    lines.append("")
    if cand["rounds_detail"]:
        for rd in cand["rounds_detail"]:
            fields = rd.get("optimized_fields", [])
            extra = ""
            if "validation_pass_rate" in rd:
                extra = f" | val_pass={rd['validation_pass_rate']} | accepted={rd.get('accepted')}"
            lines.append(f"- **Round {rd['round']}**:改写字段 {fields}{extra}")
            note = rd.get("note") or rd.get("acceptance_reason")
            if note:
                lines.append(f"  - {note}")
    else:
        lines.append("- (优化器未产出任何轮次;候选=baseline)")
    lines.append("")
    lines.append("> 每轮候选 prompt 全文见 `optimization_report.json` 的 `candidate.rounds_detail`。")
    lines.append("")

    # Candidate and cost audit.
    lines.append("## 6. 候选与成本审计")
    lines.append("")
    lines.append(f"- 优化状态:`{cand['status']}` | stop_reason:`{cand['stop_reason']}`")
    lines.append(f"- 被改写字段:{cand['optimized_fields']} | 轮数:{cand['rounds']}")
    lines.append(f"- 成本:${cand['cost_usd']:.6f} | 优化耗时:{cand['duration_seconds']}s")
    lines.append(f"- 后端:`{cand['meta'].get('backend', 'unknown')}`")
    lines.append("")
    lines.append("> 候选 prompt 全文与逐 case 明细见 `optimization_report.json`。")
    lines.append("")
    return "\n".join(lines)


def save_markdown(report: dict, path: Path) -> None:
    """Render the report as Markdown and write it as UTF-8."""
    Path(path).write_text(render_markdown(report), encoding="utf-8")
+"""Evaluation + Optimization 自动闭环入口。 + +一条命令跑完六阶段: + Baseline 评测 → 失败归因 → 优化执行 → 候选验证(逐 case delta) + → 接受门控 → 审计落盘(optimization_report.json + .md) + +用法 +---- + # 离线 fake 模式(无需 API Key,≤3min,确定性可复现)——默认 + python run_pipeline.py + + # 真实模式(需 TRPC_AGENT_API_KEY / _BASE_URL / _MODEL_NAME) + python run_pipeline.py --mode real + +产物写到 ``runs//``,并在其中生成 optimization_report.{json,md}。 +""" + +from __future__ import annotations + +import argparse +import asyncio +import json +import random +import sys +from datetime import datetime +from functools import partial +from pathlib import Path + + +_HERE = Path(__file__).resolve().parent +_REPO_ROOT = _HERE.parents[2] +for _p in (str(_REPO_ROOT), str(_HERE)): + if _p not in sys.path: + sys.path.insert(0, _p) + +from agent import PROMPT_PATHS # noqa: E402 +from pipeline import attribution as attrib_mod # noqa: E402 +from pipeline import gate as gate_mod # noqa: E402 +from pipeline import optimize as opt_mod # noqa: E402 +from pipeline import report as report_mod # noqa: E402 +from pipeline.evaluate import evaluate_set # noqa: E402 + + +TRAIN_PATH = _HERE / "data" / "train.evalset.json" +VAL_PATH = _HERE / "data" / "val.evalset.json" +METRICS_PATH = _HERE / "eval_metrics.json" +CONFIG_PATH = _HERE / "config.json" +OPTIMIZER_PATH = _HERE / "optimizer.json" +RUNS_DIR = _HERE / "runs" + + +def _build_call_agent(mode: str): + """按模式返回 call_agent 回调。""" + if mode == "fake": + from agent import fake_backend # 延迟导入:real 模式不必加载 + return partial(fake_backend.call_agent_fake, prompt_paths=PROMPT_PATHS) + from agent.orchestrator import call_agent_real + return call_agent_real + + +async def run(mode: str, output_dir: Path) -> dict: + config = json.loads(CONFIG_PATH.read_text(encoding="utf-8")) + seed = int(config.get("seed", 42)) + random.seed(seed) + + call_agent = _build_call_agent(mode) + started_at = datetime.now() + output_dir.mkdir(parents=True, exist_ok=True) + + # ---- 阶段 1:Baseline 评测(train + val,读源 prompt=baseline)---- + 
print(f"[1/6] Baseline 评测 (mode={mode}) ...") + baseline_train = await evaluate_set(TRAIN_PATH, call_agent, METRICS_PATH, output_dir / "baseline_train") + baseline_val = await evaluate_set(VAL_PATH, call_agent, METRICS_PATH, output_dir / "baseline_val") + print(f" train {baseline_train.pass_count}/{baseline_train.total}" + f" | val {baseline_val.pass_count}/{baseline_val.total}") + + # ---- 阶段 2:失败归因 ---- + print("[2/6] 失败归因 ...") + train_attrib = await attrib_mod.attribute_failures(baseline_train, mode) + val_attrib = await attrib_mod.attribute_failures(baseline_val, mode) + print(f" train clusters={train_attrib['clusters']} | val clusters={val_attrib['clusters']}") + + # ---- 阶段 3:优化执行(结束后源 prompt 已还原到 baseline)---- + print("[3/6] 优化执行 ...") + if mode == "fake": + candidate = await opt_mod.optimize_fake(TRAIN_PATH, VAL_PATH) + else: + candidate = await opt_mod.optimize_real( + OPTIMIZER_PATH, call_agent, TRAIN_PATH, VAL_PATH, output_dir / "optimize" + ) + print(f" status={candidate.status} fields={candidate.optimized_fields}" + f" cost=${candidate.cost_usd:.4f}") + + # ---- 阶段 4:候选验证(临时把候选写入源 prompt,评测后还原)---- + print("[4/6] 候选验证 ...") + snapshot = opt_mod.apply_candidate(candidate.prompts) + try: + # 同时在训练集与验证集上重跑候选:训练集用于识别"训练提升但验证退化" + # 的过拟合,验证集用于门控。 + candidate_train = await evaluate_set(TRAIN_PATH, call_agent, METRICS_PATH, output_dir / "candidate_train") + candidate_val = await evaluate_set(VAL_PATH, call_agent, METRICS_PATH, output_dir / "candidate_val") + finally: + opt_mod.restore_prompts(snapshot) + deltas = gate_mod.compute_delta(baseline_val, candidate_val) + train_deltas = gate_mod.compute_delta(baseline_train, candidate_train) + print(f" candidate train {candidate_train.pass_count}/{candidate_train.total}" + f" | val {candidate_val.pass_count}/{candidate_val.total}") + + # ---- 阶段 5:接受门控 ---- + print("[5/6] 接受门控 ...") + decision = gate_mod.evaluate_gate( + baseline_val, candidate_val, deltas, candidate.cost_usd, config.get("gate", {}) + ) + print(f" 
decision={'ACCEPT' if decision.accepted else 'REJECT'} :: {decision.summary}") + + # ---- 阶段 6:审计落盘 ---- + print("[6/6] 审计落盘 ...") + finished_at = datetime.now() + run_meta = { + "mode": mode, + "seed": seed, + "started_at": started_at.isoformat(timespec="seconds"), + "finished_at": finished_at.isoformat(timespec="seconds"), + "elapsed_seconds": round((finished_at - started_at).total_seconds(), 2), + "train_dataset": str(TRAIN_PATH.relative_to(_HERE)), + "val_dataset": str(VAL_PATH.relative_to(_HERE)), + "target_fields": list(PROMPT_PATHS.keys()), + "gate_config": config.get("gate", {}), + } + report = report_mod.build_report( + run_meta=run_meta, + baseline_train=baseline_train, + baseline_val=baseline_val, + train_attrib=train_attrib, + val_attrib=val_attrib, + candidate=candidate, + candidate_train=candidate_train, + candidate_val=candidate_val, + deltas=deltas, + train_deltas=train_deltas, + gate=decision, + ) + report_mod.save_json(report, output_dir / "optimization_report.json") + report_mod.save_markdown(report, output_dir / "optimization_report.md") + print(f" 报告已写入 {output_dir}/optimization_report.(json|md)") + return report + + +def main() -> None: + parser = argparse.ArgumentParser(description="Evaluation + Optimization 自动闭环") + parser.add_argument("--mode", choices=["fake", "real"], default="fake", + help="fake=离线确定性(默认);real=真实 LLM + AgentOptimizer") + parser.add_argument("--output-dir", default="", help="产物目录;默认 runs/") + args = parser.parse_args() + + output_dir = Path(args.output_dir) if args.output_dir else ( + RUNS_DIR / datetime.now().strftime("%Y-%m-%dT%H-%M-%S") + ) + report = asyncio.run(run(args.mode, output_dir)) + accepted = report["gate_decision"]["accepted"] + # 退出码:接受=0,拒绝=2(便于 CI 判定;拒绝不是错误,是有效负决策) + sys.exit(0 if accepted else 2) + + +if __name__ == "__main__": + main() diff --git a/examples/optimization/eval_optimize_loop/samples/real_sample.optimization_report.json 
b/examples/optimization/eval_optimize_loop/samples/real_sample.optimization_report.json new file mode 100644 index 0000000..92b6a2a --- /dev/null +++ b/examples/optimization/eval_optimize_loop/samples/real_sample.optimization_report.json @@ -0,0 +1,397 @@ +{ + "schema_version": "eol-v1", + "run": { + "mode": "real", + "seed": 42, + "started_at": "2026-07-01T20:17:13", + "finished_at": "2026-07-01T20:17:21", + "elapsed_seconds": 8.4, + "train_dataset": "data/train.evalset.json", + "val_dataset": "data/val.evalset.json", + "target_fields": [ + "router", + "system_prompt", + "skill" + ], + "gate_config": { + "min_val_score_delta": 0.05, + "forbid_new_hard_fail": true, + "key_case_ids": [ + "val_add_class" + ], + "cost_budget_usd": 1.0 + } + }, + "baseline": { + "train": { + "set_id": "train.evalset", + "pass_count": 2, + "total": 3, + "avg_score": 0.6667, + "cases": { + "train_add_apple": { + "passed": true, + "score": 1.0, + "expected": "答案:11 个", + "actual": "答案:11 个", + "error": "", + "metrics": [ + { + "name": "final_response_avg_score", + "score": 1.0, + "passed": true, + "threshold": 1.0, + "reason": "" + } + ], + "trajectory": [ + "final_response:答案:11 个" + ] + }, + "train_discount_shirt": { + "passed": false, + "score": 0.0, + "expected": "答案:160 元", + "actual": "抱歉,我暂时无法解答这道题。", + "error": "", + "metrics": [ + { + "name": "final_response_avg_score", + "score": 0.0, + "passed": false, + "threshold": 1.0, + "reason": "" + } + ], + "trajectory": [ + "final_response:抱歉,我暂时无法解答这道题。" + ] + }, + "train_mul_car": { + "passed": true, + "score": 1.0, + "expected": "答案:150 公里", + "actual": "答案:150 公里", + "error": "", + "metrics": [ + { + "name": "final_response_avg_score", + "score": 1.0, + "passed": true, + "threshold": 1.0, + "reason": "" + } + ], + "trajectory": [ + "final_response:答案:150 公里" + ] + } + } + }, + "val": { + "set_id": "val.evalset", + "pass_count": 3, + "total": 3, + "avg_score": 1.0, + "cases": { + "val_add_class": { + "passed": true, + "score": 1.0, + 
"expected": "答案:35 人", + "actual": "答案:35 人", + "error": "", + "metrics": [ + { + "name": "final_response_avg_score", + "score": 1.0, + "passed": true, + "threshold": 1.0, + "reason": "" + } + ], + "trajectory": [ + "final_response:答案:35 人" + ] + }, + "val_mul_box": { + "passed": true, + "score": 1.0, + "expected": "答案:60 个", + "actual": "答案:60 个", + "error": "", + "metrics": [ + { + "name": "final_response_avg_score", + "score": 1.0, + "passed": true, + "threshold": 1.0, + "reason": "" + } + ], + "trajectory": [ + "final_response:答案:60 个" + ] + }, + "val_add_orange": { + "passed": true, + "score": 1.0, + "expected": "答案:8 个", + "actual": "答案:8 个", + "error": "", + "metrics": [ + { + "name": "final_response_avg_score", + "score": 1.0, + "passed": true, + "threshold": 1.0, + "reason": "" + } + ], + "trajectory": [ + "final_response:答案:8 个" + ] + } + } + } + }, + "failure_attribution": { + "train": { + "clusters": { + "最终回复不匹配": 1 + }, + "cases": { + "train_discount_shirt": { + "category": "final_response_mismatch", + "category_label": "最终回复不匹配", + "reason": "agent 未给出正确计算结果,直接拒绝回答,与期望答案不符。", + "source": "llm" + } + } + }, + "val": { + "clusters": {}, + "cases": {} + } + }, + "candidate": { + "status": "SUCCEEDED", + "stop_reason": "required_metrics_passing", + "optimized_fields": [], + "rounds": 0, + "cost_usd": 0.0, + "duration_seconds": 2.0019, + "meta": { + "backend": "real", + "algorithm": "gepa_reflective", + "baseline_pass_rate": 1.0, + "best_pass_rate": 1.0 + }, + "prompts": { + "router": "# 路由器 Prompt\n\n你是算术应用题助手的路由器。判断用户问题属于哪类运算,并把问题转交给解题 agent。\n\n- 认真读题,识别是加法、乘法还是折扣问题。\n- 把控制权交给下游解题 agent。\n\n\n\n", + "system_prompt": "# 解题 Agent · System Prompt\n\n你是一名严谨的小学算术老师,负责把解题结果整理成最终答复。\n\n## 输出格式要求\n- 最终答复必须以「答案:」开头,后面紧跟计算得到的数字。\n- 数字后必须带上正确的单位(个 / 公里 / 元 / 人 等)。\n- 示例:`答案:11 个`\n\n\n\n\n", + "skill": "# 解题技能 · Skill Prompt\n\n这里描述你会做哪些类型的算术题。只有列出的题型你才能解答;遇到未掌握的题型,\n如实回答「抱歉,我暂时无法解答这道题。」,不要瞎猜。\n\n## 已掌握题型\n- 加法:把题目中的两个数量相加。\n\n\n\n" + }, + "rounds_detail": [] + 
}, + "candidate_train": { + "set_id": "train.evalset", + "pass_count": 2, + "total": 3, + "avg_score": 0.6667, + "cases": { + "train_mul_car": { + "passed": true, + "score": 1.0, + "expected": "答案:150 公里", + "actual": "答案:150 公里", + "error": "", + "metrics": [ + { + "name": "final_response_avg_score", + "score": 1.0, + "passed": true, + "threshold": 1.0, + "reason": "" + } + ], + "trajectory": [ + "final_response:答案:150 公里" + ] + }, + "train_add_apple": { + "passed": true, + "score": 1.0, + "expected": "答案:11 个", + "actual": "答案:11 个", + "error": "", + "metrics": [ + { + "name": "final_response_avg_score", + "score": 1.0, + "passed": true, + "threshold": 1.0, + "reason": "" + } + ], + "trajectory": [ + "final_response:答案:11 个" + ] + }, + "train_discount_shirt": { + "passed": false, + "score": 0.0, + "expected": "答案:160 元", + "actual": "我暂时无法解答这道题。因为题目涉及“打折扣”的计算,而我的技能只包括加法,无法处理乘法或百分比相关的运算。请提供加法类题目,我会很乐意为您解答!", + "error": "", + "metrics": [ + { + "name": "final_response_avg_score", + "score": 0.0, + "passed": false, + "threshold": 1.0, + "reason": "" + } + ], + "trajectory": [ + "final_response:我暂时无法解答这道题。因为题目涉及“打折扣”的计算,而我的技能只包括加法,无法处理乘法或百分比相关的运算。请提供加法类题目,我会很乐意为您解答!" 
+ ] + } + } + }, + "candidate_val": { + "set_id": "val.evalset", + "pass_count": 3, + "total": 3, + "avg_score": 1.0, + "cases": { + "val_add_class": { + "passed": true, + "score": 1.0, + "expected": "答案:35 人", + "actual": "答案:35 人", + "error": "", + "metrics": [ + { + "name": "final_response_avg_score", + "score": 1.0, + "passed": true, + "threshold": 1.0, + "reason": "" + } + ], + "trajectory": [ + "final_response:答案:35 人" + ] + }, + "val_add_orange": { + "passed": true, + "score": 1.0, + "expected": "答案:8 个", + "actual": "答案:8 个", + "error": "", + "metrics": [ + { + "name": "final_response_avg_score", + "score": 1.0, + "passed": true, + "threshold": 1.0, + "reason": "" + } + ], + "trajectory": [ + "final_response:答案:8 个" + ] + }, + "val_mul_box": { + "passed": true, + "score": 1.0, + "expected": "答案:60 个", + "actual": "答案:60 个", + "error": "", + "metrics": [ + { + "name": "final_response_avg_score", + "score": 1.0, + "passed": true, + "threshold": 1.0, + "reason": "" + } + ], + "trajectory": [ + "final_response:答案:60 个" + ] + } + } + }, + "overfitting_signal": { + "train_score_delta": 0.0, + "val_score_delta": 0.0, + "train_up_val_down": false + }, + "delta": { + "val_score_delta": 0.0, + "train_score_delta": 0.0, + "baseline_val_pass": "3/3", + "candidate_val_pass": "3/3", + "baseline_train_pass": "2/3", + "candidate_train_pass": "2/3", + "per_case": [ + { + "eval_id": "val_add_class", + "baseline_passed": true, + "candidate_passed": true, + "baseline_score": 1.0, + "candidate_score": 1.0, + "score_delta": 0.0, + "status": "unchanged" + }, + { + "eval_id": "val_mul_box", + "baseline_passed": true, + "candidate_passed": true, + "baseline_score": 1.0, + "candidate_score": 1.0, + "score_delta": 0.0, + "status": "unchanged" + }, + { + "eval_id": "val_add_orange", + "baseline_passed": true, + "candidate_passed": true, + "baseline_score": 1.0, + "candidate_score": 1.0, + "score_delta": 0.0, + "status": "unchanged" + } + ], + "newly_passed": [], + "newly_failed": [], 
+ "regressed": [] + }, + "gate_decision": { + "accepted": false, + "summary": "拒绝候选:命中规则 ['min_val_score_delta']。", + "rules": [ + { + "name": "min_val_score_delta", + "passed": false, + "detail": "验证集平均分 delta=+0.0000,阈值 ≥ +0.0500" + }, + { + "name": "forbid_new_hard_fail", + "passed": true, + "detail": "新增失败 case=无" + }, + { + "name": "key_cases_no_regression", + "passed": true, + "detail": "关键 case=['val_add_class'],其中退化=无" + }, + { + "name": "cost_within_budget", + "passed": true, + "detail": "候选成本=$0.0000,预算 ≤ $1.0000" + } + ] + } +} \ No newline at end of file diff --git a/examples/optimization/eval_optimize_loop/samples/real_sample.optimization_report.md b/examples/optimization/eval_optimize_loop/samples/real_sample.optimization_report.md new file mode 100644 index 0000000..076a002 --- /dev/null +++ b/examples/optimization/eval_optimize_loop/samples/real_sample.optimization_report.md @@ -0,0 +1,56 @@ +# Evaluation + Optimization 闭环报告 + +- **决策**:❌ 拒绝 (REJECT) +- **结论**:拒绝候选:命中规则 ['min_val_score_delta']。 +- 运行模式:`real` | seed:`42` | 耗时:8.4s +- 时间:2026-07-01T20:17:13 → 2026-07-01T20:17:21 + +## 1. 分数总览 + +| 数据集 | baseline 通过 | baseline 均分 | candidate 通过 | candidate 均分 | Δ均分 | +|---|---|---|---|---|---| +| 训练集 | 2/3 | 0.667 | 2/3 | 0.667 | +0.000 | +| 验证集 | 3/3 | 1.000 | 3/3 | 1.000 | +0.000 | + +## 2. Baseline 失败归因 + +**训练集** 失败聚类:{'最终回复不匹配': 1} +- `train_discount_shirt` → **最终回复不匹配**(llm):agent 未给出正确计算结果,直接拒绝回答,与期望答案不符。 + - 关键轨迹:final_response:抱歉,我暂时无法解答这道题。 + +**验证集** 失败聚类:无失败 + +## 3. 候选验证 · 逐 case delta + +| case | baseline | candidate | Δ分 | 状态 | +|---|---|---|---|---| +| `val_add_class` | PASS | PASS | +0.000 | ⚪ 不变 | +| `val_mul_box` | PASS | PASS | +0.000 | ⚪ 不变 | +| `val_add_orange` | PASS | PASS | +0.000 | ⚪ 不变 | + +- 新增通过:无 +- 新增失败:无 + +## 4. 
门控决策明细 + +| 规则 | 结果 | 说明 | +|---|---|---| +| `min_val_score_delta` | ❌ | 验证集平均分 delta=+0.0000,阈值 ≥ +0.0500 | +| `forbid_new_hard_fail` | ✅ | 新增失败 case=无 | +| `key_cases_no_regression` | ✅ | 关键 case=['val_add_class'],其中退化=无 | +| `cost_within_budget` | ✅ | 候选成本=$0.0000,预算 ≤ $1.0000 | + +## 5. 每轮候选审计 + +- (优化器未产出任何轮次;候选=baseline) + +> 每轮候选 prompt 全文见 `optimization_report.json` 的 `candidate.rounds_detail`。 + +## 6. 候选与成本审计 + +- 优化状态:`SUCCEEDED` | stop_reason:`required_metrics_passing` +- 被改写字段:[] | 轮数:0 +- 成本:$0.000000 | 优化耗时:2.0019s +- 后端:`real` + +> 候选 prompt 全文与逐 case 明细见 `optimization_report.json`。