diff --git a/examples/optimization/eval_optimize_loop/.gitignore b/examples/optimization/eval_optimize_loop/.gitignore new file mode 100644 index 0000000..1f08172 --- /dev/null +++ b/examples/optimization/eval_optimize_loop/.gitignore @@ -0,0 +1,3 @@ +runs/ +__pycache__/ +*.pyc diff --git a/examples/optimization/eval_optimize_loop/DESIGN.md b/examples/optimization/eval_optimize_loop/DESIGN.md new file mode 100644 index 0000000..c0c899f --- /dev/null +++ b/examples/optimization/eval_optimize_loop/DESIGN.md @@ -0,0 +1,35 @@ +# 方案设计说明 + +本示例把 `AgentEvaluator` 与 `AgentOptimizer` 串成「评测 → 失败归因 → prompt 优化 +→ 回归验证 → 产物审计」的自动闭环,目标是判断一次优化是否**真的值得进生产**, +而非仅仅让训练分变高。 + +## 失败归因方法 + +Baseline 评测后,对每条未达标 case 做单标签归因,落入六类之一:最终回复不匹配、 +工具调用错误、参数错误、LLM rubric 不达标、知识召回不足、格式不符合要求。real 模式 +用一个 LLM 裁判读『题面 / 期望答案 / 实际答复 / 运行错误』输出 `{category, reason}`; +fake 模式用确定性规则(拒答→知识召回不足;数值不符→参数错误;数值对但串不匹配→ +格式不符;运行报错→工具调用错误)。两种后端输出结构一致,并聚类成类别计数, +指导优化器聚焦真正的缺陷,且每条失败都附一句可解释原因。 + +## 接受策略 + +优化产出候选后,在**验证集**上逐 case 与 baseline 对比(新增通过 / 新增失败 / +分升 / 分降 / 不变),再过一道可配置 gate:验证集均分提升需 ≥ 阈值、不得新增 +hard fail(原通过转失败)、关键 case 不得退化、成本不超预算。任一规则不过即拒绝。 +拒绝是有效的负决策(退出码 2),不是流程错误。 + +## 防过拟合策略 + +严格 train/val 隔离:优化器只在训练集反思,验证集仅用于接受判定,从不参与改写。 +候选会在训练集上重跑一次,报告显式给出 `overfitting_signal`——「训练涨而验证不涨」 +即高亮。示例内置过拟合候选:它学会乘法(训练 +0.33)却带入「大数默认按乘法」副作用, +使一条大数加法验证题由通过转失败,`forbid_new_hard_fail` 据此拒绝。 + +## 产物审计方式 + +每次运行落到独立 `runs/<run_id>/`,记录 baseline/candidate 逐 case 评测、失败归因、 +逐 case delta、gate 决策与逐规则理由、候选 prompt 全文、成本、耗时、随机种子与数据 +路径,输出 `optimization_report.json` 与 `optimization_report.md`,使「为何接受/拒绝」 +完全可复现、可审计。 diff --git a/examples/optimization/eval_optimize_loop/README.md b/examples/optimization/eval_optimize_loop/README.md new file mode 100644 index 0000000..7c05fb7 --- /dev/null +++ b/examples/optimization/eval_optimize_loop/README.md @@ -0,0 +1,120 @@ +# eval_optimize_loop · 评测 + 优化自动闭环 + +把 `AgentEvaluator`(评测)与 `AgentOptimizer`(优化)串成一条可复现、可审计的闭环: + +``` +Baseline 评测 → 失败归因 → prompt 优化 → 候选验证(逐 case delta) → 接受门控 → 审计落盘 +``` + 
+它回答的不是"能不能跑一次优化",而是"这次优化**是否真的值得接受**"——是否提升、 +是否牺牲其他指标、是否过拟合、是否值得回写源 prompt。 + +## 快速开始 + +```bash +# 离线 fake 模式:无需 API Key,确定性、可复现,秒级完成(默认) +python run_pipeline.py + +# 真实模式:需要 TRPC_AGENT_API_KEY / TRPC_AGENT_BASE_URL / TRPC_AGENT_MODEL_NAME +python run_pipeline.py --mode real +``` + +产物写入 `runs/<run_id>/`,含 `optimization_report.json` 与 `optimization_report.md`。 +退出码:**0 = 接受候选**,**2 = 拒绝候选**(拒绝是有效负决策,便于 CI 判定)。 + +仓库已提交两份示例输出: +- 规范样例(fake 模式,确定性):[optimization_report.json](optimization_report.json) / + [optimization_report.md](optimization_report.md) —— 完整复现"成功/无效/退化"三场景与过拟合拒绝。 +- 真实链路样例(real 模式,实跑于一个 OpenAI 兼容端点): + [samples/real_sample.optimization_report.md](samples/real_sample.optimization_report.md)。 + 该次真实模型在验证集 baseline 已 3/3 满分(真实 LLM 不受 fake 的 `@cap` 能力标记约束, + 乘法本就会做),故 GEPA 0 轮无可优化,门控据此以"无提升"正确拒绝——这也是一种有效负决策。 + +## 双后端设计(为什么有 fake) + +issue 要求 "没有真实 API Key 时也能跑通核心流程" 且 "fake 模式 ≤ 3 分钟"。因此 +pipeline 是**双后端**:编排、评测、门控、报告四层两档**完全共用真实代码**,fake 只替换 +三个要花钱/联网的点(Agent 推理、失败归因、优化)。 + +| 阶段 | fake(默认·离线) | real(配 Key) | +|---|---|---| +| Agent 推理 | `agent/fake_backend.py` 确定性求解 | `agent/orchestrator.py` 真实多 agent + `LlmAgent` | +| 评测打分 | 真实 `AgentEvaluator` + text-contains | 同左 | +| 失败归因 | 确定性规则桩 | 纯 LLM 裁判 | +| 优化 | 脚本化候选 | 真实 `AgentOptimizer`(GEPA) | + +fake 后端从 prompt 文件里的 `<!-- @cap: xxx -->` 能力标记决定行为,于是"改 prompt"被 +映射成"改能力集合",让每条 case 的 pass/fail 随候选确定性翻转——这正是稳定复现三类 +场景所需的可控信号。 + +**无 Key 跑通的三种途径**(issue 要求 fake judge / fake model / trace mode): +- **fake model**:默认档,`call_agent` 换成确定性求解器(本示例采用)。 +- **fake judge**:评测 metric 默认用 `final_response` 文本匹配,不调用任何裁判模型; + 若改用 LLM-rubric 指标,把判分入口替换为规则桩即可(`pipeline/attribution.py` 的 + `classify_fake` 即是失败归因的 fake judge 实现)。 +- **trace mode**:evalset case 可携带预录 `intermediate_data`,`AgentEvaluator` 以 + trace 直接评分而不驱动 agent(SDK 原生支持,无需模型)。 + +## 优化目标(三字段 TargetPrompt) + +对应 `agent/prompts/` 三个文件,`round_robin` 每轮只改一个便于归因: + +- `router` — [router.md](agent/prompts/router.md):题型分流 +- `system_prompt` — [system.md](agent/prompts/system.md):输出格式约束 +- `skill` — 
[skill.md](agent/prompts/skill.md):解题题型能力 + +## 样例 case 与三类场景 + +6 条 case(3 训练 / 3 验证,见 [data/](data/)),覆盖 issue 要求的三类: + +| 场景 | case | baseline → candidate | +|---|---|---| +| 可优化成功 | `train_mul_car` / `val_mul_box` | FAIL → PASS(学会乘法) | +| 优化无效 | `train_discount_shirt` | FAIL → FAIL(折扣仍不会) | +| 优化后退化 | `val_add_class` | PASS → FAIL(大数加法被过拟合规则误算) | + +候选在训练集 +0.33 却在验证集出现新增失败 → **门控拒绝**,正是"训练涨、验证退"的 +过拟合必须挡下的情形。 + +## 失败归因(六类) + +最终回复不匹配 / 工具调用错误 / 参数错误 / LLM rubric 不达标 / 知识召回不足 / +格式不符合要求。每条失败至少给出一条可解释原因,并聚类成类别计数(见报告第 2 节)。 + +## 接受门控(可配置,[config.json](config.json)) + +```json +"gate": { + "min_val_score_delta": 0.05, // 验证集均分提升需 ≥ 此值 + "forbid_new_hard_fail": true, // 不得新增 hard fail(原通过转失败) + "key_case_ids": ["val_add_class"],// 关键 case 不得退化 + "cost_budget_usd": 1.0 // 成本预算 +} +``` + +任一规则不过即拒绝。 + +## 目录结构 + +``` +eval_optimize_loop/ +├── run_pipeline.py # 入口:六阶段编排 +├── config.json # gate + seed +├── optimizer.json # AgentOptimizer(GEPA) 配置(real 模式) +├── eval_metrics.json # 共享评测 metric(final_response contains) +├── DESIGN.md # 300–500 字方案说明 +├── optimization_report.json # 示例输出(fake 模式) +├── optimization_report.md +├── data/ # train.evalset.json / val.evalset.json(6 条) +├── agent/ +│ ├── orchestrator.py # real 档:router→solver(system+skill) +│ ├── fake_backend.py # fake 档:确定性求解器 +│ ├── config.py # real 档模型配置(读环境变量) +│ └── prompts/ # router.md / system.md / skill.md +└── pipeline/ + ├── evaluate.py # AgentEvaluator → 结构化逐 case + ├── attribution.py # 六类失败归因(LLM / 规则) + ├── optimize.py # AgentOptimizer 包装 + 脚本化候选 + ├── gate.py # 逐 case delta + 接受门控 + └── report.py # optimization_report.{json,md} +``` diff --git a/examples/optimization/eval_optimize_loop/agent/__init__.py b/examples/optimization/eval_optimize_loop/agent/__init__.py new file mode 100644 index 0000000..cd2de93 --- /dev/null +++ b/examples/optimization/eval_optimize_loop/agent/__init__.py @@ -0,0 +1,29 @@ +# Tencent is pleased to support the open source community by making tRPC-Agent-Python available. 
+# +# Copyright (C) 2026 Tencent. All rights reserved. +# +# tRPC-Agent-Python is licensed under Apache-2.0. +"""被优化的 agent:三个 prompt 字段(router/system/skill) + 双后端(real/fake)。""" + +from __future__ import annotations + +from .orchestrator import ( + ROUTER_PROMPT_PATH, + SKILL_PROMPT_PATH, + SYSTEM_PROMPT_PATH, +) + +# TargetPrompt 字段名 -> prompt 文件路径。pipeline 各阶段共用这一份映射: +# real 模式喂给 AgentOptimizer 的 TargetPrompt,fake 模式喂给确定性求解器。 +PROMPT_PATHS = { + "router": ROUTER_PROMPT_PATH, + "system_prompt": SYSTEM_PROMPT_PATH, + "skill": SKILL_PROMPT_PATH, +} + +__all__ = [ + "PROMPT_PATHS", + "ROUTER_PROMPT_PATH", + "SYSTEM_PROMPT_PATH", + "SKILL_PROMPT_PATH", +] diff --git a/examples/optimization/eval_optimize_loop/agent/config.py b/examples/optimization/eval_optimize_loop/agent/config.py new file mode 100644 index 0000000..89c44aa --- /dev/null +++ b/examples/optimization/eval_optimize_loop/agent/config.py @@ -0,0 +1,36 @@ +# Tencent is pleased to support the open source community by making tRPC-Agent-Python available. +# +# Copyright (C) 2026 Tencent. All rights reserved. +# +# tRPC-Agent-Python is licensed under Apache-2.0. 
def get_model_config() -> tuple[str, str, str]:
    """Return ``(api_key, base_url, model_name)`` read from the environment.

    The values come from ``TRPC_AGENT_API_KEY``, ``TRPC_AGENT_BASE_URL`` and
    ``TRPC_AGENT_MODEL_NAME``. Surrounding whitespace is stripped before
    validation, so a blank or newline-padded value (common when a secret is
    injected from a file) is reported as missing instead of being passed on
    to the model client.

    Returns:
        The three stripped, non-empty settings in the order
        ``(api_key, base_url, model_name)``.

    Raises:
        RuntimeError: If any variable is unset or blank. The message lists
            every missing variable and points the user at fake mode.
    """
    names = (
        "TRPC_AGENT_API_KEY",
        "TRPC_AGENT_BASE_URL",
        "TRPC_AGENT_MODEL_NAME",
    )
    values = {name: os.getenv(name, "").strip() for name in names}
    missing = [name for name, value in values.items() if not value]
    if missing:
        raise RuntimeError(
            "real 模式需要以下环境变量: "
            + ", ".join(missing)
            + "。若无 API Key,请改用 fake 模式:python run_pipeline.py --mode fake"
        )
    api_key, base_url, model_name = (values[name] for name in names)
    return api_key, base_url, model_name
# Matches a capability marker such as ``@cap: op-add`` anywhere in a prompt file.
_CAP_RE = re.compile(r"@cap:\s*([a-z0-9\-]+)", re.IGNORECASE)
# Matches unsigned integers and decimals in the question text.
_NUM_RE = re.compile(r"\d+(?:\.\d+)?")

# Canned reply returned when the solver cannot (or may not) answer.
REFUSAL = "抱歉,我暂时无法解答这道题。"

# Capability that must be present for each detected operation.
_REQUIRED_CAP = {"add": "op-add", "mul": "op-mul", "discount": "op-discount"}

# Results are rounded to this many decimals before formatting so binary
# floating-point noise never reaches the answer text.
_MAX_DECIMALS = 10


def read_caps(*prompt_paths: Path) -> set[str]:
    """Collect every ``@cap:`` marker found in the given prompt files.

    Marker names are lower-cased and merged into one set. Files that do not
    exist are skipped silently; any other read error propagates.
    """
    caps: set[str] = set()
    for path in prompt_paths:
        try:
            text = Path(path).read_text(encoding="utf-8")
        except FileNotFoundError:
            continue
        caps.update(match.group(1).lower() for match in _CAP_RE.finditer(text))
    return caps


def _detect_operation(query: str) -> str:
    """Classify the question as ``"discount"``, ``"mul"`` or ``"add"`` by keyword."""
    if "折" in query:
        return "discount"
    if "每" in query:  # per-unit price / rate wording implies multiplication
        return "mul"
    return "add"


def _detect_unit(query: str) -> str:
    """Pick the answer unit by keyword; order matters, most specific first."""
    if "公里" in query:
        return "公里"
    if "元" in query:
        return "元"
    if any(keyword in query for keyword in ("人", "男生", "女生", "名")):
        return "人"
    return "个"


def _format_number(value: float) -> str:
    """Render a number without float noise or a redundant ``.0`` tail.

    ``150.0`` becomes ``'150'`` and ``0.1 + 0.2`` becomes ``'0.3'``. The value
    is rounded to ``_MAX_DECIMALS`` places first; ``float.is_integer`` is used
    so non-finite values format as ``'inf'``/``'nan'`` instead of raising.
    """
    rounded = round(value, _MAX_DECIMALS)
    if rounded.is_integer():
        return str(int(rounded))
    return str(rounded)


def solve(query: str, caps: set[str]) -> str:
    """Deterministically answer an arithmetic word problem from a capability set.

    Capabilities consulted here:

    - ``op-add`` / ``op-mul`` / ``op-discount``: the operation is allowed.
    - ``fmt-unit-suffix``: append the detected unit after the number.
    - ``fmt-answer-prefix``: prefix the reply with ``答案:``.
    - ``assume-mul-default``: deliberate overfitting side effect; an addition
      question with an operand >= 10 is multiplied instead.

    Only the first two numbers in the question are used. A discount of at most
    10 is read as tenths ("8 折" is x0.8); a larger one as hundredths
    ("85 折" is x0.85).

    Returns:
        The reply text, or ``REFUSAL`` when the required capability is missing
        or the question contains fewer than two numbers.
    """
    operation = _detect_operation(query)
    if _REQUIRED_CAP[operation] not in caps:
        return REFUSAL

    numbers = [float(token) for token in _NUM_RE.findall(query)]
    if len(numbers) < 2:
        return REFUSAL
    a, b = numbers[:2]

    if operation == "add":
        overfit = "assume-mul-default" in caps and (a >= 10 or b >= 10)
        result = a * b if overfit else a + b
    elif operation == "mul":
        result = a * b
    else:
        # Multiply before dividing: 3 * 7 / 10 is exactly 2.1, whereas
        # 3 * (7 / 10) picks up a rounding error.
        result = a * b / (10.0 if b <= 10 else 100.0)

    body = _format_number(result)
    if "fmt-unit-suffix" in caps:
        body = f"{body} {_detect_unit(query)}"
    if "fmt-answer-prefix" in caps:
        body = f"答案:{body}"
    return body


async def call_agent_fake(query: str, prompt_paths: dict[str, Path]) -> str:
    """Fake agent callback: read capabilities from the prompt files, then solve.

    ``prompt_paths`` maps a prompt field name to its file; only the paths are
    used. The files are re-read on every call, so an edited prompt takes effect
    on the next query.
    """
    caps = read_caps(*prompt_paths.values())
    return solve(query, caps)
# Prompt files, one per optimizable field (router / system_prompt / skill).
_PROMPTS_DIR = Path(__file__).parent / "prompts"
ROUTER_PROMPT_PATH = _PROMPTS_DIR / "router.md"
SYSTEM_PROMPT_PATH = _PROMPTS_DIR / "system.md"
SKILL_PROMPT_PATH = _PROMPTS_DIR / "skill.md"

APP_NAME = "eval_optimize_loop"


def _create_agent(name: str, instruction: str) -> LlmAgent:
    """Build an ``LlmAgent`` with the given name and instruction text.

    Model settings come from ``get_model_config()`` on every call; the caller
    supplies the instruction it has just read from the prompt files.
    """
    api_key, base_url, model_name = get_model_config()
    return LlmAgent(
        name=name,
        description=f"eval_optimize_loop {name}",
        model=OpenAIModel(model_name=model_name, api_key=api_key, base_url=base_url),
        instruction=instruction,
        generate_content_config=GenerateContentConfig(
            temperature=0.2, top_p=0.9, max_output_tokens=1024,
        ),
    )


async def _run_one(agent: LlmAgent, user_text: str) -> str:
    """Run one agent on one user message and return its final answer text.

    A fresh session service, runner and session id are created per call so
    evaluation cases do not share state. Only final-response events are read,
    and parts flagged as ``thought`` are left out of the returned text.
    """
    session_service = InMemorySessionService()
    runner = Runner(app_name=APP_NAME, agent=agent, session_service=session_service)
    session_id = str(uuid.uuid4())
    user_id = "pipeline"
    await session_service.create_session(
        app_name=APP_NAME, user_id=user_id, session_id=session_id, state={},
    )
    user_content = Content(role="user", parts=[Part.from_text(text=user_text)])

    chunks: list[str] = []
    async for event in runner.run_async(
        user_id=user_id, session_id=session_id, new_message=user_content,
    ):
        if not event.is_final_response():
            continue
        if not event.content or not event.content.parts:
            continue
        chunks.extend(
            part.text
            for part in event.content.parts
            if not part.thought and part.text
        )
    return "".join(chunks).strip()


async def call_agent_real(query: str) -> str:
    """Real-mode agent callback: route the question, then solve it.

    All three prompt files are re-read on every call, so a candidate written
    by the optimizer takes effect on the next invocation.

    The router's verdict is appended to the solver instruction. Previously it
    was computed and then dropped, so edits to ``router.md`` could never
    change the final answer. If the router returns no text, the solver
    instruction is exactly what it was before.
    """
    # 1. Router: classify the question type.
    router = _create_agent("router", ROUTER_PROMPT_PATH.read_text(encoding="utf-8").strip())
    route = await _run_one(router, f"用户问题:{query}\n\n请判断这是加法、乘法还是折扣问题。")

    # 2. Solver: system.md (output format) + skill.md (question types).
    solver_instruction = (
        SYSTEM_PROMPT_PATH.read_text(encoding="utf-8").strip()
        + "\n\n## 解题技能\n"
        + SKILL_PROMPT_PATH.read_text(encoding="utf-8").strip()
    )
    if route:
        solver_instruction += "\n\n## 路由结果\n" + route
    solver = _create_agent("solver", solver_instruction)
    return await _run_one(solver, query)
mode 100644 index 0000000..3439801 --- /dev/null +++ b/examples/optimization/eval_optimize_loop/agent/prompts/system.md @@ -0,0 +1,12 @@ +# 解题 Agent · System Prompt + +你是一名严谨的小学算术老师,负责把解题结果整理成最终答复。 + +## 输出格式要求 +- 最终答复必须以「答案:」开头,后面紧跟计算得到的数字。 +- 数字后必须带上正确的单位(个 / 公里 / 元 / 人 等)。 +- 示例:`答案:11 个` + + + + diff --git a/examples/optimization/eval_optimize_loop/config.json b/examples/optimization/eval_optimize_loop/config.json new file mode 100644 index 0000000..c699a18 --- /dev/null +++ b/examples/optimization/eval_optimize_loop/config.json @@ -0,0 +1,9 @@ +{ + "seed": 42, + "gate": { + "min_val_score_delta": 0.05, + "forbid_new_hard_fail": true, + "key_case_ids": ["val_add_class"], + "cost_budget_usd": 1.0 + } +} diff --git a/examples/optimization/eval_optimize_loop/data/train.evalset.json b/examples/optimization/eval_optimize_loop/data/train.evalset.json new file mode 100644 index 0000000..9dbf1c8 --- /dev/null +++ b/examples/optimization/eval_optimize_loop/data/train.evalset.json @@ -0,0 +1,58 @@ +{ + "eval_set_id": "eol_train", + "name": "评测-优化闭环 · 训练集", + "description": "3 道小学算术应用题,用于反思归因与优化。final_response 中的『答案:X 单位』既作为 contains 匹配的参考答案,也作为失败归因时的期望答案。三条覆盖:baseline 已通过(加法) / 可优化成功(乘法) / 优化无效(折扣)。", + "eval_cases": [ + { + "eval_id": "train_add_apple", + "conversation": [ + { + "invocation_id": "tr1", + "user_content": { + "parts": [{"text": "小明早上买了 4 个苹果,下午又买了 7 个苹果,他一共有多少个苹果?"}], + "role": "user" + }, + "final_response": { + "parts": [{"text": "答案:11 个"}], + "role": "model" + } + } + ], + "session_input": {"app_name": "eval_optimize_loop", "user_id": "trainer", "state": {}} + }, + { + "eval_id": "train_mul_car", + "conversation": [ + { + "invocation_id": "tr2", + "user_content": { + "parts": [{"text": "一辆汽车以每小时 60 公里的速度行驶了 2.5 小时,一共行驶了多少公里?"}], + "role": "user" + }, + "final_response": { + "parts": [{"text": "答案:150 公里"}], + "role": "model" + } + } + ], + "session_input": {"app_name": "eval_optimize_loop", "user_id": "trainer", "state": {}} + }, + { + "eval_id": 
"train_discount_shirt", + "conversation": [ + { + "invocation_id": "tr3", + "user_content": { + "parts": [{"text": "一件衣服原价 200 元,现在打 8 折出售,折后价是多少元?"}], + "role": "user" + }, + "final_response": { + "parts": [{"text": "答案:160 元"}], + "role": "model" + } + } + ], + "session_input": {"app_name": "eval_optimize_loop", "user_id": "trainer", "state": {}} + } + ] +} diff --git a/examples/optimization/eval_optimize_loop/data/val.evalset.json b/examples/optimization/eval_optimize_loop/data/val.evalset.json new file mode 100644 index 0000000..94d1d3f --- /dev/null +++ b/examples/optimization/eval_optimize_loop/data/val.evalset.json @@ -0,0 +1,58 @@ +{ + "eval_set_id": "eol_val", + "name": "评测-优化闭环 · 验证集", + "description": "3 道与训练集同分布的算术应用题,用于候选回归与防过拟合门控。三条覆盖:稳定通过(小数加法) / 泛化提升(乘法) / 优化后退化(大数加法被过拟合规则误算)。", + "eval_cases": [ + { + "eval_id": "val_add_orange", + "conversation": [ + { + "invocation_id": "va1", + "user_content": { + "parts": [{"text": "篮子里有 3 个橙子,妈妈又放进 5 个橙子,现在篮子里一共有多少个橙子?"}], + "role": "user" + }, + "final_response": { + "parts": [{"text": "答案:8 个"}], + "role": "model" + } + } + ], + "session_input": {"app_name": "eval_optimize_loop", "user_id": "validator", "state": {}} + }, + { + "eval_id": "val_mul_box", + "conversation": [ + { + "invocation_id": "va2", + "user_content": { + "parts": [{"text": "每盒装 12 个鸡蛋,一共有 5 盒,请问总共有多少个鸡蛋?"}], + "role": "user" + }, + "final_response": { + "parts": [{"text": "答案:60 个"}], + "role": "model" + } + } + ], + "session_input": {"app_name": "eval_optimize_loop", "user_id": "validator", "state": {}} + }, + { + "eval_id": "val_add_class", + "conversation": [ + { + "invocation_id": "va3", + "user_content": { + "parts": [{"text": "三年级二班有 20 名男生和 15 名女生,这个班一共有多少人?"}], + "role": "user" + }, + "final_response": { + "parts": [{"text": "答案:35 人"}], + "role": "model" + } + } + ], + "session_input": {"app_name": "eval_optimize_loop", "user_id": "validator", "state": {}} + } + ] +} diff --git 
a/examples/optimization/eval_optimize_loop/eval_metrics.json b/examples/optimization/eval_optimize_loop/eval_metrics.json new file mode 100644 index 0000000..19684db --- /dev/null +++ b/examples/optimization/eval_optimize_loop/eval_metrics.json @@ -0,0 +1,14 @@ +{ + "metrics": [ + { + "metric_name": "final_response_avg_score", + "threshold": 1.0, + "criterion": { + "final_response": { + "text": {"match": "contains", "case_insensitive": true} + } + } + } + ], + "num_runs": 1 +} diff --git a/examples/optimization/eval_optimize_loop/optimization_report.json b/examples/optimization/eval_optimize_loop/optimization_report.json new file mode 100644 index 0000000..3dc052b --- /dev/null +++ b/examples/optimization/eval_optimize_loop/optimization_report.json @@ -0,0 +1,431 @@ +{ + "schema_version": "eol-v1", + "run": { + "mode": "fake", + "seed": 42, + "started_at": "2026-07-01T20:17:51", + "finished_at": "2026-07-01T20:17:51", + "elapsed_seconds": 0.01, + "train_dataset": "data/train.evalset.json", + "val_dataset": "data/val.evalset.json", + "target_fields": [ + "router", + "system_prompt", + "skill" + ], + "gate_config": { + "min_val_score_delta": 0.05, + "forbid_new_hard_fail": true, + "key_case_ids": [ + "val_add_class" + ], + "cost_budget_usd": 1.0 + } + }, + "baseline": { + "train": { + "set_id": "train.evalset", + "pass_count": 1, + "total": 3, + "avg_score": 0.3333, + "cases": { + "train_add_apple": { + "passed": true, + "score": 1.0, + "expected": "答案:11 个", + "actual": "答案:11 个", + "error": "", + "metrics": [ + { + "name": "final_response_avg_score", + "score": 1.0, + "passed": true, + "threshold": 1.0, + "reason": "" + } + ], + "trajectory": [ + "final_response:答案:11 个" + ] + }, + "train_mul_car": { + "passed": false, + "score": 0.0, + "expected": "答案:150 公里", + "actual": "抱歉,我暂时无法解答这道题。", + "error": "", + "metrics": [ + { + "name": "final_response_avg_score", + "score": 0.0, + "passed": false, + "threshold": 1.0, + "reason": "" + } + ], + "trajectory": [ + 
"final_response:抱歉,我暂时无法解答这道题。" + ] + }, + "train_discount_shirt": { + "passed": false, + "score": 0.0, + "expected": "答案:160 元", + "actual": "抱歉,我暂时无法解答这道题。", + "error": "", + "metrics": [ + { + "name": "final_response_avg_score", + "score": 0.0, + "passed": false, + "threshold": 1.0, + "reason": "" + } + ], + "trajectory": [ + "final_response:抱歉,我暂时无法解答这道题。" + ] + } + } + }, + "val": { + "set_id": "val.evalset", + "pass_count": 2, + "total": 3, + "avg_score": 0.6667, + "cases": { + "val_mul_box": { + "passed": false, + "score": 0.0, + "expected": "答案:60 个", + "actual": "抱歉,我暂时无法解答这道题。", + "error": "", + "metrics": [ + { + "name": "final_response_avg_score", + "score": 0.0, + "passed": false, + "threshold": 1.0, + "reason": "" + } + ], + "trajectory": [ + "final_response:抱歉,我暂时无法解答这道题。" + ] + }, + "val_add_class": { + "passed": true, + "score": 1.0, + "expected": "答案:35 人", + "actual": "答案:35 人", + "error": "", + "metrics": [ + { + "name": "final_response_avg_score", + "score": 1.0, + "passed": true, + "threshold": 1.0, + "reason": "" + } + ], + "trajectory": [ + "final_response:答案:35 人" + ] + }, + "val_add_orange": { + "passed": true, + "score": 1.0, + "expected": "答案:8 个", + "actual": "答案:8 个", + "error": "", + "metrics": [ + { + "name": "final_response_avg_score", + "score": 1.0, + "passed": true, + "threshold": 1.0, + "reason": "" + } + ], + "trajectory": [ + "final_response:答案:8 个" + ] + } + } + } + }, + "failure_attribution": { + "train": { + "clusters": { + "知识召回不足": 2 + }, + "cases": { + "train_mul_car": { + "category": "knowledge_gap", + "category_label": "知识召回不足", + "reason": "agent 声明无法解答,说明缺少对应题型的解题能力(技能/知识缺口)。", + "source": "fake" + }, + "train_discount_shirt": { + "category": "knowledge_gap", + "category_label": "知识召回不足", + "reason": "agent 声明无法解答,说明缺少对应题型的解题能力(技能/知识缺口)。", + "source": "fake" + } + } + }, + "val": { + "clusters": { + "知识召回不足": 1 + }, + "cases": { + "val_mul_box": { + "category": "knowledge_gap", + "category_label": "知识召回不足", + 
"reason": "agent 声明无法解答,说明缺少对应题型的解题能力(技能/知识缺口)。", + "source": "fake" + } + } + } + }, + "candidate": { + "status": "SUCCEEDED", + "stop_reason": "fake_scripted_candidate", + "optimized_fields": [ + "skill" + ], + "rounds": 1, + "cost_usd": 0.0, + "duration_seconds": 0.0001, + "meta": { + "backend": "fake", + "note": "scripted overfitting candidate for offline demo" + }, + "prompts": { + "router": "# 路由器 Prompt\n\n你是算术应用题助手的路由器。判断用户问题属于哪类运算,并把问题转交给解题 agent。\n\n- 认真读题,识别是加法、乘法还是折扣问题。\n- 把控制权交给下游解题 agent。\n\n\n\n", + "system_prompt": "# 解题 Agent · System Prompt\n\n你是一名严谨的小学算术老师,负责把解题结果整理成最终答复。\n\n## 输出格式要求\n- 最终答复必须以「答案:」开头,后面紧跟计算得到的数字。\n- 数字后必须带上正确的单位(个 / 公里 / 元 / 人 等)。\n- 示例:`答案:11 个`\n\n\n\n\n", + "skill": "# 解题技能 · Skill Prompt(候选)\n\n这里描述你会做哪些类型的算术题。只有列出的题型你才能解答;遇到未掌握的题型,\n如实回答「抱歉,我暂时无法解答这道题。」,不要瞎猜。\n\n## 已掌握题型\n- 加法:把题目中的两个数量相加。\n- 乘法:把题目中的两个数量相乘(新增)。\n\n## 快捷启发式(反思器新增,实为过拟合)\n- 数字较大的题目倾向于按乘法处理,往往更快得到答案。\n\n\n\n\n" + }, + "rounds_detail": [ + { + "round": 1, + "optimized_fields": [ + "skill" + ], + "candidate_prompts": { + "router": "# 路由器 Prompt\n\n你是算术应用题助手的路由器。判断用户问题属于哪类运算,并把问题转交给解题 agent。\n\n- 认真读题,识别是加法、乘法还是折扣问题。\n- 把控制权交给下游解题 agent。\n\n\n\n", + "system_prompt": "# 解题 Agent · System Prompt\n\n你是一名严谨的小学算术老师,负责把解题结果整理成最终答复。\n\n## 输出格式要求\n- 最终答复必须以「答案:」开头,后面紧跟计算得到的数字。\n- 数字后必须带上正确的单位(个 / 公里 / 元 / 人 等)。\n- 示例:`答案:11 个`\n\n\n\n\n", + "skill": "# 解题技能 · Skill Prompt(候选)\n\n这里描述你会做哪些类型的算术题。只有列出的题型你才能解答;遇到未掌握的题型,\n如实回答「抱歉,我暂时无法解答这道题。」,不要瞎猜。\n\n## 已掌握题型\n- 加法:把题目中的两个数量相加。\n- 乘法:把题目中的两个数量相乘(新增)。\n\n## 快捷启发式(反思器新增,实为过拟合)\n- 数字较大的题目倾向于按乘法处理,往往更快得到答案。\n\n\n\n\n" + }, + "note": "脚本化过拟合候选:新增乘法能力 + assume-mul-default 副作用" + } + ] + }, + "candidate_train": { + "set_id": "train.evalset", + "pass_count": 2, + "total": 3, + "avg_score": 0.6667, + "cases": { + "train_discount_shirt": { + "passed": false, + "score": 0.0, + "expected": "答案:160 元", + "actual": "抱歉,我暂时无法解答这道题。", + "error": "", + "metrics": [ + { + "name": "final_response_avg_score", + "score": 0.0, + "passed": 
false, + "threshold": 1.0, + "reason": "" + } + ], + "trajectory": [ + "final_response:抱歉,我暂时无法解答这道题。" + ] + }, + "train_add_apple": { + "passed": true, + "score": 1.0, + "expected": "答案:11 个", + "actual": "答案:11 个", + "error": "", + "metrics": [ + { + "name": "final_response_avg_score", + "score": 1.0, + "passed": true, + "threshold": 1.0, + "reason": "" + } + ], + "trajectory": [ + "final_response:答案:11 个" + ] + }, + "train_mul_car": { + "passed": true, + "score": 1.0, + "expected": "答案:150 公里", + "actual": "答案:150 公里", + "error": "", + "metrics": [ + { + "name": "final_response_avg_score", + "score": 1.0, + "passed": true, + "threshold": 1.0, + "reason": "" + } + ], + "trajectory": [ + "final_response:答案:150 公里" + ] + } + } + }, + "candidate_val": { + "set_id": "val.evalset", + "pass_count": 2, + "total": 3, + "avg_score": 0.6667, + "cases": { + "val_add_class": { + "passed": false, + "score": 0.0, + "expected": "答案:35 人", + "actual": "答案:300 人", + "error": "", + "metrics": [ + { + "name": "final_response_avg_score", + "score": 0.0, + "passed": false, + "threshold": 1.0, + "reason": "" + } + ], + "trajectory": [ + "final_response:答案:300 人" + ] + }, + "val_add_orange": { + "passed": true, + "score": 1.0, + "expected": "答案:8 个", + "actual": "答案:8 个", + "error": "", + "metrics": [ + { + "name": "final_response_avg_score", + "score": 1.0, + "passed": true, + "threshold": 1.0, + "reason": "" + } + ], + "trajectory": [ + "final_response:答案:8 个" + ] + }, + "val_mul_box": { + "passed": true, + "score": 1.0, + "expected": "答案:60 个", + "actual": "答案:60 个", + "error": "", + "metrics": [ + { + "name": "final_response_avg_score", + "score": 1.0, + "passed": true, + "threshold": 1.0, + "reason": "" + } + ], + "trajectory": [ + "final_response:答案:60 个" + ] + } + } + }, + "overfitting_signal": { + "train_score_delta": 0.3333, + "val_score_delta": 0.0, + "train_up_val_down": true + }, + "delta": { + "val_score_delta": 0.0, + "train_score_delta": 0.3333, + "baseline_val_pass": 
"2/3", + "candidate_val_pass": "2/3", + "baseline_train_pass": "1/3", + "candidate_train_pass": "2/3", + "per_case": [ + { + "eval_id": "val_mul_box", + "baseline_passed": false, + "candidate_passed": true, + "baseline_score": 0.0, + "candidate_score": 1.0, + "score_delta": 1.0, + "status": "newly_passed" + }, + { + "eval_id": "val_add_class", + "baseline_passed": true, + "candidate_passed": false, + "baseline_score": 1.0, + "candidate_score": 0.0, + "score_delta": -1.0, + "status": "newly_failed" + }, + { + "eval_id": "val_add_orange", + "baseline_passed": true, + "candidate_passed": true, + "baseline_score": 1.0, + "candidate_score": 1.0, + "score_delta": 0.0, + "status": "unchanged" + } + ], + "newly_passed": [ + "val_mul_box" + ], + "newly_failed": [ + "val_add_class" + ], + "regressed": [ + "val_add_class" + ] + }, + "gate_decision": { + "accepted": false, + "summary": "拒绝候选:命中规则 ['min_val_score_delta', 'forbid_new_hard_fail', 'key_cases_no_regression']。疑似过拟合(验证集出现退化/新增失败)。", + "rules": [ + { + "name": "min_val_score_delta", + "passed": false, + "detail": "验证集平均分 delta=+0.0000,阈值 ≥ +0.0500" + }, + { + "name": "forbid_new_hard_fail", + "passed": false, + "detail": "新增失败 case=['val_add_class']" + }, + { + "name": "key_cases_no_regression", + "passed": false, + "detail": "关键 case=['val_add_class'],其中退化=['val_add_class']" + }, + { + "name": "cost_within_budget", + "passed": true, + "detail": "候选成本=$0.0000,预算 ≤ $1.0000" + } + ] + } +} \ No newline at end of file diff --git a/examples/optimization/eval_optimize_loop/optimization_report.md b/examples/optimization/eval_optimize_loop/optimization_report.md new file mode 100644 index 0000000..13ad235 --- /dev/null +++ b/examples/optimization/eval_optimize_loop/optimization_report.md @@ -0,0 +1,63 @@ +# Evaluation + Optimization 闭环报告 + +- **决策**:❌ 拒绝 (REJECT) +- **结论**:拒绝候选:命中规则 ['min_val_score_delta', 'forbid_new_hard_fail', 'key_cases_no_regression']。疑似过拟合(验证集出现退化/新增失败)。 +- 运行模式:`fake` | seed:`42` | 耗时:0.01s +- 
时间:2026-07-01T20:17:51 → 2026-07-01T20:17:51 + +## 1. 分数总览 + +| 数据集 | baseline 通过 | baseline 均分 | candidate 通过 | candidate 均分 | Δ均分 | +|---|---|---|---|---|---| +| 训练集 | 1/3 | 0.333 | 2/3 | 0.667 | +0.333 | +| 验证集 | 2/3 | 0.667 | 2/3 | 0.667 | +0.000 | + +> ⚠️ **过拟合信号**:训练集提升 +0.333,验证集却未提升(+0.000)——候选在训练分布上过度特化。 + +## 2. Baseline 失败归因 + +**训练集** 失败聚类:{'知识召回不足': 2} +- `train_mul_car` → **知识召回不足**(fake):agent 声明无法解答,说明缺少对应题型的解题能力(技能/知识缺口)。 + - 关键轨迹:final_response:抱歉,我暂时无法解答这道题。 +- `train_discount_shirt` → **知识召回不足**(fake):agent 声明无法解答,说明缺少对应题型的解题能力(技能/知识缺口)。 + - 关键轨迹:final_response:抱歉,我暂时无法解答这道题。 + +**验证集** 失败聚类:{'知识召回不足': 1} +- `val_mul_box` → **知识召回不足**(fake):agent 声明无法解答,说明缺少对应题型的解题能力(技能/知识缺口)。 + - 关键轨迹:final_response:抱歉,我暂时无法解答这道题。 + +## 3. 候选验证 · 逐 case delta + +| case | baseline | candidate | Δ分 | 状态 | +|---|---|---|---|---| +| `val_mul_box` | FAIL | PASS | +1.000 | 🟢 新增通过 | +| `val_add_class` | PASS | FAIL | -1.000 | 🔴 新增失败 | +| `val_add_orange` | PASS | PASS | +0.000 | ⚪ 不变 | + +- 新增通过:['val_mul_box'] +- 新增失败:['val_add_class'] + +## 4. 门控决策明细 + +| 规则 | 结果 | 说明 | +|---|---|---| +| `min_val_score_delta` | ❌ | 验证集平均分 delta=+0.0000,阈值 ≥ +0.0500 | +| `forbid_new_hard_fail` | ❌ | 新增失败 case=['val_add_class'] | +| `key_cases_no_regression` | ❌ | 关键 case=['val_add_class'],其中退化=['val_add_class'] | +| `cost_within_budget` | ✅ | 候选成本=$0.0000,预算 ≤ $1.0000 | + +## 5. 每轮候选审计 + +- **Round 1**:改写字段 ['skill'] + - 脚本化过拟合候选:新增乘法能力 + assume-mul-default 副作用 + +> 每轮候选 prompt 全文见 `optimization_report.json` 的 `candidate.rounds_detail`。 + +## 6. 
候选与成本审计 + +- 优化状态:`SUCCEEDED` | stop_reason:`fake_scripted_candidate` +- 被改写字段:['skill'] | 轮数:1 +- 成本:$0.000000 | 优化耗时:0.0001s +- 后端:`fake` + +> 候选 prompt 全文与逐 case 明细见 `optimization_report.json`。 diff --git a/examples/optimization/eval_optimize_loop/optimizer.json b/examples/optimization/eval_optimize_loop/optimizer.json new file mode 100644 index 0000000..1551824 --- /dev/null +++ b/examples/optimization/eval_optimize_loop/optimizer.json @@ -0,0 +1,38 @@ +{ + "evaluate": { + "metrics": [ + { + "metric_name": "final_response_avg_score", + "threshold": 1.0, + "criterion": { + "final_response": { + "text": {"match": "contains", "case_insensitive": true} + } + } + } + ], + "num_runs": 1 + }, + "optimize": { + "eval_case_parallelism": 2, + "stop": {"required_metrics": "all"}, + "algorithm": { + "name": "gepa_reflective", + "seed": 42, + "reflection_lm": { + "model_name": "${TRPC_AGENT_MODEL_NAME}", + "base_url": "${TRPC_AGENT_BASE_URL}", + "api_key": "${TRPC_AGENT_API_KEY}", + "generation_config": {"max_tokens": 4096, "temperature": 0.6} + }, + "candidate_selection_strategy": "pareto", + "module_selector": "round_robin", + "reflection_minibatch_size": 3, + "reflection_history_top_k": 3, + "skip_perfect_score": false, + "use_merge": true, + "max_metric_calls": 60, + "max_iterations_without_improvement": 6 + } + } +} diff --git a/examples/optimization/eval_optimize_loop/pipeline/__init__.py b/examples/optimization/eval_optimize_loop/pipeline/__init__.py new file mode 100644 index 0000000..9de8f4e --- /dev/null +++ b/examples/optimization/eval_optimize_loop/pipeline/__init__.py @@ -0,0 +1,6 @@ +# Tencent is pleased to support the open source community by making tRPC-Agent-Python available. +# +# Copyright (C) 2026 Tencent. All rights reserved. +# +# tRPC-Agent-Python is licensed under Apache-2.0. 
# Failure categories. Keys are the stable identifiers stored in reports; values are
# the Chinese labels rendered for human readers.
CATEGORIES = {
    "final_response_mismatch": "最终回复不匹配",
    "tool_call_error": "工具调用错误",
    "param_error": "参数错误",
    "llm_rubric_fail": "LLM rubric 不达标",
    "knowledge_gap": "知识召回不足",
    "format_error": "格式不符合要求",
}

# Category used when the judge's answer is missing or is not a key of CATEGORIES.
_DEFAULT_CATEGORY = "final_response_mismatch"
# Substring looked for in the agent's reply to detect a refusal.
_REFUSAL_MARK = "无法解答"
_NUM_RE = re.compile(r"-?\d+(?:\.\d+)?")
# Greedy match from the first "{" to the last "}" so prose around the JSON is ignored.
_JSON_OBJECT_RE = re.compile(r"\{.*\}", re.DOTALL)


@dataclass
class Attribution:
    """Attribution of one failed eval case to exactly one failure category."""

    eval_id: str
    category: str  # a key of CATEGORIES
    category_label: str  # the matching Chinese label
    reason: str  # one human-readable explanation per failure
    source: str  # "fake" | "llm"


def _first_number(text: str) -> Optional[str]:
    """Return the first (optionally signed / decimal) number in *text*, or None."""
    match = _NUM_RE.search(text or "")
    return match.group(0) if match else None


def _same_number(left: str, right: str) -> bool:
    """Return True when two numeric strings denote the same value ("12" == "12.0")."""
    from decimal import Decimal  # local import: only this helper needs it

    return Decimal(left) == Decimal(right)


def classify_fake(case: "CaseEval") -> Attribution:
    """Attribute a failed case deterministically from expected / actual / error.

    Rules, in order: a runtime error -> tool_call_error; empty reply ->
    final_response_mismatch; refusal marker in the reply -> knowledge_gap;
    first numbers differ *numerically* -> param_error; expected text not
    contained in the reply -> format_error; otherwise final_response_mismatch.

    Args:
        case: object exposing ``eval_id``, ``expected_text``, ``actual_text``
            and ``error``.

    Returns:
        An :class:`Attribution` with ``source="fake"``.
    """
    actual = case.actual_text or ""
    expected = case.expected_text or ""

    if case.error:
        cat, reason = "tool_call_error", f"运行期报错,链路未产出答复:{case.error[:80]}"
    elif not actual:
        cat, reason = "final_response_mismatch", "agent 未产出任何最终答复文本。"
    elif _REFUSAL_MARK in actual:
        cat, reason = "knowledge_gap", "agent 声明无法解答,说明缺少对应题型的解题能力(技能/知识缺口)。"
    else:
        exp_num, act_num = _first_number(expected), _first_number(actual)
        # Compare by value, not by spelling: "12" and "12.0" are the same answer
        # and must fall through to the format check instead of param_error.
        if exp_num is not None and act_num is not None and not _same_number(exp_num, act_num):
            cat, reason = "param_error", f"计算结果数值错误:期望 {exp_num},实际 {act_num}(运算或取数有误)。"
        elif expected and expected not in actual:
            # Numbers agree but the whole string does not match.
            cat, reason = "format_error", f"数值正确但格式不符:期望包含『{expected}』,实际输出『{actual}』。"
        else:
            cat, reason = "final_response_mismatch", f"最终答复与期望不一致:期望『{expected}』,实际『{actual}』。"

    return Attribution(case.eval_id, cat, CATEGORIES[cat], reason, source="fake")


# NOTE(review): the category placeholder in the JSON template was empty in the
# reviewed text; "<英文 key>" is a reconstruction — confirm the intended wording.
_JUDGE_INSTRUCTION = (
    "你是评测失败归因裁判。给定一道题的『题面/期望答案/agent 实际答复』,"
    "把这次失败归入且仅归入以下六类之一,并给出一句可解释原因。\n"
    "类别(用括号里的英文 key):最终回复不匹配(final_response_mismatch)、"
    "工具调用错误(tool_call_error)、参数错误(param_error)、"
    "LLM rubric 不达标(llm_rubric_fail)、知识召回不足(knowledge_gap)、"
    "格式不符合要求(format_error)。\n"
    "只输出 JSON:{\"category\": \"<英文 key>\", \"reason\": \"<一句话>\"},不要多余文字。"
)


def _parse_judge_reply(reply: str) -> dict:
    """Extract the JSON object embedded in the judge's reply.

    Raises:
        ValueError: if the reply holds no ``{...}`` span, the span is not valid
            JSON (``json.JSONDecodeError`` is a ``ValueError``), or the decoded
            value is not an object.
    """
    match = _JSON_OBJECT_RE.search(reply or "")
    if match is None:
        raise ValueError("judge reply contains no JSON object")
    parsed = json.loads(match.group(0))
    if not isinstance(parsed, dict):
        raise ValueError("judge reply is not a JSON object")
    return parsed


async def classify_llm(case: "CaseEval") -> Attribution:
    """Attribute a failed case with an LLM judge (real mode).

    Any failure (missing SDK, network error, unparsable reply, ...) falls back
    to :func:`classify_fake` so the pipeline is never interrupted; the fallback
    reason is prefixed and carries the exception class name.
    """
    try:
        from trpc_agent_sdk.agents import LlmAgent
        from trpc_agent_sdk.models import OpenAIModel
        from trpc_agent_sdk.runners import Runner
        from trpc_agent_sdk.sessions import InMemorySessionService
        from trpc_agent_sdk.types import Content, GenerateContentConfig, Part

        from agent.config import get_model_config

        api_key, base_url, model_name = get_model_config()
        judge = LlmAgent(
            name="attribution_judge",
            description="failure attribution judge",
            model=OpenAIModel(model_name=model_name, api_key=api_key, base_url=base_url),
            instruction=_JUDGE_INSTRUCTION,
            generate_content_config=GenerateContentConfig(temperature=0.0, max_output_tokens=256),
        )
        session_service = InMemorySessionService()
        runner = Runner(app_name="eol_attribution", agent=judge, session_service=session_service)
        session_id, user_id = str(uuid.uuid4()), "judge"
        await session_service.create_session(
            app_name="eol_attribution", user_id=user_id, session_id=session_id, state={},
        )
        prompt = (
            f"题面:{case.query}\n期望答案:{case.expected_text}\n"
            f"agent 实际答复:{case.actual_text or '(空)'}\n运行错误:{case.error or '无'}"
        )
        content = Content(role="user", parts=[Part.from_text(text=prompt)])

        # Keep only the visible text of the final response; thought parts are skipped.
        chunks: list[str] = []
        async for event in runner.run_async(user_id=user_id, session_id=session_id, new_message=content):
            if event.is_final_response() and event.content and event.content.parts:
                chunks.extend(
                    part.text for part in event.content.parts if part.text and not part.thought
                )

        parsed = _parse_judge_reply("".join(chunks))
        cat = parsed.get("category", _DEFAULT_CATEGORY)
        if not isinstance(cat, str) or cat not in CATEGORIES:
            cat = _DEFAULT_CATEGORY
        # ``or ""`` keeps a JSON null reason from being rendered as the text "None".
        reason = str(parsed.get("reason") or "").strip() or "(裁判未给出原因)"
        return Attribution(case.eval_id, cat, CATEGORIES[cat], reason, source="llm")
    except Exception as exc:  # noqa: BLE001 - attribution failure must not break the loop
        fallback = classify_fake(case)
        fallback.reason = f"[LLM 裁判失败回退规则] {fallback.reason}(原因:{exc.__class__.__name__})"
        return fallback


async def attribute_failures(set_eval, mode: str) -> dict:
    """Attribute every failed case of *set_eval* and count cases per category.

    Args:
        set_eval: object whose ``cases`` maps eval_id -> case (with ``passed``).
        mode: ``"real"`` uses the LLM judge; any other value uses the rules.

    Returns:
        ``{"attributions": {eval_id: Attribution}, "clusters": {label: count}}``.
    """
    attributions: dict[str, Attribution] = {}
    for eval_id, case in set_eval.cases.items():
        if case.passed:
            continue
        if mode == "real":
            attributions[eval_id] = await classify_llm(case)
        else:
            attributions[eval_id] = classify_fake(case)

    clusters: dict[str, int] = {}
    for attr in attributions.values():
        clusters[attr.category_label] = clusters.get(attr.category_label, 0) + 1

    return {"attributions": attributions, "clusters": clusters}
# Type of the agent callback passed to the evaluator: an async callable that
# takes one str and resolves to a str.
CallAgent = Callable[[str], Awaitable[str]]


@dataclass
class MetricScore:
    """Result of a single metric on a single case."""

    name: str
    score: float
    passed: bool
    threshold: float
    reason: str = ""


@dataclass
class CaseEval:
    """Evaluation result of one case, aggregated across metrics."""

    eval_id: str
    passed: bool
    score: float  # score of the first metric reported for the case
    metrics: list[MetricScore] = field(default_factory=list)
    query: str = ""
    expected_text: str = ""
    actual_text: str = ""
    error: str = ""
    trajectory: list[str] = field(default_factory=list)  # tool calls + final reply


@dataclass
class SetEval:
    """Evaluation result of a whole evalset."""

    set_id: str
    cases: dict[str, CaseEval]

    @property
    def pass_count(self) -> int:
        """Number of cases whose ``passed`` flag is true."""
        return sum(1 for c in self.cases.values() if c.passed)

    @property
    def total(self) -> int:
        """Number of cases in the set."""
        return len(self.cases)

    @property
    def avg_score(self) -> float:
        """Mean case score; 0.0 for an empty set."""
        if not self.cases:
            return 0.0
        return sum(c.score for c in self.cases.values()) / len(self.cases)


def _content_text(content) -> str:
    """Concatenate the text of every part of a content object, stripped."""
    if content is None or not getattr(content, "parts", None):
        return ""
    return "".join(p.text for p in content.parts if getattr(p, "text", None)).strip()


def _json_parts_text(content) -> str:
    """Join the ``text`` of each part of a JSON content dict.

    Tolerates ``null`` for the content itself, its ``parts`` list, or a part's
    ``text`` — all of which are treated as empty.
    """
    parts = (content or {}).get("parts") or []
    return "".join(part.get("text") or "" for part in parts).strip()


def _load_expected(dataset_path: Path) -> dict[str, tuple[str, str]]:
    """Read ``(query, expected_text)`` per case from an evalset JSON file.

    Only the first conversation turn is used; cases without a conversation are
    skipped.

    Raises:
        KeyError: if a case with a conversation has no ``eval_id``.
    """
    data = json.loads(Path(dataset_path).read_text(encoding="utf-8"))
    expected_by_id: dict[str, tuple[str, str]] = {}
    for case in data.get("eval_cases") or []:
        conversation = case.get("conversation") or []
        if not conversation:
            continue
        first_turn = conversation[0]
        expected_by_id[case["eval_id"]] = (
            _json_parts_text(first_turn.get("user_content")),
            _json_parts_text(first_turn.get("final_response")),
        )
    return expected_by_id


def _metric_threshold(metric, default: float = 1.0) -> float:
    """Return the metric's threshold, using *default* only when it is absent/None.

    A threshold of 0.0 is a legitimate value and is preserved (the previous
    ``value or 1.0`` idiom silently turned it into 1.0).
    """
    raw = getattr(metric, "threshold", None)
    return default if raw is None else float(raw)


async def evaluate_set(
    dataset_path: Path,
    call_agent: CallAgent,
    metrics_path: Path,
    output_dir: Path,
) -> SetEval:
    """Run AgentEvaluator on one evalset and return a structured :class:`SetEval`.

    Args:
        dataset_path: evalset JSON file.
        call_agent: agent callback forwarded to the evaluator.
        metrics_path: metrics file or directory forwarded to the evaluator.
        output_dir: directory the evaluator writes its own results to.

    Returns:
        A ``SetEval`` whose ``set_id`` is the dataset file stem. Every case found
        in the dataset file is present; cases missing from the evaluator result
        are back-filled as failed with score 0.0.
    """
    executer = AgentEvaluator.get_executer(
        str(dataset_path),
        call_agent=call_agent,
        num_runs=1,
        print_detailed_results=False,
        eval_metrics_file_path_or_dir=str(metrics_path),
        eval_result_output_dir=str(output_dir),
    )
    # Per the original author's note: the evaluator raises AssertionError when a
    # case misses its threshold, after the results are already stored on the
    # executer. Failures are the signal we want, so the assertion is swallowed.
    try:
        await executer.evaluate()
    except AssertionError:
        pass
    result = executer.get_result()

    expected_map = _load_expected(dataset_path)
    cases: dict[str, CaseEval] = {}

    for agg in (result.results_by_eval_set_id if result else {}).values():
        for eval_id, case_runs in agg.eval_results_by_eval_id.items():
            if not case_runs:
                # No run recorded: leave it to the back-fill below.
                continue
            case = case_runs[0]  # num_runs=1
            metric_scores = [
                MetricScore(
                    name=m.metric_name,
                    score=float(m.score) if m.score is not None else 0.0,
                    passed=m.eval_status == EvalStatus.PASSED,
                    threshold=_metric_threshold(m),
                    reason=(m.details.reason if m.details and m.details.reason else ""),
                )
                for m in case.overall_eval_metric_results
            ]

            # Actual agent output and key trajectory, taken from the first invocation.
            actual_text = ""
            trajectory: list[str] = []
            if case.eval_metric_result_per_invocation:
                actual_inv = case.eval_metric_result_per_invocation[0].actual_invocation
                actual_text = _content_text(actual_inv.final_response)
                # One entry per tool call (name + args), then the final reply.
                for call in get_all_tool_calls(actual_inv.intermediate_data):
                    args = getattr(call, "args", None) or {}
                    trajectory.append(f"tool_call:{getattr(call, 'name', '?')}({args})")
                if actual_text:
                    trajectory.append(f"final_response:{actual_text}")

            query, expected = expected_map.get(eval_id, ("", ""))
            cases[eval_id] = CaseEval(
                eval_id=eval_id,
                passed=case.final_eval_status == EvalStatus.PASSED,
                score=metric_scores[0].score if metric_scores else 0.0,
                metrics=metric_scores,
                query=query,
                expected_text=expected,
                actual_text=actual_text,
                error=case.error_message or "",
                trajectory=trajectory,
            )

    # Back-fill: a dataset case absent from the results is recorded as failed.
    for eval_id, (query, expected) in expected_map.items():
        cases.setdefault(
            eval_id,
            CaseEval(eval_id=eval_id, passed=False, score=0.0, query=query, expected_text=expected),
        )

    return SetEval(set_id=Path(dataset_path).stem, cases=cases)
# Statuses that count as a regression of a single case.
_REGRESSION_STATUSES = ("newly_failed", "regressed")


@dataclass
class CaseDelta:
    """Baseline-vs-candidate comparison of one validation case."""

    eval_id: str
    baseline_passed: bool
    candidate_passed: bool
    baseline_score: float
    candidate_score: float
    status: str  # newly_passed | newly_failed | improved | regressed | unchanged

    @property
    def score_delta(self) -> float:
        """Candidate score minus baseline score, rounded to 4 decimals."""
        return round(self.candidate_score - self.baseline_score, 4)


def _delta_status(base_passed: bool, cand_passed: bool, base_score: float, cand_score: float) -> str:
    """Classify one case; pass/fail flips take precedence over score movement."""
    if not base_passed and cand_passed:
        return "newly_passed"
    if base_passed and not cand_passed:
        return "newly_failed"
    if cand_score > base_score:
        return "improved"
    if cand_score < base_score:
        return "regressed"
    return "unchanged"


def compute_delta(baseline: "SetEval", candidate: "SetEval") -> list[CaseDelta]:
    """Compare baseline and candidate results case by case.

    Iterates the baseline's cases in order. A case that is missing from the
    candidate result is treated as failed with score 0.0 rather than skipped,
    so a candidate run that lost a case cannot slip past the gate unnoticed.
    Cases present only in the candidate are ignored.
    """
    deltas: list[CaseDelta] = []
    for eval_id, base_case in baseline.cases.items():
        cand_case = candidate.cases.get(eval_id)
        if cand_case is None:
            cand_passed, cand_score = False, 0.0  # fail closed
        else:
            cand_passed, cand_score = cand_case.passed, cand_case.score
        status = _delta_status(base_case.passed, cand_passed, base_case.score, cand_score)
        deltas.append(
            CaseDelta(eval_id, base_case.passed, cand_passed, base_case.score, cand_score, status)
        )
    return deltas


@dataclass
class RuleResult:
    """Outcome of one gate rule plus a human-readable explanation."""

    name: str
    passed: bool
    detail: str


@dataclass
class GateDecision:
    """Final accept/reject verdict with per-rule results and case lists."""

    accepted: bool
    rules: list[RuleResult] = field(default_factory=list)
    summary: str = ""
    val_score_delta: float = 0.0
    newly_passed: list[str] = field(default_factory=list)
    newly_failed: list[str] = field(default_factory=list)
    regressed: list[str] = field(default_factory=list)


def _config_float(config: dict, key: str, default: float) -> float:
    """Read a float setting; a missing key or an explicit null yields *default*."""
    value = config.get(key)
    return default if value is None else float(value)


def evaluate_gate(
    baseline_val: "SetEval",
    candidate_val: "SetEval",
    deltas: list[CaseDelta],
    candidate_cost: float,
    gate_config: dict,
) -> GateDecision:
    """Decide accept / reject from the gate config; any failed rule rejects.

    Args:
        baseline_val: baseline result on the validation set (``avg_score`` is read).
        candidate_val: candidate result on the validation set.
        deltas: per-case comparison, normally from :func:`compute_delta`.
        candidate_cost: cost of producing the candidate, compared to the budget.
        gate_config: optional keys ``min_val_score_delta`` (default 0.0),
            ``forbid_new_hard_fail`` (default True), ``key_case_ids`` (default
            empty) and ``cost_budget_usd`` (default unlimited).

    Returns:
        A :class:`GateDecision` holding the four rule results in fixed order.
    """
    min_delta = _config_float(gate_config, "min_val_score_delta", 0.0)
    forbid_new_hard_fail = bool(gate_config.get("forbid_new_hard_fail", True))
    key_case_ids = set(gate_config.get("key_case_ids") or [])
    cost_budget = _config_float(gate_config, "cost_budget_usd", float("inf"))

    val_delta = round(candidate_val.avg_score - baseline_val.avg_score, 4)
    newly_failed = [d.eval_id for d in deltas if d.status == "newly_failed"]
    newly_passed = [d.eval_id for d in deltas if d.status == "newly_passed"]
    regressed = [d.eval_id for d in deltas if d.status in _REGRESSION_STATUSES]
    key_regressed = [eval_id for eval_id in regressed if eval_id in key_case_ids]

    rules = [
        # R1: mean validation score must improve by at least the threshold.
        RuleResult(
            "min_val_score_delta",
            val_delta >= min_delta,
            f"验证集平均分 delta={val_delta:+.4f},阈值 ≥ {min_delta:+.4f}",
        ),
        # R2: no case may flip from pass to fail (main anti-overfitting rule).
        RuleResult(
            "forbid_new_hard_fail",
            (not forbid_new_hard_fail) or (len(newly_failed) == 0),
            f"新增失败 case={newly_failed or '无'}",
        ),
        # R3: key cases may neither flip to fail nor lose score.
        RuleResult(
            "key_cases_no_regression",
            len(key_regressed) == 0,
            f"关键 case={sorted(key_case_ids) or '未指定'},其中退化={key_regressed or '无'}",
        ),
        # R4: cost must stay within budget.
        RuleResult(
            "cost_within_budget",
            candidate_cost <= cost_budget,
            f"候选成本=${candidate_cost:.4f},预算 ≤ ${cost_budget:.4f}",
        ),
    ]

    accepted = all(r.passed for r in rules)
    failed_rules = [r.name for r in rules if not r.passed]
    if accepted:
        summary = f"接受候选:验证集提升 {val_delta:+.4f} 且未触发任何拒绝规则。"
    else:
        summary = (
            f"拒绝候选:命中规则 {failed_rules}。"
            + ("疑似过拟合(验证集出现退化/新增失败)。" if newly_failed else "")
        )

    return GateDecision(
        accepted=accepted,
        rules=rules,
        summary=summary,
        val_score_delta=val_delta,
        newly_passed=newly_passed,
        newly_failed=newly_failed,
        regressed=regressed,
    )
b/examples/optimization/eval_optimize_loop/pipeline/optimize.py new file mode 100644 index 0000000..ce47301 --- /dev/null +++ b/examples/optimization/eval_optimize_loop/pipeline/optimize.py @@ -0,0 +1,172 @@ +# Tencent is pleased to support the open source community by making tRPC-Agent-Python available. +# +# Copyright (C) 2026 Tencent. All rights reserved. +# +# tRPC-Agent-Python is licensed under Apache-2.0. +"""优化阶段:产出一个候选 prompt 组(三字段),供后续验证与门控。 + +两种后端 +-------- +- real:调**真实** ``AgentOptimizer.optimize``(GEPA 反思), + ``update_source=False`` 让源文件在优化结束后自动还原到 baseline, + 最优候选从 ``OptimizeResult.best_prompts`` 取回。成本/耗时来自真实运行。 +- fake:脚本化候选。刻意构造一个"训练集提升、验证集退化"的过拟合候选 + (给 skill 增加乘法能力,同时植入 ``assume-mul-default`` 过拟合副作用), + 用来离线、确定性地演示门控拒绝过拟合(验收第 3 条)。 + +两档统一输出 :class:`CandidateResult`(候选文本字典 + 成本 + 耗时 + 元信息)。 +候选的"应用/还原"由 :func:`apply_candidate` / :func:`restore_prompts` 完成—— +验证阶段把候选临时写入源 prompt 文件,评测后还原,real / fake 共用同一机制。 +""" + +from __future__ import annotations + +import time +from dataclasses import dataclass, field +from pathlib import Path +from typing import Awaitable, Callable + +from agent import PROMPT_PATHS + + +CallAgent = Callable[[str], Awaitable[str]] + + +@dataclass +class CandidateResult: + prompts: dict[str, str] # field name -> 候选 prompt 全文 + status: str = "SUCCEEDED" + stop_reason: str = "" + cost_usd: float = 0.0 + duration_seconds: float = 0.0 + rounds: int = 0 + optimized_fields: list[str] = field(default_factory=list) + rounds_detail: list[dict] = field(default_factory=list) # 每轮候选 prompt + 分数 + 接受与否 + meta: dict = field(default_factory=dict) + + +def read_baseline_prompts() -> dict[str, str]: + """读当前源 prompt 文件(baseline)文本,keyed by TargetPrompt 字段名。""" + return {name: Path(path).read_text(encoding="utf-8") for name, path in PROMPT_PATHS.items()} + + +def apply_candidate(candidate: dict[str, str]) -> dict[str, str]: + """把候选文本写入源 prompt 文件,返回原始快照供还原。""" + snapshot = read_baseline_prompts() + for name, text in candidate.items(): + if name 
in PROMPT_PATHS: + Path(PROMPT_PATHS[name]).write_text(text, encoding="utf-8") + return snapshot + + +def restore_prompts(snapshot: dict[str, str]) -> None: + """还原源 prompt 文件到给定快照。""" + for name, text in snapshot.items(): + if name in PROMPT_PATHS: + Path(PROMPT_PATHS[name]).write_text(text, encoding="utf-8") + + +# -------------------------------------------------------------------------- +# fake 后端:脚本化过拟合候选 +# -------------------------------------------------------------------------- +_SKILL_CANDIDATE = """# 解题技能 · Skill Prompt(候选) + +这里描述你会做哪些类型的算术题。只有列出的题型你才能解答;遇到未掌握的题型, +如实回答「抱歉,我暂时无法解答这道题。」,不要瞎猜。 + +## 已掌握题型 +- 加法:把题目中的两个数量相加。 +- 乘法:把题目中的两个数量相乘(新增)。 + +## 快捷启发式(反思器新增,实为过拟合) +- 数字较大的题目倾向于按乘法处理,往往更快得到答案。 + + + + +""" + + +async def optimize_fake(train_path: Path, val_path: Path) -> CandidateResult: + """确定性脚本候选:加乘法能力(真实提升)+ 过拟合副作用(制造验证退化)。""" + start = time.time() + baseline = read_baseline_prompts() + candidate = dict(baseline) + candidate["skill"] = _SKILL_CANDIDATE # 只改 skill 字段;router/system 保持不变 + rounds_detail = [{ + "round": 1, + "optimized_fields": ["skill"], + "candidate_prompts": candidate, + "note": "脚本化过拟合候选:新增乘法能力 + assume-mul-default 副作用", + }] + return CandidateResult( + prompts=candidate, + status="SUCCEEDED", + stop_reason="fake_scripted_candidate", + cost_usd=0.0, + duration_seconds=round(time.time() - start, 4), + rounds=1, + optimized_fields=["skill"], + rounds_detail=rounds_detail, + meta={"backend": "fake", "note": "scripted overfitting candidate for offline demo"}, + ) + + +# -------------------------------------------------------------------------- +# real 后端:真实 AgentOptimizer(GEPA) +# -------------------------------------------------------------------------- +async def optimize_real( + config_path: Path, + call_agent: CallAgent, + train_path: Path, + val_path: Path, + output_dir: Path, +) -> CandidateResult: + """调真实 AgentOptimizer;update_source=False 让源文件优化后自动还原。""" + from trpc_agent_sdk.evaluation import AgentOptimizer, TargetPrompt 
+ + target = TargetPrompt() + for name, path in PROMPT_PATHS.items(): + target.add_path(name, str(path)) + + result = await AgentOptimizer.optimize( + config_path=str(config_path), + call_agent=call_agent, + target_prompt=target, + train_dataset_path=str(train_path), + validation_dataset_path=str(val_path), + output_dir=str(output_dir), + update_source=False, # 候选先不落源,交由 pipeline 门控决定是否接受 + verbose=1, + ) + return CandidateResult( + prompts=dict(result.best_prompts), + status=str(result.status), + stop_reason=str(result.stop_reason or ""), + cost_usd=float(result.total_llm_cost), + duration_seconds=float(result.duration_seconds), + rounds=len(result.rounds), + optimized_fields=sorted( + {f for r in result.rounds for f in r.optimized_field_names} + ), + rounds_detail=[ + { + "round": r.round, + "optimized_fields": list(r.optimized_field_names), + "candidate_prompts": dict(r.candidate_prompts), + "validation_pass_rate": r.validation_pass_rate, + "metric_breakdown": dict(r.metric_breakdown), + "accepted": r.accepted, + "acceptance_reason": r.acceptance_reason, + "cost_usd": r.round_llm_cost, + "duration_seconds": r.duration_seconds, + } + for r in result.rounds + ], + meta={ + "backend": "real", + "algorithm": result.algorithm, + "baseline_pass_rate": result.baseline_pass_rate, + "best_pass_rate": result.best_pass_rate, + }, + ) diff --git a/examples/optimization/eval_optimize_loop/pipeline/report.py b/examples/optimization/eval_optimize_loop/pipeline/report.py new file mode 100644 index 0000000..81c28b8 --- /dev/null +++ b/examples/optimization/eval_optimize_loop/pipeline/report.py @@ -0,0 +1,279 @@ +# Tencent is pleased to support the open source community by making tRPC-Agent-Python available. +# +# Copyright (C) 2026 Tencent. All rights reserved. +# +# tRPC-Agent-Python is licensed under Apache-2.0. 
# Human-readable labels for the per-case delta statuses.
_STATUS_LABELS = {
    "newly_passed": "🟢 新增通过",
    "newly_failed": "🔴 新增失败",
    "improved": "🔼 分数提升",
    "regressed": "🔻 分数下降",
    "unchanged": "⚪ 不变",
}


def _set_to_dict(s: "SetEval") -> dict:
    """Serialize one evaluated set, including per-case metrics and trajectory."""
    return {
        "set_id": s.set_id,
        "pass_count": s.pass_count,
        "total": s.total,
        "avg_score": round(s.avg_score, 4),
        "cases": {
            eid: {
                "passed": c.passed,
                "score": round(c.score, 4),
                "expected": c.expected_text,
                "actual": c.actual_text,
                "error": c.error,
                # Per-metric detail: score / threshold / pass-fail / reason.
                "metrics": [
                    {
                        "name": m.name,
                        "score": round(m.score, 4),
                        "passed": m.passed,
                        "threshold": m.threshold,
                        "reason": m.reason,
                    }
                    for m in c.metrics
                ],
                "trajectory": c.trajectory,
            }
            for eid, c in s.cases.items()
        },
    }


def _attrib_to_dict(attrib: dict) -> dict:
    """Serialize an attribution result (``clusters`` + ``attributions``)."""
    return {
        "clusters": attrib["clusters"],
        "cases": {
            eid: {
                "category": a.category,
                "category_label": a.category_label,
                "reason": a.reason,
                "source": a.source,
            }
            for eid, a in attrib["attributions"].items()
        },
    }


def _delta_to_dict(d: "CaseDelta") -> dict:
    """Serialize one per-case delta."""
    return {
        "eval_id": d.eval_id,
        "baseline_passed": d.baseline_passed,
        "candidate_passed": d.candidate_passed,
        "baseline_score": round(d.baseline_score, 4),
        "candidate_score": round(d.candidate_score, 4),
        "score_delta": d.score_delta,
        "status": d.status,
    }


def build_report(
    *,
    run_meta: dict,
    baseline_train: "SetEval",
    baseline_val: "SetEval",
    train_attrib: dict,
    val_attrib: dict,
    candidate: "CandidateResult",
    candidate_train: "SetEval",
    candidate_val: "SetEval",
    deltas: list,
    train_deltas: list,
    gate: "GateDecision",
) -> dict:
    """Assemble the whole run into a dict that can be passed to ``json.dumps``.

    ``deltas`` (validation set) is written to ``delta.per_case``;
    ``train_deltas`` is written to ``delta.train_per_case`` — previously it was
    accepted but never recorded, so the train-side per-case movement was
    missing from the audit trail.
    """
    train_score_delta = round(candidate_train.avg_score - baseline_train.avg_score, 4)
    return {
        "schema_version": "eol-v1",
        "run": run_meta,
        "baseline": {
            "train": _set_to_dict(baseline_train),
            "val": _set_to_dict(baseline_val),
        },
        "failure_attribution": {
            "train": _attrib_to_dict(train_attrib),
            "val": _attrib_to_dict(val_attrib),
        },
        "candidate": {
            "status": candidate.status,
            "stop_reason": candidate.stop_reason,
            "optimized_fields": candidate.optimized_fields,
            "rounds": candidate.rounds,
            "cost_usd": round(candidate.cost_usd, 6),
            "duration_seconds": round(candidate.duration_seconds, 4),
            "meta": candidate.meta,
            "prompts": candidate.prompts,
            "rounds_detail": candidate.rounds_detail,  # per-round candidate audit
        },
        "candidate_train": _set_to_dict(candidate_train),
        "candidate_val": _set_to_dict(candidate_val),
        "overfitting_signal": {
            "train_score_delta": train_score_delta,
            "val_score_delta": gate.val_score_delta,
            # True when train improved while validation did not (delta <= 0).
            "train_up_val_down": (
                candidate_train.avg_score > baseline_train.avg_score
                and gate.val_score_delta <= 0
            ),
        },
        "delta": {
            "val_score_delta": gate.val_score_delta,
            "train_score_delta": train_score_delta,
            "baseline_val_pass": f"{baseline_val.pass_count}/{baseline_val.total}",
            "candidate_val_pass": f"{candidate_val.pass_count}/{candidate_val.total}",
            "baseline_train_pass": f"{baseline_train.pass_count}/{baseline_train.total}",
            "candidate_train_pass": f"{candidate_train.pass_count}/{candidate_train.total}",
            "per_case": [_delta_to_dict(d) for d in deltas],
            "train_per_case": [_delta_to_dict(d) for d in train_deltas],
            "newly_passed": gate.newly_passed,
            "newly_failed": gate.newly_failed,
            "regressed": gate.regressed,
        },
        "gate_decision": {
            "accepted": gate.accepted,
            "summary": gate.summary,
            "rules": [
                {"name": r.name, "passed": r.passed, "detail": r.detail}
                for r in gate.rules
            ],
        },
    }


def save_json(report: dict, path: Path) -> None:
    """Write *report* to *path* as indented UTF-8 JSON (non-ASCII kept as-is)."""
    Path(path).write_text(json.dumps(report, ensure_ascii=False, indent=2), encoding="utf-8")


def _status_emoji(status: str) -> str:
    """Return the display label for a delta status; unknown values pass through."""
    return _STATUS_LABELS.get(status, status)


def render_markdown(report: dict) -> str:
    """Render the report dict built by :func:`build_report` as Markdown.

    Sections: header with the decision, score overview, baseline failure
    attribution, per-case validation delta, gate rules, per-round audit, and
    candidate/cost audit. Sub-bullets are indented by two spaces so Markdown
    renderers nest them under their parent item.
    """
    r = report
    run = r["run"]
    gate = r["gate_decision"]
    delta = r["delta"]
    decision = "✅ 接受 (ACCEPT)" if gate["accepted"] else "❌ 拒绝 (REJECT)"

    lines: list[str] = [
        "# Evaluation + Optimization 闭环报告",
        "",
        f"- **决策**:{decision}",
        f"- **结论**:{gate['summary']}",
        f"- 运行模式:`{run['mode']}` | seed:`{run['seed']}` | 耗时:{run['elapsed_seconds']}s",
        f"- 时间:{run['started_at']} → {run['finished_at']}",
        "",
    ]

    # 1. Score overview
    bt, bv = r["baseline"]["train"], r["baseline"]["val"]
    ct, cv = r["candidate_train"], r["candidate_val"]
    ovf = r["overfitting_signal"]
    lines += [
        "## 1. 分数总览",
        "",
        "| 数据集 | baseline 通过 | baseline 均分 | candidate 通过 | candidate 均分 | Δ均分 |",
        "|---|---|---|---|---|---|",
        f"| 训练集 | {bt['pass_count']}/{bt['total']} | {bt['avg_score']:.3f} "
        f"| {ct['pass_count']}/{ct['total']} | {ct['avg_score']:.3f} | {ovf['train_score_delta']:+.3f} |",
        f"| 验证集 | {bv['pass_count']}/{bv['total']} | {bv['avg_score']:.3f} "
        f"| {cv['pass_count']}/{cv['total']} | {cv['avg_score']:.3f} | {ovf['val_score_delta']:+.3f} |",
        "",
    ]
    if ovf["train_up_val_down"]:
        lines += [
            f"> ⚠️ **过拟合信号**:训练集提升 {ovf['train_score_delta']:+.3f},"
            f"验证集却未提升({ovf['val_score_delta']:+.3f})——候选在训练分布上过度特化。",
            "",
        ]

    # 2. Baseline failure attribution
    lines += ["## 2. Baseline 失败归因", ""]
    for split in ("train", "val"):
        fa = r["failure_attribution"][split]
        base_cases = r["baseline"][split]["cases"]
        lines.append(f"**{'训练集' if split == 'train' else '验证集'}** 失败聚类:{fa['clusters'] or '无失败'}")
        for eid, a in fa["cases"].items():
            lines.append(f"- `{eid}` → **{a['category_label']}**({a['source']}):{a['reason']}")
            traj = base_cases.get(eid, {}).get("trajectory", [])
            if traj:
                lines.append(f"  - 关键轨迹:{' → '.join(traj)}")
        lines.append("")

    # 3. Per-case validation delta
    lines += [
        "## 3. 候选验证 · 逐 case delta",
        "",
        "| case | baseline | candidate | Δ分 | 状态 |",
        "|---|---|---|---|---|",
    ]
    lines += [
        f"| `{d['eval_id']}` | {'PASS' if d['baseline_passed'] else 'FAIL'} "
        f"| {'PASS' if d['candidate_passed'] else 'FAIL'} "
        f"| {d['score_delta']:+.3f} | {_status_emoji(d['status'])} |"
        for d in delta["per_case"]
    ]
    lines += [
        "",
        f"- 新增通过:{delta['newly_passed'] or '无'}",
        f"- 新增失败:{delta['newly_failed'] or '无'}",
        "",
    ]

    # 4. Gate rules
    lines += ["## 4. 门控决策明细", "", "| 规则 | 结果 | 说明 |", "|---|---|---|"]
    lines += [
        f"| `{rule['name']}` | {'✅' if rule['passed'] else '❌'} | {rule['detail']} |"
        for rule in gate["rules"]
    ]
    lines.append("")

    # 5. Per-round candidate audit
    cand = r["candidate"]
    lines += ["## 5. 每轮候选审计", ""]
    if cand["rounds_detail"]:
        for rd in cand["rounds_detail"]:
            fields = rd.get("optimized_fields", [])
            extra = ""
            if "validation_pass_rate" in rd:
                extra = f" | val_pass={rd['validation_pass_rate']} | accepted={rd.get('accepted')}"
            lines.append(f"- **Round {rd['round']}**:改写字段 {fields}{extra}")
            note = rd.get("note") or rd.get("acceptance_reason")
            if note:
                lines.append(f"  - {note}")
    else:
        lines.append("- (优化器未产出任何轮次;候选=baseline)")
    lines += [
        "",
        "> 每轮候选 prompt 全文见 `optimization_report.json` 的 `candidate.rounds_detail`。",
        "",
    ]

    # 6. Candidate and cost audit
    lines += [
        "## 6. 候选与成本审计",
        "",
        f"- 优化状态:`{cand['status']}` | stop_reason:`{cand['stop_reason']}`",
        f"- 被改写字段:{cand['optimized_fields']} | 轮数:{cand['rounds']}",
        f"- 成本:${cand['cost_usd']:.6f} | 优化耗时:{cand['duration_seconds']}s",
        f"- 后端:`{cand['meta'].get('backend', 'unknown')}`",
        "",
        "> 候选 prompt 全文与逐 case 明细见 `optimization_report.json`。",
        "",
    ]
    return "\n".join(lines)


def save_markdown(report: dict, path: Path) -> None:
    """Render *report* with :func:`render_markdown` and write it to *path* (UTF-8)."""
    Path(path).write_text(render_markdown(report), encoding="utf-8")
"""Entry point for the automated Evaluation + Optimization loop.

One command runs all six stages:
    baseline evaluation -> failure attribution -> optimization
    -> candidate validation (per-case delta) -> acceptance gate
    -> audit artifacts (optimization_report.json + .md)

Usage
-----
    # Offline fake mode (no API key, <= 3 min, deterministic) -- the default
    python run_pipeline.py

    # Real mode (needs TRPC_AGENT_API_KEY / _BASE_URL / _MODEL_NAME)
    python run_pipeline.py --mode real

Artifacts are written to ``runs/<timestamp>/`` (or to ``--output-dir`` when
given), where optimization_report.{json,md} are generated.
"""

from __future__ import annotations

import argparse
import asyncio
import json
import random
import sys
from datetime import datetime
from functools import partial
from pathlib import Path


_HERE = Path(__file__).resolve().parent
_REPO_ROOT = _HERE.parents[2]
# Put the repository root and this example directory on sys.path so the
# imports below resolve when the file is executed directly as a script.
for _p in (str(_REPO_ROOT), str(_HERE)):
    if _p not in sys.path:
        sys.path.insert(0, _p)

from agent import PROMPT_PATHS  # noqa: E402
from pipeline import attribution as attrib_mod  # noqa: E402
from pipeline import gate as gate_mod  # noqa: E402
from pipeline import optimize as opt_mod  # noqa: E402
from pipeline import report as report_mod  # noqa: E402
from pipeline.evaluate import evaluate_set  # noqa: E402


TRAIN_PATH = _HERE / "data" / "train.evalset.json"
VAL_PATH = _HERE / "data" / "val.evalset.json"
METRICS_PATH = _HERE / "eval_metrics.json"
CONFIG_PATH = _HERE / "config.json"
OPTIMIZER_PATH = _HERE / "optimizer.json"
RUNS_DIR = _HERE / "runs"

# Modes accepted by the pipeline; mirrors the ``--mode`` choices in main().
_VALID_MODES = ("fake", "real")


def _build_call_agent(mode: str):
    """Return the ``call_agent`` callback for the given mode.

    Args:
        mode: ``"fake"`` or ``"real"``.

    Returns:
        For ``"fake"``, ``fake_backend.call_agent_fake`` with ``prompt_paths``
        bound to ``PROMPT_PATHS``; for ``"real"``,
        ``agent.orchestrator.call_agent_real``.

    Raises:
        ValueError: If ``mode`` is not one of the supported modes. Without
            this check any unknown value would silently select the real
            backend.
    """
    if mode not in _VALID_MODES:
        raise ValueError(f"unknown mode {mode!r}; expected one of {_VALID_MODES}")
    if mode == "fake":
        # Lazy import: the real mode has no need to load the fake backend.
        from agent import fake_backend
        return partial(fake_backend.call_agent_fake, prompt_paths=PROMPT_PATHS)
    from agent.orchestrator import call_agent_real
    return call_agent_real


async def run(mode: str, output_dir: Path) -> dict:
    """Run the six-stage loop once and write the audit reports.

    Args:
        mode: ``"fake"`` or ``"real"``; selects the agent callback and the
            optimizer entry point.
        output_dir: Directory that receives per-stage outputs and the final
            reports. Created (with parents) if it does not exist.

    Returns:
        The report built by ``report_mod.build_report``; ``main`` reads
        ``report["gate_decision"]["accepted"]`` from it.

    Raises:
        ValueError: If ``mode`` is not supported (raised by
            ``_build_call_agent``).
    """
    config = json.loads(CONFIG_PATH.read_text(encoding="utf-8"))
    gate_config = config.get("gate", {})
    seed = int(config.get("seed", 42))
    random.seed(seed)

    call_agent = _build_call_agent(mode)
    started_at = datetime.now()
    output_dir.mkdir(parents=True, exist_ok=True)

    # ---- Stage 1: baseline evaluation (train + val, source prompts = baseline) ----
    print(f"[1/6] Baseline 评测 (mode={mode}) ...")
    baseline_train = await evaluate_set(
        TRAIN_PATH, call_agent, METRICS_PATH, output_dir / "baseline_train"
    )
    baseline_val = await evaluate_set(
        VAL_PATH, call_agent, METRICS_PATH, output_dir / "baseline_val"
    )
    print(f" train {baseline_train.pass_count}/{baseline_train.total}"
          f" | val {baseline_val.pass_count}/{baseline_val.total}")

    # ---- Stage 2: failure attribution ----
    print("[2/6] 失败归因 ...")
    train_attrib = await attrib_mod.attribute_failures(baseline_train, mode)
    val_attrib = await attrib_mod.attribute_failures(baseline_val, mode)
    print(f" train clusters={train_attrib['clusters']} | val clusters={val_attrib['clusters']}")

    # ---- Stage 3: optimization (source prompts are expected to be back at
    # baseline once the optimizer returns) ----
    print("[3/6] 优化执行 ...")
    if mode == "fake":
        candidate = await opt_mod.optimize_fake(TRAIN_PATH, VAL_PATH)
    else:
        candidate = await opt_mod.optimize_real(
            OPTIMIZER_PATH, call_agent, TRAIN_PATH, VAL_PATH, output_dir / "optimize"
        )
    print(f" status={candidate.status} fields={candidate.optimized_fields}"
          f" cost=${candidate.cost_usd:.4f}")

    # ---- Stage 4: candidate validation (temporarily write the candidate into
    # the source prompts, evaluate, then restore) ----
    print("[4/6] 候选验证 ...")
    snapshot = opt_mod.apply_candidate(candidate.prompts)
    try:
        # Re-run the candidate on both sets: the training run exposes
        # "train improves but validation regresses" overfitting, the
        # validation run feeds the gate.
        candidate_train = await evaluate_set(
            TRAIN_PATH, call_agent, METRICS_PATH, output_dir / "candidate_train"
        )
        candidate_val = await evaluate_set(
            VAL_PATH, call_agent, METRICS_PATH, output_dir / "candidate_val"
        )
    finally:
        # Always restore, even if an evaluation raises.
        opt_mod.restore_prompts(snapshot)
    deltas = gate_mod.compute_delta(baseline_val, candidate_val)
    train_deltas = gate_mod.compute_delta(baseline_train, candidate_train)
    print(f" candidate train {candidate_train.pass_count}/{candidate_train.total}"
          f" | val {candidate_val.pass_count}/{candidate_val.total}")

    # ---- Stage 5: acceptance gate ----
    print("[5/6] 接受门控 ...")
    decision = gate_mod.evaluate_gate(
        baseline_val, candidate_val, deltas, candidate.cost_usd, gate_config
    )
    print(f" decision={'ACCEPT' if decision.accepted else 'REJECT'} :: {decision.summary}")

    # ---- Stage 6: audit artifacts ----
    print("[6/6] 审计落盘 ...")
    finished_at = datetime.now()
    run_meta = {
        "mode": mode,
        "seed": seed,
        "started_at": started_at.isoformat(timespec="seconds"),
        "finished_at": finished_at.isoformat(timespec="seconds"),
        "elapsed_seconds": round((finished_at - started_at).total_seconds(), 2),
        # as_posix() keeps the recorded paths identical across platforms
        # (forward slashes), so reports stay comparable.
        "train_dataset": TRAIN_PATH.relative_to(_HERE).as_posix(),
        "val_dataset": VAL_PATH.relative_to(_HERE).as_posix(),
        "target_fields": list(PROMPT_PATHS.keys()),
        "gate_config": gate_config,
    }
    report = report_mod.build_report(
        run_meta=run_meta,
        baseline_train=baseline_train,
        baseline_val=baseline_val,
        train_attrib=train_attrib,
        val_attrib=val_attrib,
        candidate=candidate,
        candidate_train=candidate_train,
        candidate_val=candidate_val,
        deltas=deltas,
        train_deltas=train_deltas,
        gate=decision,
    )
    report_mod.save_json(report, output_dir / "optimization_report.json")
    report_mod.save_markdown(report, output_dir / "optimization_report.md")
    print(f" 报告已写入 {output_dir}/optimization_report.(json|md)")
    return report


def main() -> None:
    """Parse CLI arguments, run the pipeline and exit with the gate result.

    Exit code 0 means the candidate was accepted, 2 means it was rejected.
    A rejection is a valid negative decision rather than an error, which lets
    CI tell it apart from a crash.
    """
    parser = argparse.ArgumentParser(description="Evaluation + Optimization 自动闭环")
    parser.add_argument("--mode", choices=["fake", "real"], default="fake",
                        help="fake=离线确定性(默认);real=真实 LLM + AgentOptimizer")
    parser.add_argument("--output-dir", default="",
                        help="产物目录;默认 runs/<timestamp>")
    args = parser.parse_args()

    if args.output_dir:
        output_dir = Path(args.output_dir)
    else:
        # Default: a fresh directory named after the start time.
        output_dir = RUNS_DIR / datetime.now().strftime("%Y-%m-%dT%H-%M-%S")
    report = asyncio.run(run(args.mode, output_dir))
    accepted = report["gate_decision"]["accepted"]
    sys.exit(0 if accepted else 2)


if __name__ == "__main__":
    main()
b/examples/optimization/eval_optimize_loop/samples/real_sample.optimization_report.json new file mode 100644 index 0000000..92b6a2a --- /dev/null +++ b/examples/optimization/eval_optimize_loop/samples/real_sample.optimization_report.json @@ -0,0 +1,397 @@ +{ + "schema_version": "eol-v1", + "run": { + "mode": "real", + "seed": 42, + "started_at": "2026-07-01T20:17:13", + "finished_at": "2026-07-01T20:17:21", + "elapsed_seconds": 8.4, + "train_dataset": "data/train.evalset.json", + "val_dataset": "data/val.evalset.json", + "target_fields": [ + "router", + "system_prompt", + "skill" + ], + "gate_config": { + "min_val_score_delta": 0.05, + "forbid_new_hard_fail": true, + "key_case_ids": [ + "val_add_class" + ], + "cost_budget_usd": 1.0 + } + }, + "baseline": { + "train": { + "set_id": "train.evalset", + "pass_count": 2, + "total": 3, + "avg_score": 0.6667, + "cases": { + "train_add_apple": { + "passed": true, + "score": 1.0, + "expected": "答案:11 个", + "actual": "答案:11 个", + "error": "", + "metrics": [ + { + "name": "final_response_avg_score", + "score": 1.0, + "passed": true, + "threshold": 1.0, + "reason": "" + } + ], + "trajectory": [ + "final_response:答案:11 个" + ] + }, + "train_discount_shirt": { + "passed": false, + "score": 0.0, + "expected": "答案:160 元", + "actual": "抱歉,我暂时无法解答这道题。", + "error": "", + "metrics": [ + { + "name": "final_response_avg_score", + "score": 0.0, + "passed": false, + "threshold": 1.0, + "reason": "" + } + ], + "trajectory": [ + "final_response:抱歉,我暂时无法解答这道题。" + ] + }, + "train_mul_car": { + "passed": true, + "score": 1.0, + "expected": "答案:150 公里", + "actual": "答案:150 公里", + "error": "", + "metrics": [ + { + "name": "final_response_avg_score", + "score": 1.0, + "passed": true, + "threshold": 1.0, + "reason": "" + } + ], + "trajectory": [ + "final_response:答案:150 公里" + ] + } + } + }, + "val": { + "set_id": "val.evalset", + "pass_count": 3, + "total": 3, + "avg_score": 1.0, + "cases": { + "val_add_class": { + "passed": true, + "score": 1.0, + 
"expected": "答案:35 人", + "actual": "答案:35 人", + "error": "", + "metrics": [ + { + "name": "final_response_avg_score", + "score": 1.0, + "passed": true, + "threshold": 1.0, + "reason": "" + } + ], + "trajectory": [ + "final_response:答案:35 人" + ] + }, + "val_mul_box": { + "passed": true, + "score": 1.0, + "expected": "答案:60 个", + "actual": "答案:60 个", + "error": "", + "metrics": [ + { + "name": "final_response_avg_score", + "score": 1.0, + "passed": true, + "threshold": 1.0, + "reason": "" + } + ], + "trajectory": [ + "final_response:答案:60 个" + ] + }, + "val_add_orange": { + "passed": true, + "score": 1.0, + "expected": "答案:8 个", + "actual": "答案:8 个", + "error": "", + "metrics": [ + { + "name": "final_response_avg_score", + "score": 1.0, + "passed": true, + "threshold": 1.0, + "reason": "" + } + ], + "trajectory": [ + "final_response:答案:8 个" + ] + } + } + } + }, + "failure_attribution": { + "train": { + "clusters": { + "最终回复不匹配": 1 + }, + "cases": { + "train_discount_shirt": { + "category": "final_response_mismatch", + "category_label": "最终回复不匹配", + "reason": "agent 未给出正确计算结果,直接拒绝回答,与期望答案不符。", + "source": "llm" + } + } + }, + "val": { + "clusters": {}, + "cases": {} + } + }, + "candidate": { + "status": "SUCCEEDED", + "stop_reason": "required_metrics_passing", + "optimized_fields": [], + "rounds": 0, + "cost_usd": 0.0, + "duration_seconds": 2.0019, + "meta": { + "backend": "real", + "algorithm": "gepa_reflective", + "baseline_pass_rate": 1.0, + "best_pass_rate": 1.0 + }, + "prompts": { + "router": "# 路由器 Prompt\n\n你是算术应用题助手的路由器。判断用户问题属于哪类运算,并把问题转交给解题 agent。\n\n- 认真读题,识别是加法、乘法还是折扣问题。\n- 把控制权交给下游解题 agent。\n\n\n\n", + "system_prompt": "# 解题 Agent · System Prompt\n\n你是一名严谨的小学算术老师,负责把解题结果整理成最终答复。\n\n## 输出格式要求\n- 最终答复必须以「答案:」开头,后面紧跟计算得到的数字。\n- 数字后必须带上正确的单位(个 / 公里 / 元 / 人 等)。\n- 示例:`答案:11 个`\n\n\n\n\n", + "skill": "# 解题技能 · Skill Prompt\n\n这里描述你会做哪些类型的算术题。只有列出的题型你才能解答;遇到未掌握的题型,\n如实回答「抱歉,我暂时无法解答这道题。」,不要瞎猜。\n\n## 已掌握题型\n- 加法:把题目中的两个数量相加。\n\n\n\n" + }, + "rounds_detail": [] + 
}, + "candidate_train": { + "set_id": "train.evalset", + "pass_count": 2, + "total": 3, + "avg_score": 0.6667, + "cases": { + "train_mul_car": { + "passed": true, + "score": 1.0, + "expected": "答案:150 公里", + "actual": "答案:150 公里", + "error": "", + "metrics": [ + { + "name": "final_response_avg_score", + "score": 1.0, + "passed": true, + "threshold": 1.0, + "reason": "" + } + ], + "trajectory": [ + "final_response:答案:150 公里" + ] + }, + "train_add_apple": { + "passed": true, + "score": 1.0, + "expected": "答案:11 个", + "actual": "答案:11 个", + "error": "", + "metrics": [ + { + "name": "final_response_avg_score", + "score": 1.0, + "passed": true, + "threshold": 1.0, + "reason": "" + } + ], + "trajectory": [ + "final_response:答案:11 个" + ] + }, + "train_discount_shirt": { + "passed": false, + "score": 0.0, + "expected": "答案:160 元", + "actual": "我暂时无法解答这道题。因为题目涉及“打折扣”的计算,而我的技能只包括加法,无法处理乘法或百分比相关的运算。请提供加法类题目,我会很乐意为您解答!", + "error": "", + "metrics": [ + { + "name": "final_response_avg_score", + "score": 0.0, + "passed": false, + "threshold": 1.0, + "reason": "" + } + ], + "trajectory": [ + "final_response:我暂时无法解答这道题。因为题目涉及“打折扣”的计算,而我的技能只包括加法,无法处理乘法或百分比相关的运算。请提供加法类题目,我会很乐意为您解答!" 
+ ] + } + } + }, + "candidate_val": { + "set_id": "val.evalset", + "pass_count": 3, + "total": 3, + "avg_score": 1.0, + "cases": { + "val_add_class": { + "passed": true, + "score": 1.0, + "expected": "答案:35 人", + "actual": "答案:35 人", + "error": "", + "metrics": [ + { + "name": "final_response_avg_score", + "score": 1.0, + "passed": true, + "threshold": 1.0, + "reason": "" + } + ], + "trajectory": [ + "final_response:答案:35 人" + ] + }, + "val_add_orange": { + "passed": true, + "score": 1.0, + "expected": "答案:8 个", + "actual": "答案:8 个", + "error": "", + "metrics": [ + { + "name": "final_response_avg_score", + "score": 1.0, + "passed": true, + "threshold": 1.0, + "reason": "" + } + ], + "trajectory": [ + "final_response:答案:8 个" + ] + }, + "val_mul_box": { + "passed": true, + "score": 1.0, + "expected": "答案:60 个", + "actual": "答案:60 个", + "error": "", + "metrics": [ + { + "name": "final_response_avg_score", + "score": 1.0, + "passed": true, + "threshold": 1.0, + "reason": "" + } + ], + "trajectory": [ + "final_response:答案:60 个" + ] + } + } + }, + "overfitting_signal": { + "train_score_delta": 0.0, + "val_score_delta": 0.0, + "train_up_val_down": false + }, + "delta": { + "val_score_delta": 0.0, + "train_score_delta": 0.0, + "baseline_val_pass": "3/3", + "candidate_val_pass": "3/3", + "baseline_train_pass": "2/3", + "candidate_train_pass": "2/3", + "per_case": [ + { + "eval_id": "val_add_class", + "baseline_passed": true, + "candidate_passed": true, + "baseline_score": 1.0, + "candidate_score": 1.0, + "score_delta": 0.0, + "status": "unchanged" + }, + { + "eval_id": "val_mul_box", + "baseline_passed": true, + "candidate_passed": true, + "baseline_score": 1.0, + "candidate_score": 1.0, + "score_delta": 0.0, + "status": "unchanged" + }, + { + "eval_id": "val_add_orange", + "baseline_passed": true, + "candidate_passed": true, + "baseline_score": 1.0, + "candidate_score": 1.0, + "score_delta": 0.0, + "status": "unchanged" + } + ], + "newly_passed": [], + "newly_failed": [], 
+ "regressed": [] + }, + "gate_decision": { + "accepted": false, + "summary": "拒绝候选:命中规则 ['min_val_score_delta']。", + "rules": [ + { + "name": "min_val_score_delta", + "passed": false, + "detail": "验证集平均分 delta=+0.0000,阈值 ≥ +0.0500" + }, + { + "name": "forbid_new_hard_fail", + "passed": true, + "detail": "新增失败 case=无" + }, + { + "name": "key_cases_no_regression", + "passed": true, + "detail": "关键 case=['val_add_class'],其中退化=无" + }, + { + "name": "cost_within_budget", + "passed": true, + "detail": "候选成本=$0.0000,预算 ≤ $1.0000" + } + ] + } +} \ No newline at end of file diff --git a/examples/optimization/eval_optimize_loop/samples/real_sample.optimization_report.md b/examples/optimization/eval_optimize_loop/samples/real_sample.optimization_report.md new file mode 100644 index 0000000..076a002 --- /dev/null +++ b/examples/optimization/eval_optimize_loop/samples/real_sample.optimization_report.md @@ -0,0 +1,56 @@ +# Evaluation + Optimization 闭环报告 + +- **决策**:❌ 拒绝 (REJECT) +- **结论**:拒绝候选:命中规则 ['min_val_score_delta']。 +- 运行模式:`real` | seed:`42` | 耗时:8.4s +- 时间:2026-07-01T20:17:13 → 2026-07-01T20:17:21 + +## 1. 分数总览 + +| 数据集 | baseline 通过 | baseline 均分 | candidate 通过 | candidate 均分 | Δ均分 | +|---|---|---|---|---|---| +| 训练集 | 2/3 | 0.667 | 2/3 | 0.667 | +0.000 | +| 验证集 | 3/3 | 1.000 | 3/3 | 1.000 | +0.000 | + +## 2. Baseline 失败归因 + +**训练集** 失败聚类:{'最终回复不匹配': 1} +- `train_discount_shirt` → **最终回复不匹配**(llm):agent 未给出正确计算结果,直接拒绝回答,与期望答案不符。 + - 关键轨迹:final_response:抱歉,我暂时无法解答这道题。 + +**验证集** 失败聚类:无失败 + +## 3. 候选验证 · 逐 case delta + +| case | baseline | candidate | Δ分 | 状态 | +|---|---|---|---|---| +| `val_add_class` | PASS | PASS | +0.000 | ⚪ 不变 | +| `val_mul_box` | PASS | PASS | +0.000 | ⚪ 不变 | +| `val_add_orange` | PASS | PASS | +0.000 | ⚪ 不变 | + +- 新增通过:无 +- 新增失败:无 + +## 4. 
门控决策明细 + +| 规则 | 结果 | 说明 | +|---|---|---| +| `min_val_score_delta` | ❌ | 验证集平均分 delta=+0.0000,阈值 ≥ +0.0500 | +| `forbid_new_hard_fail` | ✅ | 新增失败 case=无 | +| `key_cases_no_regression` | ✅ | 关键 case=['val_add_class'],其中退化=无 | +| `cost_within_budget` | ✅ | 候选成本=$0.0000,预算 ≤ $1.0000 | + +## 5. 每轮候选审计 + +- (优化器未产出任何轮次;候选=baseline) + +> 每轮候选 prompt 全文见 `optimization_report.json` 的 `candidate.rounds_detail`。 + +## 6. 候选与成本审计 + +- 优化状态:`SUCCEEDED` | stop_reason:`required_metrics_passing` +- 被改写字段:[] | 轮数:0 +- 成本:$0.000000 | 优化耗时:2.0019s +- 后端:`real` + +> 候选 prompt 全文与逐 case 明细见 `optimization_report.json`。