评测集与线上回归
→ 返回工程实践
业务侧维护 Golden Set(黄金集) + 离线门禁,与 Harness 里的公开基准(MMLU、HumanEval 等)互补:前者对齐产品与 RAG 管线,后者对齐通用模型能力。
黄金集:JSON Lines 与 pandas
import json
from pathlib import Path
import pandas as pd
def load_golden(path: Path) -> pd.DataFrame:
    """Load a JSON Lines golden set into a DataFrame, one row per record.

    Blank or whitespace-only lines are skipped.

    Args:
        path: Location of the UTF-8 encoded ``.jsonl`` file.

    Returns:
        A DataFrame built from the decoded JSON objects, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Iterate the file handle rather than ``str.splitlines()``: splitlines also
    # breaks on U+2028 / U+2029 / U+0085, which may appear unescaped inside a
    # JSON string (e.g. when the file was written with ``ensure_ascii=False``)
    # and would cut a valid record in half. Text-mode iteration only splits on
    # real newlines.
    with path.open(encoding="utf-8") as handle:
        rows = [json.loads(line) for line in handle if line.strip()]
    return pd.DataFrame(rows)
# Sample golden.jsonl content: one JSON record per line.
example = """\
{"qid":"1","question":"Redis 持久化方式?","ground_truth":"RDB 与 AOF","expected_ctx":["redis-persist-1"],"tags":["fact"]}
{"qid":"2","question":"本公司无文档的问题 X?","ground_truth":null,"expected_ctx":[],"tags":["should_abstain"]}
"""

# Write the sample to the working directory, then read it back as a DataFrame.
golden_path = Path("golden.jsonl")
golden_path.write_text(example, encoding="utf-8")
df = load_golden(golden_path)
print(df[["qid", "tags"]])
RAGAS:最小可跑示例(需安装 ragas、datasets)
与 Harness 文中的指标一致;下面演示 faithfulness、answer_relevancy 等(实际跑需配置 LLM / embedding,请设置环境变量)。
import os
from datasets import Dataset
# If OPENAI_API_KEY is not set, swap `evaluate` for an offline stub or use a
# local model -- this snippet is only meant to show the data structure.
# NOTE: setdefault leaves an already-exported key untouched; "sk-xxx" is a
# placeholder value, not a working credential.
os.environ.setdefault("OPENAI_API_KEY", "sk-xxx")
from ragas import evaluate
from ragas.metrics import answer_relevancy, context_precision, faithfulness
# Metrics to score in this demo run.
eval_metrics = [faithfulness, answer_relevancy, context_precision]

# One-sample dataset in column-oriented form: every key holds a list with one
# entry per sample (here, a single sample).
data = {
    "question": ["Redis 是什么?"],
    "answer": ["Redis 是内存键值数据库,常用于缓存。"],
    "contexts": [["Redis 是一个开源的内存数据结构存储,可用作数据库、缓存。"]],
    "ground_truth": ["Redis 是基于内存的键值存储,可用作缓存与消息代理。"],
}
ds = Dataset.from_dict(data)
result = evaluate(ds, metrics=eval_metrics)
print(result)
CI 中可对 result 里各指标设阈值,低于 baseline 即以非零状态码退出(non-zero exit)。
规则门禁:引用角标与拒答
import re
def must_have_citations(answer: str) -> bool:
    """Report whether *answer* contains at least one ``[n]`` citation marker.

    A marker is an opening bracket, one or more digits, and a closing bracket,
    found anywhere in the text.
    """
    marker = re.search(r"\[\d+\]", answer)
    return marker is not None
def abstain_ok(answer: str, should_abstain: bool) -> bool:
    """Check that *answer* reads like a refusal when one is expected.

    Args:
        answer: Generated answer text.
        should_abstain: Whether this question is expected to be declined.

    Returns:
        ``True`` when no abstention is expected, or when *answer* contains one
        of the known refusal phrases; ``False`` otherwise.
    """
    refusal_phrases = ("不知道", "未覆盖", "无法根据", "没有相关信息")
    if should_abstain:
        for phrase in refusal_phrases:
            if phrase in answer:
                return True
        return False
    # Nothing to verify for questions that should be answered.
    return True
def lint_rag_output(answer: str, *, should_abstain: bool = False) -> list[str]:
    """Run the rule-based gate on one RAG answer and collect violations.

    Exactly one rule is applied per call: abstain-expected answers are checked
    for a refusal phrase, all other answers for ``[n]`` citation markers.

    Args:
        answer: Generated answer text.
        should_abstain: Whether the question is expected to be declined.

    Returns:
        A list with one error message when the applicable rule fails, or an
        empty list when it passes.
    """
    if should_abstain:
        passed = abstain_ok(answer, True)
        message = "expected abstain-style answer"
    else:
        passed = must_have_citations(answer)
        message = "missing [n] citations"
    return [] if passed else [message]
# The answer carries [1][2] citation markers, so no errors are reported: prints [].
print(lint_rag_output("见文档 [1][2].", should_abstain=False))
print(lint_rag_output("随便编的答案", should_abstain=False))
版本 diff:新 pipeline 相对 baseline 的通过率
def regression_gate(baseline: dict[str, bool], current: dict[str, bool], min_delta: float = -0.05) -> bool:
    """Decide whether the current run's pass rate is acceptable versus baseline.

    Only qids present in both mappings are compared.

    Args:
        baseline: qid -> whether the baseline run passed the rule and metric
            thresholds for that question.
        current: Same mapping for the run under test.
        min_delta: Smallest allowed change in pass rate (current - baseline);
            a negative value tolerates that much regression.

    Returns:
        ``True`` when the current pass rate is at least the baseline rate plus
        ``min_delta``; ``False`` otherwise, or when the runs share no qid.
    """
    shared = baseline.keys() & current.keys()
    total = len(shared)
    if total == 0:
        return False

    def pass_rate(outcomes: dict[str, bool]) -> float:
        return sum(outcomes[qid] for qid in shared) / total

    floor = pass_rate(baseline) + min_delta
    # The 1e-9 slack absorbs floating-point rounding when the rates tie.
    return pass_rate(current) + 1e-9 >= floor
# Baseline run: two of the three golden questions pass.
base = {"1": True, "2": True, "3": False}
# Candidate run in which every question passes.
curr_good = {"1": True, "2": True, "3": True}
# Candidate run in which question "2" no longer passes.
curr_bad = {"1": True, "2": False, "3": False}

improved = regression_gate(base, curr_good, min_delta=0.0)
print("improved", improved)
print("regressed", regression_gate(base, curr_bad, min_delta=0.0))
Shadow:双写日志结构(不落库给用户)
import json
import time
import uuid
def shadow_log(*, user_id: str, question: str, prod_answer: str, cand_answer: str, meta: dict) -> str:
    """Serialize one shadow-traffic comparison record as a JSON string.

    Args:
        user_id: Identifier of the requesting user.
        question: The question text.
        prod_answer: Answer from the production pipeline, stored under ``"prod"``.
        cand_answer: Answer from the candidate pipeline, stored under ``"candidate"``.
        meta: Extra JSON-serializable fields, stored under ``"meta"``.

    Returns:
        A JSON object string with a fresh random ``rid`` and the current epoch
        time in ``ts``; non-ASCII characters are written unescaped.
    """
    record_id = str(uuid.uuid4())
    logged_at = time.time()
    payload = dict(
        rid=record_id,
        ts=logged_at,
        user_id=user_id,
        question=question,
        prod=prod_answer,
        candidate=cand_answer,
        meta=meta,
    )
    return json.dumps(payload, ensure_ascii=False)
# Demo record: production answer "A" versus candidate answer "B".
demo_fields = {
    "user_id": "u1",
    "question": "demo?",
    "prod_answer": "A",
    "cand_answer": "B",
    "meta": {"variant": "new_reranker"},
}
line = shadow_log(**demo_fields)
print(line)