评测集与线上回归

返回工程实践

业务侧维护 Golden Set(黄金集) + 离线门禁,与 Harness 里的公开基准(MMLU、HumanEval 等)互补:前者对齐产品与 RAG 管线,后者对齐通用模型能力。


黄金集:JSON Lines 与 pandas

import json
from pathlib import Path
 
import pandas as pd
 
def load_golden(path: Path) -> pd.DataFrame:
    """Load a JSON Lines golden set into a DataFrame.

    Every non-blank line of the file is parsed as one JSON document and
    becomes one row of the result.

    Args:
        path: UTF-8 encoded JSON Lines file, one record per line.

    Returns:
        A DataFrame built from the parsed records, one row per record.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        OSError: If the file cannot be read.
    """
    text = path.read_text(encoding="utf-8")
    # Split on "\n" only. str.splitlines() additionally breaks on U+2028,
    # U+2029 and U+0085, which may appear unescaped inside a JSON string
    # (for example when the file was written with ensure_ascii=False) and
    # would cut a valid record in half. read_text() already normalises
    # "\r\n" to "\n" through universal-newline handling.
    rows = [json.loads(line) for line in text.split("\n") if line.strip()]
    return pd.DataFrame(rows)
 
 
# Example golden.jsonl content (one JSON record per line).
# Record "1" is tagged "fact"; record "2" has a null ground_truth, an empty
# expected_ctx list and the "should_abstain" tag.
example = """\
{"qid":"1","question":"Redis 持久化方式?","ground_truth":"RDB 与 AOF","expected_ctx":["redis-persist-1"],"tags":["fact"]}
{"qid":"2","question":"本公司无文档的问题 X?","ground_truth":null,"expected_ctx":[],"tags":["should_abstain"]}
"""
# Write the sample to golden.jsonl in the current working directory.
Path("golden.jsonl").write_text(example, encoding="utf-8")

# Load the file back and print only the qid and tags columns.
df = load_golden(Path("golden.jsonl"))
print(df[["qid", "tags"]])

RAGAS:最小可跑示例(需安装 ragas、datasets)

与 Harness 文中的指标一致;下面演示 faithfulness、answer_relevancy 等(实际跑需配置 LLM / embedding,请设置环境变量)。

import os
 
from datasets import Dataset
 
# If OPENAI_API_KEY is not set, replace evaluate with an offline stub or use a
# local model instead; this snippet only demonstrates the data structure.
# setdefault keeps a key that is already present in the environment, so the
# "sk-xxx" placeholder is only used when none is set.
os.environ.setdefault("OPENAI_API_KEY", "sk-xxx")
 
from ragas import evaluate
from ragas.metrics import answer_relevancy, context_precision, faithfulness
 
 
# Single-sample evaluation table: every column is a list with one entry per
# sample, and "contexts" holds a list of passages for each sample.
data = {
    "question": ["Redis 是什么?"],
    "answer": ["Redis 是内存键值数据库,常用于缓存。"],
    "contexts": [["Redis 是一个开源的内存数据结构存储,可用作数据库、缓存。"]],
    "ground_truth": ["Redis 是基于内存的键值存储,可用作缓存与消息代理。"],
}

ds = Dataset.from_dict(data)
# Score the dataset with the three imported metrics.
# NOTE(review): per the surrounding text a real run needs an LLM / embedding
# backend configured; the expected column names may differ between ragas
# versions — confirm against the installed release.
result = evaluate(
    ds,
    metrics=[faithfulness, answer_relevancy, context_precision],
)
print(result)

CI 中可对 result 里的各项指标设置阈值:任一指标低于 baseline 即以非零退出码(non-zero exit)结束,使流水线失败。


规则门禁:引用角标与拒答

import re
 
 
def must_have_citations(answer: str) -> bool:
    """Return True when *answer* contains at least one ``[n]`` citation marker.

    A marker is a pair of square brackets enclosing one or more digits,
    such as ``[1]`` or ``[12]``; it may appear anywhere in the text.
    """
    citation = re.compile(r"\[\d+\]")
    return citation.search(answer) is not None
 
 
def abstain_ok(answer: str, should_abstain: bool) -> bool:
    """Check that *answer* declines to answer when it is expected to.

    Args:
        answer: The generated answer text.
        should_abstain: Whether this item expects an abstention.

    Returns:
        True when no abstention is expected, or when *answer* contains at
        least one of the known abstention phrases; False otherwise.
    """
    if should_abstain:
        # A substring hit on any known refusal phrase counts as abstaining.
        for phrase in ("不知道", "未覆盖", "无法根据", "没有相关信息"):
            if phrase in answer:
                return True
        return False
    return True
 
 
def lint_rag_output(answer: str, *, should_abstain: bool = False) -> list[str]:
    """Run the rule-based gate on one RAG answer and collect violations.

    Exactly one rule is applied per call: the abstention check when
    *should_abstain* is true, otherwise the citation-marker check.

    Args:
        answer: The generated answer text.
        should_abstain: Whether this item expects an abstention.

    Returns:
        A new list of error messages; empty when the answer passes.
    """
    if should_abstain:
        passed = abstain_ok(answer, True)
        message = "expected abstain-style answer"
    else:
        passed = must_have_citations(answer)
        message = "missing [n] citations"
    return [] if passed else [message]
 
 
# The answer contains [n] markers and no abstention is expected: prints []
print(lint_rag_output("见文档 [1][2].", should_abstain=False))
# No [n] marker in the answer: prints ['missing [n] citations']
print(lint_rag_output("随便编的答案", should_abstain=False))

版本 diff:新 pipeline 相对 baseline 的通过率

def regression_gate(baseline: dict[str, bool], current: dict[str, bool], min_delta: float = -0.05) -> bool:
    """Decide whether *current* keeps up with *baseline* on their shared qids.

    Both mappings go from qid to whether that item passed the rule and
    metric thresholds. Only qids present in both mappings are compared.

    Args:
        baseline: Pass/fail per qid for the reference pipeline.
        current: Pass/fail per qid for the pipeline under test.
        min_delta: Smallest allowed change in pass rate (current minus
            baseline); a negative value tolerates a drop of that size.

    Returns:
        True when the current pass rate is at least the baseline pass rate
        plus *min_delta*; False otherwise, or when no qid is shared.
    """
    shared = baseline.keys() & current.keys()
    total = len(shared)
    if total == 0:
        return False
    base_rate = sum(baseline[qid] for qid in shared) / total
    curr_rate = sum(current[qid] for qid in shared) / total
    # The small epsilon absorbs floating-point rounding at the boundary.
    return curr_rate + 1e-9 >= base_rate + min_delta
 
 
# Baseline: 2 of 3 items pass.
base = {"1": True, "2": True, "3": False}
# Candidate where all 3 items pass.
curr_good = {"1": True, "2": True, "3": True}
# Candidate where only 1 of 3 items passes.
curr_bad = {"1": True, "2": False, "3": False}
# With min_delta=0.0 any drop in pass rate fails the gate.
# Prints "improved True" and then "regressed False".
print("improved", regression_gate(base, curr_good, min_delta=0.0))
print("regressed", regression_gate(base, curr_bad, min_delta=0.0))

Shadow:双写日志结构(不落库给用户)

import json
import time
import uuid
 
 
def shadow_log(*, user_id: str, question: str, prod_answer: str, cand_answer: str, meta: dict) -> str:
    """Serialise one shadow-comparison record as a single JSON string.

    Args:
        user_id: Identifier stored under ``"user_id"``.
        question: The question text.
        prod_answer: Answer stored under ``"prod"``.
        cand_answer: Answer stored under ``"candidate"``.
        meta: Extra data stored as-is under ``"meta"``; it must be JSON
            serialisable.

    Returns:
        A JSON object string that also carries a freshly generated UUID4
        under ``"rid"`` and the current ``time.time()`` value under
        ``"ts"``. Non-ASCII characters are written unescaped.
    """
    record_id = str(uuid.uuid4())
    timestamp = time.time()
    payload = dict(
        rid=record_id,
        ts=timestamp,
        user_id=user_id,
        question=question,
        prod=prod_answer,
        candidate=cand_answer,
        meta=meta,
    )
    return json.dumps(payload, ensure_ascii=False)
 
 
# Build one sample shadow record and print the resulting JSON string.
# The "rid" and "ts" fields differ on every run.
line = shadow_log(
    user_id="u1",
    question="demo?",
    prod_answer="A",
    cand_answer="B",
    meta={"variant": "new_reranker"},
)
print(line)

相关文档