文档分块与索引
→ 返回工程实践
Chunking:把长文档切成带元数据的片段,再送入 Embedding 模型与向量库。RAG 答案质量的上限,往往由「检索到的块是否恰好包住答案句」决定。
固定窗口 + overlap(纯 Python)
from __future__ import annotations
def chunk_fixed(text: str, size: int = 500, overlap: int = 80) -> list[dict]:
    """Split *text* into fixed-size character windows that overlap.

    Neighbouring windows share ``overlap`` characters, so a piece of code
    or a formula cut at one window's edge still appears whole in the next.

    Args:
        text: The string to split.
        size: Maximum number of characters per chunk; must be positive.
        overlap: Number of characters shared by consecutive chunks; must
            satisfy ``0 <= overlap < size``.

    Returns:
        One dict per chunk with the keys ``chunk_id`` (0-based index),
        ``start`` and ``end`` (half-open character offsets into *text*)
        and ``text`` (the slice itself). Empty input yields an empty list.

    Raises:
        ValueError: If ``size`` is not positive, ``overlap`` is negative,
            or ``overlap`` is not smaller than ``size``.
    """
    if size <= 0:
        raise ValueError("size must be positive")
    if overlap < 0:
        # A negative overlap would leave gaps, silently dropping text.
        raise ValueError("overlap must be non-negative")
    if overlap >= size:
        raise ValueError("overlap must be smaller than size")

    total = len(text)
    stride = size - overlap  # always >= 1 thanks to the checks above
    chunks: list[dict] = []
    start = 0
    while start < total:
        end = min(start + size, total)
        chunks.append(
            {
                "chunk_id": len(chunks),
                "start": start,
                "end": end,
                "text": text[start:end],
            }
        )
        if end == total:
            # The tail has been emitted; stepping again would only
            # produce a chunk that repeats the overlap region.
            break
        start += stride
    return chunks
# Demo: a 520-character sample cut into 60-character windows that overlap by 10.
sample = "abcdefghijklmnopqrstuvwxyz" * 20
for c in chunk_fixed(sample, size=60, overlap=10)[:3]:
    # Show the index and length of the first three chunks only.
    print(c["chunk_id"], len(c["text"]))

# Section: tiktoken - size the windows by token count (OpenAI-family models)
import tiktoken
# Module-level tokenizer used by chunk_by_tokens: tiktoken's "cl100k_base" encoding.
enc = tiktoken.get_encoding("cl100k_base")
def chunk_by_tokens(text: str, max_tokens: int = 256, overlap_tokens: int = 32) -> list[str]:
    """Split *text* into pieces of at most ``max_tokens`` tokens each.

    The text is encoded with the module-level ``enc`` tokenizer, the token
    ids are sliced into windows that share ``overlap_tokens`` ids with their
    predecessor, and every window is decoded back into a string.

    Args:
        text: The string to split.
        max_tokens: Upper bound on tokens per piece; must be positive.
        overlap_tokens: Tokens shared by consecutive pieces; must satisfy
            ``0 <= overlap_tokens < max_tokens``.

    Returns:
        The decoded pieces in document order. An input that encodes to no
        tokens yields an empty list.

    Raises:
        ValueError: If ``max_tokens`` is not positive, ``overlap_tokens`` is
            negative, or the overlap leaves no forward step.
    """
    # Validate before tokenizing so bad arguments fail fast.
    if max_tokens <= 0:
        raise ValueError("max_tokens must be positive")
    if overlap_tokens < 0:
        # A negative overlap would skip tokens between windows.
        raise ValueError("overlap_tokens must be non-negative")
    step = max_tokens - overlap_tokens
    if step <= 0:
        raise ValueError("invalid overlap")

    ids = enc.encode(text)
    out: list[str] = []
    for begin in range(0, len(ids), step):
        # NOTE(review): a window boundary may fall inside a multi-byte
        # character; confirm how enc.decode renders such partial sequences.
        out.append(enc.decode(ids[begin : begin + max_tokens]))
        if begin + max_tokens >= len(ids):
            # This window already reached the end; another step would only
            # repeat the overlap tail.
            break
    return out
# Demo: 500 space-separated "hello" words, split with the default limits.
text = " ".join(["hello"] * 500)
parts = chunk_by_tokens(text)
# Re-encode the first piece to report how many tokens it holds.
print("segments", len(parts), "first tokens", len(enc.encode(parts[0])))

# Section: LangChain - RecursiveCharacterTextSplitter
from langchain_text_splitters import RecursiveCharacterTextSplitter
# Demo document: a short Markdown snippet with two heading levels.
text = """# Redis\n\nRedis is in-memory.\n\n## Persistence\nRDB and AOF.\n"""
splitter = RecursiveCharacterTextSplitter(
    chunk_size=120,
    chunk_overlap=20,
    # Candidate separators, listed from coarsest (blank line) to finest
    # (empty string); the list also includes the CJK full stop.
    separators=["\n\n", "\n", "。", ".", " ", ""],
)
# One metadata dict is supplied for the single input text.
docs = splitter.create_documents([text], metadatas=[{"doc_id": "redis-intro"}])
for d in docs:
    print("---")
    # Show the metadata and at most the first 80 characters of each chunk.
    print(d.metadata, d.page_content[:80])

# Section: Markdown - split by headers (LangChain)
from langchain_text_splitters import MarkdownHeaderTextSplitter
# Demo document: a level-1 and a level-2 Markdown heading, each with a body line.
md = """# 安装\n\npip install foo\n\n## 配置\n\n设置 BAR=1\n"""
header_splitter = MarkdownHeaderTextSplitter(
    # Each pair is (header marker, name paired with that marker).
    headers_to_split_on=[("#", "h1"), ("##", "h2")],
    strip_headers=False,
)
md_chunks = header_splitter.split_text(md)
for c in md_chunks:
    # Flatten newlines and truncate to 60 characters for a one-line preview.
    print(c.metadata, "->", c.page_content.replace("\n", " ")[:60])

# Section: rough "parent-child chunks" - retrieve on small chunks, expand context with large ones
def _parent_affinity(child_chunk: dict, parent_chunk: dict) -> tuple[int, int]:
    """Rank how well *parent_chunk* covers *child_chunk* (larger is better)."""
    shared = min(child_chunk["end"], parent_chunk["end"]) - max(
        child_chunk["start"], parent_chunk["start"]
    )
    # Compare centres as doubled values (start + end) to stay in integers.
    off_centre = abs(
        (child_chunk["start"] + child_chunk["end"])
        - (parent_chunk["start"] + parent_chunk["end"])
    )
    return (shared, -off_centre)


def parent_child_split(
    long_text: str,
    child: int = 200,
    parent: int = 1200,
    child_overlap: int = 40,
    parent_overlap: int = 200,
) -> list[dict]:
    """Build small child chunks, each annotated with a larger parent window.

    Child chunks are the units meant for vector matching; the attached
    parent text lets a caller widen the context once a child is hit.

    Args:
        long_text: The document text to split.
        child: Character size of each child chunk.
        parent: Character size of each parent window.
        child_overlap: Characters shared by neighbouring child chunks.
        parent_overlap: Characters shared by neighbouring parent windows.

    Returns:
        One dict per child chunk: every key produced by ``chunk_fixed``
        plus ``parent_text`` (the chosen parent's text) and ``parent_span``
        (its ``(start, end)`` offsets). Empty input yields an empty list.
    """
    children = chunk_fixed(long_text, size=child, overlap=child_overlap)
    parents = chunk_fixed(long_text, size=parent, overlap=parent_overlap)

    enriched: list[dict] = []
    for ch in children:
        # Pick the parent sharing the most characters with this child; on a
        # tie prefer the one whose centre is nearest the child's centre.
        # Measuring distance to the nearest parent *edge* instead can select
        # a parent that does not contain the child at all.
        best = max(parents, key=lambda p: _parent_affinity(ch, p))
        enriched.append(
            {
                **ch,
                "parent_text": best["text"],
                "parent_span": (best["start"], best["end"]),
            }
        )
    return enriched


# Section: metadata and deletion - purge by doc_id in batches (illustrative logic)
# 向量库侧通常支持 delete by filter,例如 Qdrant:
# client.delete(collection, points_selector=Filter(must=[FieldCondition(key="doc_id", match=MatchValue(value="u123"))]))
def chunk_records(doc_id: str, chunks: list[dict], source: str = "kb-v1") -> list[dict]:
    """Tag every chunk with its document id and a source label.

    Args:
        doc_id: Identifier of the document the chunks came from; storing it
            on every record is what makes a later delete-by-``doc_id``
            filter possible.
        chunks: Chunk dicts to tag. They are not modified.
        source: Label written to the ``source`` key of every record.

    Returns:
        New dicts, one per input chunk, holding the original keys plus
        ``doc_id`` and ``source``. Those two keys overwrite any same-named
        keys already present in a chunk.
    """
    return [{**chunk, "doc_id": doc_id, "source": source} for chunk in chunks]


# After the documents or the model are upgraded, re-embed the whole corpus and
# re-run the evaluation set and the online regression checks.