Document Chunking Strategies
How you split documents before embedding them directly impacts retrieval quality. Poor chunking is the most common cause of RAG systems that return irrelevant or incomplete context.
Fixed-Size Chunking
The simplest approach: split on a fixed word or token count, with overlap so context carries across chunk boundaries. The example below splits on whitespace and uses word count as a rough proxy for tokens:
def fixed_size_chunks(text: str, chunk_size: int = 512, overlap: int = 64) -> list[str]:
    """Split text into fixed-size chunks with overlap to preserve context at boundaries."""
    words = text.split()
    chunks = []
    i = 0
    while i < len(words):
        chunk = " ".join(words[i:i + chunk_size])
        chunks.append(chunk)
        # Advance by the stride; overlap must be smaller than chunk_size to make progress.
        i += chunk_size - overlap
    return chunks
text = "Long document content here..." * 200
chunks = fixed_size_chunks(text, chunk_size=200, overlap=20)
print(f"Generated {len(chunks)} chunks")
Semantic Chunking
Split at natural boundaries (paragraphs, sections) rather than arbitrary character counts:
import re

def semantic_chunks(text: str, max_chunk_size: int = 1000) -> list[dict]:
    """Split text by paragraph boundaries, grouping small paragraphs together."""
    paragraphs = re.split(r"\n\n+", text.strip())
    chunks = []
    current_chunk = []
    current_size = 0
    for para in paragraphs:
        para_size = len(para.split())
        if current_size + para_size > max_chunk_size and current_chunk:
            chunks.append({
                "content": "\n\n".join(current_chunk),
                "word_count": current_size,
            })
            current_chunk = [para]
            current_size = para_size
        else:
            current_chunk.append(para)
            current_size += para_size
    if current_chunk:
        chunks.append({
            "content": "\n\n".join(current_chunk),
            "word_count": current_size,
        })
    return chunks
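A quick usage sketch; the sample text and the 12-word budget are made up for illustration:
sample = (
    "Chunking determines what the retriever can see.\n\n"
    "Short paragraphs are grouped until the word budget is reached.\n\n"
    "Oversized paragraphs become chunks of their own."
)
# With a 12-word budget, each paragraph here ends up as its own chunk.
for chunk in semantic_chunks(sample, max_chunk_size=12):
    print(chunk["word_count"], "-", chunk["content"][:40])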
Recursive Character Splitting (LangChain-style)
Try the coarsest separator first (paragraph breaks) and fall back to finer ones (lines, sentences, words) only for pieces that are still too large:
def recursive_split(text: str, separators: list[str] | None = None, chunk_size: int = 500) -> list[str]:
    """Split using a hierarchy of separators, falling back to smaller ones."""
    if separators is None:
        separators = ["\n\n", "\n", ". ", " ", ""]
    separator = separators[0]
    remaining = separators[1:]
    # Last resort: an empty separator means hard-split into chunk_size pieces.
    if separator == "":
        return [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]
    parts = text.split(separator)
    chunks = []
    current = ""
    for part in parts:
        if len(current) + len(part) < chunk_size:
            current += (separator if current else "") + part
        else:
            if current:
                chunks.append(current)
            current = ""  # reset so already-emitted text is not duplicated
            if len(part) > chunk_size and remaining:
                # Still too large for one chunk: recurse with the next, finer separator.
                chunks.extend(recursive_split(part, remaining, chunk_size))
            else:
                current = part
    if current:
        chunks.append(current)
    return chunks
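A small usage sketch; the repeated sentence is just filler to force the fallback from paragraph-level to sentence-level splitting:
doc = ("Short intro.\n\n" + "This sentence repeats to exceed the limit. " * 5).strip()
chunks = recursive_split(doc, chunk_size=80)
# The intro stays whole; the long paragraph falls back to ". " and splits per sentence.
print(len(chunks), [len(c) for c in chunks])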
Metadata Enrichment
Add metadata to chunks to improve retrieval filtering:
def chunk_with_metadata(document: dict, chunks: list[str]) -> list[dict]:
    """Attach source and position metadata to each chunk for retrieval-time filtering."""
    return [
        {
            "id": f"{document['id']}_chunk_{i}",
            "content": chunk,
            "metadata": {
                "source": document["title"],
                "url": document.get("url", ""),
                "chunk_index": i,
                "total_chunks": len(chunks),
                "word_count": len(chunk.split()),
            },
        }
        for i, chunk in enumerate(chunks)
    ]
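Putting the pieces together; the document dict, its id, and the URL below are made-up examples:
doc = {"id": "doc-42", "title": "Chunking Guide", "url": "https://example.com/chunking"}
raw_text = "First section about chunking...\n\nSecond section with more detail..."
# Chunk the text semantically, then wrap each chunk with source metadata.
records = chunk_with_metadata(doc, [c["content"] for c in semantic_chunks(raw_text)])
print(records[0]["id"], records[0]["metadata"]["total_chunks"])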
Rule of thumb: chunks should be large enough to stand on their own (each one carries the context needed to interpret it) but small enough that retrieved chunks stay tightly relevant to the query.