The Retrieval Pipeline
A complete retrieval pipeline ingests documents, stores them as searchable embeddings, and retrieves the most relevant chunks at query time.
Full Ingestion Pipeline
# Third-party dependencies: chromadb for vector storage, sentence-transformers
# (via chromadb's wrapper) for embeddings, anthropic for the LLM client.
import chromadb
from chromadb.utils.embedding_functions import SentenceTransformerEmbeddingFunction
import anthropic
# Anthropic client for downstream generation (not used by the retrieval helpers below).
client = anthropic.Anthropic()
# Local embedding model; all-MiniLM-L6-v2 is a small, fast general-purpose encoder.
embedding_fn = SentenceTransformerEmbeddingFunction(model_name="all-MiniLM-L6-v2")
# In-memory Chroma instance — NOTE(review): data is not persisted across runs;
# use a persistent client if durability is needed.
chroma = chromadb.Client()
# Shared collection used by ingest_document / retrieve / hybrid_retrieve below.
collection = chroma.get_or_create_collection("docs", embedding_function=embedding_fn)
def ingest_document(title: str, content: str, source_url: str = "") -> int:
    """Chunk a document and store it in the vector database.

    Args:
        title: Human-readable document title; also slugified to build chunk ids.
        content: Full document text to chunk and embed.
        source_url: Optional provenance URL stored in each chunk's metadata.

    Returns:
        The number of chunks stored (0 if the content produced no chunks).
    """
    chunks = semantic_chunks(content, max_chunk_size=400)  # from previous lesson
    # Guard: chromadb's add() rejects empty id/document lists, so an empty or
    # whitespace-only document would otherwise raise instead of being a no-op.
    if not chunks:
        print(f"Ingested '{title}': 0 chunks")
        return 0
    # Hoist the slug computation out of the per-chunk work.
    slug = title.lower().replace(" ", "_")
    ids = [f"{slug}_{i}" for i in range(len(chunks))]
    documents = [chunk["content"] for chunk in chunks]
    metadatas = [
        {"title": title, "source_url": source_url, "chunk_index": i}
        for i in range(len(chunks))
    ]
    collection.add(ids=ids, documents=documents, metadatas=metadatas)
    print(f"Ingested '{title}': {len(chunks)} chunks")
    return len(chunks)
def retrieve(query: str, n_results: int = 4) -> list[dict]:
    """Return the chunks most relevant to *query*, best match first.

    Each returned dict has ``content``, ``title``, ``source_url``, and a
    1-based ``relevance_rank``.
    """
    hits = collection.query(query_texts=[query], n_results=n_results)
    # query() returns one result list per query text; we sent a single query.
    docs = hits["documents"][0]
    metas = hits["metadatas"][0]
    return [
        {
            "content": doc,
            "title": meta["title"],
            "source_url": meta.get("source_url", ""),
            "relevance_rank": rank,
        }
        for rank, (doc, meta) in enumerate(zip(docs, metas), start=1)
    ]
# Ingest sample documents
# Stores the chunked guide in the "docs" collection under the given source URL.
ingest_document(
    "Claude API Guide",
    "The Messages API is the primary way to interact with Claude...",
    "https://docs.anthropic.com",
)
# Retrieve
# Query the collection and print each hit's rank, title, and a 120-char preview.
relevant_chunks = retrieve("How do I send messages to Claude?")
for chunk in relevant_chunks:
    print(f"[{chunk['relevance_rank']}] {chunk['title']}: {chunk['content'][:120]}")
Hybrid Search (Semantic + Keyword)
For better coverage, combine embedding similarity with keyword matching:
def hybrid_retrieve(query: str, n_semantic: int = 3, n_keyword: int = 2) -> list[dict]:
    """Combine semantic search with BM25 keyword matching.

    Args:
        query: Natural-language search query.
        n_semantic: Number of embedding-similarity results to fetch.
        n_keyword: Maximum number of keyword-overlap results to append.

    Returns:
        Semantic results (1-based ``relevance_rank``) followed by any
        keyword-only hits (``relevance_rank`` of 0), deduplicated by content.
    """
    # Semantic results
    semantic = retrieve(query, n_results=n_semantic)
    # Keyword results (simplified BM25 approximation)
    keywords = set(query.lower().split())
    all_docs = collection.get()
    keyword_matches = []
    for doc, meta in zip(all_docs["documents"], all_docs["metadatas"]):
        doc_words = set(doc.lower().split())
        overlap = len(keywords & doc_words)
        if overlap > 0:
            keyword_matches.append((overlap, doc, meta))
    # Sort on the overlap count only. A bare tuple sort would fall through to
    # comparing the metadata dicts when overlap AND document text tie, which
    # raises TypeError in Python 3 (dicts are unorderable).
    keyword_matches.sort(key=lambda match: match[0], reverse=True)
    keyword_results = [
        {
            "content": doc,
            "title": meta["title"],
            "source_url": meta.get("source_url", ""),  # match retrieve()'s result shape
            "relevance_rank": 0,  # 0 marks a keyword-only hit (no semantic rank)
        }
        for _, doc, meta in keyword_matches[:n_keyword]
    ]
    # Deduplicate and combine
    seen = {c["content"] for c in semantic}
    combined = semantic + [c for c in keyword_results if c["content"] not in seen]
    return combined
Hybrid retrieval catches cases where semantic similarity alone misses exact keyword matches.