RAG (Retrieval-Augmented Generation) lets LLMs answer questions from your own documents by finding relevant passages at query time. This guide builds a complete RAG system from scratch.
How RAG Works
- Indexing (offline): Split documents into chunks → embed chunks → store in vector DB
- Retrieval (online): Embed query → find similar chunks → return top-k
- Generation: Send chunks + question to LLM → generate answer citing sources
Stack
- Embedding model: text-embedding-3-small (OpenAI) or all-MiniLM-L6-v2 (local)
- Vector database: Chroma (local), Pinecone (cloud), Qdrant (self-hosted)
- LLM: Claude 3.5 Sonnet or GPT-4o
- Framework: Optional — LangChain or LlamaIndex reduce boilerplate but add abstraction overhead
This guide builds without a framework first.
Installation
pip install anthropic openai chromadb pypdf sentence-transformers rank-bm25
Document Processing
PDF Loading and Chunking
import pypdf
from pathlib import Path
def load_pdf(path: str) -> str:
    """Extract the text of a PDF file as a single string.

    Pages whose extracted text is empty or whitespace-only are dropped; the
    remaining pages are joined with a blank line between them.

    Args:
        path: Location of the PDF file to read.

    Returns:
        The concatenated page texts, separated by ``"\\n\\n"``.
    """
    document = pypdf.PdfReader(path)
    extracted = (page.extract_text() for page in document.pages)
    return "\n\n".join(page_text for page_text in extracted if page_text.strip())
def chunk_text(text: str, chunk_size: int = 1000, overlap: int = 200) -> list[dict]:
    """Split text into overlapping, fixed-size chunks of whitespace-separated words.

    Args:
        text: Raw text to split.
        chunk_size: Maximum number of words per chunk. Must be positive.
        overlap: Number of words shared by consecutive chunks. Must be
            non-negative and strictly smaller than ``chunk_size``.

    Returns:
        A list of dicts with keys ``"id"`` (``"chunk_<n>"``), ``"text"`` and
        ``"word_count"``. Empty when ``text`` contains no words.

    Raises:
        ValueError: If ``chunk_size`` is not positive, or ``overlap`` is
            negative or not smaller than ``chunk_size`` (the window would
            never advance).
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be positive")
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must be non-negative and smaller than chunk_size")

    words = text.split()
    step = chunk_size - overlap
    chunks = []
    for start in range(0, len(words), step):
        window = words[start:start + chunk_size]
        chunks.append({
            "id": f"chunk_{len(chunks)}",
            "text": " ".join(window),
            "word_count": len(window),
        })
        # Once a window reaches the end of the text, any further window would
        # only repeat words already contained in this chunk.
        if start + chunk_size >= len(words):
            break
    return chunks
Semantic Chunking (Better Quality)
Chunk at paragraph/section boundaries rather than fixed size:
import re
def semantic_chunk(text: str, max_chunk_size: int = 1500) -> list[str]:
    """Group paragraphs into chunks of at most ``max_chunk_size`` words.

    Paragraphs are the spans separated by blank lines. Consecutive paragraphs
    are packed into one chunk until adding the next would exceed the word
    budget. A single paragraph larger than the budget is kept whole as its
    own (oversized) chunk rather than being split.

    Args:
        text: Raw text to split.
        max_chunk_size: Word budget per chunk.

    Returns:
        The chunks in document order, each with its paragraphs joined by a
        blank line. Empty when ``text`` has no non-blank paragraph.
    """
    # Split on double newlines (paragraphs)
    paragraphs = re.split(r'\n\s*\n', text)
    chunks = []
    current_chunk = []
    current_size = 0
    for paragraph in paragraphs:
        # Leading/trailing blank lines (or empty input) produce empty
        # fragments; keeping them would yield blank chunks downstream.
        if not paragraph.strip():
            continue
        para_size = len(paragraph.split())
        if current_size + para_size > max_chunk_size and current_chunk:
            chunks.append("\n\n".join(current_chunk))
            current_chunk = [paragraph]
            current_size = para_size
        else:
            current_chunk.append(paragraph)
            current_size += para_size
    if current_chunk:
        chunks.append("\n\n".join(current_chunk))
    return chunks
Embeddings
from openai import OpenAI
# Module-level client shared by embed_texts() and embed_query().
openai_client = OpenAI()
def embed_texts(texts: list[str], batch_size: int = 100) -> list[list[float]]:
    """Embed texts with ``text-embedding-3-small``, sending them in batches.

    Args:
        texts: Strings to embed.
        batch_size: Number of texts sent per API request (default 100).
            NOTE(review): the API's per-request input ceiling is reportedly
            2048 — confirm against the provider docs before raising this.

    Returns:
        One embedding vector per input text, in input order. Empty when
        ``texts`` is empty (no request is made).

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be positive")

    all_embeddings = []
    for start in range(0, len(texts), batch_size):
        batch = texts[start:start + batch_size]
        response = openai_client.embeddings.create(
            model="text-embedding-3-small",
            input=batch,
        )
        all_embeddings.extend(item.embedding for item in response.data)
    return all_embeddings
def embed_query(query: str) -> list[float]:
    """Embed a single query string with ``text-embedding-3-small``.

    Args:
        query: The search query to embed.

    Returns:
        The embedding of the first (and only) input in the response.
    """
    result = openai_client.embeddings.create(
        input=[query],
        model="text-embedding-3-small",
    )
    first_item = result.data[0]
    return first_item.embedding
Local Embeddings (Free, Private)
from sentence_transformers import SentenceTransformer
# Local embedding model, loaded once at import time and used by embed_local().
model = SentenceTransformer('all-MiniLM-L6-v2')
def embed_local(texts: list[str]) -> list[list[float]]:
    """Embed texts with the module-level sentence-transformers ``model``.

    Args:
        texts: Strings to embed.

    Returns:
        The encoder output converted to nested Python lists via ``tolist()``.
    """
    vectors = model.encode(texts)
    return vectors.tolist()
Vector Database with Chroma
import chromadb
from chromadb.config import Settings
# Persistent storage: the Chroma client keeps its data under ./chroma_db.
# Shared by create_collection().
client = chromadb.PersistentClient(path="./chroma_db")
def create_collection(name: str):
    """Fetch the named collection from the module-level client, creating it if absent.

    The collection is requested with the ``hnsw:space`` metadata set to
    ``"cosine"``.

    Args:
        name: Collection name.

    Returns:
        Whatever ``client.get_or_create_collection`` returns for ``name``.
    """
    collection_options = {"hnsw:space": "cosine"}
    return client.get_or_create_collection(name=name, metadata=collection_options)
def index_documents(collection, chunks: list[dict], embeddings: list[list[float]]):
    """Add chunks and their embeddings to a collection.

    Args:
        collection: Object exposing ``add(ids=, embeddings=, documents=,
            metadatas=)``.
        chunks: Dicts with ``"id"`` and ``"text"`` keys; an optional
            ``"source"`` key is stored as metadata (``"unknown"`` if absent).
        embeddings: One vector per chunk, in the same order as ``chunks``.

    Raises:
        ValueError: If ``chunks`` and ``embeddings`` differ in length.
    """
    if len(chunks) != len(embeddings):
        raise ValueError(
            f"Got {len(chunks)} chunks but {len(embeddings)} embeddings"
        )
    # Nothing to store for an empty batch, so skip the call entirely instead
    # of handing the store empty lists.
    if chunks:
        collection.add(
            ids=[c["id"] for c in chunks],
            embeddings=embeddings,
            documents=[c["text"] for c in chunks],
            metadatas=[{"source": c.get("source", "unknown")} for c in chunks],
        )
    print(f"Indexed {len(chunks)} chunks")
def retrieve(collection, query_embedding: list[float], top_k: int = 5) -> list[dict]:
    """Query a collection and flatten the hits for a single query embedding.

    Args:
        collection: Object exposing ``query(query_embeddings=, n_results=,
            include=)``.
        query_embedding: Embedding of the query.
        top_k: Number of results requested.

    Returns:
        One dict per hit with keys ``"text"``, ``"source"`` (``None`` when the
        hit's metadata has no ``"source"``) and ``"distance"``.
    """
    response = collection.query(
        query_embeddings=[query_embedding],
        n_results=top_k,
        include=["documents", "metadatas", "distances"],
    )
    # Only one query was sent, so the first row of each field holds its hits.
    hit_ids = response["ids"][0]
    return [
        {
            "text": response["documents"][0][pos],
            "source": response["metadatas"][0][pos].get("source"),
            "distance": response["distances"][0][pos],
        }
        for pos, _ in enumerate(hit_ids)
    ]
Generation with Claude
import anthropic
# Module-level Anthropic client shared by generate_answer() and expand_query().
claude = anthropic.Anthropic()
def generate_answer(question: str, context_chunks: list[dict]) -> str:
    """Ask the LLM to answer a question using only the supplied chunks.

    Args:
        question: The user's question.
        context_chunks: Dicts with ``"source"`` and ``"text"`` keys; each is
            rendered as a ``Source: ...`` section in the prompt.

    Returns:
        The text of the first content block of the model's response.
    """
    section_separator = "\n\n---\n\n"
    context = section_separator.join(
        f"Source: {chunk['source']}\n{chunk['text']}" for chunk in context_chunks
    )

    prompt = f"""Answer the question based ONLY on the provided context.
If the context doesn't contain enough information to answer, say so.
Context:
{context}
Question: {question}
Instructions:
- Answer directly and concisely
- Cite which source(s) support your answer
- If the answer isn't in the context, say "The provided documents don't contain information about this."
"""

    user_message = {"role": "user", "content": prompt}
    reply = claude.messages.create(
        model="claude-opus-4-7",
        max_tokens=1024,
        messages=[user_message],
    )
    first_block = reply.content[0]
    return first_block.text
Complete RAG Pipeline
class RAGSystem:
    """End-to-end RAG pipeline: index files into a collection, then answer questions from it."""

    def __init__(self, collection_name: str = "documents"):
        """Open (or create) the collection that backs this system.

        Args:
            collection_name: Name passed to ``create_collection``.
        """
        self.collection = create_collection(collection_name)

    def index(self, file_paths: list[str]):
        """Chunk, embed and store each file.

        Files ending in ``.pdf`` (any letter case) are read with
        ``load_pdf``; everything else is read as UTF-8 text. Files that
        yield no non-blank chunk are skipped.

        Args:
            file_paths: Paths of the files to index.
        """
        for path in file_paths:
            print(f"Indexing {path}...")
            # Case-insensitive so "REPORT.PDF" is not misread as plain text.
            if path.lower().endswith(".pdf"):
                text = load_pdf(path)
            else:
                # Explicit encoding: the platform default varies by OS/locale.
                with open(path, encoding="utf-8") as f:
                    text = f.read()

            # Blank chunks carry no content and must not be embedded.
            chunks_text = [t for t in semantic_chunk(text) if t.strip()]
            if not chunks_text:
                print(f"Skipping {path}: no text to index")
                continue

            # NOTE(review): ids are derived from the file stem only, so two
            # files sharing a stem (e.g. policy.pdf and policy.txt) produce
            # colliding ids.
            chunks = [
                {"id": f"{Path(path).stem}_{i}", "text": t, "source": path}
                for i, t in enumerate(chunks_text)
            ]
            embeddings = embed_texts([c["text"] for c in chunks])
            index_documents(self.collection, chunks, embeddings)
        print(f"Indexing complete. Total chunks: {self.collection.count()}")

    def query(self, question: str, top_k: int = 5, max_distance: float = 0.7) -> dict:
        """Answer a question from the indexed documents.

        Args:
            question: The user's question.
            top_k: Number of chunks to retrieve before filtering.
            max_distance: Retrieved chunks whose distance is not strictly
                below this value are discarded as irrelevant.

        Returns:
            A dict with ``"answer"``, ``"sources"`` (unique sources in
            retrieval order) and ``"chunks_used"``. When no chunk passes the
            filter, a fixed "no relevant information" answer is returned with
            empty sources and ``chunks_used`` of 0.
        """
        query_embedding = embed_query(question)
        context_chunks = retrieve(self.collection, query_embedding, top_k)
        # Filter low-relevance chunks (cosine distance > 0.7 is usually irrelevant)
        relevant_chunks = [c for c in context_chunks if c["distance"] < max_distance]
        if not relevant_chunks:
            return {
                "answer": "No relevant information found in the documents.",
                "sources": [],
                "chunks_used": 0,
            }
        answer = generate_answer(question, relevant_chunks)
        # dict.fromkeys de-duplicates while keeping retrieval order, so the
        # result is deterministic (a set would not be).
        sources = list(dict.fromkeys(c["source"] for c in relevant_chunks))
        return {"answer": answer, "sources": sources, "chunks_used": len(relevant_chunks)}
# Usage
# `rag` is deliberately a module-level name: evaluate_rag() reads it as a global.
rag = RAGSystem()
# Index a mix of PDF and plain-text files into the default collection.
rag.index(["docs/technical_guide.pdf", "docs/faq.txt", "docs/policy.pdf"])
# Ask a question and show the generated answer together with its sources.
result = rag.query("What is the refund policy?")
print(result["answer"])
print("Sources:", result["sources"])
Improving Retrieval Quality
Hybrid Search (Keyword + Semantic)
Combine vector search with BM25 for better recall:
from rank_bm25 import BM25Okapi
class HybridRetriever:
    """Rank chunks by BM25 keyword relevance, optionally blended with vector similarity."""

    def __init__(self, chunks: list[str]):
        """Build a BM25 index over lowercased, whitespace-tokenized chunks.

        Args:
            chunks: The chunk texts; returned indices refer to this list.
        """
        self.chunks = chunks
        tokenized = [c.lower().split() for c in chunks]
        self.bm25 = BM25Okapi(tokenized)

    @staticmethod
    def _min_max(scores) -> list[float]:
        """Rescale scores to [0, 1]; all-equal (or empty) input maps to zeros."""
        values = [float(s) for s in scores]
        if not values:
            return []
        low, high = min(values), max(values)
        span = high - low
        if span == 0:
            return [0.0] * len(values)
        return [(v - low) / span for v in values]

    def retrieve(
        self,
        query: str,
        top_k: int = 5,
        vector_scores: "list[float] | None" = None,
        alpha: float = 0.5,
    ) -> list[int]:
        """Return the indices of the ``top_k`` best-scoring chunks.

        Args:
            query: Search query; lowercased and split on whitespace for BM25.
            top_k: Maximum number of indices to return.
            vector_scores: Optional per-chunk semantic similarity scores
                (higher is better), aligned with the chunks given to the
                constructor. When omitted, ranking uses BM25 alone.
            alpha: Weight of the normalized BM25 score in the blend; the
                normalized vector score gets ``1 - alpha``.

        Returns:
            Chunk indices ordered from best to worst; ties keep chunk order.

        Raises:
            ValueError: If ``vector_scores`` does not have one score per chunk.
        """
        # BM25 scores
        bm25_scores = self.bm25.get_scores(query.lower().split())

        if vector_scores is None:
            combined = bm25_scores
        else:
            if len(vector_scores) != len(bm25_scores):
                raise ValueError(
                    "vector_scores must contain exactly one score per chunk"
                )
            # The two score families live on different scales, so normalize
            # each to [0, 1] before taking the weighted sum.
            keyword_part = self._min_max(bm25_scores)
            vector_part = self._min_max(vector_scores)
            combined = [
                alpha * kw + (1 - alpha) * vec
                for kw, vec in zip(keyword_part, vector_part)
            ]

        # Return top-k indices
        indices = sorted(range(len(combined)), key=lambda i: combined[i], reverse=True)
        return indices[:top_k]
Query Expansion
Generate multiple queries from the original:
def expand_query(question: str) -> list[str]:
    """Ask the LLM for alternative phrasings of a question.

    Args:
        question: The original question.

    Returns:
        The original question followed by up to three alternatives. Blank
        lines in the model output and phrasings that repeat an earlier entry
        are dropped.
    """
    max_alternatives = 3
    response = claude.messages.create(
        model="claude-haiku-4-5-20251001",
        max_tokens=256,
        messages=[{
            "role": "user",
            "content": f"""Generate 3 alternative phrasings of this question for document retrieval.
Output only the questions, one per line, no numbering.
Question: {question}"""
        }],
    )

    queries = [question]
    for line in response.content[0].text.splitlines():
        candidate = line.strip()
        # The model may separate questions with blank lines or echo the
        # original; neither is a useful extra retrieval query.
        if candidate and candidate not in queries:
            queries.append(candidate)
    return queries[:1 + max_alternatives]
Evaluation
def evaluate_rag(test_cases: list[dict]) -> dict:
    """Score the module-level ``rag`` system with case-insensitive keyword matching.

    A case counts as correct when any of its keywords occurs in the generated
    answer. This is a simple keyword match; use LLM-as-judge for better
    evaluation.

    Args:
        test_cases: Dicts with a ``"question"`` and a ``"keywords"`` list.
            When ``"keywords"`` is missing or empty, ``"expected_answer"`` is
            used as the only keyword.

    Returns:
        ``{"accuracy": <fraction correct>, "n": <number of cases>}``.
        Accuracy is 0.0 when there are no test cases.
    """
    if not test_cases:
        # Avoid dividing by zero on an empty evaluation set.
        return {"accuracy": 0.0, "n": 0}

    correct = 0
    for case in test_cases:
        answer = rag.query(case["question"])["answer"].lower()
        keywords = case.get("keywords") or [case["expected_answer"]]
        # The answer is lowercased, so keywords must be lowercased as well.
        if any(keyword.lower() in answer for keyword in keywords):
            correct += 1
    return {
        "accuracy": correct / len(test_cases),
        "n": len(test_cases),
    }
# Sample evaluation set for evaluate_rag(): each case pairs a question with the
# expected answer and the keyword variants accepted as a match.
test_cases = [
    {
        "question": "What is the return policy?",
        "expected_answer": "30 days",
        "keywords": ["30 days", "30-day", "thirty days"],
    },
]