RAG (Retrieval-Augmented Generation) lets LLMs answer questions from your own documents by finding relevant passages at query time. This guide builds a complete RAG system from scratch.


How RAG Works

  1. Indexing (offline): Split documents into chunks → embed chunks → store in vector DB
  2. Retrieval (online): Embed query → find similar chunks → return top-k
  3. Generation: Send chunks + question to LLM → generate answer citing sources

Stack

  • Embedding model: text-embedding-3-small (OpenAI) or all-MiniLM-L6-v2 (local)
  • Vector database: Chroma (local), Pinecone (cloud), Qdrant (self-hosted)
  • LLM: Claude 3.5 Sonnet or GPT-4o
  • Framework: Optional — LangChain or LlamaIndex reduce boilerplate but add abstraction layers and extra dependencies

This guide builds without a framework first.


Installation

pip install anthropic openai chromadb pypdf sentence-transformers rank-bm25

Document Processing

PDF Loading and Chunking

import pypdf
from pathlib import Path

def load_pdf(path: str) -> str:
    """Extract the text of a PDF and return it as one string.

    Pages whose extracted text is empty or whitespace-only are dropped;
    the remaining pages are joined with a blank line between them.

    Args:
        path: Location of the PDF file, passed straight to ``pypdf.PdfReader``.

    Returns:
        The concatenated page texts, or ``""`` when no page yields text.
    """
    page_texts = (page.extract_text() for page in pypdf.PdfReader(path).pages)
    return "\n\n".join(body for body in page_texts if body.strip())

def chunk_text(text: str, chunk_size: int = 1000, overlap: int = 200) -> list[dict]:
    """Split text into fixed-size windows of words that overlap their neighbours.

    The text is tokenised on whitespace. Each window holds up to
    ``chunk_size`` words and starts ``chunk_size - overlap`` words after the
    previous one, so consecutive windows share ``overlap`` words.

    Args:
        text: The text to split.
        chunk_size: Maximum number of words per chunk. Must be positive.
        overlap: Number of words shared between consecutive chunks. Must be
            non-negative and strictly smaller than ``chunk_size``.

    Returns:
        A list of dicts with keys ``"id"`` (``"chunk_<n>"``), ``"text"`` and
        ``"word_count"``. Empty or whitespace-only text yields ``[]``.

    Raises:
        ValueError: If ``chunk_size`` is not positive, or ``overlap`` is
            negative or not smaller than ``chunk_size``.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    # A step of zero or less would never advance through the words.
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    words = text.split()
    step = chunk_size - overlap
    chunks = []

    for chunk_id, start in enumerate(range(0, len(words), step)):
        window = words[start:start + chunk_size]
        chunks.append({
            "id": f"chunk_{chunk_id}",
            "text": " ".join(window),
            "word_count": len(window),
        })
        # Once a window reaches the last word, any further window would
        # contain only words that are already covered by this one.
        if start + chunk_size >= len(words):
            break

    return chunks

Semantic Chunking (Better Quality)

Chunk at paragraph/section boundaries rather than fixed size:

import re

def semantic_chunk(text: str, max_chunk_size: int = 1500) -> list[str]:
    """Group paragraphs into chunks of at most ``max_chunk_size`` words.

    Paragraphs are separated by blank lines. Consecutive paragraphs are packed
    into the same chunk (joined by a blank line) until adding the next one
    would exceed the word budget.

    Blank paragraphs are skipped, so empty input produces no chunks rather
    than a single empty string. A single paragraph longer than the budget is
    cut into consecutive word windows; those pieces are re-joined with single
    spaces, so line breaks inside such a paragraph are not preserved.

    Args:
        text: The document text.
        max_chunk_size: Maximum number of whitespace-separated words per chunk.

    Returns:
        The chunk strings, in document order.

    Raises:
        ValueError: If ``max_chunk_size`` is smaller than 1.
    """
    if max_chunk_size < 1:
        raise ValueError("max_chunk_size must be at least 1")

    chunks = []
    pending = []        # paragraphs collected for the chunk being built
    pending_words = 0   # word count of everything in `pending`

    # Split on double newlines (paragraphs)
    for raw_paragraph in re.split(r'\n\s*\n', text):
        paragraph = raw_paragraph.strip()
        words = paragraph.split()
        if not words:
            continue

        if len(words) > max_chunk_size:
            # The paragraph cannot fit in any chunk: close the current chunk,
            # then emit the paragraph as fixed-size word windows.
            if pending:
                chunks.append("\n\n".join(pending))
                pending, pending_words = [], 0
            chunks.extend(
                " ".join(words[start:start + max_chunk_size])
                for start in range(0, len(words), max_chunk_size)
            )
            continue

        if pending and pending_words + len(words) > max_chunk_size:
            chunks.append("\n\n".join(pending))
            pending, pending_words = [], 0

        pending.append(paragraph)
        pending_words += len(words)

    if pending:
        chunks.append("\n\n".join(pending))

    return chunks

Embeddings

from openai import OpenAI

# Module-level OpenAI client shared by embed_texts() and embed_query().
# NOTE(review): constructed with no arguments, so credentials presumably come
# from the environment (e.g. OPENAI_API_KEY) — confirm before deploying.
openai_client = OpenAI()

def embed_texts(texts: list[str], batch_size: int = 100) -> list[list[float]]:
    """Embed a list of texts with ``text-embedding-3-small``.

    The texts are sent to the embeddings endpoint in consecutive batches and
    the resulting vectors are concatenated in input order.

    Args:
        texts: Strings to embed. An empty list makes no API call.
        batch_size: Number of texts sent per request. Defaults to 100.
            NOTE(review): the original note here cited a ceiling of 2048
            inputs per request — confirm against the current API limits
            before raising this value.

    Returns:
        One embedding vector per input text, in the same order as ``texts``.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    all_embeddings = []

    for start in range(0, len(texts), batch_size):
        response = openai_client.embeddings.create(
            model="text-embedding-3-small",
            input=texts[start:start + batch_size],
        )
        all_embeddings.extend(item.embedding for item in response.data)

    return all_embeddings

def embed_query(query: str) -> list[float]:
    """Embed one query string with ``text-embedding-3-small``.

    Args:
        query: The search query to embed.

    Returns:
        The embedding vector of the query.
    """
    request = {"model": "text-embedding-3-small", "input": [query]}
    result = openai_client.embeddings.create(**request)
    only_item = result.data[0]
    return only_item.embedding

Local Embeddings (Free, Private)

from sentence_transformers import SentenceTransformer

# Sentence-transformers model created once at module level and reused by
# embed_local().
# NOTE(review): vectors from this model are presumably not interchangeable
# with the OpenAI embeddings — use one embedding model per collection.
model = SentenceTransformer('all-MiniLM-L6-v2')

def embed_local(texts: list[str]) -> list[list[float]]:
    """Embed texts with the module-level sentence-transformers model.

    Args:
        texts: Strings to embed.

    Returns:
        The encoder output converted with ``.tolist()``.
    """
    vectors = model.encode(texts)
    return vectors.tolist()

Vector Database with Chroma

import chromadb
from chromadb.config import Settings

# Persistent storage: module-level Chroma client pointed at ./chroma_db
# (relative to the working directory). create_collection() uses this client.
client = chromadb.PersistentClient(path="./chroma_db")

def create_collection(name: str):
    """Fetch the Chroma collection called ``name``, creating it if needed.

    The collection is requested with ``hnsw:space`` set to ``cosine``.

    Args:
        name: Collection name.

    Returns:
        The collection object returned by the module-level Chroma client.
    """
    collection_metadata = {"hnsw:space": "cosine"}
    return client.get_or_create_collection(name=name, metadata=collection_metadata)

def index_documents(collection, chunks: list[dict], embeddings: list[list[float]]):
    """Write chunks and their embeddings into a vector-store collection.

    Records are written with ``upsert`` rather than ``add`` so that indexing
    the same chunk IDs again replaces the stored records instead of depending
    on how the store treats duplicate IDs.

    Args:
        collection: Collection object exposing
            ``upsert(ids=, embeddings=, documents=, metadatas=)``.
        chunks: Dicts with ``"id"`` and ``"text"`` keys and an optional
            ``"source"`` key (stored as ``"unknown"`` when absent).
        embeddings: One vector per chunk, in the same order as ``chunks``.

    Raises:
        ValueError: If ``chunks`` and ``embeddings`` differ in length.
    """
    if len(chunks) != len(embeddings):
        raise ValueError(
            f"Got {len(chunks)} chunks but {len(embeddings)} embeddings"
        )

    # Skip the store call for an empty batch; there is nothing to write.
    if chunks:
        collection.upsert(
            ids=[c["id"] for c in chunks],
            embeddings=embeddings,
            documents=[c["text"] for c in chunks],
            metadatas=[{"source": c.get("source", "unknown")} for c in chunks],
        )
    print(f"Indexed {len(chunks)} chunks")

def retrieve(collection, query_embedding: list[float], top_k: int = 5) -> list[dict]:
    """Query a collection with one embedding and flatten the hits.

    Args:
        collection: Collection object exposing ``query(...)``.
        query_embedding: The embedded query.
        top_k: Number of results requested from the collection.

    Returns:
        One dict per hit with keys ``"text"``, ``"source"`` (``None`` when the
        hit's metadata has no ``"source"``) and ``"distance"``, in the order
        the collection returned them.
    """
    response = collection.query(
        query_embeddings=[query_embedding],
        n_results=top_k,
        include=["documents", "metadatas", "distances"],
    )

    # Only one query was sent, so the hits are the first row of each field.
    hit_rows = zip(
        response["ids"][0],
        response["documents"][0],
        response["metadatas"][0],
        response["distances"][0],
    )
    return [
        {"text": document, "source": metadata.get("source"), "distance": distance}
        for _, document, metadata, distance in hit_rows
    ]

Generation with Claude

import anthropic

# Module-level Anthropic client used by generate_answer() (and by any other
# helper in this file that calls claude.messages.create).
# NOTE(review): constructed with no arguments, so the API key presumably comes
# from the environment — confirm.
claude = anthropic.Anthropic()

def generate_answer(question: str, context_chunks: list[dict]) -> str:
    """Ask Claude to answer a question using only the supplied chunks.

    Each chunk is rendered as a ``Source: ...`` header followed by its text,
    the rendered chunks are separated by ``---`` dividers, and the result is
    embedded in a prompt that tells the model to answer from the context only
    and to cite its sources.

    Args:
        question: The user's question.
        context_chunks: Dicts with ``"source"`` and ``"text"`` keys.

    Returns:
        The text of the first content block of the model's reply.
    """
    sections = []
    for chunk in context_chunks:
        sections.append(f"Source: {chunk['source']}\n{chunk['text']}")
    context = "\n\n---\n\n".join(sections)

    prompt = f"""Answer the question based ONLY on the provided context. 
If the context doesn't contain enough information to answer, say so.

Context:
{context}

Question: {question}

Instructions:
- Answer directly and concisely
- Cite which source(s) support your answer
- If the answer isn't in the context, say "The provided documents don't contain information about this."
"""

    user_turn = {"role": "user", "content": prompt}
    reply = claude.messages.create(
        model="claude-opus-4-7",
        max_tokens=1024,
        messages=[user_turn],
    )

    first_block = reply.content[0]
    return first_block.text

Complete RAG Pipeline

class RAGSystem:
    """End-to-end RAG pipeline: index files, then answer questions from them.

    Ties together the module-level helpers: ``load_pdf`` / ``semantic_chunk``
    for document processing, ``embed_texts`` / ``embed_query`` for embeddings,
    ``index_documents`` / ``retrieve`` for the vector store and
    ``generate_answer`` for the final LLM call.
    """

    def __init__(self, collection_name: str = "documents"):
        """Open (or create) the collection that backs this system."""
        self.collection = create_collection(collection_name)

    def index(self, file_paths: list[str]):
        """Chunk, embed and store each file.

        Files with a ``.pdf`` extension (any letter case) go through
        ``load_pdf``; everything else is read as UTF-8 text. Files that
        produce no non-blank chunks are skipped.

        Args:
            file_paths: Paths of the documents to index.
        """
        for path in file_paths:
            print(f"Indexing {path}...")

            if Path(path).suffix.lower() == ".pdf":
                text = load_pdf(path)
            else:
                # Explicit encoding so results do not depend on the platform's
                # default locale.
                with open(path, encoding="utf-8") as f:
                    text = f.read()

            # IDs are built from the full path rather than the file stem, so
            # files such as docs/policy.pdf and docs/policy.txt cannot
            # collide. Blank chunks are dropped before embedding.
            chunks = [
                {"id": f"{path}_{i}", "text": t, "source": path}
                for i, t in enumerate(semantic_chunk(text))
                if t.strip()
            ]

            if not chunks:
                print(f"Skipping {path}: no text to index")
                continue

            embeddings = embed_texts([c["text"] for c in chunks])
            index_documents(self.collection, chunks, embeddings)

        print(f"Indexing complete. Total chunks: {self.collection.count()}")

    def query(self, question: str, top_k: int = 5, max_distance: float = 0.7) -> dict:
        """Answer a question from the indexed documents.

        Args:
            question: The user's question.
            top_k: Number of chunks to fetch from the vector store.
            max_distance: Chunks whose distance is not below this value are
                discarded. The default of 0.7 is a rule of thumb for cosine
                distance; tune it for your corpus and embedding model.

        Returns:
            A dict with ``"answer"``, ``"sources"`` (unique sources in
            retrieval order) and ``"chunks_used"``. When no chunk passes the
            distance filter, the LLM is not called and ``"chunks_used"`` is 0.
        """
        query_embedding = embed_query(question)
        context_chunks = retrieve(self.collection, query_embedding, top_k)

        relevant_chunks = [c for c in context_chunks if c["distance"] < max_distance]

        if not relevant_chunks:
            return {
                "answer": "No relevant information found in the documents.",
                "sources": [],
                "chunks_used": 0,
            }

        answer = generate_answer(question, relevant_chunks)
        # dict.fromkeys de-duplicates while keeping retrieval order, so the
        # source list is deterministic (a set would not be).
        sources = list(dict.fromkeys(c["source"] for c in relevant_chunks))

        return {"answer": answer, "sources": sources, "chunks_used": len(relevant_chunks)}

# Usage
# These statements run at module level: creating the system, indexing the
# three example files and issuing one query all happen as soon as the file
# is executed. The global `rag` is also read by evaluate_rag() further down.
rag = RAGSystem()
rag.index(["docs/technical_guide.pdf", "docs/faq.txt", "docs/policy.pdf"])

# query() returns a dict; "answer" and "sources" are present on every path.
result = rag.query("What is the refund policy?")
print(result["answer"])
print("Sources:", result["sources"])

Improving Retrieval Quality

Hybrid Search (Keyword + Semantic)

Combine vector search with BM25 for better recall:

from rank_bm25 import BM25Okapi

class HybridRetriever:
    """Rank chunks by BM25 keyword score, optionally blended with vector scores.

    With no vector scores the ranking is pure BM25. When the caller supplies
    one similarity score per chunk, both score lists are min-max normalised
    to [0, 1] and combined as ``alpha * vector + (1 - alpha) * bm25``.
    """

    def __init__(self, chunks: list[str]):
        """Build the BM25 index over lower-cased, whitespace-split chunks.

        Args:
            chunks: The chunk texts; returned indices refer to this list.
        """
        self.chunks = chunks
        tokenized = [c.lower().split() for c in chunks]
        # NOTE(review): rank_bm25 appears to fail on an empty corpus (it
        # averages document lengths), so no index is built in that case.
        self.bm25 = BM25Okapi(tokenized) if tokenized else None

    @staticmethod
    def _min_max(scores: list[float]) -> list[float]:
        """Scale scores to [0, 1]; a constant list maps to all zeros."""
        low, high = min(scores), max(scores)
        if high == low:
            return [0.0] * len(scores)
        span = high - low
        return [(s - low) / span for s in scores]

    def retrieve(
        self,
        query: str,
        top_k: int = 5,
        vector_scores: "list[float] | None" = None,
        alpha: float = 0.5,
    ) -> list[int]:
        """Return the indices of the best-matching chunks, best first.

        Args:
            query: The search query; tokenised like the chunks.
            top_k: Maximum number of indices to return.
            vector_scores: Optional semantic similarity per chunk, aligned
                with ``chunks``. Higher must mean more similar, so convert
                distances (e.g. ``1 - distance``) before passing them in.
            alpha: Weight of the vector scores in the blend; ``1 - alpha`` is
                the weight of BM25. Ignored when ``vector_scores`` is None.

        Returns:
            Up to ``top_k`` indices into ``chunks``. Ties keep index order.

        Raises:
            ValueError: If ``vector_scores`` has a different length than the
                chunk list.
        """
        if self.bm25 is None:
            return []

        # BM25 scores
        bm25_scores = [float(s) for s in self.bm25.get_scores(query.lower().split())]

        if vector_scores is None:
            combined = bm25_scores
        else:
            if len(vector_scores) != len(bm25_scores):
                raise ValueError(
                    f"Expected {len(bm25_scores)} vector scores, got {len(vector_scores)}"
                )
            # Normalise both lists so neither scale dominates the blend.
            keyword_part = self._min_max(bm25_scores)
            semantic_part = self._min_max([float(s) for s in vector_scores])
            combined = [
                alpha * sem + (1 - alpha) * kw
                for sem, kw in zip(semantic_part, keyword_part)
            ]

        indices = sorted(range(len(combined)), key=combined.__getitem__, reverse=True)
        return indices[:top_k]

Query Expansion

Generate multiple queries from the original:

def expand_query(question: str) -> list[str]:
    """Return the question plus up to three alternative phrasings.

    The alternatives are generated by Claude, one per line. Blank lines are
    dropped, surrounding whitespace is stripped, and duplicates (including
    any repeat of the original question) are removed, so every returned
    string is a distinct, non-empty query.

    Args:
        question: The original user question.

    Returns:
        A list that starts with ``question`` followed by at most three
        distinct alternatives.
    """
    response = claude.messages.create(
        model="claude-haiku-4-5-20251001",
        max_tokens=256,
        messages=[{
            "role": "user",
            "content": f"""Generate 3 alternative phrasings of this question for document retrieval.
Output only the questions, one per line, no numbering.

Question: {question}"""
        }],
    )

    stripped = (line.strip() for line in response.content[0].text.splitlines())
    # dict.fromkeys removes duplicates while keeping first-seen order; the
    # original question goes first so it always survives.
    unique = list(dict.fromkeys([question, *(line for line in stripped if line)]))
    return unique[:4]

Evaluation

def evaluate_rag(test_cases: list[dict]) -> dict:
    """Score the module-level ``rag`` system with a keyword-match check.

    A case counts as correct when any of its keywords occurs in the answer.
    The comparison is case-insensitive on both sides. When a case has no
    ``"keywords"`` (or an empty list), its ``"expected_answer"`` is used as
    the single keyword.

    Simple keyword match only; use LLM-as-judge for better evaluation.

    Args:
        test_cases: Dicts with ``"question"`` and ``"expected_answer"`` keys
            and an optional ``"keywords"`` list.

    Returns:
        ``{"accuracy": <fraction correct>, "n": <number of cases>}``. An
        empty ``test_cases`` list yields an accuracy of ``0.0``.
    """
    if not test_cases:
        # Avoid dividing by zero when there is nothing to evaluate.
        return {"accuracy": 0.0, "n": 0}

    correct = 0

    for case in test_cases:
        answer = rag.query(case["question"])["answer"].lower()
        keywords = case.get("keywords") or [case["expected_answer"]]

        # The answer is lower-cased, so the keywords must be too.
        if any(keyword.lower() in answer for keyword in keywords):
            correct += 1

    return {
        "accuracy": correct / len(test_cases),
        "n": len(test_cases),
    }

# Example evaluation set in the shape evaluate_rag() reads: it looks up
# "question", "expected_answer" and "keywords" on every case.
test_cases = [
    {
        "question": "What is the return policy?",
        "expected_answer": "30 days",
        # Alternative spellings; the keyword check passes if any one appears.
        "keywords": ["30 days", "30-day", "thirty days"],
    },
]