Retrieval-Augmented Generation (RAG) connects Claude to your own documents, databases, or knowledge bases. Instead of Claude relying solely on its training data, it searches your documents first, retrieves relevant passages, and uses those as context for its answer.

This guide builds a complete RAG application: document ingestion, embedding storage, semantic search, and Claude-powered answers.


The Architecture

User query → Embed query → Search vector database → Retrieve top-k chunks
     → Inject chunks into Claude prompt → Generate grounded answer

Stack used in this guide:

  • Python 3.11+
  • Anthropic Claude API
  • PostgreSQL + pgvector (vector storage)
  • OpenAI embeddings (text-embedding-3-small; Anthropic does not offer an embedding API)

Step 1: Set Up pgvector

Install PostgreSQL with the pgvector extension:

# On macOS with Homebrew
brew install postgresql@16
# pgvector is a separate formula and is NOT included with PostgreSQL.
# Run `brew info pgvector` to confirm it supports your PostgreSQL version;
# otherwise build pgvector from source against your installation.
brew install pgvector
brew services start postgresql@16

# Connect and enable extension
psql postgres

In PostgreSQL:

CREATE DATABASE rag_app;
\c rag_app
CREATE EXTENSION IF NOT EXISTS vector;

-- Documents table: one row per ingested chunk
CREATE TABLE documents (
    id SERIAL PRIMARY KEY,
    title TEXT NOT NULL,
    content TEXT NOT NULL,
    embedding vector(1536),  -- 1536 for text-embedding-3-small
    created_at TIMESTAMPTZ DEFAULT NOW()
);

-- Index for fast approximate cosine-similarity search.
-- HNSW (pgvector 0.5.0+) can be built on an empty table. An ivfflat index
-- derives its clusters from the rows present at build time, so it should
-- only be created after the data has been loaded.
CREATE INDEX ON documents USING hnsw (embedding vector_cosine_ops);

Step 2: Install Dependencies

# anthropic: Claude SDK; openai: embeddings; psycopg2-binary: PostgreSQL driver;
# python-dotenv: presumably for loading .env (the scripts below only call os.getenv)
pip install anthropic openai psycopg2-binary python-dotenv

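Create .env (the code below reads these values with os.getenv, so either export them in your shell or call python-dotenv's load_dotenv() at the top of each script):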

ANTHROPIC_API_KEY=sk-ant-...
OPENAI_API_KEY=sk-...   # Required for embeddings (text-embedding-3-small)
DATABASE_URL=postgresql://localhost/rag_app

Step 3: Build the Embedding Utility

# embeddings.py
import os
from openai import OpenAI
from anthropic import Anthropic

# We use OpenAI's text-embedding-3-small for embeddings
# (Anthropic doesn't currently offer an embedding API).
# Module-level client used by embed_text(); the API key is read from the
# OPENAI_API_KEY environment variable.
embedding_client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

def embed_text(text: str) -> list[float]:
    """Return the embedding vector for ``text``.

    The text is sent to the embeddings endpoint of the module-level
    ``embedding_client`` using the ``text-embedding-3-small`` model, and the
    embedding of the first item in the response is returned.
    """
    request = {"model": "text-embedding-3-small", "input": text}
    result = embedding_client.embeddings.create(**request)
    first_item = result.data[0]
    return first_item.embedding


def chunk_text(text: str, chunk_size: int = 500, overlap: int = 50) -> list[str]:
    """Split text into overlapping word-based chunks for better retrieval.

    Args:
        text: The text to split. Words are delimited by any whitespace.
        chunk_size: Maximum number of words per chunk. Must be positive.
        overlap: Number of words shared between consecutive chunks. Must
            satisfy ``0 <= overlap < chunk_size``.

    Returns:
        The chunks in document order, each a single-space-joined run of
        words. Empty or whitespace-only text yields an empty list.

    Raises:
        ValueError: If ``chunk_size`` or ``overlap`` is out of range.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap < chunk_size:
        # A non-positive step would either make range() fail or silently
        # produce no chunks at all; a negative overlap would skip words.
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    words = text.split()
    step = chunk_size - overlap
    chunks = []

    for start in range(0, len(words), step):
        end = start + chunk_size
        chunks.append(" ".join(words[start:end]))
        # Once a chunk reaches the last word, any further window would be
        # fully contained in it -- skip it to avoid a duplicate embedding.
        if end >= len(words):
            break

    return chunks

Step 4: Build the Document Ingestion Pipeline

# ingest.py
import os
import psycopg2
from embeddings import embed_text, chunk_text

def ingest_document(title: str, content: str) -> None:
    """Chunk a document, embed each chunk, and store the chunks in the database.

    Each chunk is inserted as its own row in ``documents`` with the title
    ``"<title> [chunk N]"`` (N starting at 1). All rows are written in a
    single transaction: either every chunk is stored or none is.

    Args:
        title: Human-readable document title.
        content: Full text of the document.
    """
    chunks = chunk_text(content)

    # Compute all embeddings before touching the database so that no
    # connection or transaction is held open during the API calls.
    rows = [
        (f"{title} [chunk {i}]", chunk, embed_text(chunk))
        for i, chunk in enumerate(chunks, start=1)
    ]

    conn = psycopg2.connect(os.getenv("DATABASE_URL"))
    try:
        # `with conn` commits on success and rolls back on error; it does
        # not close the connection, hence the surrounding try/finally.
        with conn, conn.cursor() as cursor:
            cursor.executemany(
                """INSERT INTO documents (title, content, embedding) 
                   VALUES (%s, %s, %s)""",
                rows,
            )
    finally:
        conn.close()

    print(f"Ingested '{title}': {len(chunks)} chunks")


def ingest_from_file(filepath: str) -> None:
    """Ingest a text file, using its base name as the document title.

    Args:
        filepath: Path to a UTF-8 encoded text file.
    """
    # Explicit encoding: the default depends on the platform locale and can
    # mis-decode non-ASCII documents.
    with open(filepath, "r", encoding="utf-8") as f:
        content = f.read()

    title = os.path.basename(filepath)
    ingest_document(title, content)


if __name__ == "__main__":
    # Script entry point: ingest every .txt file found directly under
    # the documents/ directory.
    import glob

    text_files = glob.glob("documents/*.txt")
    for filepath in text_files:
        ingest_from_file(filepath)

Step 5: Build the Retrieval Function

# retrieval.py
import os
import psycopg2
from embeddings import embed_text

def retrieve_relevant_chunks(
    query: str, top_k: int = 5, min_similarity: float = 0.7
) -> list[dict]:
    """Find the most relevant document chunks for a query.

    Args:
        query: Natural-language query; it is embedded with ``embed_text``.
        top_k: Maximum number of nearest chunks to fetch from the database.
        min_similarity: Chunks whose cosine similarity is not strictly
            greater than this value are dropped. NOTE(review): a suitable
            threshold depends on the embedding model -- tune it on your data.

    Returns:
        A list of dicts with keys ``"title"``, ``"content"`` and
        ``"similarity"`` (a float), ordered from most to least similar. The
        threshold is applied after the ``top_k`` limit, so the list may
        contain fewer than ``top_k`` items or be empty.
    """
    # Embed the query
    query_embedding = embed_text(query)

    conn = psycopg2.connect(os.getenv("DATABASE_URL"))
    try:
        with conn.cursor() as cursor:
            # `<=>` is pgvector's cosine distance; similarity = 1 - distance.
            # Rows without an embedding are excluded: their distance would be
            # NULL and could not be converted to a float below.
            cursor.execute(
                """
                SELECT title, content,
                       1 - (embedding <=> %s::vector) AS similarity
                FROM documents
                WHERE embedding IS NOT NULL
                ORDER BY embedding <=> %s::vector
                LIMIT %s
                """,
                (query_embedding, query_embedding, top_k),
            )
            rows = cursor.fetchall()
    finally:
        # Always release the connection, even if the query fails.
        conn.close()

    # Filter out very low similarity results
    return [
        {"title": title, "content": content, "similarity": float(similarity)}
        for title, content, similarity in rows
        if float(similarity) > min_similarity
    ]

Step 6: Build the RAG Query Function

# rag.py
import os
from anthropic import Anthropic
from retrieval import retrieve_relevant_chunks

# Module-level Claude client used by rag_query(); the API key is read from
# the ANTHROPIC_API_KEY environment variable.
client = Anthropic(api_key=os.getenv("ANTHROPIC_API_KEY"))

def rag_query(question: str, verbose: bool = False) -> str:
    """Answer ``question`` from retrieved document chunks.

    Up to five chunks are fetched with ``retrieve_relevant_chunks``. If none
    come back, a fixed "couldn't find relevant information" message is
    returned and Claude is not called. Otherwise the chunks are joined into
    one context block, sent to Claude together with the question, and the
    text of the first content block of the response is returned.

    Args:
        question: The user's question.
        verbose: When true, print the similarity and title of each
            retrieved chunk before querying Claude.
    """
    # Retrieval step
    chunks = retrieve_relevant_chunks(question, top_k=5)
    if not chunks:
        return "I couldn't find relevant information in the documents to answer this question."

    if verbose:
        print(f"\nRetrieved {len(chunks)} chunks:")
        for c in chunks:
            print(f"  [{c['similarity']:.2f}] {c['title']}")

    # Context assembly: one "Source: ..." section per chunk
    sections = (f"Source: {c['title']}\n\n{c['content']}" for c in chunks)
    context = "\n\n---\n\n".join(sections)

    # Generation step
    system_prompt = """You are a helpful assistant that answers questions based on provided documents.
        
        Rules:
        - Only use information from the provided context documents
        - If the answer isn't in the documents, say "I don't have that information in the provided documents"
        - Cite which document each piece of information comes from
        - Be concise and accurate"""
    user_message = {
        "role": "user",
        "content": f"""Context documents:

{context}

---

Question: {question}""",
    }
    response = client.messages.create(
        model="claude-3-7-sonnet-20250219",
        max_tokens=1024,
        system=system_prompt,
        messages=[user_message],
    )

    first_block = response.content[0]
    return first_block.text


if __name__ == "__main__":
    # Interactive CLI: read questions until the user quits.
    print("RAG Application Ready")
    print("Type your question (or 'quit' to exit)\n")

    while True:
        try:
            question = input("Question: ").strip()
        except (EOFError, KeyboardInterrupt):
            # Ctrl-D / Ctrl-C: leave cleanly instead of printing a traceback.
            print()
            break

        if question.lower() in ("quit", "exit"):
            break
        if not question:
            # Blank input: nothing to embed or answer; prompt again.
            continue

        answer = rag_query(question, verbose=True)
        print(f"\nAnswer:\n{answer}\n")

Step 7: Test It

# First ingest some documents (ingest.py loads every documents/*.txt file)
python ingest.py

# Then ask questions (rag.py starts an interactive prompt; type 'quit' to exit)
python rag.py

Improving Retrieval Quality

The basic implementation works but can be improved:

1. Better chunking strategy

Chunk by semantic boundaries (paragraphs, sections) rather than arbitrary word counts:

def chunk_by_paragraph(text: str, max_words: int = 600) -> list[str]:
    """Chunk by paragraph — better for structured documents.

    Paragraphs are separated by one or more blank lines (lines that are
    empty or contain only whitespace; CRLF line endings are handled).
    Consecutive paragraphs are merged, joined by a blank line, while the
    merged chunk stays below ``max_words`` words.

    Args:
        text: The document text.
        max_words: Word limit for a merged chunk. A single paragraph that
            alone reaches or exceeds the limit is kept whole as its own chunk.

    Returns:
        The chunks in document order; an empty list for blank input.
    """
    import re

    paragraphs = [p.strip() for p in re.split(r"\n\s*\n", text) if p.strip()]

    # Merge very short paragraphs, tracking the running word count instead
    # of re-splitting the accumulated chunk on every iteration.
    chunks = []
    pending = []
    pending_words = 0
    for para in paragraphs:
        para_words = len(para.split())
        if pending and pending_words + para_words >= max_words:
            chunks.append("\n\n".join(pending))
            pending = []
            pending_words = 0
        pending.append(para)
        pending_words += para_words

    if pending:
        chunks.append("\n\n".join(pending))

    return chunks

2. Hybrid search — combine vector similarity with keyword search for better precision on specific terms.

3. Re-ranking — use a cross-encoder model to re-rank retrieved chunks by relevance before sending to Claude.

4. Parent-child chunking — store small chunks for retrieval, but inject the surrounding larger context into Claude.


Production Considerations

Rate limiting: Implement exponential backoff on embedding API calls when ingesting large document sets.

Caching: Cache embeddings for frequently queried terms. Cache Claude responses for identical queries.

Monitoring: Track retrieval quality (are the returned chunks relevant?) and answer quality (are users satisfied?).

Cost: There are two cost components. Embedding is cheap (OpenAI text-embedding-3-small is ~$0.00002/1K tokens) and, for ingestion, is a one-time cost per document; each query also embeds the question, which is negligible. Claude API costs for the generation step are incurred per query and dominate at scale.

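This architecture scales to millions of documents with proper indexing. pgvector's approximate indexes (ivfflat, or HNSW in newer versions) keep similarity search fast at that scale; note that an ivfflat index should be built after the data is loaded and tuned via its `lists` and `ivfflat.probes` settings. Collections in the 100M+ vector range are possible but need careful tuning and benchmarking.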