Retrieval-Augmented Generation (RAG) connects Claude to your own documents, databases, or knowledge bases. Instead of Claude relying solely on its training data, your application searches your documents first, retrieves the relevant passages, and passes them to Claude as context for its answer.
This guide builds a complete RAG application: document ingestion, embedding storage, semantic search, and Claude-powered answers.
The Architecture
User query → Embed query → Search vector database → Retrieve top-k chunks
→ Inject chunks into Claude prompt → Generate grounded answer
Stack used in this guide:
- Python 3.11+
- Anthropic Claude API
- PostgreSQL + pgvector (vector storage)
- OpenAI embeddings (text-embedding-3-small; Anthropic does not offer an embedding API)
Step 1: Set Up pgvector
Install PostgreSQL with the pgvector extension:
# On macOS with Homebrew
brew install postgresql@16
# pgvector ships separately from PostgreSQL; without it, CREATE EXTENSION vector fails.
# If the Homebrew formula does not target your PostgreSQL version, build pgvector
# from source as described in its README.
brew install pgvector
brew services start postgresql@16
# Connect and enable extension
psql postgres
In PostgreSQL:
CREATE DATABASE rag_app;
\c rag_app
CREATE EXTENSION IF NOT EXISTS vector;
-- Documents table: one row per ingested chunk
CREATE TABLE documents (
    id SERIAL PRIMARY KEY,
    title TEXT NOT NULL,
    content TEXT NOT NULL,
    embedding vector(1536), -- 1536 for text-embedding-3-small
    created_at TIMESTAMPTZ DEFAULT NOW()
);
-- Index for fast cosine-similarity search.
-- HNSW is used rather than IVFFlat because the table is still empty here:
-- IVFFlat derives its clusters from the rows present at build time, so an
-- index built before ingestion gives poor recall. HNSW has no training step
-- and can be created up front. Requires pgvector 0.5.0 or newer.
CREATE INDEX ON documents USING hnsw (embedding vector_cosine_ops);
Step 2: Install Dependencies
pip install anthropic openai psycopg2-binary python-dotenv
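Create .env (the code below reads these values with os.getenv, so load the file at startup — for example with python-dotenv's load_dotenv() — or export the variables in your shell):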
ANTHROPIC_API_KEY=sk-ant-...
OPENAI_API_KEY=sk-... # Required: used for embeddings (text-embedding-3-small)
DATABASE_URL=postgresql://localhost/rag_app
Step 3: Build the Embedding Utility
# embeddings.py
import os
from openai import OpenAI
from anthropic import Anthropic
# We use OpenAI's text-embedding-3-small for embeddings
# (Anthropic doesn't currently offer an embedding API).
# The client is created once at import time and shared by embed_text();
# its API key is read from the OPENAI_API_KEY environment variable.
# NOTE(review): nothing visible in this module loads a .env file, so the
# variable must already be set in the process environment — confirm how
# it is expected to be loaded.
embedding_client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
def embed_text(text: str) -> list[float]:
    """Return the embedding vector for ``text``.

    Sends a single request to the module-level ``embedding_client`` using the
    ``text-embedding-3-small`` model and returns the embedding of the first
    (and only) item in the response.
    """
    request = {"model": "text-embedding-3-small", "input": text}
    result = embedding_client.embeddings.create(**request)
    first_item = result.data[0]
    return first_item.embedding
def chunk_text(text: str, chunk_size: int = 500, overlap: int = 50) -> list[str]:
    """Split text into overlapping word-based chunks for better retrieval.

    Args:
        text: The text to split. Words are separated on any whitespace.
        chunk_size: Maximum number of words per chunk; must be positive.
        overlap: Number of words shared between consecutive chunks; must
            satisfy ``0 <= overlap < chunk_size``.

    Returns:
        The chunks in document order, each a space-joined run of words.
        Empty or whitespace-only text yields an empty list.

    Raises:
        ValueError: If ``chunk_size`` or ``overlap`` is out of range.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    words = text.split()
    step = chunk_size - overlap
    chunks = []
    for start in range(0, len(words), step):
        chunks.append(" ".join(words[start:start + chunk_size]))
        # Once a chunk reaches the end of the text, any further window would
        # only repeat words already covered by this chunk, so stop here
        # instead of emitting a redundant tail chunk.
        if start + chunk_size >= len(words):
            break
    return chunks
Step 4: Build the Document Ingestion Pipeline
# ingest.py
import os
import psycopg2
from embeddings import embed_text, chunk_text
def ingest_document(title: str, content: str) -> None:
    """Ingest a document into the vector database.

    The content is split with ``chunk_text``; each chunk is embedded with
    ``embed_text`` and stored as its own row in ``documents``, titled
    ``"<title> [chunk N]"`` with N starting at 1. All rows for one document
    are written in a single transaction.

    Args:
        title: Base title for the document's chunks.
        content: Full text of the document.
    """
    # Split into chunks
    chunks = chunk_text(content)

    conn = psycopg2.connect(os.getenv("DATABASE_URL"))
    try:
        # `with conn` commits on success and rolls back if anything inside
        # raises (e.g. an embedding API failure); it does not close the
        # connection, which is why the outer try/finally is still needed.
        with conn, conn.cursor() as cursor:
            for number, chunk in enumerate(chunks, start=1):
                # Generate embedding for this chunk
                embedding = embed_text(chunk)
                # Store chunk with embedding
                cursor.execute(
                    """INSERT INTO documents (title, content, embedding)
                    VALUES (%s, %s, %s)""",
                    (f"{title} [chunk {number}]", chunk, embedding),
                )
    finally:
        conn.close()

    print(f"Ingested '{title}': {len(chunks)} chunks")
def ingest_from_file(filepath: str) -> None:
    """Ingest a text file, using its base name as the document title.

    Args:
        filepath: Path to a UTF-8 encoded text file.
    """
    # Read as UTF-8 explicitly: without an encoding argument, open() uses the
    # platform default, so the same file could decode differently (or fail)
    # depending on the machine running the ingestion.
    with open(filepath, "r", encoding="utf-8") as f:
        content = f.read()
    title = os.path.basename(filepath)
    ingest_document(title, content)
if __name__ == "__main__":
    # Script entry point: ingest every .txt file found directly under documents/.
    import glob

    text_files = glob.glob("documents/*.txt")
    for path in text_files:
        ingest_from_file(path)
Step 5: Build the Retrieval Function
# retrieval.py
import os
import psycopg2
from embeddings import embed_text
def retrieve_relevant_chunks(
    query: str, top_k: int = 5, min_similarity: float = 0.7
) -> list[dict]:
    """Find the most relevant document chunks for a query.

    Args:
        query: Natural-language query; embedded with ``embed_text``.
        top_k: Maximum number of nearest chunks fetched from the database.
        min_similarity: Chunks whose cosine similarity is not strictly
            greater than this value are dropped. Defaults to 0.7.

    Returns:
        Up to ``top_k`` dicts with keys ``"title"``, ``"content"`` and
        ``"similarity"`` (a float), ordered from most to least similar.
        May be empty when nothing clears the threshold.
    """
    # Embed the query
    query_embedding = embed_text(query)

    conn = psycopg2.connect(os.getenv("DATABASE_URL"))
    try:
        with conn.cursor() as cursor:
            # `<=>` is pgvector's cosine distance, so similarity = 1 - distance.
            # Rows without an embedding are skipped: their similarity would be
            # NULL and could not be converted to a float below.
            cursor.execute(
                """
                SELECT title, content,
                       1 - (embedding <=> %s::vector) AS similarity
                FROM documents
                WHERE embedding IS NOT NULL
                ORDER BY embedding <=> %s::vector
                LIMIT %s
                """,
                (query_embedding, query_embedding, top_k),
            )
            rows = cursor.fetchall()
    finally:
        # Close the connection even if the query fails.
        conn.close()

    # Filter out very low similarity results
    return [
        {"title": title, "content": content, "similarity": float(similarity)}
        for title, content, similarity in rows
        if float(similarity) > min_similarity
    ]
Step 6: Build the RAG Query Function
# rag.py
import os
from anthropic import Anthropic
from retrieval import retrieve_relevant_chunks
# Module-level Anthropic client used by rag_query(); the API key is read from
# the ANTHROPIC_API_KEY environment variable when this module is imported.
client = Anthropic(api_key=os.getenv("ANTHROPIC_API_KEY"))
def rag_query(question: str, verbose: bool = False) -> str:
    """Answer ``question`` from retrieved document chunks.

    Retrieves up to five chunks with ``retrieve_relevant_chunks``, joins them
    into a context block, and asks Claude to answer using only that context.
    Returns a fixed fallback message when no chunks are retrieved. With
    ``verbose=True`` the retrieved titles and similarity scores are printed.
    """
    # Step 1: retrieval
    hits = retrieve_relevant_chunks(question, top_k=5)
    if not hits:
        return "I couldn't find relevant information in the documents to answer this question."

    if verbose:
        print(f"\nRetrieved {len(hits)} chunks:")
        for hit in hits:
            print(f" [{hit['similarity']:.2f}] {hit['title']}")

    # Step 2: one "Source: ..." section per chunk, separated by horizontal rules
    sections = []
    for hit in hits:
        sections.append(f"Source: {hit['title']}\n\n{hit['content']}")
    context = "\n\n---\n\n".join(sections)

    # Step 3: generation, grounded in the retrieved context
    system_prompt = """You are a helpful assistant that answers questions based on provided documents.
Rules:
- Only use information from the provided context documents
- If the answer isn't in the documents, say "I don't have that information in the provided documents"
- Cite which document each piece of information comes from
- Be concise and accurate"""
    user_message = f"""Context documents:
{context}
---
Question: {question}"""
    response = client.messages.create(
        model="claude-3-7-sonnet-20250219",
        max_tokens=1024,
        system=system_prompt,
        messages=[{"role": "user", "content": user_message}],
    )
    first_block = response.content[0]
    return first_block.text
if __name__ == "__main__":
    # Interactive CLI: read questions until the user quits.
    print("RAG Application Ready")
    print("Type your question (or 'quit' to exit)\n")
    while True:
        try:
            question = input("Question: ").strip()
        except (EOFError, KeyboardInterrupt):
            # Ctrl-D / Ctrl-C ends the session cleanly instead of with a traceback.
            print()
            break
        if question.lower() in ("quit", "exit"):
            break
        if not question:
            # Nothing to retrieve for a blank line; prompt again rather than
            # sending an empty string to the embedding API.
            continue
        answer = rag_query(question, verbose=True)
        print(f"\nAnswer:\n{answer}\n")
Step 7: Test It
# First ingest some documents
python ingest.py
# Then ask questions
python rag.py
Improving Retrieval Quality
The basic implementation works but can be improved:
1. Better chunking strategy
Chunk by semantic boundaries (paragraphs, sections) rather than arbitrary word counts:
def chunk_by_paragraph(text: str, max_words: int = 600) -> list[str]:
    """Chunk by paragraph — better for structured documents.

    Paragraphs are separated by blank lines. A blank line may contain
    whitespace or use CRLF line endings. Consecutive paragraphs are merged,
    joined by a blank line, while the merged chunk stays under ``max_words``
    words. A single paragraph at or above the limit is kept whole as its own
    chunk rather than being split.

    Args:
        text: The document text.
        max_words: Word budget per merged chunk. Defaults to 600.

    Returns:
        The chunks in document order; empty for empty or blank text.
    """
    import re

    paragraphs = [p.strip() for p in re.split(r"\n\s*\n", text) if p.strip()]

    # Merge very short paragraphs
    chunks = []
    current = ""
    current_words = 0  # running word count, so `current` is not re-split each time
    for para in paragraphs:
        para_words = len(para.split())
        if current and current_words + para_words >= max_words:
            # Adding this paragraph would reach the budget: flush and restart.
            chunks.append(current)
            current, current_words = para, para_words
        elif current:
            current = f"{current}\n\n{para}"
            current_words += para_words
        else:
            current, current_words = para, para_words
    if current:
        chunks.append(current)
    return chunks
2. Hybrid search — combine vector similarity with keyword search for better precision on specific terms.
3. Re-ranking — use a cross-encoder model to re-rank retrieved chunks by relevance before sending to Claude.
4. Parent-child chunking — store small chunks for retrieval, but inject the surrounding larger context into Claude.
Production Considerations
Rate limiting: Implement exponential backoff on embedding API calls when ingesting large document sets.
Caching: Cache embeddings for frequently queried terms. Cache Claude responses for identical queries.
Monitoring: Track retrieval quality (are the returned chunks relevant?) and answer quality (are users satisfied?).
Cost: There are two cost components. Embeddings: OpenAI text-embedding-3-small is ~$0.00002/1K tokens. Generation: Claude API costs for each answer. At scale, embedding documents at ingestion time is a one-time cost; each query then incurs a small query-embedding cost plus the generation cost.
This architecture scales to millions of documents with proper indexing. pgvector with ivfflat indexes handles 100M+ vectors efficiently.