Build a RAG System from Scratch with Python and pgvector (No LangChain, 2026)
What is RAG and Why Build It from Scratch?
RAG (Retrieval-Augmented Generation) lets an LLM answer questions about your documents by:
1. Finding the most relevant document chunks (retrieval)
2. Adding them to the prompt (augmentation)
3. Letting the LLM generate an answer (generation)
Why skip LangChain? Its abstractions speed up prototypes, but in production they hide exactly the details you need to debug and tune. Building RAG from scratch means you understand every component and can optimize each one.
Stack: Python + psycopg2 + pgvector + Ollama (free, local)
Prerequisites
# Install PostgreSQL plus the pgvector extension
# (the pgvector package name tracks your PostgreSQL major version)
sudo apt install postgresql postgresql-contrib postgresql-16-pgvector
# Create the project database and enable pgvector in it
sudo -u postgres createdb ragdb
sudo -u postgres psql -d ragdb -c "CREATE EXTENSION IF NOT EXISTS vector;"
# Install Python deps
pip install psycopg2-binary requests
# Pull embedding model
ollama pull nomic-embed-text
ollama pull llama3.2
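Before going further, it's worth a quick sanity check that both pieces are in place (this assumes the ragdb database created above):

ollama list    # should list nomic-embed-text and llama3.2
sudo -u postgres psql -d ragdb -c "SELECT extversion FROM pg_extension WHERE extname = 'vector';"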
Step 1: Database Setup
import psycopg2

# Adjust the credentials to match your local setup
conn = psycopg2.connect("postgresql://postgres:password@localhost/ragdb")
cur = conn.cursor()

cur.execute("CREATE EXTENSION IF NOT EXISTS vector;")

cur.execute("""
    CREATE TABLE IF NOT EXISTS documents (
        id SERIAL PRIMARY KEY,
        content TEXT NOT NULL,
        source TEXT,
        chunk_index INTEGER,
        embedding vector(768)  -- 768 dimensions: nomic-embed-text's output size
    );
""")

cur.execute("""
    CREATE INDEX IF NOT EXISTS documents_embedding_idx
    ON documents USING ivfflat (embedding vector_cosine_ops)
    WITH (lists = 100);
""")
conn.commit()
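One pgvector caveat worth knowing: an IVFFlat index derives its list centroids from the rows present when it is built, so an index built on an empty table gives poor recall. If you create the index up front as above, consider rebuilding it after the bulk load in Step 4:

# Rebuild once data is loaded so the centroids reflect real data
cur.execute("REINDEX INDEX documents_embedding_idx;")
conn.commit()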
Step 2: Generate Embeddings with Ollama
import requests

def embed(text: str) -> list[float]:
    """Return the embedding for `text` from Ollama's embeddings endpoint."""
    response = requests.post(
        "http://localhost:11434/api/embeddings",
        json={"model": "nomic-embed-text", "prompt": text},
    )
    response.raise_for_status()  # fail loudly if Ollama isn't running
    return response.json()["embedding"]
Step 3: Chunk Documents
def chunk_text(text: str, chunk_size: int = 500, overlap: int = 50) -> list[str]:
    """Split text into overlapping chunks of roughly chunk_size words."""
    if overlap >= chunk_size:
        raise ValueError("overlap must be smaller than chunk_size")
    words = text.split()
    chunks = []
    for i in range(0, len(words), chunk_size - overlap):
        chunk = " ".join(words[i:i + chunk_size])
        if chunk:
            chunks.append(chunk)
    return chunks
# Example
with open("my_document.txt") as f:
text = f.read()
chunks = chunk_text(text)
print(f"Split into {len(chunks)} chunks")
Step 4: Index Documents
import json

def index_document(source: str, text: str):
    """Chunk, embed, and store a single document."""
    chunks = chunk_text(text)
    for i, chunk in enumerate(chunks):
        embedding = embed(chunk)
        # json.dumps yields "[0.1, 0.2, ...]", which casts cleanly to vector
        cur.execute(
            """
            INSERT INTO documents (content, source, chunk_index, embedding)
            VALUES (%s, %s, %s, %s::vector)
            """,
            (chunk, source, i, json.dumps(embedding)),
        )
    conn.commit()
    print(f"Indexed {len(chunks)} chunks from {source}")
# Index your documents
for path in ("python_docs.txt", "flask_guide.txt"):
    with open(path) as f:
        index_document(path, f.read())
Step 5: Retrieve Relevant Chunks
def retrieve(query: str, top_k: int = 5) -> list[dict]:
    """Return the top_k most similar chunks for a query."""
    query_embedding = embed(query)
    cur.execute(
        """
        SELECT content, source,
               1 - (embedding <=> %s::vector) AS similarity  -- <=> is cosine distance
        FROM documents
        ORDER BY embedding <=> %s::vector
        LIMIT %s
        """,
        (json.dumps(query_embedding), json.dumps(query_embedding), top_k),
    )
    return [
        {"content": r[0], "source": r[1], "similarity": r[2]}
        for r in cur.fetchall()
    ]
# Test retrieval
results = retrieve("How do I create a Flask route?")
for r in results:
print(f"[{r['similarity']:.3f}] {r['source']}: {r['content'][:100]}...")
Step 6: Generate Answer with LLM
def generate(query: str, context_chunks: list[dict]) -> str:
    """Ask the LLM to answer the query using only the retrieved context."""
    context = "\n\n".join(
        f"Source: {c['source']}\n{c['content']}"
        for c in context_chunks
    )
    prompt = f"""Answer the question based on the provided context.
If the answer is not in the context, say so.

Context:
{context}

Question: {query}

Answer:"""
    response = requests.post(
        "http://localhost:11434/api/generate",
        json={
            "model": "llama3.2",
            "prompt": prompt,
            "stream": False,
        },
    )
    response.raise_for_status()
    return response.json()["response"]
Step 7: Complete RAG Pipeline
def rag(query: str) -> str:
# 1. Retrieve relevant chunks
chunks = retrieve(query, top_k=5)
    # 2. Filter by a relevance threshold (0.7 is a starting point;
    #    useful values vary by embedding model, so tune on your data)
    relevant = [c for c in chunks if c["similarity"] > 0.7]
if not relevant:
return "I couldn't find relevant information to answer your question."
# 3. Generate answer
answer = generate(query, relevant)
return answer
# Chat with your documents
print(rag("How do I handle errors in Flask?"))
print(rag("What is the difference between GET and POST?"))
Step 8: Interactive Chat Loop
def chat():
print("RAG Chatbot — ask questions about your documents (type 'quit' to exit)")
while True:
query = input("\nYou: ").strip()
if query.lower() in ("quit", "exit"):
break
answer = rag(query)
print(f"\nAssistant: {answer}")
if __name__ == "__main__":
    chat()
Add Metadata Filtering
def retrieve_filtered(query: str, source: str, top_k: int = 5) -> list[dict]:
    """Like retrieve(), but restricted to a single source document."""
    query_embedding = embed(query)
    cur.execute(
        """
        SELECT content, source, 1 - (embedding <=> %s::vector) AS similarity
        FROM documents
        WHERE source = %s
        ORDER BY embedding <=> %s::vector
        LIMIT %s
        """,
        (json.dumps(query_embedding), source, json.dumps(query_embedding), top_k),
    )
    return [
        {"content": r[0], "source": r[1], "similarity": r[2]}
        for r in cur.fetchall()
    ]
# Search only in Flask docs
results = retrieve_filtered("routing", "flask_guide.txt")