
Build a RAG System from Scratch with Python and pgvector (No LangChain, 2026)

What is RAG and Why Build It from Scratch?

RAG (Retrieval-Augmented Generation) lets an LLM answer questions about your documents by: 1. Finding the most relevant document chunks (retrieval) 2. Adding them to the prompt (augmentation) 3. Letting the LLM generate an answer (generation)

Why skip LangChain? LangChain adds abstraction that's helpful for prototypes but complex for production. Building RAG from scratch means you understand every component and can optimize it.

Stack: Python + psycopg2 + pgvector + Ollama (free, local)

Prerequisites

# Install PostgreSQL with pgvector
sudo apt install postgresql postgresql-contrib
sudo -u postgres psql -c "CREATE EXTENSION IF NOT EXISTS vector;"

# Install Python deps
pip install psycopg2-binary requests

# Pull embedding model
ollama pull nomic-embed-text
ollama pull llama3.2

Step 1: Database Setup

import psycopg2

# Connect to the local PostgreSQL instance; the ragdb database must exist.
# NOTE(review): credentials are hard-coded for the tutorial — load them from
# environment variables or a secrets manager in production.
conn = psycopg2.connect("postgresql://postgres:password@localhost/ragdb")
cur = conn.cursor()

# Idempotent: enable pgvector in this database (mirrors the Prerequisites step).
cur.execute("CREATE EXTENSION IF NOT EXISTS vector;")

# One row per document chunk. vector(768) matches the output dimension of
# the nomic-embed-text embedding model used below.
cur.execute("""
    CREATE TABLE IF NOT EXISTS documents (
        id SERIAL PRIMARY KEY,
        content TEXT NOT NULL,
        source TEXT,
        chunk_index INTEGER,
        embedding vector(768)
    );
""")

# IVFFlat approximate-nearest-neighbor index on cosine distance; lists=100
# trades a little recall for much faster search on larger tables.
cur.execute("""
    CREATE INDEX IF NOT EXISTS documents_embedding_idx
    ON documents USING ivfflat (embedding vector_cosine_ops)
    WITH (lists = 100);
""")

conn.commit()

Step 2: Generate Embeddings with Ollama

import requests

def embed(text: str) -> list[float]:
    """Return the 768-dim embedding of *text* from the local Ollama server.

    Args:
        text: The text to embed (a chunk or a query).

    Returns:
        The embedding vector as a list of floats.

    Raises:
        requests.Timeout: if Ollama does not respond within 30 seconds.
        requests.HTTPError: if Ollama returns a non-2xx status.
    """
    response = requests.post(
        "http://localhost:11434/api/embeddings",
        json={"model": "nomic-embed-text", "prompt": text},
        timeout=30,  # without a timeout a stuck server hangs the pipeline forever
    )
    # Fail loudly on HTTP errors instead of surfacing a confusing KeyError below.
    response.raise_for_status()
    return response.json()["embedding"]

Step 3: Chunk Documents

def chunk_text(text: str, chunk_size: int = 500, overlap: int = 50) -> list[str]:
    """Split *text* into overlapping word-based chunks.

    Args:
        text: Raw document text.
        chunk_size: Maximum number of words per chunk; must be positive.
        overlap: Number of words shared between consecutive chunks;
            must satisfy 0 <= overlap < chunk_size.

    Returns:
        A list of non-empty chunk strings ([] for empty/whitespace-only text).

    Raises:
        ValueError: if chunk_size <= 0 or overlap is not in [0, chunk_size).
            Previously overlap >= chunk_size either crashed inside range()
            (step 0) or silently returned [] (negative step).
    """
    if chunk_size <= 0 or overlap < 0 or overlap >= chunk_size:
        raise ValueError("require chunk_size > 0 and 0 <= overlap < chunk_size")

    words = text.split()
    step = chunk_size - overlap
    # Slices past the end are simply shorter; every i < len(words) yields a
    # non-empty slice, so no extra emptiness filtering is needed.
    return [
        " ".join(words[i:i + chunk_size])
        for i in range(0, len(words), step)
    ]

# Example
# Read the whole source document into memory (fine for tutorial-sized files).
with open("my_document.txt") as f:
    text = f.read()

chunks = chunk_text(text)
print(f"Split into {len(chunks)} chunks")

Step 4: Index Documents

import json

def index_document(source: str, text: str):
    """Chunk *text*, embed each chunk, and insert the rows under *source*."""
    pieces = chunk_text(text)

    insert_sql = """
            INSERT INTO documents (content, source, chunk_index, embedding)
            VALUES (%s, %s, %s, %s::vector)
            """
    for position, piece in enumerate(pieces):
        # pgvector accepts a JSON-style array literal cast with ::vector.
        vector = embed(piece)
        cur.execute(insert_sql, (piece, source, position, json.dumps(vector)))

    conn.commit()
    print(f"Indexed {len(pieces)} chunks from {source}")

# Index your documents. Context managers close each file handle promptly —
# the original open(...).read() left handles open until garbage collection.
for path in ("python_docs.txt", "flask_guide.txt"):
    with open(path, encoding="utf-8") as f:  # explicit encoding: portable across platforms
        index_document(path, f.read())

Step 5: Retrieve Relevant Chunks

def retrieve(query: str, top_k: int = 5) -> list[dict]:
    """Return the *top_k* chunks nearest to *query* by cosine distance.

    Each result is a dict with "content", "source", and "similarity"
    (1 - cosine distance, so higher means more similar).
    """
    # Embed once and serialize once; the same literal feeds both SQL slots.
    vec = json.dumps(embed(query))

    cur.execute(
        """
        SELECT content, source,
               1 - (embedding <=> %s::vector) AS similarity
        FROM documents
        ORDER BY embedding <=> %s::vector
        LIMIT %s
        """,
        (vec, vec, top_k)
    )

    hits = []
    for content, source, similarity in cur.fetchall():
        hits.append({"content": content, "source": source, "similarity": similarity})
    return hits

# Test retrieval
# Print similarity score, source file, and the first 100 chars of each hit.
results = retrieve("How do I create a Flask route?")
for r in results:
    print(f"[{r['similarity']:.3f}] {r['source']}: {r['content'][:100]}...")

Step 6: Generate Answer with LLM

def generate(query: str, context_chunks: list[dict]) -> str:
    """Ask the local LLM to answer *query* grounded in *context_chunks*.

    Args:
        query: The user's question.
        context_chunks: Retrieval results; each dict needs "source" and "content".

    Returns:
        The model's answer text.

    Raises:
        requests.Timeout: if generation exceeds 120 seconds.
        requests.HTTPError: if Ollama returns a non-2xx status.
    """
    context = "\n\n".join([
        f"Source: {c['source']}\n{c['content']}"
        for c in context_chunks
    ])

    prompt = f"""Answer the question based on the provided context.
If the answer is not in the context, say so.

Context:
{context}

Question: {query}

Answer:"""

    response = requests.post(
        "http://localhost:11434/api/generate",
        json={
            "model": "llama3.2",
            "prompt": prompt,
            "stream": False
        },
        timeout=120,  # generation is slow, but never wait forever on a hung server
    )
    # Surface HTTP errors directly rather than as a KeyError on "response".
    response.raise_for_status()
    return response.json()["response"]

Step 7: Complete RAG Pipeline

def rag(query: str) -> str:
    """Full RAG pipeline: retrieve, gate on relevance, then generate."""
    # Retrieval and the 0.7 cosine-similarity gate in a single pass.
    relevant = [
        hit for hit in retrieve(query, top_k=5) if hit["similarity"] > 0.7
    ]

    # Guard clause: refuse to answer rather than hallucinate from nothing.
    if not relevant:
        return "I couldn't find relevant information to answer your question."

    return generate(query, relevant)

# Chat with your documents
# End-to-end smoke test: each call embeds the query, retrieves, and generates.
print(rag("How do I handle errors in Flask?"))
print(rag("What is the difference between GET and POST?"))

Step 8: Interactive Chat Loop

def chat():
    """Interactive REPL over the RAG pipeline.

    Ends on 'quit'/'exit', Ctrl-D (EOF), or Ctrl-C — the original
    tracebacked on EOFError/KeyboardInterrupt from input().
    """
    print("RAG Chatbot — ask questions about your documents (type 'quit' to exit)")

    while True:
        try:
            query = input("\nYou: ").strip()
        except (EOFError, KeyboardInterrupt):
            break  # exit cleanly instead of an ugly traceback
        if query.lower() in ("quit", "exit"):
            break
        if not query:
            continue  # skip blank input rather than querying the LLM with ""

        answer = rag(query)
        print(f"\nAssistant: {answer}")

chat()

Add Metadata Filtering

def retrieve_filtered(query: str, source: str, top_k: int = 5):
    """Vector search restricted to chunks whose source matches exactly.

    Unlike retrieve(), this returns raw (content, source, similarity)
    tuples straight from the cursor rather than dicts.
    """
    vec = json.dumps(embed(query))

    filtered_sql = """
        SELECT content, source, 1 - (embedding <=> %s::vector) AS similarity
        FROM documents
        WHERE source = %s
        ORDER BY embedding <=> %s::vector
        LIMIT %s
        """
    cur.execute(filtered_sql, (vec, source, vec, top_k))
    return cur.fetchall()

# Search only in Flask docs
# The metadata filter narrows the vector search to a single source document.
results = retrieve_filtered("routing", "flask_guide.txt")

Leonardo Lazzaro

Software engineer and technical writer. 10+ years experience in DevOps, Python, and Linux systems.

More articles by Leonardo Lazzaro