Retrieval-Augmented Generation (RAG) lets LLMs answer questions about your private documents. This guide builds a complete RAG pipeline using LangChain for orchestration and PostgreSQL with the pgvector extension as the vector store, keeping everything self-hosted.
Architecture
Documents → Split → Embed → pgvector store
                                  ↓
User query → Embed → Similarity search → LLM + context → Answer
Setup
PostgreSQL with pgvector
# Using Docker
docker run -d \
  --name pgvector \
  -e POSTGRES_PASSWORD=secret \
  -e POSTGRES_DB=ragdb \
  -p 5432:5432 \
  pgvector/pgvector:pg16
# Or install extension on existing PostgreSQL 16
sudo apt install postgresql-16-pgvector
psql -U postgres -c "CREATE EXTENSION IF NOT EXISTS vector;"
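To confirm the extension is active in your database, check the pg_extension catalog:
psql -U postgres -d ragdb -c "SELECT extname, extversion FROM pg_extension WHERE extname = 'vector';"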
Python Dependencies
pip install langchain langchain-community langchain-openai
pip install psycopg2-binary pgvector
pip install langchain-ollama # for local LLMs
pip install pypdf unstructured # document loaders
pip install python-dotenv
.env:
OPENAI_API_KEY=sk-... # optional if using Ollama
POSTGRES_HOST=localhost
POSTGRES_PORT=5432
POSTGRES_DB=ragdb
POSTGRES_USER=postgres
POSTGRES_PASSWORD=secret
Loading Documents
from langchain_community.document_loaders import (
    PyPDFLoader,
    TextLoader,
    DirectoryLoader,
    WebBaseLoader,
)
# Load a single PDF
loader = PyPDFLoader("docs/manual.pdf")
documents = loader.load()
# Load all PDFs from a directory
loader = DirectoryLoader("docs/", glob="**/*.pdf", loader_cls=PyPDFLoader)
documents = loader.load()
# Load text files
loader = DirectoryLoader("docs/", glob="**/*.txt", loader_cls=TextLoader)
documents = loader.load()
# Load from web
loader = WebBaseLoader(["https://docs.python.org/3/tutorial/"])
documents = loader.load()
print(f"Loaded {len(documents)} documents")
print(f"First doc preview: {documents[0].page_content[:200]}")
Splitting Documents
from langchain.text_splitter import RecursiveCharacterTextSplitter
splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,    # characters per chunk
    chunk_overlap=200,  # overlap between chunks preserves context across boundaries
    length_function=len,
    separators=["\n\n", "\n", ". ", " ", ""],
)
chunks = splitter.split_documents(documents)
print(f"Split into {len(chunks)} chunks")
print(f"Example chunk:\n{chunks[0].page_content}")
print(f"Metadata: {chunks[0].metadata}")
Embedding Models
Option 1: OpenAI Embeddings
from langchain_openai import OpenAIEmbeddings
embeddings = OpenAIEmbeddings(
    model="text-embedding-3-small",    # 1536 dimensions, cheap
    # model="text-embedding-3-large",  # 3072 dimensions, higher quality
)
Option 2: Ollama Embeddings (Local, Free)
ollama pull nomic-embed-text
from langchain_ollama import OllamaEmbeddings
embeddings = OllamaEmbeddings(
    model="nomic-embed-text",
    base_url="http://localhost:11434",
)
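Whichever backend you choose, a quick sanity check confirms the model is reachable and shows the vector dimension (nomic-embed-text produces 768-dimensional vectors; text-embedding-3-small produces 1536):
vec = embeddings.embed_query("hello world")
print(f"Embedding dimension: {len(vec)}")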
Creating the pgvector Store
import os
from langchain_community.vectorstores import PGVector
from dotenv import load_dotenv
load_dotenv()
CONNECTION_STRING = PGVector.connection_string_from_db_params(
    driver="psycopg2",
    host=os.getenv("POSTGRES_HOST", "localhost"),
    port=int(os.getenv("POSTGRES_PORT", 5432)),
    database=os.getenv("POSTGRES_DB", "ragdb"),
    user=os.getenv("POSTGRES_USER", "postgres"),
    password=os.getenv("POSTGRES_PASSWORD", "secret"),
)
COLLECTION_NAME = "my_documents"
# Create vector store and ingest documents
vectorstore = PGVector.from_documents(
    documents=chunks,
    embedding=embeddings,
    collection_name=COLLECTION_NAME,
    connection_string=CONNECTION_STRING,
    pre_delete_collection=False,  # True to wipe the collection and re-ingest from scratch
)
print("Documents ingested into pgvector!")
Connect to Existing Store
vectorstore = PGVector(
    collection_name=COLLECTION_NAME,
    connection_string=CONNECTION_STRING,
    embedding_function=embeddings,
)
Similarity Search
# Basic similarity search
query = "How do I reset a forgotten password?"
results = vectorstore.similarity_search(query, k=4)
for i, doc in enumerate(results):
    print(f"\n--- Result {i+1} ---")
    print(f"Source: {doc.metadata.get('source', 'unknown')}")
    print(f"Content: {doc.page_content[:300]}")
# With similarity scores (PGVector returns a distance: lower = more similar)
results_with_scores = vectorstore.similarity_search_with_score(query, k=4)
for doc, score in results_with_scores:
    print(f"Score: {score:.4f} | {doc.page_content[:100]}")
# MMR (Maximal Marginal Relevance) trades relevance for diversity:
# fetch_k candidates are retrieved, then k are selected;
# lambda_mult=1.0 is pure relevance, 0.0 is maximum diversity
results = vectorstore.max_marginal_relevance_search(
    query, k=4, fetch_k=20, lambda_mult=0.5
)
Building the RAG Chain
Option 1: Local LLM with Ollama
from langchain_ollama import OllamaLLM
llm = OllamaLLM(
    model="llama3.2",
    base_url="http://localhost:11434",
    temperature=0.1,
)
Option 2: OpenAI
from langchain_openai import ChatOpenAI
llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)
Simple RAG Chain
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
prompt_template = """You are a helpful assistant. Use the following context to answer the question.
If the answer is not in the context, say "I don't have enough information to answer this."
Do not make up information.
Context:
{context}
Question: {question}
Answer:"""
PROMPT = PromptTemplate(
    template=prompt_template,
    input_variables=["context", "question"],
)

retriever = vectorstore.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 4},
)

qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",  # "stuff" packs all retrieved docs into a single prompt
    retriever=retriever,
    chain_type_kwargs={"prompt": PROMPT},
    return_source_documents=True,
)
# Ask a question
result = qa_chain.invoke({"query": "How do I reset my password?"})
print("Answer:", result["result"])
print("\nSources:")
for doc in result["source_documents"]:
    print(f" - {doc.metadata.get('source', 'unknown')} (page {doc.metadata.get('page', '?')})")
LCEL Chain (Modern LangChain Style)
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnableParallel, RunnablePassthrough
prompt = ChatPromptTemplate.from_template("""
Answer the question based only on the following context:
{context}
Question: {question}
If you cannot answer from the context alone, say so clearly.
""")
def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

rag_chain = (
    RunnableParallel(
        context=(retriever | format_docs),
        question=RunnablePassthrough(),
    )
    | prompt
    | llm
    | StrOutputParser()
)
answer = rag_chain.invoke("What are the system requirements?")
print(answer)
# Streaming
for chunk in rag_chain.stream("Explain the installation process"):
    print(chunk, end="", flush=True)
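The chain above returns only the answer string. If you also want the retrieved documents back for citations, one common LCEL pattern is to keep the parallel dict and assign the answer into it; a sketch reusing the retriever, prompt, llm, and format_docs defined above:
answer_chain = (
    (lambda x: {"context": format_docs(x["context"]), "question": x["question"]})
    | prompt
    | llm
    | StrOutputParser()
)

rag_with_sources = (
    RunnableParallel(context=retriever, question=RunnablePassthrough())
    | RunnablePassthrough.assign(answer=answer_chain)
)

result = rag_with_sources.invoke("What are the system requirements?")
print(result["answer"])
for doc in result["context"]:
    print(" -", doc.metadata.get("source", "unknown"))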
Conversational RAG (with Memory)
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory
memory = ConversationBufferMemory(
    memory_key="chat_history",
    return_messages=True,
    output_key="answer",  # required so memory stores the answer, not the source docs
)

conversation_chain = ConversationalRetrievalChain.from_llm(
    llm=llm,
    retriever=retriever,
    memory=memory,
    return_source_documents=True,
    verbose=False,
)
# Multi-turn conversation
def chat(question: str):
    result = conversation_chain.invoke({"question": question})
    print(f"Q: {question}")
    print(f"A: {result['answer']}\n")
    return result["answer"]
chat("What is this document about?")
chat("Can you give me more details about the first point?")
chat("How does that relate to what you said before?")
Complete Working Example
#!/usr/bin/env python3
"""Complete RAG pipeline with LangChain + pgvector + Ollama"""
import os
from pathlib import Path
from dotenv import load_dotenv
from langchain_community.document_loaders import DirectoryLoader, TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_ollama import OllamaEmbeddings, OllamaLLM
from langchain_community.vectorstores import PGVector
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnableParallel, RunnablePassthrough
load_dotenv()
CONN_STR = f"postgresql+psycopg2://{os.getenv('POSTGRES_USER')}:{os.getenv('POSTGRES_PASSWORD')}@{os.getenv('POSTGRES_HOST')}/{os.getenv('POSTGRES_DB')}"
COLLECTION = "my_docs"
embeddings = OllamaEmbeddings(model="nomic-embed-text")
llm = OllamaLLM(model="llama3.2", temperature=0.1)
def ingest_documents(docs_path: str):
    """Load, split, and store documents."""
    print(f"Loading documents from {docs_path}...")
    loader = DirectoryLoader(docs_path, glob="**/*.txt", loader_cls=TextLoader)
    docs = loader.load()
    print(f"Loaded {len(docs)} documents")
    splitter = RecursiveCharacterTextSplitter(chunk_size=800, chunk_overlap=150)
    chunks = splitter.split_documents(docs)
    print(f"Created {len(chunks)} chunks")
    print("Embedding and storing in pgvector...")
    vectorstore = PGVector.from_documents(
        documents=chunks,
        embedding=embeddings,
        collection_name=COLLECTION,
        connection_string=CONN_STR,
        pre_delete_collection=True,
    )
    print("Done! Documents stored in pgvector.")
    return vectorstore
def get_vectorstore():
    return PGVector(
        collection_name=COLLECTION,
        connection_string=CONN_STR,
        embedding_function=embeddings,
    )
def build_rag_chain(vectorstore):
    retriever = vectorstore.as_retriever(search_kwargs={"k": 3})
    prompt = ChatPromptTemplate.from_template("""
Answer the question using only the context below.
Say "I don't know" if the answer isn't in the context.
Context:
{context}
Question: {question}
Answer:""")
    chain = (
        RunnableParallel(
            context=(retriever | (lambda docs: "\n\n".join(d.page_content for d in docs))),
            question=RunnablePassthrough(),
        )
        | prompt
        | llm
        | StrOutputParser()
    )
    return chain
if __name__ == "__main__":
    # First run: create sample docs if none exist, then ingest
    if not Path("./docs").exists():
        Path("./docs").mkdir()
        Path("./docs/example.txt").write_text(
            "This is a sample document about Python programming.\n"
            "Python is a high-level programming language.\n"
            "It is widely used for web development, data science, and automation."
        )
    vs = ingest_documents("./docs")
    chain = build_rag_chain(vs)

    questions = [
        "What is Python used for?",
        "Is Python a low-level language?",
        "What is the capital of France?",  # not in docs
    ]
    for q in questions:
        print(f"\nQ: {q}")
        print(f"A: {chain.invoke(q)}")
Performance Tips
Batch your ingestion:
# Ingest in batches so a single slow or failed embedding request
# doesn't force you to restart the whole run
from langchain_community.vectorstores import PGVector

vectorstore = PGVector(
    collection_name=COLLECTION,
    connection_string=CONN_STR,
    embedding_function=embeddings,
)
batch_size = 100
for i in range(0, len(chunks), batch_size):
    vectorstore.add_documents(chunks[i:i + batch_size])
    print(f"Ingested {min(i + batch_size, len(chunks))}/{len(chunks)} chunks")
Index for faster search:
-- Connect to your PostgreSQL and add an HNSW index for fast approximate search.
-- Note: pgvector can only index columns with a fixed dimension; if LangChain
-- created the embedding column without one, set it first, e.g.:
--   ALTER TABLE langchain_pg_embedding
--       ALTER COLUMN embedding TYPE vector(768);  -- 768 = nomic-embed-text
CREATE INDEX ON langchain_pg_embedding
    USING hnsw (embedding vector_cosine_ops)
    WITH (m = 16, ef_construction = 64);
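At query time you can trade speed for recall with pgvector's hnsw.ef_search setting (default 40); higher values search more of the graph:
-- Per-session tuning: higher ef_search = better recall, slower queries
SET hnsw.ef_search = 100;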
Metadata filtering:
# Filter by document source
results = vectorstore.similarity_search(
    query,
    k=4,
    filter={"source": "docs/manual.pdf"},
)
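The same filter can be baked into a retriever, so filtered retrieval plugs straight into the chains above:
filtered_retriever = vectorstore.as_retriever(
    search_kwargs={"k": 4, "filter": {"source": "docs/manual.pdf"}}
)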
Summary
You built a complete RAG pipeline:
- Load documents (PDF, text, web) with LangChain document loaders
- Split into chunks with RecursiveCharacterTextSplitter
- Embed with Ollama (nomic-embed-text) or OpenAI
- Store in pgvector via LangChain's PGVector
- Retrieve similar chunks at query time
- Generate answers by passing retrieved context to an LLM
The pgvector + PostgreSQL combination keeps your data self-hosted while still getting production-grade vector search. Switch between Ollama (free, local) and OpenAI (better quality) by swapping the embedding and LLM classes.
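As a concrete example of that swap, only the two constructor lines change; the rest of the pipeline is untouched:
# Local (free):
from langchain_ollama import OllamaEmbeddings, OllamaLLM
embeddings = OllamaEmbeddings(model="nomic-embed-text")
llm = OllamaLLM(model="llama3.2", temperature=0.1)

# Hosted (OpenAI):
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)
Keep in mind that changing the embedding model requires re-ingesting your documents: vectors from different models have different dimensions and aren't comparable.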