Minimal working RAG trong Python ~80 dòng (không framework), để show các thành phần rõ ràng:
from openai import OpenAI
import numpy as np
client = OpenAI()
# 1. CHUNKING — đơn giản hóa bằng split đoạn
def chunk(text: str, size: int = 500, overlap: int = 50) -> list[str]:
    """Split *text* into overlapping character chunks.

    Args:
        text: Raw document text.
        size: Maximum characters per chunk.
        overlap: Characters shared between consecutive chunks. Must be
            smaller than *size*, otherwise the scan position never advances.

    Returns:
        Chunks in document order; the last one may be shorter than *size*.
        An empty *text* yields an empty list.

    Raises:
        ValueError: If ``overlap >= size`` (would loop forever).
    """
    if overlap >= size:
        raise ValueError("overlap must be smaller than size")
    chunks: list[str] = []
    i = 0
    while i < len(text):
        chunks.append(text[i:i + size])
        # Stop once the current window reaches the end of the text; without
        # this break the step below would emit one final chunk that is
        # entirely contained in the previous chunk (pure duplicate content).
        if i + size >= len(text):
            break
        i += size - overlap
    return chunks
# 2. EMBEDDING
def embed(texts: list[str]) -> np.ndarray:
    """Embed every string in *texts* with the OpenAI embeddings API.

    Returns a 2-D array with one embedding row per input text, in the
    same order as *texts*.
    """
    response = client.embeddings.create(
        model="text-embedding-3-small",
        input=texts,
    )
    rows = [item.embedding for item in response.data]
    return np.array(rows)
# 3. INDEX in-memory (production: Qdrant/Pinecone)
class SimpleVectorStore:
    """In-memory vector index (production: Qdrant/Pinecone).

    Keeps text chunks alongside L2-normalized embedding vectors, so cosine
    similarity reduces to a plain dot product at query time.
    """

    def __init__(self):
        # Parallel structures: self.chunks[i] is the text for row i of
        # self.vectors. vectors stays None until the first add().
        self.chunks: list[str] = []
        self.vectors: np.ndarray | None = None

    def add(self, docs: list[str]) -> None:
        """Chunk, embed, normalize and index every document in *docs*."""
        all_chunks = [c for d in docs for c in chunk(d)]
        if not all_chunks:
            # Nothing to index — also avoids calling the embeddings API
            # with an empty input list.
            return
        vecs = embed(all_chunks)
        # L2 normalize so that cosine similarity == dot product in search().
        vecs = vecs / np.linalg.norm(vecs, axis=1, keepdims=True)
        self.chunks.extend(all_chunks)
        self.vectors = (
            vecs if self.vectors is None
            else np.vstack([self.vectors, vecs])
        )

    def search(self, query: str, k: int = 3) -> list[tuple[float, str]]:
        """Return up to *k* ``(score, chunk)`` pairs, best score first.

        Returns an empty list when nothing has been indexed yet; the
        original raised on ``None @ q`` in that case.
        """
        if self.vectors is None:
            return []
        q = embed([query])[0]
        q = q / np.linalg.norm(q)
        scores = self.vectors @ q  # cosine similarity (rows are unit-norm)
        top = np.argsort(scores)[::-1][:k]  # slicing past the end is safe
        return [(float(scores[i]), self.chunks[i]) for i in top]
# 4. GENERATION with retrieved context
def rag_answer(store: SimpleVectorStore, question: str) -> str:
    """Answer *question* grounded in chunks retrieved from *store* (RAG)."""
    # Retrieve the best-matching chunks and label each for citation.
    retrieved = store.search(question, k=3)
    labeled = []
    for doc_num, (_, passage) in enumerate(retrieved, start=1):
        labeled.append(f"[doc {doc_num}] {passage}")
    context = "\n\n".join(labeled)
    prompt = f"""Trả lời CHỈ dựa trên CONTEXT. Nếu không có thông tin, nói "Tôi không biết". Trích [doc N] cho mỗi claim.\n\nCONTEXT:\n{context}\n\nQUESTION: {question}"""
    # Temperature 0 keeps the answer deterministic and context-faithful.
    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[{"role": "user", "content": prompt}],
        temperature=0.0,
    )
    return response.choices[0].message.content
# --- USAGE ---
store = SimpleVectorStore()
# Context manager + explicit encoding: the original open(...).read() leaked
# the file handle and relied on the platform default encoding.
with open("docs.txt", encoding="utf-8") as f:
    store.add([f.read()])
print(rag_answer(store, "Sản phẩm hỗ trợ đổi trả trong bao nhiêu ngày?"))

Production upgrades:
1. Chunking — dùng RecursiveCharacterTextSplitter (LangChain) hoặc semantic chunking thay vì split raw.
2. Vector DB — thay SimpleVectorStore bằng Qdrant/Pinecone/pgvector; persist, scale, filter metadata.
3. Hybrid search — thêm BM25 (rank_bm25) + RRF fusion.
4. Reranker — thêm cross-encoder (Cohere Rerank, BGE) sau vector search top-20 → keep top-3.
5. Metadata — lưu source, section, timestamp với mỗi chunk; filter runtime.
6. Streaming — stream=True cho UX mượt.
7. Citation parsing — regex extract [doc N] → map về source URL.
8. Caching — semantic cache (GPTCache) cho query tương tự.
9. Observability — trace retrieve + generate (Langfuse).
10. Eval — golden dataset + RAGAS faithfulness.
Framework production (ít code hơn): LangChain, LlamaIndex, Haystack, hoặc Vercel AI SDK với @vercel/postgres + pgvector.