03 — Build RAG Chatbot dari Nol
Estimasi: 8 jam. Tujuan: build personal RAG chatbot end-to-end. Output: project lengkap di GitHub yang bisa di-demo.
Project Spec
Nama: Personal Knowledge Assistant. Goal: chatbot yang menjawab pertanyaan berdasarkan catatan/dokumen pribadimu (PDF, markdown, notes). User flow:
- User upload dokumen
- System index
- User chat — tanya tentang konten
- System jawab dengan source citation
Bonus:
- Multi-document support
- Source highlighting
- Chat history
- Export conversation
Step 1: Setup Project
mkdir personal-kb-assistant
cd personal-kb-assistant
# Virtual env
conda create -n kb python=3.11
conda activate kb
# Install runtime dependencies
pip install llama-index llama-index-llms-gemini llama-index-embeddings-huggingface
# docx2txt: LlamaIndex's DOCX reader imports it when loading .docx files
pip install streamlit pypdf python-docx docx2txt
pip install python-dotenv
# Test runner used by tests/test_rag.py
pip install pytest
# Setup .env (the real key stays out of version control)
echo "GEMINI_API_KEY=your_key_here" > .env
# Committed template so a fresh clone can run `cp .env.example .env`
echo "GEMINI_API_KEY=your_key_here" > .env.example
echo ".env" >> .gitignore
# The persisted index is a build artifact, so ignore it as well
echo "storage/" >> .gitignore
Struktur Folder
personal-kb-assistant/
├── README.md
├── requirements.txt
├── .env
├── .env.example
├── .gitignore
├── app.py ← Streamlit UI
├── src/
│ ├── __init__.py
│ ├── ingestion.py ← document processing
│ ├── rag.py ← RAG engine
│ └── utils.py
├── data/
│ └── sample.pdf
├── storage/ ← persistent index (gitignored)
└── tests/
└── test_rag.py
Step 2: Document Ingestion
# src/ingestion.py
from pathlib import Path
from llama_index.core import SimpleDirectoryReader, Document
from llama_index.core.node_parser import SentenceSplitter
import logging
logger = logging.getLogger(__name__)
class DocumentIngester:
    """Load documents from disk and split them into chunks for indexing.

    Args:
        chunk_size: Forwarded to ``SentenceSplitter`` as ``chunk_size``.
        chunk_overlap: Forwarded to ``SentenceSplitter`` as ``chunk_overlap``.
    """

    def __init__(self, chunk_size=512, chunk_overlap=50):
        self.splitter = SentenceSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
        )

    def load_directory(self, path: str) -> list[Document]:
        """Load every supported document found under a folder (recursively).

        Only ``.pdf``, ``.txt``, ``.md`` and ``.docx`` files are read.

        Args:
            path: Directory to scan.

        Returns:
            The documents produced by ``SimpleDirectoryReader.load_data()``.

        Raises:
            FileNotFoundError: If ``path`` does not exist.
            Exception: Any reader error is logged and re-raised unchanged.
        """
        path = Path(path)
        if not path.exists():
            raise FileNotFoundError(f"Path not found: {path}")
        try:
            reader = SimpleDirectoryReader(
                input_dir=str(path),
                required_exts=[".pdf", ".txt", ".md", ".docx"],
                recursive=True,
            )
            docs = reader.load_data()
        except Exception as e:
            logger.error(f"Failed to load {path}: {e}")
            raise
        logger.info(f"Loaded {len(docs)} documents from {path}")
        return docs

    def load_file(self, file_path: str) -> list[Document]:
        """Load a single file.

        Only the requested file is read. Delegating to ``load_directory`` on
        the parent folder would also pull in every sibling file (and, because
        that loader is recursive, every file in sub-folders).

        Args:
            file_path: Path of the file to load.

        Returns:
            The documents produced for that file by
            ``SimpleDirectoryReader.load_data()``.

        Raises:
            FileNotFoundError: If ``file_path`` is not an existing file.
            Exception: Any reader error is logged and re-raised unchanged.
        """
        file_path = Path(file_path)
        if not file_path.is_file():
            raise FileNotFoundError(f"File not found: {file_path}")
        try:
            reader = SimpleDirectoryReader(input_files=[str(file_path)])
            docs = reader.load_data()
        except Exception as e:
            logger.error(f"Failed to load {file_path}: {e}")
            raise
        logger.info(f"Loaded {len(docs)} documents from {file_path}")
        return docs

    def chunk(self, docs: list[Document]):
        """Split documents into nodes using the configured splitter.

        Args:
            docs: Documents to split.

        Returns:
            The nodes returned by ``get_nodes_from_documents``.
        """
        nodes = self.splitter.get_nodes_from_documents(docs)
        logger.info(f"Created {len(nodes)} chunks")
        return nodes
Step 3: RAG Engine
# src/rag.py
import os
from pathlib import Path
from typing import Optional
from llama_index.core import VectorStoreIndex, StorageContext, Settings, load_index_from_storage
from llama_index.core import PromptTemplate
from llama_index.llms.gemini import Gemini
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
import logging
logger = logging.getLogger(__name__)
# Prompt text with two placeholders, {context_str} and {query_str}.
# The instructions are written in Indonesian and tell the model to: answer
# only from the supplied context, reply with a fixed "not found" sentence when
# the context lacks the answer, include a short quote from the context when
# relevant, and answer in the same language as the question.
QA_TEMPLATE = """Kamu adalah asisten yang menjawab pertanyaan berdasarkan dokumen pribadi user.
Konteks dari dokumen:
{context_str}
Pertanyaan: {query_str}
Aturan:
- Jawab HANYA berdasarkan konteks di atas
- Kalau jawaban tidak ada di konteks, bilang "Saya tidak menemukan info itu di dokumenmu"
- Sertakan kutipan singkat dari konteks kalau relevan
- Jawab dalam bahasa yang sama dengan pertanyaan
Jawaban:"""
class RAGEngine:
    """Question-answering engine over a persisted LlamaIndex vector index.

    Constructing an engine assigns the Gemini LLM and the HuggingFace
    embedding model to the shared ``Settings`` object, so the most recently
    created engine decides which models are in use.

    Args:
        api_key: Gemini API key. Falls back to the ``GEMINI_API_KEY``
            environment variable when omitted.
        model_name: Model identifier passed to ``Gemini``.
        embedding_model: Model name passed to ``HuggingFaceEmbedding``.
        persist_dir: Directory the index is saved to and loaded from.

    Raises:
        ValueError: If no API key is given and ``GEMINI_API_KEY`` is unset.
    """

    # Maximum number of characters of a source chunk returned as a preview.
    SNIPPET_CHARS = 200

    def __init__(
        self,
        api_key: Optional[str] = None,
        model_name: str = "models/gemini-1.5-flash",
        embedding_model: str = "paraphrase-multilingual-mpnet-base-v2",
        persist_dir: str = "./storage"
    ):
        self.api_key = api_key or os.environ.get("GEMINI_API_KEY")
        if not self.api_key:
            raise ValueError("GEMINI_API_KEY required")
        self.persist_dir = Path(persist_dir)
        # Configure global settings
        Settings.llm = Gemini(api_key=self.api_key, model=model_name)
        Settings.embed_model = HuggingFaceEmbedding(model_name=embedding_model)
        self.index = None

    def build_index(self, nodes):
        """Build a new index from ``nodes`` and persist it.

        Any index previously held by this engine is replaced.
        """
        logger.info("Building index...")
        self.index = VectorStoreIndex(nodes)
        self.persist()
        logger.info("Index built and persisted")

    def persist(self):
        """Save the current index to ``persist_dir``; do nothing without one."""
        if self.index is not None:
            self.index.storage_context.persist(persist_dir=str(self.persist_dir))

    def load(self) -> bool:
        """Load the index from ``persist_dir``.

        Returns:
            True when the index was loaded. False when the directory does not
            exist or loading raised; the failure is logged, not propagated.
        """
        if not self.persist_dir.exists():
            logger.warning(f"No index found at {self.persist_dir}")
            return False
        try:
            storage_context = StorageContext.from_defaults(persist_dir=str(self.persist_dir))
            self.index = load_index_from_storage(storage_context)
        except Exception as e:
            logger.error(f"Failed to load index: {e}")
            return False
        logger.info("Index loaded from disk")
        return True

    def query(self, question: str, top_k: int = 3) -> dict:
        """Answer ``question`` from the index.

        Args:
            question: The user's question.
            top_k: Passed to the query engine as ``similarity_top_k``.

        Returns:
            A dict with ``"answer"`` (the response as a string) and
            ``"sources"`` (one dict per source node with ``"text"``,
            ``"score"`` and ``"metadata"`` keys).

        Raises:
            ValueError: If no index has been built or loaded.
        """
        if self.index is None:
            raise ValueError("No index. Build or load first.")
        query_engine = self.index.as_query_engine(
            text_qa_template=PromptTemplate(QA_TEMPLATE),
            similarity_top_k=top_k,
        )
        response = query_engine.query(question)
        return {
            "answer": str(response),
            "sources": [
                self._format_source(node) for node in response.source_nodes
            ],
        }

    @classmethod
    def _format_source(cls, node) -> dict:
        """Turn one source node into a plain dict with a text preview.

        The ellipsis is appended only when the text was actually cut, so a
        short chunk is not shown as if it had been truncated. The score is
        passed through as-is.
        """
        preview = node.text
        if len(preview) > cls.SNIPPET_CHARS:
            preview = preview[: cls.SNIPPET_CHARS] + "..."
        return {
            "text": preview,
            "score": node.score,
            "metadata": node.metadata,
        }

    def chat_engine(self, top_k: int = 3):
        """Return a chat engine for multi-turn conversation.

        The engine is created with ``chat_mode="context"``. ``QA_TEMPLATE`` is
        not passed here, so it applies only to ``query``.

        Args:
            top_k: Passed to the chat engine as ``similarity_top_k``.

        Raises:
            ValueError: If no index has been built or loaded.
        """
        if self.index is None:
            raise ValueError("No index loaded")
        return self.index.as_chat_engine(
            chat_mode="context",
            similarity_top_k=top_k,
        )
Step 4: Streamlit UI
# app.py
import streamlit as st
from pathlib import Path
import tempfile
import logging
from dotenv import load_dotenv
from src.ingestion import DocumentIngester
from src.rag import RAGEngine
load_dotenv()
logging.basicConfig(level=logging.INFO)


def render_sources(sources):
    """Show retrieved source snippets inside a collapsible expander.

    Each entry is a dict with "text" and "score" keys. A score of None is
    displayed as "n/a" instead of being passed to the float format.
    """
    with st.expander("📖 Sources"):
        for i, src in enumerate(sources, 1):
            score = src.get("score")
            score_label = f"{score:.3f}" if score is not None else "n/a"
            st.markdown(f"**Source {i}** (score: {score_label})")
            st.text(src["text"])


# Page config
st.set_page_config(
    page_title="Personal KB Assistant",
    page_icon="📚",
    layout="wide"
)
st.title("📚 Personal Knowledge Assistant")
st.caption("Chat dengan dokumenmu sendiri")

# Sidebar
with st.sidebar:
    st.header("⚙️ Setup")
    if "rag" not in st.session_state:
        try:
            st.session_state.rag = RAGEngine()
        except ValueError as exc:
            # Show a readable message and halt this run instead of a traceback.
            st.error(f"Setup failed: {exc}")
            st.stop()
        # Try load existing
        if st.session_state.rag.load():
            st.session_state.indexed = True
            st.success("Index existing loaded")
        else:
            st.session_state.indexed = False
    st.divider()
    st.header("📥 Upload Documents")
    uploaded_files = st.file_uploader(
        "Drop PDF, TXT, MD, or DOCX",
        type=["pdf", "txt", "md", "docx"],
        accept_multiple_files=True
    )
    if uploaded_files and st.button("Ingest Documents"):
        with st.spinner("Processing..."):
            try:
                # Save to a temp dir that is removed once loading is done
                with tempfile.TemporaryDirectory() as tmpdir:
                    for uploaded in uploaded_files:
                        # Keep only the base name so the file stays inside tmpdir.
                        file_path = Path(tmpdir) / Path(uploaded.name).name
                        # getvalue() returns the full content regardless of
                        # the current read position of the upload buffer.
                        file_path.write_bytes(uploaded.getvalue())
                    ingester = DocumentIngester()
                    docs = ingester.load_directory(tmpdir)
                    nodes = ingester.chunk(docs)
                if nodes:
                    st.session_state.rag.build_index(nodes)
                    st.session_state.indexed = True
                    st.success(f"Indexed {len(docs)} documents, {len(nodes)} chunks")
                else:
                    # Do not replace a working index with an empty one.
                    st.warning("No text could be extracted from the uploaded files")
            except Exception as exc:
                logging.exception("Ingestion failed")
                st.error(f"Ingestion failed: {exc}")
    st.divider()
    if st.session_state.get("indexed"):
        st.success("✓ Ready to chat")
    else:
        st.warning("Upload documents first")

# Main chat
if "messages" not in st.session_state:
    st.session_state.messages = []

# Display history
for msg in st.session_state.messages:
    with st.chat_message(msg["role"]):
        st.markdown(msg["content"])
        if msg.get("sources"):
            render_sources(msg["sources"])

# Input
if prompt := st.chat_input("Tanya tentang dokumenmu..."):
    if not st.session_state.get("indexed"):
        st.error("Upload documents first")
    else:
        # Add user message
        st.session_state.messages.append({"role": "user", "content": prompt})
        with st.chat_message("user"):
            st.markdown(prompt)
        # Generate response
        with st.chat_message("assistant"):
            try:
                with st.spinner("Thinking..."):
                    result = st.session_state.rag.query(prompt)
            except Exception as exc:
                # A failed query is reported in the chat and not stored in history.
                logging.exception("Query failed")
                st.error(f"Query failed: {exc}")
            else:
                st.markdown(result["answer"])
                if result["sources"]:
                    render_sources(result["sources"])
                st.session_state.messages.append({
                    "role": "assistant",
                    "content": result["answer"],
                    "sources": result["sources"]
                })

# Reset button
if st.session_state.messages and st.sidebar.button("🗑️ Clear Chat"):
    st.session_state.messages = []
    st.rerun()
Step 5: Run
streamlit run app.py
Browser auto-open ke http://localhost:8501. Upload PDF, tanya jawab.
Step 6: Test
# tests/test_rag.py
import pytest
from src.rag import RAGEngine
from src.ingestion import DocumentIngester
from llama_index.core import Document
def test_ingester():
    """Chunking a short repeated sentence yields at least one node."""
    sample = Document(text="ini adalah test " * 20)
    splitter = DocumentIngester(chunk_size=100, chunk_overlap=10)
    chunks = splitter.chunk([sample])
    assert len(chunks) > 0
def test_rag_query(tmp_path):
    """End-to-end check: index one sentence and ask a question about it.

    The test is skipped when ``GEMINI_API_KEY`` is not set, because building
    the engine requires it. The index is persisted under pytest's ``tmp_path``
    so no ``test_storage`` folder is left in the working directory.
    """
    import os

    if not os.environ.get("GEMINI_API_KEY"):
        pytest.skip("GEMINI_API_KEY not set")
    rag = RAGEngine(persist_dir=str(tmp_path / "storage"))
    docs = [Document(text="Capital of Indonesia is Jakarta. It is in Java island.")]
    ingester = DocumentIngester()
    nodes = ingester.chunk(docs)
    rag.build_index(nodes)
    result = rag.query("What is the capital of Indonesia?")
    assert "Jakarta" in result["answer"]
Step 7: README
# Personal Knowledge Assistant
RAG-based chatbot untuk tanya jawab dokumen pribadi.
## Features
- Upload PDF, TXT, MD, DOCX
- Multi-document support
- Source citation
- Bahasa Indonesia + English
- Streamlit UI
## Demo

## Live
🌐 [Try it here](https://your-app.streamlit.app)
## Stack
- LlamaIndex (RAG)
- Gemini API (LLM)
- Sentence-transformers (embedding)
- Streamlit (UI)
## Setup
\`\`\`bash
git clone <repo>
cd personal-kb-assistant
conda create -n kb python=3.11
conda activate kb
pip install -r requirements.txt
cp .env.example .env
# Edit .env, tambahkan GEMINI_API_KEY
streamlit run app.py
\`\`\`
## Architecture
\`\`\`mermaid
flowchart LR
D["📄 Documents"] --> I["Ingestion"]
I --> C["Chunking"]
C --> E["Embedding"]
E --> V["Vector DB"]
Q["❓ User Query"] --> QE["Embedding"]
QE --> R["Retrieve top-K"]
V --> R
R --> L["🤖 LLM"]
L --> A["✅ Answer"]
\`\`\`
## License
MIT
Challenge 7.3
Challenge 1 — Build & Test
Replicate semua step di atas. Pastikan:
- Bisa upload PDF
- Index sukses
- Chat works
- Source citation muncul
Challenge 2 — Personalize
Customize untuk use case-mu:
- Catatan belajar 12 minggu kamu (dari learning-materials!)
- Resep masakan
- Notes Notion (export markdown)
- Apa yang kamu peduli
Challenge 3 — Improvements
Tambah feature:
- Export chat ke markdown
- Multi-language toggle
- Show document metadata (file name, page)
- Statistics (total docs, chunks, queries)
Challenge 4 — Better Retrieval
- Implement re-ranking
- Hybrid search
- Compare quality before/after
Selanjutnya: 04-deployment.md