Building a Personal Knowledge Base with Vector Embeddings
The Problem with Traditional Note-Taking
I have years of notes spread across markdown files, browser bookmarks, PDF highlights, and Slack messages. Traditional search finds documents by keyword, but that fails when I remember a concept but not the exact words I used to describe it. I wanted to search my entire knowledge base semantically: "that article about handling race conditions in distributed systems" should find the right document even if it never mentions the word "race condition."
So I built one. Here is the complete system.
Architecture Overview
The system has four components:
- Ingestion pipeline: Collects and processes documents from multiple sources
- Embedding engine: Converts documents into vector representations
- Vector store: ChromaDB for storage and similarity search
- Query interface: CLI and web interface for searching
Source Connectors
import json
from pathlib import Path

import frontmatter
class MarkdownConnector:
    """Collects markdown documents (with optional YAML front matter) from a directory tree."""

    def __init__(self, directory: str):
        self.directory = Path(directory)

    def collect(self) -> list[dict]:
        """Walk the directory recursively and return one document dict per ``*.md`` file."""

        def to_document(path: Path) -> dict:
            post = frontmatter.load(path)
            # Front matter wins; fall back to the filename for the title and
            # the file's mtime for the date when the metadata is absent.
            return {
                "id": str(path),
                "title": post.get("title", path.stem),
                "content": post.content,
                "source": "markdown",
                "tags": post.get("tags", []),
                "date": str(post.get("date", path.stat().st_mtime)),
            }

        return [to_document(p) for p in self.directory.rglob("*.md")]
class BookmarkConnector:
    """Collects documents by fetching the page text behind saved browser bookmarks."""

    def __init__(self, bookmarks_file: str):
        self.bookmarks_file = bookmarks_file

    def collect(self) -> list[dict]:
        """Return one document dict per bookmark whose page text could be extracted.

        Bookmarks that yield no extractable text are skipped silently.
        """
        # Explicit encoding: JSON is UTF-8 by spec; without it the platform
        # default (e.g. cp1252 on Windows) can mangle the load.
        with open(self.bookmarks_file, encoding="utf-8") as f:
            bookmarks = json.load(f)
        documents = []
        for bookmark in bookmarks:
            # Fetch and extract the readable text of the bookmarked page.
            content = extract_article_text(bookmark["url"])
            if content:
                documents.append({
                    "id": bookmark["url"],
                    "title": bookmark["title"],
                    # Cap stored text so one huge page cannot dominate the index.
                    "content": content[:5000],
                    "source": "bookmark",
                    "tags": bookmark.get("tags", []),
                    # Tolerate exports that omit the timestamp (matches the
                    # .get() style used for tags) instead of raising KeyError.
                    "date": bookmark.get("added_date", ""),
                })
        return documents
Document Chunking
Documents need to be chunked for effective embedding and retrieval. I use a semantic chunking approach that splits at paragraph boundaries while maintaining context.
class SemanticChunker:
    """Splits documents into chunks at paragraph boundaries.

    Paragraphs are accumulated until ``max_tokens`` would be exceeded; the
    last paragraph of a finished chunk is carried into the next chunk as
    overlap, but only when it fits within ``overlap_tokens`` — previously
    ``overlap_tokens`` was accepted and stored but never consulted, so an
    oversized paragraph was carried unconditionally and could blow every
    subsequent chunk's budget. Token counts are approximated by whitespace
    word counts.
    """

    def __init__(self, max_tokens: int = 400, overlap_tokens: int = 50):
        self.max_tokens = max_tokens
        self.overlap_tokens = overlap_tokens

    def _make_chunk(self, document: dict, paragraphs: list[str], index: int) -> dict:
        """Build one chunk record from the accumulated paragraphs."""
        return {
            "id": f"{document['id']}_chunk_{index}",
            "content": "\n\n".join(paragraphs),
            "metadata": {
                "source_id": document["id"],
                "title": document["title"],
                "source": document["source"],
                "chunk_index": index,
            },
        }

    def chunk(self, document: dict) -> list[dict]:
        """Return the document split into (possibly overlapping) chunk dicts."""
        chunks: list[dict] = []
        current: list[str] = []
        current_tokens = 0
        for para in document["content"].split("\n\n"):
            para_tokens = len(para.split())  # rough token count
            if current_tokens + para_tokens > self.max_tokens and current:
                chunks.append(self._make_chunk(document, current, len(chunks)))
                # Carry the last paragraph forward as overlap only if it is
                # small enough to fit the overlap budget.
                last = current[-1]
                last_tokens = len(last.split())
                if last_tokens <= self.overlap_tokens:
                    current = [last]
                    current_tokens = last_tokens
                else:
                    current = []
                    current_tokens = 0
            current.append(para)
            current_tokens += para_tokens
        # Flush whatever remains after the last paragraph.
        if current:
            chunks.append(self._make_chunk(document, current, len(chunks)))
        return chunks
Embedding and Storage
import chromadb
from openai import OpenAI
class KnowledgeBase:
    """Vector-store wrapper: embeds chunk text via OpenAI and persists it in ChromaDB."""

    def __init__(self, db_path: str = "./knowledge_db"):
        self.chroma = chromadb.PersistentClient(path=db_path)
        # Cosine distance for the HNSW index.
        self.collection = self.chroma.get_or_create_collection(
            name="knowledge",
            metadata={"hnsw:space": "cosine"},
        )
        self.openai = OpenAI()

    def embed_texts(self, texts: list[str]) -> list[list[float]]:
        """Embed a batch of strings, returning one 512-dimensional vector per input."""
        response = self.openai.embeddings.create(
            model="text-embedding-3-small",
            input=texts,
            dimensions=512,
        )
        return [item.embedding for item in response.data]

    def ingest(self, chunks: list[dict]):
        """Embed and store chunks in batches of 100 to keep each API call bounded."""
        batch_size = 100
        for start in range(0, len(chunks), batch_size):
            batch = chunks[start:start + batch_size]
            texts = [chunk["content"] for chunk in batch]
            self.collection.add(
                ids=[chunk["id"] for chunk in batch],
                embeddings=self.embed_texts(texts),
                documents=texts,
                metadatas=[chunk["metadata"] for chunk in batch],
            )
Semantic Search
def search(self, query: str, top_k: int = 10, source_filter: str = None) -> list[dict]:
query_embedding = self.embed_texts([query])[0]
where_filter = {"source": source_filter} if source_filter else None
results = self.collection.query(
query_embeddings=[query_embedding],
n_results=top_k,
where=where_filter,
include=["documents", "metadatas", "distances"]
)
formatted = []
for i in range(len(results["ids"][0])):
formatted.append({
"id": results["ids"][0][i],
"content": results["documents"][0][i],
"metadata": results["metadatas"][0][i],
"similarity": 1 - results["distances"][0][i]
})
return formatted
The CLI Interface
import argparse
def main():
    """CLI entry point: ``search <query>`` queries the knowledge base.

    The ``ingest`` subcommand is declared and validated but not dispatched
    here; ingestion is driven by the daily sync job.
    """
    parser = argparse.ArgumentParser(description="Knowledge Base CLI")
    subparsers = parser.add_subparsers(dest="command")

    search_parser = subparsers.add_parser("search", help="Semantic search over the knowledge base")
    search_parser.add_argument("query", type=str)
    search_parser.add_argument("--source", type=str, default=None,
                               help="Restrict results to one source type")
    search_parser.add_argument("--top-k", type=int, default=5)

    ingest_parser = subparsers.add_parser("ingest", help="Ingest documents from a source")
    ingest_parser.add_argument("--source", type=str, required=True)

    args = parser.parse_args()
    if args.command is None:
        # Previously a bare invocation silently built a KnowledgeBase and
        # exited with no output; show usage instead.
        parser.print_help()
        return

    # Construct the (disk-backed) knowledge base only once a command needs it.
    kb = KnowledgeBase()
    if args.command == "search":
        results = kb.search(args.query, top_k=args.top_k, source_filter=args.source)
        for r in results:
            print(f"\n[{r['similarity']:.2f}] {r['metadata']['title']}")
            print(f" {r['content'][:200]}...")
Daily Sync
I run a daily cron job that checks each source for new or modified documents and re-ingests only the changes. The incremental update keeps the knowledge base fresh without re-embedding everything.
Results
After ingesting 3 years of notes (approximately 2,400 documents, 8,200 chunks), the search quality is remarkably good. Queries like "that technique for reducing API latency with caching" correctly surface my notes on HTTP caching strategies, Redis patterns, and CDN configuration, even though none of those notes contain the exact phrase I searched for.
Building a personal knowledge base is one of the most valuable personal AI projects you can undertake. It compounds in value over time as you add more knowledge, and it becomes your external brain for everything you have ever learned.