3 min read

Building a Personal Knowledge Base with Vector Embeddings

knowledge base, vector embeddings, RAG, personal tools, Python, productivity

The Problem with Traditional Note-Taking

I have years of notes spread across markdown files, browser bookmarks, PDF highlights, and Slack messages. Traditional search finds documents by keyword, but that fails when I remember a concept but not the exact words I used to describe it. I wanted to search my entire knowledge base semantically: "that article about handling race conditions in distributed systems" should find the right document even if it never mentions the word "race condition."

So I built one. Here is the complete system.

Architecture Overview

The system has four components:

  • Ingestion pipeline: Collects and processes documents from multiple sources
  • Embedding engine: Converts documents into vector representations
  • Vector store: ChromaDB for storage and similarity search
  • Query interface: CLI and web interface for searching

Source Connectors

from pathlib import Path
import frontmatter

class MarkdownConnector:
    """Collects documents from a directory tree of markdown files with frontmatter."""

    def __init__(self, directory: str):
        self.directory = Path(directory)

    def collect(self) -> list[dict]:
        """Return one document dict per ``*.md`` file found recursively."""
        return [self._to_document(path) for path in self.directory.rglob("*.md")]

    def _to_document(self, path: Path) -> dict:
        """Build the document dict for a single markdown file."""
        post = frontmatter.load(path)
        return {
            "id": str(path),
            "title": post.get("title", path.stem),
            "content": post.content,
            "source": "markdown",
            "tags": post.get("tags", []),
            # Fall back to the file's mtime when no date is set in frontmatter.
            "date": str(post.get("date", path.stat().st_mtime)),
        }

class BookmarkConnector:
    """Collects documents from a JSON bookmarks export file.

    Each entry is expected to carry "url", "title", and "added_date" keys
    (optional "tags"). Page text is pulled with the external
    `extract_article_text` helper; bookmarks whose pages yield no text
    are skipped.
    """

    def __init__(self, bookmarks_file: str, max_content_chars: int = 5000):
        self.bookmarks_file = bookmarks_file
        # Generalizes the previously hard-coded 5000-character cap.
        self.max_content_chars = max_content_chars

    def collect(self) -> list[dict]:
        """Return one document dict per bookmark that yielded article text."""
        import json  # BUG FIX: json was used here without being imported

        with open(self.bookmarks_file, encoding="utf-8") as f:
            bookmarks = json.load(f)

        documents = []
        for bookmark in bookmarks:
            # Fetch and extract readable text from the bookmarked URL.
            content = extract_article_text(bookmark["url"])
            if not content:
                continue
            documents.append({
                "id": bookmark["url"],
                "title": bookmark["title"],
                # Cap stored text so one long article can't dominate the index.
                "content": content[:self.max_content_chars],
                "source": "bookmark",
                "tags": bookmark.get("tags", []),
                "date": bookmark["added_date"],
            })
        return documents

Document Chunking

Documents need to be chunked for effective embedding and retrieval. I use a semantic chunking approach that splits at paragraph boundaries while maintaining context.

class SemanticChunker:
    """Splits a document into chunks at paragraph boundaries.

    Paragraphs (blank-line separated) are accumulated until adding the next
    one would push the chunk past ``max_tokens`` (token count approximated
    by whitespace word count). Consecutive chunks share a trailing-paragraph
    overlap totalling at most ``overlap_tokens``.
    """

    def __init__(self, max_tokens: int = 400, overlap_tokens: int = 50):
        self.max_tokens = max_tokens
        self.overlap_tokens = overlap_tokens

    def chunk(self, document: dict) -> list[dict]:
        """Return a list of chunk dicts ({id, content, metadata}) for *document*."""
        paragraphs = document["content"].split("\n\n")
        chunks: list[dict] = []
        current_chunk: list[str] = []
        current_tokens = 0

        for para in paragraphs:
            para_tokens = len(para.split())  # Rough token count

            if current_tokens + para_tokens > self.max_tokens and current_chunk:
                chunks.append(self._make_chunk(document, current_chunk, len(chunks)))
                # BUG FIX: the overlap now honors overlap_tokens; previously
                # the setting was stored but ignored and exactly one trailing
                # paragraph was kept regardless of its size.
                current_chunk, current_tokens = self._overlap_tail(current_chunk)

            current_chunk.append(para)
            current_tokens += para_tokens

        # Handle remaining content
        if current_chunk:
            chunks.append(self._make_chunk(document, current_chunk, len(chunks)))

        return chunks

    def _overlap_tail(self, paragraphs: list[str]) -> tuple[list[str], int]:
        """Trailing paragraphs totalling at most overlap_tokens, plus their token count."""
        tail: list[str] = []
        tokens = 0
        for para in reversed(paragraphs):
            para_tokens = len(para.split())
            if tokens + para_tokens > self.overlap_tokens:
                break
            tail.insert(0, para)
            tokens += para_tokens
        return tail, tokens

    def _make_chunk(self, document: dict, paragraphs: list[str], index: int) -> dict:
        """Build one chunk dict; extracted to remove the duplicated literal."""
        return {
            "id": f"{document['id']}_chunk_{index}",
            "content": "\n\n".join(paragraphs),
            "metadata": {
                "source_id": document["id"],
                "title": document["title"],
                "source": document["source"],
                "chunk_index": index,
            },
        }

Embedding and Storage

import chromadb
from openai import OpenAI

class KnowledgeBase:
    """Embeds chunk text with OpenAI and persists vectors in a local ChromaDB."""

    EMBED_MODEL = "text-embedding-3-small"
    EMBED_DIMENSIONS = 512
    INGEST_BATCH_SIZE = 100

    def __init__(self, db_path: str = "./knowledge_db"):
        self.chroma = chromadb.PersistentClient(path=db_path)
        # Cosine distance for the HNSW similarity index.
        self.collection = self.chroma.get_or_create_collection(
            name="knowledge",
            metadata={"hnsw:space": "cosine"},
        )
        self.openai = OpenAI()

    def embed_texts(self, texts: list[str]) -> list[list[float]]:
        """Embed every text; returns one vector per input, in input order."""
        response = self.openai.embeddings.create(
            model=self.EMBED_MODEL,
            input=texts,
            dimensions=self.EMBED_DIMENSIONS,
        )
        return [record.embedding for record in response.data]

    def ingest(self, chunks: list[dict]):
        """Embed and store chunks in fixed-size batches."""
        for start in range(0, len(chunks), self.INGEST_BATCH_SIZE):
            batch = chunks[start:start + self.INGEST_BATCH_SIZE]
            contents = [chunk["content"] for chunk in batch]
            self.collection.add(
                ids=[chunk["id"] for chunk in batch],
                embeddings=self.embed_texts(contents),
                documents=contents,
                metadatas=[chunk["metadata"] for chunk in batch],
            )

Semantic Search

    def search(self, query: str, top_k: int = 10, source_filter: str = None) -> list[dict]:
        """Return up to top_k chunks most similar to *query*.

        When source_filter is given, matches are restricted to that source
        type. similarity is reported as 1 - distance from the vector store.
        """
        [query_vector] = self.embed_texts([query])

        results = self.collection.query(
            query_embeddings=[query_vector],
            n_results=top_k,
            where={"source": source_filter} if source_filter else None,
            include=["documents", "metadatas", "distances"],
        )

        ids = results["ids"][0]
        docs = results["documents"][0]
        metas = results["metadatas"][0]
        dists = results["distances"][0]
        return [
            {"id": hit_id, "content": doc, "metadata": meta, "similarity": 1 - dist}
            for hit_id, doc, meta, dist in zip(ids, docs, metas, dists)
        ]

The CLI Interface

import argparse

def main():
    """CLI entry point: `search <query> [--source] [--top-k]` or `ingest --source`."""
    parser = argparse.ArgumentParser(description="Knowledge Base CLI")
    subparsers = parser.add_subparsers(dest="command")

    search_parser = subparsers.add_parser("search")
    search_parser.add_argument("query", type=str)
    search_parser.add_argument("--source", type=str, default=None)
    search_parser.add_argument("--top-k", type=int, default=5)

    ingest_parser = subparsers.add_parser("ingest")
    ingest_parser.add_argument("--source", type=str, required=True)

    args = parser.parse_args()

    # BUG FIX: with no subcommand the CLI previously exited silently.
    if args.command is None:
        parser.print_help()
        return

    if args.command == "search":
        # Construct the (ChromaDB-backed) knowledge base only when needed.
        kb = KnowledgeBase()
        results = kb.search(args.query, top_k=args.top_k, source_filter=args.source)
        for r in results:
            print(f"\n[{r['similarity']:.2f}] {r['metadata']['title']}")
            print(f"  {r['content'][:200]}...")
    elif args.command == "ingest":
        # BUG FIX: this subcommand parsed --source but then silently did
        # nothing. Fail loudly until ingestion wiring is added here.
        raise SystemExit(f"ingest for source {args.source!r} is not wired up in this CLI")

Daily Sync

I run a daily cron job that checks each source for new or modified documents and re-ingests only the changes. The incremental update keeps the knowledge base fresh without re-embedding everything.

Results

After ingesting 3 years of notes (approximately 2,400 documents, 8,200 chunks), the search quality is remarkably good. Queries like "that technique for reducing API latency with caching" correctly surface my notes on HTTP caching strategies, Redis patterns, and CDN configuration, even though none of those notes contain the exact phrase I searched for.

Building a personal knowledge base is one of the most valuable personal AI projects you can undertake. It compounds in value over time as you add more knowledge, and it becomes your external brain for everything you have ever learned.