| 3 min read

Sitemap Generation for Static Sites with Python

sitemap SEO Python static sites web development automation

Why Sitemaps Still Matter in 2026

Search engines are smart, but they are not omniscient. A properly formatted sitemap tells Google exactly what pages exist on your site, when they were last updated, and how important they are relative to each other. For static sites without server-side rendering, sitemaps are especially important because there is no dynamic discovery mechanism.

I built a Python sitemap generator for my portfolio site and now use variations of it across multiple projects. Here is the complete implementation.

The Basic Generator

from datetime import date, datetime
from pathlib import Path
import xml.etree.ElementTree as ET
from xml.dom import minidom

class SitemapGenerator:
    def __init__(self, base_url: str):
        self.base_url = base_url.rstrip('/')
        self.urls: list[dict] = []
    
    def add_url(
        self, 
        path: str, 
        lastmod: str = None, 
        changefreq: str = 'monthly',
        priority: float = 0.5
    ):
        self.urls.append({
            'loc': f"{self.base_url}/{path.lstrip('/')}",
            'lastmod': lastmod or date.today().isoformat(),
            'changefreq': changefreq,
            'priority': str(priority)
        })
    
    def generate(self) -> str:
        urlset = ET.Element('urlset')
        urlset.set('xmlns', 'http://www.sitemaps.org/schemas/sitemap/0.9')
        
        for url_data in self.urls:
            url = ET.SubElement(urlset, 'url')
            for key, value in url_data.items():
                child = ET.SubElement(url, key)
                child.text = value
        
        rough_string = ET.tostring(urlset, encoding='unicode')
        reparsed = minidom.parseString(rough_string)
        return reparsed.toprettyxml(indent='  ', encoding=None)
    
    def save(self, output_path: str):
        xml_content = self.generate()
        header = '\n'
        # Remove the default header from toprettyxml
        content = xml_content.split('\n', 1)[1] if xml_content.startswith('

Auto-Discovery from File System

For static sites, the file system is the source of truth. I scan the build directory and generate sitemap entries for every HTML file.

class StaticSiteScanner:
    """Discovers sitemap entries by walking a static site's build directory."""

    # File-name patterns (fnmatch-style) that never belong in a sitemap.
    IGNORE_PATTERNS = [
        '404.html', '_*', 'template*', 'index-template*'
    ]

    def __init__(self, site_dir: str, base_url: str):
        self.site_dir = Path(site_dir)
        self.base_url = base_url

    def scan(self) -> list[dict]:
        """Return one dict per HTML page: path, lastmod, priority, changefreq."""
        pages = []

        for html_file in self.site_dir.rglob('*.html'):
            relative = html_file.relative_to(self.site_dir)

            if self._should_ignore(relative):
                continue

            # Use POSIX separators so URLs are correct on Windows builds too
            # (str(relative) would yield backslashes there).
            url_path = relative.as_posix()
            if url_path == 'index.html':
                url_path = ''
            elif url_path.endswith('/index.html'):
                # Directory indexes map to the directory URL with a trailing slash.
                url_path = url_path[:-len('/index.html')] + '/'

            # lastmod comes from the file's modification time.
            mtime = datetime.fromtimestamp(html_file.stat().st_mtime)

            pages.append({
                'path': url_path,
                'lastmod': mtime.strftime('%Y-%m-%d'),
                'priority': self._calculate_priority(url_path),
                'changefreq': self._guess_changefreq(url_path)
            })

        return pages

    def _should_ignore(self, path: Path) -> bool:
        # Matches on the file name only; directory components are not checked.
        from fnmatch import fnmatch
        name = path.name
        return any(fnmatch(name, pattern) for pattern in self.IGNORE_PATTERNS)

    def _calculate_priority(self, path: str) -> float:
        """Heuristic priority: homepage highest, deeper pages progressively lower."""
        if path == '' or path == '/':
            return 1.0

        depth = path.count('/')
        if 'blog' in path and depth <= 1:
            return 0.8
        if 'blog' in path:
            return 0.6

        # Floor at 0.3 so deep pages still register as worth crawling.
        return max(0.3, 0.8 - depth * 0.1)

    def _guess_changefreq(self, path: str) -> str:
        """Heuristic change-frequency hint based on the page's role."""
        if path == '' or path == '/':
            return 'weekly'
        if 'blog' in path and path.count('/') <= 1:
            return 'daily'
        return 'monthly'

Blog Post Integration

For sites with a blog managed by a JSON-based system, I integrate the post data directly:

import json

def generate_blog_sitemap(posts_file: str, base_url: str) -> SitemapGenerator:
    """Build a SitemapGenerator from a JSON posts file plus the fixed pages.

    Args:
        posts_file: Path to a JSON array of post dicts with 'status',
            'slug', and 'date' keys.
        base_url: Absolute site root, e.g. 'https://example.com'.
    """
    with open(posts_file) as fh:
        all_posts = json.load(fh)

    sitemap = SitemapGenerator(base_url)

    # Fixed, high-value pages first: homepage, then the blog index.
    sitemap.add_url('/', changefreq='weekly', priority=1.0)
    sitemap.add_url('/blog/', changefreq='daily', priority=0.8)

    # Published posts only, newest first.
    published = (p for p in all_posts if p['status'] == 'published')
    for entry in sorted(published, key=lambda p: p['date'], reverse=True):
        sitemap.add_url(
            f"/blog/{entry['slug']}.html",
            lastmod=entry['date'],
            changefreq='monthly',
            priority=0.6
        )

    return sitemap

Sitemap Validation

Before deploying, validate the sitemap to catch common issues:

from urllib.parse import urlparse

class SitemapValidator:
    """Checks a sitemap file against the sitemap protocol's limits and rules."""

    MAX_URLS = 50000
    MAX_SIZE_BYTES = 50 * 1024 * 1024  # protocol limit: 50MB uncompressed

    def validate(self, sitemap_path: str) -> dict:
        """Validate the sitemap at sitemap_path.

        Returns:
            dict with 'valid' (bool), 'url_count' (int), 'size_bytes' (int),
            and 'issues' (list of human-readable problem descriptions).

        Raises:
            xml.etree.ElementTree.ParseError: if the file is not well-formed XML.
        """
        issues = []

        content = Path(sitemap_path).read_text()

        # Size check on the UTF-8 byte length, which is what crawlers measure.
        size = len(content.encode('utf-8'))
        if size > self.MAX_SIZE_BYTES:
            issues.append(f"Sitemap exceeds 50MB limit: {size} bytes")

        # Parse and inspect each <url> entry in the sitemap namespace.
        tree = ET.parse(sitemap_path)
        root = tree.getroot()
        ns = {'sm': 'http://www.sitemaps.org/schemas/sitemap/0.9'}

        urls = root.findall('.//sm:url', ns)

        if len(urls) > self.MAX_URLS:
            issues.append(f"Exceeds {self.MAX_URLS} URL limit: {len(urls)} URLs")

        seen_locs = set()
        for url in urls:
            loc = url.find('sm:loc', ns)
            if loc is None or not loc.text:
                issues.append("URL entry missing loc element")
                continue

            # Duplicate detection.
            if loc.text in seen_locs:
                issues.append(f"Duplicate URL: {loc.text}")
            seen_locs.add(loc.text)

            # URLs must be absolute (scheme + host).
            parsed = urlparse(loc.text)
            if not parsed.scheme or not parsed.netloc:
                issues.append(f"Invalid URL: {loc.text}")

            # Priority, when present, must be a number in [0.0, 1.0].
            priority = url.find('sm:priority', ns)
            if priority is not None:
                try:
                    # An empty <priority/> element has .text == None, which
                    # makes float() raise TypeError — catch it alongside
                    # ValueError instead of crashing the validator.
                    p = float(priority.text)
                    if not 0.0 <= p <= 1.0:
                        issues.append(f"Invalid priority {p} for {loc.text}")
                except (TypeError, ValueError):
                    issues.append(f"Non-numeric priority for {loc.text}")

        return {
            'valid': len(issues) == 0,
            'url_count': len(urls),
            'size_bytes': size,
            'issues': issues
        }

Automated Generation with Cron

#!/usr/bin/env python3
"""Regenerate sitemap and submit to search engines."""

import requests

def main():
    """Regenerate the sitemap, validate it, and notify search engines.

    Intended to be run from cron after every deploy; prints a one-line
    summary and bails out (without pinging) if validation fails.
    """
    gen = generate_blog_sitemap(
        posts_file='/home/user/site/blog/posts.json',
        base_url='https://stevecv.com'
    )

    output = '/home/user/site/public_html/sitemap.xml'
    gen.save(output)

    validator = SitemapValidator()
    result = validator.validate(output)
    print(f"Sitemap: {result['url_count']} URLs, {result['size_bytes']} bytes")

    if not result['valid']:
        print(f"Validation issues: {result['issues']}")
        return

    # NOTE(review): Google retired the sitemaps ping endpoint in January
    # 2024 and it now returns 404. Prefer declaring the sitemap in
    # robots.txt and submitting it once via Search Console; the ping is
    # kept here as best-effort only.
    ping_url = "https://www.google.com/ping?sitemap=https://stevecv.com/sitemap.xml"
    try:
        # Timeout so a slow or dead endpoint cannot hang the cron job.
        response = requests.get(ping_url, timeout=10)
        response.raise_for_status()
        print("Google pinged successfully")
    except requests.RequestException as exc:
        # Best-effort: report the failure instead of claiming success.
        print(f"Sitemap ping failed: {exc}")

if __name__ == '__main__':
    main()

Tips for Better Sitemaps

  • Only include canonical URLs. No duplicate content with different parameters.
  • Update lastmod only when the content actually changes, not on every build.
  • Use priority to indicate relative importance within your own site, not absolute importance.
  • Reference your sitemap in robots.txt: Sitemap: https://yoursite.com/sitemap.xml
  • For sites with more than 50,000 URLs, use a sitemap index file.

Sitemap generation is a small investment that pays continuous SEO dividends. Automate it once, run it on every deploy, and never think about it again.