Sitemap Generation for Static Sites with Python
Why Sitemaps Still Matter in 2026
Search engines are smart, but they are not omniscient. A properly formatted sitemap tells Google exactly what pages exist on your site, when they were last updated, and how important they are relative to each other. For static sites without server-side rendering, sitemaps are especially important because there is no dynamic discovery mechanism.
I built a Python sitemap generator for my portfolio site and now use variations of it across multiple projects. Here is the complete implementation.
The Basic Generator
from datetime import date, datetime
from pathlib import Path
import xml.etree.ElementTree as ET
from xml.dom import minidom
class SitemapGenerator:
def __init__(self, base_url: str):
self.base_url = base_url.rstrip('/')
self.urls: list[dict] = []
def add_url(
self,
path: str,
lastmod: str = None,
changefreq: str = 'monthly',
priority: float = 0.5
):
self.urls.append({
'loc': f"{self.base_url}/{path.lstrip('/')}",
'lastmod': lastmod or date.today().isoformat(),
'changefreq': changefreq,
'priority': str(priority)
})
def generate(self) -> str:
urlset = ET.Element('urlset')
urlset.set('xmlns', 'http://www.sitemaps.org/schemas/sitemap/0.9')
for url_data in self.urls:
url = ET.SubElement(urlset, 'url')
for key, value in url_data.items():
child = ET.SubElement(url, key)
child.text = value
rough_string = ET.tostring(urlset, encoding='unicode')
reparsed = minidom.parseString(rough_string)
return reparsed.toprettyxml(indent=' ', encoding=None)
def save(self, output_path: str):
xml_content = self.generate()
header = '\n'
# Remove the default header from toprettyxml
content = xml_content.split('\n', 1)[1] if xml_content.startswith('') else xml_content
with open(output_path, 'w') as f:
f.write(header + content)
Auto-Discovery from File System
For static sites, the file system is the source of truth. I scan the build directory and generate sitemap entries for every HTML file.
class StaticSiteScanner:
    """Discover sitemap entries by walking a built static site's HTML files."""

    # fnmatch-style file-name patterns that never belong in a sitemap.
    IGNORE_PATTERNS = [
        '404.html', '_*', 'template*', 'index-template*'
    ]

    def __init__(self, site_dir: str, base_url: str):
        self.site_dir = Path(site_dir)
        self.base_url = base_url

    def scan(self) -> list[dict]:
        """Return a sitemap entry dict for every non-ignored HTML file.

        Each entry carries 'path', 'lastmod', 'priority' and 'changefreq'
        keys suitable for SitemapGenerator.add_url-style consumption.
        """
        pages = []
        for html_file in self.site_dir.rglob('*.html'):
            relative = html_file.relative_to(self.site_dir)
            if self._should_ignore(relative):
                continue
            # BUG FIX: as_posix() guarantees forward slashes even on
            # Windows, where str(Path) uses backslashes and would break
            # both the '/index.html' suffix check and the emitted URL.
            url_path = relative.as_posix()
            if url_path == 'index.html':
                url_path = ''  # site root
            elif url_path.endswith('/index.html'):
                # Directory index pages map to the directory URL.
                url_path = url_path[:-len('/index.html')] + '/'
            # File mtime stands in for the content's last-modified date.
            mtime = datetime.fromtimestamp(html_file.stat().st_mtime)
            pages.append({
                'path': url_path,
                'lastmod': mtime.strftime('%Y-%m-%d'),
                'priority': self._calculate_priority(url_path),
                'changefreq': self._guess_changefreq(url_path)
            })
        return pages

    def _should_ignore(self, path: Path) -> bool:
        """True when the file name matches any ignore pattern."""
        from fnmatch import fnmatch
        name = path.name
        return any(fnmatch(name, pattern) for pattern in self.IGNORE_PATTERNS)

    def _calculate_priority(self, path: str) -> float:
        """Heuristic priority: root highest, blog boosted, deeper pages lower."""
        if path == '' or path == '/':
            return 1.0
        depth = path.count('/')
        if 'blog' in path and depth <= 1:
            return 0.8
        if 'blog' in path:
            return 0.6
        # Decay with depth but never drop below 0.3.
        return max(0.3, 0.8 - depth * 0.1)

    def _guess_changefreq(self, path: str) -> str:
        """Heuristic change frequency based on the URL path."""
        if path == '' or path == '/':
            return 'weekly'
        if 'blog' in path and path.count('/') <= 1:
            return 'daily'
        return 'monthly'
Blog Post Integration
For sites with a blog managed by a JSON-based system, I integrate the post data directly:
import json
def generate_blog_sitemap(posts_file: str, base_url: str) -> SitemapGenerator:
    """Build a SitemapGenerator pre-populated from a JSON blog-post manifest.

    posts_file: path to a JSON array of post objects with 'status', 'slug'
        and 'date' keys.
    base_url: site origin, e.g. 'https://example.com'.
    Returns the populated generator (caller decides where to save it).
    """
    with open(posts_file, encoding='utf-8') as f:
        posts = json.load(f)
    gen = SitemapGenerator(base_url)
    # Homepage and blog index carry the highest priorities on the site.
    gen.add_url('/', changefreq='weekly', priority=1.0)
    gen.add_url('/blog/', changefreq='daily', priority=0.8)
    # BUG FIX: .get() tolerates posts missing a 'status' key (treated as
    # unpublished) instead of raising KeyError.
    published = [p for p in posts if p.get('status') == 'published']
    # Newest-first ordering; relies on ISO dates sorting lexicographically.
    for post in sorted(published, key=lambda p: p['date'], reverse=True):
        gen.add_url(
            f"/blog/{post['slug']}.html",
            lastmod=post['date'],
            changefreq='monthly',
            priority=0.6
        )
    return gen
Sitemap Validation
Before deploying, validate the sitemap to catch common issues:
from urllib.parse import urlparse
class SitemapValidator:
    """Check a sitemap file against sitemaps.org protocol limits and rules."""

    MAX_URLS = 50000                      # protocol limit per sitemap file
    MAX_SIZE_BYTES = 50 * 1024 * 1024     # 50MB uncompressed protocol limit

    def validate(self, sitemap_path: str) -> dict:
        """Validate the sitemap at *sitemap_path*.

        Returns a dict with 'valid' (bool), 'url_count', 'size_bytes'
        and 'issues' (list of human-readable problem strings).
        Raises xml.etree.ElementTree.ParseError on malformed XML.
        """
        issues = []
        content = Path(sitemap_path).read_text(encoding='utf-8')
        # Size check against the protocol's uncompressed limit.
        size = len(content.encode('utf-8'))
        if size > self.MAX_SIZE_BYTES:
            issues.append(f"Sitemap exceeds 50MB limit: {size} bytes")
        # Parse and validate structure.
        tree = ET.parse(sitemap_path)
        root = tree.getroot()
        ns = {'sm': 'http://www.sitemaps.org/schemas/sitemap/0.9'}
        urls = root.findall('.//sm:url', ns)
        if len(urls) > self.MAX_URLS:
            issues.append(f"Exceeds {self.MAX_URLS} URL limit: {len(urls)} URLs")
        seen_locs = set()
        for url in urls:
            loc = url.find('sm:loc', ns)
            if loc is None or not loc.text:
                issues.append("URL entry missing loc element")
                continue
            # Check for duplicates.
            if loc.text in seen_locs:
                issues.append(f"Duplicate URL: {loc.text}")
            seen_locs.add(loc.text)
            # Validate URL format: must be absolute (scheme + host).
            parsed = urlparse(loc.text)
            if not parsed.scheme or not parsed.netloc:
                issues.append(f"Invalid URL: {loc.text}")
            # Validate priority if present.
            priority = url.find('sm:priority', ns)
            if priority is not None:
                try:
                    p = float(priority.text)
                    if not 0.0 <= p <= 1.0:
                        issues.append(f"Invalid priority {p} for {loc.text}")
                # BUG FIX: an empty <priority/> element has text None,
                # which makes float() raise TypeError, not ValueError —
                # catch both so the validator reports instead of crashing.
                except (TypeError, ValueError):
                    issues.append(f"Non-numeric priority for {loc.text}")
        return {
            'valid': len(issues) == 0,
            'url_count': len(urls),
            'size_bytes': size,
            'issues': issues
        }
Automated Generation with Cron
#!/usr/bin/env python3
"""Regenerate sitemap and submit to search engines."""
import requests
def main():
    """Regenerate the blog sitemap, validate it, and notify Google.

    Aborts (without pinging) when validation reports any issue.
    """
    gen = generate_blog_sitemap(
        posts_file='/home/user/site/blog/posts.json',
        base_url='https://stevecv.com'
    )
    output = '/home/user/site/public_html/sitemap.xml'
    gen.save(output)
    validator = SitemapValidator()
    result = validator.validate(output)
    print(f"Sitemap: {result['url_count']} URLs, {result['size_bytes']} bytes")
    if not result['valid']:
        print(f"Validation issues: {result['issues']}")
        return
    # NOTE(review): Google retired the sitemap ping endpoint in 2023;
    # listing the sitemap in robots.txt / Search Console is the supported
    # path now. Kept for backward compatibility with this deployment.
    ping_url = "https://www.google.com/ping?sitemap=https://stevecv.com/sitemap.xml"
    # BUG FIX: set a timeout so cron runs can't hang forever, and check
    # the response instead of unconditionally claiming success.
    response = requests.get(ping_url, timeout=10)
    if response.ok:
        print("Google pinged successfully")
    else:
        print(f"Ping failed with HTTP {response.status_code}")


if __name__ == '__main__':
    main()
Tips for Better Sitemaps
- Only include canonical URLs. No duplicate content with different parameters.
- Update lastmod only when the content actually changes, not on every build.
- Use priority to indicate relative importance within your own site, not absolute importance.
- Reference your sitemap in robots.txt:
Sitemap: https://yoursite.com/sitemap.xml
- For sites with more than 50,000 URLs, split the sitemap and point crawlers at a sitemap index file instead.
Sitemap generation is a small investment that pays continuous SEO dividends. Automate it once, run it on every deploy, and never think about it again.