search
Python

Generate and Download XML Sitemap with Python

Learn how to create sitemap.xml files in Python. Download sitemaps from websites and generate custom XML sitemaps for SEO.

By Gautam Sharma
December 31, 2024
6 min read
Python SEO XML Web Development

Sitemaps help search engines discover and index your website pages. Here’s how to generate and download sitemap XML files using Python.

Generate Basic Sitemap

No external libraries needed. This uses Python’s built-in xml.etree.ElementTree module; note that ET.indent() requires Python 3.9 or newer.

from datetime import datetime
import xml.etree.ElementTree as ET

def generate_sitemap(urls, filename='sitemap.xml'):
    """Write a sitemap ``<urlset>`` document describing *urls* to *filename*.

    Args:
        urls: Iterable of dicts. Each dict must contain ``'loc'``; the keys
            ``'lastmod'``, ``'changefreq'`` and ``'priority'`` are optional
            and are emitted, in that order, only when present.
        filename: Path of the XML file to write (UTF-8 with an XML
            declaration, indented by two spaces).
    """
    root = ET.Element(
        'urlset',
        {'xmlns': 'http://www.sitemaps.org/schemas/sitemap/0.9'},
    )

    for entry in urls:
        node = ET.SubElement(root, 'url')
        ET.SubElement(node, 'loc').text = entry['loc']

        # lastmod and changefreq are copied through exactly as supplied
        for tag in ('lastmod', 'changefreq'):
            if tag in entry:
                ET.SubElement(node, tag).text = entry[tag]

        # str() lets callers pass the priority as a number
        if 'priority' in entry:
            ET.SubElement(node, 'priority').text = str(entry['priority'])

    document = ET.ElementTree(root)
    ET.indent(document, space='  ')
    document.write(filename, encoding='utf-8', xml_declaration=True)
    print(f"Sitemap saved to {filename}")

# Example usage: three pages; the last one deliberately omits 'lastmod'
urls = [
    dict(
        loc='https://example.com/',
        lastmod=datetime.now().strftime('%Y-%m-%d'),
        changefreq='daily',
        priority=1.0,
    ),
    dict(
        loc='https://example.com/about',
        lastmod='2024-12-30',
        changefreq='monthly',
        priority=0.8,
    ),
    dict(
        loc='https://example.com/blog',
        changefreq='weekly',
        priority=0.9,
    ),
]

# No filename given, so the default 'sitemap.xml' is written
generate_sitemap(urls)

Download Sitemap from URL

Fetch and save sitemaps from websites. This example uses the third-party requests library (install it with pip install requests).

import requests

def download_sitemap(url, filename='downloaded_sitemap.xml', timeout=30):
    """Download the sitemap at *url* and save it to *filename*.

    The response body is written byte-for-byte rather than decoded and
    re-encoded, so the saved file always matches the encoding named in its
    own XML declaration and gzip-compressed sitemaps are not corrupted.

    Args:
        url: Address of the sitemap to fetch.
        filename: Path the downloaded bytes are written to.
        timeout: Seconds to wait for the server before giving up; without
            one, ``requests.get`` can block indefinitely.

    Request failures (connection errors, timeouts, HTTP error statuses)
    are reported on stdout and not re-raised. Errors opening or writing
    *filename* propagate to the caller.
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()

        # Binary mode + .content: keep the server's bytes untouched
        with open(filename, 'wb') as f:
            f.write(response.content)

        print(f"Sitemap downloaded to {filename}")
    except requests.exceptions.RequestException as e:
        print(f"Error downloading sitemap: {e}")

# Download sitemap: no filename is passed, so the function's default
# ('downloaded_sitemap.xml') is used as the output path
download_sitemap('https://example.com/sitemap.xml')

Parse Existing Sitemap

Extract URLs from sitemap XML files.

import xml.etree.ElementTree as ET

def parse_sitemap(filename):
    """Read a sitemap file and return its URL entries as a list of dicts.

    Args:
        filename: Path (or file object) of a sitemap ``<urlset>`` document
            in the ``http://www.sitemaps.org/schemas/sitemap/0.9`` namespace.

    Returns:
        One dict per ``<url>`` element, in document order, with the keys
        ``'loc'``, ``'lastmod'``, ``'changefreq'`` and ``'priority'``.
        Values are the element text with surrounding whitespace removed,
        or ``None`` when the element is absent or blank.

    Raises:
        xml.etree.ElementTree.ParseError: If the file is not well-formed XML.
        OSError: If the file cannot be read.
    """
    root = ET.parse(filename).getroot()

    # Handle namespace: sitemap elements all live in this default namespace
    ns = {'sm': 'http://www.sitemaps.org/schemas/sitemap/0.9'}
    fields = ('loc', 'lastmod', 'changefreq', 'priority')

    urls = []
    for url in root.findall('sm:url', ns):
        entry = {}
        for field in fields:
            node = url.find(f'sm:{field}', ns)
            # Hand-written sitemaps often wrap values in newlines/indentation;
            # strip so callers receive a usable value.
            text = node.text.strip() if node is not None and node.text else ''
            entry[field] = text or None
        urls.append(entry)

    return urls

# Parse and display: one line per URL; the priority shows as "None" for
# entries whose <priority> element is absent
urls = parse_sitemap('sitemap.xml')
for url in urls:
    print(f"{url['loc']} (Priority: {url['priority']})")

Generate from Directory

Scan a directory and create a sitemap automatically.

import os
from datetime import datetime
import xml.etree.ElementTree as ET

def generate_from_directory(directory, base_url, filename='sitemap.xml'):
    """Build a sitemap from every ``.html`` file found under *directory*.

    Each file's path relative to *directory* becomes a URL under
    *base_url*. A file named ``index.html`` is mapped to its containing
    folder URL (``.../blog/index.html`` -> ``.../blog/``). Path segments
    are percent-encoded so names containing spaces or other reserved
    characters still produce valid URLs.

    Args:
        directory: Root folder to scan recursively.
        base_url: Site origin the relative paths are appended to; a
            trailing slash is ignored.
        filename: Path of the sitemap file to write.

    Every entry gets ``lastmod`` from the file's modification time (local
    date), ``changefreq`` of ``weekly`` and ``priority`` of ``0.8``.
    """
    # Local import keeps this snippet runnable with its existing imports
    from urllib.parse import quote

    site_root = base_url.rstrip('/')
    urls = []

    for root, dirs, files in os.walk(directory):
        # Sorting in place fixes the traversal order, so repeated runs over
        # the same tree produce the same file.
        dirs.sort()
        for file in sorted(files):
            if not file.endswith('.html'):
                continue

            file_path = os.path.join(root, file)
            segments = os.path.relpath(file_path, directory).split(os.sep)

            # Handle index.html: drop only the final filename segment.
            # A blanket str.replace would also mangle folders or hosts
            # whose name happens to contain "index.html".
            if file == 'index.html':
                segments[-1] = ''

            url = site_root + '/' + '/'.join(quote(part) for part in segments)

            # Get modification time
            mod_time = os.path.getmtime(file_path)
            lastmod = datetime.fromtimestamp(mod_time).strftime('%Y-%m-%d')

            urls.append({
                'loc': url,
                'lastmod': lastmod,
                'changefreq': 'weekly',
                'priority': 0.8
            })

    # Generate sitemap
    urlset = ET.Element('urlset')
    urlset.set('xmlns', 'http://www.sitemaps.org/schemas/sitemap/0.9')

    for url_data in urls:
        url_elem = ET.SubElement(urlset, 'url')
        ET.SubElement(url_elem, 'loc').text = url_data['loc']
        ET.SubElement(url_elem, 'lastmod').text = url_data['lastmod']
        ET.SubElement(url_elem, 'changefreq').text = url_data['changefreq']
        ET.SubElement(url_elem, 'priority').text = str(url_data['priority'])

    tree = ET.ElementTree(urlset)
    ET.indent(tree, space='  ')
    tree.write(filename, encoding='utf-8', xml_declaration=True)
    print(f"Generated sitemap with {len(urls)} URLs")

# Generate from dist folder: the .html files under ./dist become URLs
# rooted at https://example.com, written to the default 'sitemap.xml'
generate_from_directory('./dist', 'https://example.com')

Sitemap Index

Create a sitemap index for multiple sitemaps.

import xml.etree.ElementTree as ET
from datetime import datetime

def generate_sitemap_index(sitemaps, filename='sitemap-index.xml'):
    """Write a ``<sitemapindex>`` document listing *sitemaps* to *filename*.

    Args:
        sitemaps: Iterable of sitemap URLs (strings), one ``<sitemap>``
            entry is produced per item, in order.
        filename: Path of the index file to write (UTF-8 with an XML
            declaration, indented by two spaces).
    """
    index_root = ET.Element(
        'sitemapindex',
        {'xmlns': 'http://www.sitemaps.org/schemas/sitemap/0.9'},
    )

    for sitemap_url in sitemaps:
        entry = ET.SubElement(index_root, 'sitemap')
        ET.SubElement(entry, 'loc').text = sitemap_url

        # Each entry is stamped with the current local date
        stamp = datetime.now().strftime('%Y-%m-%d')
        ET.SubElement(entry, 'lastmod').text = stamp

    document = ET.ElementTree(index_root)
    ET.indent(document, space='  ')
    document.write(filename, encoding='utf-8', xml_declaration=True)
    print(f"Sitemap index saved to {filename}")

# Create index: each item is the absolute URL of one child sitemap
sitemaps = [
    'https://example.com/sitemap-posts.xml',
    'https://example.com/sitemap-pages.xml',
    'https://example.com/sitemap-products.xml'
]

# No filename given, so the default 'sitemap-index.xml' is written
generate_sitemap_index(sitemaps)

Validate Sitemap

Check whether a sitemap is properly formatted.

import xml.etree.ElementTree as ET

def validate_sitemap(filename):
    """Check that *filename* is a well-formed sitemap whose URLs all have a loc.

    The checks are: the file parses as XML, the root element is in a
    ``sitemaps.org/schemas/sitemap`` namespace, and every ``<url>`` entry
    has a ``<loc>`` child with non-blank text. Findings are printed to
    stdout.

    Args:
        filename: Path (or file object) of the sitemap to check.

    Returns:
        ``True`` when all checks pass, ``False`` otherwise. Failures to
        read or parse the file are reported and returned as ``False``
        rather than raised.
    """
    try:
        root = ET.parse(filename).getroot()

        # Check namespace: ElementTree embeds it in the tag as "{ns}name"
        if 'sitemaps.org/schemas/sitemap' not in root.tag:
            print("Invalid sitemap namespace")
            return False

        ns = {'sm': 'http://www.sitemaps.org/schemas/sitemap/0.9'}
        urls = root.findall('sm:url', ns)

        print(f"Valid sitemap with {len(urls)} URLs")

        # Check each URL
        for idx, url in enumerate(urls, 1):
            loc = url.find('sm:loc', ns)
            # strip() so a <loc> holding only spaces/newlines counts as
            # missing instead of passing as a valid URL
            if loc is None or not (loc.text or '').strip():
                print(f"URL {idx}: Missing loc element")
                return False

        print("All URLs have valid loc elements")
        return True

    except ET.ParseError as e:
        print(f"XML parsing error: {e}")
        return False
    except Exception as e:
        # Deliberate catch-all (e.g. unreadable file): this function
        # reports problems and returns False instead of raising.
        print(f"Validation error: {e}")
        return False

# Validate the sitemap.xml in the current directory; the True/False result
# is discarded here because the function prints its own findings
validate_sitemap('sitemap.xml')

Dynamic Sitemap from Database

Generate a sitemap from database records.

import sqlite3
from datetime import datetime
import xml.etree.ElementTree as ET

def generate_from_database(db_path, base_url, filename='sitemap.xml'):
    """Build a sitemap from the published rows of a SQLite ``posts`` table.

    Each published post becomes ``<base_url>/blog/<slug>`` with a fixed
    priority of ``0.8``. ``lastmod`` is taken from ``updated_at`` and is
    omitted for rows where that column is NULL.

    Args:
        db_path: Path of the SQLite database file.
        base_url: Site origin the post URLs are built on; a trailing
            slash is ignored.
        filename: Path of the sitemap file to write.

    Raises:
        sqlite3.Error: If the query fails (for example, the table or one
            of the selected columns does not exist).
    """
    conn = sqlite3.connect(db_path)
    try:
        # Query posts (adjust for your schema)
        rows = conn.execute("""
            SELECT slug, updated_at, category
            FROM posts
            WHERE published = 1
        """).fetchall()
    finally:
        # Close even when the query raises, so the handle never leaks
        conn.close()

    # Avoid "https://example.com//blog/..." when base_url ends with "/"
    site_root = base_url.rstrip('/')

    urlset = ET.Element('urlset')
    urlset.set('xmlns', 'http://www.sitemaps.org/schemas/sitemap/0.9')

    for slug, updated_at, _category in rows:
        url_elem = ET.SubElement(urlset, 'url')

        ET.SubElement(url_elem, 'loc').text = f"{site_root}/blog/{slug}"

        # An empty <lastmod/> is not a valid date, so skip it for NULLs;
        # str() guards against non-text column values.
        if updated_at is not None:
            ET.SubElement(url_elem, 'lastmod').text = str(updated_at)

        ET.SubElement(url_elem, 'priority').text = '0.8'

    tree = ET.ElementTree(urlset)
    ET.indent(tree, space='  ')
    tree.write(filename, encoding='utf-8', xml_declaration=True)
    print("Generated sitemap from database")

# Generate from SQLite database: blog.db must contain a posts table with
# slug, updated_at, category and published columns (see the query above)
generate_from_database('blog.db', 'https://example.com')

Compress Sitemap

Compress large sitemaps with gzip.

import gzip
import xml.etree.ElementTree as ET

def generate_compressed_sitemap(urls, filename='sitemap.xml.gz'):
    """Write a gzip-compressed sitemap ``<urlset>`` for *urls* to *filename*.

    Args:
        urls: Iterable of dicts. Each must contain ``'loc'``; the keys
            ``'lastmod'``, ``'changefreq'`` and ``'priority'`` are optional
            and are emitted, in that order, only when present.
        filename: Path of the ``.gz`` file to write.
    """
    # Create XML
    urlset = ET.Element('urlset')
    urlset.set('xmlns', 'http://www.sitemaps.org/schemas/sitemap/0.9')

    for url_data in urls:
        url_elem = ET.SubElement(urlset, 'url')
        ET.SubElement(url_elem, 'loc').text = url_data['loc']

        # Keep optional metadata instead of silently dropping it
        for tag in ('lastmod', 'changefreq', 'priority'):
            if tag in url_data:
                ET.SubElement(url_elem, tag).text = str(url_data[tag])

    tree = ET.ElementTree(urlset)
    ET.indent(tree, space='  ')

    # Write compressed. Binary mode with an explicit utf-8 encoding makes
    # ElementTree encode the bytes itself, so the encoding named in the XML
    # declaration always matches the data (text mode with
    # encoding='unicode' can declare the locale's encoding instead).
    with gzip.open(filename, 'wb') as f:
        tree.write(f, encoding='utf-8', xml_declaration=True)

    print(f"Compressed sitemap saved to {filename}")

# Generate compressed sitemap: build 1,000 example entries (page-0 through
# page-999) and write them to the default 'sitemap.xml.gz'
urls = [{'loc': f'https://example.com/page-{i}'} for i in range(1000)]
generate_compressed_sitemap(urls)

Complete CLI Tool

Full command-line sitemap generator.

import argparse
import xml.etree.ElementTree as ET
from datetime import datetime

def create_sitemap_cli():
    """Parse command-line arguments and write a sitemap for the given pages.

    Options read from ``sys.argv``:
        --url     Base URL (required).
        --pages   One or more page paths (required).
        --output  Output file, ``sitemap.xml`` unless given.

    Each page is joined to the base URL with exactly one slash between
    them and stamped with the current local date as ``lastmod``.
    """
    parser = argparse.ArgumentParser(description='Generate XML sitemap')
    option_table = (
        ('--url', {'required': True, 'help': 'Base URL'}),
        ('--pages', {'nargs': '+', 'required': True, 'help': 'Page paths'}),
        ('--output', {'default': 'sitemap.xml', 'help': 'Output file'}),
    )
    for flag, settings in option_table:
        parser.add_argument(flag, **settings)

    args = parser.parse_args()

    # Normalise once: the base never ends with "/", pages never start with it
    base = args.url.rstrip('/')

    root = ET.Element(
        'urlset',
        {'xmlns': 'http://www.sitemaps.org/schemas/sitemap/0.9'},
    )

    for page in args.pages:
        node = ET.SubElement(root, 'url')
        ET.SubElement(node, 'loc').text = base + '/' + page.lstrip('/')
        ET.SubElement(node, 'lastmod').text = datetime.now().strftime('%Y-%m-%d')

    document = ET.ElementTree(root)
    ET.indent(document, space='  ')
    document.write(args.output, encoding='utf-8', xml_declaration=True)

    print(f"Generated {args.output} with {len(args.pages)} URLs")

# Run the CLI only when this file is executed as a script, not on import
if __name__ == '__main__':
    create_sitemap_cli()

Run from terminal:

python sitemap_generator.py --url https://example.com --pages / about blog contact

Quick Reference

Basic Structure:

  • <urlset> - Root element with namespace
  • <url> - Container for each URL entry
  • <loc> - URL location (required)
  • <lastmod> - Last modification date (optional)
  • <changefreq> - Update frequency (optional)
  • <priority> - Priority 0.0 to 1.0 (optional)

Change Frequency Values:

  • always - Changes every access
  • hourly - Changes hourly
  • daily - Changes daily
  • weekly - Changes weekly
  • monthly - Changes monthly
  • yearly - Changes yearly
  • never - Archived content

Best Practices:

  • Limit to 50,000 URLs per sitemap
  • Max file size 50MB (uncompressed)
  • Use sitemap index for larger sites
  • Include only canonical URLs
  • Use absolute URLs with protocol
  • Compress large sitemaps with gzip

Common Issues:

  • Wrong namespace causes validation errors
  • Missing <?xml> declaration
  • Using relative URLs instead of absolute
  • Special characters not escaped
  • Incorrect date format (use W3C Datetime, e.g. YYYY-MM-DD)

Conclusion

Python makes sitemap generation simple with its built-in XML library. Use the basic generator for static sites, database queries for dynamic content, or directory scanning for file-based sites. All these methods create valid XML sitemaps that search engines can crawl.

Gautam Sharma

About Gautam Sharma

Full-stack developer and tech blogger sharing coding tutorials and best practices

Related Articles

Python

Generate Excel Files from Raw Data with Python

Quick guide to creating Excel files from raw data using Python. Learn to use openpyxl, xlsxwriter, and pandas for Excel generation.

December 31, 2024
Python

Python FFMPEG Integration: Edit Videos in Terminal

Master video editing from the command line using Python and FFmpeg. Learn to trim, merge, compress, and manipulate videos programmatically.

December 31, 2024
Python

Read and Write CSV Files with Python

Simple guide to reading and writing CSV files in Python using csv module and pandas. Quick examples for data processing.

December 31, 2024