#! /usr/bin/env python
# -*- coding: utf-8 -*-
"""
Fetch an XML sitemap and list every page URL it references, following
nested sitemap files along the way.

Inspired by Craig Addyman (http://www.craigaddyman.com/parse-an-xml-sitemap-with-python/)
Enhanced by Viktor Petersson (http://viktorpetersson.com) / @vpetersson
Enhanced by Jari Turkia (https://blog.hqcodeshop.fi/) / @HQJaTu
"""

from urllib.parse import urlparse

import requests
from bs4 import BeautifulSoup

# Sitemap crawled when main() is called without an explicit URL.
DEFAULT_SITEMAP_URL = 'https://www.cloudsigma.com/sitemap.xml'

# Seconds to wait for the server before giving up on a request.
REQUEST_TIMEOUT = 30


def get_sitemap(url, timeout=REQUEST_TIMEOUT):
    """Download the document at *url* and return its body as text.

    Args:
        url: Address of the sitemap to fetch.
        timeout: Seconds passed to ``requests.get`` as its timeout, so a
            stalled server cannot block the crawl indefinitely.

    Returns:
        The response text when the server answers with HTTP 200, otherwise
        ``None`` (after printing a short message).

    Raises:
        requests.RequestException: Connection errors and timeouts raised by
            ``requests.get`` are not caught here.
    """
    response = requests.get(url, timeout=timeout)
    if response.status_code == 200:
        return response.text

    print('Unable to fetch sitemap: %s.' % url)
    return None


def process_sitemap(s):
    """Extract the contents of every ``<loc>`` element from sitemap text.

    Args:
        s: Sitemap document as a string. ``None`` or an empty string is
            accepted, which is what ``get_sitemap`` yields on a failed fetch.

    Returns:
        A list of the non-empty ``<loc>`` values in document order, with
        surrounding whitespace removed.
    """
    if not s:
        # Nothing to parse; avoid handing None to BeautifulSoup.
        return []

    soup = BeautifulSoup(s, 'lxml')
    # Pretty-printed sitemaps pad <loc> with whitespace, which would defeat
    # the ".xml" suffix test in is_sub_sitemap(), so strip it here.
    locations = (loc.text.strip() for loc in soup.find_all('loc'))
    return [location for location in locations if location]


def is_sub_sitemap(url):
    """Tell whether *url* looks like a link to another sitemap file.

    Args:
        url: URL taken from a ``<loc>`` element.

    Returns:
        ``True`` when the URL's path ends in ``.xml`` and contains the word
        ``sitemap``; ``False`` otherwise.
    """
    path = urlparse(url).path
    return path.endswith('.xml') and 'sitemap' in path


def parse_sitemap(s):
    """Collect every page URL reachable from the sitemap text *s*.

    Entries that look like sitemap files (see ``is_sub_sitemap``) are
    downloaded and expanded in turn; everything else is reported as a page.

    Args:
        s: Text of the top-level sitemap; ``None`` or empty yields ``[]``.

    Returns:
        A list of page URLs. Entries are taken from the end of the work list
        first, so the order is not the document order.
    """
    pending = process_sitemap(s)
    fetched_sitemaps = set()
    result = []

    while pending:
        candidate = pending.pop()
        if not is_sub_sitemap(candidate):
            result.append(candidate)
            continue

        # Sitemaps that reference themselves or each other would otherwise
        # keep this loop running forever; fetch each one only once.
        if candidate in fetched_sitemaps:
            continue
        fetched_sitemaps.add(candidate)

        # A failed download yields None, which process_sitemap() maps to [].
        pending.extend(process_sitemap(get_sitemap(candidate)))

    return result


def main(url=DEFAULT_SITEMAP_URL):
    """Print a numbered list of every page URL found under the sitemap *url*.

    Args:
        url: Address of the top-level sitemap to crawl.
    """
    sitemap = get_sitemap(url)
    for url_count, page_url in enumerate(parse_sitemap(sitemap), start=1):
        print("%5d) %s" % (url_count, page_url))
    print("-end-of-list-")


if __name__ == '__main__':
    main()