# Created
# December 4, 2014 07:10
# -
# -
# Save jirivrany/77d4f250f773e81064dc to your computer and use it in GitHub Desktop.
# This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
# Learn more about bidirectional Unicode characters
# -*- coding: UTF-8 -*-
"""
Parsing large files with the ElementTree iterative parser (iterparse).
"""
import bz2
import itertools
import xml.etree.ElementTree as etree
def fix_tag(ns, nsmap, tag):
    """Return *tag* qualified with a namespace URI in ``{uri}tag`` form.

    Args:
        ns: Namespace prefix to look up in *nsmap*; ``''`` selects the
            default namespace.
        nsmap: Mapping of namespace prefix to namespace URI.
        tag: Local (unqualified) element name.

    Returns:
        ``'{uri}tag'`` when *nsmap* holds a non-empty URI for *ns*;
        otherwise *tag* unchanged, since ElementTree reports elements that
        are in no namespace under their bare name.
    """
    uri = nsmap.get(ns)
    if not uri:
        # No (or an empty) namespace registered for this prefix: the bare
        # name is what ElementTree will report, so do not emit '{}tag'.
        return tag
    return '{{{}}}{}'.format(uri, tag)
def parse_dump(xml_fn):
    """Yield the title text of every ``page`` element in a bz2-compressed XML file.

    The file is read through ``bz2.BZ2File`` and parsed incrementally with
    ``etree.iterparse``. Namespace declarations are collected from the
    ``start-ns`` events so that the ``page`` and ``title`` tags can be
    matched in their namespace-qualified form. Each page element is cleared
    after it has been handled to release its children and text.

    Args:
        xml_fn: Path of the bz2-compressed XML file.

    Yields:
        The text of each page's ``title`` child element. Pages without a
        ``title`` child are skipped instead of raising ``AttributeError``.
    """
    with bz2.BZ2File(xml_fn, 'r') as fr:
        nsmap = {}
        # Qualified tag names, built lazily and invalidated whenever the
        # namespace map changes, so they are not re-formatted per element.
        page_tag = title_tag = None
        for event, elem in etree.iterparse(fr, events=('end', 'start-ns')):
            if event == 'start-ns':
                # For 'start-ns' events the payload is a (prefix, uri) pair.
                prefix, uri = elem
                nsmap[prefix] = uri
                page_tag = title_tag = None
                continue

            if page_tag is None:
                page_tag = fix_tag('', nsmap, 'page')
                title_tag = fix_tag('', nsmap, 'title')

            if elem.tag != page_tag:
                continue

            title_elem = elem.find(title_tag)
            if title_elem is not None:
                yield title_elem.text
            # The page is complete and no longer needed; drop its content.
            elem.clear()
if __name__ == '__main__':
    # Demo run: print the first `limit` page titles found in the dump file.
    fname = 'cswiki-latest-pages-articles.xml.bz2'
    limit = 10
    # islice stops consuming the generator after `limit` titles, so the rest
    # of the file is never parsed.
    for title in itertools.islice(parse_dump(fname), limit):
        # Function-call form works on both Python 2 and Python 3.
        print(title)
# Sign up for free
# to join this conversation on GitHub.
# Already have an account?
# Sign in to comment