#!/usr/bin/env python3
"""Convert an Evernote ``.enex`` export into Org files plus raw attachments.

Each note is written below ``en-export/<year>/<month>/<day>/`` (taken from
the note's creation time) as:

* ``<title>.org``  - the note body converted from HTML to Org by pandoc,
* ``<title>.bak``  - the unconverted body exactly as found in the export,
* ``<md5>.data``   - one file per attached resource that carries data.
"""

from base64 import b64decode
import hashlib
from io import BytesIO
import os
from time import strptime

from lxml import etree
from pypandoc import convert_text

# http://www.hanxiaogang.com/writing/parsing-evernote-export-file-enex-using-python/
# NOTE: this parser object is not referenced by any function in this file;
# it is kept because other code may import it.
p = etree.XMLParser(remove_blank_text=True, resolve_entities=False)

# Layout of the <created>/<updated> timestamps, e.g. 20200131T235959Z.
_TIMESTAMP_FORMAT = '%Y%m%dT%H%M%SZ'
# Root directory that receives the exported notes.
_EXPORT_ROOT = 'en-export'
# File names are built from at most this many characters of the title.
_MAX_TITLE_LEN = 20


def parse_content(content):
    """Convert a note's HTML body to Org markup using pandoc.

    Args:
        content: HTML source of the note body; may be ``None`` or empty.

    Returns:
        The Org-formatted text, or ``''`` when there is nothing to convert.
    """
    if not content:
        # An empty <content/> element yields text=None; there is nothing to
        # hand to pandoc in that case.
        return ''
    return convert_text(content, 'org', format='html')


def parse_resource(resource):
    """Turn a ``<resource>`` element into a plain dict.

    Args:
        resource: The ``<resource>`` element; its direct children are read.

    Returns:
        A dict mapping each child tag to that child's text. The ``data``
        child is base64-decoded to ``bytes`` and a ``hash`` key holding the
        hex MD5 digest of those bytes is added alongside it. A resource
        without a ``data`` child has neither ``data`` nor ``hash``.
    """
    rsc_dict = {}
    for elem in resource:
        if elem.tag == 'data':
            # Sometimes elem.text is None (empty <data/>).
            payload = b64decode(elem.text) if elem.text else b''
            rsc_dict[elem.tag] = payload
            rsc_dict['hash'] = hashlib.md5(payload).hexdigest()
        else:
            rsc_dict[elem.tag] = elem.text
    return rsc_dict


def parse_note(note):
    """Turn a ``<note>`` element into a plain dict.

    Args:
        note: The ``<note>`` element; its direct children are read.

    Returns:
        A dict with:

        * ``content``      - body converted to Org (see ``parse_content``),
        * ``content-raw``  - the body text as found in the file (may be None),
        * ``created`` / ``updated`` - ``time.struct_time`` values,
        * ``resource``     - list of dicts from ``parse_resource``,
        * every other child tag mapped to its text.

    Raises:
        ValueError: If a timestamp does not match ``%Y%m%dT%H%M%SZ``.
        TypeError: If a ``created``/``updated`` element has no text.
    """
    note_dict = {}
    resources = []
    for elem in note:
        tag = elem.tag
        if tag == 'content':
            note_dict[tag] = parse_content(elem.text)
            # A copy of the original content.
            note_dict['content-raw'] = elem.text
        elif tag == 'resource':
            resources.append(parse_resource(elem))
        elif tag in ('created', 'updated'):
            note_dict[tag] = strptime(elem.text, _TIMESTAMP_FORMAT)
        else:
            note_dict[tag] = elem.text
    note_dict['resource'] = resources
    return note_dict


def parseNoteXML(xmlFile):
    """Lazily parse an ``.enex`` file, yielding one dict per note.

    Args:
        xmlFile: Path or file object accepted by ``lxml.etree.iterparse``.

    Yields:
        The dict built by ``parse_note`` for every ``<note>`` element.
    """
    # Without huge_tree set to True, the parser may complain about huge text
    # nodes. Try to recover, because there may be "&nbsp;", which will cause
    # "XMLSyntaxError: Entity 'nbsp' not defined".
    context = etree.iterparse(xmlFile, encoding='utf-8', strip_cdata=False,
                              huge_tree=True, recover=True)
    for _action, elem in context:
        if elem.tag != 'note':
            continue
        yield parse_note(elem)
        # The yielded dict holds only str/bytes/struct_time copies, so the
        # element's subtree (including base64 attachments) can be dropped
        # to keep memory flat while streaming large exports.
        elem.clear()


# Save notes and attachments
# in directories named according to date of creation
def export_note(note):
    """Write one parsed note and its attachments below ``en-export/``.

    Args:
        note: A dict as produced by ``parse_note``. ``created`` is required;
            ``title``, ``content``, ``content-raw`` and ``resource`` may be
            missing or ``None`` and are then treated as empty.

    Raises:
        KeyError: If the note has no ``created`` entry.
        OSError: If a directory or file cannot be created.
    """
    date = note['created']
    note_dir = os.path.join(
        _EXPORT_ROOT,
        str(date.tm_year),
        '%02d' % date.tm_mon,
        '%02d' % date.tm_mday,
    )
    os.makedirs(note_dir, exist_ok=True)

    full_title = note.get('title') or ''
    # Remove "/" from filenames; fall back to a fixed name so an untitled
    # note does not end up as a bare ".org" file.
    # NOTE: notes created on the same day whose titles share the first
    # _MAX_TITLE_LEN characters still map to the same file name.
    file_stem = full_title.replace('/', ' ')[:_MAX_TITLE_LEN] or 'untitled'

    # Encoding is explicit because the locale default may not be able to
    # represent the note text.
    text_file = os.path.join(note_dir, file_stem + '.org')
    with open(text_file, 'w', encoding='utf-8') as fd:
        # Write the original (untruncated) title.
        fd.write('#+TITLE: ' + full_title + '\n')
        fd.write(note.get('content') or '')

    bak_file = os.path.join(note_dir, file_stem + '.bak')
    with open(bak_file, 'w', encoding='utf-8') as fd:
        fd.write(note.get('content-raw') or '')

    for resource in note.get('resource') or []:
        digest = resource.get('hash')
        if digest is None:
            # Resource had no <data> child, so there is nothing to save.
            continue
        rsc_file = os.path.join(note_dir, digest + '.data')
        with open(rsc_file, 'wb') as fd:
            fd.write(resource['data'])


if __name__ == '__main__':
    for note in parseNoteXML('mynote.enex'):
        export_note(note)