import io
import zipfile
from xml.dom import minidom

try:
    import cStringIO  # Python 2: keep using the original in-memory buffer
except ImportError:
    cStringIO = None  # Python 3: the module is gone, io.BytesIO is used instead


class ExtractPptNotes(object):
    """Extract speaker notes from the raw bytes of an OpenXML presentation.

    The presentation is unzipped once, at construction time, into an
    in-memory store shaped like ``chunks[folder][file_name] = file_bytes``.
    Notes are then read from the XML parts stored under ``ppt/notesSlides``.
    """

    # Folder inside the archive that holds one XML part per notes slide.
    _NOTES_FOLDER = u'ppt/notesSlides'

    _chunks = None

    def __init__(self, file_data):
        """Unzip ``file_data`` (the raw bytes of the presentation file).

        Raises:
            zipfile.BadZipfile: if ``file_data`` is not a valid zip archive.
        """
        self._chunks = self.__inflate_file_data(file_data)

    def get_all_slide_notes(self):
        """Return ``{notes_part_name: [text, ...]}`` for every notes part.

        A presentation without any notes has no ``ppt/notesSlides`` folder;
        in that case an empty dict is returned instead of raising KeyError.
        """
        notes_parts = self._chunks.get(self._NOTES_FOLDER, {})
        return dict(
            (slide_name, self.get_slide_notes(slide_name))
            for slide_name in notes_parts
        )

    def get_slide_notes(self, slide_name):
        """Return the text runs found in one notes part, in document order.

        Args:
            slide_name: file name of the part inside ``ppt/notesSlides``.

        Returns:
            A list with the value of every text node that is a direct child
            of an ``a:t`` element.

        Raises:
            KeyError: if the archive has no such notes part.
        """
        slide_xml_string = self._chunks[self._NOTES_FOLDER][slide_name]
        slide_xml = minidom.parseString(slide_xml_string)
        return [
            child_node.nodeValue
            for node in slide_xml.getElementsByTagName('a:t')
            for child_node in node.childNodes
            if child_node.nodeType == child_node.TEXT_NODE
        ]

    def __inflate_file_data(self, file_data):
        """Unzip ``file_data`` and return the chunk store.

        Each archive member is stored as
        ``chunks[chunk_path][chunk_filename] = chunk_data``; members that sit
        at the archive root are filed under the path ``'/'``.
        """
        if cStringIO is not None:
            file_stream = cStringIO.StringIO(file_data)
        else:
            file_stream = io.BytesIO(file_data)

        chunks = {}
        try:
            openxml_zip = zipfile.ZipFile(
                file_stream, 'r', zipfile.ZIP_DEFLATED)
            try:
                for chunk_file_path in openxml_zip.namelist():
                    # Directory entries carry no data; storing them would
                    # add an empty, unnamed chunk that is not valid XML.
                    if chunk_file_path.endswith('/'):
                        continue

                    chunk_path, separator, chunk_name = (
                        chunk_file_path.rpartition('/'))
                    if not separator:
                        chunk_path = '/'

                    chunks.setdefault(chunk_path, {})[chunk_name] = (
                        openxml_zip.read(chunk_file_path))
            finally:
                openxml_zip.close()
        finally:
            # Release the buffer even when the archive is corrupt.
            file_stream.close()

        return chunks