Skip to content

Instantly share code, notes, and snippets.

@terencezl
Created April 20, 2017 04:39

Revisions

  1. terencezl created this gist Apr 20, 2017.
    31 changes: 31 additions & 0 deletions convert_pdf.py
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,31 @@
    from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
    from pdfminer.converter import TextConverter, XMLConverter, HTMLConverter
    from pdfminer.layout import LAParams
    from pdfminer.pdfpage import PDFPage
    from io import BytesIO

    def convert_pdf(path, format='text', codec='utf-8', password=''):
        """Convert a PDF file to text, HTML or XML and return it as a string.

        Args:
            path: Filesystem path of the PDF to read (opened in binary mode).
            format: Output flavour: ``'text'``, ``'html'`` or ``'xml'``.
            codec: Encoding the pdfminer converter writes with; the same
                encoding is used to decode the result back into ``str``.
            password: Password handed to pdfminer for encrypted documents.

        Returns:
            The converted document as a ``str``.

        Raises:
            ValueError: If ``format`` is not one of the supported values.
            OSError: If ``path`` cannot be opened.
        """
        # Validate first so a bad format fails before any resource is created.
        if format not in ('text', 'html', 'xml'):
            raise ValueError('provide format, either text, html or xml!')
        converter_cls = {
            'text': TextConverter,
            'html': HTMLConverter,
            'xml': XMLConverter,
        }[format]

        rsrcmgr = PDFResourceManager()
        # The context managers guarantee the buffer and the file are released
        # even when pdfminer raises part-way through the document.
        with BytesIO() as retstr, open(path, 'rb') as fp:
            device = converter_cls(rsrcmgr, retstr, codec=codec,
                                   laparams=LAParams())
            try:
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                # Empty page set and maxpages=0: no page restriction is passed
                # (presumably pdfminer treats both as "all pages" -- confirm).
                pages = PDFPage.get_pages(fp, set(), maxpages=0,
                                          password=password, caching=True,
                                          check_extractable=True)
                for page in pages:
                    interpreter.process_page(page)
            finally:
                # Close the device BEFORE reading the buffer: the HTML/XML
                # converters are expected to emit their closing footer here.
                device.close()
            # Decode with the codec the converter encoded with, not the
            # interpreter default.
            return retstr.getvalue().decode(codec)