Created
December 30, 2012 04:22
-
-
Save anonymous/4410965 to your computer and use it in GitHub Desktop.
Python text to pdf converter
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
## {{{ http://code.activestate.com/recipes/532908/ (r3) | |
#! /usr/bin/env python | |
""" | |
pyText2Pdf - Python script to convert plain text files into Adobe | |
Acrobat PDF files with support for arbitrary page breaks etc. | |
Version 2.0 | |
Author: Anand B Pillai <abpillai at gmail dot com> | |
""" | |
# Derived from http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/189858 | |
import sys, os | |
import string | |
import time | |
import optparse | |
import re | |
LF_EXTRA=0 | |
LINE_END='\015' | |
# form feed character (^L) | |
FF=chr(12) | |
ENCODING_STR = """\ | |
/Encoding << | |
/Differences [ 0 /.notdef /.notdef /.notdef /.notdef | |
/.notdef /.notdef /.notdef /.notdef /.notdef /.notdef | |
/.notdef /.notdef /.notdef /.notdef /.notdef /.notdef | |
/.notdef /.notdef /.notdef /.notdef /.notdef /.notdef | |
/.notdef /.notdef /.notdef /.notdef /.notdef /.notdef | |
/.notdef /.notdef /.notdef /.notdef /space /exclam | |
/quotedbl /numbersign /dollar /percent /ampersand | |
/quoteright /parenleft /parenright /asterisk /plus /comma | |
/hyphen /period /slash /zero /one /two /three /four /five | |
/six /seven /eight /nine /colon /semicolon /less /equal | |
/greater /question /at /A /B /C /D /E /F /G /H /I /J /K /L | |
/M /N /O /P /Q /R /S /T /U /V /W /X /Y /Z /bracketleft | |
/backslash /bracketright /asciicircum /underscore | |
/quoteleft /a /b /c /d /e /f /g /h /i /j /k /l /m /n /o /p | |
/q /r /s /t /u /v /w /x /y /z /braceleft /bar /braceright | |
/asciitilde /.notdef /.notdef /.notdef /.notdef /.notdef | |
/.notdef /.notdef /.notdef /.notdef /.notdef /.notdef | |
/.notdef /.notdef /.notdef /.notdef /.notdef /.notdef | |
/dotlessi /grave /acute /circumflex /tilde /macron /breve | |
/dotaccent /dieresis /.notdef /ring /cedilla /.notdef | |
/hungarumlaut /ogonek /caron /space /exclamdown /cent | |
/sterling /currency /yen /brokenbar /section /dieresis | |
/copyright /ordfeminine /guillemotleft /logicalnot /hyphen | |
/registered /macron /degree /plusminus /twosuperior | |
/threesuperior /acute /mu /paragraph /periodcentered | |
/cedilla /onesuperior /ordmasculine /guillemotright | |
/onequarter /onehalf /threequarters /questiondown /Agrave | |
/Aacute /Acircumflex /Atilde /Adieresis /Aring /AE | |
/Ccedilla /Egrave /Eacute /Ecircumflex /Edieresis /Igrave | |
/Iacute /Icircumflex /Idieresis /Eth /Ntilde /Ograve | |
/Oacute /Ocircumflex /Otilde /Odieresis /multiply /Oslash | |
/Ugrave /Uacute /Ucircumflex /Udieresis /Yacute /Thorn | |
/germandbls /agrave /aacute /acircumflex /atilde /adieresis | |
/aring /ae /ccedilla /egrave /eacute /ecircumflex | |
/edieresis /igrave /iacute /icircumflex /idieresis /eth | |
/ntilde /ograve /oacute /ocircumflex /otilde /odieresis | |
/divide /oslash /ugrave /uacute /ucircumflex /udieresis | |
/yacute /thorn /ydieresis ] | |
>> | |
""" | |
INTRO="""\ | |
%prog [options] filename | |
PyText2Pdf makes a 7-bit clean PDF file from any input file. | |
It reads from a named file, and writes the PDF file to a file specified by | |
the user, otherwise to a file with '.pdf' appended to the input file. | |
Author: Anand B Pillai.""" | |
class PyText2Pdf(object): | |
""" Text2pdf converter in pure Python """ | |
def __init__(self): | |
# version number | |
self._version="1.3" | |
# iso encoding flag | |
self._IsoEnc=False | |
# formfeeds flag | |
self._doFFs=False | |
self._progname="PyText2Pdf" | |
self._appname = " ".join((self._progname,str(self._version))) | |
# default font | |
self._font="/Courier" | |
# default font size | |
self._ptSize=10 | |
# default vert space | |
self._vertSpace=12 | |
self._lines=0 | |
# number of characters in a row | |
self._cols=80 | |
self._columns=1 | |
# page ht | |
self._pageHt=792 | |
# page wd | |
self._pageWd=612 | |
# input file | |
self._ifile="" | |
# output file | |
self._ofile="" | |
# default tab width | |
self._tab=4 | |
# input file descriptor | |
self._ifs=None | |
# output file descriptor | |
self._ofs=None | |
# landscape flag | |
self._landscape=False | |
# Subject | |
self._subject = '' | |
# Author | |
self._author = '' | |
# Keywords | |
self._keywords = [] | |
# Custom regexp for page breaks | |
self._pagebreakre = None | |
# marker objects | |
self._curobj = 5 | |
self._pageObs = [0] | |
self._locations = [0,0,0,0,0,0] | |
self._pageNo=0 | |
# file position marker | |
self._fpos=0 | |
def parse_args(self): | |
""" Callback function called by argument parser. | |
Helps to remove duplicate code """ | |
if len(sys.argv)<2: | |
sys.argv.append('-h') | |
parser = optparse.OptionParser(usage=INTRO) | |
parser.add_option('-o','--output',dest='outfile',help='Direct output to file OUTFILE',metavar='OUTFILE') | |
parser.add_option('-f','--font',dest='font',help='Use Postscript font FONT (must be in standard 14, default: Courier)', | |
default='Courier') | |
parser.add_option('-I','--isolatin',dest='isolatin',help='Use ISO latin-1 encoding',default=False,action='store_true') | |
parser.add_option('-s','--size',dest='fontsize',help='Use font at PTSIZE points (default=>10)',metavar='PTSIZE',default=10) | |
parser.add_option('-v','--linespace',dest='linespace',help='Use line spacing LINESPACE (deault 12)',metavar='LINESPACE',default=12) | |
parser.add_option('-l','--lines',dest='lines',help='Lines per page (default 60, determined automatically if unspecified)',default=60, metavar=None) | |
parser.add_option('-c','--chars',dest='chars',help='Maximum characters per line (default 80)',default=80,metavar=None) | |
parser.add_option('-t','--tab',dest='tabspace',help='Spaces per tab character (default 4)',default=4,metavar=None) | |
parser.add_option('-F','--ignoreff',dest='formfeed',help='Ignore formfeed character ^L (i.e, accept formfeed characters as pagebreaks)',default=False,action='store_true') | |
parser.add_option('-P','--papersize',dest='papersize',help='Set paper size (default is letter, accepted values are "A4" or "A3")') | |
parser.add_option('-W','--width',dest='width',help='Independent paper width in points',metavar=None,default=612) | |
parser.add_option('-H','--height',dest='height',help='Independent paper height in points',metavar=None,default=792) | |
parser.add_option('-2','--twocolumns',dest='twocolumns',help='Format as two columns',metavar=None,default=False,action='store_true') | |
parser.add_option('-L','--landscape',dest='landscape',help='Format in landscape mode',metavar=None,default=False,action='store_true') | |
parser.add_option('-R','--regexp',dest='pageregexp',help='Regular expression string to determine page breaks (if supplied, this will be used to split text into pages, instead of using line count)',metavar=None) | |
parser.add_option('-S','--subject',dest='subject',help='Optional subject for the document',metavar=None) | |
parser.add_option('-A','--author',dest='author',help='Optional author for the document',metavar=None) | |
parser.add_option('-K','--keywords',dest='keywords',help='Optional list of keywords for the document (separated by commas)',metavar=None) | |
optlist, args = parser.parse_args() | |
# print optlist.__dict__, args | |
if len(args)==0: | |
sys.exit('Error: input file argument missing') | |
elif len(args)>1: | |
sys.exit('Error: Too many arguments') | |
self._ifile = args[0] | |
d = optlist.__dict__ | |
if d.get('isolatin'): self._IsoEnc=True | |
if d.get('formfeed'): self._doFFs = True | |
if d.get('twocolumns'): self._columns = 2 | |
if d.get('landscape'): self._landscape = True | |
self._font = '/' + d.get('font') | |
psize = d.get('papersize') | |
if psize=='A4': | |
self._pageWd=595 | |
self._pageHt=842 | |
elif psize=='A3': | |
self._pageWd=842 | |
self._pageHt=1190 | |
fsize = int(d.get('fontsize')) | |
if fsize < 1: fsize = 1 | |
self._ptSize = fsize | |
lspace = int(d.get('linespace')) | |
if lspace<1: lspace = 1 | |
self._vertSpace = lspace | |
lines = int(d.get('lines')) | |
if lines<1: lines = 1 | |
self._lines = int(lines) | |
chars = int(d.get('chars')) | |
if chars<4: chars = 4 | |
self._cols = chars | |
tab = int(d.get('tabspace')) | |
if tab<1: tab = 1 | |
self._tab = tab | |
w = int(d.get('width')) | |
if w<72: w=72 | |
self._pageWd = w | |
h = int(d.get('height')) | |
if h<72: h=72 | |
self._pageHt = h | |
# Very optional args | |
author = d.get('author') | |
if author: self._author = author | |
subject = d.get('subject') | |
if subject: self._subject = subject | |
keywords = d.get('keywords') | |
if keywords: | |
self._keywords = keywords.split(',') | |
pagebreak = d.get('pageregexp') | |
if pagebreak: | |
self._pagebreakre = re.compile(pagebreak, re.UNICODE|re.IGNORECASE) | |
outfile = d.get('outfile') | |
if outfile: self._ofile = outfile | |
if self._landscape: | |
print 'Landscape option on...' | |
if self._columns==2: | |
print 'Printing in two columns...' | |
if self._doFFs: | |
print 'Ignoring form feed character...' | |
if self._IsoEnc: | |
print 'Using ISO Latin Encoding...' | |
print 'Using font',self._font[1:],'size =', self._ptSize | |
def writestr(self, str): | |
""" Write string to output file descriptor. | |
All output operations go through this function. | |
We keep the current file position also here""" | |
# update current file position | |
self._fpos += len(str) | |
for x in range(0, len(str)): | |
if str[x] == '\n': | |
self._fpos += LF_EXTRA | |
try: | |
self._ofs.write(str) | |
except IOError, e: | |
print e | |
return -1 | |
return 0 | |
def convert(self): | |
""" Perform the actual conversion """ | |
if self._landscape: | |
# swap page width & height | |
tmp = self._pageHt | |
self._pageHt = self._pageWd | |
self._pageWd = tmp | |
if self._lines==0: | |
self._lines = (self._pageHt - 72)/self._vertSpace | |
if self._lines < 1: | |
self._lines=1 | |
try: | |
self._ifs=open(self._ifile) | |
except IOError, (strerror, errno): | |
print 'Error: Could not open file to read --->', self._ifile | |
sys.exit(3) | |
if self._ofile=="": | |
self._ofile = os.path.splitext(self._ifile)[0] + '.pdf' | |
try: | |
self._ofs = open(self._ofile, 'wb') | |
except IOError, (strerror, errno): | |
print 'Error: Could not open file to write --->', self._ofile | |
sys.exit(3) | |
print 'Input file=>',self._ifile | |
print 'Writing pdf file',self._ofile, '...' | |
self.writeheader() | |
self.writepages() | |
self.writerest() | |
print 'Wrote file', self._ofile | |
self._ifs.close() | |
self._ofs.close() | |
return 0 | |
def writeheader(self): | |
"""Write the PDF header""" | |
ws = self.writestr | |
title = self._ifile | |
t=time.localtime() | |
timestr=str(time.strftime("D:%Y%m%d%H%M%S", t)) | |
ws("%PDF-1.4\n") | |
self._locations[1] = self._fpos | |
ws("1 0 obj\n") | |
ws("<<\n") | |
buf = "".join(("/Creator (", self._appname, " By Anand B Pillai )\n")) | |
ws(buf) | |
buf = "".join(("/CreationDate (", timestr, ")\n")) | |
ws(buf) | |
buf = "".join(("/Producer (", self._appname, "(\\251 Anand B Pillai))\n")) | |
ws(buf) | |
if self._subject: | |
title = self._subject | |
buf = "".join(("/Subject (",self._subject,")\n")) | |
ws(buf) | |
if self._author: | |
buf = "".join(("/Author (",self._author,")\n")) | |
ws(buf) | |
if self._keywords: | |
buf = "".join(("/Keywords (",' '.join(self._keywords),")\n")) | |
ws(buf) | |
if title: | |
buf = "".join(("/Title (", title, ")\n")) | |
ws(buf) | |
ws(">>\n") | |
ws("endobj\n") | |
self._locations[2] = self._fpos | |
ws("2 0 obj\n") | |
ws("<<\n") | |
ws("/Type /Catalog\n") | |
ws("/Pages 3 0 R\n") | |
ws(">>\n") | |
ws("endobj\n") | |
self._locations[4] = self._fpos | |
ws("4 0 obj\n") | |
ws("<<\n") | |
buf = "".join(("/BaseFont ", str(self._font), " /Encoding /WinAnsiEncoding /Name /F1 /Subtype /Type1 /Type /Font >>\n")) | |
ws(buf) | |
if self._IsoEnc: | |
ws(ENCODING_STR) | |
ws(">>\n") | |
ws("endobj\n") | |
self._locations[5] = self._fpos | |
ws("5 0 obj\n") | |
ws("<<\n") | |
ws(" /Font << /F1 4 0 R >>\n") | |
ws(" /ProcSet [ /PDF /Text ]\n") | |
ws(">>\n") | |
ws("endobj\n") | |
def startpage(self): | |
""" Start a page of data """ | |
ws = self.writestr | |
self._pageNo += 1 | |
self._curobj += 1 | |
self._locations.append(self._fpos) | |
self._locations[self._curobj]=self._fpos | |
self._pageObs.append(self._curobj) | |
self._pageObs[self._pageNo] = self._curobj | |
buf = "".join((str(self._curobj), " 0 obj\n")) | |
ws(buf) | |
ws("<<\n") | |
ws("/Type /Page\n") | |
ws("/Parent 3 0 R\n") | |
ws("/Resources 5 0 R\n") | |
self._curobj += 1 | |
buf = "".join(("/Contents ", str(self._curobj), " 0 R\n")) | |
ws(buf) | |
ws(">>\n") | |
ws("endobj\n") | |
self._locations.append(self._fpos) | |
self._locations[self._curobj] = self._fpos | |
buf = "".join((str(self._curobj), " 0 obj\n")) | |
ws(buf) | |
ws("<<\n") | |
buf = "".join(("/Length ", str(self._curobj + 1), " 0 R\n")) | |
ws(buf) | |
ws(">>\n") | |
ws("stream\n") | |
strmPos = self._fpos | |
ws("BT\n"); | |
buf = "".join(("/F1 ", str(self._ptSize), " Tf\n")) | |
ws(buf) | |
buf = "".join(("1 0 0 1 50 ", str(self._pageHt - 40), " Tm\n")) | |
ws(buf) | |
buf = "".join((str(self._vertSpace), " TL\n")) | |
ws(buf) | |
return strmPos | |
def endpage(self, streamStart): | |
"""End a page of data """ | |
ws = self.writestr | |
ws("ET\n") | |
streamEnd = self._fpos | |
ws("endstream\n") | |
ws("endobj\n") | |
self._curobj += 1 | |
self._locations.append(self._fpos) | |
self._locations[self._curobj] = self._fpos | |
buf = "".join((str(self._curobj), " 0 obj\n")) | |
ws(buf) | |
buf = "".join((str(streamEnd - streamStart), '\n')) | |
ws(buf) | |
ws('endobj\n') | |
def writepages(self): | |
"""Write pages as PDF""" | |
ws = self.writestr | |
beginstream=0 | |
lineNo, charNo=0,0 | |
ch, column=0,0 | |
padding,i=0,0 | |
atEOF=0 | |
linebuf = '' | |
while not atEOF: | |
beginstream = self.startpage() | |
column=1 | |
while column <= self._columns: | |
column += 1 | |
atFF=0 | |
atBOP=0 | |
lineNo=0 | |
# Special flag for regexp page break | |
pagebreak = False | |
while lineNo < self._lines and not atFF and not atEOF and not pagebreak: | |
linebuf = '' | |
lineNo += 1 | |
ws("(") | |
charNo=0 | |
while charNo < self._cols: | |
charNo += 1 | |
ch = self._ifs.read(1) | |
cond = ((ch != '\n') and not(ch==FF and self._doFFs) and (ch != '')) | |
if not cond: | |
# See if this dude matches the pagebreak regexp | |
if self._pagebreakre and self._pagebreakre.search(linebuf.strip()): | |
pagebreak = True | |
linebuf = '' | |
break | |
else: | |
linebuf = linebuf + ch | |
if ord(ch) >= 32 and ord(ch) <= 127: | |
if ch == '(' or ch == ')' or ch == '\\': | |
ws("\\") | |
ws(ch) | |
else: | |
if ord(ch) == 9: | |
padding =self._tab - ((charNo - 1) % self._tab) | |
for i in range(padding): | |
ws(" ") | |
charNo += (padding -1) | |
else: | |
if ch != FF: | |
# write \xxx form for dodgy character | |
buf = "".join(('\\', ch)) | |
ws(buf) | |
else: | |
# dont print anything for a FF | |
charNo -= 1 | |
ws(")'\n") | |
if ch == FF: | |
atFF=1 | |
if lineNo == self._lines: | |
atBOP=1 | |
if atBOP: | |
pos=0 | |
ch = self._ifs.read(1) | |
pos= self._ifs.tell() | |
if ch == FF: | |
ch = self._ifs.read(1) | |
pos=self._ifs.tell() | |
# python's EOF signature | |
if ch == '': | |
atEOF=1 | |
else: | |
# push position back by one char | |
self._ifs.seek(pos-1) | |
elif atFF: | |
ch = self._ifs.read(1) | |
pos=self._ifs.tell() | |
if ch == '': | |
atEOF=1 | |
else: | |
self._ifs.seek(pos-1) | |
if column < self._columns: | |
buf = "".join(("1 0 0 1 ", | |
str((self._pageWd/2 + 25)), | |
" ", | |
str(self._pageHt - 40), | |
" Tm\n")) | |
ws(buf) | |
self.endpage(beginstream) | |
def writerest(self): | |
"""Finish the file""" | |
ws = self.writestr | |
self._locations[3] = self._fpos | |
ws("3 0 obj\n") | |
ws("<<\n") | |
ws("/Type /Pages\n") | |
buf = "".join(("/Count ", str(self._pageNo), "\n")) | |
ws(buf) | |
buf = "".join(("/MediaBox [ 0 0 ", str(self._pageWd), " ", str(self._pageHt), " ]\n")) | |
ws(buf) | |
ws("/Kids [ ") | |
for i in range(1, self._pageNo+1): | |
buf = "".join((str(self._pageObs[i]), " 0 R ")) | |
ws(buf) | |
ws("]\n") | |
ws(">>\n") | |
ws("endobj\n") | |
xref = self._fpos | |
ws("xref\n") | |
buf = "".join(("0 ", str((self._curobj) + 1), "\n")) | |
ws(buf) | |
buf = "".join(("0000000000 65535 f ", str(LINE_END))) | |
ws(buf) | |
for i in range(1, self._curobj + 1): | |
val = self._locations[i] | |
buf = "".join((string.zfill(str(val), 10), " 00000 n ", str(LINE_END))) | |
ws(buf) | |
ws("trailer\n") | |
ws("<<\n") | |
buf = "".join(("/Size ", str(self._curobj + 1), "\n")) | |
ws(buf) | |
ws("/Root 2 0 R\n") | |
ws("/Info 1 0 R\n") | |
ws(">>\n") | |
ws("startxref\n") | |
buf = "".join((str(xref), "\n")) | |
ws(buf) | |
ws("%%EOF\n") | |
def main(): | |
pdfclass=PyText2Pdf() | |
pdfclass.parse_args() | |
pdfclass.convert() | |
if __name__ == "__main__": | |
main() | |
## end of http://code.activestate.com/recipes/532908/ }}} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Hi! This is awesome, how can I add page numbers with this script?