Created
August 6, 2021 13:54
-
-
Save rpresser/e7d22b0e7a3e84927fdcd46dec0ee068 to your computer and use it in GitHub Desktop.
Part of the XML output of pdftoxml run against the NYT crossword PDF downloaded on 2021-08-06
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?xml version="1.0" encoding="utf-8"?>
<!-- This is an excerpt from the output of pdftoxml run against the NYT crossword PDF
     downloaded on 2021-08-06. Elided regions are marked with "snip" comments. -->
<!-- The XML header. Take the page width from the PAGE element; it can be used to
     separate clues from grid boxes.
     Note that all PDF coordinates are in points (72 points to the inch). -->
<DOCUMENT>
  <METADATA>
    <PDFFILENAME>Aug0621.pdf</PDFFILENAME>
    <PROCESS name="pdftoxml" cmd="">
      <VERSION value="2.0">
        <COMMENT />
      </VERSION>
      <CREATIONDATE>Fri Aug 6 09:23:54 2021</CREATIONDATE>
    </PROCESS>
  </METADATA>
  <PAGE width="612" height="792" number="1" id="p1">
    <MEDIABOX x1="0" y1="0" x2="612" y2="792" />
    <CROPBOX x1="0" y1="0" x2="612" y2="792" />
    <BLEEDBOX x1="0" y1="0" x2="612" y2="792" />
    <ARTBOX x1="0" y1="0" x2="612" y2="792" />
    <TRIMBOX x1="0" y1="0" x2="612" y2="792" />
    <!-- snip many lines -->
    <!-- This is clue #1. The first token has the text "1" and is in font "nytfranklin".
         The remaining tokens, continuing across the following TEXT elements, are the
         words of the clue in font "arialunicodems". -->
    <TEXT width="71.038" height="9.64818" id="p1_t59" x="30.1301" y="65.7503">
      <TOKEN sid="p1_s998" id="p1_w69" font-name="nytfranklin" bold="yes" italic="no" font-size="9.9" font-color="#000000" rotation="0" angle="0" x="30.1301" y="66.6511" base="73.6702" width="5.643" height="7.1676">1</TOKEN>
      <TOKEN sid="p1_s999" id="p1_w70" font-name="arialunicodems" bold="no" italic="no" font-size="11" font-color="#000000" rotation="0" angle="0" x="40.0301" y="65.7503" base="74.0003" width="22" height="10.549">Fruit</TOKEN>
      <TOKEN sid="p1_s1000" id="p1_w71" font-name="arialunicodems" bold="no" italic="no" font-size="11" font-color="#000000" rotation="0" angle="0" x="65.0881" y="65.7503" base="74.0003" width="23.848" height="10.549">used</TOKEN>
      <TOKEN sid="p1_s1001" id="p1_w72" font-name="arialunicodems" bold="no" italic="no" font-size="11" font-color="#000000" rotation="0" angle="0" x="91.9941" y="65.7503" base="74.0003" width="9.174" height="10.549">to</TOKEN>
    </TEXT>
    <TEXT width="81.312" height="10.549" id="p1_t60" x="40.0301" y="79.5804">
      <TOKEN sid="p1_s1002" id="p1_w73" font-name="arialunicodems" bold="no" italic="no" font-size="11" font-color="#000000" rotation="0" angle="0" x="40.0301" y="79.5804" base="87.8304" width="26.895" height="10.549">flavor</TOKEN>
      <TOKEN sid="p1_s1003" id="p1_w74" font-name="arialunicodems" bold="no" italic="no" font-size="11" font-color="#000000" rotation="0" angle="0" x="69.9831" y="79.5804" base="87.8304" width="15.29" height="10.549">the</TOKEN>
      <TOKEN sid="p1_s1004" id="p1_w75" font-name="arialunicodems" bold="no" italic="no" font-size="11" font-color="#000000" rotation="0" angle="0" x="88.3311" y="79.5804" base="87.8304" width="33.011" height="10.549">liqueur</TOKEN>
    </TEXT>
    <TEXT width="42.801" height="10.549" id="p1_t61" x="40.0301" y="93.4105">
      <TOKEN sid="p1_s1005" id="p1_w76" font-name="arialunicodems" bold="no" italic="no" font-size="11" font-color="#000000" rotation="0" angle="0" x="40.0301" y="93.4105" base="101.66" width="42.801" height="10.549">patxaran</TOKEN>
    </TEXT>
    <!-- snip many more lines -->
    <!-- This is box #1. The text content is a plain number and the font is "arialunicodems".
         To confirm, check that the x location is greater than 1/3 of the page width.
         With this page's width of 612 that threshold is 204: this box sits at x=264.89,
         whereas clue #1 above starts at x=30.1301. -->
    <TEXT width="4.27008" height="7.36512" id="p1_t3" x="264.89" y="50.83">
      <TOKEN sid="p1_s20" id="p1_w13" font-name="arialunicodems" bold="no" italic="no" font-size="7.68" font-color="#000000" rotation="0" angle="0" x="264.89" y="50.83" base="56.59" width="4.27008" height="7.36512">1</TOKEN>
    </TEXT>
    <!-- snip many more lines -->
    <!-- This include points to a "vectorial image" in XML format, which is very similar to SVG.
         See https://sourceforge.net/p/pdf2xml/discussion/714681/thread/dd35f74f/
         It needs to be parsed to get the filled and non-numbered empty squares in the grid. -->
    <xi:include xmlns:xi="http://www.w3.org/2001/XInclude" href="Aug0621.xml_data/image-1.vec" />
    <!-- the XML footer -->
  </PAGE>
</DOCUMENT>
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment