Last active
April 20, 2025 12:45
-
-
Save FrancoisCapon/8db243283cc18be24a9ce6351e478b13 to your computer and use it in GitHub Desktop.
Extracts embedded fonts from a Microsoft Word Document (.docx)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3
# Standard Library
import os
import pathlib
import posixpath
import shutil
import sys
import xml.etree.ElementTree
import zipfile
# Classes and Associations
# [Docx] (1)---(1) [DocxFonts] (1)---(1..n) [DocxFont] (1)---(0..n) [DocxOdttfFont]
class Docx():
    """A Microsoft Word document (.docx) opened as a ZIP archive.

    Attributes:
        file_name: path of the document, exactly as given by the caller.
        zipfile: the open ``zipfile.ZipFile`` used to read the package parts.
        yaml_log: YAML-style text log; starts with the document name and is
            appended to while fonts are extracted.

    The archive stays open until ``close()`` is called, so the object can be
    used as a context manager: ``with Docx(path) as docx: ...``.
    """
    # https://ecma-international.org/publications-and-standards/standards/ecma-376/
    # Part 1 “Fundamentals And Markup Language Reference”, 5th edition, December 2016
    # 11. WordprocessingML

    def __init__(self, file_name):
        """Open ``file_name`` for reading and start the log.

        Raises whatever ``zipfile.ZipFile`` raises for a missing or invalid
        archive (e.g. ``FileNotFoundError``, ``zipfile.BadZipFile``).
        """
        self.file_name = file_name
        # Plain ZipFile is enough: PyZipFile only adds writepy(), which is
        # meant for building archives of Python modules.
        self.zipfile = zipfile.ZipFile(file_name)
        self.yaml_log = f"---\nfile_name: {self.file_name}\n"

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()

    def close(self):
        """Release the underlying archive file handle (safe to call twice)."""
        self.zipfile.close()

    def extract_fonts(self):
        """Extract every embedded font and return the accumulated log text."""
        DocxFonts(self).extract()
        return self.yaml_log
class DocxFonts():  # [Docx] (1)---(1) [DocxFonts]
    """The font table (``word/fontTable.xml``) of a document.

    Attributes:
        docx: the owning document object (provides ``zipfile``, ``file_name``
            and ``yaml_log``).
        xml_table: parsed root element of the font table.
        xml_fonts: every ``w:font`` element found below the root.
        extracted_directory: output directory; only set by ``extract()``.
    """
    # [Content_Types].xml
    # <?xml version="1.0" encoding="UTF-8" standalone="yes"?>
    # <Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types">
    # <Default Extension="odttf" ContentType="application/vnd.openxmlformats-officedocument.obfuscatedFont"/>
    # ...
    # <Override PartName="/word/fontTable.xml" ContentType="application/vnd.openxmlformats-officedocument.wordprocessingml.fontTable+xml"/>
    #
    # word/fontTable.xml
    # <?xml version="1.0" encoding="UTF-8" standalone="yes"?>
    # <w:fonts xmlns:mc="http://schemas.openxmlformats.org/markup-compatibility/2006"
    # xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships"
    # xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main"
    # ...
    # <w:font w:name="Aptos">
    zipped_xml_table_name = 'word/fontTable.xml'
    # NOTE: "namspace" is misspelled, but other classes read these names.
    r_namspace_uri = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships'
    w_namspace_uri = 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'
    namespaces = dict(r=r_namspace_uri, w=w_namspace_uri)

    def __init__(self, docx):
        """Read and parse the font table of ``docx`` and list its fonts."""
        self.docx = docx
        raw_table = docx.zipfile.read(self.zipped_xml_table_name)
        self.xml_table = xml.etree.ElementTree.fromstring(raw_table)
        self.xml_fonts = self.xml_table.findall('.//w:font', DocxFonts.namespaces)

    def extract(self):
        """Recreate ``<file_name>.fonts`` empty, then extract each font into it."""
        output_directory = pathlib.Path(f"{self.docx.file_name}.fonts")
        self.extracted_directory = output_directory
        # Start from a clean directory so fonts of a previous run do not linger.
        shutil.rmtree(output_directory, ignore_errors=True)
        output_directory.mkdir(parents=True)
        self.docx.yaml_log += "fonts:\n"
        for font_element in self.xml_fonts:
            DocxFont(self, font_element).extract()
class DocxFont():  # [DocxFonts] (1)---(1..n) [DocxFont]
    """One ``w:font`` entry of the font table and its embedded font files.

    Attributes:
        docx_fonts: the owning ``DocxFonts`` object.
        xml_font: the ``w:font`` element.
        name: value of the element's ``w:name`` attribute.
        xml_odttf_fonts: descendant elements carrying a ``w:fontKey``
            attribute (``w:embedRegular`` and ``w:embedItalic`` in the sample
            below); empty when the font is not embedded.
    """
    # https://ecma-international.org/publications-and-standards/standards/ecma-376/
    # Part 1 “Fundamentals And Markup Language Reference”, 5th edition, December 2016
    # 17. WordprocessingML Reference Material
    # 17.8 Fonts
    # 17.8.1 Font Embedding
    # <w:font w:name="Times New Roman">
    # <w:panose1 w:val="02020603050405020304"/>
    # <w:charset w:val="00"/>
    # <w:family w:val="roman"/>
    # <w:pitch w:val="variable"/>
    # <w:sig w:usb0="E0002AFF" w:usb1="C0007843" w:usb2="00000009" w:usb3="00000000" w:csb0="000001FF" w:csb1="00000000"/>
    # </w:font>
    # <w:font w:name="Aptos">
    # <w:charset w:val="00"/>
    # <w:family w:val="swiss"/>
    # <w:pitch w:val="variable"/>
    # <w:sig w:usb0="20000287" w:usb1="00000003" w:usb2="00000000" w:usb3="00000000" w:csb0="0000019F" w:csb1="00000000"/>
    # <w:embedRegular r:id="rId1" w:fontKey="{695C1753-A322-488E-9BAB-90BBBA6B2839}"/>
    # <w:embedItalic r:id="rId2" w:fontKey="{5CB96690-C171-4250-B3CC-DF32ED694182}"/>
    # </w:font>

    def __init__(self, docx_fonts, xml_font):
        """Read the font name, list its embedded files and log the font."""
        self.docx_fonts = docx_fonts
        self.xml_font = xml_font
        # ElementTree stores namespaced attributes under "{uri}local" keys.
        name_key = "{" + DocxFonts.w_namspace_uri + "}name"
        self.name = xml_font.attrib[name_key]
        self.xml_odttf_fonts = xml_font.findall('.//*[@w:fontKey]', DocxFonts.namespaces)
        docx_fonts.docx.yaml_log += f" - font: {self.name}\n"

    def extract(self):
        """Extract every embedded file of this font (possibly none)."""
        for embedded_element in self.xml_odttf_fonts:
            odttf_font = DocxOdttfFont(self, embedded_element)
            odttf_font.extract()
class DocxOdttfFont():  # [DocxFont] (1)---(0..n) [DocxOdttfFont]
    """One obfuscated font file (.odttf) embedded for one style of a font.

    Attributes:
        docx_font: the owning ``DocxFont``.
        font_name: font name without spaces and without path separators.
        xml_font: the ``w:embed<Style>`` element.
        style: the element's local name without the ``embed`` prefix.
        ttf_file_name: output file name, ``<font_name><style>.ttf``.
        relationship_id: the element's full ``r:id`` value (e.g. ``rId2``).
        id: ``relationship_id`` without its first three characters; only used
            for the fallback path ``word/fonts/font<id>.odttf``.
        key_hexa: the ``w:fontKey`` GUID as 32 hexadecimal digits.
        ttf_path: path of the extracted file; only set by ``extract()``.
    """
    # [Content_Types].xml
    # <?xml version="1.0" encoding="UTF-8" standalone="yes"?>
    # <Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types">
    # <Default Extension="odttf" ContentType="application/vnd.openxmlformats-officedocument.obfuscatedFont"/>
    extension = 'odttf'
    # word/_rels/fontTable.xml.rels
    # <?xml version="1.0" encoding="UTF-8" standalone="yes"?>
    # <Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
    # <Relationship Id="rId8" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/font" Target="fonts/font8.odttf"/>
    zipped_rels_name = 'word/_rels/fontTable.xml.rels'
    relationship_tag = '{http://schemas.openxmlformats.org/package/2006/relationships}Relationship'
    # Relative relationship targets are resolved against the directory of
    # the part that owns the .rels file (word/fontTable.xml).
    zipped_source_directory = 'word'
    zipped_directory = 'word/fonts'
    prefix = 'font'
    obfuscated_bytes_count = 32
    # The font name comes from the document and ends up in a file name:
    # neutralise characters that could make the output leave its directory.
    unsafe_file_name_characters = str.maketrans({'/': '_', '\\': '_', ':': '_', '\0': '_'})
    # <w:embedItalic r:id="rId2" w:fontKey="{5CB96690-C171-4250-B3CC-DF32ED694182}"/>
    # AptosItalic.ttf

    def __init__(self, docx_font, xml_odttf_font):
        """Derive the output name, relationship id and key from the element.

        Raises ``KeyError`` when the element lacks ``r:id`` or ``w:fontKey``.
        """
        self.docx_font = docx_font
        self.font_name = self.docx_font.name.replace(' ', '').translate(self.unsafe_file_name_characters)
        self.xml_font = xml_odttf_font
        self.style = self.xml_font.tag.replace(f"{{{DocxFonts.w_namspace_uri}}}embed", '')
        self.ttf_file_name = f"{self.font_name}{self.style}.ttf"
        self.relationship_id = self.xml_font.attrib[f"{{{DocxFonts.r_namspace_uri}}}id"]
        self.id = self.relationship_id[3:]
        # "{695C1753-A322-...}" -> "695C1753A322..."
        self.key_hexa = self.xml_font.attrib[f"{{{DocxFonts.w_namspace_uri}}}fontKey"][1:-1].replace('-', '')

    def extract(self):
        """Copy the font out of the archive, de-obfuscate it and log it."""
        docx_fonts = self.docx_font.docx_fonts
        odttf_data = docx_fonts.docx.zipfile.read(self._resolve_zipped_odttf_path())
        self.ttf_path = docx_fonts.extracted_directory / self.ttf_file_name
        self.ttf_path.write_bytes(odttf_data)
        self.deobfuscate()
        docx_fonts.docx.yaml_log += f" ttf: {self.ttf_file_name}\n"

    def _resolve_zipped_odttf_path(self):
        """Return the archive member name holding this font.

        The relationship id is looked up in ``word/_rels/fontTable.xml.rels``
        because nothing ties the digits of ``rIdN`` to the file ``fontN.odttf``.
        When the relationships part or the id is missing, fall back to that
        naming convention.
        """
        archive = self.docx_font.docx_fonts.docx.zipfile
        try:
            rels_xml = archive.read(self.zipped_rels_name)
        except KeyError:  # no relationships part in the archive
            rels_xml = None
        if rels_xml is not None:
            relationships = xml.etree.ElementTree.fromstring(rels_xml)
            for relationship in relationships.iter(self.relationship_tag):
                target = relationship.get('Target')
                if relationship.get('Id') != self.relationship_id or not target:
                    continue
                if target.startswith('/'):
                    # Absolute target: already relative to the package root.
                    return posixpath.normpath(target.lstrip('/'))
                return posixpath.normpath(posixpath.join(self.zipped_source_directory, target))
        return f"{self.zipped_directory}/{self.prefix}{self.id}.{self.extension}"

    def deobfuscate(self):
        """XOR the first 32 bytes of ``self.ttf_path`` with the font key, in place.

        Byte ``i`` of the file is XORed with byte ``15 - i % 16`` of the key,
        i.e. with the key bytes in reverse order, applied twice. Files shorter
        than 32 bytes keep their length.

        Raises ``ValueError`` when the key is not 16 bytes of hexadecimal.
        """
        key_bytes = bytes.fromhex(self.key_hexa)
        if len(key_bytes) != 16:
            raise ValueError(f"fontKey must be a 128-bit GUID, got {self.key_hexa!r}")
        mask = key_bytes[::-1] * (self.obfuscated_bytes_count // len(key_bytes))
        with self.ttf_path.open('r+b') as ttf_file:  # rw binary
            header = ttf_file.read(self.obfuscated_bytes_count)
            ttf_file.seek(0)
            ttf_file.write(bytes(byte ^ key for byte, key in zip(header, mask)))
def main(argv=None):
    """Extract the embedded fonts of every document named on the command line.

    Args:
        argv: program name followed by the .docx paths; defaults to
            ``sys.argv``.

    With no document given, prints the usage text and exits. Otherwise each
    document's log is printed to standard output. A failing document is
    reported on standard error, prefixed with its name, and does not stop the
    remaining ones.
    """
    argv = sys.argv if argv is None else argv
    if len(argv) < 2:
        usage_lines = (
            "",
            f"usage: {argv[0]} first.docx ... last.docx",
            "first.docx.fonts/FontOneStyle.ttf",
            "first.docx.fonts/FontTwoStyle.ttf",
            "...",
            "...",
            "last.docx.fonts/FontOneStyle.ttf",
            "",
        )
        print("\n".join(usage_lines))
        # sys.exit rather than the interactive-shell helper exit(), which
        # the site module may not install.
        sys.exit()
    # doxc2ttf.py *.docx
    for file_name in argv[1:]:
        docx = None
        try:
            docx = Docx(file_name)
            print(docx.extract_fonts())
        except Exception as exception:
            # Top-level boundary: report and carry on with the next document.
            # Errors go to stderr so they do not corrupt the log on stdout.
            print(f"{file_name}: {exception}", file=sys.stderr)
        finally:
            if docx is not None:
                docx.zipfile.close()


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment