Skip to content

Instantly share code, notes, and snippets.

@FrancoisCapon
Last active April 20, 2025 12:45
Show Gist options
  • Save FrancoisCapon/8db243283cc18be24a9ce6351e478b13 to your computer and use it in GitHub Desktop.
Save FrancoisCapon/8db243283cc18be24a9ce6351e478b13 to your computer and use it in GitHub Desktop.
Extracts embedded fonts from a Microsoft Word Document (.docx)
#!/usr/bin/env python3
# Standard Library
import zipfile, xml.etree.ElementTree, shutil, os, sys, pathlib
# Classes and Associations
# [Docx] (1)---(1) [DocxFonts] (1)---(1..n) [DocxFont] (1)---(0..n) [DocxOdttfFont]
class Docx():
    """A Word document (.docx) opened as a ZIP archive for font extraction.

    Reference:
    https://ecma-international.org/publications-and-standards/standards/ecma-376/
    Part 1 "Fundamentals And Markup Language Reference", 5th edition,
    December 2016, section 11 "WordprocessingML".

    The archive stays open for the lifetime of the object; release it with
    close() or by using the object as a context manager:

        with Docx("report.docx") as docx:
            print(docx.extract_fonts())

    Attributes:
        file_name: path of the document, exactly as given to the constructor.
        zipfile: the open archive the other classes read their parts from.
        yaml_log: text log that grows while the fonts are extracted.
    """

    def __init__(self, file_name):
        """Open `file_name` for reading and start the log.

        Raises whatever zipfile.ZipFile raises when the file is missing or is
        not a ZIP archive.
        """
        self.file_name = file_name
        # The archive is only read, so the plain ZipFile is enough; PyZipFile
        # is a ZipFile subclass whose only addition is packing Python modules.
        self.zipfile = zipfile.ZipFile(file_name)
        self.yaml_log = f"---\nfile_name: {self.file_name}\n"

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()

    def close(self):
        """Close the underlying archive; calling it more than once is harmless."""
        self.zipfile.close()

    def extract_fonts(self):
        """Extract every embedded font and return the accumulated log."""
        DocxFonts(self).extract()
        return self.yaml_log
class DocxFonts():  # [Docx] (1)---(1) [DocxFonts]
    """The font table part (word/fontTable.xml) of a document.

    Attributes:
        docx: the owning document; its `zipfile`, `file_name` and `yaml_log`
            attributes are used here.
        xml_table: parsed root element of the font table.
        xml_fonts: every <w:font> element found below the root.
        extracted_directory: output directory, set by extract().
    """
    # [Content_Types].xml
    #   <?xml version="1.0" encoding="UTF-8" standalone="yes"?>
    #   <Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types">
    #     <Default Extension="odttf" ContentType="application/vnd.openxmlformats-officedocument.obfuscatedFont"/>
    #     ...
    #     <Override PartName="/word/fontTable.xml" ContentType="application/vnd.openxmlformats-officedocument.wordprocessingml.fontTable+xml"/>
    #
    # word/fontTable.xml
    #   <?xml version="1.0" encoding="UTF-8" standalone="yes"?>
    #   <w:fonts xmlns:mc="http://schemas.openxmlformats.org/markup-compatibility/2006"
    #            xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships"
    #            xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main"
    #            ...
    #     <w:font w:name="Aptos">
    zipped_xml_table_name = 'word/fontTable.xml'
    r_namspace_uri = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships'
    w_namspace_uri = 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'
    namespaces = dict(r=r_namspace_uri, w=w_namspace_uri)

    def __init__(self, docx):
        """Read and parse the font table of `docx`."""
        self.docx = docx
        raw_table = docx.zipfile.read(self.zipped_xml_table_name)
        self.xml_table = xml.etree.ElementTree.fromstring(raw_table)
        self.xml_fonts = self.xml_table.findall('.//w:font', DocxFonts.namespaces)

    def extract(self):
        """Recreate "<file_name>.fonts" from scratch and extract each font into it."""
        output_directory = pathlib.Path(f"{self.docx.file_name}.fonts")
        self.extracted_directory = output_directory
        # Start from an empty directory: drop whatever a previous run left.
        shutil.rmtree(output_directory, ignore_errors=True)
        os.makedirs(output_directory)
        self.docx.yaml_log += "fonts:\n"
        for font_element in self.xml_fonts:
            DocxFont(self, font_element).extract()
class DocxFont():  # [DocxFonts] (1)---(1..n) [DocxFont]
    """One <w:font> entry of the font table, with its embedded variants.

    Reference:
    https://ecma-international.org/publications-and-standards/standards/ecma-376/
    Part 1 "Fundamentals And Markup Language Reference", 5th edition,
    December 2016, section 17 "WordprocessingML Reference Material",
    17.8 "Fonts", 17.8.1 "Font Embedding".

    Attributes:
        docx_fonts: the owning font table.
        xml_font: the <w:font> element.
        name: value of the element's w:name attribute.
        xml_odttf_fonts: descendant elements carrying a w:fontKey attribute
            (none for a font that is only referenced, not embedded).
    """
    # A font that is referenced but not embedded:
    #   <w:font w:name="Times New Roman">
    #     <w:panose1 w:val="02020603050405020304"/>
    #     <w:charset w:val="00"/>
    #     <w:family w:val="roman"/>
    #     <w:pitch w:val="variable"/>
    #     <w:sig w:usb0="E0002AFF" w:usb1="C0007843" w:usb2="00000009" w:usb3="00000000" w:csb0="000001FF" w:csb1="00000000"/>
    #   </w:font>
    # A font with two embedded styles:
    #   <w:font w:name="Aptos">
    #     <w:charset w:val="00"/>
    #     <w:family w:val="swiss"/>
    #     <w:pitch w:val="variable"/>
    #     <w:sig w:usb0="20000287" w:usb1="00000003" w:usb2="00000000" w:usb3="00000000" w:csb0="0000019F" w:csb1="00000000"/>
    #     <w:embedRegular r:id="rId1" w:fontKey="{695C1753-A322-488E-9BAB-90BBBA6B2839}"/>
    #     <w:embedItalic r:id="rId2" w:fontKey="{5CB96690-C171-4250-B3CC-DF32ED694182}"/>
    #   </w:font>

    def __init__(self, docx_fonts, xml_font):
        """Read the font name, collect its embedded variants and log the font."""
        self.docx_fonts = docx_fonts
        self.xml_font = xml_font
        # ElementTree stores namespaced attributes under "{namespace-uri}local".
        name_attribute = '{' + DocxFonts.w_namspace_uri + '}name'
        self.name = xml_font.attrib[name_attribute]
        self.xml_odttf_fonts = xml_font.findall('.//*[@w:fontKey]', DocxFonts.namespaces)
        docx_fonts.docx.yaml_log += f" - font: {self.name}\n"

    def extract(self):
        """Extract every embedded variant of this font."""
        for embedded_element in self.xml_odttf_fonts:
            DocxOdttfFont(self, embedded_element).extract()
class DocxOdttfFont():  # [DocxFont] (1)---(0..n) [DocxOdttfFont]
    """One obfuscated embedded font (.odttf) named by a <w:embed...> element.

    Example element and the file it produces:
        <w:embedItalic r:id="rId2" w:fontKey="{5CB96690-C171-4250-B3CC-DF32ED694182}"/>
        AptosItalic.ttf

    Attributes:
        docx_font: the owning font entry.
        font_name: the owning font's name with spaces removed.
        xml_font: the <w:embed...> element.
        style: the element's local name without its "embed" prefix.
        ttf_file_name: output file name, "<font_name><style>.ttf", with path
            separators neutralised.
        relationship_id: full value of the r:id attribute (e.g. "rId2").
        id: r:id without its first three characters; only used for the
            "font<id>.odttf" naming-convention fallback.
        key_hexa: the fontKey GUID as 32 hexadecimal digits.
        ttf_path: path of the written file, set by extract().
    """
    # [Content_Types].xml
    #   <?xml version="1.0" encoding="UTF-8" standalone="yes"?>
    #   <Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types">
    #     <Default Extension="odttf" ContentType="application/vnd.openxmlformats-officedocument.obfuscatedFont"/>
    extension = 'odttf'
    # word/_rels/fontTable.xml.rels
    #   <?xml version="1.0" encoding="UTF-8" standalone="yes"?>
    #   <Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
    #     <Relationship Id="rId8" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/font" Target="fonts/font8.odttf"/>
    zipped_directory = 'word/fonts'
    prefix = 'font'
    zipped_rels_name = 'word/_rels/fontTable.xml.rels'
    rels_namespace_uri = 'http://schemas.openxmlformats.org/package/2006/relationships'
    # Relative relationship targets are resolved against the directory of the
    # part that owns the relationships, i.e. word/fontTable.xml.
    source_part_directory = '/word'
    obfuscated_bytes_count = 32
    key_bytes_count = 16

    def __init__(self, docx_font, xml_odttf_font):
        """Read the style, relationship id and key from `xml_odttf_font`."""
        w_prefix = f"{{{DocxFonts.w_namspace_uri}}}"
        r_prefix = f"{{{DocxFonts.r_namspace_uri}}}"
        self.docx_font = docx_font
        self.font_name = self.docx_font.name.replace(' ', '')
        self.xml_font = xml_odttf_font
        self.style = self.xml_font.tag.replace(f"{w_prefix}embed", '')
        self.ttf_file_name = self._safe_file_name(f"{self.font_name}{self.style}.ttf")
        self.relationship_id = self.xml_font.attrib[f"{r_prefix}id"]
        self.id = self.relationship_id[3:]
        # "{695C1753-A322-...}" -> "695C1753A322..."
        self.key_hexa = self.xml_font.attrib[f"{w_prefix}fontKey"][1:-1].replace('-', '')

    @staticmethod
    def _safe_file_name(file_name):
        """Return `file_name` with path separators and NUL replaced by "_".

        The font name comes from the document, so it must not be able to steer
        the output outside the extraction directory.
        """
        for unsafe in ('/', '\\', '\0'):
            file_name = file_name.replace(unsafe, '_')
        return file_name

    def _resolve_relationship_target(self):
        """Return the archive path the relationship points to, or None.

        None is returned when the relationships part is absent or unparsable,
        when it has no entry for this id, or when the entry has no usable
        internal target.
        """
        import posixpath  # not among the module-level imports of this file

        archive = self.docx_font.docx_fonts.docx.zipfile
        try:
            relationships = xml.etree.ElementTree.fromstring(archive.read(self.zipped_rels_name))
        except (KeyError, xml.etree.ElementTree.ParseError):
            return None
        for relationship in relationships.iter(f"{{{self.rels_namespace_uri}}}Relationship"):
            if relationship.get('Id') != self.relationship_id:
                continue
            target = relationship.get('Target')
            if not target or relationship.get('TargetMode') == 'External':
                return None
            # join() keeps an absolute target ("/word/fonts/x") as is;
            # normpath() folds "." and ".." and cannot climb above the root.
            absolute = posixpath.normpath(posixpath.join(self.source_part_directory, target))
            return absolute.lstrip('/')
        return None

    def _read_odttf(self):
        """Return the obfuscated font bytes stored in the archive.

        The relationship target is tried first; the historical
        "word/fonts/font<id>.odttf" guess is the fallback. Raises KeyError when
        neither member exists.
        """
        archive = self.docx_font.docx_fonts.docx.zipfile
        resolved_path = self._resolve_relationship_target()
        if resolved_path is not None:
            try:
                return archive.read(resolved_path)
            except KeyError:
                pass  # dangling relationship: fall back to the naming convention
        legacy_path = f"{self.zipped_directory}/{self.prefix}{self.id}.{self.extension}"
        return archive.read(legacy_path)

    def extract(self):
        """Write the font into the extraction directory, deobfuscate it, log it."""
        docx_fonts = self.docx_font.docx_fonts
        odttf_data = self._read_odttf()
        self.ttf_path = docx_fonts.extracted_directory / self.ttf_file_name
        self.ttf_path.write_bytes(odttf_data)
        self.deobfuscate()
        docx_fonts.docx.yaml_log += f" ttf: {self.ttf_file_name}\n"

    def deobfuscate(self):
        """Undo the obfuscation of the file at `self.ttf_path`, in place.

        The first 32 bytes of the file are XORed with the 16 key bytes taken in
        reverse order and used twice (bytes 0-15, then bytes 16-31). A file
        shorter than 32 bytes keeps its length.

        Raises:
            ValueError: `key_hexa` is not valid hexadecimal or does not encode
                exactly 16 bytes.
        """
        key = bytes.fromhex(self.key_hexa)[::-1]
        if len(key) != self.key_bytes_count:
            raise ValueError(
                f"fontKey must encode {self.key_bytes_count} bytes, got {len(key)}"
            )
        with self.ttf_path.open('r+b') as ttf_file:  # read/write, binary
            header = ttf_file.read(self.obfuscated_bytes_count)
            clear_header = bytes(
                byte ^ key[index % self.key_bytes_count]
                for index, byte in enumerate(header)
            )
            ttf_file.seek(0)
            ttf_file.write(clear_header)
def main(argv=None):
    """Extract the embedded fonts of every document named on the command line.

    Args:
        argv: program name followed by the .docx paths; defaults to sys.argv.

    With no document given, the usage text is printed and the process exits.
    Each document is handled independently: its log goes to stdout, and a
    failure is reported on stderr (prefixed with the file name) without
    stopping the remaining documents.
    """
    argv = sys.argv if argv is None else argv
    if len(argv) < 2:
        print(f"""
usage: {argv[0]} first.docx ... last.docx
first.docx.fonts/FontOneStyle.ttf
first.docx.fonts/FontTwoStyle.ttf
...
...
last.docx.fonts/FontOneStyle.ttf
""")
        # sys.exit rather than exit(): the latter is injected by the `site`
        # module and is not guaranteed to exist.
        sys.exit()
    # Typical call: <script> *.docx
    for file in argv[1:]:
        docx = None
        try:
            docx = Docx(file)
            print(docx.extract_fonts())
        except Exception as exception:
            # Top-level boundary: report and carry on with the next document.
            print(f"{file}: {exception}", file=sys.stderr)
        finally:
            if docx is not None:
                docx.zipfile.close()


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment