Created
July 7, 2020 13:11
-
-
Save nikopartanen/f4bc1a093e777e44ca89306eea83d661 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re
from pathlib import Path

# xml.etree.cElementTree was removed in Python 3.9; fall back to the regular
# ElementTree module when the old alias is not available.
try:
    import xml.etree.cElementTree as ET
except ImportError:
    import xml.etree.ElementTree as ET
# Superscript and subscript spans are recorded in elements such as:
#   textStyle {offset:13; length:1;superscript:true;}
# From these we need to extract where each span starts and how long it is.
def get_offset_info(offsets):
    """Parse super/subscript ``textStyle`` entries into position records.

    Args:
        offsets: Iterable of style strings, each containing ``offset:<int>``,
            ``length:<int>`` and either ``superscript`` or ``subscript``.

    Returns:
        A list of dicts ordered by ascending start position, each with:
        ``type`` (``'superscript'`` or ``'subscript'``), ``start`` (offset
        shifted right by two characters for every earlier span, because each
        earlier span gains an opening and a closing marker), ``length`` (the
        original span length) and ``scriptlength`` (``length + 2``, i.e. the
        span including its two markers).

        Entries that are not super/subscript styles, or that lack an offset
        or a length, are skipped instead of raising ``IndexError``.
    """
    offset_dicts = []
    for offset in offsets:
        kind = re.search(r'(?:super|sub)script', offset)
        start = re.search(r'(?<=offset:)\d+', offset)
        length = re.search(r'(?<=length:)\d+', offset)
        if not (kind and start and length):
            # Other text styles (bold, italic, ...) carry no script marker.
            continue
        offset_dicts.append({
            'type': kind.group(0),
            'start': int(start.group(0)),
            'length': int(length.group(0)),
        })

    # The shift below counts how many spans precede the current one, which is
    # only meaningful when the spans are ordered by their start position.
    offset_dicts.sort(key=lambda info: info['start'])

    for position, info in enumerate(offset_dicts):
        info['start'] += 2 * position
        info['scriptlength'] = info['length'] + 2
    return offset_dicts
# Next, the corresponding information has to be added to the text line itself.
# Adjusting the offsets is fiddly because every inserted marker makes the
# string longer, and that growth has to be taken into account.
def return_marked_string(string, offsets):
    """Wrap super/subscript spans of *string* in marker characters.

    Subscript spans are surrounded by ``/`` and superscript spans by ``\\``.
    Span positions come from ``get_offset_info(offsets)``.

    Args:
        string: The text of the line.
        offsets: Style strings describing the spans; passed unchanged to
            ``get_offset_info``.

    Returns:
        A ``(marked_string, style_string)`` tuple: the text with markers
        inserted, and the space-joined ``textStyle {...}`` entries built from
        each span's ``start``, ``scriptlength`` and ``type``.
    """
    spans = get_offset_info(offsets)

    for span in spans:
        kind = span['type']
        if kind == 'superscript':
            marker = '\\'
        elif kind == 'subscript':
            marker = '/'
        begin = span['start']
        end = begin + span['length']
        string = ''.join((string[:begin], marker, string[begin:end], marker, string[end:]))

    fixed_string = ' '.join(
        f"textStyle {{offset:{span['start']}; length:{span['scriptlength']};{span['type']}:true;}}"
        for span in spans
    )
    return (string, fixed_string)
# Processes the PAGE XML file using return_marked_string.
def convert_subscript_to_text(page_xml, target_file):
    """Mark super/subscripts inline in a PAGE XML file and write the result.

    Every ``Word`` child is removed from each ``TextLine``. For lines whose
    ``custom`` attribute contains super/subscript ``textStyle`` entries, the
    line text is rewritten with marker characters (see
    ``return_marked_string``) and those ``textStyle`` entries are replaced by
    the adjusted ones.

    Args:
        page_xml: Path (or file object) of the PAGE XML document to read.
        target_file: Path (or file object) the modified document is written
            to. When it is a path, missing parent directories are created.

    Lines without a ``custom`` attribute, without script styles, or without a
    ``TextEquiv/Unicode`` text are left untouched instead of raising.
    """
    page_ns = 'http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15'
    tag = '{' + page_ns + '}'
    # Matches only textStyle entries that flag a super- or subscript; the
    # captured group is the "{...}" part handed to return_marked_string.
    script_style = re.compile(r'textStyle (\{[^{}]*(?:super|sub)script[^{}]*\})')

    tree = ET.parse(page_xml)
    root = tree.getroot()

    for line in root.findall(f'.//{tag}TextLine'):
        # Only direct children: Element.remove() raises ValueError otherwise.
        for word in line.findall(f'{tag}Word'):
            line.remove(word)

        custom = line.get('custom') or ''
        styles = script_style.findall(custom)
        if not styles:
            continue

        textnode = line.find(f'.//{tag}TextEquiv/{tag}Unicode')
        if textnode is None or textnode.text is None:
            continue

        new_string, new_style = return_marked_string(textnode.text, styles)

        # Swap each script style for its adjusted counterpart one-to-one so
        # that any other content of the attribute is preserved. A callable
        # replacement also avoids escape processing of the new text.
        # NOTE(review): offsets of other tags on the line are not shifted to
        # account for the inserted markers.
        replacements = iter(re.findall(r'textStyle \{.*?\}', new_style))
        new_custom = script_style.sub(
            lambda match: next(replacements, match.group(0)), custom)

        line.set('custom', new_custom)
        textnode.text = new_string

    if isinstance(target_file, (str, Path)):
        Path(target_file).parent.mkdir(parents=True, exist_ok=True)

    # Register the PAGE namespace as default so no "ns0:" prefixes are written.
    ET.register_namespace('', page_ns)
    tree.write(target_file, xml_declaration=True, encoding='utf-8', method='xml')
if __name__ == '__main__':
    # Run the conversion only when executed as a script, so importing this
    # module for its functions does not trigger file I/O.
    # NOTE(review): the input path is specific to the original author's
    # machine; adjust both paths before running.
    convert_subscript_to_text(
        page_xml='/Users/niko/Downloads/export_job_1079370/386199/Worterbuch_1937/page/Worterbuch_1937-004_2R.xml',
        target_file='test/Worterbuch_1937-004_2R.xml',
    )
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment