Last active
April 26, 2021 16:54
-
-
Save jpmckinney/f3bdbb62620a9974ba1ff254392f6b6d to your computer and use it in GitHub Desktop.
Exports the descriptions of data quality checks from Pelican.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Generates: | |
# https://docs.google.com/document/d/1l5SL1hUR9n8IDYVLSYsU85hdB3L_u2e3iD-txQPc0zU/edit | |
# https://docs.google.com/spreadsheets/d/1WPPOSf7xM9LR3VlTGx7wyzc8q_5xQ3HyMoB8eaEWHDM/edit#gid=0 | |
# Gist: https://gist.github.com/jpmckinney/f3bdbb62620a9974ba1ff254392f6b6d | |
# GitHub: https://github.com/open-contracting/pelican/issues/63 | |
# CRM: https://crm.open-contracting.org/issues/5908 | |
import csv | |
import json | |
import re | |
from html.parser import HTMLParser | |
import polib | |
from jsonpointer import JsonPointerException, resolve_pointer | |
def convert(html):
    """Rewrite <code>...</code> markup as an inline Roboto Mono span.

    Google Docs does not render <code>, so monospace content is expressed
    with an explicit font-family style instead.
    """
    styled = r"""<span style="font-family:'Roboto Mono'">\1</span>"""
    return re.sub(r'<code>(.+?)</code>', styled, html)
# https://github.com/django/django/blob/master/django/utils/html.py
class MLStripper(HTMLParser):
    """HTML parser that keeps only text nodes, discarding all markup.

    Feed it HTML, then call get_data() to retrieve the concatenated text.
    """

    def __init__(self):
        super().__init__()
        self.reset()
        # Fragments of text content, collected in document order.
        self.fed = []

    def handle_data(self, chunk):
        # Called for the text between tags; tags themselves never reach here.
        self.fed.append(chunk)

    def get_data(self):
        # Join all collected fragments into the tag-free text.
        return ''.join(self.fed)
def strip_tags(value):
    """Return *value* with every HTML tag removed, leaving only text content."""
    stripper = MLStripper()
    stripper.feed(value)
    stripper.close()
    return stripper.get_data()
# Convert frontend/src/messages/en.js to JSON using a tool like https://www.convertonline.io/convert/js-to-json
with open('en.json') as f:
    data = json.load(f)
# Table of contents: for each check level, the check categories in the order they
# should appear in the generated HTML and CSV outputs below.
toc = {
    'resourceLevel': [
        'coherent',
        'consistent',
        'reference',
    ],
    'datasetLevel': [
        'distribution',
        'misc',
        'reference',
        'consistent',
    ],
}
# Maps .po msgid key prefixes to the corresponding key paths in en.json.
# Applied in insertion order, so the more specific 'field.exists'/'field.non_empty'
# entries must precede the catch-all 'field.' entry.
replacements = {
    'field.exists': 'fieldDetail.coverage.exists',
    'field.non_empty': 'fieldDetail.coverage.non_empty',
    'field.': 'fieldDetail.quality.',
    'resource.': 'resourceLevel.',
    'dataset.': 'datasetLevel.',
}
# Extra key renames applied only to pointers under 'fieldDetail.'.
field_replacements = {
    '.name': '.count_header',
    '.description': '.count_header_tooltip',
}
# Extra key renames applied only to pointers under 'datasetLevel.'.
dataset_replacements = {
    '.description': '.description_long',
}
# Regex substitutions that flatten block-level HTML into plain-text layout
# (paragraphs and list items become newlines / dashes) before tags are stripped.
html_replacements = {
    r'<p>': '\n',
    r'</p>': '\n',
    r'<ul>': '\n',
    r'</ul>': '',
    r'<li>': '- ',
    r'</li>': '\n',
}
# Fill in the msgstr of each entry in the gettext catalog from the matching
# message in en.json, then save the catalog back in place.
filename = 'backend/dqt/locale/en/LC_MESSAGES/django.po'
po = polib.pofile(filename)
for entry in po:
    # msgid values are dotted keys; rewrite their prefixes to match the key
    # structure of en.json, then convert the dotted path to a JSON Pointer.
    pointer = entry.msgid
    for old, new in replacements.items():
        pointer = pointer.replace(old, new)
    if pointer.startswith('fieldDetail.'):
        for old, new in field_replacements.items():
            pointer = pointer.replace(old, new)
    if pointer.startswith('datasetLevel.'):
        for old, new in dataset_replacements.items():
            pointer = pointer.replace(old, new)
    pointer = '/' + pointer.replace('.', '/')
    try:
        message = resolve_pointer(data, pointer)
        # Flatten block-level HTML to plain-text layout, then drop remaining tags.
        for pattern, replacement in html_replacements.items():
            message = re.sub(pattern, replacement, message)
        entry.msgstr = strip_tags(message).strip()
    except JsonPointerException as e:
        # A msgid with no counterpart in en.json is reported but not fatal.
        print(f'{entry.msgid} -> {pointer}: {e}')
po.save(filename)
# Write an HTML document of every check's name and description, for import into
# the Google Doc linked at the top of this file.
with open('en.html', 'w') as f:
    # [4:] drops a leading prefix from the heading text — presumably a fixed-width
    # numbering or label in en.js; confirm against frontend/src/messages/en.js.
    f.write('<h2>{}</h2>'.format(data['field']['all'][4:]))
    f.write(convert(data['field']['description']))
    # Field-level checks: 'coverage' and 'quality' groups.
    for key in ('coverage', 'quality'):
        obj = data['fieldDetail'][key]
        # NOTE: pop() mutates `data`, removing non-check entries so the loop
        # below iterates over check objects only.
        obj.pop('failureSamplesPrefix')
        f.write('<h3>{}</h3>'.format(obj.pop('label')))
        for check in obj.values():
            f.write('<h4>{name}</h4><p>{description}</p>'.format(
                name=check['count_header'], description=convert(check['count_header_tooltip'])))
    # Resource-level and dataset-level checks, in toc order.
    for level, types in toc.items():
        f.write('<h2>{}</h2>'.format(data[level]['subheadline'][4:]))
        f.write(convert(data[level]['description']))
        for category in types:
            # pop() removes categoryName so the check loop below sees checks only
            # (it also makes the later CSV section's .get('categoryName') return None).
            label = data[level][category].pop('categoryName', None)
            if label:
                f.write('<h3>{}</h3>'.format(label))
            for key, check in data[level][category].items():
                f.write('<h4>{name}</h4><p>{description}</p>'.format(
                    name=check['name'], description=convert(check.get('description_long', check['description']))))
    # Time-based checks: only these three keys are checks; others are headings etc.
    f.write('<h2>{}</h2>'.format(data['timeLevel']['subheadline'][4:]))
    f.write(convert(data['timeLevel']['description']))
    for key in ('ocid', 'phase_stable', 'tender_title'):
        check = data['timeLevel'][key]
        f.write('<h4>{name}</h4><p>{description}</p>'.format(
            name=check['name'], description=convert(check['descriptionLong'])))
# If we wanted to build a whitelist of tags recognized by Google Docs, we can use this file to test what's recognized:
# https://github.com/cbracco/html5-test-page
with open('en.html') as f:
    print("The tags used in en.html which need to be supported by Google Docs are:")
    # Match opening tags only: the pattern excludes closing tags (</...>) and
    # anything starting with '=' at the first character after '<'.
    for tag in sorted(set(re.findall(r'<[^=/][^>]*>', f.read()))):
        print(tag)
# Write the resource-level checks as CSV rows, for import into the Google Sheet
# linked at the top of this file. Columns: level, category/key, name, description.
with open('en.csv', 'w') as f:
    writer = csv.writer(f)
    level = 'resourceLevel'
    writer.writerow(['resource', '', data[level]['subheadline'][4:], data[level]['description']])
    for category in toc[level]:
        # NOTE(review): if the HTML section above already ran, it popped
        # 'categoryName' from this dict, so this .get() returns None and the
        # category row's name column is written empty — confirm this is intended.
        writer.writerow(['resource', category, data[level][category].get('categoryName', None), ''])
        for key, check in data[level][category].items():
            if key == 'categoryName':
                continue
            writer.writerow([
                'resource',
                f'{category}.{key}',
                check['name'],
                check.get('description_long', check['description']),
            ])
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment