Last active
May 22, 2022 09:56
-
-
Save xflr6/888d878abbae5298f25b939774e3382b to your computer and use it in GitHub Desktop.
Drop C1 control characters from Glottolog bibfiles
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"id": "7ac9103f-50a3-4b07-b5ce-27f024588879", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"{'anla.bib',\n", | |
" 'apics.bib',\n", | |
" 'asjp2010.bib',\n", | |
" 'autotyp.bib',\n", | |
" 'bahasa.bib',\n", | |
" 'bibliolux.bib',\n", | |
" 'bowern.bib',\n", | |
" 'eballiso2009.bib',\n", | |
" 'fabreall2009ann.bib',\n", | |
" 'gilbertese.bib',\n", | |
" 'gj.bib',\n", | |
" 'goba.bib',\n", | |
" 'guldemann.bib',\n", | |
" 'haspelmath.bib',\n", | |
" 'hedvig-tirailleur.bib',\n", | |
" 'lapolla-tibeto-burman.bib',\n", | |
" 'lewinmanx.bib',\n", | |
" 'ludger-paschen-germanic.bib',\n", | |
" 'marctang.bib',\n", | |
" 'mpieva.bib',\n", | |
" 'otomanguean.bib',\n", | |
" 'ozbib.bib',\n", | |
" 'phoible.bib',\n", | |
" 'sala.bib',\n", | |
" 'schikowski_chintang.bib',\n", | |
" 'seifart.bib',\n", | |
" 'silpng.bib',\n", | |
" 'sn.bib',\n", | |
" 'stampe.bib',\n", | |
" 'wals.bib',\n", | |
" 'weball.bib',\n", | |
" 'zorcpapers.bib',\n", | |
" 'zurich.bib'}" | |
] | |
}, | |
"execution_count": 1, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"\"\"\"https://en.wikipedia.org/wiki/Latin-1_Supplement_(Unicode_block)\"\"\"\n", | |
"\n", | |
"import collections\n", | |
"import configparser\n", | |
"import pathlib\n", | |
"import re\n", | |
"\n", | |
"# Base directory (expanded from the user's home) that holds references/.\n", | |
"GLOTTOLOG = pathlib.Path('~/Desktop/glottolog').expanduser()\n", | |
"\n", | |
"# Paths derived from the base: the bibfile config and the bibtex directory.\n", | |
"REFERENCES = GLOTTOLOG.joinpath('references')\n", | |
"CONFIG = REFERENCES.joinpath('BIBFILES.ini')\n", | |
"BIBTEX = REFERENCES.joinpath('bibtex')\n", | |
"HH_BIB = BIBTEX.joinpath('hh.bib')\n", | |
"\n", | |
"# File names that are never added to the editable set.\n", | |
"SKIP = {'degruyter.bib', 'sil16.bib'}\n", | |
"\n", | |
"# Default text encoding for the files opened in this notebook.\n", | |
"ENCODING = 'utf-8'\n", | |
"\n", | |
"\n", | |
"class Config(configparser.ConfigParser):\n", | |
"    \"\"\"ConfigParser that can be built directly from a file path.\"\"\"\n", | |
"\n", | |
"    @classmethod\n", | |
"    def from_path(cls, path, *, encoding=ENCODING):\n", | |
"        \"\"\"Return a new instance populated from the INI file at ``path``.\n", | |
"\n", | |
"        ``path`` must provide ``open()`` (e.g. a ``pathlib.Path``);\n", | |
"        the file is decoded with ``encoding``.\n", | |
"        \"\"\"\n", | |
"        parser = cls()\n", | |
"        with path.open(encoding=encoding) as handle:\n", | |
"            parser.read_file(handle)\n", | |
"        return parser\n", | |
"\n", | |
"\n", | |
"# Parse BIBFILES.ini; its section names are compared to bibfile names below.\n", | |
"config = Config.from_path(CONFIG)\n", | |
"\n", | |
"# Keep the sections whose 'curation' value permits enhancing existing items,\n", | |
"# except for the names listed in SKIP.\n", | |
"editable = set(\n", | |
"    name\n", | |
"    for name, options in config.items()\n", | |
"    if 'enhancement of existing items possible' in options['curation'].lower()\n", | |
"    and name not in SKIP\n", | |
")\n", | |
"editable" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"id": "09fe3cd1-02ae-4fb2-868b-1208b0bb20e6", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"Counter()" | |
] | |
}, | |
"execution_count": 2, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"C1_CONTROL_CHAR = re.compile(r'[\\u0080-\\u009f]')\n", | |
"\n", | |
"\n", | |
"def count_control_chars(path, *, verbose=True, encoding=ENCODING):\n", | |
"    \"\"\"Count the characters matched by C1_CONTROL_CHAR in the file at ``path``.\n", | |
"\n", | |
"    Args:\n", | |
"        path: Object providing ``open()`` and ``name`` (e.g. ``pathlib.Path``).\n", | |
"        verbose: If true, print one line per affected input line: the file\n", | |
"            name, the 1-based line number, and the code points found, each\n", | |
"            followed by one ``*`` per additional occurrence on that line.\n", | |
"        encoding: Text encoding used to decode the file.\n", | |
"\n", | |
"    Returns:\n", | |
"        ``collections.Counter`` mapping ``'U+XXXX'`` labels to their number\n", | |
"        of occurrences in the whole file.\n", | |
"    \"\"\"\n", | |
"    result = collections.Counter()\n", | |
"    # Decode with the caller's ``encoding`` (previously the module constant\n", | |
"    # was used here, so the argument had no effect).\n", | |
"    with path.open(encoding=encoding) as f:\n", | |
"        for i, line in enumerate(f, start=1):\n", | |
"            chars = C1_CONTROL_CHAR.findall(line)\n", | |
"            if not chars:\n", | |
"                continue\n", | |
"            codes = collections.Counter(f'U+{ord(c):04X}' for c in chars)\n", | |
"            # Reporting is optional; counting always happens.\n", | |
"            if verbose:\n", | |
"                stats = ', '.join(codepoint + '*' * (count - 1)\n", | |
"                                  for codepoint, count in codes.most_common())\n", | |
"                print(path.name, i, stats)\n", | |
"            result += codes\n", | |
"    return result\n", | |
"\n", | |
"\n", | |
"count_control_chars(HH_BIB)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"id": "b94b529b-e206-4960-988e-47eacdee31cd", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"47" | |
] | |
}, | |
"execution_count": 3, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"bibfiles = sorted(BIBTEX.glob('*.bib'))\n", | |
"len(bibfiles)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"id": "30ba69d2-85e9-421d-ad4b-681d28661cba", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"benjamins.bib 28915 U+0097**, U+0092\n", | |
"benjamins.bib 28954 U+0093, U+0094\n", | |
"benjamins.bib 179283 U+0092\n", | |
"benjamins.bib 179331 U+0092**, U+0091*\n", | |
"benjamins.bib 179707 U+009A\n", | |
"benjamins.bib 182008 U+009A\n", | |
"benjamins.bib 201477 U+0093, U+0092, U+0094\n", | |
"benjamins.bib 201503 U+0092\n", | |
"benjamins.bib 201509 U+0092*, U+0093, U+0094\n", | |
"benjamins.bib 201535 U+0097*\n", | |
"benjamins.bib 211596 U+0092**\n", | |
"benjamins.bib 217734 U+0092\n", | |
"benjamins.bib 217785 U+0092\n", | |
"benjamins.bib 233323 U+009A*\n", | |
"benjamins.bib 233851 U+009A*\n", | |
"benjamins.bib 234367 U+009A\n", | |
"benjamins.bib 234637 U+009A\n", | |
"benjamins.bib 234647 U+009A***********\n", | |
"benjamins.bib 250920 U+0092\n", | |
"benjamins.bib 250930 U+0092**\n", | |
"benjamins.bib 250962 U+0096\n", | |
"benjamins.bib 251058 U+0092*\n", | |
"benjamins.bib 251092 U+0092*\n", | |
"benjamins.bib 251173 U+0092*, U+0091\n", | |
"benjamins.bib 251190 U+0096\n", | |
"benjamins.bib 251263 U+0092\n", | |
"benjamins.bib 251297 U+0092\n", | |
"benjamins.bib 268623 U+009A\n", | |
"benjamins.bib 275729 U+009A\n", | |
"benjamins.bib 310265 U+0092\n", | |
"benjamins.bib 310282 U+0097***, U+0091, U+0092\n", | |
"benjamins.bib 310299 U+0092, U+0093, U+0094\n", | |
"benjamins.bib 310315 U+0093*, U+0094*, U+0092*, U+0097*\n", | |
"benjamins.bib 310331 U+0092**, U+0093, U+0094\n", | |
"benjamins.bib 319306 U+0092\n", | |
"benjamins.bib 319316 U+0093*, U+0094*\n", | |
"benjamins.bib 319348 U+0093, U+0094\n", | |
"benjamins.bib 427487 U+0092\n", | |
"benjamins.bib 427536 U+0093**, U+0094**\n", | |
"benjamins.bib 427553 U+0093, U+0094\n", | |
"benjamins.bib 427570 U+0097*\n", | |
"benjamins.bib 442233 U+008A\n", | |
"benjamins.bib 442329 U+008A\n", | |
"benjamins.bib 469395 U+0097***\n", | |
"benjamins.bib 660000 U+0091, U+0092\n", | |
"benjamins.bib 660010 U+0091**, U+0092**\n", | |
"benjamins.bib 660027 U+0092***, U+0091**\n", | |
"benjamins.bib 660378 U+0092*******, U+0093, U+0094\n", | |
"degruyter.bib 530160 U+0091\n", | |
"degruyter.bib 568722 U+0096\n", | |
"degruyter.bib 568750 U+0091, U+0092\n", | |
"degruyter.bib 569352 U+0092*, U+0091\n", | |
"degruyter.bib 569366 U+0091, U+0092\n", | |
"degruyter.bib 569422 U+0091, U+0092\n", | |
"degruyter.bib 569576 U+0091, U+0092\n", | |
"degruyter.bib 569744 U+0091, U+0092\n", | |
"degruyter.bib 569758 U+0091, U+0092\n", | |
"degruyter.bib 569954 U+0091, U+0092\n", | |
"degruyter.bib 570122 U+0096\n", | |
"degruyter.bib 570752 U+0091, U+0092\n", | |
"degruyter.bib 578214 U+0096\n", | |
"degruyter.bib 578508 U+0096\n", | |
"degruyter.bib 578886 U+0096\n", | |
"degruyter.bib 579880 U+0091, U+0092\n", | |
"degruyter.bib 580650 U+0084, U+0094\n", | |
"degruyter.bib 580804 U+0096\n", | |
"degruyter.bib 581406 U+0092\n", | |
"degruyter.bib 581700 U+0091, U+0092\n", | |
"degruyter.bib 581770 U+0091, U+0092\n", | |
"degruyter.bib 581784 U+0091, U+0092\n", | |
"degruyter.bib 581882 U+0084*, U+0094*\n", | |
"degruyter.bib 582120 U+0091, U+0092\n", | |
"degruyter.bib 582302 U+0093, U+0094\n", | |
"degruyter.bib 582330 U+0084, U+0093\n", | |
"degruyter.bib 582344 U+0092, U+0084, U+0094\n", | |
"degruyter.bib 582414 U+0094, U+0093\n", | |
"degruyter.bib 582470 U+0084, U+0093\n", | |
"degruyter.bib 585229 U+0096\n", | |
"degruyter.bib 586629 U+0092*\n", | |
"degruyter.bib 622167 U+0092*\n", | |
"degruyter.bib 622257 U+0093, U+0094\n", | |
"degruyter.bib 622272 U+0092\n", | |
"degruyter.bib 622407 U+0092, U+0096\n", | |
"degruyter.bib 622422 U+0092\n", | |
"degruyter.bib 697231 U+0092*\n", | |
"degruyter.bib 697246 U+0093****, U+0094****, U+0092\n", | |
"degruyter.bib 697261 U+0092\n", | |
"degruyter.bib 697320 U+0092**\n", | |
"degruyter.bib 697350 U+0092*\n", | |
"degruyter.bib 697365 U+0092\n", | |
"degruyter.bib 713347 U+0096\n", | |
"degruyter.bib 713515 U+0096*\n", | |
"degruyter.bib 713809 U+0096\n", | |
"degruyter.bib 713977 U+0096*\n", | |
"degruyter.bib 715588 U+0096\n", | |
"degruyter.bib 715854 U+0096\n", | |
"degruyter.bib 716063 U+0086\n", | |
"degruyter.bib 716120 U+0084, U+0094\n", | |
"degruyter.bib 716470 U+0084, U+0094\n", | |
"degruyter.bib 716568 U+0084*, U+0094*\n", | |
"degruyter.bib 716582 U+0084, U+0094\n", | |
"degruyter.bib 716596 U+0096\n", | |
"degruyter.bib 716680 U+0096*\n", | |
"degruyter.bib 717660 U+0096\n", | |
"degruyter.bib 804023 U+0084, U+0093\n", | |
"degruyter.bib 804039 U+0084, U+0093, U+0096\n", | |
"degruyter.bib 804055 U+009A*, U+0084, U+0093\n", | |
"degruyter.bib 804104 U+0096\n", | |
"degruyter.bib 804152 U+0084, U+0093\n", | |
"degruyter.bib 804295 U+0084, U+0093\n", | |
"degruyter.bib 804311 U+0096\n", | |
"degruyter.bib 804327 U+0084, U+009E, U+0093\n", | |
"degruyter.bib 804343 U+0096\n", | |
"degruyter.bib 804359 U+0096\n", | |
"degruyter.bib 804375 U+0096\n", | |
"degruyter.bib 804390 U+0084, U+0093, U+0096\n", | |
"degruyter.bib 804447 U+0096*\n", | |
"degruyter.bib 804533 U+0084, U+0093, U+0096\n", | |
"degruyter.bib 804558 U+0096\n", | |
"degruyter.bib 804879 U+0096\n", | |
"degruyter.bib 805198 U+009A*\n", | |
"degruyter.bib 805246 U+0096\n", | |
"degruyter.bib 805321 U+0096\n", | |
"degruyter.bib 805336 U+0084, U+0093, U+0096\n", | |
"degruyter.bib 805361 U+0084**, U+0093*, U+0096, U+0094\n", | |
"degruyter.bib 805461 U+0093, U+0094\n", | |
"degruyter.bib 805485 U+0084, U+0093, U+0096\n", | |
"degruyter.bib 805558 U+0096***, U+009A*\n", | |
"degruyter.bib 805757 U+0096\n", | |
"degruyter.bib 805882 U+0096\n", | |
"degruyter.bib 805912 U+0084, U+0096, U+0093\n", | |
"degruyter.bib 805927 U+0096\n", | |
"degruyter.bib 805952 U+009A, U+0096\n", | |
"degruyter.bib 805984 U+0096\n", | |
"degruyter.bib 806166 U+0096**\n", | |
"degruyter.bib 806198 U+009A\n", | |
"degruyter.bib 806905 U+0096\n", | |
"degruyter.bib 808589 U+0096\n", | |
"degruyter.bib 808841 U+0096\n", | |
"degruyter.bib 810227 U+0096*\n", | |
"degruyter.bib 810997 U+0096\n", | |
"degruyter.bib 811053 U+0084, U+0094\n", | |
"degruyter.bib 845233 U+0093, U+0094\n", | |
"degruyter.bib 845250 U+0096***\n", | |
"degruyter.bib 845335 U+0096*\n", | |
"degruyter.bib 845480 U+0096*\n", | |
"degruyter.bib 845582 U+0093, U+0094\n", | |
"degruyter.bib 845604 U+0096\n", | |
"degruyter.bib 845715 U+0096\n", | |
"degruyter.bib 846030 U+0096\n", | |
"evobib.bib 3918 U+0094*\n", | |
"evobib.bib 5236 U+0092\n", | |
"evobib.bib 6018 U+0092\n", | |
"evobib.bib 7279 U+0094\n", | |
"evobib.bib 8854 U+0092*\n", | |
"evobib.bib 12428 U+0092\n", | |
"evobib.bib 12897 U+0092\n", | |
"evobib.bib 12980 U+0092\n", | |
"evobib.bib 13107 U+0094*\n", | |
"evobib.bib 13879 U+0088*\n", | |
"evobib.bib 16104 U+0094\n", | |
"evobib.bib 19572 U+0092\n", | |
"evobib.bib 20442 U+0094*\n", | |
"evobib.bib 21535 U+0094*\n", | |
"evobib.bib 22127 U+0094\n", | |
"evobib.bib 28276 U+0094\n", | |
"evobib.bib 29672 U+0092\n", | |
"evobib.bib 33284 U+0092\n", | |
"evobib.bib 33656 U+0094**\n", | |
"evobib.bib 34386 U+008C\n", | |
"evobib.bib 34387 U+008C\n", | |
"evobib.bib 34578 U+0092\n", | |
"evobib.bib 35756 U+0094\n", | |
"evobib.bib 37019 U+0094\n", | |
"evobib.bib 37080 U+0094*\n", | |
"evobib.bib 39648 U+0094\n", | |
"evobib.bib 40097 U+0092\n", | |
"evobib.bib 40353 U+0092\n", | |
"evobib.bib 40434 U+0092\n", | |
"ldh.bib 12171 U+0092\n", | |
"ldh.bib 12817 U+0093, U+0094\n", | |
"ldh.bib 13861 U+0096\n", | |
"sil16.bib 75677 U+0080\n", | |
"sil16.bib 85046 U+0080\n", | |
"sil16.bib 124097 U+0090\n", | |
"sil16.bib 129034 U+0080*\n", | |
"sil16.bib 137779 U+0090\n", | |
"sil16.bib 143544 U+0080**\n", | |
"sil16.bib 146284 U+0080*\n" | |
] | |
} | |
], | |
"source": [ | |
"# Add up the per-file counters over all bibfiles; count_control_chars also\n", | |
"# prints one line per affected input line (its default is verbose=True).\n", | |
"total = collections.Counter()\n", | |
"for path in bibfiles:\n", | |
"    total = total + count_control_chars(path)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"id": "749dc209-c154-40ff-a9f1-e9ecacbe8900", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"[('U+0092', 101),\n", | |
" ('U+0096', 70),\n", | |
" ('U+0094', 58),\n", | |
" ('U+0093', 42),\n", | |
" ('U+009A', 30),\n", | |
" ('U+0091', 26),\n", | |
" ('U+0084', 26),\n", | |
" ('U+0097', 17),\n", | |
" ('U+0080', 9),\n", | |
" ('U+008A', 2),\n", | |
" ('U+0088', 2),\n", | |
" ('U+008C', 2),\n", | |
" ('U+0090', 2),\n", | |
" ('U+0086', 1),\n", | |
" ('U+009E', 1)]" | |
] | |
}, | |
"execution_count": 5, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"total.most_common()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"id": "729d15df-e759-4a08-bf61-fe625a6b4778", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"for path in bibfiles:\n", | |
"    # Only rewrite files that were selected as editable from BIBFILES.ini.\n", | |
"    if path.name not in editable:\n", | |
"        continue\n", | |
"\n", | |
"    # Go through bytes instead of read_text()/write_text(): text mode\n", | |
"    # translates newlines, which would change the line endings of the\n", | |
"    # whole file on rewrite. This way only the removed characters differ.\n", | |
"    old = path.read_bytes().decode(ENCODING)\n", | |
"    new, replaced = C1_CONTROL_CHAR.subn('', old)\n", | |
"\n", | |
"    # Leave files without any match untouched on disk.\n", | |
"    if replaced:\n", | |
"        print(f'{path.name} replaced {replaced} characters')\n", | |
"        path.write_bytes(new.encode(ENCODING))" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3 (ipykernel)", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.10.4" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 5 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment