Skip to content

Instantly share code, notes, and snippets.

@xflr6
Last active May 22, 2022 09:56
Show Gist options
  • Save xflr6/888d878abbae5298f25b939774e3382b to your computer and use it in GitHub Desktop.
Save xflr6/888d878abbae5298f25b939774e3382b to your computer and use it in GitHub Desktop.
Drop Glottolog bibfiles for control characters
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "7ac9103f-50a3-4b07-b5ce-27f024588879",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'anla.bib',\n",
" 'apics.bib',\n",
" 'asjp2010.bib',\n",
" 'autotyp.bib',\n",
" 'bahasa.bib',\n",
" 'bibliolux.bib',\n",
" 'bowern.bib',\n",
" 'eballiso2009.bib',\n",
" 'fabreall2009ann.bib',\n",
" 'gilbertese.bib',\n",
" 'gj.bib',\n",
" 'goba.bib',\n",
" 'guldemann.bib',\n",
" 'haspelmath.bib',\n",
" 'hedvig-tirailleur.bib',\n",
" 'lapolla-tibeto-burman.bib',\n",
" 'lewinmanx.bib',\n",
" 'ludger-paschen-germanic.bib',\n",
" 'marctang.bib',\n",
" 'mpieva.bib',\n",
" 'otomanguean.bib',\n",
" 'ozbib.bib',\n",
" 'phoible.bib',\n",
" 'sala.bib',\n",
" 'schikowski_chintang.bib',\n",
" 'seifart.bib',\n",
" 'silpng.bib',\n",
" 'sn.bib',\n",
" 'stampe.bib',\n",
" 'wals.bib',\n",
" 'weball.bib',\n",
" 'zorcpapers.bib',\n",
" 'zurich.bib'}"
]
},
"execution_count": 1,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"\"\"\"https://en.wikipedia.org/wiki/Latin-1_Supplement_(Unicode_block)\"\"\"\n",
"\n",
"import collections\n",
"import configparser\n",
"import pathlib\n",
"import re\n",
"\n",
"GLOTTOLOG = pathlib.Path('~/Desktop/glottolog').expanduser()\n",
"\n",
"REFERENCES = GLOTTOLOG / 'references'\n",
"\n",
"CONFIG = REFERENCES / 'BIBFILES.ini'\n",
"\n",
"BIBTEX = REFERENCES / 'bibtex'\n",
"\n",
"HH_BIB = BIBTEX / 'hh.bib'\n",
"\n",
"SKIP = {'degruyter.bib',\n",
" 'sil16.bib'}\n",
"\n",
"ENCODING = 'utf-8'\n",
"\n",
"\n",
"class Config(configparser.ConfigParser):\n",
"    \"\"\"ConfigParser with an alternate constructor that loads a file by path.\"\"\"\n",
"\n",
"    @classmethod\n",
"    def from_path(cls, path, *, encoding=ENCODING):\n",
"        \"\"\"Return a new instance populated from the INI file at path.\n",
"\n",
"        path must provide .open() (e.g. a pathlib.Path); the file is decoded\n",
"        with the given encoding.\n",
"        \"\"\"\n",
"        inst = cls()\n",
"        with path.open(encoding=encoding) as f:\n",
"            inst.read_file(f)\n",
"        return inst\n",
"\n",
"\n",
"config = Config.from_path(CONFIG)\n",
"\n",
"# Sections of BIBFILES.ini whose curation note allows enhancing existing\n",
"# items, excluding the names listed in SKIP.\n",
"editable = {\n",
"    name\n",
"    for name, entry in config.items()\n",
"    if 'enhancement of existing items possible' in entry['curation'].lower()\n",
"    and name not in SKIP\n",
"}\n",
"editable"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "09fe3cd1-02ae-4fb2-868b-1208b0bb20e6",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Counter()"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"C1_CONTROL_CHAR = re.compile(r'[\\u0080-\\u009f]')\n",
"\n",
"\n",
"def count_control_chars(path, *, verbose=True, encoding=ENCODING):\n",
"    \"\"\"Return a Counter of the C1 control characters found in the file at path.\n",
"\n",
"    Keys are code points formatted as 'U+XXXX', values are occurrence counts.\n",
"    With verbose=True, each line containing matches is printed as: file name,\n",
"    1-based line number, and the code points found (most frequent first, each\n",
"    followed by one '*' per additional occurrence on that line).\n",
"    The file is decoded with the given encoding.\n",
"    \"\"\"\n",
"    result = collections.Counter()\n",
"    # Use the caller-supplied encoding rather than the module-level default.\n",
"    with path.open(encoding=encoding) as f:\n",
"        for i, line in enumerate(f, start=1):\n",
"            chars = C1_CONTROL_CHAR.findall(line)\n",
"            if not chars:\n",
"                continue\n",
"            codes = collections.Counter(f'U+{ord(c):04X}' for c in chars)\n",
"            if verbose:\n",
"                # '*' * 0 is the empty string, so single hits get no marker.\n",
"                stats = ', '.join(codepoint + '*' * (count - 1)\n",
"                                  for codepoint, count in codes.most_common())\n",
"                print(path.name, i, stats)\n",
"            result += codes\n",
"    return result\n",
"\n",
"\n",
"count_control_chars(HH_BIB)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "b94b529b-e206-4960-988e-47eacdee31cd",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"47"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"bibfiles = sorted(BIBTEX.glob('*.bib'))\n",
"len(bibfiles)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "30ba69d2-85e9-421d-ad4b-681d28661cba",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"benjamins.bib 28915 U+0097**, U+0092\n",
"benjamins.bib 28954 U+0093, U+0094\n",
"benjamins.bib 179283 U+0092\n",
"benjamins.bib 179331 U+0092**, U+0091*\n",
"benjamins.bib 179707 U+009A\n",
"benjamins.bib 182008 U+009A\n",
"benjamins.bib 201477 U+0093, U+0092, U+0094\n",
"benjamins.bib 201503 U+0092\n",
"benjamins.bib 201509 U+0092*, U+0093, U+0094\n",
"benjamins.bib 201535 U+0097*\n",
"benjamins.bib 211596 U+0092**\n",
"benjamins.bib 217734 U+0092\n",
"benjamins.bib 217785 U+0092\n",
"benjamins.bib 233323 U+009A*\n",
"benjamins.bib 233851 U+009A*\n",
"benjamins.bib 234367 U+009A\n",
"benjamins.bib 234637 U+009A\n",
"benjamins.bib 234647 U+009A***********\n",
"benjamins.bib 250920 U+0092\n",
"benjamins.bib 250930 U+0092**\n",
"benjamins.bib 250962 U+0096\n",
"benjamins.bib 251058 U+0092*\n",
"benjamins.bib 251092 U+0092*\n",
"benjamins.bib 251173 U+0092*, U+0091\n",
"benjamins.bib 251190 U+0096\n",
"benjamins.bib 251263 U+0092\n",
"benjamins.bib 251297 U+0092\n",
"benjamins.bib 268623 U+009A\n",
"benjamins.bib 275729 U+009A\n",
"benjamins.bib 310265 U+0092\n",
"benjamins.bib 310282 U+0097***, U+0091, U+0092\n",
"benjamins.bib 310299 U+0092, U+0093, U+0094\n",
"benjamins.bib 310315 U+0093*, U+0094*, U+0092*, U+0097*\n",
"benjamins.bib 310331 U+0092**, U+0093, U+0094\n",
"benjamins.bib 319306 U+0092\n",
"benjamins.bib 319316 U+0093*, U+0094*\n",
"benjamins.bib 319348 U+0093, U+0094\n",
"benjamins.bib 427487 U+0092\n",
"benjamins.bib 427536 U+0093**, U+0094**\n",
"benjamins.bib 427553 U+0093, U+0094\n",
"benjamins.bib 427570 U+0097*\n",
"benjamins.bib 442233 U+008A\n",
"benjamins.bib 442329 U+008A\n",
"benjamins.bib 469395 U+0097***\n",
"benjamins.bib 660000 U+0091, U+0092\n",
"benjamins.bib 660010 U+0091**, U+0092**\n",
"benjamins.bib 660027 U+0092***, U+0091**\n",
"benjamins.bib 660378 U+0092*******, U+0093, U+0094\n",
"degruyter.bib 530160 U+0091\n",
"degruyter.bib 568722 U+0096\n",
"degruyter.bib 568750 U+0091, U+0092\n",
"degruyter.bib 569352 U+0092*, U+0091\n",
"degruyter.bib 569366 U+0091, U+0092\n",
"degruyter.bib 569422 U+0091, U+0092\n",
"degruyter.bib 569576 U+0091, U+0092\n",
"degruyter.bib 569744 U+0091, U+0092\n",
"degruyter.bib 569758 U+0091, U+0092\n",
"degruyter.bib 569954 U+0091, U+0092\n",
"degruyter.bib 570122 U+0096\n",
"degruyter.bib 570752 U+0091, U+0092\n",
"degruyter.bib 578214 U+0096\n",
"degruyter.bib 578508 U+0096\n",
"degruyter.bib 578886 U+0096\n",
"degruyter.bib 579880 U+0091, U+0092\n",
"degruyter.bib 580650 U+0084, U+0094\n",
"degruyter.bib 580804 U+0096\n",
"degruyter.bib 581406 U+0092\n",
"degruyter.bib 581700 U+0091, U+0092\n",
"degruyter.bib 581770 U+0091, U+0092\n",
"degruyter.bib 581784 U+0091, U+0092\n",
"degruyter.bib 581882 U+0084*, U+0094*\n",
"degruyter.bib 582120 U+0091, U+0092\n",
"degruyter.bib 582302 U+0093, U+0094\n",
"degruyter.bib 582330 U+0084, U+0093\n",
"degruyter.bib 582344 U+0092, U+0084, U+0094\n",
"degruyter.bib 582414 U+0094, U+0093\n",
"degruyter.bib 582470 U+0084, U+0093\n",
"degruyter.bib 585229 U+0096\n",
"degruyter.bib 586629 U+0092*\n",
"degruyter.bib 622167 U+0092*\n",
"degruyter.bib 622257 U+0093, U+0094\n",
"degruyter.bib 622272 U+0092\n",
"degruyter.bib 622407 U+0092, U+0096\n",
"degruyter.bib 622422 U+0092\n",
"degruyter.bib 697231 U+0092*\n",
"degruyter.bib 697246 U+0093****, U+0094****, U+0092\n",
"degruyter.bib 697261 U+0092\n",
"degruyter.bib 697320 U+0092**\n",
"degruyter.bib 697350 U+0092*\n",
"degruyter.bib 697365 U+0092\n",
"degruyter.bib 713347 U+0096\n",
"degruyter.bib 713515 U+0096*\n",
"degruyter.bib 713809 U+0096\n",
"degruyter.bib 713977 U+0096*\n",
"degruyter.bib 715588 U+0096\n",
"degruyter.bib 715854 U+0096\n",
"degruyter.bib 716063 U+0086\n",
"degruyter.bib 716120 U+0084, U+0094\n",
"degruyter.bib 716470 U+0084, U+0094\n",
"degruyter.bib 716568 U+0084*, U+0094*\n",
"degruyter.bib 716582 U+0084, U+0094\n",
"degruyter.bib 716596 U+0096\n",
"degruyter.bib 716680 U+0096*\n",
"degruyter.bib 717660 U+0096\n",
"degruyter.bib 804023 U+0084, U+0093\n",
"degruyter.bib 804039 U+0084, U+0093, U+0096\n",
"degruyter.bib 804055 U+009A*, U+0084, U+0093\n",
"degruyter.bib 804104 U+0096\n",
"degruyter.bib 804152 U+0084, U+0093\n",
"degruyter.bib 804295 U+0084, U+0093\n",
"degruyter.bib 804311 U+0096\n",
"degruyter.bib 804327 U+0084, U+009E, U+0093\n",
"degruyter.bib 804343 U+0096\n",
"degruyter.bib 804359 U+0096\n",
"degruyter.bib 804375 U+0096\n",
"degruyter.bib 804390 U+0084, U+0093, U+0096\n",
"degruyter.bib 804447 U+0096*\n",
"degruyter.bib 804533 U+0084, U+0093, U+0096\n",
"degruyter.bib 804558 U+0096\n",
"degruyter.bib 804879 U+0096\n",
"degruyter.bib 805198 U+009A*\n",
"degruyter.bib 805246 U+0096\n",
"degruyter.bib 805321 U+0096\n",
"degruyter.bib 805336 U+0084, U+0093, U+0096\n",
"degruyter.bib 805361 U+0084**, U+0093*, U+0096, U+0094\n",
"degruyter.bib 805461 U+0093, U+0094\n",
"degruyter.bib 805485 U+0084, U+0093, U+0096\n",
"degruyter.bib 805558 U+0096***, U+009A*\n",
"degruyter.bib 805757 U+0096\n",
"degruyter.bib 805882 U+0096\n",
"degruyter.bib 805912 U+0084, U+0096, U+0093\n",
"degruyter.bib 805927 U+0096\n",
"degruyter.bib 805952 U+009A, U+0096\n",
"degruyter.bib 805984 U+0096\n",
"degruyter.bib 806166 U+0096**\n",
"degruyter.bib 806198 U+009A\n",
"degruyter.bib 806905 U+0096\n",
"degruyter.bib 808589 U+0096\n",
"degruyter.bib 808841 U+0096\n",
"degruyter.bib 810227 U+0096*\n",
"degruyter.bib 810997 U+0096\n",
"degruyter.bib 811053 U+0084, U+0094\n",
"degruyter.bib 845233 U+0093, U+0094\n",
"degruyter.bib 845250 U+0096***\n",
"degruyter.bib 845335 U+0096*\n",
"degruyter.bib 845480 U+0096*\n",
"degruyter.bib 845582 U+0093, U+0094\n",
"degruyter.bib 845604 U+0096\n",
"degruyter.bib 845715 U+0096\n",
"degruyter.bib 846030 U+0096\n",
"evobib.bib 3918 U+0094*\n",
"evobib.bib 5236 U+0092\n",
"evobib.bib 6018 U+0092\n",
"evobib.bib 7279 U+0094\n",
"evobib.bib 8854 U+0092*\n",
"evobib.bib 12428 U+0092\n",
"evobib.bib 12897 U+0092\n",
"evobib.bib 12980 U+0092\n",
"evobib.bib 13107 U+0094*\n",
"evobib.bib 13879 U+0088*\n",
"evobib.bib 16104 U+0094\n",
"evobib.bib 19572 U+0092\n",
"evobib.bib 20442 U+0094*\n",
"evobib.bib 21535 U+0094*\n",
"evobib.bib 22127 U+0094\n",
"evobib.bib 28276 U+0094\n",
"evobib.bib 29672 U+0092\n",
"evobib.bib 33284 U+0092\n",
"evobib.bib 33656 U+0094**\n",
"evobib.bib 34386 U+008C\n",
"evobib.bib 34387 U+008C\n",
"evobib.bib 34578 U+0092\n",
"evobib.bib 35756 U+0094\n",
"evobib.bib 37019 U+0094\n",
"evobib.bib 37080 U+0094*\n",
"evobib.bib 39648 U+0094\n",
"evobib.bib 40097 U+0092\n",
"evobib.bib 40353 U+0092\n",
"evobib.bib 40434 U+0092\n",
"ldh.bib 12171 U+0092\n",
"ldh.bib 12817 U+0093, U+0094\n",
"ldh.bib 13861 U+0096\n",
"sil16.bib 75677 U+0080\n",
"sil16.bib 85046 U+0080\n",
"sil16.bib 124097 U+0090\n",
"sil16.bib 129034 U+0080*\n",
"sil16.bib 137779 U+0090\n",
"sil16.bib 143544 U+0080**\n",
"sil16.bib 146284 U+0080*\n"
]
}
],
"source": [
"# Accumulate the per-file counts; count_control_chars() also prints each hit.\n",
"total = collections.Counter()\n",
"\n",
"for path in bibfiles:\n",
"    total += count_control_chars(path)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "749dc209-c154-40ff-a9f1-e9ecacbe8900",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[('U+0092', 101),\n",
" ('U+0096', 70),\n",
" ('U+0094', 58),\n",
" ('U+0093', 42),\n",
" ('U+009A', 30),\n",
" ('U+0091', 26),\n",
" ('U+0084', 26),\n",
" ('U+0097', 17),\n",
" ('U+0080', 9),\n",
" ('U+008A', 2),\n",
" ('U+0088', 2),\n",
" ('U+008C', 2),\n",
" ('U+0090', 2),\n",
" ('U+0086', 1),\n",
" ('U+009E', 1)]"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"total.most_common()"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "729d15df-e759-4a08-bf61-fe625a6b4778",
"metadata": {},
"outputs": [],
"source": [
"# Strip C1 control characters from the bibfiles flagged as editable.\n",
"for path in bibfiles:\n",
"    if path.name not in editable:\n",
"        continue\n",
"\n",
"    # newline='' disables newline translation, so the file's existing line\n",
"    # endings survive the read/write round trip unchanged.\n",
"    with path.open(encoding=ENCODING, newline='') as f:\n",
"        old = f.read()\n",
"    new, replaced = C1_CONTROL_CHAR.subn('', old)\n",
"\n",
"    # Only touch files that actually changed.\n",
"    if replaced:\n",
"        print(f'{path.name} replaced {replaced} characters')\n",
"        with path.open('w', encoding=ENCODING, newline='') as f:\n",
"            f.write(new)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.4"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment