Created
September 23, 2017 01:18
-
-
Save twairball/2bd45f0f1dbfe3543a34193edfde2f1b to your computer and use it in GitHub Desktop.
WMT17 Zh-En corpus have different number of lines?
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 51, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"import jieba\n", | |
"import nltk\n", | |
"import os\n", | |
"\n", | |
"\"\"\"\n", | |
"Notebook exploring the weird misalignment between the Zh and En corpora.\n", | |
"Inspecting the files on the unix console, we expect 227,330 lines in both corpora.\n", | |
"However, looping through the file line-by-line in python we find:\n", | |
" EN: 227568\n", | |
" ZH: 227603 (diff: 35)\n", | |
"\n", | |
"\n", | |
"On console: \n", | |
"$ wc -l training/news-commentary-v12.zh-en.zh\n", | |
"227330 training/news-commentary-v12.zh-en.zh\n", | |
"\n", | |
"$ wc -l training/news-commentary-v12.zh-en.en\n", | |
"227330 training/news-commentary-v12.zh-en.en\n", | |
"\n", | |
"\n", | |
"WMT17 training Dataset corpus can be downloaded from:\n", | |
"http://data.statmt.org/wmt17/translation-task/training-parallel-nc-v12.tgz\n", | |
"\n", | |
"\n", | |
"\n", | |
"\"\"\"\n", | |
"# Chinese and English sides of the news-commentary v12 Zh-En training corpus.\n", | |
"zh_filepath = \"tmp/wmt17_en_zh/training/news-commentary-v12.zh-en.zh\"\n", | |
"en_filepath = \"tmp/wmt17_en_zh/training/news-commentary-v12.zh-en.en\"" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 52, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"(227603, 227573)" | |
] | |
}, | |
"execution_count": 52, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"\"\"\" Counting using splitlines(). Somehow we get a different count for En than with readlines(). \"\"\"\n", | |
"def count_splitlines(filename):\n", | |
"    \"\"\"Return the number of lines str.splitlines() finds in `filename`.\n", | |
"\n", | |
"    str.splitlines() breaks on every Unicode line boundary (LF, CR, CRLF,\n", | |
"    VT, FF, FS, GS, RS, NEL, U+2028, U+2029), so the result can be larger\n", | |
"    than a count of LF-terminated lines.\n", | |
"    \"\"\"\n", | |
"    # Context manager so the handle is closed instead of leaked.\n", | |
"    with open(filename) as handle:\n", | |
"        return len(handle.read().splitlines())\n", | |
"\n", | |
"count_splitlines(zh_filepath), count_splitlines(en_filepath)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 34, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"(227603, 227568)" | |
] | |
}, | |
"execution_count": 34, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"\"\"\" readlines() matches what we typically expect when reading line-by-line from Python. \"\"\"\n", | |
"def count_readlines(filename):\n", | |
"    \"\"\"Return the number of lines seen when iterating `filename` in text mode.\n", | |
"\n", | |
"    Text mode uses universal newlines, so a lone CR or a CRLF pair ends a\n", | |
"    line just like LF does. A tool that counts only LF bytes can therefore\n", | |
"    report fewer lines for the same file.\n", | |
"    \"\"\"\n", | |
"    # Stream the file instead of building a list of every line, and let the\n", | |
"    # context manager close the handle.\n", | |
"    with open(filename) as handle:\n", | |
"        return sum(1 for _ in handle)\n", | |
"\n", | |
"count_readlines(zh_filepath), count_readlines(en_filepath)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 22, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"\"\"\" Count lines that are blank. \"\"\"\n", | |
"def blank_line_count(filename):\n", | |
"    \"\"\"Return how many lines of `filename` are empty or whitespace-only.\"\"\"\n", | |
"    blanks = 0\n", | |
"    with open(filename) as handle:\n", | |
"        for text in handle:\n", | |
"            # strip() leaves an empty string when the line holds only whitespace.\n", | |
"            if not text.strip():\n", | |
"                blanks += 1\n", | |
"    return blanks" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 23, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"(39, 146)" | |
] | |
}, | |
"execution_count": 23, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"blank_line_count(zh_filepath), blank_line_count(en_filepath)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 53, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"(0, 0)" | |
] | |
}, | |
"execution_count": 53, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"\"\"\" find occurrences of weird line breaks? \\n, \\r, \\t, \\v\"\"\"\n", | |
"import re\n", | |
"def cr_count(filename, substr=\"\\r\"):\n", | |
"    \"\"\"Return how many matches of `substr` occur in the raw text of `filename`.\n", | |
"\n", | |
"    `substr` is handed to re.findall(), so it is interpreted as a regular\n", | |
"    expression pattern; the default matches a carriage return.\n", | |
"    \"\"\"\n", | |
"    # newline=\"\" disables universal-newline translation. With the default\n", | |
"    # text mode every CR and CRLF is rewritten to LF before we see it, so a\n", | |
"    # search for carriage returns would always report zero.\n", | |
"    with open(filename, newline=\"\") as handle:\n", | |
"        full = handle.read()\n", | |
"    return len(re.findall(substr, full))\n", | |
"\n", | |
"cr_count(zh_filepath, \"\\r\"), cr_count(en_filepath, \"\\r\") " | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 48, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"def report_blank_lines(filename):\n", | |
"    \"\"\"Print the index and raw text of each whitespace-only line, then the total.\n", | |
"\n", | |
"    Indices are zero-based positions from enumerate(); the line text is\n", | |
"    printed with its own line ending, followed by an [END] marker.\n", | |
"    \"\"\"\n", | |
"    blank_total = 0\n", | |
"    with open(filename) as handle:\n", | |
"        for index, text in enumerate(handle):\n", | |
"            if text.strip():\n", | |
"                continue  # line has visible content\n", | |
"            print(\"[%d] %s[END]\" % (index, text))\n", | |
"            blank_total += 1\n", | |
"    print(\" total: %d\" % blank_total)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 49, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"[27660] \n", | |
"[END]\n", | |
"[51225] \n", | |
"[END]\n", | |
"[66871] \n", | |
"[END]\n", | |
"[75770] \n", | |
"[END]\n", | |
"[75775] \n", | |
"[END]\n", | |
"[82330] \n", | |
"[END]\n", | |
"[89880] \n", | |
"[END]\n", | |
"[91075] \n", | |
"[END]\n", | |
"[105145] \n", | |
"[END]\n", | |
"[119307] \n", | |
"[END]\n", | |
"[126515] \n", | |
"[END]\n", | |
"[128127] \n", | |
"[END]\n", | |
"[137127] \n", | |
"[END]\n", | |
"[137604] \n", | |
"[END]\n", | |
"[145516] \n", | |
"[END]\n", | |
"[146597] \n", | |
"[END]\n", | |
"[147274] \n", | |
"[END]\n", | |
"[151833] \n", | |
"[END]\n", | |
"[166718] \n", | |
"[END]\n", | |
"[167566] \n", | |
"[END]\n", | |
"[167574] \n", | |
"[END]\n", | |
"[167586] \n", | |
"[END]\n", | |
"[167591] \n", | |
"[END]\n", | |
"[167598] \n", | |
"[END]\n", | |
"[172120] \n", | |
"[END]\n", | |
"[176885] \n", | |
"[END]\n", | |
"[178064] \n", | |
"[END]\n", | |
"[178066] \n", | |
"[END]\n", | |
"[178643] \n", | |
"[END]\n", | |
"[178983] \n", | |
"[END]\n", | |
"[178985] \n", | |
"[END]\n", | |
"[179010] \n", | |
"[END]\n", | |
"[179817] \n", | |
"[END]\n", | |
"[180836] \n", | |
"[END]\n", | |
"[183407] \n", | |
"[END]\n", | |
"[190958] \n", | |
"[END]\n", | |
"[193349] \n", | |
"[END]\n", | |
"[197577] \n", | |
"[END]\n", | |
"[206624] \n", | |
"[END]\n", | |
" total: 39\n" | |
] | |
} | |
], | |
"source": [ | |
"report_blank_lines(zh_filepath)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 50, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"[4088] \n", | |
"[END]\n", | |
"[8112] \n", | |
"[END]\n", | |
"[13275] \n", | |
"[END]\n", | |
"[13276] \n", | |
"[END]\n", | |
"[13357] \n", | |
"[END]\n", | |
"[13358] \n", | |
"[END]\n", | |
"[13581] \n", | |
"[END]\n", | |
"[13582] \n", | |
"[END]\n", | |
"[13783] \n", | |
"[END]\n", | |
"[13784] \n", | |
"[END]\n", | |
"[14646] \n", | |
"[END]\n", | |
"[14740] \n", | |
"[END]\n", | |
"[15454] \n", | |
"[END]\n", | |
"[15514] \n", | |
"[END]\n", | |
"[15515] \n", | |
"[END]\n", | |
"[16244] \n", | |
"[END]\n", | |
"[20289] \n", | |
"[END]\n", | |
"[23012] \n", | |
"[END]\n", | |
"[24964] \n", | |
"[END]\n", | |
"[24965] \n", | |
"[END]\n", | |
"[27670] \n", | |
"[END]\n", | |
"[31465] \n", | |
"[END]\n", | |
"[31466] \n", | |
"[END]\n", | |
"[32799] \n", | |
"[END]\n", | |
"[35079] \n", | |
"[END]\n", | |
"[35080] \n", | |
"[END]\n", | |
"[37662] \n", | |
"[END]\n", | |
"[37663] \n", | |
"[END]\n", | |
"[39318] \n", | |
"[END]\n", | |
"[39321] \n", | |
"[END]\n", | |
"[45101] \n", | |
"[END]\n", | |
"[48448] \n", | |
"[END]\n", | |
"[48450] \n", | |
"[END]\n", | |
"[48451] \n", | |
"[END]\n", | |
"[48454] \n", | |
"[END]\n", | |
"[49221] \n", | |
"[END]\n", | |
"[49222] \n", | |
"[END]\n", | |
"[51263] \n", | |
"[END]\n", | |
"[55062] \n", | |
"[END]\n", | |
"[64458] \n", | |
"[END]\n", | |
"[66912] \n", | |
"[END]\n", | |
"[70704] \n", | |
"[END]\n", | |
"[74897] \n", | |
"[END]\n", | |
"[75821] \n", | |
"[END]\n", | |
"[75826] \n", | |
"[END]\n", | |
"[75975] \n", | |
"[END]\n", | |
"[79343] \n", | |
"[END]\n", | |
"[80313] \n", | |
"[END]\n", | |
"[80691] \n", | |
"[END]\n", | |
"[82383] \n", | |
"[END]\n", | |
"[82385] \n", | |
"[END]\n", | |
"[83542] \n", | |
"[END]\n", | |
"[85636] \n", | |
"[END]\n", | |
"[88839] \n", | |
"[END]\n", | |
"[91118] \n", | |
"[END]\n", | |
"[91831] \n", | |
"[END]\n", | |
"[91832] \n", | |
"[END]\n", | |
"[92274] \n", | |
"[END]\n", | |
"[98705] \n", | |
"[END]\n", | |
"[100113] \n", | |
"[END]\n", | |
"[102805] \n", | |
"[END]\n", | |
"[103524] \n", | |
"[END]\n", | |
"[103525] \n", | |
"[END]\n", | |
"[103531] \n", | |
"[END]\n", | |
"[103532] \n", | |
"[END]\n", | |
"[104059] \n", | |
"[END]\n", | |
"[105195] \n", | |
"[END]\n", | |
"[105196] \n", | |
"[END]\n", | |
"[105204] \n", | |
"[END]\n", | |
"[109823] \n", | |
"[END]\n", | |
"[112173] \n", | |
"[END]\n", | |
"[112174] \n", | |
"[END]\n", | |
"[112522] \n", | |
"[END]\n", | |
"[114362] \n", | |
"[END]\n", | |
"[116957] \n", | |
"[END]\n", | |
"[116958] \n", | |
"[END]\n", | |
"[119007] \n", | |
"[END]\n", | |
"[120015] \n", | |
"[END]\n", | |
"[121140] \n", | |
"[END]\n", | |
"[121142] \n", | |
"[END]\n", | |
"[121147] \n", | |
"[END]\n", | |
"[123323] \n", | |
"[END]\n", | |
"[123324] \n", | |
"[END]\n", | |
"[126575] \n", | |
"[END]\n", | |
"[127835] \n", | |
"[END]\n", | |
"[127836] \n", | |
"[END]\n", | |
"[128796] \n", | |
"[END]\n", | |
"[128803] \n", | |
"[END]\n", | |
"[133237] \n", | |
"[END]\n", | |
"[141777] \n", | |
"[END]\n", | |
"[142861] \n", | |
"[END]\n", | |
"[142895] \n", | |
"[END]\n", | |
"[144866] \n", | |
"[END]\n", | |
"[145908] \n", | |
"[END]\n", | |
"[146305] \n", | |
"[END]\n", | |
"[146306] \n", | |
"[END]\n", | |
"[146751] \n", | |
"[END]\n", | |
"[147268] \n", | |
"[END]\n", | |
"[147269] \n", | |
"[END]\n", | |
"[147881] \n", | |
"[END]\n", | |
"[151364] \n", | |
"[END]\n", | |
"[151905] \n", | |
"[END]\n", | |
"[156970] \n", | |
"[END]\n", | |
"[162701] \n", | |
"[END]\n", | |
"[164167] \n", | |
"[END]\n", | |
"[166196] \n", | |
"[END]\n", | |
"[166202] \n", | |
"[END]\n", | |
"[166791] \n", | |
"[END]\n", | |
"[167671] \n", | |
"[END]\n", | |
"[169043] \n", | |
"[END]\n", | |
"[169044] \n", | |
"[END]\n", | |
"[172187] \n", | |
"[END]\n", | |
"[172202] \n", | |
"[END]\n", | |
"[174471] \n", | |
"[END]\n", | |
"[174472] \n", | |
"[END]\n", | |
"[177674] \n", | |
"[END]\n", | |
"[178739] \n", | |
"[END]\n", | |
"[179035] \n", | |
"[END]\n", | |
"[179036] \n", | |
"[END]\n", | |
"[179379] \n", | |
"[END]\n", | |
"[179808] \n", | |
"[END]\n", | |
"[180869] \n", | |
"[END]\n", | |
"[182652] \n", | |
"[END]\n", | |
"[184332] \n", | |
"[END]\n", | |
"[184333] \n", | |
"[END]\n", | |
"[184483] \n", | |
"[END]\n", | |
"[184484] \n", | |
"[END]\n", | |
"[190377] \n", | |
"[END]\n", | |
"[191032] \n", | |
"[END]\n", | |
"[191407] \n", | |
"[END]\n", | |
"[192485] \n", | |
"[END]\n", | |
"[192486] \n", | |
"[END]\n", | |
"[195724] \n", | |
"[END]\n", | |
"[197527] \n", | |
"[END]\n", | |
"[199217] \n", | |
"[END]\n", | |
"[199218] \n", | |
"[END]\n", | |
"[199819] \n", | |
"[END]\n", | |
"[202672] \n", | |
"[END]\n", | |
"[211684] \n", | |
"[END]\n", | |
"[214254] \n", | |
"[END]\n", | |
"[216153] \n", | |
"[END]\n", | |
"[216416] \n", | |
"[END]\n", | |
"[216638] \n", | |
"[END]\n", | |
"[217317] \n", | |
"[END]\n", | |
"[221007] \n", | |
"[END]\n", | |
"[225697] \n", | |
"[END]\n", | |
" total: 146\n" | |
] | |
} | |
], | |
"source": [ | |
"report_blank_lines(en_filepath)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.5.2" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment