Last active
January 11, 2019 02:05
-
-
Save naoh16/78265a0aedce491fdda0d8784b4eee5e to your computer and use it in GitHub Desktop.
Simple latex report checker: Level 2
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3
# -*- coding: utf-8 -*- | |
'''Edu report checker: Level 2 | |
形態素解析も行いながら,発展的な内容もチェックしています. | |
Message Examples | |
------------------- | |
- [LV2-1-1] 1段落に{:d}単語含まれています.多くとも 600 単語以下となるよう,文や段落の構成を考え直してみましょう. | |
- [LV2-2-1] 1文に{:d}単語含まれているようです.多くとも 60 単語以下となるよう,短い文にすることを考えてみましょう. | |
- [LV2-2-2] 1文に読点が{:d}個含まれています.多くとも 6 個以下になるよう,文の区切りを考え直してみましょう. | |
- [LV2-3-1] 助詞の「の」が{:d}回繰り返されており,わかりにくい文になっています.3 回未満を目安として,適切に読点を入れたり,1文の長さを再考する,などを考えてみましょう. | |
- [LV2-3-2] 二重否定と思われるセンテンス「{:s}」が含まれています.修正を検討しましょう. | |
- [LV2-4-1] 敬体(「です,ます」など)が使われています.レポートでは常体(「である」など)を使いましょう(ただし,感想を除く). | |
- [LV2-4-2] 弱い表現が使われています.レポートでは主体的に主張した文章を書きましょう(ただし,感想を除く). | |
Note | |
----- | |
- 利用には `Janome <https://github.com/mocobeta/janome>`_ が必要です.
''' | |
import sys | |
import re | |
# Thresholds and message templates for every check.
# Each FORMAT_* template that starts with '> {:.32s}...' takes the offending
# text as its first argument and prints at most its first 32 characters.

# Parameters for Check (1): number of words in one paragraph.
THRESHOLD_MAX_WORDS_IN_PARAGRAPH = 600
FORMAT_MAX_WORDS_IN_PARAGRAPH = '* [LV2-1-1] 1段落に{:d}単語含まれています.多くとも{:d}単語以下となるよう,文や段落の構成を考え直してみましょう.'

# Parameters for Check (2a): number of words in one sentence.
THRESHOLD_MAX_WORDS_IN_SENTENCE = 60
FORMAT_MAX_WORDS_IN_SENTENCE = '> {:.32s}...\n * [LV2-2-1] 1文に{:d}単語含まれているようです.多くとも{:d}単語以下となるよう,短い文にすることを考えてみましょう.'

# Parameters for Check (2b): number of commas in one sentence.
THRESHOLD_MAX_COMMA_IN_SENTENCE = 6
FORMAT_MAX_COMMA_IN_SENTENCE = '> {:.32s}...\n * [LV2-2-2] 1文に読点が{:d}個含まれています.多くとも{:d}個以下になるよう,文の区切りを考え直してみましょう.'

# Parameters for Check (3a): repetitions of the particle "no".
THRESHOLD_MAX_JOSHI_NO = 3
# Historical misspelling, kept as an alias because existing code refers to it.
THREASHOLD_MAX_JOSHI_NO = THRESHOLD_MAX_JOSHI_NO
FORMAT_MAX_JOSHI_NO = '> {:.32s}...\n * [LV2-3-1] 助詞の「の」が{:d}回繰り返されており,わかりにくい文になっています.{:d}回未満を目安として,適切に読点を入れたり,1文の長さを再考する,などを考えてみましょう.'

# Parameters for Check (3b): double negatives.
FORMAT_DOUBLE_NEGATIVE = '> {:.32s}...\n * [LV2-3-2] 二重否定と思われるセンテンス「{:s}」が含まれています.修正を検討しましょう.'
# Historical misspelling, kept as an alias because existing code refers to it.
FORMAT_DOUBLE_NEGTIVE = FORMAT_DOUBLE_NEGATIVE

# Parameters for Check (4a): polite (desu/masu) style.
FORMAT_DESUMASU = '> {:.32s}...\n * [LV2-4-1] 敬体(「です,ます」など)が使われています.レポートでは常体(「である」など)を使いましょう(ただし,感想を除く).'

# Parameters for Check (4b): weak (non-assertive) expressions.
FORMAT_WEAK_WORD = '> {:.32s}...\n * [LV2-4-2] 弱い表現が使われています.レポートでは主体的に主張した文章を書きましょう(ただし,感想を除く).'
def warning(linenum_st, linenum_en, source_text, messages):
    """Print one warning group to standard output.

    The group is a highlighted header naming the line range, followed by
    every entry of ``messages`` on its own line and one empty line.

    Args:
        linenum_st: First line number of the reported range (integer).
        linenum_en: Last line number of the reported range (integer).
        source_text: Text appended after the header (string).
        messages: Iterable of messages to print below the header.
    """
    header = '\033[1;32mWARNING: Lines {:d}--{:d}\033[0m: {:s}'.format(linenum_st, linenum_en, source_text)
    body = ['{}'.format(msg) for msg in messages]
    # The trailing empty string yields the blank line that closes the group.
    print(header, *body, '', sep='\n')
def load_texfile(src_filename):
    """Read a LaTeX file and split its running text into paragraphs.

    Lines inside ``verbatim``, ``table``, ``figure``, ``itemize`` and
    ``enumerate`` environments are skipped, together with the ``\\begin`` and
    ``\\end`` lines themselves. From the remaining lines, comments, sectioning
    commands, ``\\label``/``\\ref``/``\\cite`` and font-size switches are
    removed. A paragraph ends at an empty line, at a line that is exactly
    ``\\par``, or at the end of the file.

    Args:
        src_filename: Path of the UTF-8 encoded LaTeX file.

    Returns:
        A list of dicts with the keys ``'line_start'`` and ``'line_end'``
        (1-based line numbers) and ``'str'`` (the lines of the paragraph
        concatenated without a separator).
    """
    # (begin pattern, end pattern) of the skipped environments. They are
    # tested in this order on every line, so an earlier entry takes priority.
    skipped_environments = (
        (r'\\begin\{verbatim\}', r'\\end\{verbatim\}'),
        (r'\\begin\{table\}', r'\\end\{table\}'),
        (r'\\begin\{figure\}', r'\\end\{figure\}'),
        (r'\\begin\{(itemize|enumerate)\}', r'\\end\{(itemize|enumerate)\}'),
    )
    # One flag per environment. Keeping them next to their patterns prevents
    # a begin pattern from toggling the flag of a different environment.
    inside = [False] * len(skipped_environments)

    paragraphs = []
    with open(src_filename, 'r', encoding="utf-8") as f:
        n = 0
        n_parstart = 0
        sentence = ""
        for n, line in enumerate(f, start=1):
            line = line.rstrip()

            skip_line = False
            for index, (begin_pattern, end_pattern) in enumerate(skipped_environments):
                if inside[index]:
                    if re.search(end_pattern, line):
                        inside[index] = False
                    skip_line = True
                    break
                if re.search(begin_pattern, line):
                    inside[index] = True
                    skip_line = True
                    break
            if skip_line:
                continue

            # remove comments (a percent sign escaped as \% is kept)
            line = re.sub(r'(?<!\\)%.*', '', line)

            # remove some markups
            line = re.sub(r'\\(sub)*section\{[^\}]+\}', '', line)
            line = re.sub(r'\\(label|ref|cite)\{[^\}]+\}', '', line)
            line = re.sub(r'\\(small|large|huge)', '', line)
            line = re.sub(r'\\LaTeX\s*', 'LaTeX', line)

            if n_parstart == 0 and line != '':
                n_parstart = n

            sentence = sentence + line

            if (line == '' and len(sentence) > 0) or line == '\\par':
                # `line` is either '' or '\par' at this point.
                paragraphs.append({'line_start': n_parstart, 'line_end': n - 1,
                                   'str': sentence + line})
                sentence = ''
                n_parstart = 0

        # The file may end without a blank line; keep the pending paragraph
        # instead of dropping it.
        if sentence:
            paragraphs.append({'line_start': n_parstart, 'line_end': n,
                               'str': sentence})

    return paragraphs
def preprocess_sentences(paragraphs):
    """Strip LaTeX markup from the text of every paragraph, in place.

    Each element of ``paragraphs`` is a mapping whose ``'str'`` entry is
    overwritten with the cleaned text; nothing is returned.

    The substitutions run in this order:
      1. ``\\verb<d>...<d>`` becomes the placeholder `` VERB ``.
      2. ``$...$`` is replaced by its content.
      3. Remaining ``\\command`` names are removed.
      4. Curly braces are removed.
      5. Whitespace runs not preceded by a word character are removed.
    """
    # NOTE: inside a character class `\1` is not a backreference but the
    # character U+0001, so `[^\1]*?` lazily matches (almost) anything up to
    # the closing delimiter captured by group 1.
    substitutions = (
        (r'\\verb(.)([^\1]*?)\1', r' VERB '),
        (r'\$([^\$]*)\$', r'\g<1>'),
        (r'\\\w+', ''),
        (r'\{|\}', ''),
        (r'(?<!\w)\s+', ''),
    )
    for paragraph in paragraphs:
        text = paragraph['str']
        for pattern, replacement in substitutions:
            text = re.sub(pattern, replacement, text)
        paragraph['str'] = text
def _clause_warnings(short_sentence, joshi_no_count, has_desumasu, has_weak_word):
    """Return the messages of checks (3a), (4a) and (4b) for one clause."""
    found = []
    # Check (3a)
    if joshi_no_count >= THREASHOLD_MAX_JOSHI_NO:
        found.append(FORMAT_MAX_JOSHI_NO.format(
            short_sentence, joshi_no_count, THREASHOLD_MAX_JOSHI_NO))
    # Check (4a) DESU, MASU
    if has_desumasu:
        found.append(FORMAT_DESUMASU.format(short_sentence))
    # Check (4b) Weak words
    if has_weak_word:
        found.append(FORMAT_WEAK_WORD.format(short_sentence))
    return found


def _sentence_warnings(sentence, word_count, comma_count):
    """Return the messages of checks (2a) and (2b) for one sentence."""
    found = []
    # Check (2a)
    if word_count > THRESHOLD_MAX_WORDS_IN_SENTENCE:
        found.append(FORMAT_MAX_WORDS_IN_SENTENCE.format(
            sentence, word_count, THRESHOLD_MAX_WORDS_IN_SENTENCE))
    # Check (2b)
    if comma_count > THRESHOLD_MAX_COMMA_IN_SENTENCE:
        found.append(FORMAT_MAX_COMMA_IN_SENTENCE.format(
            sentence, comma_count, THRESHOLD_MAX_COMMA_IN_SENTENCE))
    return found


def report_check_level2(src_filename):
    """Run the level-2 checks on one LaTeX file and print the findings.

    The file is loaded with ``load_texfile`` and cleaned with
    ``preprocess_sentences``; every paragraph is then tokenized with Janome.
    Clause-level checks (3a, 4a, 4b) are evaluated at every comma,
    sentence-level checks (2a, 2b) at every period, and check (1) once per
    paragraph. Paragraphs with at least one finding are reported through
    ``warning``.

    Args:
        src_filename: Path of the LaTeX file to check.

    Returns:
        None. Findings are printed.
    """
    # Imported here so the third-party dependency is only needed when a file
    # is actually checked.
    from janome.tokenizer import Tokenizer
    pos_tagger = Tokenizer()

    # Load text
    paragraphs = load_texfile(src_filename)
    preprocess_sentences(paragraphs)

    for par in paragraphs:
        num_of_words_in_paragraph = 0
        num_of_words_in_sentence = 0
        num_of_comma_in_sentence = 0
        num_of_joshi_no = 0
        has_negative = False
        has_desumasu = False
        has_weak_word = False
        str_sentence = ""        # text of the current sentence
        str_short_sentence = ""  # text of the current clause (since the last comma)
        str_negative = ""        # text collected since a negative adjective was seen
        warning_messages = []

        for token in pos_tagger.tokenize(par['str']):
            pos0, pos1 = token.part_of_speech.split(',')[:2]

            str_sentence += token.surface
            str_short_sentence += token.surface

            # Symbols and parentheses do not count as words.
            if pos0 != '記号' and token.surface != '(' and token.surface != ')':
                num_of_words_in_paragraph += 1
                num_of_words_in_sentence += 1

            # NOTE(review): the nesting of this branch was reconstructed from
            # a copy that had lost its indentation. It is assumed that any
            # particle other than "の" ends the current run — confirm.
            if pos0 == '助詞':
                if token.surface == 'の':
                    num_of_joshi_no += 1
                else:
                    # Check (3a)
                    if num_of_joshi_no >= THREASHOLD_MAX_JOSHI_NO:
                        warning_messages.append(FORMAT_MAX_JOSHI_NO.format(
                            str_short_sentence, num_of_joshi_no, THREASHOLD_MAX_JOSHI_NO))
                    num_of_joshi_no = 0

            # Check (4a) DESU, MASU
            if pos0 == '助動詞' and re.search(r'です|ます', token.base_form):
                has_desumasu = True

            # Check (4b) Weak words
            if pos1 == '副助詞' and token.base_form == 'かも':
                has_weak_word = True
            if pos0 == '助動詞' and token.infl_form == '未然形' and token.base_form == 'だ':
                has_weak_word = True
            if (pos0 == '助動詞' or pos0 == '形容詞') and token.infl_form == '未然ウ接続':
                has_weak_word = True
            if pos0 == '形容詞' and token.base_form == '難い':
                has_weak_word = True

            if pos0 == '形容詞' and (re.search(r'無い|ない', token.base_form)):
                has_negative = True
            if has_negative:
                str_negative += token.surface
            if has_negative and pos0 == '助動詞':
                # Check (3b) Double Negative
                if token.surface == 'ない':
                    warning_messages.append(FORMAT_DOUBLE_NEGTIVE.format(
                        str_short_sentence, str_negative))
                # NOTE(review): assumed to run for every auxiliary verb, not
                # only for "ない" (indentation was lost in the source) — confirm.
                has_negative = False
                str_negative = ''

            if pos1 == '読点':
                num_of_comma_in_sentence += 1
                warning_messages.extend(_clause_warnings(
                    str_short_sentence, num_of_joshi_no, has_desumasu, has_weak_word))

                # reset the clause state
                num_of_joshi_no = 0
                str_short_sentence = '...,'
                has_negative = False
                # Cleared together with has_negative so that text of this
                # clause cannot leak into a later double-negative message.
                str_negative = ''
                # Already reported for this clause; without the reset the same
                # finding is repeated at the period against a different clause.
                has_desumasu = False
                has_weak_word = False

            if pos1 == '句点':  # '.'
                warning_messages.extend(_sentence_warnings(
                    str_sentence, num_of_words_in_sentence, num_of_comma_in_sentence))
                warning_messages.extend(_clause_warnings(
                    str_short_sentence, num_of_joshi_no, has_desumasu, has_weak_word))

                # reset the sentence state
                num_of_words_in_sentence = 0
                num_of_comma_in_sentence = 0
                str_sentence = ''
                num_of_joshi_no = 0
                str_short_sentence = ''
                has_negative = False
                str_negative = ''
                has_desumasu = False
                has_weak_word = False

        # Text after the last period would otherwise never be checked.
        if str_sentence:
            warning_messages.extend(_sentence_warnings(
                str_sentence, num_of_words_in_sentence, num_of_comma_in_sentence))
            warning_messages.extend(_clause_warnings(
                str_short_sentence, num_of_joshi_no, has_desumasu, has_weak_word))

        # Check (1)
        if num_of_words_in_paragraph > THRESHOLD_MAX_WORDS_IN_PARAGRAPH:
            warning_messages.append(FORMAT_MAX_WORDS_IN_PARAGRAPH.format(
                num_of_words_in_paragraph, THRESHOLD_MAX_WORDS_IN_PARAGRAPH))

        if warning_messages:
            warning(par['line_start'], par['line_end'], '', warning_messages)
# Command-line entry point: check every file named on the command line.
if __name__ == '__main__':
    if len(sys.argv) == 1:
        print('Usage: python {:s} filename1.tex [filename2.tex ...]'.format(sys.argv[0]))
        # sys.exit() is always available, whereas the bare exit() helper is
        # only installed by the site module for interactive use.
        sys.exit(1)
    else:
        for filename in sys.argv[1:]:
            report_check_level2(filename)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment