akngs · November 26, 2017 08:56
diff --git a/wordfreq.md b/wordfreq.md
diff --git a/wordfreq.py b/wordfreq.py
 import argparse
 import csv
 import sys
 from collections import Counter
 from urllib.request import urlopen

 from konlpy.tag import Twitter
 from lxml import html


 def main(argv=None):
     """Fetch a page, count its frequent nouns and print them as CSV.

     Args:
         argv: Command-line arguments to parse, without the program name.
             ``None`` (the default) makes argparse read ``sys.argv[1:]``.

     The result is written to standard output.
     """
     parser = argparse.ArgumentParser(
         description='Extract frequent words from URL')
     # Both options are mandatory: without them fetch() would receive None
     # and fail with an unrelated error instead of a usage message.
     parser.add_argument('--url', required=True, help='URL to fetch')
     parser.add_argument('--xpath', required=True, help='XPath expression')
     args = parser.parse_args(argv)

     text = fetch(args.url, args.xpath)

     words = extract_words(text)
     to_csv(sys.stdout, words)


 def fetch(url, xpath):
     """Download ``url`` and return the text selected by ``xpath``.

     Args:
         url: Address of the HTML page to download.
         xpath: XPath expression evaluated against the parsed page. The
             result is joined with ``str.join``, so the expression must
             select strings (for example ``//p/text()``).

     Returns:
         The selected strings joined with newlines.
     """
     # Close the response deterministically instead of leaving it to the
     # garbage collector.
     with urlopen(url) as res:
         # Honour the charset declared by the server; fall back to UTF-8
         # when none is declared.
         charset = res.headers.get_content_charset() or 'utf-8'
         body = res.read().decode(charset)
     page = html.fromstring(body)
     return '\n'.join(page.xpath(xpath))


 def extract_words(text, max_n=500, min_freq=2, min_len=3):
     """Count the nouns in ``text`` and return the most frequent ones.

     Args:
         text: Text to analyze.
         max_n: Upper bound on the number of distinct nouns considered,
             taken in descending order of frequency.
         min_freq: Nouns occurring fewer times than this are dropped.
         min_len: Nouns shorter than this many characters are dropped.
             The default of 3 matches the previously hard-coded filter.

     Returns:
         A list of ``{'word': ..., 'freq': ...}`` dicts ordered from most
         to least frequent.
     """
     # Twitter analyzer is the fastest so far
     analyzer = Twitter()
     count = Counter(n for n in analyzer.nouns(text) if len(n) >= min_len)

     return [
         {'word': word, 'freq': freq}
         for word, freq in count.most_common(max_n)
         if freq >= min_freq
     ]


 def to_csv(stream, objs):
     """Write a list of dicts to ``stream`` as CSV with a header row.

     Args:
         stream: Writable text stream.
         objs: Sequence of dicts; the keys of the first dict become the
             column names.

     Nothing is written when ``objs`` is empty, because there is no row to
     take the column names from.
     """
     if not objs:
         return

     writer = csv.DictWriter(stream, list(objs[0]))
     writer.writeheader()
     writer.writerows(objs)


 # Run the command-line entry point only when executed as a script,
 # not when the module is imported.
 if __name__ == "__main__":
     main()
	import argparse
	import csv
	import sys
	from collections import Counter
	from urllib.request import urlopen

	from konlpy.tag import Twitter
	from lxml import html


	def main(argv=None):
	    """Fetch a page, count its frequent nouns and print them as CSV.

	    Args:
	        argv: Command-line arguments to parse, without the program name.
	            ``None`` (the default) makes argparse read ``sys.argv[1:]``.

	    The result is written to standard output.
	    """
	    parser = argparse.ArgumentParser(
	        description='Extract frequent words from URL')
	    # Both options are mandatory: without them fetch() would receive None
	    # and fail with an unrelated error instead of a usage message.
	    parser.add_argument('--url', required=True, help='URL to fetch')
	    parser.add_argument('--xpath', required=True, help='XPath expression')
	    args = parser.parse_args(argv)

	    text = fetch(args.url, args.xpath)

	    words = extract_words(text)
	    to_csv(sys.stdout, words)


	def fetch(url, xpath):
	    """Download ``url`` and return the text selected by ``xpath``.

	    Args:
	        url: Address of the HTML page to download.
	        xpath: XPath expression evaluated against the parsed page. The
	            result is joined with ``str.join``, so the expression must
	            select strings (for example ``//p/text()``).

	    Returns:
	        The selected strings joined with newlines.
	    """
	    # Close the response deterministically instead of leaving it to the
	    # garbage collector.
	    with urlopen(url) as res:
	        # Honour the charset declared by the server; fall back to UTF-8
	        # when none is declared.
	        charset = res.headers.get_content_charset() or 'utf-8'
	        body = res.read().decode(charset)
	    page = html.fromstring(body)
	    return '\n'.join(page.xpath(xpath))


	def extract_words(text, max_n=500, min_freq=2, min_len=3):
	    """Count the nouns in ``text`` and return the most frequent ones.

	    Args:
	        text: Text to analyze.
	        max_n: Upper bound on the number of distinct nouns considered,
	            taken in descending order of frequency.
	        min_freq: Nouns occurring fewer times than this are dropped.
	        min_len: Nouns shorter than this many characters are dropped.
	            The default of 3 matches the previously hard-coded filter.

	    Returns:
	        A list of ``{'word': ..., 'freq': ...}`` dicts ordered from most
	        to least frequent.
	    """
	    # Twitter analyzer is the fastest so far
	    analyzer = Twitter()
	    count = Counter(n for n in analyzer.nouns(text) if len(n) >= min_len)

	    return [
	        {'word': word, 'freq': freq}
	        for word, freq in count.most_common(max_n)
	        if freq >= min_freq
	    ]


	def to_csv(stream, objs):
	    """Write a list of dicts to ``stream`` as CSV with a header row.

	    Args:
	        stream: Writable text stream.
	        objs: Sequence of dicts; the keys of the first dict become the
	            column names.

	    Nothing is written when ``objs`` is empty, because there is no row to
	    take the column names from.
	    """
	    if not objs:
	        return

	    writer = csv.DictWriter(stream, list(objs[0]))
	    writer.writeheader()
	    writer.writerows(objs)


	# Run the command-line entry point only when executed as a script,
	# not when the module is imported.
	if __name__ == '__main__':
	    main()