- Install Python 3
- Install KoNLPy and lxml (both are imported by the script)
- Run
python wordfreq.py --xpath './/div[@class="statement"]/div[@class="content glossary"]/text()' --url http://pokr.kr/meeting/1933823653/dialog
Last active
November 26, 2017 08:56
-
-
Save akngs/8078b7493e29137697e127eb62db70b9 to your computer and use it in GitHub Desktop.
Extract frequent words from URL
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import argparse | |
import csv | |
import sys | |
from collections import Counter | |
from urllib.request import urlopen | |
from konlpy.tag import Twitter | |
from lxml import html | |
def main():
    """Command-line entry point: print a word-frequency CSV for a web page.

    Parses ``--url`` and ``--xpath`` from the command line, fetches the text
    selected by the XPath expression, extracts frequent words from it and
    writes the result to standard output as CSV.
    """
    parser = argparse.ArgumentParser(
        description='Extract frequent words from URL')
    # Both options are mandatory: without them fetch() would be called with
    # None and fail with an obscure error instead of a usage message.
    parser.add_argument('--url', required=True, help='URL to fetch')
    parser.add_argument('--xpath', required=True, help='XPath expression')
    args = parser.parse_args()

    text = fetch(args.url, args.xpath)
    words = extract_words(text)
    to_csv(sys.stdout, words)
def fetch(url, xpath):
    """Download *url* and return the text selected by *xpath*.

    The response body is decoded as UTF-8 and parsed as HTML with lxml.
    The XPath results are joined with newlines, so the expression is
    expected to select strings (for example ``text()`` nodes).
    """
    # Close the HTTP response deterministically rather than leaving the
    # connection open until the object happens to be garbage collected.
    with urlopen(url) as response:
        body = response.read().decode('utf-8')
    page = html.fromstring(body)
    return '\n'.join(page.xpath(xpath))
def extract_words(text, max_n=500, min_freq=2):
    """Return the most frequent nouns found in *text*.

    Nouns are extracted with KoNLPy's ``Twitter`` analyzer and only nouns
    longer than two characters are counted. At most *max_n* of the most
    common nouns are considered, and of those only the ones occurring at
    least *min_freq* times are kept.

    Returns a list of ``{'word': ..., 'freq': ...}`` dicts, most frequent
    first.
    """
    # Original author's note: the Twitter analyzer was the fastest one tried.
    tagger = Twitter()
    tally = Counter(noun for noun in tagger.nouns(text) if len(noun) > 2)

    result = []
    for word, occurrences in tally.most_common(max_n):
        if occurrences < min_freq:
            continue
        result.append({'word': word, 'freq': occurrences})
    return result
def to_csv(stream, objs):
    """Write a list of dicts to *stream* as CSV with a header row.

    The column names are taken from the keys of the first dict in *objs*.
    If *objs* is empty there are no columns to infer, so nothing is written
    (previously this case raised ``IndexError``).
    """
    if not objs:
        return
    writer = csv.DictWriter(stream, list(objs[0].keys()))
    writer.writeheader()
    writer.writerows(objs)
# Run the command-line interface only when executed as a script,
# not when this module is imported.
if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment