Created June 6, 2018 22:19
Use Text Summarization Algorithms to Aid the Writing of Meta Descriptions
import csv
import os

from sumy.parsers.html import HtmlParser
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lsa import LsaSummarizer as Lsa
from sumy.summarizers.luhn import LuhnSummarizer as Luhn
from sumy.summarizers.text_rank import TextRankSummarizer as TxtRank
from sumy.summarizers.lex_rank import LexRankSummarizer as LexRank
from sumy.summarizers.sum_basic import SumBasicSummarizer as SumBasic
from sumy.summarizers.kl import KLSummarizer as KL
from sumy.summarizers.edmundson import EdmundsonSummarizer as Edmundson
from sumy.nlp.stemmers import Stemmer
from sumy.utils import get_stop_words

LANGUAGE = "english"
SENTENCES_COUNT = 1

# Input: a plain-text file with one URL per line, resolved relative to this script.
urlinput = os.path.join(os.path.dirname(__file__), input('Enter input text file: '))
urls = open(urlinput, "r")

# Output: a CSV with one row per URL per summarization algorithm.
outputcsv = os.path.join(os.path.dirname(__file__), input('Enter a filename (minus file extension): ') + '.csv')
f = csv.writer(open(outputcsv, "w+", newline="\n", encoding="utf-8"))
f.writerow(["URL", "Copy", "Summarization Algorithm"])

for line in urls:
    line = line.strip()  # drop the trailing newline so the fetch gets a clean URL

    stemmer = Stemmer(LANGUAGE)
    lsaSummarizer = Lsa(stemmer)
    lsaSummarizer.stop_words = get_stop_words(LANGUAGE)
    luhnSummarizer = Luhn(stemmer)
    luhnSummarizer.stop_words = get_stop_words(LANGUAGE)
    # edmundsonSummarizer.bonus_words = get_bonus_words
    lexrankSummarizer = LexRank(stemmer)
    lexrankSummarizer.stop_words = get_stop_words(LANGUAGE)
    textrankSummarizer = TxtRank(stemmer)
    textrankSummarizer.stop_words = get_stop_words(LANGUAGE)
    sumbasicSummarizer = SumBasic(stemmer)
    sumbasicSummarizer.stop_words = get_stop_words(LANGUAGE)
    klSummarizer = KL(stemmer)
    klSummarizer.stop_words = get_stop_words(LANGUAGE)

    # Fetch and parse the page once, then run each summarizer over the same document.
    parser = HtmlParser.from_url(line, Tokenizer(LANGUAGE))

    for sentence in lsaSummarizer(parser.document, SENTENCES_COUNT):
        print(sentence)
        f.writerow([line, sentence, "LSA"])
    print("Summarizing URL via LSA: " + line)

    for sentence in luhnSummarizer(parser.document, SENTENCES_COUNT):
        print(sentence)
        f.writerow([line, sentence, "Luhn"])
    print("Summarizing URL via Luhn: " + line)

    for sentence in lexrankSummarizer(parser.document, SENTENCES_COUNT):
        print(sentence)
        f.writerow([line, sentence, "LexRank"])
    print("Summarizing URL via LexRank: " + line)

    for sentence in textrankSummarizer(parser.document, SENTENCES_COUNT):
        print(sentence)
        f.writerow([line, sentence, "TextRank"])
    print("Summarizing URL via TextRank: " + line)

    for sentence in sumbasicSummarizer(parser.document, SENTENCES_COUNT):
        print(sentence)
        f.writerow([line, sentence, "SumBasic"])
    print("Summarizing URL via SumBasic: " + line)

    for sentence in klSummarizer(parser.document, SENTENCES_COUNT):
        print(sentence)
        f.writerow([line, sentence, "KL-Sum"])
    print("Summarizing URL via KL-Sum: " + line)

urls.close()
print("Writing to " + outputcsv + " complete.")
Is there anything I am missing? I get this error. Thanks in advance.
Traceback (most recent call last):
File "sample.py", line 49, in
File "/Users/venrine/Documents/htmls/env/lib/python3.5/site-packages/sumy/parsers/html.py", line 34, in from_url
data = fetch_url(url)
File "/Users/venrine/Documents/htmls/env/lib/python3.5/site-packages/sumy/utils.py", line 23, in fetch_url
with closing(requests.get(url, headers=_HTTP_HEADERS)) as response:
File "/Users/venrine/Documents/htmls/env/lib/python3.5/site-packages/requests/api.py", line 75, in get
return request('get', url, params=params, **kwargs)
File "/Users/venrine/Documents/htmls/env/lib/python3.5/site-packages/requests/api.py", line 60, in request
return session.request(method=method, url=url, **kwargs)
File "/Users/venrine/Documents/htmls/env/lib/python3.5/site-packages/requests/sessions.py", line 533, in request
resp = self.send(prep, **send_kwargs)
File "/Users/venrine/Documents/htmls/env/lib/python3.5/site-packages/requests/sessions.py", line 640, in send
adapter = self.get_adapter(url=request.url)
File "/Users/venrine/Documents/htmls/env/lib/python3.5/site-packages/requests/sessions.py", line 731, in get_adapter
raise InvalidSchema("No connection adapters were found for '%s'" % url)
requests.exceptions.InvalidSchema: No connection adapters were found for
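The InvalidSchema error comes from requests, not sumy: it is raised when the string passed to requests.get() does not begin with a scheme it recognizes. In this script that usually means a line in the input file is blank or is a bare domain (e.g. example.com) rather than a full http:// or https:// URL. A minimal sketch of a defensive read loop, assuming a placeholder input file named urls.txt (this is not the gist author's exact code):

from sumy.parsers.html import HtmlParser
from sumy.nlp.tokenizers import Tokenizer

LANGUAGE = "english"

with open("urls.txt") as urls:                         # placeholder filename
    for raw in urls:
        url = raw.strip()                              # drop whitespace and newlines
        if not url:                                    # skip blank lines entirely
            continue
        if not url.startswith(("http://", "https://")):
            url = "http://" + url                      # bare domains trigger InvalidSchema
        parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))
        # ... run the summarizers on parser.document as in the gist above

Printing repr(line) just before the HtmlParser.from_url call is the quickest way to see exactly which input line requests is rejecting.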