Created
December 22, 2017 20:57
-
-
Save melodykramer/5178c6fde521621b1b981575546b845c to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
## Scrape one Nieman Lab prediction article and save it as a one-row CSV.
## NOTE: this script targets Python 2 (urllib2, reload, setdefaultencoding).

## libraries
import csv
import sys
import urllib2
from contextlib import closing

from bs4 import BeautifulSoup

## NOTE(review): Python 2-only hack that changes the interpreter-wide default
## encoding; presumably here so the unicode text returned by BeautifulSoup can
## be handed to the csv writer without explicit .encode() calls -- confirm
## before removing.
reload(sys)
sys.setdefaultencoding('utf8')

## the url we want to scrape
nieman_page = 'http://www.niemanlab.org/2017/12/the-rise-of-skeptical-reading/'


def _last_text(soup, css_class):
    """Return the text of the last element carrying *css_class*.

    The last match is used so the result is the same value the previous
    loop-and-rebind code ended up with. Returns an empty string when the
    page has no such element, instead of leaving the name undefined.
    """
    matches = soup.find_all(attrs={'class': css_class})
    return matches[-1].text if matches else ''


## query the page and keep its html; the connection is closed once read
with closing(urllib2.urlopen(nieman_page)) as response:
    page = response.read()

## parse the html with BeautifulSoup
soup = BeautifulSoup(page, 'html.parser')

## pull out each piece of the article by its css class
headline = _last_text(soup, 'simple-headline')
blurb = _last_text(soup, 'simple-post-deck')
author = _last_text(soup, 'predix-byline')
## story text + author bio
prediction = _last_text(soup, 'predix-storybody')

## write the header row and the single data row to csv; "wb" because the
## Python 2 csv module expects a binary-mode file, and the with-block makes
## sure the file is flushed and closed
with open("Predictions.csv", "wb") as csv_file:
    writer = csv.writer(csv_file)
    writer.writerow(["Headline", "Blurb", "Byline", "Text"])
    writer.writerow([headline, blurb, author, prediction])
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment