An example of web scraping the URLs of relevant news articles in order to build a dataset for NLP.
from bs4 import BeautifulSoup
import requests
from time import sleep

# select the Guardian's Military news section and obtain a set of URLs
crawl_url = 'https://www.theguardian.com/uk/military?page='

# form a set to store the unique URLs of all articles
guardian_urls = set()

# scrape the first 10 pages of the section
for page_no in range(1, 11):
    page = requests.get(crawl_url + str(page_no))
    soup = BeautifulSoup(page.text, 'html.parser')

    # search only for links whose data-link-name attribute is 'article'
    news_list = soup.find_all('a', attrs={'data-link-name': 'article'})

    # add every unique article URL to the set for our dataset
    for link in news_list:
        news_url = link.get('href')
        guardian_urls.add(news_url)

    # delay between requests to avoid contravening the site's robots.txt
    sleep(1.0)

print("{0} article URLs were obtained from {1}.".format(len(guardian_urls),
                                                        crawl_url))