An example of web scraping the URLs of relevant news articles in order to build a dataset for NLP.
from bs4 import BeautifulSoup
import requests
from time import sleep

# select the Guardian's Military news section and obtain a set of URLs
crawl_url = 'https://www.theguardian.com/uk/military?page='

# form a set to store the unique URLs of all articles
guardian_urls = set()

# scrape the first 10 pages of the section
for page_no in range(1, 11):
    page = requests.get(crawl_url + str(page_no))
    soup = BeautifulSoup(page.text, 'html.parser')

    # search only for links whose data-link-name attribute is 'article'
    news_list = soup.find_all('a', attrs={'data-link-name': 'article'})

    # add every unique article URL to the set for our dataset
    for link in news_list:
        news_url = link.get('href')
        guardian_urls.add(news_url)

    # delay between requests to avoid contravening the site's robots.txt
    sleep(1.0)

print("{0} article URLs were obtained from {1}.".format(len(guardian_urls),
                                                        crawl_url))