Getting started with web scraping for image data: an example
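Before the full script, here is a minimal sketch of the pattern it relies on: open a page with urllib, parse the HTML with BeautifulSoup, and walk the parsed tree to pull out the elements you care about. The URL below is a placeholder for illustration, not the target site used in the gist.

import urllib.request
from bs4 import BeautifulSoup

# Placeholder URL: substitute a page you are allowed to scrape.
page = urllib.request.urlopen("https://example.com/")
soup = BeautifulSoup(page, "html.parser")

# List every image source on the page; real scrapers normally narrow this
# down with class or id selectors, as the full script below does.
for img in soup.find_all("img"):
    print(img.get("src"))

The gist below applies the same pattern, but drills down through the specific div/ol/li structure of the textbookcentre.com catalogue and downloads each book's cover image.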
import os
import urllib.request

from bs4 import BeautifulSoup


class Scrapper():

    def initializeScrapping(self, url):
        # Set the url of the page you want to scrape for data
        urlpage = url
        # Open the page using urllib
        page = urllib.request.urlopen(urlpage)
        # Parse the webpage
        soup = BeautifulSoup(page, 'html.parser')
        # Get the page data from the div with a class of "prod-list-view"
        product_list = soup.find('div', class_="prod-list-view")
        # Traverse the DOM down to the individual product <li> elements
        items = product_list.find('section')
        book_list = items.find('ol', class_="product-list row")
        book_data = book_list.find_all('li', class_='col-xs-6 col-md-3')
        number_of_books = len(book_data)
        self.startScrapping(number_of_books, book_data)

    def startScrapping(self, items, book_data):
        # Get the current working directory
        current_directory = os.getcwd()
        # Create a folder named "books" to store the scraped images
        path = os.path.join(current_directory, "books")
        self.createDirectory(path)
        counter = 1
        # Loop through the product list
        for book in book_data:
            try:
                product = book.find('div', class_="product")
                url = product.find('a')
                full_url = url.get('href')
                page = urllib.request.urlopen('https://textbookcentre.com' + full_url)
                soup = BeautifulSoup(page, 'html.parser')
                data = soup.find('article', class_='product_page')
                image = data.find('div', id='product-images')
                image = image.find('a')
                image_url = image.get('href')
                # Get the title of the book so that each image is saved under its title
                title_data = data.find('div', class_='col-sm-6 product_main')
                title = title_data.find('h1')
                fullpath = os.path.join(path, title.text + ".jpg")
                # Save the book cover image
                urllib.request.urlretrieve('https://textbookcentre.com' + image_url, fullpath)
                if counter == items:
                    print('INFO: finished')
                    return counter
                else:
                    print('INFO: saved {} {}'.format(title.text, counter))
                    counter += 1
            except Exception as e:
                print('ERROR:', e)

    def createDirectory(self, path):
        # Create the directory that will hold the images
        try:
            os.mkdir(path)
        except Exception as e:
            print("Creation of the directory failed", e)
        else:
            print("Successfully created the directory %s" % path)


def main():
    url = "https://textbookcentre.com/catalogue/category/text-books/primary-school/"
    scrapper = Scrapper()
    scrapper.initializeScrapping(url)


if __name__ == "__main__":
    main()
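Two caveats worth noting as possible improvements (these are suggestions, not part of the original gist): the raw book title is used as the file name, so a title containing a character such as '/' will make urlretrieve fail, and the loop fires requests back to back with no pause. A small sanitising helper addresses the first; safe_filename below is a hypothetical name, not something the script defines.

import re

def safe_filename(title):
    # Keep letters, digits, spaces, hyphens and underscores; replace anything
    # else (e.g. '/' or ':') so the title is safe to use as a file name.
    return re.sub(r'[^\w\s-]', '_', title).strip()

# Example: "Maths/Science: Grade 4" becomes "Maths_Science_ Grade 4"
print(safe_filename("Maths/Science: Grade 4"))

Inside startScrapping, the save step could then build the path as os.path.join(path, safe_filename(title.text) + '.jpg'), and a time.sleep(1) between iterations keeps the scraper polite to the server.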