Skip to content

Instantly share code, notes, and snippets.

@Denniskamau
Last active April 14, 2020 14:41
Show Gist options
  • Save Denniskamau/23e4e12b52c87c9e8b9924658c11225a to your computer and use it in GitHub Desktop.
Save Denniskamau/23e4e12b52c87c9e8b9924658c11225a to your computer and use it in GitHub Desktop.
Getting started with web scraping for image data — example
import os
import urllib.request

from bs4 import BeautifulSoup
class Scrapper():
    """Scrapes book cover images from textbookcentre.com product listings.

    Workflow: initializeScrapping(url) fetches the listing page, finds the
    product grid, then startScrapping() visits each book's detail page and
    downloads its cover image into a local "books" directory.
    """

    def initializeScrapping(self, url):
        """Fetch the listing page at *url* and hand its book entries to
        startScrapping.

        url -- full URL of a textbookcentre.com category listing page
        """
        # Set the url of the page you want to scrape for data
        urlpage = url
        # Using urllib open the page
        page = urllib.request.urlopen(urlpage)
        # Parse the webpage
        soup = BeautifulSoup(page, 'html.parser')
        # Get the page data from the div with a class of product list view
        product_list = soup.find('div', class_="prod-list-view")
        # Traverse the DOM down to the individual <li> product entries
        items = product_list.find('section')
        book_list = items.find('ol', class_="product-list row")
        book_data = book_list.findAll('li', class_='col-xs-6 col-md-3')
        number_of_books = len(book_data)
        self.startScrapping(number_of_books, book_data)

    def startScrapping(self, items, book_data):
        """Visit each book's detail page and save its cover image.

        items     -- total number of books (used to detect completion)
        book_data -- iterable of <li> tags from the listing page
        Returns the final counter once all books have been processed.
        """
        # Get the current working directory
        current_directory = os.getcwd()
        # Create a folder named books to store the scraped images
        path = os.path.join(current_directory, r"books")
        self.createDirectory(path)
        counter = 1
        # Loop through the product list
        for book in book_data:
            try:
                product = book.find('div', class_="product")
                url = product.find('a')
                full_url = url.get('href')
                page = urllib.request.urlopen('https://textbookcentre.com' + full_url)
                soup = BeautifulSoup(page, 'html.parser')
                data = soup.find('article', class_='product_page')
                image = data.find('div', id='product-images')
                image = image.find('a')
                image_url = image.get('href')
                # Get the title of the book so each image is saved under it
                title_data = data.find('div', class_='col-sm-6 product_main')
                title = title_data.find('h1')
                # BUG FIX: the original built the target as " {}/{}.jpg",
                # whose leading space made an invalid relative path (and the
                # computed fullpath was never used). Build the destination
                # with os.path.join instead.
                fullpath = os.path.join(path, '{}.jpg'.format(title.text))
                # Save the book cover
                urllib.request.urlretrieve('https://textbookcentre.com' + image_url, fullpath)
                if counter == items:
                    print('INFO: finished')
                    return counter
                else:
                    print('INFO: saved {} {}'.format(title.text, counter))
                    counter += 1
            except Exception as e:
                # Best effort: skip books whose pages don't match the
                # expected layout, but report the failure.
                print('ERROR:', e)

    def createDirectory(self, path):
        """Create directory *path* to store the images; report the outcome.

        Failure (e.g. the directory already exists) is reported, not raised,
        so a re-run can proceed with the existing folder.
        """
        try:
            os.mkdir(path)
        except Exception as e:
            print("Creation of the directory failed", e)
        else:
            print("Successfully created the directory %s " % path)
def main():
    """Entry point: scrape cover images for the primary-school category."""
    target = "https://textbookcentre.com/catalogue/category/text-books/primary-school/"
    Scrapper().initializeScrapping(target)


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment