Scraping
import requests  # pip install requests
from bs4 import BeautifulSoup as bs  # pip install beautifulsoup4
# Load our first page (replace "URL" with the page you want to fetch)
r = requests.get("URL")
# Convert to a BeautifulSoup object (html.parser is Python's built-in parser)
soup = bs(r.content, "html.parser")
# Print our HTML
print(soup.prettify())
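# Optional sanity check (a sketch, not in the original notes): requests can
# flag failed responses so we don't parse an error page by mistake
r.raise_for_status()  # raises for 4xx/5xx status codes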
# Start scraping
# find and find_all
first_header = soup.find("h2")
first_header
headers = soup.find_all("h2")  # Returns a list of every match
print(headers)
#Pass in a list of elements to look for
first_header = soup.find(["h1","h2"])
headers = soup.find_all(["h1","h2"])
print(headers)
# You can pass in attributes to the find/find_all function
paragraph = soup.find_all("p", attrs={"id": "paragraph-id"})
print(paragraph)
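# Sketch (the class name here is hypothetical): "class" is a Python keyword,
# so BeautifulSoup uses the class_ argument to filter by CSS class
paragraphs = soup.find_all("p", class_="intro")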
# You can nest find/find_all calls
body = soup.find('body')
div = body.find('div')
header = div.find('h1')
header
# We can search for specific strings in our find/find_all calls
import re
paragraphs = soup.find_all("p",string=re.compile("Some"))
paragraphs
headers = soup.find_all("h2",string=re.compile("(H|h)eader"))
headers
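# Sketch: find_all also accepts a limit on how many matches to return
first_two_headers = soup.find_all("h2", limit=2)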
#CSS selector
content = soup.select("div p")
content
paragraphs = soup.select("h2 ~ p") #p conained in h2
paragraphs
bold_text = soup.select("p#paragraph-id b")
bold_text
paragraphs = soup.select("body < p")
paragraphs
# Grab elements by a specific attribute value
soup.select("[align=middle]")
#Get different properties of the HTML
header = soup.find("h2")
header.string
# If there are multiple child elements, use get_text
div = soup.find("div")
print(div.prettify())
print(div.get_text())
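# Sketch: get_text accepts a separator and a strip flag for cleaner output
print(div.get_text(separator=" ", strip=True))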
# Get a specific property from an element
link = soup.find("a")
link['href']
paragraphs = soup.select("p#paragraph-id")
paragraphs[0]['id']
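# Sketch: .get() avoids a KeyError when an element lacks the attribute
for a in soup.find_all("a"):
    print(a.get("href"))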
#Path Syntax
soup.body.div.h1.string
print(soup.body.prettify())
soup.body.find("div").find_next_siblings()
from urllib.request import urlopen
from urllib.parse import urljoin  # urljoin lives in urllib.parse, not urllib.request
import re

def download_page(url):
    return urlopen(url).read().decode('utf-8')

def extract_image_locations(page):
    img_regex = re.compile(r'<img[^>]+src=["\'](.*?)["\']', re.IGNORECASE)
    return img_regex.findall(page)

if __name__ == '__main__':
    target_url = 'http://www.apress.com/'
    apress = download_page(target_url)
    image_locations = extract_image_locations(apress)
    for src in image_locations:
        print(urljoin(target_url, src))
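# Sketch: the same extraction without a regex, reusing the bs import from the
# top of these notes (a parser is generally more robust than regex on HTML)
# imgs = [img.get('src') for img in bs(apress, 'html.parser').find_all('img')]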
from urllib import robotparser

robot_parser = robotparser.RobotFileParser()

def prepare(robots_txt_url):
    robot_parser.set_url(robots_txt_url)
    robot_parser.read()

def is_allowed(target_url, user_agent='*'):
    return robot_parser.can_fetch(user_agent, target_url)

if __name__ == '__main__':
    prepare("https://www.ladyposhperu.com/robots.txt")
    print(is_allowed('https://www.ladyposhperu.com/jeans?page=2'))
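    # Sketch combining both snippets: only fetch when robots.txt allows it
    # (download_page comes from the earlier image-extraction example)
    target = 'https://www.ladyposhperu.com/jeans?page=2'
    if is_allowed(target):
        page = download_page(target)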