Scraping
import requests  # pip install requests
from bs4 import BeautifulSoup as bs  # pip install beautifulsoup4
# Load our first page (replace "URL" with the page you want to fetch)
r = requests.get("URL")
# Convert to a BeautifulSoup object (html.parser is Python's built-in parser)
soup = bs(r.content, "html.parser")
# Print our HTML
print(soup.prettify())
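# Optional sanity check (a sketch, not in the original notes): requests can
# flag failed responses so we don't parse an error page by mistake
r.raise_for_status()  # raises for 4xx/5xx status codes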
# Start scraping
# find and find_all
first_header = soup.find("h2")
first_header
headers = soup.find_all("h2")  # Returns a list of every match
print(headers)
#Pass in a list of elements to look for
first_header = soup.find(["h1","h2"])
headers = soup.find_all(["h1","h2"])
print(headers)
# You can pass in attributes to the find/find_all function
paragraph = soup.find_all("p", attrs={"id": "paragraph-id"})
print(paragraph)
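# Sketch (the class name here is hypothetical): "class" is a Python keyword,
# so BeautifulSoup uses the class_ argument to filter by CSS class
paragraphs = soup.find_all("p", class_="intro")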
# You can nest find/find_all calls
body = soup.find('body')
div = body.find('div')
header = div.find('h1')
header
# We can search for specific strings in our find/find_all calls
import re
paragraphs = soup.find_all("p",string=re.compile("Some"))
paragraphs
headers = soup.find_all("h2",string=re.compile("(H|h)eader"))
headers
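# Sketch: find_all also accepts a limit on how many matches to return
first_two_headers = soup.find_all("h2", limit=2)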
#CSS selector
content = soup.select("div p")
content
paragraphs = soup.select("h2 ~ p") #p conained in h2
paragraphs
bold_text = soup.select("p#paragraph-id b")
bold_text
paragraphs = soup.select("body < p")
paragraphs
# Grab elements by a specific attribute value
soup.select("[align=middle]")
#Get different properties of the HTML
header = soup.find("h2")
header.string
# If there are multiple child elements, use get_text
div = soup.find("div")
print(div.prettify())
print(div.get_text())
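# Sketch: get_text accepts a separator and a strip flag for cleaner output
print(div.get_text(separator=" ", strip=True))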
# Get a specific property from an element
link = soup.find("a")
link['href']
paragraphs = soup.select("p#paragraph-id")
paragraphs[0]['id']
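# Sketch: .get() avoids a KeyError when an element lacks the attribute
for a in soup.find_all("a"):
    print(a.get("href"))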
#Path Syntax
soup.body.div.h1.string
print(soup.body.prettify())
soup.body.find("div").find_next_siblings()
from urllib.request import urlopen
from urllib.parse import urljoin  # urljoin lives in urllib.parse, not urllib.request
import re

def download_page(url):
    return urlopen(url).read().decode('utf-8')

def extract_image_locations(page):
    img_regex = re.compile(r'<img[^>]+src=["\'](.*?)["\']', re.IGNORECASE)
    return img_regex.findall(page)

if __name__ == '__main__':
    target_url = 'http://www.apress.com/'
    apress = download_page(target_url)
    image_locations = extract_image_locations(apress)
    for src in image_locations:
        print(urljoin(target_url, src))
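# Sketch: the same extraction without a regex, reusing the bs import from the
# top of these notes (a parser is generally more robust than regex on HTML)
# imgs = [img.get('src') for img in bs(apress, 'html.parser').find_all('img')]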
from urllib import robotparser

robot_parser = robotparser.RobotFileParser()

def prepare(robots_txt_url):
    robot_parser.set_url(robots_txt_url)
    robot_parser.read()

def is_allowed(target_url, user_agent='*'):
    return robot_parser.can_fetch(user_agent, target_url)

if __name__ == '__main__':
    prepare("https://www.ladyposhperu.com/robots.txt")
    print(is_allowed('https://www.ladyposhperu.com/jeans?page=2'))
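    # Sketch combining both snippets: only fetch when robots.txt allows it
    # (download_page comes from the earlier image-extraction example)
    target = 'https://www.ladyposhperu.com/jeans?page=2'
    if is_allowed(target):
        page = download_page(target)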