Last active
September 16, 2021 18:19
-
-
Save adraguidev/c6388ac03ceb74ffefda2a5365507137 to your computer and use it in GitHub Desktop.
Scraping
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import requests  # pip install requests
from bs4 import BeautifulSoup as bs  # pip install beautifulsoup4
import re

# Load our first page.  NOTE: "URL" is a placeholder -- substitute a real address.
r = requests.get("URL")  # fix: the module is `requests`, not `request`

# Convert the response body to a BeautifulSoup object.
# Naming "html.parser" explicitly avoids bs4's no-parser-specified warning.
soup = bs(r.content, "html.parser")

# Print the parsed HTML, pretty-indented.
print(soup.prettify())

# --- find and find_all ------------------------------------------------------

first_header = soup.find("h2")   # first <h2> element, or None if absent
print(first_header)

headers = soup.find_all("h2")    # list of every <h2> element
print(headers)

# Passing a list of tag names matches any of them.
first_header = soup.find(["h1", "h2"])
headers = soup.find_all(["h1", "h2"])
print(headers)

# You can pass attributes to the find/find_all functions.
paragraph = soup.find_all("p", attrs={"id": "paragraph-id"})
print(paragraph)

# find/find_all calls can be nested.
body = soup.find('body')
div = body.find('div')
header = div.find('h1')
print(header)

# We can search for specific strings in our find/find_all calls.
paragraphs = soup.find_all("p", string=re.compile("Some"))
print(paragraphs)
headers = soup.find_all("h2", string=re.compile("(H|h)eader"))
print(headers)

# --- CSS selectors ----------------------------------------------------------

content = soup.select("div p")       # <p> elements contained in a <div>
print(content)

paragraphs = soup.select("h2 ~ p")   # <p> siblings that follow an <h2>
print(paragraphs)

bold_text = soup.select("p#paragraph-id b")  # <b> inside the <p> with that id
print(bold_text)

paragraphs = soup.select("body > p")  # fix: CSS child combinator is '>', not '<'
print(paragraphs)

# Grab elements with a specific attribute value.
print(soup.select("[align=middle]"))

# --- get different properties of the HTML -----------------------------------

header = soup.find("h2")
print(header.string)  # direct text, or None when there are multiple children

# If there are multiple child elements, use get_text().
div = soup.find("div")
print(div.prettify())
print(div.get_text())

# Get a specific attribute from an element.
link = soup.find("a")
print(link['href'])  # fix: was `link = ['href']`, which rebound `link` to a list

paragraphs = soup.select("p#paragraph-id")
print(paragraphs[0]['id'])  # fix: was `paragraph[0]` -- wrong variable name

# --- path syntax ------------------------------------------------------------

print(soup.body.div.h1.string)
print(soup.body.prettify())
print(soup.body.find("div").find_next_siblings())
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from urllib.request import urlopen, urljoin | |
import re | |
def download_page(url):
    """Fetch *url* and return the response body decoded as UTF-8.

    The response is closed via a context manager; the original version
    leaked the connection until garbage collection.
    """
    with urlopen(url) as response:
        return response.read().decode('utf-8')
def extract_image_locations(page):
    """Return the src attribute value of every <img> tag found in *page*."""
    pattern = '<img[^>]+src=["\'](.*?)["\']'
    return re.findall(pattern, page, re.IGNORECASE)
if __name__ == '__main__':
    # Demo: print the absolute URL of every image on the Apress homepage.
    base = 'http://www.apress.com/'
    page_html = download_page(base)
    for image_src in extract_image_locations(page_html):
        print(urljoin(base, image_src))
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from urllib.request import urlopen, urljoin | |
import re | |
def download_page(url):
    """Fetch *url* and return the response body decoded as UTF-8.

    The response is closed via a context manager; the original version
    leaked the connection until garbage collection.
    """
    with urlopen(url) as response:
        return response.read().decode('utf-8')
def extract_links(page):
    """Return the href attribute value of every anchor tag found in *page*."""
    pattern = '<a[^>]+href=["\'](.*?)["\']'
    return re.findall(pattern, page, re.IGNORECASE)
if __name__ == '__main__':
    # Demo: print the absolute URL of every link on the jeans listing page.
    base = 'https://www.ladyposhperu.com/jeans?page=100'
    page_html = download_page(base)
    for href in extract_links(page_html):
        print(urljoin(base, href))
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from urllib import robotparser

# Shared parser instance used by prepare() and is_allowed().
robot_parser = robotparser.RobotFileParser()


def prepare(robots_txt_url):
    """Point the shared parser at *robots_txt_url*, then download and parse it."""
    robot_parser.set_url(robots_txt_url)
    robot_parser.read()


def is_allowed(target_url, user_agent='*'):
    """Return True when robots.txt permits *user_agent* to fetch *target_url*."""
    return robot_parser.can_fetch(user_agent, target_url)


if __name__ == '__main__':
    prepare("https://www.ladyposhperu.com/robots.txt")
    print(is_allowed('https://www.ladyposhperu.com/jeans?page=2'))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment