I always forget to spoof my UA string and pull in cookies from a real browser when performing various CI/CD tasks, so I'm writing it down here. Maybe it'll be of use to others.
Python packages:
- beautifulsoup4
- requests
- fake-useragent
- browser-cookie3
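
All four are on PyPI, so something like this should pull them in:

```
pip install beautifulsoup4 requests fake-useragent browser-cookie3
```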
```python
import requests


# Extremely quick and dirty binary downloader.
# A better method would be to use a requests Session (sketched below).
# There is no error checking beyond the status code.
# There is no caching check.
# Clobbering is implied.
def download_jpg(href, dest, headers, cookiejar, chunk_size=8192):
    try:
        with requests.get(href, headers=headers, cookies=cookiejar, stream=True) as r:
            if r.status_code >= 400:
                print(f"Error: {r.status_code}")
                return
            with open(dest, 'wb') as f:
                for chunk in r.iter_content(chunk_size=chunk_size):
                    f.write(chunk)
    except requests.exceptions.ConnectionError:
        print("Unable to resolve")
    except requests.exceptions.InvalidSchema:
        print("Invalid schema")
```
```python
import browser_cookie3
import requests
from fake_useragent import UserAgent
from bs4 import BeautifulSoup

# Spoof our UA string
ua = UserAgent()
user_agent = ua.firefox
headers = {
    'User-Agent': user_agent,
}

# Extract our cookies from the local Firefox profile
cookiejar = browser_cookie3.firefox(domain_name='example.com')

# The usual
url = 'https://example.com/'  # placeholder; point this at the real page
resp = requests.get(url, cookies=cookiejar, headers=headers)
html = resp.text
soup = BeautifulSoup(html, features="html.parser")

# And the rest
```
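
"The rest" depends on the page, but it usually amounts to scraping out the image links and feeding them to the downloader. A hedged sketch, assuming the images sit behind plain `<a href="...jpg">` links (the selector and filename handling are guesses about the target markup, not part of the original):

```python
import os
from urllib.parse import urljoin

# Collect absolute .jpg links and download each next to the script.
for a in soup.find_all('a', href=True):
    href = urljoin(url, a['href'])
    if href.lower().endswith('.jpg'):
        download_jpg(href, os.path.basename(href), headers, cookiejar)
```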