I always forget to spoof my UA string and pull in cookies from a proper browser when performing various CI/CD tasks, so here's a note to self.
Maybe it'll be of use to others.
Python packages:
- beautifulsoup4
- requests
- fake-useragent
- browser-cookie3
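
All of these are on PyPI under the same names:

```
pip install beautifulsoup4 requests fake-useragent browser-cookie3
```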
```python
# Extremely quick and dirty binary downloader.
# A better method would be to use a requests Session (sketched below).
# There is no error checking outside of the status code,
# there is no caching check, and clobbering is implied.
import requests

def download_jpg(href, headers, cookiejar, chunk_size=8192):
    # Quick and dirty: name the file after the last path segment of the URL
    dest = href.rsplit('/', 1)[-1]
    try:
        with requests.get(href, headers=headers, cookies=cookiejar, stream=True) as r:
            if r.status_code >= 400:
                print(f"Error: {r.status_code}")
                return
            with open(dest, 'wb') as f:
                for chunk in r.iter_content(chunk_size=chunk_size):
                    f.write(chunk)
    except requests.exceptions.ConnectionError:
        print("Unable to resolve or connect")
    except requests.exceptions.InvalidSchema:
        print("Invalid schema")
```
```python
import browser_cookie3
import requests
from fake_useragent import UserAgent
from bs4 import BeautifulSoup

url = 'https://example.com/'  # whatever page you're after

# Spoof our UA string
ua = UserAgent()
user_agent = ua.firefox
headers = {
    'User-Agent': user_agent,
}

# Extract our cookies from the local Firefox profile
cookiejar = browser_cookie3.firefox(domain_name='example.com')

# The usual
resp = requests.get(url, cookies=cookiejar, headers=headers)
html = resp.text
soup = BeautifulSoup(html, features="html.parser")

# And the rest
```
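
For example, "the rest" might be pulling every .jpg link off the page and feeding it to the downloader above. This sketch assumes the images are linked via plain `<a href>` tags, which obviously depends on the page:

```python
from urllib.parse import urljoin

for a in soup.find_all('a', href=True):
    if a['href'].lower().endswith('.jpg'):
        # Resolve relative links against the page URL before downloading
        download_jpg(urljoin(url, a['href']), headers, cookiejar)
```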