Last active
April 8, 2018 15:01
-
-
Save Buffer0x7cd/73b86672fc0293642f14967022b90331 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
''' Create a virtualenv with python3 or run with the default OS installation (adjust the shebang based on your environment).
Install the script dependencies with pip (requests, beautifulsoup4, lxml).
Make the script executable with chmod +x crawler.py.
Run the script with ./crawler.py.
'''
#!/usr/bin/env python3
import os
import shutil

import requests
from bs4 import BeautifulSoup
# Module-level accumulator: build_screencastlist() appends download URLs here
# and main() iterates over it.
screencast_list = []

ROOT_DOMAIN = "https://www.destroyallsoftware.com"
SCREENCAST_DOMAIN = "https://www.destroyallsoftware.com/screencasts/catalog/"
# NOTE(review): "resoluation" looks like a misspelling of "resolution". It is
# left unchanged because the query parameter the server expects cannot be
# confirmed from this file -- verify against the site before changing it.
DOWNLOAD_SUFFIX = "/download?resoluation=1080p"
TARGET_HOST = "https://www.destroyallsoftware.com/screencasts/catalog"

# Slice bounds that cut the screencast name out of a download URL of the form
# SCREENCAST_DOMAIN + <name> + DOWNLOAD_SUFFIX. They are derived from the
# strings themselves so they cannot drift if either string is edited.
FILENAME_START_INDEX = len(SCREENCAST_DOMAIN)  # 55
FILENAME_END_INDEX = -len(DOWNLOAD_SUFFIX)  # -27
FILE_SUFFIX = ".mp4"

# A candidate URL is kept only when it is strictly longer than this value.
# 81 is exactly the length of ROOT_DOMAIN + "/screencasts/catalog" +
# DOWNLOAD_SUFFIX, so the catalog page's own link is rejected.
VALID_LINK_LENGTH = len(SCREENCAST_DOMAIN) + len(DOWNLOAD_SUFFIX) - 1  # 81
def build_screencastlist(html_doc):
    """Collect screencast download URLs from the catalog page HTML.

    Each ``<a>`` element with a non-empty ``href`` is turned into
    ``ROOT_DOMAIN + href + DOWNLOAD_SUFFIX``. A URL is appended to the
    module-level ``screencast_list`` when it starts with ``SCREENCAST_DOMAIN``
    and is longer than ``VALID_LINK_LENGTH``.

    Args:
        html_doc: HTML text to parse (parsed with the "lxml" parser).

    Returns:
        None. Results are accumulated in ``screencast_list``.
    """
    soup = BeautifulSoup(html_doc, "lxml")
    for anchor in soup.find_all('a'):
        href = anchor.get('href')
        if not href:
            # Anchors without an href (or with an empty one) cannot be links.
            continue
        download_url = ROOT_DOMAIN + href + DOWNLOAD_SUFFIX
        # startswith() replaces a hard-coded [:55] slice that silently
        # depended on the length of SCREENCAST_DOMAIN.
        if (download_url.startswith(SCREENCAST_DOMAIN)
                and len(download_url) > VALID_LINK_LENGTH):
            screencast_list.append(download_url)
def get_screencast(url):
    """Download one screencast and write it to a local file.

    The file name is the slice of *url* between ``FILENAME_START_INDEX`` and
    ``FILENAME_END_INDEX`` with ``FILE_SUFFIX`` appended. The file is opened
    with that name as given, so it is relative to the current working
    directory.

    Args:
        url: Download URL, as produced by ``build_screencastlist``.

    Returns:
        The file name written when the server answers with HTTP 200.
        Otherwise an error line is printed and ``None`` is returned; no file
        is created in that case.
    """
    screencast_name = url[FILENAME_START_INDEX:FILENAME_END_INDEX] + FILE_SUFFIX
    # With stream=True the connection stays open until the body is consumed
    # or the response is closed; the context manager guarantees the release,
    # including on the error path.
    with requests.get(url, stream=True) as response:
        if response.status_code != 200:
            print("Error during downloading file {} return code: {}".format(url, response.status_code))
            return None
        # Have the raw stream undo any transfer encoding (e.g. gzip) so the
        # bytes written to disk are the decoded payload.
        response.raw.decode_content = True
        with open(screencast_name, 'wb') as f:
            shutil.copyfileobj(response.raw, f)
    return screencast_name
def main():
    """Fetch the catalog page and download every screencast linked from it.

    Requests ``TARGET_HOST``; on a non-200 answer prints an error message and
    the status code, then returns. Otherwise builds ``screencast_list`` from
    the page HTML and downloads each URL in turn, printing the URL first and
    a completion line when ``get_screencast`` returns a file name.
    """
    req = requests.get(TARGET_HOST)
    if req.status_code != 200:
        print("Some error occured during establishing initial connection")
        print(req.status_code)
        return

    print("Connection Succeeded")
    build_screencastlist(req.text)
    for link in screencast_list:
        print(link)
        screencast = get_screencast(link)
        # get_screencast returns None on failure and has already printed the
        # error, so only successes are reported here.
        if screencast:
            print("Completed downloading screencasr: {}".format(screencast))
# Run the crawler only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment