Skip to content

Instantly share code, notes, and snippets.

@bindiego
Created June 12, 2025 07:33
Show Gist options
  • Save bindiego/6dc95b570295712e0b493d6981690e2b to your computer and use it in GitHub Desktop.
Save bindiego/6dc95b570295712e0b493d6981690e2b to your computer and use it in GitHub Desktop.
Multi-threaded download script for YouTube videos and image resources
#!/usr/bin/env python3
import csv
import os
import requests
import time
import threading
import yt_dlp # Replaced pytube with yt-dlp
from urllib.error import HTTPError # Kept for download_image, though yt-dlp might also raise it
from concurrent.futures import ThreadPoolExecutor
# --- Script configuration -------------------------------------------------
# Worker count handed to ThreadPoolExecutor(max_workers=...) in main().
# With the value 1 the rows are processed one at a time.
MAX_THREADS = 1
# Directory that receives every downloaded file; main() creates it if missing.
DOWNLOAD_DIR = "downloads"
# CSV file read by main(). The code looks up the columns asset_id,
# asset_type, image_url and video_url in each row.
# Alternative input kept for reference:
# CSV_FILE_PATH = "history_asset_details.csv"
CSV_FILE_PATH = "new_asset_details.csv"
# Passed to yt-dlp as its 'retries' option in download_youtube_video().
YOUTUBE_RETRY_ATTEMPTS = 3
# Intended initial wait (seconds) before a retry.
# NOTE(review): only referenced by a commented-out yt-dlp option, so it
# currently has no effect on the visible code.
YOUTUBE_INITIAL_BACKOFF = 5
def download_image(asset_id, image_url, timeout=(10, 60)):
    """Download an image over HTTP and save it as ``<asset_id>.jpg``.

    The body is streamed in 8 KiB chunks into ``<asset_id>.jpg.part`` inside
    DOWNLOAD_DIR and renamed to the final name only after the whole response
    has been written, so a failed transfer never leaves a truncated ``.jpg``.

    Args:
        asset_id: Identifier used as the base name of the output file.
        image_url: URL of the image to fetch.
        timeout: Value forwarded to ``requests.get(timeout=...)``; a
            ``(connect, read)`` tuple in seconds. Without a timeout a
            stalled server would block the worker thread indefinitely.

    Returns:
        None. Success and failure are reported with ``print``; no exception
        is propagated to the caller.
    """
    file_path = os.path.join(DOWNLOAD_DIR, f"{asset_id}.jpg")
    partial_path = file_path + ".part"
    try:
        # The context manager releases the pooled connection even when the
        # transfer fails half-way (stream=True keeps it open otherwise).
        with requests.get(image_url, stream=True, timeout=timeout) as response:
            response.raise_for_status()  # Raise an exception for HTTP errors
            with open(partial_path, 'wb') as f:
                for chunk in response.iter_content(chunk_size=8192):
                    f.write(chunk)
        # Atomic on the same filesystem: readers see the old file or the new one.
        os.replace(partial_path, file_path)
        print(f"✅ Thread {threading.get_ident()}: Downloaded image {asset_id}.jpg from {image_url}")
    except requests.exceptions.RequestException as e:
        print(f"⚠️ Thread {threading.get_ident()}: Error downloading image {asset_id} from {image_url}: {e}")
    except Exception as e:
        print(f"❌ Thread {threading.get_ident()}: An unexpected error occurred while downloading image {asset_id}: {e}")
    finally:
        # Best-effort cleanup of a leftover partial file. After a successful
        # rename the file is already gone, which is the expected case here.
        try:
            os.remove(partial_path)
        except OSError:
            pass
def download_youtube_video(asset_id, video_url):
    """Fetch a single YouTube video with yt-dlp, preferring MP4 output.

    The file is written to DOWNLOAD_DIR as ``<asset_id>.<ext>``, where the
    extension is filled in by yt-dlp. After the download the function checks
    for ``<asset_id>.mp4``; if it is missing it lists any other files in
    DOWNLOAD_DIR that start with ``<asset_id>.`` so the user can see what
    was produced instead.

    Args:
        asset_id: Identifier used as the base name of the output file.
        video_url: URL handed to yt-dlp.

    Returns:
        None. Progress and errors are reported with ``print``; exceptions
        raised inside the download/verification step are caught here.
    """
    options = dict(
        # Prefer MP4 video + M4A audio, then a single MP4, then anything.
        format='bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best',
        outtmpl=os.path.join(DOWNLOAD_DIR, f"{asset_id}.%(ext)s"),
        quiet=True,        # keep yt-dlp's own console output silent
        noplaylist=True,   # a playlist URL still yields just the one video
        retries=YOUTUBE_RETRY_ATTEMPTS,
    )

    try:
        print(f"Thread {threading.get_ident()}: Attempting to download YouTube video {asset_id} from {video_url} using yt-dlp")
        with yt_dlp.YoutubeDL(options) as downloader:
            downloader.download([video_url])

        # The format selector favours MP4 but cannot guarantee it, so verify.
        expected_file_path = os.path.join(DOWNLOAD_DIR, f"{asset_id}.mp4")
        if os.path.exists(expected_file_path):
            print(f"✅ Thread {threading.get_ident()}: Downloaded YouTube video {asset_id}.mp4 from {video_url}")
            return

        print(f"⚠️ Thread {threading.get_ident()}: YouTube video {asset_id} downloaded, but expected file {expected_file_path} not found. Check download directory for other extensions or yt-dlp logs.")

        # Look for the same asset saved under a different extension.
        found_files = []
        for entry in os.listdir(DOWNLOAD_DIR):
            if entry.startswith(asset_id + "."):
                found_files.append(entry)

        if not found_files:
            print(f"❌ Thread {threading.get_ident()}: No file found for asset_id {asset_id} after download attempt.")
        else:
            print(f"ℹ️ Thread {threading.get_ident()}: Found downloaded file(s): {', '.join(found_files)} for asset_id {asset_id}")
    except yt_dlp.utils.DownloadError as e:
        # Failure reported by yt-dlp itself.
        print(f"❌ Thread {threading.get_ident()}: yt-dlp DownloadError for {asset_id} from {video_url}: {e}")
    except Exception as e:
        # Anything else (filesystem errors, unexpected yt-dlp failures, ...).
        print(f"❌ Thread {threading.get_ident()}: Error downloading YouTube video {asset_id} from {video_url} using yt-dlp: {e}")
def process_row(row):
    """Dispatch one CSV row to the downloader matching its ``asset_type``.

    ``IMAGE`` rows go to ``download_image`` with the row's ``image_url``;
    ``YOUTUBE_VIDEO`` rows go to ``download_youtube_video`` with the row's
    ``video_url``. Rows of any other type are skipped silently.

    Args:
        row: Mapping for one CSV record (as produced by ``csv.DictReader``).

    Returns:
        None. Any exception raised while handling the row is caught and
        reported with ``print``.
    """
    try:
        asset_id = row.get('asset_id')
        kind = row.get('asset_type')

        if kind == "IMAGE":
            download_image(asset_id, row.get('image_url'))
            return

        if kind == "YOUTUBE_VIDEO":
            download_youtube_video(asset_id, row.get('video_url'))
            return

        # Every other asset type is intentionally ignored.
    except Exception as e:
        print(f"❌ Thread {threading.get_ident()}: Error processing row {row}: {e}")
def main():
    """Read the asset CSV and download every valid row on a thread pool.

    Steps:
      1. Make sure DOWNLOAD_DIR exists.
      2. Load CSV_FILE_PATH and keep rows that have an ``asset_id``, an
         ``asset_type`` and at least one of ``image_url`` / ``video_url``.
      3. Run ``process_row`` for each kept row using MAX_THREADS workers.

    Returns:
        None. A missing CSV file or any other error is reported with
        ``print`` instead of being raised.
    """
    # exist_ok=True keeps this safe if the directory appears between the
    # check and the creation call.
    if not os.path.exists(DOWNLOAD_DIR):
        os.makedirs(DOWNLOAD_DIR, exist_ok=True)
        print(f"Created directory: {DOWNLOAD_DIR}")

    try:
        # 'utf-8-sig' strips a leading BOM if present; with plain 'utf-8' the
        # first header would become '\ufeffasset_id' and every row would be
        # filtered out. newline='' is what the csv module expects so that
        # newlines embedded in quoted fields are parsed correctly.
        with open(CSV_FILE_PATH, mode='r', encoding='utf-8-sig', newline='') as csvfile:
            valid_rows = [
                row
                for row in csv.DictReader(csvfile)
                if row.get('asset_id')
                and row.get('asset_type')
                and (row.get('image_url') or row.get('video_url'))
            ]

        # The rows are fully loaded, so the file is closed before the
        # (potentially long) download phase begins.
        if not valid_rows:
            print("No valid rows found in the CSV file to process.")
            return

        with ThreadPoolExecutor(max_workers=MAX_THREADS) as executor:
            # Consume the result iterator: otherwise an exception escaping a
            # worker would be dropped without any trace.
            list(executor.map(process_row, valid_rows))
    except FileNotFoundError:
        print(f"Error: The file {CSV_FILE_PATH} was not found.")
    except Exception as e:
        print(f"An error occurred during CSV processing: {e}")


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment