Created
June 12, 2025 07:33
-
-
Save bindiego/6dc95b570295712e0b493d6981690e2b to your computer and use it in GitHub Desktop.
Multi-threaded download script for YouTube videos and image resources
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
import csv | |
import os | |
import requests | |
import time | |
import threading | |
import yt_dlp # Replaced pytube with yt-dlp | |
from urllib.error import HTTPError # Kept for download_image, though yt-dlp might also raise it | |
from concurrent.futures import ThreadPoolExecutor | |
# Number of worker threads; passed as max_workers to the ThreadPoolExecutor in main().
MAX_THREADS = 1
# Output directory for downloads; every downloaded file is written under this path.
DOWNLOAD_DIR = "downloads"
# Path to the CSV file that main() reads rows from.
# CSV_FILE_PATH = "history_asset_details.csv"
CSV_FILE_PATH = "new_asset_details.csv"
YOUTUBE_RETRY_ATTEMPTS = 3  # Passed to yt-dlp as its 'retries' option
YOUTUBE_INITIAL_BACKOFF = 5  # Seconds; currently referenced only by a commented-out yt-dlp option
def download_image(asset_id, image_url):
    """Download an image from a URL and save it as ``<asset_id>.jpg``.

    The file is written under ``DOWNLOAD_DIR``. The ``.jpg`` extension is
    applied unconditionally, whatever the server actually returns.

    Args:
        asset_id: Identifier used as the base name of the saved file.
        image_url: URL to fetch with an HTTP GET request.

    Returns:
        None. Success and failure are both reported with ``print``; request
        and I/O errors are caught here rather than propagated.
    """
    file_path = os.path.join(DOWNLOAD_DIR, f"{asset_id}.jpg")
    # Stream into a side file first so an interrupted transfer never leaves a
    # truncated file that looks like a finished download.
    partial_path = file_path + ".part"
    try:
        # (connect, read) timeouts: without them requests waits forever on a
        # stalled server and the worker thread never returns to the pool.
        # The context manager releases the pooled connection even on errors.
        with requests.get(image_url, stream=True, timeout=(10, 60)) as response:
            response.raise_for_status()  # Raise an exception for HTTP errors
            with open(partial_path, 'wb') as f:
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:  # skip empty keep-alive chunks
                        f.write(chunk)
        # Atomically publish the completed download under its final name.
        os.replace(partial_path, file_path)
        print(f"✅ Thread {threading.get_ident()}: Downloaded image {asset_id}.jpg from {image_url}")
    except requests.exceptions.RequestException as e:
        print(f"⚠️ Thread {threading.get_ident()}: Error downloading image {asset_id} from {image_url}: {e}")
    except Exception as e:
        print(f"❌ Thread {threading.get_ident()}: An unexpected error occurred while downloading image {asset_id}: {e}")
    finally:
        # Only present when the transfer failed part-way through.
        if os.path.exists(partial_path):
            os.remove(partial_path)
def download_youtube_video(asset_id, video_url):
    """Download a YouTube video with yt-dlp, preferring MP4 output.

    The output template is ``<DOWNLOAD_DIR>/<asset_id>.<ext>``, so yt-dlp
    picks the extension. The format selector prefers MP4 video plus M4A
    audio, then a single MP4 stream, then whatever "best" resolves to, so a
    non-MP4 file is possible. After the download this function looks for
    ``<asset_id>.mp4`` and, if it is missing, lists any other files whose
    name starts with ``<asset_id>.``.

    Args:
        asset_id: Identifier used as the base name of the saved file.
        video_url: URL handed to yt-dlp.

    Returns:
        None. Progress, success and every failure are reported with
        ``print``; download errors are caught here rather than propagated.
    """
    tid = threading.get_ident()
    options = dict(
        format='bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best',
        outtmpl=os.path.join(DOWNLOAD_DIR, f"{asset_id}.%(ext)s"),
        quiet=True,  # keep yt-dlp's own console output out of the log
        noplaylist=True,  # fetch a single video even for playlist URLs
        retries=YOUTUBE_RETRY_ATTEMPTS,  # rely on yt-dlp's retry mechanism
    )

    try:
        print(f"Thread {tid}: Attempting to download YouTube video {asset_id} from {video_url} using yt-dlp")
        with yt_dlp.YoutubeDL(options) as downloader:
            downloader.download([video_url])

        # Happy path: the preferred MP4 container was produced.
        mp4_path = os.path.join(DOWNLOAD_DIR, f"{asset_id}.mp4")
        if os.path.exists(mp4_path):
            print(f"✅ Thread {tid}: Downloaded YouTube video {asset_id}.mp4 from {video_url}")
            return

        print(f"⚠️ Thread {tid}: YouTube video {asset_id} downloaded, but expected file {mp4_path} not found. Check download directory for other extensions or yt-dlp logs.")

        # yt-dlp may have fallen back to another container; look for it.
        candidates = []
        for entry in os.listdir(DOWNLOAD_DIR):
            if entry.startswith(asset_id + "."):
                candidates.append(entry)

        if not candidates:
            print(f"❌ Thread {tid}: No file found for asset_id {asset_id} after download attempt.")
        else:
            print(f"ℹ️ Thread {tid}: Found downloaded file(s): {', '.join(candidates)} for asset_id {asset_id}")
    except yt_dlp.utils.DownloadError as e:
        # Failures reported by yt-dlp itself.
        print(f"❌ Thread {tid}: yt-dlp DownloadError for {asset_id} from {video_url}: {e}")
    except Exception as e:
        # Anything else, e.g. filesystem problems.
        print(f"❌ Thread {tid}: Error downloading YouTube video {asset_id} from {video_url} using yt-dlp: {e}")
def process_row(row):
    """Dispatch one CSV row to the downloader matching its asset type.

    Rows with ``asset_type`` "IMAGE" go to ``download_image`` with the row's
    ``image_url``; rows with "YOUTUBE_VIDEO" go to
    ``download_youtube_video`` with the row's ``video_url``. Every other
    asset type is silently ignored.

    Args:
        row: Mapping of CSV column names to values (anything with ``.get``).

    Returns:
        None. Any exception raised while handling the row is caught and
        reported with ``print``.
    """
    try:
        asset_id = row.get('asset_id')
        kind = row.get('asset_type')

        if kind == "IMAGE":
            download_image(asset_id, row.get('image_url'))
            return

        if kind == "YOUTUBE_VIDEO":
            download_youtube_video(asset_id, row.get('video_url'))
            return

        # Any other asset type is intentionally skipped without logging.
    except Exception as e:
        print(f"❌ Thread {threading.get_ident()}: Error processing row {row}: {e}")
def main():
    """Parse the CSV file and download every valid asset it lists.

    Creates ``DOWNLOAD_DIR`` if needed, reads ``CSV_FILE_PATH`` with
    ``csv.DictReader``, keeps the rows that have an ``asset_id``, an
    ``asset_type`` and at least one of ``image_url`` / ``video_url``, and
    hands them to ``process_row`` on a thread pool of ``MAX_THREADS``
    workers.

    Returns:
        None. A missing CSV file and any other error are reported with
        ``print`` rather than raised.
    """
    # Create downloads directory if it doesn't exist
    if not os.path.exists(DOWNLOAD_DIR):
        # exist_ok guards against the directory appearing between the check
        # above and this call.
        os.makedirs(DOWNLOAD_DIR, exist_ok=True)
        print(f"Created directory: {DOWNLOAD_DIR}")

    try:
        # 'utf-8-sig' strips a leading BOM (common in spreadsheet exports);
        # plain 'utf-8' would turn the first header into '\ufeffasset_id' and
        # every row would then be rejected by the filter below.
        # newline='' lets the csv module handle embedded line breaks itself.
        with open(CSV_FILE_PATH, mode='r', encoding='utf-8-sig', newline='') as csvfile:
            reader = csv.DictReader(csvfile)
            # Filter out rows that are empty or lack the fields needed to
            # perform a download.
            valid_rows = [
                row for row in reader
                if row.get('asset_id')
                and row.get('asset_type')
                and (row.get('image_url') or row.get('video_url'))
            ]

        # The rows are fully materialised, so the CSV handle is already
        # closed before the (potentially long) downloads start.
        if not valid_rows:
            print("No valid rows found in the CSV file to process.")
            return

        with ThreadPoolExecutor(max_workers=MAX_THREADS) as executor:
            # process_row reports its own errors, so results are not
            # inspected; leaving the with-block waits for all tasks.
            executor.map(process_row, valid_rows)
    except FileNotFoundError:
        print(f"Error: The file {CSV_FILE_PATH} was not found.")
    except Exception as e:
        print(f"An error occurred during CSV processing: {e}")
# Script entry point: start the downloads only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment