bindiego · June 12, 2025 07:33
diff --git a/resource_download.py b/resource_download.py
 #!/usr/bin/env python3

 import csv
 import os
 import requests
 import time
 import threading
 import yt_dlp # Replaced pytube with yt-dlp
 from urllib.error import HTTPError # Kept for download_image, though yt-dlp might also raise it
 from concurrent.futures import ThreadPoolExecutor

 # Number of worker threads used by the download thread pool
 # (passed as max_workers to ThreadPoolExecutor in main()).
 MAX_THREADS = 1

 # Output directory for downloads; main() creates it when it does not exist.
 DOWNLOAD_DIR = "downloads"

 # Path to the CSV file listing the assets to download.
 # Alternative input file kept for reference:
 # CSV_FILE_PATH = "history_asset_details.csv"
 CSV_FILE_PATH = "new_asset_details.csv"

 # Retry count handed to yt-dlp through its 'retries' option.
 YOUTUBE_RETRY_ATTEMPTS = 3 # Number of times to retry a failed download
 # NOTE(review): only referenced from a commented-out yt-dlp option in the
 # visible code, so this value currently has no effect -- confirm before removing.
 YOUTUBE_INITIAL_BACKOFF = 5 # Initial seconds to wait before retrying

 def download_image(asset_id, image_url, timeout=30):
    """Download an image from a URL and save it as ``<asset_id>.jpg``.

    The file is written into ``DOWNLOAD_DIR``. Failures are printed rather
    than raised, so one bad URL does not stop a batch run.

    Args:
        asset_id: Identifier used as the base name of the saved file.
        image_url: URL to fetch the image from.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled server would block the worker thread forever.
    """
    file_path = os.path.join(DOWNLOAD_DIR, f"{asset_id}.jpg")
    # Stream into a side file first so an interrupted transfer never leaves a
    # truncated .jpg behind (and never clobbers an earlier good download).
    partial_path = file_path + ".part"
    try:
        # The context manager releases the streamed connection even when the
        # transfer fails midway.
        with requests.get(image_url, stream=True, timeout=timeout) as response:
            response.raise_for_status()  # Raise an exception for HTTP errors
            with open(partial_path, 'wb') as f:
                for chunk in response.iter_content(chunk_size=8192):
                    f.write(chunk)
        os.replace(partial_path, file_path)
        print(f"✅ Thread {threading.get_ident()}: Downloaded image {asset_id}.jpg from {image_url}")
    except requests.exceptions.RequestException as e:
        print(f"⚠️ Thread {threading.get_ident()}: Error downloading image {asset_id} from {image_url}: {e}")
    except Exception as e:
        print(f"❌ Thread {threading.get_ident()}: An unexpected error occurred while downloading image {asset_id}: {e}")
    finally:
        # Best-effort cleanup: after a successful rename the side file is
        # already gone, which is the expected case for the OSError here.
        try:
            os.remove(partial_path)
        except OSError:
            pass

 def download_youtube_video(asset_id, video_url):
    """Fetch a YouTube video with yt-dlp, preferring MP4 output.

    The video is saved in ``DOWNLOAD_DIR`` as ``<asset_id>.<ext>``; yt-dlp
    fills in the extension. The format selector asks for MP4 video plus M4A
    audio first, then any single MP4, then whatever is best, so the result
    is normally ``<asset_id>.mp4`` but may be another container when no MP4
    variant is offered. After the download the function reports whether the
    ``.mp4`` file exists and, if not, lists any files in ``DOWNLOAD_DIR``
    whose name starts with ``<asset_id>.``.

    Errors (``Exception`` and subclasses) are printed instead of raised.

    Args:
        asset_id: Identifier used as the base name of the saved file.
        video_url: URL of the video to download.
    """
    ydl_opts = {
        # Prefer MP4 video + M4A audio, then a single MP4, then anything.
        'format': 'bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best',
        # yt-dlp substitutes the real extension for %(ext)s.
        'outtmpl': os.path.join(DOWNLOAD_DIR, f"{asset_id}.%(ext)s"),
        # Keep yt-dlp's own console output out of the log.
        'quiet': True,
        # Download only the single video when the URL belongs to a playlist.
        'noplaylist': True,
        # Retrying is delegated to yt-dlp; only the attempt count is set.
        'retries': YOUTUBE_RETRY_ATTEMPTS,
    }
    expected_file_path = os.path.join(DOWNLOAD_DIR, f"{asset_id}.mp4")

    try:
        print(f"Thread {threading.get_ident()}: Attempting to download YouTube video {asset_id} from {video_url} using yt-dlp")
        with yt_dlp.YoutubeDL(ydl_opts) as downloader:
            downloader.download([video_url])

        # Happy path: the preferred MP4 container was produced.
        if os.path.exists(expected_file_path):
            print(f"✅ Thread {threading.get_ident()}: Downloaded YouTube video {asset_id}.mp4 from {video_url}")
            return

        # No .mp4: warn, then look for the same asset under another extension.
        print(f"⚠️ Thread {threading.get_ident()}: YouTube video {asset_id} downloaded, but expected file {expected_file_path} not found. Check download directory for other extensions or yt-dlp logs.")
        found_files = [
            name for name in os.listdir(DOWNLOAD_DIR)
            if name.startswith(asset_id + ".")
        ]
        if not found_files:
            print(f"❌ Thread {threading.get_ident()}: No file found for asset_id {asset_id} after download attempt.")
        else:
            print(f"ℹ️ Thread {threading.get_ident()}: Found downloaded file(s): {', '.join(found_files)} for asset_id {asset_id}")
    except yt_dlp.utils.DownloadError as e:
        # Failures reported by yt-dlp itself.
        print(f"❌ Thread {threading.get_ident()}: yt-dlp DownloadError for {asset_id} from {video_url}: {e}")
    except Exception as e:
        # Anything else that went wrong around the download.
        print(f"❌ Thread {threading.get_ident()}: Error downloading YouTube video {asset_id} from {video_url} using yt-dlp: {e}")

 def process_row(row):
    """Download the asset described by a single CSV row.

    Rows whose ``asset_type`` is ``"IMAGE"`` are handed to
    ``download_image`` with the ``image_url`` column; rows whose type is
    ``"YOUTUBE_VIDEO"`` are handed to ``download_youtube_video`` with the
    ``video_url`` column. Any other type is ignored silently.

    A row can carry the URL column of the *other* asset type only, so the
    URL required for its own type is checked here and the row is skipped
    with a warning instead of attempting a download with an empty URL.

    Args:
        row: Mapping of column name to value (anything providing ``.get``).

    Errors are printed, never raised, so a bad row cannot abort the batch.
    """
    try:
        asset_id = row.get('asset_id')
        asset_type = row.get('asset_type')

        if asset_type == "IMAGE":
            image_url = row.get('image_url')
            if not image_url:
                print(f"⚠️ Thread {threading.get_ident()}: Skipping IMAGE asset {asset_id}: missing image_url")
                return
            download_image(asset_id, image_url)
        elif asset_type == "YOUTUBE_VIDEO":
            video_url = row.get('video_url')
            if not video_url:
                print(f"⚠️ Thread {threading.get_ident()}: Skipping YOUTUBE_VIDEO asset {asset_id}: missing video_url")
                return
            download_youtube_video(asset_id, video_url)
        # Other asset types are intentionally ignored.
    except Exception as e:
        print(f"❌ Thread {threading.get_ident()}: Error processing row {row}: {e}")


 def main():
    """Read the asset CSV and download every valid row's asset.

    Creates ``DOWNLOAD_DIR`` when needed, loads ``CSV_FILE_PATH``, keeps the
    rows that have a non-empty ``asset_id``, ``asset_type`` and at least one
    of ``image_url`` / ``video_url``, and dispatches them to ``process_row``
    on a pool of ``MAX_THREADS`` worker threads. Problems are printed.
    """
    if not os.path.exists(DOWNLOAD_DIR):
        # exist_ok covers the case where the directory appears between the
        # check above and this call.
        os.makedirs(DOWNLOAD_DIR, exist_ok=True)
        print(f"Created directory: {DOWNLOAD_DIR}")

    try:
        # utf-8-sig strips a leading byte-order mark (common in spreadsheet
        # exports). With plain utf-8 the BOM sticks to the first header name,
        # the 'asset_id' lookup comes back empty and every row is dropped.
        # newline='' is what the csv module expects so that quoted fields
        # containing line breaks are parsed correctly.
        with open(CSV_FILE_PATH, mode='r', encoding='utf-8-sig', newline='') as csvfile:
            reader = csv.DictReader(csvfile)
            # Keep only rows that identify an asset and carry some URL.
            valid_rows = [
                row for row in reader
                if row.get('asset_id')
                and row.get('asset_type')
                and (row.get('image_url') or row.get('video_url'))
            ]

        # The CSV is fully read and closed before any download starts.
        if not valid_rows:
            print("No valid rows found in the CSV file to process.")
            return

        with ThreadPoolExecutor(max_workers=MAX_THREADS) as executor:
            executor.map(process_row, valid_rows)

    except FileNotFoundError:
        print(f"Error: The file {CSV_FILE_PATH} was not found.")
    except Exception as e:
        print(f"An error occurred during CSV processing: {e}")

 # Entry point: run the downloader only when executed as a script, not on import.
 if __name__ == "__main__":
    main()
	#!/usr/bin/env python3

	import csv
	import os
	import requests
	import time
	import threading
	import yt_dlp # Replaced pytube with yt-dlp
	from urllib.error import HTTPError # Kept for download_image, though yt-dlp might also raise it
	from concurrent.futures import ThreadPoolExecutor

	# Number of worker threads used by the download thread pool
	# (passed as max_workers to ThreadPoolExecutor in main()).
	MAX_THREADS = 1

	# Output directory for downloads; main() creates it when it does not exist.
	DOWNLOAD_DIR = "downloads"

	# Path to the CSV file listing the assets to download.
	# Alternative input file kept for reference:
	# CSV_FILE_PATH = "history_asset_details.csv"
	CSV_FILE_PATH = "new_asset_details.csv"

	# Retry count handed to yt-dlp through its 'retries' option.
	YOUTUBE_RETRY_ATTEMPTS = 3 # Number of times to retry a failed download
	# NOTE(review): only referenced from a commented-out yt-dlp option in the
	# visible code, so this value currently has no effect -- confirm before removing.
	YOUTUBE_INITIAL_BACKOFF = 5 # Initial seconds to wait before retrying

	def download_image(asset_id, image_url, timeout=30):
	    """Download an image from a URL and save it as ``<asset_id>.jpg``.

	    The file is written into ``DOWNLOAD_DIR``. Failures are printed rather
	    than raised, so one bad URL does not stop a batch run.

	    Args:
	        asset_id: Identifier used as the base name of the saved file.
	        image_url: URL to fetch the image from.
	        timeout: Seconds to wait for the server before giving up. Without a
	            timeout a stalled server would block the worker thread forever.
	    """
	    file_path = os.path.join(DOWNLOAD_DIR, f"{asset_id}.jpg")
	    # Stream into a side file first so an interrupted transfer never leaves a
	    # truncated .jpg behind (and never clobbers an earlier good download).
	    partial_path = file_path + ".part"
	    try:
	        # The context manager releases the streamed connection even when the
	        # transfer fails midway.
	        with requests.get(image_url, stream=True, timeout=timeout) as response:
	            response.raise_for_status()  # Raise an exception for HTTP errors
	            with open(partial_path, 'wb') as f:
	                for chunk in response.iter_content(chunk_size=8192):
	                    f.write(chunk)
	        os.replace(partial_path, file_path)
	        print(f"✅ Thread {threading.get_ident()}: Downloaded image {asset_id}.jpg from {image_url}")
	    except requests.exceptions.RequestException as e:
	        print(f"⚠️ Thread {threading.get_ident()}: Error downloading image {asset_id} from {image_url}: {e}")
	    except Exception as e:
	        print(f"❌ Thread {threading.get_ident()}: An unexpected error occurred while downloading image {asset_id}: {e}")
	    finally:
	        # Best-effort cleanup: after a successful rename the side file is
	        # already gone, which is the expected case for the OSError here.
	        try:
	            os.remove(partial_path)
	        except OSError:
	            pass

	def download_youtube_video(asset_id, video_url):
	    """Fetch a YouTube video with yt-dlp, preferring MP4 output.

	    The video is saved in ``DOWNLOAD_DIR`` as ``<asset_id>.<ext>``; yt-dlp
	    fills in the extension. The format selector asks for MP4 video plus M4A
	    audio first, then any single MP4, then whatever is best, so the result
	    is normally ``<asset_id>.mp4`` but may be another container when no MP4
	    variant is offered. After the download the function reports whether the
	    ``.mp4`` file exists and, if not, lists any files in ``DOWNLOAD_DIR``
	    whose name starts with ``<asset_id>.``.

	    Errors (``Exception`` and subclasses) are printed instead of raised.

	    Args:
	        asset_id: Identifier used as the base name of the saved file.
	        video_url: URL of the video to download.
	    """
	    ydl_opts = {
	        # Prefer MP4 video + M4A audio, then a single MP4, then anything.
	        'format': 'bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best',
	        # yt-dlp substitutes the real extension for %(ext)s.
	        'outtmpl': os.path.join(DOWNLOAD_DIR, f"{asset_id}.%(ext)s"),
	        # Keep yt-dlp's own console output out of the log.
	        'quiet': True,
	        # Download only the single video when the URL belongs to a playlist.
	        'noplaylist': True,
	        # Retrying is delegated to yt-dlp; only the attempt count is set.
	        'retries': YOUTUBE_RETRY_ATTEMPTS,
	    }
	    expected_file_path = os.path.join(DOWNLOAD_DIR, f"{asset_id}.mp4")

	    try:
	        print(f"Thread {threading.get_ident()}: Attempting to download YouTube video {asset_id} from {video_url} using yt-dlp")
	        with yt_dlp.YoutubeDL(ydl_opts) as downloader:
	            downloader.download([video_url])

	        # Happy path: the preferred MP4 container was produced.
	        if os.path.exists(expected_file_path):
	            print(f"✅ Thread {threading.get_ident()}: Downloaded YouTube video {asset_id}.mp4 from {video_url}")
	            return

	        # No .mp4: warn, then look for the same asset under another extension.
	        print(f"⚠️ Thread {threading.get_ident()}: YouTube video {asset_id} downloaded, but expected file {expected_file_path} not found. Check download directory for other extensions or yt-dlp logs.")
	        found_files = [
	            name for name in os.listdir(DOWNLOAD_DIR)
	            if name.startswith(asset_id + ".")
	        ]
	        if not found_files:
	            print(f"❌ Thread {threading.get_ident()}: No file found for asset_id {asset_id} after download attempt.")
	        else:
	            print(f"ℹ️ Thread {threading.get_ident()}: Found downloaded file(s): {', '.join(found_files)} for asset_id {asset_id}")
	    except yt_dlp.utils.DownloadError as e:
	        # Failures reported by yt-dlp itself.
	        print(f"❌ Thread {threading.get_ident()}: yt-dlp DownloadError for {asset_id} from {video_url}: {e}")
	    except Exception as e:
	        # Anything else that went wrong around the download.
	        print(f"❌ Thread {threading.get_ident()}: Error downloading YouTube video {asset_id} from {video_url} using yt-dlp: {e}")

	def process_row(row):
	    """Download the asset described by a single CSV row.

	    Rows whose ``asset_type`` is ``"IMAGE"`` are handed to
	    ``download_image`` with the ``image_url`` column; rows whose type is
	    ``"YOUTUBE_VIDEO"`` are handed to ``download_youtube_video`` with the
	    ``video_url`` column. Any other type is ignored silently.

	    A row can carry the URL column of the *other* asset type only, so the
	    URL required for its own type is checked here and the row is skipped
	    with a warning instead of attempting a download with an empty URL.

	    Args:
	        row: Mapping of column name to value (anything providing ``.get``).

	    Errors are printed, never raised, so a bad row cannot abort the batch.
	    """
	    try:
	        asset_id = row.get('asset_id')
	        asset_type = row.get('asset_type')

	        if asset_type == "IMAGE":
	            image_url = row.get('image_url')
	            if not image_url:
	                print(f"⚠️ Thread {threading.get_ident()}: Skipping IMAGE asset {asset_id}: missing image_url")
	                return
	            download_image(asset_id, image_url)
	        elif asset_type == "YOUTUBE_VIDEO":
	            video_url = row.get('video_url')
	            if not video_url:
	                print(f"⚠️ Thread {threading.get_ident()}: Skipping YOUTUBE_VIDEO asset {asset_id}: missing video_url")
	                return
	            download_youtube_video(asset_id, video_url)
	        # Other asset types are intentionally ignored.
	    except Exception as e:
	        print(f"❌ Thread {threading.get_ident()}: Error processing row {row}: {e}")


	def main():
	    """Read the asset CSV and download every valid row's asset.

	    Creates ``DOWNLOAD_DIR`` when needed, loads ``CSV_FILE_PATH``, keeps the
	    rows that have a non-empty ``asset_id``, ``asset_type`` and at least one
	    of ``image_url`` / ``video_url``, and dispatches them to ``process_row``
	    on a pool of ``MAX_THREADS`` worker threads. Problems are printed.
	    """
	    if not os.path.exists(DOWNLOAD_DIR):
	        # exist_ok covers the case where the directory appears between the
	        # check above and this call.
	        os.makedirs(DOWNLOAD_DIR, exist_ok=True)
	        print(f"Created directory: {DOWNLOAD_DIR}")

	    try:
	        # utf-8-sig strips a leading byte-order mark (common in spreadsheet
	        # exports). With plain utf-8 the BOM sticks to the first header name,
	        # the 'asset_id' lookup comes back empty and every row is dropped.
	        # newline='' is what the csv module expects so that quoted fields
	        # containing line breaks are parsed correctly.
	        with open(CSV_FILE_PATH, mode='r', encoding='utf-8-sig', newline='') as csvfile:
	            reader = csv.DictReader(csvfile)
	            # Keep only rows that identify an asset and carry some URL.
	            valid_rows = [
	                row for row in reader
	                if row.get('asset_id')
	                and row.get('asset_type')
	                and (row.get('image_url') or row.get('video_url'))
	            ]

	        # The CSV is fully read and closed before any download starts.
	        if not valid_rows:
	            print("No valid rows found in the CSV file to process.")
	            return

	        with ThreadPoolExecutor(max_workers=MAX_THREADS) as executor:
	            executor.map(process_row, valid_rows)

	    except FileNotFoundError:
	        print(f"Error: The file {CSV_FILE_PATH} was not found.")
	    except Exception as e:
	        print(f"An error occurred during CSV processing: {e}")

	# Entry point: run the downloader only when executed as a script, not on import.
	if __name__ == "__main__":
	    main()