Created
September 30, 2023 00:11
-
-
Save agucova/a34ea38c014121ba388154e8e0a5ae38 to your computer and use it in GitHub Desktop.
Script for converting <img> references in an Anki CSV to inline webp images.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3
import base64
from io import BytesIO
from typing import Optional

import pandas as pd
import requests
import typer
from bs4 import BeautifulSoup
from PIL import Image
def img_to_base64(url: str, timeout: float = 30.0) -> str:
    """Download an image and return it as a base64-encoded WebP data URI.

    Args:
        url: Address of the image to download.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A ``data:image/webp;base64,...`` string holding the image re-encoded
        as WebP.

    Raises:
        requests.RequestException: If the download fails, times out, or the
            server answers with an HTTP error status.
    """
    # Without an explicit timeout, requests waits indefinitely on a stalled
    # server, which would freeze the whole conversion run.
    response = requests.get(url, timeout=timeout)
    # Fail on 4xx/5xx here instead of handing an error page to Pillow.
    response.raise_for_status()

    buffered = BytesIO()
    # The context manager releases the decoder's resources once re-encoded.
    with Image.open(BytesIO(response.content)) as img:
        img.save(buffered, format="WEBP")

    image_data = base64.b64encode(buffered.getvalue()).decode("utf-8")
    return f"data:image/webp;base64,{image_data}"
def convert_images_to_base64(html: Optional[str]) -> Optional[str]:
    """Replace the ``src`` of every ``<img>`` in *html* with an inline data URI.

    Args:
        html: HTML text of a single cell. Anything that is not a string
            (``None``, NaN for an empty cell, a number parsed by pandas) is
            passed through untouched.

    Returns:
        The HTML re-serialized by BeautifulSoup with each image source
        replaced by the result of ``img_to_base64``, or the input itself when
        it is not a string.
    """
    # Covers None and NaN as well as numeric cells, none of which can be
    # parsed as HTML.
    if not isinstance(html, str):
        return html

    soup = BeautifulSoup(html, "html.parser")
    for img in soup.find_all("img"):
        # .get() tolerates <img> tags that carry no src attribute at all.
        src = img.get("src")
        # Nothing to download for a missing/empty src; data: URIs are
        # already inline.
        if not src or src.startswith("data:"):
            continue
        img["src"] = img_to_base64(src)
    return str(soup)
def main(input_file: str, output_file: str):
    """
    Convert image URLs in a CSV file to base64 encoded WebP images.

    Arguments:
        input_file: Path to the input CSV file.
        output_file: Path to the output CSV file.
    """
    # Columns whose cells hold the HTML to rewrite.
    html_columns = ("Question", "Answer")

    # Read the CSV file into a DataFrame
    df = pd.read_csv(input_file)

    # Remove rows where all elements are NaN
    df.dropna(how="all", inplace=True)

    # Fail before any download starts, with a message naming the problem,
    # rather than with a bare KeyError from deep inside the loop.
    missing = [column for column in html_columns if column not in df.columns]
    if missing:
        raise KeyError(
            f"{input_file} is missing required column(s): {', '.join(missing)}"
        )

    # Convert each HTML column in one pass; Series.map hands every cell
    # (NaN included) to the converter, so no per-row iteration is needed.
    for column in html_columns:
        df[column] = df[column].map(convert_images_to_base64)

    # Save the DataFrame back to a new CSV file
    df.to_csv(output_file, index=False)
    typer.echo(f"Converted images in {input_file} and saved to {output_file}")
# Start the command-line interface only when the file is executed directly,
# so importing this module has no side effects.
if __name__ == "__main__":
    typer.run(main)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment