Last active
November 20, 2021 19:09
-
-
Save santiagobasulto/d7269de2f955933b551ca88f17744c42 to your computer and use it in GitHub Desktop.
A quick script to transform a Markdown file's relative images to absolute by uploading them to a plugin-based service (S3, Imgur)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re | |
import mimetypes | |
from pathlib import Path | |
import requests | |
import boto3 | |
from botocore.exceptions import ClientError | |
# Markdown image syntax: ![alt text](filename)
# Raw strings keep the backslashes literal; `\[` and `\(` are not valid
# escape sequences in a normal string literal and trigger a SyntaxWarning
# on recent Python versions.
# PATTERN_FULL captures both the alt text and the file name.
PATTERN_FULL = r'(?:!\[(?P<alt_text>.*?)\]\((?P<filename>.*?)\))'
# PATTERN_FNAME captures only the file name.
PATTERN_FNAME = r'(?:!\[(?:.*?)\]\((?P<filename>.*?)\))'
class Uploader:
    """Base interface every image uploader has to implement."""

    def upload_image(self, image_path, override=False):
        """Upload the image at `image_path` and return its absolute URL.

        Subclasses must provide the implementation; `override` tells them
        whether an already uploaded image should be replaced.
        """
        raise NotImplementedError
class S3Uploader(Uploader):
    """Uploader that stores images in an S3 bucket.

    Parameters
    ----------
    s3_bucket: str
        Name of the destination bucket.
    s3_relative_path: str
        Key prefix under which images are stored. Leading and trailing
        slashes are ignored; an empty prefix stores images at the bucket root.
    s3_ACL: str, optional
        ACL passed to `put_object` when provided.
    cloudfront_domain: str, optional
        Domain used to build the returned URLs instead of the default
        `<bucket>.s3.amazonaws.com`. Must NOT include the scheme.
    cache_control: str, optional
        `CacheControl` value passed to `put_object` when provided.

    Raises
    ------
    ValueError
        If `cloudfront_domain` includes an `http://` or `https://` scheme.
    """

    def __init__(self, s3_bucket, s3_relative_path, s3_ACL=None, cloudfront_domain=None, cache_control=None):
        self.bucket = s3_bucket
        self.relative_path = (s3_relative_path or '').strip('/')
        self.s3_acl = s3_ACL
        self.cloudfront_domain = cloudfront_domain
        self.cache_control = cache_control
        # `cloudfront_domain` is optional, so only validate it when provided.
        # The scheme is added by `upload_image`; a domain carrying its own
        # scheme would produce URLs such as `https://https://...`.
        if self.cloudfront_domain and self.cloudfront_domain.startswith(('http://', 'https://')):
            raise ValueError(
                f"cloudfront_domain must not include the scheme: {self.cloudfront_domain!r}")
        self.client = boto3.client('s3')

    def upload_image(self, image_path, override=False):
        """Upload `image_path` to the bucket and return its HTTPS URL.

        When `override` is false and an object already exists under the
        target key, nothing is uploaded and the existing URL is returned.
        Any `ClientError` other than a 404 on the existence check is
        propagated to the caller.
        """
        p = Path(image_path)
        dns = self.cloudfront_domain or f'{self.bucket}.s3.amazonaws.com'
        # Avoid a leading slash in the key (and `//` in the URL) when no
        # prefix was configured.
        key = f'{self.relative_path}/{p.name}' if self.relative_path else p.name
        url = f"https://{dns}/{key}"

        if not override:
            try:
                self.client.head_object(Bucket=self.bucket, Key=key)
            except ClientError as exc:
                # A 404 means the object is not there yet: go on and upload.
                if exc.response['Error']['Code'] != "404":
                    raise
            else:
                print(f"\tFound: {url}")
                return url

        kwargs = {}
        if self.cache_control:
            kwargs['CacheControl'] = self.cache_control
        if self.s3_acl:
            kwargs['ACL'] = self.s3_acl
        # Guess from the file name so it works for both str and Path inputs.
        content_type, _ = mimetypes.guess_type(p.name)
        if content_type:
            kwargs['ContentType'] = content_type

        self.client.put_object(
            Body=p.read_bytes(),
            Bucket=self.bucket,
            Key=key,
            **kwargs
        )
        print(f"\tUploaded: {url}")
        return url
class ImgurUploader(Uploader):
    """Uploader that sends images to Imgur's image endpoint.

    Parameters
    ----------
    imgur_access_token: str
        Token sent as `Authorization: Bearer <token>`.
    timeout: float or tuple, optional
        Timeout in seconds handed to `requests.post`, so an unresponsive
        server cannot block the script forever. Defaults to 60.

    Raises
    ------
    ValueError
        If no access token is provided.
    """

    def __init__(self, imgur_access_token, timeout=60):
        # Without a token the request would be sent as "Bearer None".
        if not imgur_access_token:
            raise ValueError("An Imgur access token is required")
        self.access_token = imgur_access_token
        self.timeout = timeout

    def upload_image(self, image_path, override=False):
        """Upload `image_path` and return the link reported in the response.

        `override` is accepted for interface compatibility but is not used:
        every call posts the image again. Raises `requests.HTTPError` when
        the response status signals a failure.
        """
        headers = {"Authorization": f"Bearer {self.access_token}"}
        with Path(image_path).open('rb') as fp:
            resp = requests.post(
                'https://api.imgur.com/3/image',
                headers=headers,
                files={'image': fp},
                timeout=self.timeout,
            )
        resp.raise_for_status()
        return resp.json()['data']['link']
# Registry of the available uploaders, keyed by the name used to select them.
UPLOADERS = {'s3': S3Uploader, 'imgur': ImgurUploader}
def main(original_path, output_path, uploader, override=False, **uploader_kwargs):
    """Reads a markdown file, finds all the images and uploads them using `uploader`.

    The result is a new file under `output_path` in which every uploaded
    image link points to the URL returned by the uploader. Provide specific
    parameters for the uploader with `uploader_kwargs`.

    Parameters
    ----------
    original_path: str, a valid filesystem path
        The path of the markdown file used to transform.
    output_path: str, a valid filesystem path
        The path of where the resulting markdown file will be stored.
        WARNING! This file will be overwritten.
    uploader: str, a choice of uploaders
        The uploader to use. Must be one of the keys of the `UPLOADERS` variable.
    override: bool
        Passed to the uploader, if the image should be overridden or not.
        It's the responsibility of the uploader to respect this flag.
    **uploader_kwargs: keyword arguments
        Everything else will be passed to the Uploader at the moment of initialization.

    Returns
    -------
    dict
        Maps every relative image path found in the file to its uploaded URL.

    Raises
    ------
    KeyError
        If `uploader` is not a key of `UPLOADERS`.
    ValueError
        If any referenced image does not exist on disk.
    """
    image_uploader = UPLOADERS[uploader](**uploader_kwargs)

    original_path = Path(original_path)
    base_path = original_path.parent
    pattern = re.compile(PATTERN_FULL)

    with original_path.open() as fp:
        content = fp.read()

    # The dict de-duplicates repeated images while keeping document order.
    # Links that are already absolute URLs are not local files: skip them.
    image_mapping = {
        match.group('filename'): base_path / match.group('filename')
        for match in pattern.finditer(content)
        if not match.group('filename').startswith(('http://', 'https://'))
    }

    missing_images = [str(abs_path) for abs_path in image_mapping.values() if not abs_path.exists()]
    if missing_images:
        raise ValueError(f'Missing images: {",".join(missing_images)}')

    image_results = {
        relative_path: image_uploader.upload_image(abs_path, override)
        for relative_path, abs_path in image_mapping.items()
    }

    def _absolute_link(match):
        # Rebuild only the image link itself. A plain `str.replace` over the
        # whole document would also rewrite unrelated text containing the
        # path, including URLs inserted by a previous replacement.
        upload_url = image_results.get(match.group('filename'))
        if upload_url is None:
            return match.group(0)
        return f"![{match.group('alt_text')}]({upload_url})"

    content = pattern.sub(_absolute_link, content)

    with open(output_path, 'w') as fp:
        fp.write(content)

    return image_results
# Command line arguments (as argparse attribute names) that must be provided
# for each uploader.
CMD_REQUIRED_ARGUMENTS = {
    's3': ['s3_bucket', 's3_relative_path'],
    'imgur': ['imgur_access_token'],
}

if __name__ == "__main__":
    import argparse
    import pathlib

    parser = argparse.ArgumentParser(
        description="Upload the relative images of a Markdown file and rewrite them as absolute URLs.")
    parser.add_argument('input', type=pathlib.Path, help='A path to the markdown with relative images to transform')
    parser.add_argument('output', type=pathlib.Path, help='A path to store the output of the process')
    parser.add_argument('-u', '--uploader', choices=['s3', 'imgur'], required=True)
    parser.add_argument('-o', '--override', action='store_true', default=False)

    # S3 specific params
    parser.add_argument('--s3-bucket')
    parser.add_argument('--s3-relative-path', help="Where to store the images within the bucket. A key prefix.")
    parser.add_argument('--s3-acl', default='private')
    parser.add_argument('--s3-cf-domain', help="The cloudfront domain to use instead of S3's default URL")
    parser.add_argument('--s3-cache-control')

    # Imgur specific params
    parser.add_argument('--imgur-access-token')

    args = parser.parse_args()

    # Report missing uploader-specific arguments as a regular usage error
    # (an `assert` would be skipped entirely under `python -O`).
    missing = [
        '--' + arg.replace('_', '-')
        for arg in CMD_REQUIRED_ARGUMENTS.get(args.uploader, [])
        if not getattr(args, arg)
    ]
    if missing:
        parser.error(f"Missing arguments for the '{args.uploader}' uploader: {', '.join(missing)}")

    if args.uploader == 's3':
        results = main(
            args.input,
            args.output,
            's3',
            override=args.override,
            s3_bucket=args.s3_bucket,
            s3_relative_path=args.s3_relative_path,
            s3_ACL=args.s3_acl,
            cloudfront_domain=args.s3_cf_domain,
            cache_control=args.s3_cache_control,
        )
    else:
        results = main(
            args.input,
            args.output,
            'imgur',
            override=args.override,
            imgur_access_token=args.imgur_access_token,
        )

    print('\n')
    print('-' * 60)
    print(f"Replaced {len(results)} images")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
I created this script in just a few minutes to solve an issue with a bunch of Markdown files that were exported with relative, local images, which I needed to be absolute and hosted on S3. Please make sure you read how it works before using it.