Created
January 16, 2024 02:53
-
-
Save pszemraj/81c3a3c795d5e8db2ac2b3aa16ee496c to your computer and use it in GitHub Desktop.
Upload a folder to the Hugging Face Hub, plus other related utilities
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import argparse | |
import logging | |
import time | |
from datetime import datetime | |
from pathlib import Path | |
from typing import Optional | |
from huggingface_hub import upload_folder | |
from watchdog.events import PatternMatchingEventHandler | |
from watchdog.observers import Observer | |
def get_timestamp():
    """Return the current local time as text, e.g. ``Jan-16 02:53:07``.

    The value is used to stamp automated commit messages.
    """
    stamp_format = "%b-%d %H:%M:%S"
    now = datetime.now()
    return now.strftime(stamp_format)
def validate_inputs(
    repo_id: str, folder_path: Path, path_in_repo: Optional[str]
) -> None:
    """Check the command-line inputs before any watching or uploading starts.

    Args:
        repo_id: Target repository, expected as ``username/repository``.
        folder_path: Local folder that will be monitored and uploaded.
        path_in_repo: Optional destination sub-path inside the repository.

    Raises:
        ValueError: If ``repo_id`` is not exactly two non-empty segments
            separated by a single ``/``, or if ``path_in_repo`` starts or
            ends with ``/``.
        FileNotFoundError: If ``folder_path`` is missing or is not a directory.
    """
    # A bare substring test would accept "user/", "/repo" or "a/b/c"; require
    # exactly one separator with a non-empty name on each side.
    namespace, separator, repo_name = repo_id.partition("/")
    if not (separator and namespace and repo_name) or "/" in repo_name:
        raise ValueError(
            "Invalid repo_id format. It should be in 'username/repository' format."
        )

    # is_dir() is already False for paths that do not exist.
    if not folder_path.is_dir():
        raise FileNotFoundError(
            f"The folder path '{folder_path}' does not exist or is not a directory."
        )

    if path_in_repo and (path_in_repo.startswith("/") or path_in_repo.endswith("/")):
        raise ValueError("path_in_repo should not start or end with '/'.")
def upload_to_huggingface(
    repo_id: str, folder_path: Path, path_in_repo: Optional[str]
) -> None:
    """Push the folder to the Hub, logging the outcome instead of raising.

    Files matching ``*.pt*`` are skipped. Any exception raised while building
    the commit message or uploading is logged as an error and swallowed, so a
    failed upload does not propagate to the caller.

    Args:
        repo_id: Target repository on the Hub.
        folder_path: Local folder to upload.
        path_in_repo: Optional destination sub-path inside the repository.
    """
    try:
        message = f"Automated upload: directory change @ {get_timestamp()}"
        upload_folder(
            repo_id=repo_id,
            folder_path=str(folder_path),
            path_in_repo=path_in_repo,
            ignore_patterns="*.pt*",
            commit_message=message,
        )
    except Exception as e:
        logging.error(f"An error occurred during upload: {e}")
        return
    logging.info("Upload completed successfully.")
class ChangeHandler(PatternMatchingEventHandler):
    """Watchdog handler that uploads the watched folder when it changes.

    Uploads are rate-limited: an event triggers an upload only if more than
    ``delay`` seconds have passed since the previous upload was started.
    Events whose path contains ``exclude_substring`` never trigger an upload.
    """

    # Event types that only signal a file being read, not changed. The upload
    # itself reads the watched files, so reacting to these would let each
    # upload trigger the next one.
    # NOTE(review): names follow watchdog's "opened" / "closed_no_write" event
    # types; confirm against the installed watchdog version.
    _READ_ONLY_EVENT_TYPES = frozenset({"opened", "closed_no_write"})

    def __init__(
        self,
        repo_id: str,
        folder_path: Path,
        path_in_repo: Optional[str],
        exclude_substring: str,
        delay: float = 15.0,
    ) -> None:
        """Store the upload target and configure path filtering.

        Args:
            repo_id: Target repository on the Hub.
            folder_path: Local folder that gets uploaded.
            path_in_repo: Optional destination sub-path inside the repository.
            exclude_substring: Paths containing this text are ignored; an
                empty string disables the filter.
            delay: Minimum number of seconds between two uploads.
        """
        ignore_patterns = [f"*{exclude_substring}*"] if exclude_substring else None
        super().__init__(ignore_patterns=ignore_patterns)
        self.repo_id = repo_id
        self.folder_path = folder_path
        self.path_in_repo = path_in_repo
        self.exclude_substring = exclude_substring
        self.delay = delay
        # Epoch start, so the very first event is always allowed through.
        self.last_upload_time = 0.0

    def should_upload(self, event_path: str) -> bool:
        """Return True if an event at ``event_path`` may trigger an upload now.

        The path must not contain the excluded substring, and more than
        ``delay`` seconds must have elapsed since the last upload started.
        """
        if self.exclude_substring and self.exclude_substring in event_path:
            return False
        return time.time() - self.last_upload_time > self.delay

    def on_any_event(self, event) -> None:
        """Upload the folder in response to a change event, if allowed."""
        if getattr(event, "event_type", None) in self._READ_ONLY_EVENT_TYPES:
            return
        if self.should_upload(event.src_path):
            # Stamp before uploading so the rate limit counts from the start.
            self.last_upload_time = time.time()
            upload_to_huggingface(self.repo_id, self.folder_path, self.path_in_repo)
def main() -> None:
    """Parse CLI arguments, then watch a folder and upload it on changes.

    Runs until interrupted with Ctrl+C or until the observer thread stops.
    The observer is always stopped and joined on the way out, including when
    an unexpected exception is raised.

    Raises:
        ValueError: If ``repo_id`` or ``path_in_repo`` fails validation.
        FileNotFoundError: If the folder to monitor is not a directory.
    """
    parser = argparse.ArgumentParser(
        description="Monitor a folder and upload to Hugging Face Hub on changes."
    )
    parser.add_argument(
        "repo_id",
        type=str,
        help="Repository ID on Hugging Face (e.g., 'username/repo_name')",
    )
    parser.add_argument(
        "folder_path", type=Path, help="Path to the folder to be monitored"
    )
    parser.add_argument(
        "-p",
        "--path_in_repo",
        type=str,
        default=None,
        help="Path in the repository where the folder will be uploaded (default: None)",
    )
    parser.add_argument(
        "-ex",
        "--exclude-substring",
        type=str,
        default="",
        help="Substring to exclude files/directories from triggering uploads (default: '')",
    )
    parser.add_argument(
        "-f",
        "--check_freq",
        type=int,
        default=30,
        # The value is passed to the handler as its delay between uploads,
        # not as a polling interval.
        help="Minimum time (in seconds) between uploads (default: 30)",
    )
    args = parser.parse_args()

    validate_inputs(args.repo_id, args.folder_path, args.path_in_repo)

    logging.basicConfig(
        level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
    )

    event_handler = ChangeHandler(
        args.repo_id,
        args.folder_path,
        args.path_in_repo,
        args.exclude_substring,
        delay=args.check_freq,
    )
    observer = Observer()
    observer.schedule(event_handler, path=str(args.folder_path), recursive=True)
    observer.start()
    logging.info(f"Monitoring folder:\t{args.folder_path}")

    try:
        # The observer works in a separate thread; wait on it in short slices
        # so Ctrl+C is noticed and a dead observer ends the loop instead of
        # leaving the process idle forever.
        while observer.is_alive():
            observer.join(timeout=1)
        logging.error("Observer thread stopped unexpectedly.")
    except KeyboardInterrupt:
        pass
    finally:
        observer.stop()
        observer.join()
    logging.info("Stopping monitoring")


if __name__ == "__main__":
    main()
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
Upload a folder to the Hugging Face Hub.
Requires: pip install huggingface-hub
""" | |
import argparse | |
import logging | |
import sys | |
from pathlib import Path | |
from huggingface_hub import upload_folder | |
# Configure the root logger at import time: INFO level, with each record
# rendered as "<timestamp> - <level> - <message>".
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)
def validate_inputs(repo_id, folder_path, path_in_repo):
    """Check the command-line inputs before the upload is attempted.

    Args:
        repo_id: Target repository string, expected as ``username/repository``.
        folder_path: ``pathlib.Path`` of the local folder to upload.
        path_in_repo: Optional destination sub-path inside the repository,
            or ``None``.

    Raises:
        ValueError: If ``repo_id`` is not exactly two non-empty segments
            separated by a single ``/``, or if ``path_in_repo`` starts or
            ends with ``/``.
        FileNotFoundError: If ``folder_path`` is missing or is not a directory.
    """
    # Validate repo_id format: a plain "/" membership test would let through
    # "user/", "/repo" and "a/b/c", so check the individual segments.
    segments = repo_id.split("/")
    if len(segments) != 2 or not all(segments):
        raise ValueError(
            "Invalid repo_id format. It should be in 'username/repository' format."
        )

    # Validate folder_path (is_dir() is already False for missing paths)
    if not folder_path.is_dir():
        raise FileNotFoundError(
            f"The folder path '{folder_path}' does not exist or is not a directory."
        )

    # Validate path_in_repo if provided
    if path_in_repo and (path_in_repo.startswith("/") or path_in_repo.endswith("/")):
        raise ValueError("path_in_repo should not start or end with '/'.")
def main():
    """Parse CLI arguments, validate them, and upload the folder once.

    Files matching ``*.pt*`` are excluded from the upload. Any validation or
    upload failure is logged and the process exits with status 1.
    """
    parser = argparse.ArgumentParser(description="Upload a folder to Hugging Face Hub.")
    parser.add_argument(
        "repo_id",
        type=str,
        help="The repository ID on Hugging Face (e.g., 'username/repo_name')",
    )
    parser.add_argument(
        "folder_path",
        type=Path,
        help="Path to the folder to be uploaded",
    )
    parser.add_argument(
        "--path_in_repo",
        type=str,
        default=None,
        help="Path in the repository where the folder will be uploaded (defaults to None)",
    )
    args = parser.parse_args()

    try:
        # Validate inside the handler so bad arguments are reported the same
        # way as upload failures (logged, exit status 1) and not as a raw
        # traceback.
        validate_inputs(args.repo_id, args.folder_path, args.path_in_repo)

        folder_path_str = str(args.folder_path.resolve())
        logging.info(
            f"Starting upload of folder {folder_path_str} to repo {args.repo_id}"
        )
        upload_folder(
            repo_id=args.repo_id,
            folder_path=folder_path_str,
            path_in_repo=args.path_in_repo,
            ignore_patterns="*.pt*",  # ignore optimizers etc
            commit_message="manual upload with upload_folder.py",
        )
        logging.info("Upload completed successfully.")
    except Exception as e:
        logging.error(f"An error occurred: {e}")
        sys.exit(1)

    logging.info(f"Done! pushed to:\t{args.repo_id}")


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment