Created August 16, 2021 15:46
-
-
Save sabetAI/eded59f01ef6618d8eb9885c48f501ee to your computer and use it in GitHub Desktop.
Scraping Files for Upload to Codex
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
from argparse import ArgumentParser | |
import openai | |
import jsonlines | |
# Configure the OpenAI client from the environment. This runs at import time,
# so a missing OPENAI_API_KEY variable raises KeyError before anything else.
openai.api_key = os.environ['OPENAI_API_KEY']
def get_all_files(dir, ext):
    """Recursively collect paths of files under *dir* whose names end with *ext*.

    Args:
        dir: Root directory to walk. The name shadows the builtin ``dir`` but
            is kept so existing keyword callers keep working.
        ext: Filename suffix to match (for example ``".ts"``). It is passed
            straight to ``str.endswith``, so a tuple of suffixes also works.

    Returns:
        A list of paths, each formed by joining the walked directory with the
        matching file name. Directories are visited, and files listed, in
        sorted order so the result is reproducible across runs and platforms.
        A root that ``os.walk`` cannot list yields an empty list.
    """
    all_files = []
    for root, dirs, files in os.walk(dir):
        # Sorting ``dirs`` in place steers the top-down walk, which would
        # otherwise follow whatever order the filesystem reports.
        dirs.sort()
        all_files.extend(
            os.path.join(root, name)
            for name in sorted(files)
            if name.endswith(ext)
        )
    return all_files
if __name__ == "__main__":
    # Script entry point: gather every file under <dir> whose name ends with
    # <ext>, dump the contents to <name>.jsonl, then upload that file to OpenAI.
    parser = ArgumentParser(description="Scrape all typescript files in all subdirectories")
    parser.add_argument("dir", help="directory to search for typescript files")
    parser.add_argument("name", help="reference name for codex upload file")
    parser.add_argument("ext", help="filename extension to scrape")
    args = parser.parse_args()

    # Single source of truth for the output path; it is written and then re-read.
    output_path = f'{args.name}.jsonl'

    # Build one record per file: its full text plus where it came from.
    jsons = []
    for file in get_all_files(args.dir, args.ext):
        # Read source files as UTF-8 explicitly instead of relying on the
        # platform's default encoding.
        with open(file, encoding="utf-8") as f:
            content = f.read()
        jsons.append({
            'text': content,
            'metadata': {'path': file, 'filename': os.path.basename(file)},
        })

    # Write the list of dicts to the jsonlines file, one record per line.
    with jsonlines.open(output_path, mode='w') as writer:
        writer.write_all(jsons)

    # Upload the jsonl file; the context manager guarantees the handle is
    # closed even if the API call raises.
    with open(output_path, encoding="utf-8") as upload:
        openai.File.create(file=upload, purpose='answers')
Sign up for free to join this conversation on GitHub.
Already have an account? Sign in to comment.