Disgusting SEC Filing ripper
import os
from datetime import datetime
from time import sleep

import requests
import feedparser
from bs4 import BeautifulSoup
from tqdm import tqdm

# Third-party dependencies:
# python3 -m pip install requests tqdm feedparser beautifulsoup4

def fancy_message(tag, body):
    tags = [
        ("FATAL", "☠️", "\033[91m"),  # Red color for FATAL
        ("WARN", "🚨", "\033[93m"),   # Yellow color for WARN
        ("INFO", "ℹ️", "\033[94m"),   # Blue color for INFO
        ("WAIT", "☕️", "\033[96m"),   # Cyan color for WAIT
    ]
    matching_tags = [x for x in tags if x[0] == tag.upper()]
    if matching_tags:
        tag_text, emoji, color_code = matching_tags[0]
        print(f'{color_code}{emoji} {tag_text}: {body}\033[0m')  # Reset color after the text
    else:
        print(f'Unknown tag: {tag}')

def print_current_datetime():
    now = datetime.now()
    date_string = now.strftime("%B %d, %Y")
    time_string = now.strftime("%I:%M %p")
    return f'{date_string} {time_string}'
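
# Example output of the two helpers above (illustrative values, not from the gist):
#   fancy_message('warn', 'rate limited')   # prints "🚨 WARN: rate limited" in yellow
#   print_current_datetime()                # returns e.g. "June 18, 2023 01:41 PM"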

while True:
    # Fetch the EDGAR current-filings feed
    user_agent = "<company> <email>"
    feedparser.USER_AGENT = user_agent
    rss_feed_url = "https://www.sec.gov/cgi-bin/browse-edgar?action=getcurrent&CIK=&type=&company=&dateb=&owner=include&start=0&count=1000&output=atom"
    feed = feedparser.parse(rss_feed_url, request_headers={"User-Agent": user_agent})

    filings = []
    # Collect the URL, title, and timestamp of each entry in the feed
    for entry in feed.entries:
        filings.append({
            "url": entry.link,
            "title": entry.title,
            "updated": entry.updated,
            "docs": [],
        })

    headers = {
        "User-Agent": user_agent,
        "Accept-Encoding": "gzip, deflate",
        "Host": "www.sec.gov",
    }

    # Scrape each filing's index page for links to its documents
    progress_bar = tqdm(filings, desc="Processing filings")
    for filing in progress_bar:
        sleep(.5)
        response = requests.get(filing["url"], headers=headers)
        # Check if the request was successful
        if response.status_code == 200:
            # Parse the HTML content
            soup = BeautifulSoup(response.content, "html.parser")
            # Find all <a> tags with href attributes
            for a_tag in soup.find_all("a", href=True):
                href = a_tag["href"]
                # Keep hrefs with an htm, xml, pdf, jpg, or png extension
                if href.endswith((".htm", ".xml", ".pdf", ".jpg", ".png")) and '/Archives/edgar/data/' in href:
                    filing["docs"].append(href)
        # Update the progress bar
        progress_bar.set_postfix({"Docs Found": len(filing["docs"])})

    folder = 'sec_filings'
    if not os.path.exists(folder):
        os.makedirs(folder)

    # Download every document collected above
    progress_bar = tqdm(filings, desc="Downloading filings")
    for item in progress_bar:
        docs = item['docs']
        title = item['title']
        for doc in docs:
            url = 'https://www.sec.gov' + doc
            file_name = title.replace("/", "_") + '_' + doc.split('/')[-1]
            file_path = os.path.join(folder, file_name)
            response = requests.get(url, stream=True, headers=headers)
            response.raise_for_status()
            block_size = 1024  # 1KB chunks
            with open(file_path, 'wb') as file:
                for data in response.iter_content(block_size):
                    file.write(data)
        progress_bar.set_postfix({"Attachments found": len(docs)})
    progress_bar.close()

    fancy_message('info', f'Downloads completed @ {print_current_datetime()}')
    fancy_message('wait', 'sleeping for 2 minutes')
    sleep(120)
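
One fragile spot: raise_for_status() inside the download loop aborts the entire polling loop on the first bad response. A minimal hardening sketch, assuming you would rather log and skip a failed document; download_doc is a hypothetical helper introduced here, not part of the gist, and it reuses fancy_message from above:

def download_doc(url, file_path, headers, block_size=1024):
    # Hypothetical helper: fetch one document, return True on success.
    # On any network/HTTP error, warn and skip instead of crashing the loop.
    try:
        response = requests.get(url, stream=True, headers=headers, timeout=30)
        response.raise_for_status()
        with open(file_path, 'wb') as f:
            for chunk in response.iter_content(block_size):
                f.write(chunk)
        return True
    except requests.RequestException as exc:
        fancy_message('warn', f'skipping {url}: {exc}')
        return False

The inner loop would then call download_doc(url, file_path, headers) in place of the bare requests.get / raise_for_status block, so one dead link cannot kill a run that is meant to poll forever.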