Last active
June 2, 2025 10:19
-
-
Save ayoubzulfiqar/75278f934d861bc491c55c57ed2d4482 to your computer and use it in GitHub Desktop.
SEC:Extraction
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import html
import json
import re
from typing import Any, Optional

from bs4 import BeautifulSoup
# Default input document; Extractor10K() opens this file as UTF-8 text.
path = "file.xml"
def tagValue(soup: BeautifulSoup, deiTag: str) -> Optional[str]:
    """Return the stripped text of an ``ix:nonNumeric`` fact looked up by name.

    Args:
        soup: Parsed document to search.
        deiTag: Value the element's ``name`` attribute must have, e.g.
            ``"dei:DocumentType"``.

    Returns:
        The text of the element returned by ``soup.find`` with surrounding
        whitespace removed, or ``None`` when nothing matches.
    """
    tag = soup.find("ix:nonNumeric", {"name": deiTag})
    if tag is None:
        return None
    return tag.text.strip()
def xbrliContext(soup: BeautifulSoup):
    """Index every ``xbrli:context`` element of the document by its ``id``.

    Args:
        soup: Parsed document to search.

    Returns:
        A dict mapping each context id to a dict with the keys:

        * ``"entity"``: text of the ``xbrli:identifier`` element, or ``None``.
        * ``"start_date"`` / ``"end_date"``: for an ``xbrli:instant`` period
          both hold the instant's text; otherwise the text of
          ``xbrli:startDate`` / ``xbrli:endDate``. Either is ``None`` when the
          element (or the whole ``xbrli:period``) is absent.
        * ``"dimensions"``: list of ``(dimension, member)`` tuples, one per
          ``xbrldi:explicitMember`` found under ``xbrli:segment``.
    """
    contexts = {}
    for ctx in soup.find_all("xbrli:context"):
        identifier = ctx.find("xbrli:identifier")
        entity = identifier.text if identifier is not None else None

        # A context without a period used to raise AttributeError; report
        # unknown dates instead so one odd context cannot abort the whole run.
        start = end = None
        period = ctx.find("xbrli:period")
        if period is not None:
            instant = period.find("xbrli:instant")
            if instant is not None:
                start = end = instant.text
            else:
                start_tag = period.find("xbrli:startDate")
                end_tag = period.find("xbrli:endDate")
                start = start_tag.text if start_tag is not None else None
                end = end_tag.text if end_tag is not None else None

        dimensions = []
        segment = ctx.find("xbrli:segment")
        if segment is not None:
            dimensions = [
                (member.get("dimension"), member.text.strip())
                for member in segment.find_all("xbrldi:explicitMember")
            ]

        contexts[ctx.get("id")] = {
            "entity": entity,
            "start_date": start,
            "end_date": end,
            "dimensions": dimensions,
        }
    return contexts
def _measureText(container):
    """Return the text of the first ``xbrli:measure`` under *container*, or ``None``."""
    if container is None:
        return None
    measure = container.find("xbrli:measure")
    return measure.text if measure is not None else None


def xbrliUnits(soup: BeautifulSoup):
    """Map each ``xbrli:unit`` id to a readable unit label.

    Args:
        soup: Parsed document to search.

    Returns:
        A dict keyed by unit id. A simple unit maps to the text of its
        ``xbrli:measure``; a ratio unit (``xbrli:divide``) maps to
        ``"<numerator> per <denominator>"``. Units for which no label can be
        built (no measure, or a divide missing either side) are omitted.
    """
    units = {}
    for unit in soup.find_all("xbrli:unit"):
        unit_id = unit.get("id")
        # Check for a ratio first: ``find`` searches all descendants, so
        # looking for ``xbrli:measure`` on the unit itself would match the
        # numerator's measure and silently drop the denominator.
        divide = unit.find("xbrli:divide")
        if divide is None:
            label = _measureText(unit)
        else:
            num = _measureText(divide.find("xbrli:unitNumerator"))
            denom = _measureText(divide.find("xbrli:unitDenominator"))
            if num is None or denom is None:
                label = None
            else:
                label = f"{num} per {denom}"
        if label is not None:
            units[unit_id] = label
    return units
def factsWithContext(soup: BeautifulSoup, contexts: dict, units: dict):
    """Collect every non-empty ``ix:nonNumeric`` fact, joined to its context.

    Args:
        soup: Parsed document to search.
        contexts: Context lookup keyed by context id; each value is read for
            ``"start_date"``, ``"end_date"`` and ``"dimensions"``.
        units: Unit lookup keyed by unit id.

    Returns:
        A list of dicts with the keys ``fact_name``, ``value``, ``unit``,
        ``context_id``, ``start_date``, ``end_date`` and ``dimensions``.
        Facts whose stripped text is empty are left out; values that cannot
        be resolved are ``None``.
    """
    collected = []
    for node in soup.find_all("ix:nonNumeric"):
        text = node.text.strip()
        if text:
            context_ref = node.get("contextRef")
            # An unknown contextRef resolves to an empty dict, so the three
            # context-derived fields below simply come out as None.
            period_info = contexts.get(context_ref, {})
            collected.append({
                "fact_name": node.get("name"),
                "value": text,
                # unitRef is often missing on nonNumeric facts; kept for generality.
                "unit": units.get(node.get("unitRef")),
                "context_id": context_ref,
                "start_date": period_info.get("start_date"),
                "end_date": period_info.get("end_date"),
                "dimensions": period_info.get("dimensions"),
            })
    return collected
def companyMetaData(soup: BeautifulSoup) -> dict[str, Any]:
    """Gather the registrant-level cover-page values into one dict.

    Each value is looked up with ``tagValue`` and is therefore either the
    stripped text of the matching ``ix:nonNumeric`` element or ``None``.

    Args:
        soup: Parsed document to search.

    Returns:
        A dict of top-level fields plus a nested ``"address"`` dict.
    """
    # Output key -> ``name`` attribute of the fact that supplies it.
    top_level_tags = {
        "name": "dei:EntityRegistrantName",
        "cik": "dei:EntityCentralIndexKey",
        "fiscal_year_end": "dei:DocumentPeriodEndDate",
        "fiscal_year_focus": "dei:DocumentFiscalYearFocus",
        "fiscal_period_focus": "dei:DocumentFiscalPeriodFocus",
        "trading_symbol": "dei:TradingSymbol",
        "exchange_name": "dei:SecurityExchangeName",
        "sic_code": "dei:EntityStandardIndustrialClassification",
        "incorporation_state": "dei:EntityIncorporationStateCountryCode",
    }
    address_tags = {
        "street1": "dei:EntityAddressAddressLine1",
        "street2": "dei:EntityAddressAddressLine2",
        "city": "dei:EntityAddressCityOrTown",
        "state": "dei:EntityAddressStateOrProvince",
        "zip": "dei:EntityAddressPostalZipCode",
        "country": "dei:EntityAddressCountry",
    }

    comMetaData: dict[str, Any] = {
        key: tagValue(soup, tag) for key, tag in top_level_tags.items()
    }
    comMetaData["address"] = {
        key: tagValue(soup, tag) for key, tag in address_tags.items()
    }
    return comMetaData
def periodStartDate(soup, contexts):
    """Return the start date of the context that ``dei:DocumentPeriodEndDate`` refers to.

    Args:
        soup: Parsed document to search.
        contexts: Context lookup keyed by context id; the matching entry is
            read for ``"start_date"``.

    Returns:
        The ``"start_date"`` value of the referenced context, or ``None``
        when the fact is missing, has no ``contextRef``, or references a
        context id that is not in ``contexts``.
    """
    end_date_fact = soup.find("ix:nonNumeric", {"name": "dei:DocumentPeriodEndDate"})
    context_ref = end_date_fact.get("contextRef") if end_date_fact else None
    if not context_ref or context_ref not in contexts:
        return None
    return contexts[context_ref].get("start_date")
def filingMetadata(soup: BeautifulSoup, contexts: dict) -> dict[str, Any]:
    """Gather the filing-level values (type, amendment flag, period) into one dict.

    Args:
        soup: Parsed document to search.
        contexts: Context lookup keyed by context id, passed through to
            ``periodStartDate``.

    Returns:
        A dict with the keys ``document_type``, ``amendment_flag``,
        ``period_start_date`` and ``period_end_date``. The three values
        obtained through ``tagValue`` are stripped text or ``None``.
    """
    fMetadata: dict[str, Any] = {
        "document_type": tagValue(soup, "dei:DocumentType"),
        "amendment_flag": tagValue(soup, "dei:AmendmentFlag"),
        "period_start_date": periodStartDate(soup, contexts),
        "period_end_date": tagValue(soup, "dei:DocumentPeriodEndDate"),
    }
    return fMetadata
# Parse Table | |
def isTextBlock(fact: dict) -> bool:
    """Return True when the fact's ``"fact_name"`` contains ``"TextBlock"``.

    A missing or ``None`` name counts as "not a text block" rather than
    raising ``TypeError`` on the substring test.
    """
    return "TextBlock" in (fact.get("fact_name") or "")
def parseTextBlockToTable(text: str):
    """Turn the text of a TextBlock fact into a list of table rows.

    Two strategies are tried in order:

    1. If the text contains a ``<table`` tag, it is parsed as HTML and the
       first table's ``td``/``th`` cells are returned row by row. Rows whose
       cells are all empty are skipped, and the result is only used when it
       has more than one row.
    2. Otherwise (or when step 1 yields nothing usable) each line of the
       text is split into cells on tabs, pipes, or runs of two or more
       whitespace characters. Lines with fewer than two cells are dropped.

    Args:
        text: Raw fact value.

    Returns:
        A list of rows (each a list of cell strings), or ``None`` when no
        row could be extracted.
    """
    # Only hand the text to the HTML parser when it can contain a table.
    if "<table" in text.lower():
        try:
            soup = BeautifulSoup(text, "lxml")
            table = soup.find("table")
            if table:
                rows = []
                for tr in table.find_all("tr"):
                    row = [cell.get_text(strip=True) for cell in tr.find_all(["td", "th"])]
                    if any(row):  # skip empty rows
                        rows.append(row)
                if len(rows) > 1:
                    return rows
        except ValueError as e:
            print(f"[HTML Table Parse Error] {e}")

    # Fallback: split line by line. Splitting into lines must happen before
    # whitespace is interpreted as a column separator; otherwise the newline
    # itself is consumed as a separator and every line merges into one row.
    rows = []
    for line in text.splitlines():
        cells = [cell.strip() for cell in re.split(r"\s{2,}|\t|\|", line) if cell.strip()]
        if len(cells) >= 2:
            rows.append(cells)
    return rows or None
def textBlockTable(facts):
    """Build table records for every TextBlock fact that parses into rows.

    Args:
        facts: Iterable of fact dicts. A fact is considered when its
            ``"fact_name"`` contains ``"TextBlock"``; its ``"value"`` is then
            passed to ``parseTextBlockToTable``.

    Returns:
        A list of dicts with the keys ``fact_name``, ``context_id``,
        ``start_date``, ``end_date``, ``dimensions`` and ``table_rows``.
        Facts that yield no rows are left out.
    """
    tables = []
    for fact in facts:
        # ``fact_name`` may be present but None; treat that as "no name"
        # instead of failing the substring test with TypeError.
        fact_name = fact.get("fact_name") or ""
        if "TextBlock" not in fact_name:
            continue
        parsed_table = parseTextBlockToTable(fact["value"])
        if parsed_table:  # Only include tables with rows
            tables.append({
                "fact_name": fact["fact_name"],
                "context_id": fact.get("context_id"),
                "start_date": fact.get("start_date"),
                "end_date": fact.get("end_date"),
                "dimensions": fact.get("dimensions", []),
                "table_rows": parsed_table,
            })
    return tables
def metadataSummary(soup: BeautifulSoup):
    """Run the full extraction pipeline and bundle the results.

    Contexts, units, facts and TextBlock tables are extracted in that order.
    Along the way the table count, the tables as JSON, and the rows of the
    first table (if any) are printed to stdout.

    Args:
        soup: Parsed document to process.

    Returns:
        A dict with the keys ``company_metadata``, ``filing_metadata``,
        ``units``, ``facts`` and ``tables``.
    """
    contexts = xbrliContext(soup)
    units = xbrliUnits(soup)
    facts = factsWithContext(soup, contexts, units)
    tables = textBlockTable(facts)

    print(f"Parsed tables: {len(tables)}")
    print(json.dumps(tables))
    if tables:
        # Only the first table is echoed.
        first_table = tables[0]
        print(f"\n=== {first_table['fact_name']} ===")
        for row in first_table["table_rows"]:
            print("Total Rows:", row)

    company = companyMetaData(soup)
    filing = filingMetadata(soup, contexts)
    return {
        "company_metadata": company,
        "filing_metadata": filing,
        "units": units,
        "facts": facts,
        "tables": tables,
    }
def Extractor10K(file_path=None):
    """Parse a filing document and print the extracted metadata as JSON.

    Args:
        file_path: Document to read. When omitted, the module-level ``path``
            is used, so existing zero-argument calls behave as before.

    The file is read as UTF-8 text, parsed with BeautifulSoup's ``"xml"``
    parser, passed to ``metadataSummary``, and the result is printed with
    ``ensure_ascii=False``. Nothing is returned.
    """
    source = path if file_path is None else file_path
    with open(source, "r", encoding="utf-8") as f:
        soup = BeautifulSoup(f.read(), "xml")
    result = metadataSummary(soup)
    print(json.dumps(result, ensure_ascii=False))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment