Skip to content

Instantly share code, notes, and snippets.

@ayoubzulfiqar
Last active June 2, 2025 10:19
Show Gist options
  • Save ayoubzulfiqar/75278f934d861bc491c55c57ed2d4482 to your computer and use it in GitHub Desktop.
Save ayoubzulfiqar/75278f934d861bc491c55c57ed2d4482 to your computer and use it in GitHub Desktop.
SEC:Extraction
import html
import json
import re
from typing import Any, Optional

from bs4 import BeautifulSoup
# Path of the input file that Extractor10K() opens and parses.
path = "file.xml"
def tagValue(soup: BeautifulSoup, deiTag: str) -> Optional[str]:
    """Return the stripped text of the first ``ix:nonNumeric`` fact named *deiTag*.

    Args:
        soup: Parsed document to search.
        deiTag: Value of the fact's ``name`` attribute,
            e.g. ``"dei:DocumentType"``.

    Returns:
        The element's text with surrounding whitespace removed, or ``None``
        when no matching element exists.
    """
    tag = soup.find("ix:nonNumeric", {"name": deiTag})
    if tag is None:
        return None
    return tag.text.strip()
def xbrliContext(soup: BeautifulSoup) -> dict[str, dict[str, Any]]:
    """Index every ``xbrli:context`` element by its ``id`` attribute.

    Args:
        soup: Parsed document to search.

    Returns:
        A mapping of context id to a dict with the keys ``entity`` (text of
        ``xbrli:identifier`` or ``None``), ``start_date``, ``end_date`` and
        ``dimensions`` (a list of ``(dimension, member)`` tuples taken from
        the ``xbrldi:explicitMember`` elements of the context's segment).
        For an ``xbrli:instant`` period both dates hold the instant; dates
        that cannot be found are ``None``.
    """
    contexts = {}
    for ctx in soup.find_all("xbrli:context"):
        identifier = ctx.find("xbrli:identifier")
        entity = identifier.text if identifier is not None else None

        start = end = None
        period = ctx.find("xbrli:period")
        # A context without a period element yields empty dates instead of
        # raising AttributeError on the lookups below.
        if period is not None:
            instant = period.find("xbrli:instant")
            if instant is not None:
                start = end = instant.text
            else:
                start_tag = period.find("xbrli:startDate")
                end_tag = period.find("xbrli:endDate")
                start = start_tag.text if start_tag is not None else None
                end = end_tag.text if end_tag is not None else None

        dimensions = []
        segment = ctx.find("xbrli:segment")
        if segment is not None:
            dimensions = [
                (member.get("dimension"), member.text.strip())
                for member in segment.find_all("xbrldi:explicitMember")
            ]

        contexts[ctx.get("id")] = {
            "entity": entity,
            "start_date": start,
            "end_date": end,
            "dimensions": dimensions,
        }
    return contexts
def xbrliUnits(soup: BeautifulSoup) -> dict[str, str]:
    """Index every ``xbrli:unit`` element by its ``id`` attribute.

    Args:
        soup: Parsed document to search.

    Returns:
        A mapping of unit id to a label: the text of the unit's
        ``xbrli:measure`` for a simple unit, or
        ``"<numerator> per <denominator>"`` for an ``xbrli:divide`` unit.
        Units from which no label can be derived are omitted.
    """
    units = {}
    for unit in soup.find_all("xbrli:unit"):
        unit_id = unit.get("id")

        # Check for a divide element first: find() searches all descendants,
        # so looking up "xbrli:measure" on the unit itself would match the
        # numerator's measure and report a ratio unit as a simple one.
        divide = unit.find("xbrli:divide")
        if divide is not None:
            numerator = divide.find("xbrli:unitNumerator")
            denominator = divide.find("xbrli:unitDenominator")
            num = numerator.find("xbrli:measure") if numerator is not None else None
            denom = denominator.find("xbrli:measure") if denominator is not None else None
            if num is not None and denom is not None:
                units[unit_id] = f"{num.text} per {denom.text}"
            continue

        measure = unit.find("xbrli:measure")
        if measure is not None:
            units[unit_id] = measure.text
    return units
def factsWithContext(soup:BeautifulSoup, contexts: dict, units:dict):
    """Collect every non-empty ``ix:nonNumeric`` fact together with its context.

    Args:
        soup: Parsed document to search.
        contexts: Mapping of context id to context details; the keys
            ``start_date``, ``end_date`` and ``dimensions`` are read from it.
        units: Mapping of unit id to unit label.

    Returns:
        A list with one dict per fact whose stripped text is non-empty,
        holding the fact name, value, unit label, context id, and the dates
        and dimensions of the referenced context (``None`` when unknown).
    """
    collected = []
    for element in soup.find_all("ix:nonNumeric"):
        text = element.text.strip()
        if text:
            ctx_ref = element.get("contextRef")
            ctx_info = contexts.get(ctx_ref, {})
            collected.append({
                "fact_name": element.get("name"),
                "value": text,
                # unitRef is often missing on nonNumeric facts; the lookup
                # then simply yields None.
                "unit": units.get(element.get("unitRef")),
                "context_id": ctx_ref,
                "start_date": ctx_info.get("start_date"),
                "end_date": ctx_info.get("end_date"),
                "dimensions": ctx_info.get("dimensions"),
            })
    return collected
def companyMetaData(soup: BeautifulSoup) -> dict[str, Any]:
    """Gather company-level ``dei:`` facts from the document.

    Args:
        soup: Parsed document to search.

    Returns:
        A dict of company attributes plus a nested ``address`` dict. Every
        leaf value is whatever ``tagValue`` returns for the corresponding
        ``dei:`` tag.
    """
    return {
        "name": tagValue(soup, "dei:EntityRegistrantName"),
        "cik": tagValue(soup, "dei:EntityCentralIndexKey"),
        "fiscal_year_end": tagValue(soup, "dei:DocumentPeriodEndDate"),
        "fiscal_year_focus": tagValue(soup, "dei:DocumentFiscalYearFocus"),
        "fiscal_period_focus": tagValue(soup, "dei:DocumentFiscalPeriodFocus"),
        "trading_symbol": tagValue(soup, "dei:TradingSymbol"),
        "exchange_name": tagValue(soup, "dei:SecurityExchangeName"),
        "sic_code": tagValue(soup, "dei:EntityStandardIndustrialClassification"),
        "incorporation_state": tagValue(soup, "dei:EntityIncorporationStateCountryCode"),
        "address": {
            "street1": tagValue(soup, "dei:EntityAddressAddressLine1"),
            "street2": tagValue(soup, "dei:EntityAddressAddressLine2"),
            "city": tagValue(soup, "dei:EntityAddressCityOrTown"),
            "state": tagValue(soup, "dei:EntityAddressStateOrProvince"),
            "zip": tagValue(soup, "dei:EntityAddressPostalZipCode"),
            "country": tagValue(soup, "dei:EntityAddressCountry"),
        },
    }
def periodStartDate(soup, contexts):
    """Return the start date of the context referenced by the period-end fact.

    Finds the ``ix:nonNumeric`` fact named ``dei:DocumentPeriodEndDate`` and
    looks its ``contextRef`` up in *contexts*. Returns that context's
    ``start_date``, or ``None`` when the fact, its reference, or the context
    is missing.
    """
    anchor = soup.find("ix:nonNumeric", {"name": "dei:DocumentPeriodEndDate"})
    if anchor:
        ref = anchor.get("contextRef")
        if ref and ref in contexts:
            return contexts[ref].get("start_date")
    return None
def filingMetadata(soup: BeautifulSoup, contexts: dict) -> dict[str, Any]:
    """Gather filing-level facts from the document.

    Args:
        soup: Parsed document to search.
        contexts: Mapping of context id to context details, used to resolve
            the period start date.

    Returns:
        A dict with the keys ``document_type``, ``amendment_flag``,
        ``period_start_date`` and ``period_end_date``.
    """
    return {
        "document_type": tagValue(soup, "dei:DocumentType"),
        "amendment_flag": tagValue(soup, "dei:AmendmentFlag"),
        "period_start_date": periodStartDate(soup, contexts),
        "period_end_date": tagValue(soup, "dei:DocumentPeriodEndDate"),
    }
# Parse Table
def isTextBlock(fact: dict) -> bool:
    """Return True when the fact's ``fact_name`` contains ``"TextBlock"``.

    Args:
        fact: Fact dict; its ``fact_name`` entry may be absent or ``None``.

    Returns:
        ``True`` if ``"TextBlock"`` occurs in the name, otherwise ``False``.
    """
    # dict.get's default only covers a missing key; an explicit None value
    # would make the ``in`` test raise TypeError, so coalesce it to "".
    return "TextBlock" in (fact.get("fact_name") or "")
def parseTextBlockToTable(text: str):
    """Extract table rows from the text of a TextBlock fact.

    The text is first treated as HTML: if it contains a ``<table>`` with more
    than one non-empty row, those rows are returned. Otherwise each line of
    the raw text is split into cells on tabs, runs of two or more whitespace
    characters, or ``|``; lines yielding fewer than two cells are dropped.

    Args:
        text: Raw text (possibly HTML markup) of the fact.

    Returns:
        A list of rows, each a list of cell strings, or ``None`` when no
        rows could be extracted.
    """
    # A <table> element can only come from markup containing "<table", so
    # plain text skips the HTML parser entirely.
    if "<table" in text.lower():
        try:
            soup = BeautifulSoup(text, "lxml")
            table = soup.find("table")
            if table:
                rows = []
                for tr in table.find_all("tr"):
                    row = [cell.get_text(strip=True) for cell in tr.find_all(["td", "th"])]
                    if any(row):  # skip empty rows
                        rows.append(row)
                if len(rows) > 1:
                    return rows
        except ValueError as e:
            print(f"[HTML Table Parse Error] {e}")

    # Fallback: split into lines BEFORE splitting into cells. Applying a
    # "\s{2,}" separator to the whole text would also match the newlines and
    # merge every line into a single row.
    rows = []
    for line in text.split("\n"):
        cells = [cell.strip() for cell in re.split(r"\t|\s{2,}|\|", line)]
        cells = [cell for cell in cells if cell]
        if len(cells) >= 2:
            rows.append(cells)
    return rows if rows else None
def textBlockTable(facts):
    """Build table records from the TextBlock facts in *facts*.

    Args:
        facts: Iterable of fact dicts. Each TextBlock fact must carry a
            ``value`` entry; ``fact_name`` may be absent or ``None``.

    Returns:
        A list with one dict per TextBlock fact whose value could be parsed
        into rows, holding the fact name, context id, dates, dimensions and
        the parsed ``table_rows``.
    """
    tables = []
    for fact in facts:
        # fact_name is None for a fact that had no name; coalesce to "" so
        # the substring test cannot raise TypeError.
        if "TextBlock" not in (fact.get("fact_name") or ""):
            continue
        parsed_table = parseTextBlockToTable(fact["value"])
        if parsed_table:  # only include tables with rows
            tables.append({
                "fact_name": fact["fact_name"],
                "context_id": fact.get("context_id"),
                "start_date": fact.get("start_date"),
                "end_date": fact.get("end_date"),
                "dimensions": fact.get("dimensions", []),
                "table_rows": parsed_table,
            })
    return tables
def metadataSummary(soup: BeautifulSoup):
    """Run every extractor over *soup* and bundle the results.

    Prints the number of parsed tables, the tables as JSON, and the rows of
    the first table (if any) as a preview.

    Returns:
        A dict with the keys ``company_metadata``, ``filing_metadata``,
        ``units``, ``facts`` and ``tables``.
    """
    ctx_map = xbrliContext(soup)
    unit_map = xbrliUnits(soup)
    fact_list = factsWithContext(soup, ctx_map, unit_map)
    tables = textBlockTable(fact_list)

    print(f"Parsed tables: {len(tables)}")
    print(json.dumps(tables))
    # Preview only the first table, when there is one.
    if tables:
        table = tables[0]
        print(f"\n=== {table['fact_name']} ===")
        for row in table['table_rows']:
            print("Total Rows:",row)

    company = companyMetaData(soup)
    filing = filingMetadata(soup, ctx_map)
    return {
        "company_metadata": company,
        "filing_metadata": filing,
        "units": unit_map,
        "facts": fact_list,
        "tables": tables,
    }
def Extractor10K(file_path: Optional[str] = None) -> dict:
    """Parse a document, print its extracted metadata as JSON, and return it.

    Args:
        file_path: File to read as UTF-8 text. When omitted, the module-level
            ``path`` is used, so existing no-argument calls keep working.

    Returns:
        The dictionary produced by ``metadataSummary``.
    """
    source = path if file_path is None else file_path
    with open(source, "r", encoding="utf-8") as f:
        soup: BeautifulSoup = BeautifulSoup(f.read(), "xml")
    result = metadataSummary(soup)
    print(json.dumps(result, ensure_ascii=False))
    return result
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment