ayoubzulfiqar · June 2, 2025 10:19
diff --git a/sec.py b/sec.py
 import html
 import json
 import re

 from bs4 import BeautifulSoup

 # File that Extractor10K opens (as UTF-8 text) and hands to the XML parser.
 path = "file.xml"




 def tagValue(soup: BeautifulSoup, deiTag:str)-> "str | None":
    """Return the stripped text of the first ``ix:nonNumeric`` element named *deiTag*.

    Args:
        soup: Parsed document to search.
        deiTag: Value the element's ``name`` attribute must have,
            e.g. ``"dei:DocumentType"``.

    Returns:
        The element's text with surrounding whitespace removed, or ``None``
        when no such element exists.
    """
    tag = soup.find("ix:nonNumeric", {"name": deiTag})
    if tag is None:
        return None
    return tag.text.strip()


 def xbrliContext(soup: BeautifulSoup):
    """Index every ``xbrli:context`` element by its ``id`` attribute.

    Args:
        soup: Parsed document to search.

    Returns:
        A dict mapping each context id to a dict with the keys ``entity``
        (text of ``xbrli:identifier`` or ``None``), ``start_date`` and
        ``end_date`` (both set to the ``xbrli:instant`` text when one exists,
        otherwise the ``xbrli:startDate`` / ``xbrli:endDate`` text or
        ``None``) and ``dimensions`` (a list of ``(dimension, member)``
        tuples taken from ``xbrldi:explicitMember`` elements in the segment).
    """
    def _childText(parent, name):
        # Text of the first descendant called *name*; None when either the
        # parent or that descendant is missing.
        node = parent.find(name) if parent is not None else None
        return node.text if node is not None else None

    contexts = {}
    for ctx in soup.find_all("xbrli:context"):
        # A context without xbrli:period yields None dates instead of raising
        # AttributeError on ``None.find``.
        period = ctx.find("xbrli:period")
        instant = _childText(period, "xbrli:instant")
        if instant is not None:
            start = end = instant
        else:
            start = _childText(period, "xbrli:startDate")
            end = _childText(period, "xbrli:endDate")

        dimensions = []
        segment = ctx.find("xbrli:segment")
        if segment is not None:
            dimensions = [
                (member.get("dimension"), member.text.strip())
                for member in segment.find_all("xbrldi:explicitMember")
            ]

        contexts[ctx.get("id")] = {
            "entity": _childText(ctx, "xbrli:identifier"),
            "start_date": start,
            "end_date": end,
            "dimensions": dimensions
        }
    return contexts
    


 def xbrliUnits(soup: BeautifulSoup):
    """Map every ``xbrli:unit`` id to a readable measure string.

    A ratio unit (one containing ``xbrli:divide``) becomes
    ``"<numerator> per <denominator>"``; any other unit becomes the text of
    its first ``xbrli:measure``.

    Args:
        soup: Parsed document to search.

    Returns:
        A dict of unit id to measure string. Units with no measure, or with a
        divide that lacks a numerator or denominator measure, are left out.
    """
    units = {}
    for unit in soup.find_all("xbrli:unit"):
        unitID = unit.get("id")

        # The divide must be checked first: find() searches all descendants,
        # so looking for xbrli:measure first would match the numerator's
        # measure inside the divide and the ratio would never be built.
        divide = unit.find("xbrli:divide")
        if divide is not None:
            numerator = divide.find("xbrli:unitNumerator")
            denominator = divide.find("xbrli:unitDenominator")
            num = numerator.find("xbrli:measure") if numerator is not None else None
            denom = denominator.find("xbrli:measure") if denominator is not None else None
            if num is not None and denom is not None:
                units[unitID] = f"{num.text} per {denom.text}"
            continue

        measure = unit.find("xbrli:measure")
        if measure is not None:
            units[unitID] = measure.text
    return units


 def factsWithContext(soup:BeautifulSoup, contexts: dict, units:dict):
    """Collect every non-empty ``ix:nonNumeric`` fact with its context data.

    Args:
        soup: Parsed document to search.
        contexts: Mapping of context id to a dict that may carry
            ``start_date``, ``end_date`` and ``dimensions``.
        units: Mapping of unit id to a unit description.

    Returns:
        A list of dicts with the keys ``fact_name``, ``value``, ``unit``,
        ``context_id``, ``start_date``, ``end_date`` and ``dimensions``.
        Facts whose stripped text is empty are skipped; an unknown context or
        unit id yields ``None`` for the fields derived from it.
    """
    collected = []
    for node in soup.find_all("ix:nonNumeric"):
        text = node.text.strip()
        if text:
            ref = node.get("contextRef")
            ctx = contexts.get(ref, {})
            collected.append({
                "fact_name": node.get("name"),
                "value": text,
                # unitRef is usually absent on nonNumeric facts; the lookup
                # then simply returns None.
                "unit": units.get(node.get("unitRef"), None),
                "context_id": ref,
                "start_date": ctx.get("start_date"),
                "end_date": ctx.get("end_date"),
                "dimensions": ctx.get("dimensions")
            })
    return collected

 def companyMetaData(soup:BeautifulSoup)-> dict[str, any]:
    """Gather the company-level ``dei:`` facts into one nested dict.

    Each value is whatever ``tagValue`` returns for the corresponding tag.

    NOTE(review): ``fiscal_year_end`` is read from
    ``dei:DocumentPeriodEndDate`` - confirm that this is the intended tag.

    Args:
        soup: Parsed document to search.

    Returns:
        A dict of the top-level fields plus an ``address`` sub-dict.
    """
    topLevelTags = (
        ("name", "dei:EntityRegistrantName"),
        ("cik", "dei:EntityCentralIndexKey"),
        ("fiscal_year_end", "dei:DocumentPeriodEndDate"),
        ("fiscal_year_focus", "dei:DocumentFiscalYearFocus"),
        ("fiscal_period_focus", "dei:DocumentFiscalPeriodFocus"),
        ("trading_symbol", "dei:TradingSymbol"),
        ("exchange_name", "dei:SecurityExchangeName"),
        ("sic_code", "dei:EntityStandardIndustrialClassification"),
        ("incorporation_state", "dei:EntityIncorporationStateCountryCode"),
    )
    addressTags = (
        ("street1", "dei:EntityAddressAddressLine1"),
        ("street2", "dei:EntityAddressAddressLine2"),
        ("city", "dei:EntityAddressCityOrTown"),
        ("state", "dei:EntityAddressStateOrProvince"),
        ("zip", "dei:EntityAddressPostalZipCode"),
        ("country", "dei:EntityAddressCountry"),
    )
    comMetaData: dict[str, any] = {
        key: tagValue(soup, tag) for key, tag in topLevelTags
    }
    comMetaData["address"] = {
        key: tagValue(soup, tag) for key, tag in addressTags
    }
    return comMetaData
 def periodStartDate(soup, contexts):
    """Return the start date of the context used by ``dei:DocumentPeriodEndDate``.

    Args:
        soup: Parsed document to search.
        contexts: Mapping of context id to a dict that may hold ``start_date``.

    Returns:
        The ``start_date`` entry of the referenced context, or ``None`` when
        the fact is missing, has no ``contextRef``, or references an unknown
        context.
    """
    tag = soup.find("ix:nonNumeric", {"name": "dei:DocumentPeriodEndDate"})
    contextID = tag.get("contextRef") if tag else None
    if not contextID or contextID not in contexts:
        return None
    return contexts[contextID].get("start_date")


 def filingMetadata(soup: BeautifulSoup, contexts: dict)-> dict[str, any]:
    """Assemble the filing-level fields of the document.

    Args:
        soup: Parsed document to search.
        contexts: Context mapping passed through to ``periodStartDate``.

    Returns:
        A dict with ``document_type``, ``amendment_flag``,
        ``period_start_date`` and ``period_end_date``.
    """
    documentType = tagValue(soup, "dei:DocumentType")
    amendmentFlag = tagValue(soup, "dei:AmendmentFlag")
    startDate = periodStartDate(soup, contexts)
    endDate = tagValue(soup, "dei:DocumentPeriodEndDate")
    return {
        "document_type": documentType,
        "amendment_flag": amendmentFlag,
        "period_start_date": startDate,
        "period_end_date": endDate
    }


 # Parse Table

 def isTextBlock(fact:dict)-> bool:
    """Tell whether *fact* is a text-block fact.

    Args:
        fact: Fact dict; its ``fact_name`` entry is inspected.

    Returns:
        ``True`` when ``fact_name`` contains ``"TextBlock"``; ``False``
        otherwise, including when the entry is missing or ``None``.
    """
    # ``or ""`` also covers a key that is present with a None value, which a
    # ``.get(key, "")`` default does not; ``"TextBlock" in None`` would raise.
    return "TextBlock" in (fact.get("fact_name") or "")
 def parseTextBlockToTable(text: str):
    """Turn the content of a text-block fact into a list of table rows.

    The first ``<table>`` in *text* is used when it yields more than one
    non-empty row. Otherwise *text* is read as plain text, one row per line,
    with cells separated by a tab, a run of two or more whitespace characters
    or a ``|``.

    Args:
        text: Raw value of the fact.

    Returns:
        A list of rows, each a list of cell strings, or ``None`` when no row
        with at least two cells was found.
    """
    # Try HTML table first
    try:
        soup = BeautifulSoup(text, "lxml")
        table = soup.find("table")
        if table:
            rows = []
            for tr in table.find_all("tr"):
                row = [cell.get_text(strip=True) for cell in tr.find_all(["td", "th"])]
                if any(row):  # skip empty rows
                    rows.append(row)
            if len(rows) > 1:
                return rows
    except ValueError as e:
        # Best effort: report the problem and still try the plain-text path.
        print(f"[HTML Table Parse Error] {e}")

    # Fallback: split into lines first, then into cells. Substituting
    # separators over the whole text (as before) let ``\s{2,}`` swallow the
    # space-padded newlines, so every line ended up in one single row.
    cellSeparator = re.compile(r'\t|\s{2,}|\|')
    rows = []
    for line in text.splitlines():
        cells = [cell.strip() for cell in cellSeparator.split(line) if cell.strip()]
        if len(cells) >= 2:
            rows.append(cells)

    return rows if rows else None
 def textBlockTable(facts):
    """Parse every text-block fact into a table record.

    Args:
        facts: Iterable of fact dicts carrying at least ``fact_name`` and
            ``value`` for text-block facts.

    Returns:
        A list of dicts with ``fact_name``, ``context_id``, ``start_date``,
        ``end_date``, ``dimensions`` (always a list) and ``table_rows``.
        Facts whose name does not contain ``"TextBlock"`` and facts for which
        no rows could be parsed are skipped.
    """
    tables = []
    for fact in facts:
        # fact_name can be present but None; treat that as "not a text
        # block" instead of raising TypeError on the ``in`` test.
        name = fact.get("fact_name") or ""
        if "TextBlock" not in name:
            continue

        parsed_table = parseTextBlockToTable(fact["value"])
        if not parsed_table:  # Only include tables with rows
            continue

        tables.append({
            "fact_name": name,
            "context_id": fact.get("context_id"),
            "start_date": fact.get("start_date"),
            "end_date": fact.get("end_date"),
            # A stored None is normalised to an empty list as well.
            "dimensions": fact.get("dimensions") or [],
            "table_rows": parsed_table
        })

    return tables

 def metadataSummary(soup: BeautifulSoup):
    """Run every extraction step on *soup* and bundle the results.

    Besides returning the summary, this prints the number of parsed tables,
    the tables as JSON and the rows of the first table to stdout.
    NOTE(review): those prints look like leftover debugging output - confirm
    whether callers rely on them.

    Args:
        soup: Parsed document to process.

    Returns:
        A dict with ``company_metadata``, ``filing_metadata``, ``units``,
        ``facts`` and ``tables``.
    """
    contexts = xbrliContext(soup)
    units = xbrliUnits(soup)
    facts = factsWithContext(soup, contexts, units)
    tables = textBlockTable(facts)

    print(f"Parsed tables: {len(tables)}")
    print(json.dumps(tables))
    if tables:
        first = tables[0]
        print(f"\n=== {first['fact_name']} ===")
        for row in first['table_rows']:
            print("Total Rows:",row)

    company = companyMetaData(soup)
    filing = filingMetadata(soup, contexts)
    return {
        "company_metadata": company,
        "filing_metadata": filing,
        "units": units,
        "facts": facts,
        "tables": tables,
    }


 def Extractor10K():
    """Parse the file named by the module-level ``path`` and print its summary.

    The file is read as UTF-8 text, parsed with the ``xml`` parser, passed to
    ``metadataSummary`` and the result is printed as JSON (non-ASCII
    characters are kept as-is).
    """
    with open(path, 'r', encoding="utf-8") as f:
        markup = f.read()
    soup:BeautifulSoup = BeautifulSoup(markup, 'xml')
    result = metadataSummary(soup)
    print(json.dumps(result, ensure_ascii=False))
	import html
	import json
	import re

	from bs4 import BeautifulSoup

	# File that Extractor10K opens (as UTF-8 text) and hands to the XML parser.
	path = "file.xml"




	def tagValue(soup: BeautifulSoup, deiTag:str)-> "str | None":
	    """Return the stripped text of the first ``ix:nonNumeric`` element named *deiTag*.

	    Args:
	        soup: Parsed document to search.
	        deiTag: Value the element's ``name`` attribute must have,
	            e.g. ``"dei:DocumentType"``.

	    Returns:
	        The element's text with surrounding whitespace removed, or ``None``
	        when no such element exists.
	    """
	    tag = soup.find("ix:nonNumeric", {"name": deiTag})
	    if tag is None:
	        return None
	    return tag.text.strip()


	def xbrliContext(soup: BeautifulSoup):
	    """Index every ``xbrli:context`` element by its ``id`` attribute.

	    Args:
	        soup: Parsed document to search.

	    Returns:
	        A dict mapping each context id to a dict with the keys ``entity``
	        (text of ``xbrli:identifier`` or ``None``), ``start_date`` and
	        ``end_date`` (both set to the ``xbrli:instant`` text when one exists,
	        otherwise the ``xbrli:startDate`` / ``xbrli:endDate`` text or
	        ``None``) and ``dimensions`` (a list of ``(dimension, member)``
	        tuples taken from ``xbrldi:explicitMember`` elements in the segment).
	    """
	    def _childText(parent, name):
	        # Text of the first descendant called *name*; None when either the
	        # parent or that descendant is missing.
	        node = parent.find(name) if parent is not None else None
	        return node.text if node is not None else None

	    contexts = {}
	    for ctx in soup.find_all("xbrli:context"):
	        # A context without xbrli:period yields None dates instead of raising
	        # AttributeError on ``None.find``.
	        period = ctx.find("xbrli:period")
	        instant = _childText(period, "xbrli:instant")
	        if instant is not None:
	            start = end = instant
	        else:
	            start = _childText(period, "xbrli:startDate")
	            end = _childText(period, "xbrli:endDate")

	        dimensions = []
	        segment = ctx.find("xbrli:segment")
	        if segment is not None:
	            dimensions = [
	                (member.get("dimension"), member.text.strip())
	                for member in segment.find_all("xbrldi:explicitMember")
	            ]

	        contexts[ctx.get("id")] = {
	            "entity": _childText(ctx, "xbrli:identifier"),
	            "start_date": start,
	            "end_date": end,
	            "dimensions": dimensions
	        }
	    return contexts



	def xbrliUnits(soup: BeautifulSoup):
	    """Map every ``xbrli:unit`` id to a readable measure string.

	    A ratio unit (one containing ``xbrli:divide``) becomes
	    ``"<numerator> per <denominator>"``; any other unit becomes the text of
	    its first ``xbrli:measure``.

	    Args:
	        soup: Parsed document to search.

	    Returns:
	        A dict of unit id to measure string. Units with no measure, or with a
	        divide that lacks a numerator or denominator measure, are left out.
	    """
	    units = {}
	    for unit in soup.find_all("xbrli:unit"):
	        unitID = unit.get("id")

	        # The divide must be checked first: find() searches all descendants,
	        # so looking for xbrli:measure first would match the numerator's
	        # measure inside the divide and the ratio would never be built.
	        divide = unit.find("xbrli:divide")
	        if divide is not None:
	            numerator = divide.find("xbrli:unitNumerator")
	            denominator = divide.find("xbrli:unitDenominator")
	            num = numerator.find("xbrli:measure") if numerator is not None else None
	            denom = denominator.find("xbrli:measure") if denominator is not None else None
	            if num is not None and denom is not None:
	                units[unitID] = f"{num.text} per {denom.text}"
	            continue

	        measure = unit.find("xbrli:measure")
	        if measure is not None:
	            units[unitID] = measure.text
	    return units


	def factsWithContext(soup:BeautifulSoup, contexts: dict, units:dict):
	    """Collect every non-empty ``ix:nonNumeric`` fact with its context data.

	    Args:
	        soup: Parsed document to search.
	        contexts: Mapping of context id to a dict that may carry
	            ``start_date``, ``end_date`` and ``dimensions``.
	        units: Mapping of unit id to a unit description.

	    Returns:
	        A list of dicts with the keys ``fact_name``, ``value``, ``unit``,
	        ``context_id``, ``start_date``, ``end_date`` and ``dimensions``.
	        Facts whose stripped text is empty are skipped; an unknown context or
	        unit id yields ``None`` for the fields derived from it.
	    """
	    facts = []
	    for fact in soup.find_all("ix:nonNumeric"):
	        value = fact.text.strip()
	        if not value:
	            continue

	        contextID = fact.get("contextRef")
	        context = contexts.get(contextID, {})
	        facts.append({
	            "fact_name": fact.get("name"),
	            "value": value,
	            # unitRef is usually absent on nonNumeric facts; the lookup
	            # then simply returns None.
	            "unit": units.get(fact.get("unitRef"), None),
	            "context_id": contextID,
	            "start_date": context.get("start_date"),
	            "end_date": context.get("end_date"),
	            "dimensions": context.get("dimensions")
	        })
	    return facts

	def companyMetaData(soup:BeautifulSoup)-> dict[str, any]:
	    """Gather the company-level ``dei:`` facts into one nested dict.

	    Each value is whatever ``tagValue`` returns for the corresponding tag.

	    NOTE(review): ``fiscal_year_end`` is read from
	    ``dei:DocumentPeriodEndDate`` - confirm that this is the intended tag.

	    Args:
	        soup: Parsed document to search.

	    Returns:
	        A dict of the top-level fields plus an ``address`` sub-dict.
	    """
	    comMetaData: dict[str, any] = {
	        "name": tagValue(soup, "dei:EntityRegistrantName"),
	        "cik": tagValue(soup, "dei:EntityCentralIndexKey"),
	        "fiscal_year_end": tagValue(soup, "dei:DocumentPeriodEndDate"),
	        "fiscal_year_focus": tagValue(soup, "dei:DocumentFiscalYearFocus"),
	        "fiscal_period_focus": tagValue(soup, "dei:DocumentFiscalPeriodFocus"),
	        "trading_symbol": tagValue(soup, "dei:TradingSymbol"),
	        "exchange_name": tagValue(soup, "dei:SecurityExchangeName"),
	        "sic_code": tagValue(soup, "dei:EntityStandardIndustrialClassification"),
	        "incorporation_state": tagValue(soup, "dei:EntityIncorporationStateCountryCode"),
	        "address": {
	            "street1": tagValue(soup, "dei:EntityAddressAddressLine1"),
	            "street2": tagValue(soup, "dei:EntityAddressAddressLine2"),
	            "city": tagValue(soup, "dei:EntityAddressCityOrTown"),
	            "state": tagValue(soup, "dei:EntityAddressStateOrProvince"),
	            "zip": tagValue(soup, "dei:EntityAddressPostalZipCode"),
	            "country": tagValue(soup, "dei:EntityAddressCountry")
	        }
	    }
	    return comMetaData
	def periodStartDate(soup, contexts):
	    """Return the start date of the context used by ``dei:DocumentPeriodEndDate``.

	    Args:
	        soup: Parsed document to search.
	        contexts: Mapping of context id to a dict that may hold ``start_date``.

	    Returns:
	        The ``start_date`` entry of the referenced context, or ``None`` when
	        the fact is missing, has no ``contextRef``, or references an unknown
	        context.
	    """
	    tag = soup.find("ix:nonNumeric", {"name": "dei:DocumentPeriodEndDate"})
	    if not tag:
	        return None
	    contextID = tag.get("contextRef")
	    if contextID and contextID in contexts:
	        return contexts[contextID].get("start_date")
	    return None


	def filingMetadata(soup: BeautifulSoup, contexts: dict)-> dict[str, any]:
	    """Assemble the filing-level fields of the document.

	    Args:
	        soup: Parsed document to search.
	        contexts: Context mapping passed through to ``periodStartDate``.

	    Returns:
	        A dict with ``document_type``, ``amendment_flag``,
	        ``period_start_date`` and ``period_end_date``.
	    """
	    fMetadata: dict[str, any] = {
	        "document_type": tagValue(soup, "dei:DocumentType"),
	        "amendment_flag": tagValue(soup, "dei:AmendmentFlag"),
	        "period_start_date": periodStartDate(soup, contexts),
	        "period_end_date": tagValue(soup, "dei:DocumentPeriodEndDate")
	    }
	    return fMetadata


	# Parse Table

	def isTextBlock(fact:dict)-> bool:
	    """Tell whether *fact* is a text-block fact.

	    Args:
	        fact: Fact dict; its ``fact_name`` entry is inspected.

	    Returns:
	        ``True`` when ``fact_name`` contains ``"TextBlock"``; ``False``
	        otherwise, including when the entry is missing or ``None``.
	    """
	    # ``or ""`` also covers a key that is present with a None value, which a
	    # ``.get(key, "")`` default does not; ``"TextBlock" in None`` would raise.
	    return "TextBlock" in (fact.get("fact_name") or "")
	def parseTextBlockToTable(text: str):
	    """Turn the content of a text-block fact into a list of table rows.

	    The first ``<table>`` in *text* is used when it yields more than one
	    non-empty row. Otherwise *text* is read as plain text, one row per line,
	    with cells separated by a tab, a run of two or more whitespace characters
	    or a ``|``.

	    Args:
	        text: Raw value of the fact.

	    Returns:
	        A list of rows, each a list of cell strings, or ``None`` when no row
	        with at least two cells was found.
	    """
	    # Try HTML table first
	    try:
	        soup = BeautifulSoup(text, "lxml")
	        table = soup.find("table")
	        if table:
	            rows = []
	            for tr in table.find_all("tr"):
	                row = [cell.get_text(strip=True) for cell in tr.find_all(["td", "th"])]
	                if any(row):  # skip empty rows
	                    rows.append(row)
	            if len(rows) > 1:
	                return rows
	    except ValueError as e:
	        # Best effort: report the problem and still try the plain-text path.
	        print(f"[HTML Table Parse Error] {e}")

	    # Fallback: split into lines first, then into cells. The alternation
	    # uses plain ``|`` (a backslash-escaped ``\|`` would only match a
	    # literal pipe), and working line by line keeps ``\s{2,}`` from
	    # swallowing the newlines that separate the rows.
	    cellSeparator = re.compile(r'\t|\s{2,}|\|')
	    rows = []
	    for line in text.splitlines():
	        cells = [cell.strip() for cell in cellSeparator.split(line) if cell.strip()]
	        if len(cells) >= 2:
	            rows.append(cells)

	    return rows if rows else None
	def textBlockTable(facts):
	    """Parse every text-block fact into a table record.

	    Args:
	        facts: Iterable of fact dicts carrying at least ``fact_name`` and
	            ``value`` for text-block facts.

	    Returns:
	        A list of dicts with ``fact_name``, ``context_id``, ``start_date``,
	        ``end_date``, ``dimensions`` (always a list) and ``table_rows``.
	        Facts whose name does not contain ``"TextBlock"`` and facts for which
	        no rows could be parsed are skipped.
	    """
	    tables = []
	    for fact in facts:
	        # fact_name can be present but None; treat that as "not a text
	        # block" instead of raising TypeError on the ``in`` test.
	        name = fact.get("fact_name") or ""
	        if "TextBlock" not in name:
	            continue

	        parsed_table = parseTextBlockToTable(fact["value"])
	        if not parsed_table:  # Only include tables with rows
	            continue

	        tables.append({
	            "fact_name": name,
	            "context_id": fact.get("context_id"),
	            "start_date": fact.get("start_date"),
	            "end_date": fact.get("end_date"),
	            # A stored None is normalised to an empty list as well.
	            "dimensions": fact.get("dimensions") or [],
	            "table_rows": parsed_table
	        })

	    return tables

	def metadataSummary(soup: BeautifulSoup):
	    """Run every extraction step on *soup* and bundle the results.

	    Besides returning the summary, this prints the number of parsed tables,
	    the tables as JSON and the rows of the first table to stdout.
	    NOTE(review): those prints look like leftover debugging output - confirm
	    whether callers rely on them.

	    Args:
	        soup: Parsed document to process.

	    Returns:
	        A dict with ``company_metadata``, ``filing_metadata``, ``units``,
	        ``facts`` and ``tables``.
	    """
	    contexts = xbrliContext(soup)
	    units = xbrliUnits(soup)
	    facts = factsWithContext(soup, contexts, units)

	    tables = textBlockTable(facts)
	    print(f"Parsed tables: {len(tables)}")
	    print(json.dumps(tables))
	    for table in tables[:1]:
	        print(f"\n=== {table['fact_name']} ===")
	        for row in table['table_rows']:
	            print("Total Rows:",row)
	    metadata = {
	        "company_metadata": companyMetaData(soup),
	        "filing_metadata": filingMetadata(soup, contexts),
	        "units": units,
	        "facts": facts,
	        "tables": tables,
	    }
	    return metadata


	def Extractor10K():
	    """Parse the file named by the module-level ``path`` and print its summary.

	    The file is read as UTF-8 text, parsed with the ``xml`` parser, passed to
	    ``metadataSummary`` and the result is printed as JSON (non-ASCII
	    characters are kept as-is).
	    """
	    with open(path, 'r', encoding="utf-8") as f:
	        soup:BeautifulSoup = BeautifulSoup(f.read(), 'xml')
	        result = metadataSummary(soup)
	        print(json.dumps(result, ensure_ascii=False))