# python 3.12

"""
Toy script.

Takes some input from a csv file on big American
mines and looks at Wikipedia text for some extra
context.
"""

import copy

import pprint

import sys

from urllib import request

import re

from bs4 import BeautifulSoup

def parsed_data(datafile: str) -> dict:
    """
    Get csv data into a dictionary keyed on mine name.

    The first row of the file supplies the column headers.
    Every following row becomes a dictionary mapping header to
    value, stored under the value of the row's first column.
    Leading/trailing whitespace is stripped from headers and
    values. Quoted fields may contain commas. Rows whose
    fields are all empty (e.g. blank lines) are skipped, and an
    empty file yields an empty dictionary.

    The resulting dictionary is pretty-printed to stdout
    before it is returned.
    """
    # Local import so this function does not depend on the
    # module-level import list.
    import csv

    retval = {}
    # newline='' lets the csv module do its own newline handling.
    with open(datafile, 'r', newline='') as f:
        # skipinitialspace lets a quoted field follow ', ' as well as ','.
        reader = csv.reader(f, skipinitialspace=True)
        headers = [x.strip() for x in next(reader, [])]
        for row in reader:
            vals = [x.strip() for x in row]
            if not any(vals):
                # Blank line: nothing to key a record on.
                continue
            retval[vals[0]] = dict(zip(headers, vals))
    pprint.pprint(retval)
    return retval
        
def data_with_wikipedia(parsed_data: dict) -> dict:
    """
    Connect to wikipedia sites and fill in
    page text.

    For every mine, the page at its 'wikipedia page' URL is
    fetched and parsed with BeautifulSoup. The page title is
    printed, and the page text with newlines removed is stored
    under 'wikipediatext'.

    Return a new dictionary; the input is deep-copied and
    left unmodified.
    """
    retval = copy.deepcopy(parsed_data)
    for record in retval.values():
        # The context manager closes the response even if
        # reading it fails, so connections are not leaked.
        with request.urlopen(record['wikipedia page']) as obj:
            html = obj.read()
        soup = BeautifulSoup(html, 'html.parser')
        print(soup.title)
        # Text from html and strip out newlines.
        record['wikipediatext'] = soup.get_text().replace('\n', '')
    return retval

def data_with_company(data_with_wikipedia: dict) -> dict:
    """
    Fetches company ownership for mine out of
    Wikipedia text dump.

    The company name is taken to start right after the first
    lower-case letter followed by 'Company', and to end at the
    next lower-case letter that is directly followed by an
    upper-case letter. If no such boundary follows, the rest
    of the text is used as the company name. When the
    'Company' label is not found, no 'company' key is set for
    that mine.

    Returns a new dictionary with the company name
    without the big wikipedia text dump. The input is
    deep-copied and left unmodified.
    """
    # Wikipedia setup for mine company name.
    COMPANYPAT = r'[a-z]Company'
    # Lower case followed by upper case heuristic.
    ENDCOMPANYPAT = r'[a-z][A-Z]'
    companypat = re.compile(COMPANYPAT)
    endcompanypat = re.compile(ENDCOMPANYPAT)
    retval = copy.deepcopy(data_with_wikipedia)
    for minex, record in retval.items():
        print(minex)
        # Get rid of big text dump in return value.
        text = record.pop('wikipediatext')
        match = companypat.search(text)
        if not match:
            continue
        print('Company match span = ', match.span())
        remainder = text[match.end():]
        match2 = endcompanypat.search(remainder)
        if match2:
            print('End Company match span = ', match2.span())
            # Keep the lower-case letter that ends the name.
            record['company'] = remainder[:match2.start() + 1]
        else:
            # No boundary after the name: it runs to the end of the text.
            record['company'] = remainder
    return retval

def info_output(data_with_company: dict) -> str:
    """
    Prints some output text to a file for each
    mine in the data_with_company dictionary.

    Each record must provide 'mine', 'commodity' and 'state'.
    The ownership sentence is written only for records that
    have a 'company' key; records without one get blank lines
    instead, so one missing company does not abort the report.

    Returns string filename of output.
    """
    INFOLINEFMT = 'The {mine:s} mine is a big {commodity:s} mine in the State of {state:s} in the US.'
    COMPANYLINEFMT = '\n    {company:s} owns the mine.\n\n'
    # Written in place of the ownership sentence when the company is unknown.
    NOCOMPANYLINE = '\n'
    retval = 'mine_info.txt'
    with open(retval, 'w') as f:
        for record in data_with_company.values():
            print(INFOLINEFMT.format(**record), file=f)
            if 'company' in record:
                print(COMPANYLINEFMT.format(**record), file=f)
            else:
                print(NOCOMPANYLINE, file=f)
    return retval

def commodity_word_counts(data_with_wikipedia: dict, data_with_company: dict) -> dict:
    """
    Return dictionary keyed on mine with counts of
    commodity (e.g., zinc etc.) mentions on Wikipedia
    page (excluding ones in the company name).

    A mention is the commodity as a whole word, with its
    first letter in either case. A mention is excluded when
    it lies inside a literal occurrence of the mine's company
    name in the text. Mines without a 'company' entry have
    nothing excluded, and an empty commodity counts as 0.
    """
    retval = {}
    # This will probably miss some occurrences at mashed together
    # word boundaries. It is a rough estimate.
    # '\b[Gg]old\b'
    commoditypatfmt = r'\b[{0:s}{1:s}]{2:s}\b'
    for minex, record in data_with_wikipedia.items():
        print(minex)
        commodity = record['commodity']
        text = record['wikipediatext']
        company = data_with_company[minex].get('company')
        if not commodity:
            # No commodity name means there is nothing to look for.
            retval[minex] = 0
            continue
        # Escape every piece so the commodity is matched literally.
        commoditypat = commoditypatfmt.format(re.escape(commodity[0].upper()),
                                              re.escape(commodity[0].lower()),
                                              re.escape(commodity[1:]))
        print(commoditypat)
        commodityspans = [m.span() for m in re.finditer(commoditypat, text)]
        print('Initial length of commoditymatches is {0:d}.'.format(len(commodityspans)))
        # The company name is plain text, not a regular expression.
        if company:
            companyspans = [m.span() for m in re.finditer(re.escape(company), text)]
        else:
            companyspans = []
        print('Length of companymatches is {0:d}.'.format(len(companyspans)))
        # Count only the mentions that are not part of a company name occurrence.
        retval[minex] = sum(
            1
            for start, end in commodityspans
            if not any(cstart <= start and end <= cend
                       for cstart, cend in companyspans)
        )
    return retval

def colloquial_company_word_counts(data_with_wikipedia: dict) -> dict:
    """
    Find the number of times the company you associate with
    the property/mine (very subjective) is within the
    text of the mine's wikipedia article.

    The 'colloquial association' value is counted as literal
    text (non-overlapping occurrences), so characters such as
    '.' or '(' in a company name are not read as regular
    expression syntax. An empty association counts as 0.

    Returns a dictionary keyed on mine with the counts.
    """
    retval = {}
    for minex, record in data_with_wikipedia.items():
        colloquial_pat = record['colloquial association']
        print(minex)
        if colloquial_pat:
            nummatches = record['wikipediatext'].count(colloquial_pat)
        else:
            # An empty name would otherwise "match" at every position.
            nummatches = 0
        print('{0:d} matches for colloquial association {1:s}.'.format(nummatches, colloquial_pat))
        retval[minex] = nummatches
    return retval

def info_dict_merged(data_with_company: dict,
                     commodity_word_counts: dict,
                     colloquial_company_word_counts: dict) -> dict:
    """
    Combine the per-mine records with both word counts.

    Works on a deep copy of data_with_company, so the input
    is left untouched. Every mine record gains two keys:
    'colloquial association count' and 'commodity word count',
    looked up by mine name in the respective count dictionary.
    """
    merged = copy.deepcopy(data_with_company)
    for name, record in merged.items():
        record['colloquial association count'] = colloquial_company_word_counts[name]
        record['commodity word count'] = commodity_word_counts[name]
    return merged

def wikipedia_report(info_dict_merged: dict) -> str:
    """
    Write the Wikipedia word counts to a text file as prose.

    For each mine two sentences are written: one for the
    colloquial association count and one for the commodity
    word count.

    Returns the string filename of the report.
    """
    outname = 'wikipedia_info.txt'
    association_template = 'The {0:s} mine has {1:d} occurrences of colloquial association {2:s} in its Wikipedia article text.\n'
    commodity_template = 'The {0:s} mine has {1:d} occurrences of commodity name {2:s} in its Wikipedia article text.\n\n'
    with open(outname, 'w') as report:
        for entry in info_dict_merged.values():
            association_line = association_template.format(
                entry['mine'],
                entry['colloquial association count'],
                entry['colloquial association'])
            # The extra newline is the one print() would append.
            report.write(association_line + '\n')
            commodity_line = commodity_template.format(
                entry['mine'],
                entry['commodity word count'],
                entry['commodity'])
            report.write(commodity_line + '\n')
    return outname