Skip to content

Instantly share code, notes, and snippets.

@roclark
Created March 16, 2019 23:36
Show Gist options
  • Save roclark/c5ee987f144f8069650272b562be2181 to your computer and use it in GitHub Desktop.
Save roclark/c5ee987f144f8069650272b562be2181 to your computer and use it in GitHub Desktop.
A better machine learning algorithm to predict the scores of college basketball games
import pandas as pd
from argparse import ArgumentParser
from os.path import isfile
from sportsreference.ncaab.teams import Teams
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
# File the scraped dataset is pickled to and reloaded from.
DATASET_NAME = 'dataset.pkl'

# Columns process_data removes from the dataset to form the feature matrix.
# The two score columns are listed here because they are the prediction
# targets, not inputs.
FIELDS_TO_DROP = [
    'away_points',
    'home_points',
    'date',
    'location',
    'losing_abbr',
    'losing_name',
    'winner',
    'winning_abbr',
    'winning_name',
    'home_ranking',
    'away_ranking',
    'pace',
]
def load_saved_dataset():
    """Return the cached dataset, or an empty DataFrame when none exists.

    Returns:
        The DataFrame unpickled from ``DATASET_NAME`` if that file exists,
        otherwise an empty ``pd.DataFrame``.
    """
    if not isfile(DATASET_NAME):
        return pd.DataFrame()
    # NOTE: unpickling can execute arbitrary code; only load a cache file
    # that this script wrote itself.
    return pd.read_pickle(DATASET_NAME)
def create_dataset(teams):
    """Load the cached dataset, or build it from every team's schedule.

    Args:
        teams: Iterable of team objects, each exposing
            ``schedule.dataframe_extended``.

    Returns:
        A DataFrame of all teams' extended schedule rows with duplicate rows
        removed. An empty DataFrame is returned (and nothing is written) when
        no team yields any data.
    """
    dataset = load_saved_dataset()
    if not dataset.empty:
        # A cache written before de-duplication was applied at save time may
        # still hold repeated rows, so de-duplicate on load as well.
        return dataset.drop_duplicates()

    frames = [team.schedule.dataframe_extended for team in teams]
    # pd.concat raises if every entry is None; presumably a team without
    # games yields None here -- TODO confirm against the library.
    frames = [frame for frame in frames if frame is not None]
    if not frames:
        return dataset

    # Concatenate once instead of once per team, and de-duplicate *before*
    # pickling so the cached copy matches what is returned.
    dataset = pd.concat(frames).drop_duplicates()
    dataset.to_pickle(DATASET_NAME)
    return dataset
def process_data(dataset):
    """Split the dataset into training and testing features and targets.

    Args:
        dataset: DataFrame containing every column in ``FIELDS_TO_DROP``
            (including ``home_points`` and ``away_points``) plus the
            feature columns.

    Returns:
        The value of ``train_test_split(X, y)``, where ``X`` holds the
        feature columns and ``y`` is an array of
        ``[home_points, away_points]`` rows. Only rows with no missing
        feature or target value are included.
    """
    features = dataset.drop(FIELDS_TO_DROP, axis=1)
    targets = dataset[['home_points', 'away_points']]
    # Filter features and targets with the same positional mask so they stay
    # row-aligned; dropping NaN rows from the features alone leaves the
    # targets longer than the features.
    complete = (features.notna().all(axis=1) &
                targets.notna().all(axis=1)).values
    X = features.loc[complete]
    y = targets.loc[complete].values
    return train_test_split(X, y)
def build_model(X_train, y_train):
    """Fit a random forest regressor on the training data.

    Args:
        X_train: Training feature matrix.
        y_train: Training targets passed straight to ``fit``.

    Returns:
        The fitted ``RandomForestRegressor``.
    """
    model = RandomForestRegressor(
        bootstrap=False,
        min_samples_leaf=3,
        n_estimators=50,
        min_samples_split=10,
        max_features='sqrt',
        max_depth=6,
    )
    model.fit(X_train, y_train)
    return model
def add_features(stats):
    """Add derived rating and rebounding columns to ``stats`` in place.

    Args:
        stats: DataFrame (or other mapping supporting ``in`` and item
            assignment) with ``defensive_rebounds`` and
            ``offensive_rebounds`` entries.

    Returns:
        The same ``stats`` object, with ``defensive_rebound_percentage``
        added, and ``defensive_rating`` added when it was missing but
        ``offensive_rating`` and ``net_rating`` were both present.
    """
    has_ratings = 'offensive_rating' in stats and 'net_rating' in stats
    if has_ratings and 'defensive_rating' not in stats:
        # Derive the missing rating as offensive_rating minus net_rating.
        stats['defensive_rating'] = (
            stats['offensive_rating'] - stats['net_rating'])

    total_rebounds = stats['defensive_rebounds'] + stats['offensive_rebounds']
    stats['defensive_rebound_percentage'] = (
        100.0 * stats['defensive_rebounds'] / total_rebounds)
    return stats
def replace_feature_names(team, away=False):
    """Prefix a team's stat columns with ``home_`` or ``away_``.

    Columns whose name matches the regex ``opp_`` are removed first, then
    the derived columns from ``add_features`` are added.

    Args:
        team: DataFrame of a single team's statistics.
        away: When true use the ``away_`` prefix, otherwise ``home_``.

    Returns:
        A new DataFrame with prefixed column names and the index reset, so
        the previous index becomes a regular column.
    """
    team = team.drop(team.filter(regex='opp_').columns, axis=1)
    team = add_features(team)
    prefix = 'away_' if away else 'home_'
    team.columns = ['%s%s' % (prefix, col) for col in team]
    return team.reset_index()
def create_matchup_data(home, away):
    """Combine two teams' stats into one side-by-side matchup frame.

    Args:
        home: DataFrame of the home team's statistics.
        away: DataFrame of the away team's statistics.

    Returns:
        A DataFrame with the ``away_``-prefixed columns followed by the
        ``home_``-prefixed columns, joined column-wise.
    """
    home_stats = replace_feature_names(home)
    away_stats = replace_feature_names(away, away=True)
    # Both frames had their index reset by replace_feature_names, so the
    # column-wise concat aligns them on the same row labels.
    return pd.concat([away_stats, home_stats], axis=1)
def parse_arguments(argv=None):
    """Parse the home and away team names.

    Args:
        argv: Optional list of argument strings. When ``None`` (the
            default) the process command line is used, as before. Passing a
            list lets the parser be driven from a notebook or a test, where
            the process command line does not carry the team names.

    Returns:
        The parsed namespace with ``home`` and ``away`` attributes.
    """
    parser = ArgumentParser()
    parser.add_argument('home', help='Specify the name of the home team as '
                        'noted on sports-reference.com, such as "purdue".')
    parser.add_argument('away', help='Specify the name of the away team as '
                        'noted on sports-reference.com, such as "indiana".')
    return parser.parse_args(argv)
# Script body: train on the historical dataset, then predict one matchup.
args = parse_arguments()
teams = Teams()
dataset = create_dataset(teams)
X_train, X_test, y_train, y_test = process_data(dataset)
model = build_model(X_train, y_train)

# Look up each requested team's stats and build the matchup row.
home_frame = teams(args.home).dataframe
away_frame = teams(args.away).dataframe
match_stats = create_matchup_data(home_frame, away_frame)

# Keep only the columns the model was trained on, in training order.
df = match_stats.loc[:, X_train.columns]
# astype(int) discards the fractional part of each predicted score.
result = model.predict(df).astype(int)
# Targets were built as [home_points, away_points], so home comes first.
home_score, away_score = result[0][0], result[0][1]
print('%s %s - %s %s' % (args.home, home_score, away_score, args.away))
Copy link

ghost commented Apr 4, 2019

Below is an adaptation of your code using multiprocessing that drastically speeds up the dataset creation.

I came across your sportsreference API a few weeks back and have found it very useful. Keep up the good work.

import os
import pandas as pd
from sportsreference.ncaab.teams import Teams
import multiprocessing


# File name of the combined dataset pickle.
DATASET_FILE = "ncaab_dataset.pkl"
# Root data directory; this is a machine-specific absolute Windows path.
DATASET_DIR = r"C:\Users\bd391nr\gitrepos\actuary\data"
# Full path of the combined dataset, stored under the "raw" subfolder.
DATASET_PATH = os.path.join(DATASET_DIR, "raw", DATASET_FILE)


def check_dir(directory):
    """Ensure ``directory`` exists, creating missing parents as needed.

    Safe to call repeatedly and from several processes at once: an already
    existing directory is not an error.

    Args:
        directory: Path of the directory to create.
    """
    # makedirs also creates missing parent folders, and exist_ok avoids the
    # race between a separate existence check and the create call.
    os.makedirs(directory, exist_ok=True)


def load_saved_dataset():
    """Return the combined dataset from disk, or an empty DataFrame.

    Returns:
        The DataFrame unpickled from ``DATASET_PATH`` when that file exists,
        otherwise an empty ``pd.DataFrame``.
    """
    if not os.path.isfile(DATASET_PATH):
        return pd.DataFrame()
    return pd.read_pickle(DATASET_PATH)


def pull_team_stats(team):
    """Fetch one team's extended schedule data and pickle it to disk.

    The frame is written to ``<DATASET_DIR>/raw/<abbreviation>/dataset.pkl``,
    with the abbreviation lower-cased.

    Args:
        team: Team object exposing ``name``, ``abbreviation`` and
            ``schedule.dataframe_extended``.

    Returns:
        The value of ``team.schedule.dataframe_extended``.
    """
    print("Collecting data for:", team.name)
    team_dir = os.path.join(DATASET_DIR, "raw", team.abbreviation.lower())
    check_dir(team_dir)

    dataset_team = team.schedule.dataframe_extended
    dataset_team.to_pickle(os.path.join(team_dir, "dataset.pkl"))
    return dataset_team


def create_dataset():
    """Load the combined dataset, building it in parallel when not cached.

    Returns:
        The combined DataFrame: the cached copy when one exists, otherwise
        the freshly built frame, which is also pickled to ``DATASET_PATH``.
    """
    # Reuse the cached dataset when one is already on disk.
    dataset = load_saved_dataset()
    if not dataset.empty:
        return dataset

    # Make sure the output folder exists before the workers and the final
    # to_pickle call write into it.
    os.makedirs(os.path.dirname(DATASET_PATH), exist_ok=True)

    team_list = list(Teams())

    # Use a pool of worker processes, leaving one core free. The context
    # manager shuts the pool down once map() has returned.
    worker_count = max(multiprocessing.cpu_count() - 1, 1)
    with multiprocessing.Pool(worker_count) as pool:
        dataset_list = pool.map(pull_team_stats, team_list)

    # Combine the per-team frames and cache the result.
    dataset = pd.concat(dataset_list, ignore_index=True)
    dataset.to_pickle(DATASET_PATH)
    # Return the frame so a fresh build behaves like a cache hit.
    return dataset


# Build the dataset only when run as a script. The guard is needed with
# multiprocessing: under the spawn start method each worker re-imports this
# module, and unguarded code would run again in every worker.
if __name__ == '__main__':
    create_dataset()

@mooream11
Copy link

Hey man I know it is a long shot for a reply but I keep getting an error saying the following arguments are required: home, away
An exception has occurred, use %tb to see the full traceback.

@MW624322
Copy link

Hey man I know it is a long shot for a reply but I keep getting an error saying the following arguments are required: home, away
An exception has occurred, use %tb to see the full traceback.

I'm getting that too -- did you ever solve it?

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment