# %%
# Download the original dataset to be able to easily build an index with the
# original datetime.
# The dataset is available at:
# https://archive.ics.uci.edu/ml/machine-learning-databases/00275/Bike-Sharing-Dataset.zip
import pandas as pd
df_external = pd.read_csv(
    "~/Downloads/Bike-Sharing-Dataset/hour.csv",
    index_col=0,
)
df_external.head()
# %%
dteday = pd.to_datetime(df_external["dteday"])
# %%
time_index = pd.to_datetime(
    {
        "year": dteday.dt.year,
        "month": dteday.dt.month,
        "day": dteday.dt.day,
        "hour": df_external["hr"],
    }
)
# %%
# We create the same dataset as in the original example:
# https://scikit-learn.org/dev/auto_examples/applications/plot_cyclical_feature_engineering.html#sphx-glr-auto-examples-applications-plot-cyclical-feature-engineering-py
from sklearn.datasets import fetch_openml
bike_sharing = fetch_openml("Bike_Sharing_Demand", version=2, as_frame=True)
df = bike_sharing.frame
# %%
# To get a quick understanding of the periodic patterns of the data, let us
# have a look at the average demand per hour during a week.
#
# Note that the week starts on a Sunday, during the weekend. We can clearly
# distinguish the commute patterns in the morning and evenings of the work days
# and the leisure use of the bikes on the weekends with a more spread peak
# demand around the middle of the days:
import matplotlib.pyplot as plt
fig, ax = plt.subplots(figsize=(12, 4))
average_week_demand = df.groupby(["weekday", "hour"]).mean()["count"]
average_week_demand.plot(ax=ax)
_ = ax.set(
    title="Average hourly bike demand during the week",
    xticks=[i * 24 for i in range(7)],
    xticklabels=["Sun", "Mon", "Tue", "Wed", "Thu", "Fri", "Sat"],
    xlabel="Time of the week",
    ylabel="Number of bike rentals",
)
# %%
#
# The target of the prediction problem is the absolute count of bike rentals on
# an hourly basis:
df["count"].max()
# %%
#
# Let us rescale the target variable (number of hourly bike rentals) to predict
# a relative demand so that the mean absolute error is more easily interpreted
# as a fraction of the maximum demand.
#
# .. note::
#
#     The fit methods of the models used in this notebook all minimize the
#     mean squared error to estimate the conditional mean instead of the mean
#     absolute error that would fit an estimator of the conditional median.
#
#     When reporting performance measures on the test set in the discussion, we
#     instead choose to focus on the mean absolute error that is more
#     intuitive than the (root) mean squared error. Note, however, that the
#     best models for one metric are also the best for the other in this
#     study.
y = df["count"] / 1000
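# %%
# To make the note above concrete with a toy example (a minimal sketch, not
# part of the original analysis): for a constant prediction `c` over a set
# of targets, the squared error is minimized by their mean, while the
# absolute error is minimized by their median.
import numpy as np
toy_targets = np.array([0.0, 0.0, 1.0, 2.0, 10.0])
candidates = np.linspace(0, 10, 1001)
mse = [np.mean((toy_targets - c) ** 2) for c in candidates]
mae = [np.mean(np.abs(toy_targets - c)) for c in candidates]
print("MSE minimizer:", candidates[np.argmin(mse)], "- mean:", toy_targets.mean())
print("MAE minimizer:", candidates[np.argmin(mae)], "- median:", np.median(toy_targets))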
# %% | |
fig, ax = plt.subplots(figsize=(12, 4)) | |
y.hist(bins=30, ax=ax) | |
_ = ax.set( | |
xlabel="Fraction of rented fleet demand", | |
ylabel="Number of hours", | |
) | |
# %%
# The input feature data frame is a time annotated hourly log of variables
# describing the weather conditions. It includes both numerical and categorical
# variables. Note that the time information has already been expanded into
# several complementary columns.
#
X = df.drop("count", axis="columns")
X
# %%
# .. note::
#
#     If the time information was only present as a date or datetime column, we
#     could have expanded it into hour-in-the-day, day-in-the-week,
#     day-in-the-month, month-in-the-year using pandas:
#     https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#time-date-components
#
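# %%
# For illustration (a minimal sketch of the expansion mentioned in the note
# above, with column names of our own choosing), this could look as follows,
# using the `dteday` dates and the `hr` column from the original CSV:
expanded = pd.DataFrame(
    {
        "hour": df_external["hr"],        # hour-in-the-day
        "weekday": dteday.dt.dayofweek,   # day-in-the-week (0 = Monday)
        "day": dteday.dt.day,             # day-in-the-month
        "month": dteday.dt.month,         # month-in-the-year
    }
)
expanded.head()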
# %%
# We now introspect the distribution of the categorical variables, starting
# with `"weather"`:
#
X["weather"].value_counts()
# %%
# Since there are only 3 `"heavy_rain"` events, we cannot use this category to
# train machine learning models with cross-validation. Instead, we simplify the
# representation by collapsing those into the `"rain"` category.
#
X["weather"] = X["weather"].replace(to_replace="heavy_rain", value="rain")
# %%
X["weather"].value_counts()
# %%
X.tail()
# %%
X.index = pd.Index(time_index).to_period("H")
# %%
X.head()
# %%
y.index = pd.Index(time_index).to_period("H")
# %%
# From now on, we will try to use sktime to process the data
# as a forecasting problem. Indeed, we want a model that will
# augment the data by adding information from past samples.
#
# First, let's split the data into 2 sets: the first year (2011) will be used
# for training and the second year (2012) for testing.
from sktime.forecasting.model_selection import temporal_train_test_split
y_train, y_test, X_train, X_test = temporal_train_test_split(
    y, X, test_size=0.5
)
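# %%
# A quick sanity check (not part of the original gist) of the temporal split
# boundaries:
print("train:", y_train.index.min(), "->", y_train.index.max())
print("test: ", y_test.index.min(), "->", y_test.index.max())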
# %%
# We will start by creating a dummy forecaster that simply predicts the
# last value seen in the time series.
from sktime.forecasting.naive import NaiveForecaster
forecaster = NaiveForecaster()
forecaster.fit(y_train)
# %%
# Making a prediction is not completely straightforward: one needs to
# create a ForecastingHorizon object that takes the time index for which
# one wants the predictions. This requires the time index to be a
# regularly spaced period index.
from sktime.forecasting.base import ForecastingHorizon
fh = ForecastingHorizon(
    y_test.index, is_relative=False
)
forecaster.predict(fh)
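# %%
# As an aside (a sketch, not used in the rest of this script), the horizon
# can also be expressed relative to the end of the training series, e.g. to
# forecast the next 24 hourly steps:
import numpy as np
fh_relative = ForecastingHorizon(np.arange(1, 25), is_relative=True)
forecaster.predict(fh_relative)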
# %%
# Now, let's create a preprocessor to handle the heterogeneous data
# in X. We also create an HGBRT as a predictor.
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import HistGradientBoostingRegressor
categorical_columns = [
    "weather",
    "season",
    "holiday",
    "workingday",
]
categories = [
    ["clear", "misty", "rain"],
    ["spring", "summer", "fall", "winter"],
    ["False", "True"],
    ["False", "True"],
]
ordinal_encoder = OrdinalEncoder(categories=categories)
preprocessor = ColumnTransformer(
    transformers=[
        ("categorical", ordinal_encoder, categorical_columns),
    ],
    remainder="passthrough",
)
gbrt = HistGradientBoostingRegressor()
# %%
# A simple scikit-learn approach is to pipeline the preprocessor
# and the HGBRT. However, such a model treats each sample independently:
# it only sees the features at time t, without any hint from previous
# samples.
gbrt_pipeline = make_pipeline(preprocessor, gbrt)
gbrt_pipeline.fit(X_train, y_train)
y_pred_regressor = gbrt_pipeline.predict(X_test)
y_pred_regressor = pd.Series(y_pred_regressor, index=y_test.index)
# %%
# Now, we would like to have the same type of model that would take
# into account past samples. It seems that sktime is not good at
# handling heterogeneous data, and therefore we have to manage
# everything by hand. One might want to check the ForecastingPipeline
# in case it could be used instead.
X_train_trans = preprocessor.fit_transform(X_train)
X_test_trans = preprocessor.transform(X_test)
# %%
# Recreate data frames with the right indices. Note that the
# ColumnTransformer outputs the encoded categorical columns first,
# followed by the passthrough columns, so we reorder the column
# labels accordingly.
transformed_columns = categorical_columns + [
    col for col in X_train.columns if col not in categorical_columns
]
X_train_trans = pd.DataFrame(
    X_train_trans, index=X_train.index, columns=transformed_columns,
)
X_test_trans = pd.DataFrame(
    X_test_trans, index=X_test.index, columns=transformed_columns,
)
# %%
# We can create a forecaster from the scikit-learn regressor. We
# need to use make_reduction and specify the window_length to create
# internally a new dataframe that takes the past samples into
# account.
#
# The "recursive" strategy means that we fit a single model to predict
# one step ahead and apply it recursively to predict every sample in the
# horizon. More complex strategies exist, such as fitting a dedicated
# model for each step of the horizon.
from sktime.forecasting.compose import make_reduction
forecaster = make_reduction(
    gbrt, window_length=15, strategy="recursive", scitype="tabular-regressor"
)
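# %%
# For comparison (a sketch, not used in the rest of this script), the
# "direct" strategy mentioned above would fit one model per step of the
# horizon, which is more costly to train but avoids feeding the model's
# own predictions back as inputs:
forecaster_direct = make_reduction(
    gbrt, window_length=15, strategy="direct", scitype="tabular-regressor"
)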
# %%
# Plotting the data, we can see a general trend: the number of bike
# rentals increases over time. We will take this trend into account
# by preprocessing the target.
from sktime.transformations.series.detrend import Detrender
from sktime.forecasting.trend import PolynomialTrendForecaster
detrender = Detrender(PolynomialTrendForecaster(degree=1))
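# %%
# To inspect what the detrender removes (a sketch, not part of the original
# gist): fit_transform returns the detrended residuals, so the difference
# with the original series is the fitted linear trend.
y_train_regular = y_train.resample("H").ffill()
fitted_trend = y_train_regular - detrender.fit_transform(y_train_regular)
fitted_trend.head()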
# %%
# The TransformedTargetForecaster has a weird Pipeline API.
from sktime.forecasting.compose import TransformedTargetForecaster
forecaster_detrended = TransformedTargetForecaster(
    steps=[
        ("detrender", detrender),
        ("forecaster", forecaster),
    ]
)
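# %%
# Before fitting, a quick check (not in the original gist) of how many
# hourly periods are missing from the training set:
full_range = pd.period_range(y_train.index.min(), y_train.index.max(), freq="H")
print(f"{len(full_range) - len(y_train)} missing hourly periods in y_train")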
# %%
# The forecaster does not support data with missing entries.
# This is probably not the right thing to do, but we will resample
# to get regularly spaced samples and use a forward fill.
forecaster_detrended.fit(
    y=y_train.resample("H").ffill(),
    X=X_train_trans.resample("H").ffill())
# %%
# We do the same to get the predictions.
y_pred_forecasting = forecaster_detrended.predict(
    fh, X=X_test_trans.resample("H").ffill()
)
# %%
# Finally, let's make a plot, resampling by day so that the curves
# remain readable.
_, ax = plt.subplots()
y_test.resample("D").mean().plot(ax=ax, label="True targets")
y_pred_forecasting.resample("D").mean().plot(ax=ax, label="Forecaster")
y_pred_regressor.resample("D").mean().plot(ax=ax, label="Regressor")
ax.set_ylabel("Target")
ax.legend()
ax.set_title("Comparison of a regressor and a forecaster")
# %%