# %%
# Download the original dataset to be able to easily build an index with the
# original datetime.
# The dataset is available at:
# https://archive.ics.uci.edu/ml/machine-learning-databases/00275/Bike-Sharing-Dataset.zip
import pandas as pd
df_external = pd.read_csv(
    "~/Downloads/Bike-Sharing-Dataset/hour.csv",
    index_col=0,
)
df_external.head()
# %%
dteday = pd.to_datetime(df_external["dteday"])
# %%
time_index = pd.to_datetime(
    {
        "year": dteday.dt.year,
        "month": dteday.dt.month,
        "day": dteday.dt.day,
        "hour": df_external["hr"],
    }
)
# %%
# We create the same dataset as in the original example:
# https://scikit-learn.org/dev/auto_examples/applications/plot_cyclical_feature_engineering.html#sphx-glr-auto-examples-applications-plot-cyclical-feature-engineering-py
from sklearn.datasets import fetch_openml
bike_sharing = fetch_openml("Bike_Sharing_Demand", version=2, as_frame=True)
df = bike_sharing.frame
# %%
# To get a quick understanding of the periodic patterns of the data, let us
# have a look at the average demand per hour during a week.
#
# Note that the week starts on a Sunday, during the weekend. We can clearly
# distinguish the commute patterns in the morning and evenings of the work days
# and the leisure use of the bikes on the weekends with a more spread peak
# demand around the middle of the days:
import matplotlib.pyplot as plt
fig, ax = plt.subplots(figsize=(12, 4))
average_week_demand = df.groupby(["weekday", "hour"]).mean()["count"]
average_week_demand.plot(ax=ax)
_ = ax.set(
    title="Average hourly bike demand during the week",
    xticks=[i * 24 for i in range(7)],
    xticklabels=["Sun", "Mon", "Tue", "Wed", "Thu", "Fri", "Sat"],
    xlabel="Time of the week",
    ylabel="Number of bike rentals",
)
# %%
#
# The target of the prediction problem is the absolute count of bike rentals on
# an hourly basis:
df["count"].max()
# %%
#
# Let us rescale the target variable (number of hourly bike rentals) to predict
# a relative demand so that the mean absolute error is more easily interpreted
# as a fraction of the maximum demand.
#
# .. note::
#
#     The fit methods of the models used in this notebook all minimize the
#     mean squared error to estimate the conditional mean instead of the mean
#     absolute error that would fit an estimator of the conditional median.
#
#     When reporting performance measures on the test set in the discussion, we
#     instead choose to focus on the mean absolute error that is more
#     intuitive than the (root) mean squared error. Note, however, that the
#     best models for one metric are also the best for the other in this
#     study.
y = df["count"] / 1000
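# %%
# To make the note above concrete with a toy example (a minimal sketch, not
# part of the original analysis): for a constant prediction `c` over a set
# of targets, the squared error is minimized by their mean, while the
# absolute error is minimized by their median.
import numpy as np
toy_targets = np.array([0.0, 0.0, 1.0, 2.0, 10.0])
candidates = np.linspace(0, 10, 1001)
mse = [np.mean((toy_targets - c) ** 2) for c in candidates]
mae = [np.mean(np.abs(toy_targets - c)) for c in candidates]
print("MSE minimizer:", candidates[np.argmin(mse)], "- mean:", toy_targets.mean())
print("MAE minimizer:", candidates[np.argmin(mae)], "- median:", np.median(toy_targets))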
# %% | |
fig, ax = plt.subplots(figsize=(12, 4)) | |
y.hist(bins=30, ax=ax) | |
_ = ax.set( | |
xlabel="Fraction of rented fleet demand", | |
ylabel="Number of hours", | |
) | |
# %%
# The input feature data frame is a time annotated hourly log of variables
# describing the weather conditions. It includes both numerical and categorical
# variables. Note that the time information has already been expanded into
# several complementary columns.
#
X = df.drop("count", axis="columns")
X
# %%
# .. note::
#
#     If the time information was only present as a date or datetime column, we
#     could have expanded it into hour-in-the-day, day-in-the-week,
#     day-in-the-month, month-in-the-year using pandas:
#     https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#time-date-components
#
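# %%
# For illustration (a minimal sketch of the expansion mentioned in the note
# above, with column names of our own choosing), this could look as follows,
# using the `dteday` dates and the `hr` column from the original CSV:
expanded = pd.DataFrame(
    {
        "hour": df_external["hr"],        # hour-in-the-day
        "weekday": dteday.dt.dayofweek,   # day-in-the-week (0 = Monday)
        "day": dteday.dt.day,             # day-in-the-month
        "month": dteday.dt.month,         # month-in-the-year
    }
)
expanded.head()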
# %%
# We now introspect the distribution of the categorical variables, starting
# with `"weather"`:
#
X["weather"].value_counts()
# %%
# Since there are only 3 `"heavy_rain"` events, we cannot use this category to
# train machine learning models with cross-validation. Instead, we simplify the
# representation by collapsing those into the `"rain"` category.
#
X["weather"] = X["weather"].replace(to_replace="heavy_rain", value="rain")
# %%
X["weather"].value_counts()
# %%
X.tail()
# %%
X.index = pd.Index(time_index).to_period("H")
# %%
X.head()
# %%
y.index = pd.Index(time_index).to_period("H")
# %%
# From now on, we will try to use sktime to process the data
# as a forecasting problem. Indeed, we want a model that will
# augment the data by adding information from past samples.
#
# First, let's split the data into 2 sets: the first year (2011) will be used
# for training and the second year (2012) for testing.
from sktime.forecasting.model_selection import temporal_train_test_split
y_train, y_test, X_train, X_test = temporal_train_test_split(
    y, X, test_size=0.5
)
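# %%
# A quick sanity check (not part of the original gist) of the temporal split
# boundaries:
print("train:", y_train.index.min(), "->", y_train.index.max())
print("test: ", y_test.index.min(), "->", y_test.index.max())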
# %%
# We will start by creating a dummy forecaster that simply predicts the
# last value seen in the time series.
from sktime.forecasting.naive import NaiveForecaster
forecaster = NaiveForecaster()
forecaster.fit(y_train)
# %%
# Making a prediction is not completely straightforward: one needs to
# create a ForecastingHorizon object that takes the time index for which
# one wants the predictions. This requires the time index to be a
# regularly spaced period index.
from sktime.forecasting.base import ForecastingHorizon
fh = ForecastingHorizon(
    y_test.index, is_relative=False
)
forecaster.predict(fh)
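# %%
# As an aside (a sketch, not used in the rest of this script), the horizon
# can also be expressed relative to the end of the training series, e.g. to
# forecast the next 24 hourly steps:
import numpy as np
fh_relative = ForecastingHorizon(np.arange(1, 25), is_relative=True)
forecaster.predict(fh_relative)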
# %%
# Now, let's create a preprocessor to handle the heterogeneous data
# in X. We also create an HGBRT as a predictor.
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import HistGradientBoostingRegressor
categorical_columns = [
    "weather",
    "season",
    "holiday",
    "workingday",
]
categories = [
    ["clear", "misty", "rain"],
    ["spring", "summer", "fall", "winter"],
    ["False", "True"],
    ["False", "True"],
]
ordinal_encoder = OrdinalEncoder(categories=categories)
preprocessor = ColumnTransformer(
    transformers=[
        ("categorical", ordinal_encoder, categorical_columns),
    ],
    remainder="passthrough",
)
gbrt = HistGradientBoostingRegressor()
# %%
# A simple scikit-learn approach is to pipeline the preprocessor
# and the HGBRT. However, such a model treats each sample independently:
# it only sees the features at time t, without any hint from previous
# samples.
gbrt_pipeline = make_pipeline(preprocessor, gbrt)
gbrt_pipeline.fit(X_train, y_train)
y_pred_regressor = gbrt_pipeline.predict(X_test)
y_pred_regressor = pd.Series(y_pred_regressor, index=y_test.index)
# %%
# Now, we would like to have the same type of model that would take
# into account past samples. It seems that sktime is not good at
# handling heterogeneous data, and therefore we have to manage
# everything by hand. One might want to check the ForecastingPipeline
# in case it could be used instead.
X_train_trans = preprocessor.fit_transform(X_train)
X_test_trans = preprocessor.transform(X_test)
# %%
# Recreate data frames with the right indices. Note that the
# ColumnTransformer outputs the encoded categorical columns first,
# followed by the passthrough columns, so we reorder the column
# labels accordingly.
transformed_columns = categorical_columns + [
    col for col in X_train.columns if col not in categorical_columns
]
X_train_trans = pd.DataFrame(
    X_train_trans, index=X_train.index, columns=transformed_columns,
)
X_test_trans = pd.DataFrame(
    X_test_trans, index=X_test.index, columns=transformed_columns,
)
# %%
# We can create a forecaster from the scikit-learn regressor. We
# need to use make_reduction and specify the window_length to create
# internally a new dataframe that takes the past samples into
# account.
#
# The "recursive" strategy means that we fit a single model to predict
# one step ahead and apply it recursively to predict every sample in the
# horizon. More complex strategies exist, such as fitting a dedicated
# model for each step of the horizon.
from sktime.forecasting.compose import make_reduction
forecaster = make_reduction(
    gbrt, window_length=15, strategy="recursive", scitype="tabular-regressor"
)
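# %%
# For comparison (a sketch, not used in the rest of this script), the
# "direct" strategy mentioned above would fit one model per step of the
# horizon, which is more costly to train but avoids feeding the model's
# own predictions back as inputs:
forecaster_direct = make_reduction(
    gbrt, window_length=15, strategy="direct", scitype="tabular-regressor"
)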
# %%
# Plotting the data, we can see a general trend: the number of bike
# rentals increases over time. We will take this trend into account
# by preprocessing the target.
from sktime.transformations.series.detrend import Detrender
from sktime.forecasting.trend import PolynomialTrendForecaster
detrender = Detrender(PolynomialTrendForecaster(degree=1))
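# %%
# To inspect what the detrender removes (a sketch, not part of the original
# gist): fit_transform returns the detrended residuals, so the difference
# with the original series is the fitted linear trend.
y_train_regular = y_train.resample("H").ffill()
fitted_trend = y_train_regular - detrender.fit_transform(y_train_regular)
fitted_trend.head()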
# %%
# The TransformedTargetForecaster has a weird Pipeline API.
from sktime.forecasting.compose import TransformedTargetForecaster
forecaster_detrended = TransformedTargetForecaster(
    steps=[
        ("detrender", detrender),
        ("forecaster", forecaster),
    ]
)
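# %%
# Before fitting, a quick check (not in the original gist) of how many
# hourly periods are missing from the training set:
full_range = pd.period_range(y_train.index.min(), y_train.index.max(), freq="H")
print(f"{len(full_range) - len(y_train)} missing hourly periods in y_train")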
# %%
# The forecaster does not support data with missing entries.
# This is probably not the right thing to do, but we will resample
# to get regularly spaced samples and use a forward fill.
forecaster_detrended.fit(
    y=y_train.resample("H").ffill(),
    X=X_train_trans.resample("H").ffill())
# %%
# We do the same to get the predictions.
y_pred_forecasting = forecaster_detrended.predict(
    fh, X=X_test_trans.resample("H").ffill()
)
# %%
# Finally, let's make a plot, resampling by day so that the curves
# remain readable.
_, ax = plt.subplots()
y_test.resample("D").mean().plot(ax=ax, label="True targets")
y_pred_forecasting.resample("D").mean().plot(ax=ax, label="Forecaster")
y_pred_regressor.resample("D").mean().plot(ax=ax, label="Regressor")
ax.set_ylabel("Target")
ax.legend()
ax.set_title("Comparison of a regressor and a forecaster")
# %%