@LukasHaas
Last active August 24, 2020 10:58
A scikit-learn wrapper class to evaluate regression problems at different levels of confidence (or quantiles).
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, clone
from sklearn.metrics import r2_score
from decimal import Decimal
from typing import Tuple, Union, Dict
class QuantileRegressor(BaseEstimator):
"""Predicts quantiles for regression problems.
This class is a wrapper class for sklearn estimators with the additional
possibility of adding rule-based logic to the underlying estimator.
The provided rules are hard-coded and take precedence over the underlying
estimator's predictions.
"""
def __init__(self, base_regressor: BaseEstimator, fit_quantiles: list = None, quantile_range: Tuple = None, step: float = None, **base_params):
"""Initializes the QuantileRegressor instance.
Initializes the quantile regressor by supplying the underlying
sklearn estimator as well as fixed quantiles or a quantile range
and step size.
Args:
base_regressor: The underlying sklearn estimator.
Must implement a fit and predict method as well as accept loss and alpha parameters.
fit_quantiles, optional: List of quantiles on which the model should be trained.
If no list is provided, the model falls back on the quantile_range and step parameters.
quantile_range, optional: Tuple with a lower and upper quantile bound defining
the range of quantiles on which the model should be trained.
step, optional: Step size which is used to create the model quantile range.
**base_params: Optional keyword arguments which will be passed on
to the ``base_regressor``.
Examples:
The example below illustrates how an instance of the
QuantileRegressor class can be initialized with a sklearn
GradientBoostingRegressor instance.
>>> from sklearn.ensemble import GradientBoostingRegressor
>>> gbr = GradientBoostingRegressor()
>>> quantile_reg = QuantileRegressor(gbr, fit_quantiles=[0.4, 0.5, 0.55])
"""
assert {'loss', 'alpha'}.issubset(base_regressor.get_params().keys()), \
'Provided base_regressor instance doesn\'t accept a quantile loss function.'
assert fit_quantiles is not None or (quantile_range is not None and step is not None), \
'The variable fit_quantiles or the variables quantile_range and step must be specified.'
# Configure the base model to predict the median (0.5 quantile).
params = {'loss': 'quantile',
'alpha': 0.5}
base_regressor = clone(base_regressor)
base_regressor.set_params(**base_params)
base_regressor.set_params(**params)
self.base_regressor = base_regressor
self.fit_quantiles = fit_quantiles
self.quantile_range = quantile_range
self.step = step
self._quantiles = [0.5]
self.model_dict = {'0.5': base_regressor}
# Create one (unfitted) model per requested quantile; dictionary keys are the quantiles as strings.
quantiles = self.__quantile_creator() if fit_quantiles is None else fit_quantiles
for quantile in quantiles:
if '{}'.format(quantile) not in self.model_dict:
self.model_dict['{}'.format(quantile)] = self._create_model_from_quantile(quantile)
self._quantiles = sorted(set(self._quantiles + list(quantiles)))
def __repr__(self):
return "Quantile Regressor:\n\n\t Base Model: {}\n\t Quantiles: {}".format(self.base_regressor, self.get_quantiles())
def __str__(self):
return self.__repr__()
def _create_model_from_quantile(self, quantile: float) -> BaseEstimator:
"""Returns a new copy of the base_regressor with the specified quantile.
Args:
quantile: The quantile to predict.
Returns:
BaseEstimator: The new model with the adjusted quantile loss function.
"""
new_model = clone(self.base_regressor)
params = {'loss': 'quantile',
'alpha': quantile}
new_model.set_params(**params)
return new_model
def __quantile_creator(self) -> list:
"""Returns a list of quantiles based on the specified range and step.
Returns:
list: The quantiles to use for model training.
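Example (illustrative): quantile_range=(0.1, 0.9) with step=0.2 yields
the quantiles [0.1, 0.3, 0.5, 0.7, 0.9].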
"""
low, high = self.quantile_range
decimal_step = Decimal('{}'.format(self.step))
decimal_places = abs(decimal_step.as_tuple().exponent)
# Compute the number of steps explicitly so that floating point drift
# cannot push a quantile beyond the upper bound of the range.
num_steps = int(round((high - low) / self.step)) + 1
quantiles = [round(low + i * self.step, decimal_places) for i in range(num_steps)]
return quantiles
def fit(self, X: pd.DataFrame, y: pd.Series, additional_quantiles: list = [], **kwargs):
"""Fits the estimator to the data.
Fits the estimator to the data, by creating copies of the underlying
estimator, each trained on a specific quantile.
Args:
X: The training feature data.
y: The training label data.
additional_quantiles, optional: List of additional quantiles on which
the model should be trained on.
**kwargs: Optional keyword arguments passed to the underlying
estimator's fit function.
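Example (illustrative, assuming X_train and y_train hold the training data):
>>> quantile_reg.fit(X_train, y_train, additional_quantiles=[0.05, 0.95])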
"""
new_quantiles = list(filter(lambda x: x not in self._quantiles, additional_quantiles))
self._quantiles = sorted(self._quantiles + new_quantiles)
for quantile in new_quantiles:
self.model_dict['{}'.format(quantile)] = self._create_model_from_quantile(quantile)
# Fit every quantile model, forwarding keyword arguments to the underlying estimator.
for quantile, model in self.model_dict.items():
model.fit(X, y, **kwargs)
def predict(self, X: Union[pd.DataFrame, pd.Series, np.ndarray, list], quantiles: list = [0.5]) -> Union[Dict, np.ndarray]:
"""Gets predictions for the provided input data and quantiles.
The predictions are evaluated for every quantile and input data
point using the underlying estimator.
Args:
X: The feature data to evaluate predictions for.
quantiles, optional: The quantiles which should be estimated.
Returns:
Union[Dict, np.ndarray]: The evaluated predictions.
If multiple quantiles were provided, the result is a dictionary
where the keys are the quantiles and the values are the predictions.
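Example (illustrative, assuming the model was fit on these quantiles):
>>> median = quantile_reg.predict(X_test, quantiles=[0.5])  # np.ndarray
>>> preds = quantile_reg.predict(X_test, quantiles=[0.25, 0.75])  # dict keyed by quantile string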
"""
quantile_set = set(quantiles)
model_quantiles = set(self._quantiles)
assert quantile_set.issubset(model_quantiles), \
'The model hasn\'t been trained on all provided quantiles yet.\n\
See get_quantiles() method for available quantile values or fit the model on additional quantiles.'
model_predictions = {}
for quantile in quantiles:
model = self.model_dict['{}'.format(quantile)]
predictions = model.predict(X)
model_predictions['{}'.format(quantile)] = predictions
if len(quantiles) == 1:
return list(model_predictions.values())[0]
else:
return model_predictions
def predict_confidence_interval(self, X: Union[pd.DataFrame, pd.Series, np.ndarray, list], confidence: float) -> list:
"""Predicts confidence intervals for the provided input data and quantiles.
The model must first be trained on the quantiles resulting from
the confidence level, meaning 0.5 +- (confidence / 2).
Args:
X: The feature data to evaluate predictions for.
confidence: The confidence for which the interval
should be calculated (0.0 - 1.0)
Returns:
list: The evaluated confidence intervals.
The return type is a list of tuples which each show the confidence
interval range.
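Example (illustrative): a 90 % confidence interval requires the 0.05 and
0.95 quantiles, so the model must have been fit on both beforehand.
>>> quantile_reg.fit(X_train, y_train, additional_quantiles=[0.05, 0.95])
>>> intervals = quantile_reg.predict_confidence_interval(X_test, confidence=0.9)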
"""
assert confidence >= 0 and confidence <= 1, \
"The confidence must be between 0.0 and 1.0"
# Round to avoid floating point drift when matching against the trained quantiles.
first_quantile = round(0.5 - (confidence / 2), 10)
second_quantile = round(0.5 + (confidence / 2), 10)
quantile_set = set([first_quantile, second_quantile])
model_quantiles = set(self._quantiles)
assert quantile_set.issubset(model_quantiles), \
'The model hasn\'t been trained yet on the needed quantiles for the {} % confidence interval.\n\
Please fit the model first on all of the following quantiles: {}.'.format(int(confidence * 100), quantile_set)
predictions = self.predict(X, quantiles=[first_quantile, second_quantile])
pred_keys = list(predictions.keys())
data_length = len(predictions[pred_keys[0]])
confidence_intervals = [(predictions[pred_keys[0]][x], predictions[pred_keys[1]][x]) for x in range(data_length)]
return confidence_intervals
def score(self, X: Union[pd.DataFrame, pd.Series, np.ndarray, list], y: Union[pd.Series, np.ndarray, list], quantile=0.5) -> float:
"""Evaluates the r2 score for the predictions.
Evaluates the r2 score for the predictions at the specified quantile.
Args:
X: The feature data to evaluate the r2 score for.
y: The true values at the specified quantile.
quantile, optional: The quantile at which to score the estimator.
In most cases, only the value 0.5 makes sense for scoring.
Returns:
float: The r2 score.
"""
predictions = self.predict(X, quantiles=[quantile])
score = r2_score(y, predictions)
return score
def get_params(self, deep: bool = True) -> Dict:
"""Return the model's and base regressor's parameters.
Args:
deep: Whether to recursively return the base model's parameters.
Returns:
Dict: The model's parameters.
"""
params = {'base_regressor': self.base_regressor,
'fit_quantiles': self.fit_quantiles,
'quantile_range': self.quantile_range,
'step': self.step,
'model_dict': self.model_dict}
params.update(self.base_regressor.get_params(deep=deep))
return params
def set_params(self, **params):
"""Sets parameters for the model and base regressor.
Args:
**params: Optional keyword arguments.
"""
parameters = params.copy()
if 'base_regressor' in parameters:
value = parameters.pop('base_regressor')
self.base_regressor = value
if 'fit_quantiles' in parameters:
value = parameters.pop('fit_quantiles')
self.fit_quantiles = value
if 'quantile_range' in parameters:
value = parameters.pop('quantile_range')
self.quantile_range = value
if 'step' in parameters:
value = parameters.pop('step')
self.step = value
if 'model_dict' in parameters:
value = parameters.pop('model_dict')
self.model_dict = value
self.base_regressor.set_params(**parameters)
for model in self.model_dict.values():
model.set_params(**parameters)
def get_quantiles(self) -> list:
"""Returns the quantiles on which the model was trained.
Returns:
list: The quantiles on which the model was trained on.
"""
return self._quantiles
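# --- Illustrative usage sketch (not part of the original gist). It assumes
# --- scikit-learn's GradientBoostingRegressor and synthetic data from
# --- sklearn.datasets.make_regression; any regressor that accepts a quantile
# --- loss and an alpha parameter would work the same way.
from sklearn.datasets import make_regression
from sklearn.ensemble import GradientBoostingRegressor
# Synthetic regression data, wrapped in pandas structures as expected by fit().
X, y = make_regression(n_samples=200, n_features=5, noise=10.0, random_state=0)
X, y = pd.DataFrame(X), pd.Series(y)
# One underlying model is trained per quantile; 0.05 and 0.95 are needed
# for the 90 % confidence interval requested below.
quantile_reg = QuantileRegressor(GradientBoostingRegressor(), fit_quantiles=[0.05, 0.5, 0.95])
quantile_reg.fit(X, y)
print(quantile_reg.predict(X.head(), quantiles=[0.5]))
print(quantile_reg.predict_confidence_interval(X.head(), confidence=0.9))
print(quantile_reg.score(X, y))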