-
-
Save brentp/089c7d6d69d78d26437f to your computer and use it in GitHub Desktop.
# -*- coding: utf-8 -*- | |
u""" | |
Beta regression for modeling rates and proportions. | |
References | |
---------- | |
Grün, Bettina, Ioannis Kosmidis, and Achim Zeileis. Extended beta regression | |
in R: Shaken, stirred, mixed, and partitioned. No. 2011-22. Working Papers in | |
Economics and Statistics, 2011. | |
Smithson, Michael, and Jay Verkuilen. "A better lemon squeezer? | |
Maximum-likelihood regression with beta-distributed dependent variables." | |
Psychological methods 11.1 (2006): 54. | |
""" | |
import numpy as np | |
import pandas as pd | |
import statsmodels.api as sm | |
from scipy.special import gammaln as lgamma | |
from statsmodels.base.model import GenericLikelihoodModel | |
from statsmodels.genmod.families import Binomial | |
# this is only need while #2024 is open. | |
class Logit(sm.families.links.Logit): | |
"""Logit tranform that won't overflow with large numbers.""" | |
def inverse(self, z): | |
return 1 / (1. + np.exp(-z)) | |
_init_example = """ | |
Beta regression with default of logit-link for exog and log-link | |
for precision. | |
>>> mod = Beta(endog, exog) | |
>>> rslt = mod.fit() | |
>>> print rslt.summary() | |
We can also specify a formula and a specific structure and use the | |
identity-link for phi. | |
>>> from sm.families.links import identity | |
>>> Z = patsy.dmatrix('~ temp', dat, return_type='dataframe') | |
>>> mod = Beta.from_formula('iyield ~ C(batch, Treatment(10)) + temp', | |
... dat, Z=Z, link_phi=identity()) | |
In the case of proportion-data, we may think that the precision depends on | |
the number of measurements. E.g for sequence data, on the number of | |
sequence reads covering a site: | |
>>> Z = patsy.dmatrix('~ coverage', df) | |
>>> mod = Beta.from_formula('methylation ~ disease + age + gender + coverage', df, Z) | |
>>> rslt = mod.fit() | |
""" | |
class Beta(GenericLikelihoodModel): | |
"""Beta Regression. | |
This implementation uses `phi` as a precision parameter equal to | |
`a + b` from the Beta parameters. | |
""" | |
def __init__(self, endog, exog, Z=None, link=Logit(), | |
link_phi=sm.families.links.Log(), **kwds): | |
""" | |
Parameters | |
---------- | |
endog : array-like | |
1d array of endogenous values (i.e. responses, outcomes, | |
dependent variables, or 'Y' values). | |
exog : array-like | |
2d array of exogeneous values (i.e. covariates, predictors, | |
independent variables, regressors, or 'X' values). A nobs x k | |
array where `nobs` is the number of observations and `k` is | |
the number of regressors. An intercept is not included by | |
default and should be added by the user. See | |
`statsmodels.tools.add_constant`. | |
Z : array-like | |
2d array of variables for the precision phi. | |
link : link | |
Any link in sm.families.links for `exog` | |
link_phi : link | |
Any link in sm.families.links for `Z` | |
Examples | |
-------- | |
{example} | |
See Also | |
-------- | |
:ref:`links` | |
""".format(example=_init_example) | |
assert np.all((0 < endog) & (endog < 1)) | |
if Z is None: | |
extra_names = ['phi'] | |
Z = np.ones((len(endog), 1), dtype='f') | |
else: | |
extra_names = ['precision-%s' % zc for zc in \ | |
(Z.columns if hasattr(Z, 'columns') else range(1, Z.shape[1] + 1))] | |
kwds['extra_params_names'] = extra_names | |
super(Beta, self).__init__(endog, exog, **kwds) | |
self.link = link | |
self.link_phi = link_phi | |
self.Z = Z | |
assert len(self.Z) == len(self.endog) | |
def nloglikeobs(self, params): | |
""" | |
Negative log-likelihood. | |
Parameters | |
---------- | |
params : np.ndarray | |
Parameter estimates | |
""" | |
return -self._ll_br(self.endog, self.exog, self.Z, params) | |
def fit(self, start_params=None, maxiter=100000, maxfun=5000, disp=False, | |
method='bfgs', **kwds): | |
""" | |
Fit the model. | |
Parameters | |
---------- | |
start_params : array-like | |
A vector of starting values for the regression | |
coefficients. If None, a default is chosen. | |
maxiter : integer | |
The maximum number of iterations | |
disp : bool | |
Show convergence stats. | |
method : str | |
The optimization method to use. | |
""" | |
if start_params is None: | |
start_params = sm.GLM(self.endog, self.exog, family=Binomial() | |
).fit(disp=False).params | |
start_params = np.append(start_params, [0.5] * self.Z.shape[1]) | |
return super(Beta, self).fit(start_params=start_params, | |
maxiter=maxiter, maxfun=maxfun, | |
method=method, disp=disp, **kwds) | |
def _ll_br(self, y, X, Z, params): | |
nz = self.Z.shape[1] | |
Xparams = params[:-nz] | |
Zparams = params[-nz:] | |
mu = self.link.inverse(np.dot(X, Xparams)) | |
phi = self.link_phi.inverse(np.dot(Z, Zparams)) | |
# TODO: derive a and b and constrain to > 0? | |
if np.any(phi <= np.finfo(float).eps): return np.array(-np.inf) | |
ll = lgamma(phi) - lgamma(mu * phi) - lgamma((1 - mu) * phi) \ | |
+ (mu * phi - 1) * np.log(y) + (((1 - mu) * phi) - 1) \ | |
* np.log(1 - y) | |
return ll | |
if __name__ == "__main__": | |
import patsy | |
dat = pd.read_table('gasoline.txt') | |
Z = patsy.dmatrix('~ temp', dat, return_type='dataframe') | |
# using other precison params with | |
m = Beta.from_formula('iyield ~ C(batch, Treatment(10)) + temp', dat, | |
Z=Z, link_phi=sm.families.links.identity()) | |
print m.fit().summary() | |
fex = pd.read_csv('foodexpenditure.csv') | |
m = Beta.from_formula(' I(food/income) ~ income + persons', fex) | |
print m.fit().summary() | |
#print GLM.from_formula('iyield ~ C(batch) + temp', dat, family=Binomial()).fit().summary() | |
dev = pd.read_csv('methylation-test.csv') | |
Z = patsy.dmatrix('~ age', dev, return_type='dataframe') | |
m = Beta.from_formula('methylation ~ gender + CpG', dev, | |
Z=Z, | |
link_phi=sm.families.links.identity()) | |
print m.fit().summary() |
library(betareg) | |
data("GasolineYield", package = "betareg") | |
data("FoodExpenditure", package = "betareg") | |
#fe_beta = betareg(I(food/income) ~ income + persons, data = FoodExpenditure) | |
#print(summary(fe_beta)$coefficients) | |
#m = betareg(yield ~ batch + temp, data = GasolineYield) | |
#print(summary(m)) | |
#gy2 <- betareg(yield ~ batch + temp | temp, data = GasolineYield, link.phi="identity") | |
#print(summary(gy2)) | |
meth = read.csv('methylation-test.csv') | |
m = betareg(methylation ~ gender + CpG | age, meth) | |
print(summary(m)) | |
food | income | persons | |
---|---|---|---|
15.998 | 62.476 | 1 | |
16.652 | 82.304 | 5 | |
21.741 | 74.679 | 3 | |
7.431 | 39.151 | 3 | |
10.481 | 64.724 | 5 | |
13.548 | 36.786 | 3 | |
23.256 | 83.052 | 4 | |
17.976 | 86.935 | 1 | |
14.161 | 88.233 | 2 | |
8.825 | 38.695 | 2 | |
14.184 | 73.831 | 7 | |
19.604 | 77.122 | 3 | |
13.728 | 45.519 | 2 | |
21.141 | 82.251 | 2 | |
17.446 | 59.862 | 3 | |
9.629 | 26.563 | 3 | |
14.005 | 61.818 | 2 | |
9.16 | 29.682 | 1 | |
18.831 | 50.825 | 5 | |
7.641 | 71.062 | 4 | |
13.882 | 41.99 | 4 | |
9.67 | 37.324 | 3 | |
21.604 | 86.352 | 5 | |
10.866 | 45.506 | 2 | |
28.98 | 69.929 | 6 | |
10.882 | 61.041 | 2 | |
18.561 | 82.469 | 1 | |
11.629 | 44.208 | 2 | |
18.067 | 49.467 | 5 | |
14.539 | 25.905 | 5 | |
19.192 | 79.178 | 5 | |
25.918 | 75.811 | 3 | |
28.833 | 82.718 | 6 | |
15.869 | 48.311 | 4 | |
14.91 | 42.494 | 5 | |
9.55 | 40.573 | 4 | |
23.066 | 44.872 | 6 | |
14.751 | 27.167 | 7 |
iyield gravity pressure temp10 temp batch | |
0.122 50.8 8.6 190 205 1 | |
0.223 50.8 8.6 190 275 1 | |
0.347 50.8 8.6 190 345 1 | |
0.457 50.8 8.6 190 407 1 | |
0.08 40.8 3.5 210 218 2 | |
0.131 40.8 3.5 210 273 2 | |
0.266 40.8 3.5 210 347 2 | |
0.074 40 6.1 217 212 3 | |
0.182 40 6.1 217 272 3 | |
0.304 40 6.1 217 340 3 | |
0.069 38.4 6.1 220 235 4 | |
0.152 38.4 6.1 220 300 4 | |
0.26 38.4 6.1 220 365 4 | |
0.336 38.4 6.1 220 410 4 | |
0.144 40.3 4.8 231 307 5 | |
0.268 40.3 4.8 231 367 5 | |
0.349 40.3 4.8 231 395 5 | |
0.1 32.2 5.2 236 267 6 | |
0.248 32.2 5.2 236 360 6 | |
0.317 32.2 5.2 236 402 6 | |
0.028 41.3 1.8 267 235 7 | |
0.064 41.3 1.8 267 275 7 | |
0.161 41.3 1.8 267 358 7 | |
0.278 41.3 1.8 267 416 7 | |
0.05 38.1 1.2 274 285 8 | |
0.176 38.1 1.2 274 365 8 | |
0.321 38.1 1.2 274 444 8 | |
0.14 32.2 2.4 284 351 9 | |
0.232 32.2 2.4 284 424 9 | |
0.085 31.8 0.2 316 365 10 | |
0.147 31.8 0.2 316 379 10 | |
0.18 31.8 0.2 316 428 10 |
Unnamed: 0 | gender | age | Basename | ID | id | plate | CpG | methylation | ||
---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | F | 58.231 | 6042324088_R04C01 | age58.231_F | id_0 | 6042324088 | CpG_0 | 0.815 | |
1 | 1 | M | 52.632 | 6042324088_R06C01 | age52.632_M | id_1 | 6042324088 | CpG_0 | 0.803 | |
2 | 2 | M | 64.679 | 6042324088_R01C01 | age64.679_M | id_2 | 6042324088 | CpG_0 | 0.803 | |
3 | 3 | F | 55.299 | 6042324088_R04C02 | age55.299_F | id_3 | 6042324088 | CpG_0 | 0.808 | |
4 | 4 | M | 56.019 | 6042324088_R02C01 | age56.019_M | id_4 | 6042324088 | CpG_0 | 0.8550000000000001 | |
5 | 5 | M | 62.021 | 6042324088_R01C02 | age62.021_M | id_5 | 6042324088 | CpG_0 | 0.8129999999999998 | |
6 | 6 | F | 52.298 | 6042324088_R06C02 | age52.298_F | id_6 | 6042324088 | CpG_0 | 0.816 | |
7 | 7 | F | 39.71 | 6042324088_R03C01 | age39.71_F | id_7 | 6042324088 | CpG_0 | 0.827 | |
8 | 8 | F | 57.492 | 6042324088_R05C02 | age57.492_F | id_8 | 6042324088 | CpG_0 | 0.829 | |
9 | 9 | F | 57.623999999999995 | 6042324088_R05C01 | age57.624_F | id_9 | 6042324088 | CpG_0 | 0.7760000000000001 | |
10 | 10 | F | 40.486999999999995 | 6042324088_R03C02 | age40.487_F | id_10 | 6042324088 | CpG_0 | 0.7859999999999999 | |
11 | 11 | M | 53.662 | 6042324088_R02C02 | age53.662_M | id_11 | 6042324088 | CpG_0 | 0.822 | |
12 | 12 | F | 58.231 | 6042324088_R04C01 | age58.231_F | id_0 | 6042324088 | CpG_1 | 0.891 | |
13 | 13 | M | 52.632 | 6042324088_R06C01 | age52.632_M | id_1 | 6042324088 | CpG_1 | 0.894 | |
14 | 14 | M | 64.679 | 6042324088_R01C01 | age64.679_M | id_2 | 6042324088 | CpG_1 | 0.894 | |
15 | 15 | F | 55.299 | 6042324088_R04C02 | age55.299_F | id_3 | 6042324088 | CpG_1 | 0.869 | |
16 | 16 | M | 56.019 | 6042324088_R02C01 | age56.019_M | id_4 | 6042324088 | CpG_1 | 0.914 | |
17 | 17 | M | 62.021 | 6042324088_R01C02 | age62.021_M | id_5 | 6042324088 | CpG_1 | 0.889 | |
18 | 18 | F | 52.298 | 6042324088_R06C02 | age52.298_F | id_6 | 6042324088 | CpG_1 | 0.8850000000000001 | |
19 | 19 | F | 39.71 | 6042324088_R03C01 | age39.71_F | id_7 | 6042324088 | CpG_1 | 0.898 | |
20 | 20 | F | 57.492 | 6042324088_R05C02 | age57.492_F | id_8 | 6042324088 | CpG_1 | 0.896 | |
21 | 21 | F | 57.623999999999995 | 6042324088_R05C01 | age57.624_F | id_9 | 6042324088 | CpG_1 | 0.86 | |
22 | 22 | F | 40.486999999999995 | 6042324088_R03C02 | age40.487_F | id_10 | 6042324088 | CpG_1 | 0.887 | |
23 | 23 | M | 53.662 | 6042324088_R02C02 | age53.662_M | id_11 | 6042324088 | CpG_1 | 0.8800000000000001 | |
24 | 24 | F | 58.231 | 6042324088_R04C01 | age58.231_F | id_0 | 6042324088 | CpG_2 | 0.936 | |
25 | 25 | M | 52.632 | 6042324088_R06C01 | age52.632_M | id_1 | 6042324088 | CpG_2 | 0.9129999999999999 | |
26 | 26 | M | 64.679 | 6042324088_R01C01 | age64.679_M | id_2 | 6042324088 | CpG_2 | 0.9000000000000001 | |
27 | 27 | F | 55.299 | 6042324088_R04C02 | age55.299_F | id_3 | 6042324088 | CpG_2 | 0.9119999999999999 | |
28 | 28 | M | 56.019 | 6042324088_R02C01 | age56.019_M | id_4 | 6042324088 | CpG_2 | 0.9349999999999999 | |
29 | 29 | M | 62.021 | 6042324088_R01C02 | age62.021_M | id_5 | 6042324088 | CpG_2 | 0.9280000000000002 | |
30 | 30 | F | 52.298 | 6042324088_R06C02 | age52.298_F | id_6 | 6042324088 | CpG_2 | 0.9150000000000001 | |
31 | 31 | F | 39.71 | 6042324088_R03C01 | age39.71_F | id_7 | 6042324088 | CpG_2 | 0.9160000000000001 | |
32 | 32 | F | 57.492 | 6042324088_R05C02 | age57.492_F | id_8 | 6042324088 | CpG_2 | 0.929 | |
33 | 33 | F | 57.623999999999995 | 6042324088_R05C01 | age57.624_F | id_9 | 6042324088 | CpG_2 | 0.92 | |
34 | 34 | F | 40.486999999999995 | 6042324088_R03C02 | age40.487_F | id_10 | 6042324088 | CpG_2 | 0.9160000000000001 | |
35 | 35 | M | 53.662 | 6042324088_R02C02 | age53.662_M | id_11 | 6042324088 | CpG_2 | 0.926 |
# in R: | |
import io | |
import statsmodels.api as sm | |
import pandas as pd | |
import patsy | |
from betareg import Beta | |
import numpy as np | |
# betareg(I(food/income) ~ income + persons, data = FoodExpenditure) | |
_income_estimates_mean = u"""\ | |
varname Estimate StdError zvalue Pr(>|z|) | |
(Intercept) -0.62254806 0.223853539 -2.781051 5.418326e-03 | |
income -0.01229884 0.003035585 -4.051556 5.087819e-05 | |
persons 0.11846210 0.035340667 3.352005 8.022853e-04""" | |
_income_estimates_precision = u"""\ | |
varname Estimate StdError zvalue Pr(>|z|) | |
(phi) 35.60975 8.079598 4.407366 1.046351e-05 | |
""" | |
_methylation_estimates_mean = u"""\ | |
varname Estimate StdError zvalue Pr(>|z|) | |
(Intercept) 1.44224 0.03401 42.404 2e-16 | |
genderM 0.06986 0.04359 1.603 0.109 | |
CpGCpG_1 0.60735 0.04834 12.563 2e-16 | |
CpGCpG_2 0.97355 0.05311 18.331 2e-16""" | |
_methylation_estimates_precision = u"""\ | |
varname Estimate StdError zvalue Pr(>|z|) | |
(Intercept) 8.22829 1.79098 4.594 4.34e-06 | |
age -0.03471 0.03276 -1.059 0.289""" | |
expected_income_mean = pd.read_table(io.StringIO(_income_estimates_mean), sep="\s+") | |
expected_income_precision = pd.read_table(io.StringIO(_income_estimates_precision), sep="\s+") | |
expected_methylation_mean = pd.read_table(io.StringIO(_methylation_estimates_mean), sep="\s+") | |
expected_methylation_precision = pd.read_table(io.StringIO(_methylation_estimates_precision), sep="\s+") | |
income = pd.read_csv('foodexpenditure.csv') | |
methylation = pd.read_csv('methylation-test.csv') | |
def check_same(a, b, eps, name): | |
assert np.allclose(a, b, rtol=0.01, atol=eps), \ | |
("different from expected", name, list(a), list(b)) | |
def test_income_coefficients(): | |
model = "I(food/income) ~ income + persons" | |
mod = Beta.from_formula(model, income) | |
rslt = mod.fit() | |
yield check_same, rslt.params[:-1], expected_income_mean['Estimate'], 1e-3, "estimates" | |
yield check_same, rslt.tvalues[:-1], expected_income_mean['zvalue'], 0.1, "z-scores" | |
yield check_same, rslt.pvalues[:-1], expected_income_mean['Pr(>|z|)'], 1e-3, "p-values" | |
def test_income_precision(): | |
model = "I(food/income) ~ income + persons" | |
mod = Beta.from_formula(model, income) | |
rslt = mod.fit() | |
# note that we have to exp the phi results for now. | |
yield check_same, np.exp(rslt.params[-1:]), expected_income_precision['Estimate'], 1e-3, "estimate" | |
#yield check_same, rslt.tvalues[-1:], expected_income_precision['zvalue'], 0.1, "z-score" | |
yield check_same, rslt.pvalues[-1:], expected_income_precision['Pr(>|z|)'], 1e-3, "p-values" | |
def test_methylation_coefficients(): | |
model = "methylation ~ gender + CpG" | |
Z = patsy.dmatrix("~ age", methylation) | |
mod = Beta.from_formula(model, methylation, Z=Z, link_phi=sm.families.links.identity()) | |
rslt = mod.fit() | |
yield check_same, rslt.params[:-2], expected_methylation_mean['Estimate'], 1e-2, "estimates" | |
yield check_same, rslt.tvalues[:-2], expected_methylation_mean['zvalue'], 0.1, "z-scores" | |
yield check_same, rslt.pvalues[:-2], expected_methylation_mean['Pr(>|z|)'], 1e-2, "p-values" | |
def test_methylation_precision(): | |
model = "methylation ~ gender + CpG" | |
Z = patsy.dmatrix("~ age", methylation) | |
mod = Beta.from_formula(model, methylation, Z=Z, link_phi=sm.families.links.identity()) | |
rslt = mod.fit() | |
#yield check_same, sm.families.links.logit()(rslt.params[-2:]), expected_methylation_precision['Estimate'], 1e-3, "estimate" | |
#yield check_same, rslt.tvalues[-2:], expected_methylation_precision['zvalue'], 0.1, "z-score" | |
there's a pull request (not sure if it's open or closed now) on the statsmodels repo where someone else did much more work on this.
there's a pull request (not sure if it's open or closed now) on the statsmodels repo where someone else did much more work on this.
@brentp - thank you for your work on this!
I searched open and closed PRs on the statsmodels repo for "beta regression" and couldn't find the PR you mentioned. I only found #2030 and #4238.
Do you remember if code in that PR was substantially more robust than yours? If yes, would you by any chance remember what it was about, so that I can try other search queries?
thank you for your work.
if I had generated data follow beta distribution to crate beta regression mode how do it in R?
can I used the author estmation
This is a very useful code. It behooves best if you send a pull request to statsmodels module.