-
-
Save agramfort/2578458 to your computer and use it in GitHub Desktop.
Ridge regression scaled
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
=========================================================
Ridge regression scaled
=========================================================

Grid-search the ``alpha`` of a Ridge model with shuffle-split
cross-validation for several training-set fractions, then plot the CV
score against ``alpha`` divided by three scalers: 1 (no scaling),
1/sqrt(n_train) and 1/n_train, where n_train is
``n_samples * train_fraction``.
"""
# Single-argument parenthesised form prints the same on Python 2 and 3.
print(__doc__)
# Author: Andreas Mueller <[email protected]>
#         Jaques Grobler <[email protected]>
# License: BSD
from time import time

import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import Ridge
from sklearn.cross_validation import ShuffleSplit
from sklearn.grid_search import GridSearchCV
from sklearn.utils import check_random_state
if __name__ == '__main__':
    # Grid-search Ridge's alpha with shuffle-split CV for several training
    # fractions, then plot the CV score against alpha divided by three
    # different scalers (one panel per scaler, one curve per fraction).
    #
    # NOTE(review): sklearn.cross_validation / sklearn.grid_search, the
    # ShuffleSplit(n=, train_fraction=, n_iterations=) keywords and
    # grid_scores_ appear to belong to an old scikit-learn release; confirm
    # the target version before porting to sklearn.model_selection.
    rnd = check_random_state(1)

    # set up dataset: y is an exact linear function of X (no noise added)
    n_samples = 100
    n_features = 1000

    X = rnd.randn(n_samples, n_features)
    coef = 5 * rnd.randn(n_features)
    # coef[10:] = 0  # optional: zero all but the first 10 coefficients
    y = np.dot(X, coef)

    clf = Ridge(fit_intercept=False, tol=0.1)
    alphas = np.logspace(-2, 6, 30)
    # The grid is identical for every training fraction, so build it once.
    param_grid = dict(alpha=alphas)

    t0_clf = time()
    plt.figure(figsize=(9, 10))
    plt.clf()
    # One colour per training fraction, shared by a curve and its marker.
    colors = ['b', 'g', 'r', 'c']

    # Four training fractions, iterated from the largest to the smallest.
    for k, train_fraction in enumerate(np.arange(0.2, 0.6, 0.1)[::-1]):
        t0 = time()
        cv = ShuffleSplit(n=n_samples, train_fraction=train_fraction,
                          n_iterations=100, random_state=1)
        grid = GridSearchCV(clf, n_jobs=-1, refit=False,
                            param_grid=param_grid, cv=cv)
        grid.fit(X, y)
        print("GridSearchCV done in %0.3fs." % (time() - t0))
        # Second field of each grid_scores_ entry -- presumably the mean CV
        # score for that alpha; verify against the installed scikit-learn.
        scores = [entry[1] for entry in grid.grid_scores_]
        # Index of the best alpha; it does not depend on the x-axis scaling.
        best = np.argmax(scores)
        print(clf)

        # The scalers use the training-set size, not the full n_samples,
        # even though the display names say "n_samples".
        n_train = n_samples * train_fraction
        scales = [(1, 'No scaling'),
                  (1. / np.sqrt(n_train), '1/sqrt(n_samples)'),
                  (1. / n_train, '1/n_samples'),
                  ]

        for subplotnum, (scaler, name) in enumerate(scales):
            print('%s: Scaler: %s (1/%s)' % (subplotnum + 1, name, scaler))
            plt.subplot(3, 1, subplotnum + 1)
            grid_alphas = alphas / float(scaler)
            # Explicit colour keeps the curve and its marker matched even
            # when the default colour cycle differs from `colors`.
            plt.semilogx(grid_alphas, scores, color=colors[k],
                         label="fraction %.2f" % train_fraction)
            # axvline spans the whole panel even after later curves change
            # the y-limits (vlines would stay clipped to the old limits).
            plt.axvline(grid_alphas[best], color=colors[k], linestyle='solid')
            # Label the panel itself; labels set before the subplots exist
            # end up on a different axes.
            plt.xlabel('alpha / scaler (%s)' % name)
            plt.ylabel('CV Score')
        print('-------------------------------')

    # plot legend (drawn on the last panel, which is the current axes)
    plt.legend(loc="lower left")
    print("Total time taken: %0.3fs." % (time() - t0_clf))
    plt.show()
    print("Completed")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment