Last active
December 12, 2016 18:31
-
-
Save ShivendraAgrawal/f37bfc2e824cf9dd241aaa43a59749e1 to your computer and use it in GitHub Desktop.
Scikit-Learn sample snippets for copy-paste
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# SOURCE: http://blog.datadive.net/selecting-good-features-part-ii-linear-models-and-regularization/

# Correlation demo: Pearson's r between a signal and noisy copies of itself.
# More noise -> weaker correlation (and a larger two-tailed p-value).
import numpy as np
from scipy.stats import pearsonr

np.random.seed(0)  # fixed seed so the demo is reproducible
size = 300
x = np.random.normal(0, 1, size)
# pearsonr returns a (correlation, p-value) pair.
print("Lower noise", pearsonr(x, x + np.random.normal(0, 1, size)))
print("Higher noise", pearsonr(x, x + np.random.normal(0, 10, size)))
# Feature selection with a random forest: rank features by the model's
# impurity-based importances (RandomForestRegressor.feature_importances_).
from sklearn.datasets import load_boston
from sklearn.ensemble import RandomForestRegressor
import numpy as np

# Load the Boston housing dataset as an example.
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in
# 1.2; on a modern sklearn substitute e.g. fetch_california_housing — confirm
# which sklearn version this snippet targets.
boston = load_boston()
X = boston["data"]
Y = boston["target"]
names = boston["feature_names"]

rf = RandomForestRegressor()
rf.fit(X, Y)
print("Features sorted by their score:")
print(sorted(zip((round(v, 4) for v in rf.feature_importances_), names),
             reverse=True))
# Permutation ("shuffle") feature importance: for each feature, shuffle its
# column in the held-out set and measure how much the R^2 score drops.
from collections import defaultdict

from sklearn.metrics import r2_score
# sklearn.cross_validation was removed in scikit-learn 0.20; the splitter now
# lives in model_selection and takes keyword args + an explicit .split(X).
from sklearn.model_selection import ShuffleSplit

X = boston["data"]
Y = boston["target"]
rf = RandomForestRegressor()
scores = defaultdict(list)

# Cross-validate the scores on a number of different random splits of the data.
for train_idx, test_idx in ShuffleSplit(n_splits=100, test_size=0.3).split(X):
    X_train, X_test = X[train_idx], X[test_idx]
    Y_train, Y_test = Y[train_idx], Y[test_idx]
    rf.fit(X_train, Y_train)
    acc = r2_score(Y_test, rf.predict(X_test))
    for i in range(X.shape[1]):
        X_t = X_test.copy()
        # Shuffling one column breaks that feature's relationship to the target.
        np.random.shuffle(X_t[:, i])
        shuff_acc = r2_score(Y_test, rf.predict(X_t))
        # Relative drop in R^2 attributable to feature i.
        scores[names[i]].append((acc - shuff_acc) / acc)

print("Features sorted by their score:")
print(sorted([(round(np.mean(score), 4), feat)
              for feat, score in scores.items()], reverse=True))
# Linear model with L1 (Lasso) regularization: the L1 penalty drives
# uninformative coefficients to exactly zero, acting as embedded
# feature selection.
from sklearn.linear_model import Lasso
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import load_boston


def pretty_print_linear(coefs, names=None, sort=False):
    """Render a linear model as 'c1 * name1 + c2 * name2 + ...'.

    Helper from the source blog post; the original snippet called it
    without ever defining it (NameError at runtime).

    coefs : sequence of float coefficients.
    names : optional matching feature names (defaults to X0, X1, ...).
    sort  : if True, list terms by descending absolute coefficient.
    """
    if names is None:
        names = ["X%s" % i for i in range(len(coefs))]
    terms = list(zip(coefs, names))
    if sort:
        terms = sorted(terms, key=lambda t: -abs(t[0]))
    return " + ".join("%s * %s" % (round(c, 3), n) for c, n in terms)


boston = load_boston()
scaler = StandardScaler()
# Standardize features so the L1 penalty treats all coefficients comparably.
X = scaler.fit_transform(boston["data"])
Y = boston["target"]
names = boston["feature_names"]

lasso = Lasso(alpha=.3)
lasso.fit(X, Y)
print("Lasso model: ", pretty_print_linear(lasso.coef_, names, sort=True))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment