Skip to content

Instantly share code, notes, and snippets.

#utilizing one-hot encoding, random forest, and XGBoost to predict the outliers
import numpy as np
from sklearn.base import BaseEstimator,TransformerMixin, ClassifierMixin
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.ensemble import GradientBoostingClassifier
import time
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestClassifier
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
#import seaborn as sns
#matplotlib inline
from sklearn import model_selection, preprocessing
import xgboost as xgb
import datetime
import operator
#sklearn.cross_validation was removed in scikit-learn 0.20; train_test_split and
#cross_val_score are provided by sklearn.model_selection instead
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFE
from sklearn.model_selection import cross_val_score, train_test_split
import xgboost as xgb
from sklearn.metrics import r2_score
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFE
from sklearn.model_selection import cross_val_score, train_test_split
#input the data
#header=0 consumes the first row of the file as the header, and names= then
#replaces those labels with the integers 0..377; pandas rejects a boolean
#header with a TypeError, so the row index is spelled out explicitly
#NOTE(review): assumes train.csv starts with a header row and has 378 columns - confirm
data0 = pd.read_csv('./data/train.csv', sep=',', header=0, names=range(378))
#data is a second name for the same DataFrame object (no copy is made)
data = data0
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.metrics import matthews_corrcoef as mc
import operator
@shengch02
shengch02 / Stochastic gradient ascent
Created January 5, 2017 02:06
(Python) Implement stochastic gradient ascent with L2 penalty. Compare convergence of stochastic gradient ascent with that of batch gradient ascent.
#Training logistic regression via stochastic gradient ascent
import math
import pandas as pd
import numpy as np
#the dataset consists of a subset of baby product reviews on Amazon.com
import sframe
#load the SFrame from 'amazon_baby_subset.gl/' and convert it to a DataFrame in one step
products = sframe.SFrame('amazon_baby_subset.gl/').to_dataframe()
@shengch02
shengch02 / Explore precision and recall
Created January 3, 2017 17:59
(Python) Explore various evaluation metrics: accuracy, confusion matrix, precision, recall. Explore how various metrics can be combined to produce a cost of making an error. Explore precision and recall curves.
#explore precision and recall
import pandas as pd
import numpy as np
#the dataset consists of baby product reviews on Amazon.com
import sframe
#load the reviews into an SFrame from the path 'amazon_baby.gl/'
products = sframe.SFrame('amazon_baby.gl/')
#clean the original data: remove punctuation, fill in N/A, remove neutral sentiment,
# perform a train/test split, produce word count matrix
@shengch02
shengch02 / Implementing gradient boosted trees from scratch
Created January 2, 2017 17:08
(Python) Train a boosted ensemble of decision-trees (gradient boosted trees) on the lending club dataset. Predict whether a loan will default along with prediction probabilities. Evaluate the trained model and compare it with a baseline.
#Boosting a decision stump from scratch
import pandas as pd
import numpy as np
#the dataset consists of data from the LendingClub, used to predict whether a loan will be
#paid off in full or will be charged off and possibly go into default
import sframe
#load the loan records into an SFrame from the path 'lending-club-data.gl/'
loans = sframe.SFrame('lending-club-data.gl/')
#the target column 'safe_loans' is +1 for a safe loan and -1 for a risky loan
@shengch02
shengch02 / Gradient boosted trees
Created January 2, 2017 01:29
(Python) Train a boosted ensemble of decision-trees (gradient boosted trees) on the lending club dataset. Predict whether a loan will default along with prediction probabilities (on a validation set). Find the most positive and negative loans using the learned model. Explore how the number of trees influences classification performance.
#use the pre-implemented gradient boosted trees
import pandas as pd
import numpy as np
#the dataset consists of data from the LendingClub, used to predict whether a loan will be
#paid off in full or will be charged off and possibly go into default
import sframe
#load the loan records into an SFrame from the path 'lending-club-data.gl/'
loans = sframe.SFrame('lending-club-data.gl/')
#the target column 'safe_loans' is +1 for a safe loan and -1 for a risky loan
@shengch02
shengch02 / explore various techniques for preventing overfitting in decision trees
Created December 28, 2016 22:00
(Python) Implement binary decision trees with different early stopping methods. Compare models with different stopping parameters.
#explore various techniques for preventing overfitting in decision trees
import math
import pandas as pd
import numpy as np
#the dataset consists of data from the LendingClub, used to predict whether a loan will be
#paid off in full or will be charged off and possibly go into default
import sframe
#load the loan records into an SFrame from the path 'lending-club-data.gl/'
loans = sframe.SFrame('lending-club-data.gl/')