Skip to content

Instantly share code, notes, and snippets.

@hamolicious
Last active August 11, 2020 02:52
Show Gist options
  • Save hamolicious/4ca7a48f32d628ac7c1c342bfe779b3f to your computer and use it in GitHub Desktop.
Save hamolicious/4ca7a48f32d628ac7c1c342bfe779b3f to your computer and use it in GitHub Desktop.
from os import system ; system('cls')
# import all the necessary libraries
import pandas
import numpy
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
import sklearn
# Load the whitespace-separated housing data into a pandas DataFrame.
# The column labels are supplied explicitly through `names`.
column_names = [
    'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX',
    'PTRATIO', 'B', 'LSTAT',
    'MEDV',  # MEDV is the median cost of the house (the value to predict)
]
bos1 = pandas.read_csv('DataSets/housing.csv', delimiter=r'\s+', names=column_names)

# Number of missing values in each column. Stored in a name rather than
# evaluated as a bare expression, so the counts can actually be inspected.
missing_per_column = bos1.isna().sum()

# Every column except the last one (MEDV) is an input feature.
feature_names = column_names[:-1]
X = numpy.array(bos1[feature_names])  # data to base the learning from
Y = numpy.array(bos1['MEDV'])  # data to predict from X

# Split so that 70% of the rows are training data and the remaining 30% are
# testing data; the fixed random_state makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=5)
# --- First model: linear regression ---------------------------------------
# fit() returns the estimator itself, so creating and training are chained.
lr = LinearRegression().fit(x_train, y_train)
# Prices predicted for x_test; kept so the model can be evaluated later on.
pred_lr = lr.predict(x_test)

# --- Second model: K-nearest-neighbours regression -------------------------
# Built with the constructor defaults (no explicit number of neighbours).
knr = KNeighborsRegressor().fit(x_train, y_train)
pred_knr = knr.predict(x_test)
# Hyperparameter tuning for K-NN: fit one model for every k from 1 to 49 and
# print its error so the k with the lowest error can be picked.
# NOTE(review): each k is scored on x_test / y_test, the same split used for
# the final error report, so the chosen k is tuned against the test data.
for i in range(1, 50):
    model = KNeighborsRegressor(n_neighbors=i)
    model.fit(x_train, y_train)
    pred_y = model.predict(x_test)
    # Despite its name, `mse` holds the ROOT mean squared error. The square
    # root is taken explicitly because the `squared=False` keyword of
    # mean_squared_error was removed from recent scikit-learn releases.
    mse = numpy.sqrt(sklearn.metrics.mean_squared_error(y_test, pred_y))
    print( f'{mse} error for k={i}' )
# k=4 seems to have the lowest error

# TODO: the K-NN error reported further down uses pred_knr from the knr model
# where K is 5 (default) instead of the tuned model from the loop above.
# Why? Should it use the best k found here?
print('\n')

# Final error of each model on the test split. The values are ROOT mean
# squared errors (the variable names say "mse" for historical reasons). The
# square root is applied explicitly because the `squared=False` keyword of
# mean_squared_error was removed from recent scikit-learn releases.

# error for linear regression
mse_lr = numpy.sqrt(sklearn.metrics.mean_squared_error(y_test, pred_lr))
print(f'error for Linear Regression = {mse_lr}')

# error for K-NN
mse_knr = numpy.sqrt(sklearn.metrics.mean_squared_error(y_test, pred_knr))
print(f'error for K-NN = {mse_knr}')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment