from os import system, name ; system('cls' if name == 'nt' else 'clear')  # clear the console ('cls' on Windows, 'clear' elsewhere)
# import all the necessary libraries
import pandas
import numpy
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
import sklearn.metrics  # import the metrics submodule explicitly; a bare "import sklearn" is not guaranteed to expose it
# import the data file, which is whitespace-delimited, into a pandas DataFrame
column_names = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV']  # MEDV is the median home value, the prediction target
bos1 = pandas.read_csv('DataSets/housing.csv', delimiter=r'\s+', names=column_names)  # parses the file into a table, splitting columns on runs of whitespace
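# optional sketch (not in the original): preview the first few rows to confirm
# the whitespace-delimited file parsed into the expected 14 columns
print(bos1.head())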
print(bos1.isna().sum())  # counts how many values are missing from each column
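# sketch (assumption): this dataset normally has no missing values, so the line
# below is a no-op; it is one simple way to handle any rows the check above flags
bos1 = bos1.dropna()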
X = numpy.array(bos1.iloc[:, 0:13])  # input features: the 13 columns before MEDV
Y = numpy.array(bos1['MEDV'])  # target values to predict from X
# divide the data into training and testing sets: 70% training, 30% testing
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=5)
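# sanity-check sketch (not in the original): confirm the 70/30 split; for the
# 506-row Boston housing data this gives roughly 354 training / 152 testing rows
print(f'training shape: {x_train.shape}, testing shape: {x_test.shape}')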
# load first model
lr = LinearRegression()  # create an instance of the model
# train first model
lr.fit(x_train, y_train)
# predict on the testing data so that we can evaluate the model later
pred_lr = lr.predict(x_test)  # generate prices based on the input parameters
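# optional sketch (not in the original): inspect what the linear model learned,
# one coefficient per input column plus a constant intercept
print(f'coefficients = {lr.coef_}, intercept = {lr.intercept_}')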
# load second model, a K-nearest-neighbours regressor (default K=5)
knr = KNeighborsRegressor()
knr.fit(x_train, y_train)  # train second model
pred_knr = knr.predict(x_test)  # predict on the testing data
# hyperparameter tuning for knr (trying to find the value of K with the lowest error)
for i in range(1, 50):
    model = KNeighborsRegressor(i)
    model.fit(x_train, y_train)
    pred_y = model.predict(x_test)
    rmse = sklearn.metrics.mean_squared_error(y_test, pred_y, squared=False)  # squared=False gives the root mean squared error (RMSE), not the MSE
    # k=4 seems to have the lowest RMSE
    print(f'{rmse} error for k={i}')
""" | |
Why am I using the pred_lr from the lr model where K is 5 (default) instead of the above tuned model? | |
""" | |
print('\n')
# error for linear regression
rmse_lr = sklearn.metrics.mean_squared_error(y_test, pred_lr, squared=False)
print(f'RMSE for Linear Regression = {rmse_lr}')
# error for K-NN
rmse_knr = sklearn.metrics.mean_squared_error(y_test, pred_knr, squared=False)
print(f'RMSE for K-NN = {rmse_knr}')
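# sketch of a more robust comparison (assumption, not in the original script):
# average RMSE over 5 cross-validation folds instead of one 70/30 split
from sklearn.model_selection import cross_val_score
cv_rmse_lr = -cross_val_score(LinearRegression(), X, Y, cv=5, scoring='neg_root_mean_squared_error').mean()
cv_rmse_knr = -cross_val_score(KNeighborsRegressor(), X, Y, cv=5, scoring='neg_root_mean_squared_error').mean()
print(f'cross-validated RMSE for Linear Regression = {cv_rmse_lr}')
print(f'cross-validated RMSE for K-NN = {cv_rmse_knr}')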