from os import system, name ; system('cls' if name == 'nt' else 'clear')  # clear the console ('cls' on Windows, 'clear' elsewhere)
# import all the necessary libraries
import pandas
import numpy
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
import sklearn.metrics  # import the metrics submodule explicitly; a bare "import sklearn" is not guaranteed to expose it
# import the data file, which is whitespace-delimited, into a pandas DataFrame
column_names = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV']  # MEDV is the median home value, the prediction target
bos1 = pandas.read_csv('DataSets/housing.csv', delimiter=r'\s+', names=column_names)  # parses the file into a table, splitting columns on runs of whitespace
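# optional sketch (not in the original): preview the first few rows to confirm
# the whitespace-delimited file parsed into the expected 14 columns
print(bos1.head())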
print(bos1.isna().sum())  # counts how many values are missing from each column
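# sketch (assumption): this dataset normally has no missing values, so the line
# below is a no-op; it is one simple way to handle any rows the check above flags
bos1 = bos1.dropna()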
X = numpy.array(bos1.iloc[:, 0:13])  # input features: the 13 columns before MEDV
Y = numpy.array(bos1['MEDV'])  # target values to predict from X
# divide the data into training and testing sets: 70% training, 30% testing
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=5)
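# sanity-check sketch (not in the original): confirm the 70/30 split; for the
# 506-row Boston housing data this gives roughly 354 training / 152 testing rows
print(f'training shape: {x_train.shape}, testing shape: {x_test.shape}')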
# load first model
lr = LinearRegression()  # create an instance of the model
# train first model
lr.fit(x_train, y_train)
# predict on the testing data so that we can evaluate the model later
pred_lr = lr.predict(x_test)  # generate prices based on the input parameters
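# optional sketch (not in the original): inspect what the linear model learned,
# one coefficient per input column plus a constant intercept
print(f'coefficients = {lr.coef_}, intercept = {lr.intercept_}')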
# load second model, a K-nearest-neighbours regressor (default K=5)
knr = KNeighborsRegressor()
knr.fit(x_train, y_train)  # train second model
pred_knr = knr.predict(x_test)  # predict on the testing data
# hyperparameter tuning for knr (trying to find the value of K with the lowest error)
for i in range(1, 50):
    model = KNeighborsRegressor(i)
    model.fit(x_train, y_train)
    pred_y = model.predict(x_test)
    rmse = sklearn.metrics.mean_squared_error(y_test, pred_y, squared=False)  # squared=False gives the root mean squared error (RMSE), not the MSE
    # k=4 seems to have the lowest RMSE
    print(f'{rmse} error for k={i}')
""" | |
Why am I using the pred_lr from the lr model where K is 5 (default) instead of the above tuned model? | |
""" | |
print('\n')
# error for linear regression
rmse_lr = sklearn.metrics.mean_squared_error(y_test, pred_lr, squared=False)
print(f'RMSE for Linear Regression = {rmse_lr}')
# error for K-NN
rmse_knr = sklearn.metrics.mean_squared_error(y_test, pred_knr, squared=False)
print(f'RMSE for K-NN = {rmse_knr}')
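# sketch of a more robust comparison (assumption, not in the original script):
# average RMSE over 5 cross-validation folds instead of one 70/30 split
from sklearn.model_selection import cross_val_score
cv_rmse_lr = -cross_val_score(LinearRegression(), X, Y, cv=5, scoring='neg_root_mean_squared_error').mean()
cv_rmse_knr = -cross_val_score(KNeighborsRegressor(), X, Y, cv=5, scoring='neg_root_mean_squared_error').mean()
print(f'cross-validated RMSE for Linear Regression = {cv_rmse_lr}')
print(f'cross-validated RMSE for K-NN = {cv_rmse_knr}')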