Skip to content

Instantly share code, notes, and snippets.

@braz
Last active February 27, 2019 16:47
Show Gist options
  • Save braz/e2178926ad233283c3d171b54546883f to your computer and use it in GitHub Desktop.
Save braz/e2178926ad233283c3d171b54546883f to your computer and use it in GitHub Desktop.
Python plotting of "Vinho Verde" red wine dataset for linear regression
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import linear_model
# Fit and plot a one-variable linear regression (alcohol -> quality) on the
# UCI "Vinho Verde" red wine dataset.

# URL for the Wine Quality Portuguese "Vinho Verde" red wine dataset (UCI Machine Learning Repository)
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv"

# Pick whichever urlopen this interpreter provides. Only the import lives in
# the try block so a network failure is never confused with a missing module.
try:
    from urllib.request import urlopen  # Python 3
except ImportError:
    from urllib import urlopen  # Python 2 fallback

data_names = [
    'fixed acidity',
    'volatile acidity',
    'citric acid',
    'residual sugar',
    'chlorides',
    'free sulfur dioxide',
    'total sulfur dioxide',
    'density',
    'pH',
    'sulphates',
    'alcohol',
    'quality (0-10)',
]
# Columns used for the regression; named once instead of indexing data_names
# with magic numbers.
x_column = 'alcohol'
y_column = 'quality (0-10)'

# Download the file and load the CSV into a pandas DataFrame. The file's own
# header row is skipped and replaced by data_names below.
raw_data = urlopen(url)
try:
    data = pd.read_csv(raw_data, sep=";", header=None, skiprows=1)
finally:
    # Always release the HTTP connection, even if parsing fails.
    raw_data.close()
data.columns = data_names

# Create linear regression object
regr = linear_model.LinearRegression()
# scikit-learn expects 2-D arrays of shape (n_samples, n_features)
x_data = data[x_column].values.reshape(-1, 1)
y_data = data[y_column].values.reshape(-1, 1)
# once the data is reshaped, running the fit is simple
regr.fit(x_data, y_data)

# Create exactly one figure. (Calling plt.clf() first would implicitly create
# an extra empty figure that plt.show() would also display.)
plt.figure(figsize=(10, 6))
plt.title('Alcohol vs Quality')
plt.xlabel(x_column)
plt.ylabel(y_column)
# Plot the data and the fit for the linear regression
plt.scatter(data[x_column].values, data[y_column].values, label='Observed wines')
plt.plot(x_data, regr.predict(x_data), color='black', linewidth=3,
         label='Linear regression fit')
# The legend is built after every labelled artist exists, so it is not empty.
plt.legend()
plt.show()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment