Last active
August 9, 2020 18:52
-
-
Save adraguidev/790d672f259ff7e04f18cac069d75813 to your computer and use it in GitHub Desktop.
H2O
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Build an H2OFrame from the pandas DataFrame `df`.
# NOTE(review): `df` and the `h2o` module are not defined in this snippet;
# presumably they are created earlier in the session -- confirm.
df_medicine = h2o.H2OFrame(df)

# Convert the H2OFrame back into a pandas DataFrame.
df_python = h2o.h2o.as_list(data=df_medicine, use_pandas=True)
type(df_python)  # notebook-style check of the resulting type
# --- Label encoding of the text (object-dtype) columns ---
from collections import defaultdict

import numpy as np
from sklearn import preprocessing

# One LabelEncoder per column name, created lazily on first lookup.
d = defaultdict(preprocessing.LabelEncoder)
type(d)

# Column names grouped by the dtype of their column.
tipos = df_python.columns.to_series().groupby(df_python.dtypes).groups

# Names of the columns whose dtype is `object`.
list_text = list(tipos[np.dtype('object')])
list_text


def _per_text_column(method_name):
    """Return a column function that calls `method_name` on the column's encoder.

    Columns whose name is not in `list_text` are returned unchanged, so only
    the object-dtype columns are ever passed to a LabelEncoder.
    """
    def _apply(column):
        if column.name not in list_text:
            return column
        encoder = d[column.name]
        return getattr(encoder, method_name)(column)

    return _apply


# Fit every encoder and encode the text columns.
fit = df_python.apply(_per_text_column("fit_transform"))

# Decode back to the original labels (only possible once the encoders are
# fitted). The result is not stored; in a notebook it is simply displayed.
fit.apply(_per_text_column("inverse_transform"))

# Reuse the fitted encoders to encode data, here the same frame.
df_python = df_python.apply(_per_text_column("transform"))
df_python.head()
# Classes learned by the encoder fitted on the "Treatment" column.
treatment_encoder = d['Treatment']
class_Genotype = [label for label in treatment_encoder.classes_]
class_Genotype

# Integer code that the encoder assigns to each of those classes.
treatment_encoder.transform(class_Genotype)

# Encoded data as a plain array; used later for the silhouette score and plots.
x = df_python.values
# NOTE: from this point on the work is done on the H2O frame.
predictors = df_medicine.col_names  # names of all columns, used as predictors

# Split the frame into training and validation partitions (ratio 0.8).
train, valid = df_medicine.split_frame(ratios=[0.8], seed=1234)

encoding = "one_hot_explicit"
kmeans_params = dict(categorical_encoding=encoding, k=3, seed=1234)
pros_km = H2OKMeansEstimator(**kmeans_params)
pros_km.train(
    x=predictors,            # predictor column names
    training_frame=train,    # training data
    validation_frame=valid,  # validation data
)
pros_km.scoring_history()

# Cluster assignment for every row of the full frame.
predict = pros_km.predict(df_medicine)
from sklearn.metrics import euclidean_distances, silhouette_score

# Convert the H2O prediction frame to a pandas DataFrame.
df_predict = h2o.h2o.as_list(predict, use_pandas=True)
df_predict.head()

# Model summary table as a DataFrame (the original intent was to inspect the
# sums of squares). `as_data_frame` must be *called*; without the parentheses
# `qqq` would hold the bound method instead of the table.
qqq = pros_km.summary().as_data_frame()
# Silhouette scan: train one K-Means model per candidate k and plot the
# silhouette score of its cluster assignment against k.
from sklearn.metrics import euclidean_distances, silhouette_score
import matplotlib.pyplot as plt

k_values = range(2, 11)
sil = []  # one silhouette score per value in k_values
for i in k_values:
    pros_km = H2OKMeansEstimator(categorical_encoding=encoding,
                                 k=i,
                                 seed=1234,
                                 max_iterations=20)
    pros_km.train(x=predictors,            # predictor column names
                  training_frame=train,    # training data
                  validation_frame=valid)  # validation data
    predict = pros_km.predict(df_medicine)
    df_predict_i = h2o.h2o.as_list(predict, use_pandas=True)
    # silhouette_score expects a 1-D label vector, so pass the 'predict'
    # column itself rather than the (n, 1) array of the whole frame.
    labels_i = df_predict_i['predict'].to_numpy()
    sil.append(silhouette_score(x, labels_i))

plt.plot(k_values, sil)
plt.ylabel("Silouette")
plt.xlabel("k")
plt.title("Silouette para data clientes")
# Render this figure now so that later plots do not draw onto the same axes.
plt.show()
# Refit the model with k=2, the value chosen from the silhouette plot.
pros_km = H2OKMeansEstimator(
    categorical_encoding=encoding,
    k=2,
    seed=1234,
    max_iterations=20,
)
pros_km.train(
    x=predictors,
    training_frame=train,
    validation_frame=valid,
)

# Cluster label predicted for every row of the full frame.
predict = pros_km.predict(df_medicine)

# Bring the predictions into pandas and keep the label column as an array.
df_predict = h2o.h2o.as_list(predict, use_pandas=True)
df_predict_val = df_predict['predict'].to_numpy()
type(df_predict_val)
# Scatter plot of the first two columns of `x`, coloured by predicted cluster.
# The style has to be selected before the figure is created: rcParams only
# affect artists created afterwards.
plt.style.use('fivethirtyeight')
plt.figure()  # fresh figure so nothing is drawn on top of an earlier plot

# (cluster id, colour, legend label); add one entry per extra cluster if k grows.
cluster_styles = (
    (0, 'pink', 'Grupo A'),
    (1, 'yellow', 'Grupo B'),
)
for cluster_id, colour, legend_label in cluster_styles:
    members = df_predict_val == cluster_id  # boolean mask of rows in this cluster
    plt.scatter(x[members, 0], x[members, 1], s=50, c=colour, label=legend_label)

plt.title('K Means Clustering', fontsize=20)
plt.legend()
# Explicit True: a bare plt.grid() toggles the grid, and this style already
# turns it on, so the bare call would switch it off.
plt.grid(True)
plt.show()
# Do not forget to shut down the H2O cluster when finished.
# `h2o.shutdown()` is a deprecated wrapper; the cluster handle is the supported API.
h2o.cluster().shutdown()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment