Created
August 10, 2020 01:45
-
-
Save adraguidev/1d12cdbda6f480e03796cf82c3145e7b to your computer and use it in GitHub Desktop.
Metodo CLARANS
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Import the libraries
from pyclustering.cluster.clarans import clarans; | |
from pyclustering.utils import timedcall; | |
import pyreadstat | |
# Load the SPSS (.sav) data set.
filesav = 'datos/democracias_latam.sav'
df, meta = pyreadstat.read_sav(filesav)
df.head(25)  # notebook preview of the first 25 rows

# Feature matrix: every column from position 4 onward, as an array.
X = df.iloc[:, 4:].values

# Quick look at the first three rows of the feature matrix.
print("Un vistazo al conjunto de datos : ", X[:3])

## CLARANS clustering
# np.random.seed(2020)
clarans_instance = clarans(
    X,   # data
    3,   # number of clusters
    10,  # number of iterations
    4,   # maximum number of neighbours
)

# Run the algorithm's `process` method through timedcall, which hands back
# a (ticks, result) pair; ticks is reported below as the elapsed time.
(ticks, result) = timedcall(clarans_instance.process)
print("Tiempo : ", ticks, "\n")

# Clusters produced by the run.
clusters = clarans_instance.get_clusters()
clusters

# Medoids of those clusters.
medoids = clarans_instance.get_medoids()
import pandas as pd

# Build df_x: the rows of df, each tagged with the position (0, 1, ...) of
# the CLARANS cluster that contains it.

# The placeholder column does not depend on the cluster being processed,
# so it is created once instead of on every loop iteration.
df["cluster"] = None

# Each entry of `clusters` is used as a list of row labels of df.
# .assign() returns a new, labelled frame, so nothing is written into a
# slice of df, and one concat replaces growing df_x from an empty frame.
df_x = pd.concat(
    [
        df.loc[members].assign(cluster=label)
        for label, members in enumerate(clusters)
    ]
)

# Restore the original row order.
df_x = df_x.sort_index()
# Silhouette coefficient of the clustering stored in df_x["cluster"].
from sklearn.metrics import euclidean_distances, silhouette_score

score = silhouette_score(X, labels=df_x["cluster"].values)
score
# Silhouette coefficient for several values of k.
from sklearn.metrics import euclidean_distances, silhouette_score
import matplotlib.pyplot as plt
from pyclustering.cluster.clarans import clarans
from pyclustering.utils import timedcall

# Single definition of the candidate k values, shared by the loop and the
# plot so the x axis cannot drift out of step with the collected scores.
k_values = range(2, 11)

# Placeholder column, created once (it does not depend on k or the cluster).
df["cluster"] = None

sil = []
for n_groups in k_values:
    clarans_instance = clarans(X, n_groups, 30, 4)
    # Run the algorithm's `process` method through timedcall.
    (ticks, result) = timedcall(clarans_instance.process)
    clusters = clarans_instance.get_clusters()

    # Tag every row with the position of its cluster, then restore row order.
    # Each entry of `clusters` is used as a list of row labels of df.
    df_x = pd.concat(
        [
            df.loc[members].assign(cluster=label)
            for label, members in enumerate(clusters)
        ]
    ).sort_index()

    score = round(silhouette_score(X, df_x["cluster"].values), 2)
    sil.append(score)

plt.plot(k_values, sil)
plt.ylabel("Silouette")
plt.xlabel("k")
plt.title("Silouette para data clientes")
plt.show()
sil
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# CLARANS run with 2 clusters (remaining arguments: 6 and 4).
clarans_instance = clarans(X, 2, 6, 4)

# Run the algorithm through timedcall and report the ticks it returns.
(ticks, result) = timedcall(clarans_instance.process)
print("Tiempo : ", ticks, "\n")

# Clusters found by this run, and how many there are.
clusters = clarans_instance.get_clusters()
clusters
len(clusters)
import pandas as pd

# Build df_x: the rows of df, each tagged with the position (0, 1, ...) of
# the cluster that contains it.

# The placeholder column does not depend on the cluster being processed,
# so it is created once instead of on every loop iteration.
df["cluster"] = None

# Each entry of `clusters` is used as a list of row labels of df.
# .assign() returns a new, labelled frame, so nothing is written into a
# slice of df, and one concat replaces growing df_x from an empty frame.
df_x = pd.concat(
    [
        df.loc[members].assign(cluster=label)
        for label, members in enumerate(clusters)
    ]
)

# Restore the original row order.
df_x = df_x.sort_index()
df_x
import numpy as np | |
import pandas as pd | |
def obt_metrics(data_new, n_clusters, n_columns, metric):
    """Collect one ``describe()`` statistic per cluster into a summary table.

    Args:
        data_new: DataFrame whose first ``n_columns`` columns are the
            variables to summarise and which has a ``cluster`` column
            holding the labels ``0 .. n_clusters - 1``.
        n_clusters: Number of clusters to report (labels 0 to n_clusters - 1).
        n_columns: Number of leading columns to include in the summary.
        metric: Row position in the ``DataFrame.describe()`` output
            (1 = mean, 2 = standard deviation).

    Returns:
        DataFrame with one row per cluster, indexed by cluster label. It
        holds the selected statistic for each of the first ``n_columns``
        columns plus a ``Cluster`` column reading ``"Cluster :<label>"``.

    Note:
        The values come from the first ``n_columns`` columns that
        ``describe()`` reports, so the leading columns of ``data_new`` are
        expected to be ones it summarises; otherwise the row length will
        not match the output columns.
    """
    # Names of the summarised variables become the output columns.
    str_columns = data_new.iloc[:, 0:n_columns].columns
    data_final = pd.DataFrame(columns=str_columns)
    data_final['Cluster'] = "Cluster :"  # create the (still empty) label column

    for i in range(n_clusters):
        # Rows belonging to the current cluster.
        cluster_rows = data_new[data_new.cluster == i]
        # describe() statistics, restricted to the first n_columns columns.
        stats = cluster_rows.describe().iloc[:, 0:n_columns]
        # Select the statistic explicitly by position; a plain [0] on the
        # label-indexed column relies on a deprecated positional fallback.
        list_row = stats.iloc[metric].tolist()
        list_row.append("Cluster :" + str(i))
        # Index the row by its own cluster label (the original used i - 1,
        # which put cluster 0 on index -1).
        data_final.loc[i] = list_row
    return data_final
# Keep the columns at positions 4 to 9 of the labelled frame.
data_varimp = df_x.iloc[:, 4:10]

# Arguments: data frame, number of clusters, number of columns,
# statistic (1 = mean, 2 = standard deviation).
data_final = obt_metrics(
    data_new=data_varimp,
    n_clusters=len(clusters),
    n_columns=5,
    metric=1,
)
data_final

# Same summary, this time with the standard deviation.
data_final2 = obt_metrics(
    data_new=data_varimp,
    n_clusters=len(clusters),
    n_columns=5,
    metric=2,
)
data_final2
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment