Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save pb111/65dab4818f16ddb58bb6a18a3ba1785b to your computer and use it in GitHub Desktop.
Save pb111/65dab4818f16ddb58bb6a18a3ba1785b to your computer and use it in GitHub Desktop.
K-Means Clustering with Python and Scikit-Learn
Display the source blob
Display the rendered blob
Raw
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
@etranger3000
Copy link

Great work. Is there any other way (technique, algorithm, ...) to initialize K-Means?
Thanks,

@Ahmed-amine-thabet
Copy link

Ahmed-amine-thabet commented May 14, 2025

Import necessary libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

1.

a.

# Load the customer table from a CSV file in the current working directory.
data = pd.read_csv(
    'Mall_Customers.csv',
)

b.

# Quick inspection of the raw data: first rows, column structure and
# summary statistics.
print("Premieres lignes du dataset:")
print(data.head())
print("\nStructure du dataset:")
# DataFrame.info() writes its report itself and returns None, so wrapping
# it in print() would emit a stray "None" line after the report.
data.info()
print("\nDescription statistique:")
print(data.describe())

c.

# Remove the identifier column, then replace the gender labels with
# integer codes (values outside the mapping become NaN).
data = data.drop(columns='CustomerID')
data['Gender'] = data['Gender'].map(
    {
        'Male': 0,
        'Female': 1,
    }
)

d.

# Exploratory plots: three panels side by side on a single figure.
plt.figure(figsize=(15, 5))

# Panel 1: histogram of the Age column with a KDE overlay.
plt.subplot(1, 3, 1)
sns.histplot(data['Age'], kde=True)
# The title contains an apostrophe, so it must be delimited with double
# quotes; single quotes end the string early and raise a SyntaxError.
plt.title("Histogramme de l'Age")

# Panel 2: box plot of annual income.
plt.subplot(1, 3, 2)
sns.boxplot(y=data['Annual Income (k$)'])
plt.title('Boxplot du Revenu Annuel')

# Panel 3: age against annual income, coloured by the Gender column.
plt.subplot(1, 3, 3)
sns.scatterplot(x='Age', y='Annual Income (k$)', data=data, hue='Gender')
plt.title('Scatter Plot Age vs Revenu Annuel')

plt.tight_layout()
plt.show()

e.

# Columns fed to the clustering; they are standardised in place so that
# every later step works on the scaled values.
numeric_columns = ['Age', 'Annual Income (k$)', 'Spending Score (1-100)']
scaler = StandardScaler()
scaled_values = scaler.fit_transform(data[numeric_columns])
data[numeric_columns] = scaled_values

2.

a.

# Fit K-Means for each candidate number of clusters and record the two
# model-selection metrics used later: inertia (elbow method) and the
# silhouette score.
inertia = []
silhouette_scores = []
K = range(1, 11)

for k in K:
    # The loop body must be indented; the flattened original raised an
    # IndentationError.
    kmeans = KMeans(n_clusters=k, random_state=42)
    kmeans.fit(data[numeric_columns])
    inertia.append(kmeans.inertia_)
    # The silhouette score is only recorded for k > 1, so
    # silhouette_scores lines up with K[1:].
    if k > 1:
        silhouette_scores.append(
            silhouette_score(data[numeric_columns], kmeans.labels_)
        )

b.

# Elbow method: inertia as a function of the number of clusters.
plt.figure(figsize=(8, 5))
plt.title('Methode du Coude')
plt.ylabel('Inertie')
plt.xlabel('Nombre de clusters (K)')
plt.plot(K, inertia, 'bo-')
plt.show()

c.

(k-means++ is already the default initialization in scikit-learn's KMeans; it is passed explicitly here.)

# Estimator configured with k-means++ centroid initialisation and a fixed
# seed; it is only constructed here, not fitted.
kmeans = KMeans(
    n_clusters=5,
    init='k-means++',
    random_state=42,
)

d.

# Silhouette score per cluster count. K[1:] skips the first candidate,
# for which no score was recorded.
plt.figure(figsize=(8, 5))
plt.title('Score de Silhouette')
plt.ylabel('Score de Silhouette')
plt.xlabel('Nombre de clusters (K)')
plt.plot(K[1:], silhouette_scores, 'bo-')
plt.show()

e.

# Number of clusters retained for the final model. The value is fixed by
# hand as an example reading of the elbow and silhouette plots.
optimal_k = 5
print(
    f"Valeur optimale de K: {optimal_k}"
)

f.

# Fit the final model on the standardised features and attach each
# customer's cluster index to the table.
kmeans = KMeans(
    n_clusters=optimal_k,
    init='k-means++',
    random_state=42,
)
kmeans.fit(data[numeric_columns])
cluster_labels = kmeans.labels_
data['Cluster'] = cluster_labels

3.

a.

# Clusters in the (annual income, spending score) plane.
plt.figure(figsize=(8, 5))
sns.scatterplot(
    data=data,
    x='Annual Income (k$)',
    y='Spending Score (1-100)',
    hue='Cluster',
    palette='viridis',
)
plt.title('Clients selon Revenu et Score de Depense')
plt.show()

c.

# Clusters in the (age, spending score) plane.
plt.figure(figsize=(8, 5))
sns.scatterplot(
    data=data,
    x='Age',
    y='Spending Score (1-100)',
    hue='Cluster',
    palette='viridis',
)
plt.title('Clusters selon Age et Score de Depense')
plt.show()

4.

a.

# Per-cluster mean of each clustering feature. The features were
# standardised earlier, so the means are in standardised units.
features_by_cluster = data.groupby('Cluster')[numeric_columns]
cluster_summary = features_by_cluster.mean()
print("\nCaracteristiques moyennes de chaque cluster (apres standardisation):")
print(cluster_summary)

b.

# Print the mean profile of every cluster and propose a segment name.
# The features are standardised, so a mean below 0 is below the dataset
# average and a mean above 0 is above it.
for i in range(optimal_k):
    # The loop body must be indented; the flattened original raised an
    # IndentationError.
    cluster_data = data[data['Cluster'] == i][numeric_columns].mean()
    mean_age = cluster_data['Age']
    mean_income = cluster_data['Annual Income (k$)']
    mean_spending = cluster_data['Spending Score (1-100)']

    print(f"\nSegment {i}:")
    print(f" - Age moyen: {mean_age:.2f}, Revenu moyen: {mean_income:.2f}, Score de depense: {mean_spending:.2f}")

    # "Young spenders" needs both conditions. Testing age alone would
    # also give this label to young clusters with a low spending score.
    if mean_age < 0 and mean_spending > 0:
        print(" Proposition: 'Jeunes depensiers'")
    elif mean_income > 0 and mean_spending < 0:
        print(" Proposition: 'Clients a fort revenu mais faible depense'")
    else:
        print(" Proposition: A definir selon contexte")

c.

# Marketing suggestions, printed one per line.
strategy_lines = (
    "\nPossibles strategies:",
    "- Personnaliser les offres selon les caracteristiques de chaque cluster.",
    "- Cibler les 'Jeunes depensiers' avec des promotions attractives.",
    "- Encourager les 'Clients a fort revenu mais faible depense' a depenser davantage.",
)
for strategy_line in strategy_lines:
    print(strategy_line)

d.

# Closing remarks on the method, printed one per line.
commentary_lines = (
    "\nCommentaire sur K-Means:",
    "- K-Means est efficace pour segmenter les clients en groupes homogenes.",
    "- Cependant, il suppose des clusters spheriques et peut etre sensible aux initialisations.",
    "- La standardisation est cruciale ici due aux differentes echelles des variables.",
)
for commentary_line in commentary_lines:
    print(commentary_line)

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment