-
-
Save pb111/65dab4818f16ddb58bb6a18a3ba1785b to your computer and use it in GitHub Desktop.
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
1.
a.
# Load the mall-customers dataset from the working directory.
csv_path = 'Mall_Customers.csv'
data = pd.read_csv(csv_path)
b.
# Quick look at the raw data: first rows, schema, and summary statistics.
print("Premieres lignes du dataset:")
print(data.head())
print("\nStructure du dataset:")
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would emit a stray "None" line after the report.
data.info()
print("\nDescription statistique:")
print(data.describe())
c.
# The customer identifier carries no information for clustering: drop it.
data = data.drop(columns='CustomerID')

# Encode Gender numerically (Male -> 0, Female -> 1).
gender_codes = {'Male': 0, 'Female': 1}
data['Gender'] = data['Gender'].map(gender_codes)
d.
# Exploratory figure with three side-by-side panels:
#   1. distribution of Age (histogram + KDE),
#   2. boxplot of annual income,
#   3. Age vs annual income, coloured by (encoded) Gender.
plt.figure(figsize=(15, 5))

plt.subplot(1, 3, 1)
sns.histplot(data['Age'], kde=True)
# Double quotes are required here: the title contains an apostrophe, which
# would otherwise terminate a single-quoted literal and break parsing.
plt.title("Histogramme de l'Age")

plt.subplot(1, 3, 2)
sns.boxplot(y=data['Annual Income (k$)'])
plt.title('Boxplot du Revenu Annuel')

plt.subplot(1, 3, 3)
sns.scatterplot(x='Age', y='Annual Income (k$)', data=data, hue='Gender')
plt.title('Scatter Plot Age vs Revenu Annuel')

plt.tight_layout()
plt.show()
e.
# Standardize the three numeric features in place so that each contributes
# on a comparable scale to the distance computations.
numeric_columns = ['Age', 'Annual Income (k$)', 'Spending Score (1-100)']
scaler = StandardScaler()
data[numeric_columns] = scaler.fit_transform(data[numeric_columns])
2.
a.
# Sweep K = 1..10 and record, for each K, the model inertia (for the elbow
# plot) and, for K >= 2, the silhouette score.
inertia = []
silhouette_scores = []
K = range(1, 11)
for k in K:
    kmeans = KMeans(n_clusters=k, random_state=42)
    kmeans.fit(data[numeric_columns])
    inertia.append(kmeans.inertia_)
    # The silhouette score is skipped for K == 1, so silhouette_scores
    # lines up with K[1:], not with K.
    if k > 1:
        silhouette_scores.append(
            silhouette_score(data[numeric_columns], kmeans.labels_)
        )
b.
# Elbow method: inertia as a function of the number of clusters.
elbow_fig, elbow_ax = plt.subplots(figsize=(8, 5))
elbow_ax.plot(K, inertia, 'bo-')
elbow_ax.set_xlabel('Nombre de clusters (K)')
elbow_ax.set_ylabel('Inertie')
elbow_ax.set_title('Methode du Coude')
plt.show()
c.
# k-means++ is already the default `init` of sklearn's KMeans; it is passed
# explicitly here only to make the initialization strategy visible.
# NOTE: this estimator is not fitted at this point.
kmeans = KMeans(n_clusters=5, init='k-means++', random_state=42)
d.
# Silhouette score per K. Scores exist only for K >= 2, hence K[1:].
sil_fig, sil_ax = plt.subplots(figsize=(8, 5))
sil_ax.plot(K[1:], silhouette_scores, 'bo-')
sil_ax.set_xlabel('Nombre de clusters (K)')
sil_ax.set_ylabel('Score de Silhouette')
sil_ax.set_title('Score de Silhouette')
plt.show()
e.
# Number of clusters retained for the final model. The value is hard-coded
# (chosen by reading the elbow and silhouette plots), not computed.
optimal_k: int = 5
print(f"Valeur optimale de K: {optimal_k}")
f.
# Fit the final model with the retained K and attach each customer's
# cluster id to the DataFrame as a new 'Cluster' column.
kmeans = KMeans(n_clusters=optimal_k, init='k-means++', random_state=42)
data['Cluster'] = cluster_labels = kmeans.fit_predict(data[numeric_columns])
3.
a.
# Clusters in the (annual income, spending score) plane.
income_fig, income_ax = plt.subplots(figsize=(8, 5))
sns.scatterplot(
    data=data,
    x='Annual Income (k$)',
    y='Spending Score (1-100)',
    hue='Cluster',
    palette='viridis',
    ax=income_ax,
)
income_ax.set_title('Clients selon Revenu et Score de Depense')
plt.show()
c.
# Clusters in the (age, spending score) plane.
age_fig, age_ax = plt.subplots(figsize=(8, 5))
sns.scatterplot(
    data=data,
    x='Age',
    y='Spending Score (1-100)',
    hue='Cluster',
    palette='viridis',
    ax=age_ax,
)
age_ax.set_title('Clusters selon Age et Score de Depense')
plt.show()
4.
a.
# Per-cluster mean of every numeric feature. The values are in standardized
# units because the feature columns were scaled in place earlier.
by_cluster = data.groupby('Cluster')
cluster_summary = by_cluster[numeric_columns].mean()
print("\nCaracteristiques moyennes de chaque cluster (apres standardisation):")
print(cluster_summary)
b.
# Print the mean profile of each cluster and propose a segment name.
# The feature columns hold standardized values, so 0 is the overall mean:
# a negative value reads as "below average", a positive one as "above".
for i in range(optimal_k):
    cluster_data = data.loc[data['Cluster'] == i, numeric_columns].mean()
    print(f"\nSegment {i}:")
    print(f" - Age moyen: {cluster_data['Age']:.2f}, Revenu moyen: {cluster_data['Annual Income (k$)']:.2f}, Score de depense: {cluster_data['Spending Score (1-100)']:.2f}")

    is_young = cluster_data['Age'] < 0
    high_income = cluster_data['Annual Income (k$)'] > 0
    high_spending = cluster_data['Spending Score (1-100)'] > 0
    low_spending = cluster_data['Spending Score (1-100)'] < 0

    # "Jeunes depensiers" must be both young AND above-average spenders;
    # testing age alone would also tag young low-spending clusters.
    if is_young and high_spending:
        print(" Proposition: 'Jeunes depensiers'")
    elif high_income and low_spending:
        print(" Proposition: 'Clients a fort revenu mais faible depense'")
    else:
        print(" Proposition: A definir selon contexte")
c.
# Suggested marketing strategies, printed one per line.
strategy_lines = (
    "\nPossibles strategies:",
    "- Personnaliser les offres selon les caracteristiques de chaque cluster.",
    "- Cibler les 'Jeunes depensiers' avec des promotions attractives.",
    "- Encourager les 'Clients a fort revenu mais faible depense' a depenser davantage.",
)
for strategy_line in strategy_lines:
    print(strategy_line)
d.
# Closing remarks on the strengths and limits of K-Means, one per line.
commentary_lines = (
    "\nCommentaire sur K-Means:",
    "- K-Means est efficace pour segmenter les clients en groupes homogenes.",
    "- Cependant, il suppose des clusters spheriques et peut etre sensible aux initialisations.",
    "- La standardisation est cruciale ici due aux differentes echelles des variables.",
)
for commentary_line in commentary_lines:
    print(commentary_line)
Great work. Is there any other way (technique, algorithm, ...) to initialize K-Means?
Thanks,