from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np
# Load the RFMT datamart from disk.
datamart_rfmt = pd.read_csv("data/chapter_4/datamart_rfmt.csv")

# Natural log of every cell; presumably done to reduce skew before
# clustering (NOTE(review): requires strictly positive values -- confirm).
datamart_rfmt_log = np.log(datamart_rfmt)

# Standardize each column of the logged data, then wrap the resulting array
# back into a DataFrame that carries the original index and column labels.
scaler = StandardScaler()
scaler.fit(datamart_rfmt_log)
datamart_rfmt_normalized = pd.DataFrame(
    data=scaler.transform(datamart_rfmt_log),
    index=datamart_rfmt.index,
    columns=datamart_rfmt.columns,
)
from sklearn.cluster import KMeans
# Elbow method: fit KMeans for k = 1..10 on the normalized data and record
# the fitted model's inertia_ (stored here as the SSE) for each k.
sse = {}
for k in range(1, 11):
    # random_state is fixed so repeated runs give the same result.
    kmeans = KMeans(n_clusters=k, random_state=1).fit(datamart_rfmt_normalized)
    sse[k] = kmeans.inertia_
import seaborn as sns
import matplotlib.pyplot as plt
# Plot SSE against the number of clusters k to look for the "elbow".
plt.title('The Elbow Method')
plt.xlabel('k')
plt.ylabel('SSE')
sns.pointplot(
    x=list(sse),           # the k values (dict keys, in insertion order)
    y=list(sse.values()),  # the SSE recorded for each k
)
plt.show()
# Four clusters (k = 4) looks like the best choice.
# Fit the final 4-cluster model on the normalized data (fit() returns the
# fitted estimator, so construction and fitting can be chained).
kmeans = KMeans(n_clusters=4, random_state=1).fit(datamart_rfmt_normalized)
cluster_labels = kmeans.labels_

# Attach each row's cluster label to a copy of the original
# (un-logged, un-scaled) datamart.
datamart_rfmt_k4 = datamart_rfmt.assign(Cluster=cluster_labels)

# Per-cluster summary: the mean of each RFMT column, rounded to one decimal.
# The 'count' on Tenure gives the number of rows per cluster (assuming
# Tenure has no missing values).
grouped = datamart_rfmt_k4.groupby(['Cluster'])
grouped.agg(
    {
        'Recency': 'mean',
        'Frequency': 'mean',
        'MonetaryValue': 'mean',
        'Tenure': ['mean', 'count'],
    }
).round(1)