In [7]:
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np
In [8]:
# Load the RFMT datamart from the chapter-4 data folder (path is relative to
# the notebook's working directory).
# NOTE(review): no index_col is passed, so every column in the CSV is treated
# as data and flows into the log transform / scaling / clustering below --
# confirm the file contains only the feature columns (no customer-id column).
datamart_rfmt = pd.read_csv(
    filepath_or_buffer="data/chapter_4/datamart_rfmt.csv",
)
In [10]:
# Element-wise natural log of every cell in the datamart (presumably to reduce
# skew before standardizing -- confirm with the analysis notes).
# np.log yields -inf for 0 and NaN for negative inputs, so all values are
# expected to be strictly positive here.
datamart_rfmt_log = datamart_rfmt.pipe(np.log)
In [15]:
# Standardize the log-transformed columns with StandardScaler (zero mean, unit
# variance per column by default).
scaler = StandardScaler()

# fit_transform hands back a bare array, so re-wrap it in a DataFrame that
# carries the original datamart's row index and column names.
datamart_rfmt_normalized = pd.DataFrame(
    scaler.fit_transform(datamart_rfmt_log),
    index=datamart_rfmt.index,
    columns=datamart_rfmt.columns,
)
In [16]:
from sklearn.cluster import KMeans
In [20]:
# Elbow sweep: fit k-means for k = 1..10 on the normalized data and record
# each model's inertia (sum of squared distances to the nearest centroid).
sse = {}
for k in range(1, 11):
    # Fixed random_state keeps the centroid initialization repeatable.
    kmeans = KMeans(n_clusters=k, random_state=1)
    kmeans.fit(datamart_rfmt_normalized)
    sse[k] = kmeans.inertia_
In [21]:
import seaborn as sns
import matplotlib.pyplot as plt
In [22]:
# Plot SSE against the number of clusters to look for the "elbow".
plt.title('The Elbow Method')
plt.xlabel('k')
plt.ylabel('SSE')

# Iterating a dict yields its keys, so list(sse) is the list of k values.
sns.pointplot(x=list(sse), y=list(sse.values()))
plt.show()

Based on the elbow plot above, k = 4 is the best choice for the number of clusters.

In [24]:
# Final model: 4 clusters, same random_state as the elbow sweep.
# fit() returns the fitted estimator, so construction and fitting can be chained.
kmeans = KMeans(n_clusters=4, random_state=1).fit(datamart_rfmt_normalized)

# One cluster label per row of the normalized datamart.
cluster_labels = kmeans.labels_
In [27]:
# Attach the cluster labels to the raw (untransformed) datamart so the summary
# below is expressed in the original units rather than log/standardized ones.
datamart_rfmt_k4 = datamart_rfmt.assign(Cluster=cluster_labels)
grouped = datamart_rfmt_k4.groupby(['Cluster'])

# Per-cluster profile: mean of each RFMT column, plus the count of Tenure
# values, which serves as the cluster size. Rounded to one decimal for display.
grouped.agg(
    dict(
        Recency='mean',
        Frequency='mean',
        MonetaryValue='mean',
        Tenure=['mean', 'count'],
    )
).round(1)
Out[27]:
Recency Frequency MonetaryValue Tenure
mean mean mean mean count
Cluster
0 26.4 42.9 887.6 274.6 1188
1 30.1 7.5 101.8 40.1 656
2 140.2 7.6 151.1 216.6 878
3 168.5 6.2 104.9 227.6 921