# NOTE(review): in the visible source, `data` is first assigned by the
# pd.read_csv call further down; presumably this snippet previews the calls
# that are run there -- confirm before executing it in this position.
# Print the average value of each variable in the dataset
print(data.mean())
# Print the standard deviation of each variable in the dataset
print(data.std())
# Print the key summary statistics of the dataset
print(data.describe())
import pandas as pd
# Load the RFM datamart; the path is relative to the current working directory.
data = pd.read_csv("data/chapter_3/rfm_datamart.csv")
# Print, in this order: the column means, the column standard deviations,
# and the describe() summary table.
for stat_name in ("mean", "std", "describe"):
    print(getattr(data, stat_name)())
As shown, each of these pandas methods computes its statistic for every variable (column) in the dataset.
Next, we will check this dataset for skewness.
import seaborn as sns
import matplotlib.pyplot as plt
# Draw the distribution of each RFM variable in its own row of a 3x1 grid
# so the skewness of each one can be inspected visually.
# sns.histplot(kde=True, stat="density") replaces sns.distplot, which seaborn
# has deprecated; kde_kws=dict(cut=3) keeps the KDE tails as distplot drew them.
for row, column in enumerate(['Recency', 'Frequency', 'MonetaryValue'], start=1):
    plt.subplot(3, 1, row)
    sns.histplot(data[column], kde=True, stat="density", kde_kws=dict(cut=3))
plt.show()
All three variables are clearly right-skewed, which is common in FinTech datasets. A log transformation can reduce this skew.
import numpy as np
# Apply the natural logarithm element-wise to every column of the dataset.
# NOTE(review): np.log yields -inf for 0 and NaN for negative inputs --
# confirm that every value in `data` is strictly positive.
data_log = np.log(data)
# Re-draw the three distributions from the log-transformed data.
# sns.histplot(kde=True, stat="density") replaces sns.distplot, which seaborn
# has deprecated; kde_kws=dict(cut=3) keeps the KDE tails as distplot drew them.
for row, column in enumerate(['Recency', 'Frequency', 'MonetaryValue'], start=1):
    plt.subplot(3, 1, row)
    sns.histplot(data_log[column], kde=True, stat="density", kde_kws=dict(cut=3))
plt.show()
The distributions look better, but some weaknesses remain. Let's normalize the data.
# Standardize every column: subtract the column mean, then divide by the
# column standard deviation (pandas' .std(), i.e. the sample estimate).
data_normalized = data_log.sub(data_log.mean()).div(data_log.std())
# Summary statistics rounded to two decimals. The bare expression is only
# displayed in an interactive/notebook session; nothing is printed in a script.
data_normalized.describe().round(2)
describe() is a good way to check the mean, the standard deviation, and (via the 25% and 75% quartiles) the skewness.
Alternatively, we can do the normalization much more quickly with scikit-learn.
# Shell command (run in a terminal, not in the Python interpreter): installs scikit-learn with conda.
conda install scikit-learn
from sklearn.preprocessing import StandardScaler
# Standardize the log-transformed data with scikit-learn's StandardScaler.
# StandardScaler divides by the population standard deviation (ddof=0), while
# pandas' .std() defaults to ddof=1, so the values can differ slightly from
# the manual (x - mean) / std computation.
scaler = StandardScaler()
scaler.fit(data_log)
# transform() returns a plain array, so restore the original row index and
# column labels by wrapping the result in a DataFrame.
data_normalized = pd.DataFrame(
    scaler.transform(data_log),
    index=data.index,
    columns=data.columns,
)
# Summary statistics rounded to two decimals. The bare expression is only
# displayed in an interactive/notebook session; nothing is printed in a script.
data_normalized.describe().round(2)
# Draw the distribution of each normalized variable in its own row of a 3x1
# grid to check how much skewness is left after the transformation.
# sns.histplot(kde=True, stat="density") replaces sns.distplot, which seaborn
# has deprecated; kde_kws=dict(cut=3) keeps the KDE tails as distplot drew them.
for row, column in enumerate(['Recency', 'Frequency', 'MonetaryValue'], start=1):
    plt.subplot(3, 1, row)
    sns.histplot(data_normalized[column], kde=True, stat="density", kde_kws=dict(cut=3))
plt.show()
Here, some skewness (both positive and negative) remains in this dataset.