1 Introduction

这里写我们大致思路。

2 Set Work Path

setwd("E:/sms_cases")  # 设定工作空间

3 Library All Packages

library(tidyverse)
library(plotly)
library(jiebaR)
library(wordcloud2)
library(caret)

4 Read Data

在读取数据的时候，我们使用了readr包中的read_csv()函数，这个函数是Rstudio首席科学家HadleyWickham的杰作之一。这个读取数据的函数主要有以下3个优点：

读取数据比基础包的read.csv()快十倍左右，并且显示读取进度
不会把字符串强制转换为因子类型
不会像基础包一样继承本地环境的一些行为，导致代码在其他电脑上不适用

sms <- read_csv("all_sms.csv")  # 读取数据

5 Explore Data

导入数据之后，下一步操作就行去简单的探索数据的基本结构，这样会为后续的分析打下基础（不同的任务要求不同类型的数据）。比如说在分类任务中，大部分模型要求目标变量是因子型，回归任务中要求目标变量是数值型等。

我们后续的任务是分类，经过简单的数据探索发现：目标变量是整型，然后将其转换为了因子型。

sms %>% str()  # 简单探索数据结构

## Classes 'tbl_df', 'tbl' and 'data.frame':    646620 obs. of  2 variables:
##  $ label  : int  0 0 0 0 1 0 1 0 0 1 ...
##  $ message: chr  "商业秘密的秘密性那是维系其商业价值和垄断地位的前提条件之一" "带给我们大常州一场壮观的视觉盛宴" "有原因不明的泌尿系统结石等" "23年从盐城拉回来的麻麻的嫁妆" ...
##  - attr(*, "spec")=List of 2
##   ..$ cols   :List of 2
##   .. ..$ label  : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ message: list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   ..$ default: list()
##   .. ..- attr(*, "class")= chr  "collector_guess" "collector"
##   ..- attr(*, "class")= chr "col_spec"

sms$label <- factor(sms$label)  # 将目标变量转换为因子型

6 Text Processing

经过上面的探索和转换，下一步就是进行nlp操作。这一部分主要涉及到了分词、去停词、分词再处理、词频统计和绘制词云图。

6.1 creat segment engine

engine <- worker(stop_word = "stopwords.txt")

6.2 segment & extract

## 对短信进行分词，并且对分词进行再处理
cut_all <-
  sms$message %>%
  plyr::llply(segment, engine) %>%                # 对短信进行分词
  lapply(str_replace_all, '[0-9a-zA-Z]', '') %>%  # 将数值和母替换为空字符串
  plyr::llply(function(x) x[nchar(x) > 1])        # 剔除长度小于1的字符串

## 封装函数去识别“character(0)”
is_zero <- function(x) { identical(x, character(0)) }

## 分别提取短信的最后文本处理结果
cut_ham  <- cut_all[which(sms$label == "0")] %>% .[!sapply(., is_zero)]
cut_spam <- cut_all[which(sms$label == "1")] %>% .[!sapply(., is_zero)]

6.3 wordfreq

## 统计正常短信的词频
word_freq_ham <-
  cut_ham %>%
  unlist() %>%               # 转换为向量
  as.tibble() %>%            # 转换为数据框
  rename(words = value) %>%  # 重命名列名
  group_by(words) %>%        # 按照不同的词就行分组
  summarise(freq = n()) %>%  # 统计上述分组的词的个数
  arrange(desc(freq))        # 降序排列

## 统计垃圾短信的词频
word_freq_spam <-
  cut_spam %>%
  unlist() %>%               # 转换为向量
  as.tibble() %>%            # 转换为数据框
  rename(words = value) %>%  # 重命名列名
  group_by(words) %>%        # 按照不同的词就行分组
  summarise(freq = n()) %>%  # 统计上述分组的词的个数
  arrange(desc(freq))        # 降序排列

7 wordcloud

wordcloud2(word_freq_ham[1:100, ], minRotation = -pi/2, maxRotation = -pi/2)

wordcloud2(word_freq_spam[1:100, ], minRotation = -pi/2, maxRotation = -pi/2)

8 Balance Data

在进行训练模型分类之前，我们需要把数据平衡一下。在现实生活中到处都是不平衡的事件：

欺诈问题中，欺诈类观测毕竟占总样本的少数
客户流失问题中，流失客户往往也是占少部分
会员对某波活动的响应问题，同样真正响应的也只是占很少的比例

如果你的数据存在不平衡的现象，分析得出的结论也一定是右偏的，往往分类结果会偏向于较多观测的类。我们的这个短信数据同样是不平衡的，垃圾短信：正常短信是9：1，如果不进行数据的平衡，那么分类结果会偏向于正常短信，那么就会导致很多垃圾短信不能成功识别。

set.seed(1)
sms_balance <- 
  sms %>%
  filter(label == "0") %>%                   # 筛选出所有的正常短信
  sample_n(unname(table(sms$label))[2]) %>%  # 随机抽取出与垃圾短信观测数相同的部分正常短信
  bind_rows(sms %>% filter(label == "1"))    # 将抽取出的正常短信与垃圾短信合并

9 Mutate Features

sms_model <- 
  sms_balance %>%
  mutate(message = str_replace_all(message, '[0-9a-zA-Z]', 'x'),
         msg_length = str_length(message),
         num_x = str_count(message, "[X|x]"),
         pct_x = str_count(message, "[X|x]") / msg_length,
         num_puncts = str_count(message, "[:punct:]"),
         pct_puncts = str_count(message, "[:punct:]") / msg_length,
         message = NULL)

10 EDA

10.1 msg_lenth

msg_num_x <- plot_ly(sms_model, 
                        x = ~msg_length, 
                        color = ~label, 
                        type = "histogram")
msg_num_x

msg_len_box <- plot_ly(sms_model, 
                       y = ~msg_length, 
                       color = ~label, 
                       type = "box")
msg_len_box

msg_len_summary <- 
  sms_model %>% 
  group_by(label) %>%
  summarise(len_median = median(msg_length),
            len_mean = mean(msg_length),
            len_q1 = quantile(msg_length, 0.25),
            len_q3 = quantile(msg_length, 0.75)) %>%
  select(-label) %>%
  t() %>%
  as_tibble()

msg_len_bar <- 
  plot_ly(x = c("Median", "Mean", "Q1", "Q3")) %>%
  add_bars(y = msg_len_summary[[1]], name = "label 0", 
           text = round(msg_len_summary[[1]], 0), textposition = "auto") %>%
  add_bars(y = msg_len_summary[[2]], name = "label 1",
           text = round(msg_len_summary[[2]], 0), textposition = "auto")
  
msg_len_bar

10.2 num_x

msg_num_x_hist <- plot_ly(sms_model, 
                         x = ~num_x, 
                         color = ~label, 
                         type = "histogram")
msg_num_x_hist

msg_num_x_box <- plot_ly(sms_model, 
                         y = ~num_x, 
                         color = ~label, 
                         type = "box")
msg_num_x_box

msg_num_x_summary <- 
  sms_model %>% 
  group_by(label) %>%
  summarise(num_x_median = median(num_x),
            num_x_mean = mean(num_x),
            num_x_q1 = quantile(num_x, 0.25),
            num_x_q3 = quantile(num_x, 0.75)) %>%
  select(-label) %>%
  t() %>%
  as_tibble()

msg_num_x_bar <- 
  plot_ly(x = c("Median", "Mean", "Q1", "Q3")) %>%
  add_bars(y = msg_num_x_summary[[1]], name = "label 0", 
           text = round(msg_num_x_summary[[1]], 0), textposition = "auto") %>%
  add_bars(y = msg_num_x_summary[[2]], name = "label 1",
           text = round(msg_num_x_summary[[2]], 0), textposition = "auto")
  
msg_num_x_bar

10.3 pct_x

msg_pct_x_hist <- plot_ly(sms_model, 
                         x = ~pct_x, 
                         color = ~label, 
                         type = "histogram")
msg_pct_x_hist

msg_pct_x_box <- plot_ly(sms_model, 
                         y = ~pct_x, 
                         color = ~label, 
                         type = "box")
msg_pct_x_box

msg_pct_x_summary <- 
  sms_model %>% 
  group_by(label) %>%
  summarise(pct_x_median = median(pct_x),
            pct_x_mean = mean(pct_x),
            pct_x_q1 = quantile(pct_x, 0.25),
            pct_x_q3 = quantile(pct_x, 0.75)) %>%
  select(-label) %>%
  t() %>%
  as_tibble()

msg_pct_x_bar <- 
  plot_ly(x = c("Median", "Mean", "Q1", "Q3")) %>%
  add_bars(y = msg_pct_x_summary[[1]], name = "label 0", 
           text = round(msg_pct_x_summary[[1]], 0), textposition = "auto") %>%
  add_bars(y = msg_pct_x_summary[[2]], name = "label 1",
           text = round(msg_pct_x_summary[[2]], 0), textposition = "auto")
  
msg_pct_x_bar

10.4 num_puncts

msg_num_puncts_hist <- plot_ly(sms_model, 
                         x = ~num_puncts, 
                         color = ~label, 
                         type = "histogram")
msg_num_puncts_hist

msg_num_puncts_box <- plot_ly(sms_model, 
                         y = ~num_puncts, 
                         color = ~label, 
                         type = "box")
msg_num_puncts_box

msg_num_puncts_summary <- 
  sms_model %>% 
  group_by(label) %>%
  summarise(num_puncts_median = median(num_puncts),
            num_puncts_mean = mean(num_puncts),
            num_puncts_q1 = quantile(num_puncts, 0.25),
            num_puncts_q3 = quantile(num_puncts, 0.75)) %>%
  select(-label) %>%
  t() %>%
  as_tibble()

msg_num_puncts_bar <- 
  plot_ly(x = c("Median", "Mean", "Q1", "Q3")) %>%
  add_bars(y = msg_num_puncts_summary[[1]], name = "label 0", 
           text = round(msg_num_puncts_summary[[1]], 0), textposition = "auto") %>%
  add_bars(y = msg_num_puncts_summary[[2]], name = "label 1",
           text = round(msg_num_puncts_summary[[2]], 0), textposition = "auto")
  
msg_num_puncts_bar

10.5 pct_puncts

msg_pct_puncts_hist <- plot_ly(sms_model, 
                         x = ~pct_puncts, 
                         color = ~label, 
                         type = "histogram")
msg_pct_puncts_hist

msg_pct_puncts_box <- plot_ly(sms_model, 
                         y = ~pct_puncts, 
                         color = ~label, 
                         type = "box")
msg_pct_puncts_box

msg_pct_puncts_summary <- 
  sms_model %>% 
  group_by(label) %>%
  summarise(pct_puncts_median = median(pct_puncts),
            pct_puncts_mean = mean(pct_puncts),
            pct_puncts_q1 = quantile(pct_puncts, 0.25),
            pct_puncts_q3 = quantile(pct_puncts, 0.75)) %>%
  select(-label) %>%
  t() %>%
  as_tibble()

msg_pct_puncts_bar <- 
  plot_ly(x = c("Median", "Mean", "Q1", "Q3")) %>%
  add_bars(y = msg_pct_puncts_summary[[1]], name = "label 0", 
           text = round(msg_pct_puncts_summary[[1]], 0), textposition = "auto") %>%
  add_bars(y = msg_pct_puncts_summary[[2]], name = "label 1",
           text = round(msg_pct_puncts_summary[[2]], 0), textposition = "auto")
  
msg_pct_puncts_bar

11 Stratified Sampling

set.seed(1)
idx <- createDataPartition(sms_model$label, p = 0.7, list = F)
train_set <- sms_model[idx, ]
test_set  <- sms_model[-idx, ]

12 Training Model

12.1 rpart

ctrl <- trainControl(method = "cv", number = 5, selectionFunction = "oneSE")

set.seed(1)
mod_rpart <- train(label ~ ., 
                   data = train_set, 
                   method = "rpart", 
                   trControl = ctrl
                   )
mod_rpart

## CART 
## 
## 89806 samples
##     5 predictor
##     2 classes: '0', '1' 
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 71845, 71844, 71846, 71845, 71844 
## Resampling results across tuning parameters:
## 
##   cp           Accuracy   Kappa    
##   0.002215888  0.9904573  0.9809146
##   0.002427455  0.9882303  0.9764605
##   0.974634212  0.6942211  0.3884645
## 
## Accuracy was used to select the optimal model using  the one SE rule.
## The final value used for the model was cp = 0.002215888.

pred_rpart <- predict(mod_rpart, test_set[-1])

confusionMatrix(pred_rpart, test_set$label)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction     0     1
##          0 19179   350
##          1    65 18894
##                                           
##                Accuracy : 0.9892          
##                  95% CI : (0.9881, 0.9902)
##     No Information Rate : 0.5             
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.9784          
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.9966          
##             Specificity : 0.9818          
##          Pos Pred Value : 0.9821          
##          Neg Pred Value : 0.9966          
##              Prevalence : 0.5000          
##          Detection Rate : 0.4983          
##    Detection Prevalence : 0.5074          
##       Balanced Accuracy : 0.9892          
##                                           
##        'Positive' Class : 0               
##

12.2 naiveBayes

ctrl <- trainControl(method = "cv", number = 5, selectionFunction = "oneSE")

set.seed(1)
mod_nb <- train(label ~ ., 
                data = train_set, 
                method = "nb", 
                trControl = ctrl
                )
mod_nb

## Naive Bayes 
## 
## 89806 samples
##     5 predictor
##     2 classes: '0', '1' 
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 71845, 71844, 71846, 71845, 71844 
## Resampling results across tuning parameters:
## 
##   usekernel  Accuracy   Kappa    
##   FALSE      0.9814601  0.9629202
##    TRUE      0.9916376  0.9832751
## 
## Tuning parameter 'fL' was held constant at a value of 0
## Tuning
##  parameter 'adjust' was held constant at a value of 1
## Accuracy was used to select the optimal model using  the one SE rule.
## The final values used for the model were fL = 0, usekernel = TRUE
##  and adjust = 1.

pred_nb <- predict(mod_nb, test_set[-1])

confusionMatrix(pred_nb, test_set$label)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction     0     1
##          0 19159   199
##          1    85 19045
##                                           
##                Accuracy : 0.9926          
##                  95% CI : (0.9917, 0.9935)
##     No Information Rate : 0.5             
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.9852          
##  Mcnemar's Test P-Value : 2.01e-11        
##                                           
##             Sensitivity : 0.9956          
##             Specificity : 0.9897          
##          Pos Pred Value : 0.9897          
##          Neg Pred Value : 0.9956          
##              Prevalence : 0.5000          
##          Detection Rate : 0.4978          
##    Detection Prevalence : 0.5030          
##       Balanced Accuracy : 0.9926          
##                                           
##        'Positive' Class : 0               
##

12.3 svm

ctrl <- trainControl(method = "cv", number = 3, selectionFunction = "oneSE")

set.seed(1)
mod_svm <- train(label ~ ., 
                   data = train_set, 
                   method = "svmRadial", 
                   trControl = ctrl
                   )
mod_svm

## Support Vector Machines with Radial Basis Function Kernel 
## 
## 89806 samples
##     5 predictor
##     2 classes: '0', '1' 
## 
## No pre-processing
## Resampling: Cross-Validated (3 fold) 
## Summary of sample sizes: 59870, 59872, 59870 
## Resampling results across tuning parameters:
## 
##   C     Accuracy   Kappa    
##   0.25  0.9932855  0.9865711
##   0.50  0.9935639  0.9871278
##   1.00  0.9935973  0.9871946
## 
## Tuning parameter 'sigma' was held constant at a value of 1.119024
## Accuracy was used to select the optimal model using  the one SE rule.
## The final values used for the model were sigma = 1.119024 and C = 0.5.

pred_svm <- predict(mod_svm, test_set[-1])

confusionMatrix(pred_svm, test_set$label)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction     0     1
##          0 19103    79
##          1   141 19165
##                                          
##                Accuracy : 0.9943         
##                  95% CI : (0.9935, 0.995)
##     No Information Rate : 0.5            
##     P-Value [Acc > NIR] : < 2.2e-16      
##                                          
##                   Kappa : 0.9886         
##  Mcnemar's Test P-Value : 3.912e-05      
##                                          
##             Sensitivity : 0.9927         
##             Specificity : 0.9959         
##          Pos Pred Value : 0.9959         
##          Neg Pred Value : 0.9927         
##              Prevalence : 0.5000         
##          Detection Rate : 0.4963         
##    Detection Prevalence : 0.4984         
##       Balanced Accuracy : 0.9943         
##                                          
##        'Positive' Class : 0              
##

12.4 randomforest

library(randomForest)

set.seed(1)
mod_rf <- randomForest(x = train_set[-1], 
                       y = train_set$label, 
                       ntree = 100, 
                       importance = T)

pred_rf <- predict(mod_rf, test_set[-1])

confusionMatrix(pred_rf, test_set$label)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction     0     1
##          0 19119    98
##          1   125 19146
##                                           
##                Accuracy : 0.9942          
##                  95% CI : (0.9934, 0.9949)
##     No Information Rate : 0.5             
##     P-Value [Acc > NIR] : < 2e-16         
##                                           
##                   Kappa : 0.9884          
##  Mcnemar's Test P-Value : 0.08167         
##                                           
##             Sensitivity : 0.9935          
##             Specificity : 0.9949          
##          Pos Pred Value : 0.9949          
##          Neg Pred Value : 0.9935          
##              Prevalence : 0.5000          
##          Detection Rate : 0.4968          
##    Detection Prevalence : 0.4993          
##       Balanced Accuracy : 0.9942          
##                                           
##        'Positive' Class : 0               
##

## 构建一个数据框--包含变量的重要性对应值
(importance2 <- importance(mod_rf))

##                    0         1 MeanDecreaseAccuracy MeanDecreaseGini
## msg_length  7.148187 17.274667            15.806924       13941.3334
## num_x       5.729274  9.186615             9.888173        2412.2439
## pct_x       9.712275 17.497482            17.705742         531.8948
## num_puncts 11.615470 13.837237            17.244250       19153.7126
## pct_puncts 10.796521  5.987960            11.248701        8441.3769

var_importance2 <- 
  data.frame(Variables = row.names(importance2), 
            Importance = round(importance2[, 'MeanDecreaseGini'], 2)) %>% 
  arrange(desc(Importance))

# 可视化变量重要性
ggplot(var_importance2, 
       aes(x = reorder(Variables, Importance), y = Importance, fill = Variables)) +
  geom_bar(stat = 'identity') +
  labs(x = 'Variables') +
  coord_flip() +
  theme_minimal() +
  guides(fill = 'none')

12.5 randomforest_new1

set.seed(1)
mod_rf_new1 <- randomForest(x = train_set %>% dplyr::select(num_puncts), 
                            y = train_set$label, 
                            ntree = 100)

pred_rf_new1 <- predict(mod_rf_new1, test_set[-1])

confusionMatrix(pred_rf_new1, test_set$label)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction     0     1
##          0 19116   347
##          1   128 18897
##                                           
##                Accuracy : 0.9877          
##                  95% CI : (0.9865, 0.9887)
##     No Information Rate : 0.5             
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.9753          
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.9933          
##             Specificity : 0.9820          
##          Pos Pred Value : 0.9822          
##          Neg Pred Value : 0.9933          
##              Prevalence : 0.5000          
##          Detection Rate : 0.4967          
##    Detection Prevalence : 0.5057          
##       Balanced Accuracy : 0.9877          
##                                           
##        'Positive' Class : 0               
##

12.6 randomforest_new2

set.seed(1)
mod_rf_new2 <- randomForest(x = train_set %>% 
                              dplyr::select(num_puncts, msg_length), 
                            y = train_set$label, 
                            ntree = 100)

pred_rf_new2 <- predict(mod_rf_new2, test_set[-1])

confusionMatrix(pred_rf_new2, test_set$label)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction     0     1
##          0 19094   207
##          1   150 19037
##                                           
##                Accuracy : 0.9907          
##                  95% CI : (0.9897, 0.9917)
##     No Information Rate : 0.5             
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.9814          
##  Mcnemar's Test P-Value : 0.003038        
##                                           
##             Sensitivity : 0.9922          
##             Specificity : 0.9892          
##          Pos Pred Value : 0.9893          
##          Neg Pred Value : 0.9922          
##              Prevalence : 0.5000          
##          Detection Rate : 0.4961          
##    Detection Prevalence : 0.5015          
##       Balanced Accuracy : 0.9907          
##                                           
##        'Positive' Class : 0               
##

12.7 randomforest_new3

set.seed(1)
mod_rf_new3 <- randomForest(x = train_set %>% 
                              dplyr::select(num_puncts, msg_length, pct_puncts), 
                            y = train_set$label, 
                            ntree = 100)

pred_rf_new3 <- predict(mod_rf_new3, test_set[-1])

confusionMatrix(pred_rf_new3, test_set$label)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction     0     1
##          0 19088   201
##          1   156 19043
##                                           
##                Accuracy : 0.9907          
##                  95% CI : (0.9897, 0.9917)
##     No Information Rate : 0.5             
##     P-Value [Acc > NIR] : < 2e-16         
##                                           
##                   Kappa : 0.9814          
##  Mcnemar's Test P-Value : 0.01987         
##                                           
##             Sensitivity : 0.9919          
##             Specificity : 0.9896          
##          Pos Pred Value : 0.9896          
##          Neg Pred Value : 0.9919          
##              Prevalence : 0.5000          
##          Detection Rate : 0.4959          
##    Detection Prevalence : 0.5012          
##       Balanced Accuracy : 0.9907          
##                                           
##        'Positive' Class : 0               
##

12.8 randomforest_new4

set.seed(1)
mod_rf_new4 <- randomForest(x = train_set %>% 
                              dplyr::select(num_puncts, pct_puncts), 
                            y = train_set$label, 
                            ntree = 100)

pred_rf_new4 <- predict(mod_rf_new4, test_set[-1])

confusionMatrix(pred_rf_new4, test_set$label)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction     0     1
##          0 19113   239
##          1   131 19005
##                                           
##                Accuracy : 0.9904          
##                  95% CI : (0.9894, 0.9913)
##     No Information Rate : 0.5             
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.9808          
##  Mcnemar's Test P-Value : 2.657e-08       
##                                           
##             Sensitivity : 0.9932          
##             Specificity : 0.9876          
##          Pos Pred Value : 0.9876          
##          Neg Pred Value : 0.9932          
##              Prevalence : 0.5000          
##          Detection Rate : 0.4966          
##    Detection Prevalence : 0.5028          
##       Balanced Accuracy : 0.9904          
##                                           
##        'Positive' Class : 0               
##

ham or spam

Xiaosong Jin

2018-05-05