Here is the general idea of our approach: set up the workspace, load the packages, explore and balance the SMS data, engineer features from the message text, and then train and compare several classifiers.
setwd("E:/sms_cases") # 设定工作空间
library(tidyverse)
library(plotly)
library(jiebaR)
library(wordcloud2)
library(caret)
To read the data we use read_csv() from the readr package, one of the creations of RStudio chief scientist Hadley Wickham. Compared with base R's read.csv(), its main advantages are that it is roughly ten times faster (and displays a progress bar while reading), it returns a tibble, and it never converts character columns to factors.
sms <- read_csv("all_sms.csv") # read the data
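If the schema is known ahead of time, the type guessing can be skipped entirely by declaring the columns explicitly; a minimal sketch, with the column names taken from the str() output below:
sms <- read_csv("all_sms.csv",
                col_types = cols(label = col_integer(),     # declare types up front
                                 message = col_character())) # no guessing, reproducible import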
After importing the data, the next step is a quick exploration of its basic structure, which lays the groundwork for the later analysis (different tasks require different types of data). In classification tasks, for instance, most models require the target variable to be a factor, while regression tasks require a numeric target.
Our task here is classification, and the quick exploration below shows that the target variable is an integer, so we convert it to a factor.
sms %>% str() # a quick look at the data structure
## Classes 'tbl_df', 'tbl' and 'data.frame': 646620 obs. of 2 variables:
## $ label : int 0 0 0 0 1 0 1 0 0 1 ...
## $ message: chr "商业秘密的秘密性那是维系其商业价值和垄断地位的前提条件之一" "带给我们大常州一场壮观的视觉盛宴" "有原因不明的泌尿系统结石等" "23年从盐城拉回来的麻麻的嫁妆" ...
## - attr(*, "spec")=List of 2
## ..$ cols :List of 2
## .. ..$ label : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ message: list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## ..$ default: list()
## .. ..- attr(*, "class")= chr "collector_guess" "collector"
## ..- attr(*, "class")= chr "col_spec"
sms$label <- factor(sms$label) # convert the target variable to a factor
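It is also worth tallying the two classes at this point, since the counts motivate the balancing step later on; a quick check (output omitted):
table(sms$label) # ham (0) vs. spam (1): roughly a 9:1 split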
With the exploration and conversion done, the next step is the NLP work. This part covers word segmentation, stop-word removal, post-processing of the tokens, word-frequency statistics, and drawing word clouds.
engine <- worker(stop_word = "stopwords.txt") # jiebaR segmentation engine with a custom stop-word list
## segment the messages, then post-process the tokens
cut_all <-
  sms$message %>%
  plyr::llply(segment, engine) %>% # segment each message
  lapply(str_replace_all, '[0-9a-zA-Z]', '') %>% # replace digits and letters with the empty string
  plyr::llply(function(x) x[nchar(x) > 1]) # keep only tokens longer than one character
## helper to detect empty results, i.e. character(0)
is_zero <- function(x) { identical(x, character(0)) }
## extract the final processed tokens for ham and spam messages separately
cut_ham <- cut_all[which(sms$label == "0")] %>% .[!sapply(., is_zero)]
cut_spam <- cut_all[which(sms$label == "1")] %>% .[!sapply(., is_zero)]
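As a sanity check on the segmentation pipeline, we can peek at the first couple of processed messages; for example:
cut_ham[1:2] # inspect the first two tokenized ham messages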
## word frequencies for ham messages
word_freq_ham <-
  cut_ham %>%
  unlist() %>% # flatten to a character vector
  as_tibble() %>% # convert to a tibble
  rename(words = value) %>% # rename the column
  group_by(words) %>% # group by word
  summarise(freq = n()) %>% # count how often each word occurs
  arrange(desc(freq)) # sort in descending order of frequency
## word frequencies for spam messages
word_freq_spam <-
  cut_spam %>%
  unlist() %>% # flatten to a character vector
  as_tibble() %>% # convert to a tibble
  rename(words = value) %>% # rename the column
  group_by(words) %>% # group by word
  summarise(freq = n()) %>% # count how often each word occurs
  arrange(desc(freq)) # sort in descending order of frequency
wordcloud2(word_freq_ham[1:100, ], minRotation = -pi/2, maxRotation = -pi/2) # top 100 ham words
wordcloud2(word_freq_spam[1:100, ], minRotation = -pi/2, maxRotation = -pi/2) # top 100 spam words
Before training any classification model, we need to balance the data. Imbalance is everywhere in real life, and if your data are imbalanced, the conclusions drawn from them will be biased as well: classification results tend to favor the class with more observations. Our SMS data are likewise imbalanced, with ham outnumbering spam by roughly 9:1; without balancing, the classifier would lean toward predicting ham, and many spam messages would go undetected.
set.seed(1)
sms_balance <-
sms %>%
filter(label == "0") %>% # 筛选出所有的正常短信
sample_n(unname(table(sms$label))[2]) %>% # 随机抽取出与垃圾短信观测数相同的部分正常短信
bind_rows(sms %>% filter(label == "1")) # 将抽取出的正常短信与垃圾短信合并
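A quick tally confirms that the two classes are now the same size; for example:
table(sms_balance$label) # ham and spam counts should now be equal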
sms_model <-
  sms_balance %>%
  mutate(message = str_replace_all(message, '[0-9a-zA-Z]', 'x'), # mask digits and letters as 'x'
         msg_length = str_length(message), # message length
         num_x = str_count(message, "[Xx]"), # number of masked characters
         pct_x = str_count(message, "[Xx]") / msg_length, # share of masked characters
         num_puncts = str_count(message, "[:punct:]"), # punctuation count
         pct_puncts = str_count(message, "[:punct:]") / msg_length, # punctuation share
         message = NULL) # drop the raw text
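A glimpse() of the result verifies the five engineered features and their types (output omitted):
glimpse(sms_model) # label plus msg_length, num_x, pct_x, num_puncts, pct_puncts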
msg_len_hist <- plot_ly(sms_model,
                        x = ~msg_length,
                        color = ~label,
                        type = "histogram")
msg_len_hist
msg_len_box <- plot_ly(sms_model,
y = ~msg_length,
color = ~label,
type = "box")
msg_len_box
msg_len_summary <-
sms_model %>%
group_by(label) %>%
summarise(len_median = median(msg_length),
len_mean = mean(msg_length),
len_q1 = quantile(msg_length, 0.25),
len_q3 = quantile(msg_length, 0.75)) %>%
  select(-label) %>%
  t() %>% # transpose so each stat becomes a row
  as_tibble() # columns V1 and V2 correspond to label 0 and label 1
msg_len_bar <-
plot_ly(x = c("Median", "Mean", "Q1", "Q3")) %>%
add_bars(y = msg_len_summary[[1]], name = "label 0",
text = round(msg_len_summary[[1]], 0), textposition = "auto") %>%
add_bars(y = msg_len_summary[[2]], name = "label 1",
text = round(msg_len_summary[[2]], 0), textposition = "auto")
msg_len_bar
msg_num_x_hist <- plot_ly(sms_model,
x = ~num_x,
color = ~label,
type = "histogram")
msg_num_x_hist
msg_num_x_box <- plot_ly(sms_model,
y = ~num_x,
color = ~label,
type = "box")
msg_num_x_box
msg_num_x_summary <-
sms_model %>%
group_by(label) %>%
summarise(num_x_median = median(num_x),
num_x_mean = mean(num_x),
num_x_q1 = quantile(num_x, 0.25),
num_x_q3 = quantile(num_x, 0.75)) %>%
select(-label) %>%
t() %>%
as_tibble()
msg_num_x_bar <-
plot_ly(x = c("Median", "Mean", "Q1", "Q3")) %>%
add_bars(y = msg_num_x_summary[[1]], name = "label 0",
text = round(msg_num_x_summary[[1]], 0), textposition = "auto") %>%
add_bars(y = msg_num_x_summary[[2]], name = "label 1",
text = round(msg_num_x_summary[[2]], 0), textposition = "auto")
msg_num_x_bar
msg_pct_x_hist <- plot_ly(sms_model,
x = ~pct_x,
color = ~label,
type = "histogram")
msg_pct_x_hist
msg_pct_x_box <- plot_ly(sms_model,
y = ~pct_x,
color = ~label,
type = "box")
msg_pct_x_box
msg_pct_x_summary <-
sms_model %>%
group_by(label) %>%
summarise(pct_x_median = median(pct_x),
pct_x_mean = mean(pct_x),
pct_x_q1 = quantile(pct_x, 0.25),
pct_x_q3 = quantile(pct_x, 0.75)) %>%
select(-label) %>%
t() %>%
as_tibble()
msg_pct_x_bar <-
plot_ly(x = c("Median", "Mean", "Q1", "Q3")) %>%
add_bars(y = msg_pct_x_summary[[1]], name = "label 0",
text = round(msg_pct_x_summary[[1]], 0), textposition = "auto") %>%
add_bars(y = msg_pct_x_summary[[2]], name = "label 1",
text = round(msg_pct_x_summary[[2]], 0), textposition = "auto")
msg_pct_x_bar
msg_num_puncts_hist <- plot_ly(sms_model,
x = ~num_puncts,
color = ~label,
type = "histogram")
msg_num_puncts_hist
msg_num_puncts_box <- plot_ly(sms_model,
y = ~num_puncts,
color = ~label,
type = "box")
msg_num_puncts_box
msg_num_puncts_summary <-
sms_model %>%
group_by(label) %>%
summarise(num_puncts_median = median(num_puncts),
num_puncts_mean = mean(num_puncts),
num_puncts_q1 = quantile(num_puncts, 0.25),
num_puncts_q3 = quantile(num_puncts, 0.75)) %>%
select(-label) %>%
t() %>%
as_tibble()
msg_num_puncts_bar <-
plot_ly(x = c("Median", "Mean", "Q1", "Q3")) %>%
add_bars(y = msg_num_puncts_summary[[1]], name = "label 0",
text = round(msg_num_puncts_summary[[1]], 0), textposition = "auto") %>%
add_bars(y = msg_num_puncts_summary[[2]], name = "label 1",
text = round(msg_num_puncts_summary[[2]], 0), textposition = "auto")
msg_num_puncts_bar
msg_pct_puncts_hist <- plot_ly(sms_model,
x = ~pct_puncts,
color = ~label,
type = "histogram")
msg_pct_puncts_hist
msg_pct_puncts_box <- plot_ly(sms_model,
y = ~pct_puncts,
color = ~label,
type = "box")
msg_pct_puncts_box
msg_pct_puncts_summary <-
sms_model %>%
group_by(label) %>%
summarise(pct_puncts_median = median(pct_puncts),
pct_puncts_mean = mean(pct_puncts),
pct_puncts_q1 = quantile(pct_puncts, 0.25),
pct_puncts_q3 = quantile(pct_puncts, 0.75)) %>%
select(-label) %>%
t() %>%
as_tibble()
msg_pct_puncts_bar <-
plot_ly(x = c("Median", "Mean", "Q1", "Q3")) %>%
add_bars(y = msg_pct_puncts_summary[[1]], name = "label 0",
text = round(msg_pct_puncts_summary[[1]], 0), textposition = "auto") %>%
add_bars(y = msg_pct_puncts_summary[[2]], name = "label 1",
text = round(msg_pct_puncts_summary[[2]], 0), textposition = "auto")
msg_pct_puncts_bar
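The five blocks above repeat the same histogram-plus-box-plot pattern for each feature. If you prefer to avoid the duplication, a small helper along these lines (plot_feature() is a hypothetical name, not used elsewhere in this post) produces the same pair of plots for any feature:
plot_feature <- function(data, feature) {
  # feature is the column name as a string; data[[feature]] extracts the vector
  list(hist = plot_ly(data, x = data[[feature]], color = ~label, type = "histogram"),
       box  = plot_ly(data, y = data[[feature]], color = ~label, type = "box"))
}
plot_feature(sms_model, "num_puncts")$hist # same plot as msg_num_puncts_hist above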
set.seed(1)
idx <- createDataPartition(sms_model$label, p = 0.7, list = FALSE) # stratified 70/30 split
train_set <- sms_model[idx, ]
test_set <- sms_model[-idx, ]
ctrl <- trainControl(method = "cv", number = 5, selectionFunction = "oneSE") # 5-fold CV with the one-standard-error rule
set.seed(1)
mod_rpart <- train(label ~ .,
data = train_set,
method = "rpart",
trControl = ctrl
)
mod_rpart
## CART
##
## 89806 samples
## 5 predictor
## 2 classes: '0', '1'
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 71845, 71844, 71846, 71845, 71844
## Resampling results across tuning parameters:
##
## cp Accuracy Kappa
## 0.002215888 0.9904573 0.9809146
## 0.002427455 0.9882303 0.9764605
## 0.974634212 0.6942211 0.3884645
##
## Accuracy was used to select the optimal model using the one SE rule.
## The final value used for the model was cp = 0.002215888.
pred_rpart <- predict(mod_rpart, test_set[-1])
confusionMatrix(pred_rpart, test_set$label)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 19179 350
## 1 65 18894
##
## Accuracy : 0.9892
## 95% CI : (0.9881, 0.9902)
## No Information Rate : 0.5
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9784
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.9966
## Specificity : 0.9818
## Pos Pred Value : 0.9821
## Neg Pred Value : 0.9966
## Prevalence : 0.5000
## Detection Rate : 0.4983
## Detection Prevalence : 0.5074
## Balanced Accuracy : 0.9892
##
## 'Positive' Class : 0
##
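Beyond the confusion matrix, it can help to look at the tree the CART model actually learned; a sketch assuming the rpart.plot package is installed (it is not loaded above):
library(rpart.plot) # assumed available; install.packages("rpart.plot") if needed
rpart.plot(mod_rpart$finalModel) # draw the pruned tree that caret selected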
ctrl <- trainControl(method = "cv", number = 5, selectionFunction = "oneSE")
set.seed(1)
mod_nb <- train(label ~ .,
data = train_set,
method = "nb",
trControl = ctrl
)
mod_nb
## Naive Bayes
##
## 89806 samples
## 5 predictor
## 2 classes: '0', '1'
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 71845, 71844, 71846, 71845, 71844
## Resampling results across tuning parameters:
##
## usekernel Accuracy Kappa
## FALSE 0.9814601 0.9629202
## TRUE 0.9916376 0.9832751
##
## Tuning parameter 'fL' was held constant at a value of 0
## Tuning
## parameter 'adjust' was held constant at a value of 1
## Accuracy was used to select the optimal model using the one SE rule.
## The final values used for the model were fL = 0, usekernel = TRUE
## and adjust = 1.
pred_nb <- predict(mod_nb, test_set[-1])
confusionMatrix(pred_nb, test_set$label)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 19159 199
## 1 85 19045
##
## Accuracy : 0.9926
## 95% CI : (0.9917, 0.9935)
## No Information Rate : 0.5
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9852
## Mcnemar's Test P-Value : 2.01e-11
##
## Sensitivity : 0.9956
## Specificity : 0.9897
## Pos Pred Value : 0.9897
## Neg Pred Value : 0.9956
## Prevalence : 0.5000
## Detection Rate : 0.4978
## Detection Prevalence : 0.5030
## Balanced Accuracy : 0.9926
##
## 'Positive' Class : 0
##
ctrl <- trainControl(method = "cv", number = 3, selectionFunction = "oneSE")
set.seed(1)
mod_svm <- train(label ~ .,
data = train_set,
method = "svmRadial",
trControl = ctrl
)
mod_svm
## Support Vector Machines with Radial Basis Function Kernel
##
## 89806 samples
## 5 predictor
## 2 classes: '0', '1'
##
## No pre-processing
## Resampling: Cross-Validated (3 fold)
## Summary of sample sizes: 59870, 59872, 59870
## Resampling results across tuning parameters:
##
## C Accuracy Kappa
## 0.25 0.9932855 0.9865711
## 0.50 0.9935639 0.9871278
## 1.00 0.9935973 0.9871946
##
## Tuning parameter 'sigma' was held constant at a value of 1.119024
## Accuracy was used to select the optimal model using the one SE rule.
## The final values used for the model were sigma = 1.119024 and C = 0.5.
pred_svm <- predict(mod_svm, test_set[-1])
confusionMatrix(pred_svm, test_set$label)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 19103 79
## 1 141 19165
##
## Accuracy : 0.9943
## 95% CI : (0.9935, 0.995)
## No Information Rate : 0.5
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9886
## Mcnemar's Test P-Value : 3.912e-05
##
## Sensitivity : 0.9927
## Specificity : 0.9959
## Pos Pred Value : 0.9959
## Neg Pred Value : 0.9927
## Prevalence : 0.5000
## Detection Rate : 0.4963
## Detection Prevalence : 0.4984
## Balanced Accuracy : 0.9943
##
## 'Positive' Class : 0
##
library(randomForest)
set.seed(1)
mod_rf <- randomForest(x = train_set[-1],
                       y = train_set$label,
                       ntree = 100,
                       importance = TRUE) # keep variable-importance scores
pred_rf <- predict(mod_rf, test_set[-1])
confusionMatrix(pred_rf, test_set$label)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 19119 98
## 1 125 19146
##
## Accuracy : 0.9942
## 95% CI : (0.9934, 0.9949)
## No Information Rate : 0.5
## P-Value [Acc > NIR] : < 2e-16
##
## Kappa : 0.9884
## Mcnemar's Test P-Value : 0.08167
##
## Sensitivity : 0.9935
## Specificity : 0.9949
## Pos Pred Value : 0.9949
## Neg Pred Value : 0.9935
## Prevalence : 0.5000
## Detection Rate : 0.4968
## Detection Prevalence : 0.4993
## Balanced Accuracy : 0.9942
##
## 'Positive' Class : 0
##
## pull the variable-importance scores, then build a data frame from them
(importance2 <- importance(mod_rf))
## 0 1 MeanDecreaseAccuracy MeanDecreaseGini
## msg_length 7.148187 17.274667 15.806924 13941.3334
## num_x 5.729274 9.186615 9.888173 2412.2439
## pct_x 9.712275 17.497482 17.705742 531.8948
## num_puncts 11.615470 13.837237 17.244250 19153.7126
## pct_puncts 10.796521 5.987960 11.248701 8441.3769
var_importance2 <-
data.frame(Variables = row.names(importance2),
Importance = round(importance2[, 'MeanDecreaseGini'], 2)) %>%
arrange(desc(Importance))
# visualize variable importance
ggplot(var_importance2,
aes(x = reorder(Variables, Importance), y = Importance, fill = Variables)) +
geom_bar(stat = 'identity') +
labs(x = 'Variables') +
coord_flip() +
theme_minimal() +
guides(fill = 'none')
set.seed(1)
mod_rf_new1 <- randomForest(x = train_set %>% dplyr::select(num_puncts),
y = train_set$label,
ntree = 100)
pred_rf_new1 <- predict(mod_rf_new1, test_set[-1])
confusionMatrix(pred_rf_new1, test_set$label)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 19116 347
## 1 128 18897
##
## Accuracy : 0.9877
## 95% CI : (0.9865, 0.9887)
## No Information Rate : 0.5
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9753
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.9933
## Specificity : 0.9820
## Pos Pred Value : 0.9822
## Neg Pred Value : 0.9933
## Prevalence : 0.5000
## Detection Rate : 0.4967
## Detection Prevalence : 0.5057
## Balanced Accuracy : 0.9877
##
## 'Positive' Class : 0
##
set.seed(1)
mod_rf_new2 <- randomForest(x = train_set %>%
dplyr::select(num_puncts, msg_length),
y = train_set$label,
ntree = 100)
pred_rf_new2 <- predict(mod_rf_new2, test_set[-1])
confusionMatrix(pred_rf_new2, test_set$label)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 19094 207
## 1 150 19037
##
## Accuracy : 0.9907
## 95% CI : (0.9897, 0.9917)
## No Information Rate : 0.5
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9814
## Mcnemar's Test P-Value : 0.003038
##
## Sensitivity : 0.9922
## Specificity : 0.9892
## Pos Pred Value : 0.9893
## Neg Pred Value : 0.9922
## Prevalence : 0.5000
## Detection Rate : 0.4961
## Detection Prevalence : 0.5015
## Balanced Accuracy : 0.9907
##
## 'Positive' Class : 0
##
set.seed(1)
mod_rf_new3 <- randomForest(x = train_set %>%
dplyr::select(num_puncts, msg_length, pct_puncts),
y = train_set$label,
ntree = 100)
pred_rf_new3 <- predict(mod_rf_new3, test_set[-1])
confusionMatrix(pred_rf_new3, test_set$label)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 19088 201
## 1 156 19043
##
## Accuracy : 0.9907
## 95% CI : (0.9897, 0.9917)
## No Information Rate : 0.5
## P-Value [Acc > NIR] : < 2e-16
##
## Kappa : 0.9814
## Mcnemar's Test P-Value : 0.01987
##
## Sensitivity : 0.9919
## Specificity : 0.9896
## Pos Pred Value : 0.9896
## Neg Pred Value : 0.9919
## Prevalence : 0.5000
## Detection Rate : 0.4959
## Detection Prevalence : 0.5012
## Balanced Accuracy : 0.9907
##
## 'Positive' Class : 0
##
set.seed(1)
mod_rf_new4 <- randomForest(x = train_set %>%
dplyr::select(num_puncts, pct_puncts),
y = train_set$label,
ntree = 100)
pred_rf_new4 <- predict(mod_rf_new4, test_set[-1])
confusionMatrix(pred_rf_new4, test_set$label)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 19113 239
## 1 131 19005
##
## Accuracy : 0.9904
## 95% CI : (0.9894, 0.9913)
## No Information Rate : 0.5
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9808
## Mcnemar's Test P-Value : 2.657e-08
##
## Sensitivity : 0.9932
## Specificity : 0.9876
## Pos Pred Value : 0.9876
## Neg Pred Value : 0.9932
## Prevalence : 0.5000
## Detection Rate : 0.4966
## Detection Prevalence : 0.5028
## Balanced Accuracy : 0.9904
##
## 'Positive' Class : 0
##
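To wrap up, the test-set accuracy of every model above can be collected into a single table; a sketch reusing the prediction objects already in the workspace:
acc <- function(pred) mean(pred == test_set$label) # simple test-set accuracy
model_compare <- tibble(
  model = c("rpart", "naive bayes", "svm radial", "rf (all 5 vars)",
            "rf (num_puncts)", "rf (+ msg_length)", "rf (+ pct_puncts)",
            "rf (num_puncts, pct_puncts)"),
  accuracy = c(acc(pred_rpart), acc(pred_nb), acc(pred_svm), acc(pred_rf),
               acc(pred_rf_new1), acc(pred_rf_new2), acc(pred_rf_new3),
               acc(pred_rf_new4)))
model_compare %>% arrange(desc(accuracy)) # rank the models by accuracy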