Jiaxiang Li 2019-03-07
In an earlier NLP toy example trained with Keras, one skill went unexercised: the word vectors there came pre-trained, and the focus was on the model. This example walks through how the text features themselves are constructed.
As it turns out, much of the text is neither learned nor recognized well, which demonstrates that xgboost on top of simple word features does not perform particularly well.
Reference: Blog
knitr::opts_chunk$set(warning = FALSE, message = FALSE)
# data wrangling
library(tidyverse)
library(readr)
# plotting
library(ggthemes)
theme_set(theme_minimal())
# text prep
library(text2vec)
# ml
library(caret)
library(xgboost)
# explanation
library(lime)
The data come from www.kaggle.com.
unzip("datasets/womens-ecommerce-clothing-reviews.zip", exdir = 'datasets')
Unzip the data.
clothing_reviews_raw <- read_csv("datasets/Womens Clothing E-Commerce Reviews.csv")
clothing_reviews_raw %>% head
## # A tibble: 6 x 11
## X1 `Clothing ID` Age Title `Review Text` Rating `Recommended IN~
## <dbl> <dbl> <dbl> <chr> <chr> <dbl> <dbl>
## 1 0 767 33 <NA> Absolutely w~ 4 1
## 2 1 1080 34 <NA> "Love this d~ 5 1
## 3 2 1077 60 Some~ I had such h~ 3 0
## 4 3 1049 50 My f~ I love, love~ 5 1
## 5 4 847 47 Flat~ This shirt i~ 5 1
## 6 5 1080 49 Not ~ I love tracy~ 2 0
## # ... with 4 more variables: `Positive Feedback Count` <dbl>, `Division
## # Name` <chr>, `Department Name` <chr>, `Class Name` <chr>
clothing_reviews <-
  clothing_reviews_raw %>%
  mutate(Liked = as.factor(ifelse(Rating == 5, 1, 0)),  # 5-star reviews count as "liked"
         text = paste(Title, `Review Text`),            # merge title and review body
         text = gsub("NA", "", text))                   # strip the literal "NA" left by missing titles
Merge the review title and body into a single text field, and create the binary target Liked (1 for 5-star reviews).
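A quick illustration (not in the original post) of why the gsub() step is needed: paste() coerces a missing title to the literal string "NA", so merged texts would otherwise start with "NA ". This blanket substitution would also eat an "NA" inside a word, which is acceptable for a quick experiment.
paste(NA, "Absolutely wonderful - silky and sexy")          # -> "NA Absolutely wonderful - silky and sexy"
gsub("NA", "", paste(NA, "Absolutely wonderful - silky and sexy"))  # -> " Absolutely wonderful - silky and sexy"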
glimpse(clothing_reviews)
## Observations: 23,486
## Variables: 13
## $ X1 <dbl> 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11...
## $ `Clothing ID` <dbl> 767, 1080, 1077, 1049, 847, 1080, 85...
## $ Age <dbl> 33, 34, 60, 50, 47, 49, 39, 39, 24, ...
## $ Title <chr> NA, NA, "Some major design flaws", "...
## $ `Review Text` <chr> "Absolutely wonderful - silky and se...
## $ Rating <dbl> 4, 5, 3, 5, 5, 2, 5, 4, 5, 5, 3, 5, ...
## $ `Recommended IND` <dbl> 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, ...
## $ `Positive Feedback Count` <dbl> 0, 4, 0, 0, 6, 4, 1, 4, 0, 0, 14, 2,...
## $ `Division Name` <chr> "Initmates", "General", "General", "...
## $ `Department Name` <chr> "Intimate", "Dresses", "Dresses", "B...
## $ `Class Name` <chr> "Intimates", "Dresses", "Dresses", "...
## $ Liked <fct> 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, ...
## $ text <chr> " Absolutely wonderful - silky and s...
clothing_reviews %>%
  ggplot(aes(x = Liked, fill = Liked)) +
  geom_bar(alpha = 0.8) +
  # scale_fill_tableau(palette = "tableau20") +
  guides(fill = FALSE)
Compare the two classes: the split is reasonably balanced, so there is no class-imbalance problem to deal with. A quick numeric check is sketched below.
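A sketch (not in the original post) putting numbers on the bar chart:
clothing_reviews %>%
  count(Liked) %>%
  mutate(prop = n / sum(n))  # roughly a 44/56 split, consistent with the test-set prevalence reported later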
TODO: following a ranking-data analysis, the full 1-5 ratings could be modeled as a multi-class problem instead.
set.seed(42)
idx <- createDataPartition(clothing_reviews$Liked,
                           p = 0.8,
                           list = FALSE,
                           times = 1)
clothing_reviews_train <- clothing_reviews[ idx,]
clothing_reviews_test <- clothing_reviews[-idx,]
This is why caret was loaded: createDataPartition() draws a stratified 80/20 split on Liked. A sanity check on the stratification is sketched below.
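A sketch (not in the original post) verifying the stratification:
# createDataPartition() samples within each level of Liked,
# so train and test keep nearly identical class ratios
prop.table(table(clothing_reviews_train$Liked))
prop.table(table(clothing_reviews_test$Liked))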
get_matrix <- function(text) {
  it <- itoken(text, progressbar = FALSE)          # tokenize the raw text
  create_dtm(it, vectorizer = hash_vectorizer())   # build a hashed document-term matrix
}
Define the function that turns raw text into model features. Strictly speaking this is not a word embedding: hash_vectorizer() builds a sparse, hashed document-term matrix. A toy example is sketched below.
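A minimal sketch (not in the original post) of what get_matrix() produces on a two-document toy corpus; the 2^18 = 262,144 columns are hash_vectorizer()'s default number of hash buckets, which matches the column count in the str() output further down.
toy <- c("love this dress", "major design flaws")
it_toy <- itoken(toy, progressbar = FALSE)
dtm_toy <- create_dtm(it_toy, vectorizer = hash_vectorizer())
dim(dtm_toy)  # 2 documents x 262144 hash buckets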
clothing_reviews_train %>% dim
## [1] 18789 13
clothing_reviews_test %>% dim
## [1] 4697 13
The training set has just under 20,000 samples.
dtm_train <- get_matrix(clothing_reviews_train$text)
str(dtm_train)
## Formal class 'dgCMatrix' [package "Matrix"] with 6 slots
## ..@ i : int [1:889012] 304 764 786 788 793 794 1228 2799 2819 3041 ...
## ..@ p : int [1:262145] 0 0 0 0 0 0 0 0 0 0 ...
## ..@ Dim : int [1:2] 18789 262144
## ..@ Dimnames:List of 2
## .. ..$ : chr [1:18789] "1" "2" "3" "4" ...
## .. ..$ : NULL
## ..@ x : num [1:889012] 1 1 2 1 2 1 1 1 1 1 ...
## ..@ factors : list()
dtm_test <- get_matrix(clothing_reviews_test$text)
str(dtm_test)
## Formal class 'dgCMatrix' [package "Matrix"] with 6 slots
## ..@ i : int [1:222314] 2793 400 477 622 2818 2997 3000 4500 3524 2496 ...
## ..@ p : int [1:262145] 0 0 0 0 0 0 0 0 0 0 ...
## ..@ Dim : int [1:2] 4697 262144
## ..@ Dimnames:List of 2
## .. ..$ : chr [1:4697] "1" "2" "3" "4" ...
## .. ..$ : NULL
## ..@ x : num [1:222314] 1 1 1 1 1 1 1 1 1 1 ...
## ..@ factors : list()
xgb_model <- xgb.train(list(max_depth = 7,
                            eta = 0.1,
                            objective = "binary:logistic",
                            eval_metric = "error",
                            nthread = 1),
                       xgb.DMatrix(dtm_train,
                                   label = clothing_reviews_train$Liked == "1"),
                       nrounds = 50)
Fit an xgboost model: binary logistic objective, depth-7 trees, 50 boosting rounds.
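One reason a model-agnostic explainer is needed here (a sketch, not in the original post): with a hashed DTM the columns carry no names, so xgboost's own importance table only yields opaque hash-bucket indices.
imp <- xgb.importance(model = xgb_model)
head(imp)  # "Feature" is just a hash-bucket index, not a word -- hence LIME below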
pred <- predict(xgb_model, dtm_test)
# note: the truth (Liked) is passed as `data` and the prediction as `reference`,
# so the printed "Prediction" rows are actually the true classes; accuracy is unaffected
confusionMatrix(clothing_reviews_test$Liked,
                as.factor(round(pred, digits = 0)))
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 1370 701
## 1 421 2205
##
## Accuracy : 0.7611
## 95% CI : (0.7487, 0.7733)
## No Information Rate : 0.6187
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.5085
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.7649
## Specificity : 0.7588
## Pos Pred Value : 0.6615
## Neg Pred Value : 0.8397
## Prevalence : 0.3813
## Detection Rate : 0.2917
## Detection Prevalence : 0.4409
## Balanced Accuracy : 0.7619
##
## 'Positive' Class : 0
##
confusionMatrix() reports many metrics at once, which is very convenient.
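The headline accuracy can be reproduced by hand from the raw probabilities (a quick sketch, not in the original post):
# fraction of test reviews where the thresholded probability matches Liked
mean(round(pred) == (clothing_reviews_test$Liked == "1"))  # should match the 0.7611 above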
TODO: repeat this on Chinese text.
explainer <- lime(clothing_reviews_train$text,
                  xgb_model,
                  preprocess = get_matrix)  # lime must know how to turn raw text into the matrix the model expects
interactive_text_explanations(explainer)
This launches a Shiny app; see the reference blog post for details.
explanations <- lime::explain(clothing_reviews_test$text[1:4], explainer,
                              n_labels = 1, n_features = 5)
plot_text_explanations(explanations)
This shows which words push the prediction in the positive direction and which push it in the negative direction.
plot_features(explanations)
This shows how much each feature contributes within one specific case. The underlying data frame is sketched below.
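Under the hood, explanations is a plain data frame (a sketch; column names as documented in the lime package), and the signed feature_weight column is what both plots visualize:
explanations %>%
  select(case, label, feature, feature_weight) %>%  # positive weights support the predicted label
  head()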
This also explains why some models report results as a predicted class in one column followed by the probability of that class in another: it is simply another common output paradigm, sketched below.
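A hypothetical sketch (pred_table is not from the original post) reproducing that paradigm from our own predictions:
pred_table <- tibble(
  prob_liked = pred,                               # P(Liked = 1)
  class      = ifelse(pred > 0.5, "1", "0"),       # hard class in one column
  class_prob = ifelse(pred > 0.5, pred, 1 - pred)  # probability of the returned class
)
head(pred_table)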