Error: pandoc document conversion failed with error 5
Execution halted
2019-01-22 11:25:43.917382: I T:\src\github\tensorflow\tensorflow\core\platform\cpu_feature_guard.cc:141] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX2

Knitting produced the errors above, so the chunks in this section are not evaluated and no output is shown.

knitr::opts_chunk$set(eval = FALSE)

Reference: Chollet and Allaire (2018, 74–80)

You’ll be working with the Reuters dataset, a set of short newswires and their topics, published by Reuters in 1986. It’s a simple, widely used toy dataset for text classification. There are 46 different topics; some topics are more represented than others, but each topic has at least 10 examples in the training set.

There are 11,228 newswires in total, spread over 46 topics; every word in each newswire has been dictionary-encoded, so each newswire is stored as a sequence of integer indices.

library(keras)
library(readr)
library(here)
reuters <- read_rds(here("datasets/reuters.rds"))
c(c(train_data, train_labels), c(test_data, test_labels)) %<-% reuters

The %<-% multi-assignment operator comes from the zeallot package (re-exported by keras); it destructures the nested list into the four variables in a single step.
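A minimal sketch of what the destructuring does, with hypothetical toy values:

# %<-% unpacks a nested list into named variables in one assignment,
# equivalent to a <- 1; b <- 2; d <- 3; e <- 4
c(c(a, b), c(d, e)) %<-% list(list(1, 2), list(3, 4))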

train_len <- length(train_data)
test_len <- length(test_data)
train_len;test_len
total_len <- train_len+test_len
library(magrittr)
# express the split as parts out of ten
(c(train_len, test_len)/total_len) %>% multiply_by(10) %>% round()

The train/test split is roughly 8:2.

train_data[1]
train_data[[1]] %>% length()

The integers stand for words; the first newswire is 87 words long.

word_index <- read_rds(here("datasets/dataset_reuters_word_index.rds"))

word_index is the word dictionary that maps each integer appearing in train_data back to its word.

word_index[1:3]

Note that the indices are offset by 3 because 0, 1, and 2 are reserved indices for “padding”, “start of sequence”, and “unknown”.

The first three indices therefore carry no word information and are skipped during decoding.

# str(word_index)
# build the reverse dictionary: integer index -> word
reverse_word_index <- names(word_index)
reverse_word_index %>% head
names(reverse_word_index) <- word_index
# decode the first newswire, shifting indices by the 3 reserved slots
decoded_newswire <- sapply(train_data[[1]], function(index) {
    word <- if (index >= 3) reverse_word_index[[as.character(index - 3)]]
    if (!is.null(word)) word else "?"
})
decoded_newswire
reverse_word_index %>% head

This is the decoded text of the first sample.

library(dplyr)
train_labels[[1]]
train_labels %>% n_distinct()

There are 46 topics in total.

Next, preprocess x.

vectorize_sequences <- function(sequences, dimension = 10000) {
    results <- matrix(0, nrow = length(sequences), ncol = dimension)
    # one-hot: set results[i, j] to 1 for every word index j in sequence i
    for (i in seq_along(sequences))
        results[i, sequences[[i]]] <- 1
    results
}
x_train <- vectorize_sequences(train_data) 
x_train %>% write_rds(here("datasets/reuters_x_train.rds"))
# >= 600 MB
x_test <- vectorize_sequences(test_data)
x_test %>% write_rds(here("datasets/reuters_x_test.rds"))
dim_x_train <- dim(x_train)
dim_x_train %>% write_rds(here("datasets/reuters_x_train_dim.rds"))
library(usethis)
use_git_ignore(here("datasets/reuters_x_train.rds"))
use_git_ignore(here("datasets/reuters_x_test.rds"))
dim_x_train <- read_rds(here("datasets/reuters_x_train_dim.rds"))
dim_x_train

This simply flags whether each of the 10,000 words appears in a newswire, ignoring frequency. Since keras can handle a dataset of this size, the same approach should also work for the Rong360 dataset.
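A quick toy check of what vectorize_sequences produces, with hypothetical input:

# one "document" containing word indices 1, 3, 3; frequency is ignored
vectorize_sequences(list(c(1, 3, 3)), dimension = 5)
#      [,1] [,2] [,3] [,4] [,5]
# [1,]    1    0    1    0    0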

one_hot_train_labels <- to_categorical(train_labels)
one_hot_test_labels <- to_categorical(test_labels)
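to_categorical one-hot encodes the integer class labels into a binary matrix; a minimal sketch with toy labels:

# labels 0, 2, 1 from 3 classes become one-hot rows
to_categorical(c(0, 2, 1), num_classes = 3)
#      [,1] [,2] [,3]
# [1,]    1    0    0
# [2,]    0    0    1
# [3,]    0    1    0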

This topic-classification problem has 46 output classes, so the intermediate hidden layers should not be too small, or too much important information will be lost. Each layer only sees the output of the previous layer: if one layer drops part of the signal, no later layer can ever recover it. Hence the recommendation to use sufficiently large hidden layers (Chollet and Allaire 2018).
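To illustrate that bottleneck effect, the book shrinks the middle layer; a sketch of that variant (not run here), assuming the same data and compile settings as the main model below:

# hypothetical bottleneck model: a 4-unit middle layer is far too narrow
# for 46 classes, so validation accuracy drops noticeably
bottleneck_model <- keras_model_sequential() %>%
    layer_dense(units = 64, activation = "relu", input_shape = c(10000)) %>%
    layer_dense(units = 4, activation = "relu") %>%
    layer_dense(units = 46, activation = "softmax")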

model <- keras_model_sequential() %>%
    layer_dense(units = 64, activation = "relu", input_shape = c(10000)) %>%
    layer_dense(units = 64, activation = "relu") %>%
    # 46-way softmax output: one probability per topic
    layer_dense(units = 46, activation = "softmax")
model %>% compile(
    optimizer = "rmsprop",
    loss = "categorical_crossentropy",
    metrics = c("accuracy")
)

Set aside 1,000 samples as the validation set.

val_indices <- 1:1000
x_val <- x_train[val_indices, ]
partial_x_train <- x_train[-val_indices, ]
y_val <- one_hot_train_labels[val_indices, ]
partial_y_train <- one_hot_train_labels[-val_indices, ]
history <- model %>% fit(
    partial_x_train,
    partial_y_train,
    epochs = 20,
    batch_size = 512,
    validation_data = list(x_val, y_val)
)

Note that validation_data is passed to fit() here, so validation loss and accuracy are tracked at each epoch.

history2 <- read_rds(here("datasets/reuters_model.rds"))
summary(history2)
library(ggplot2)
plot(history2)
# ggsave(here("figure/reuters_pref.png"))

Overfitting sets in at around the eighth epoch, so the model is retrained from scratch below for 9 epochs.

model <- 
    keras_model_sequential() %>%
    layer_dense(units = 64, activation = "relu", input_shape = c(10000)) %>%
    layer_dense(units = 64, activation = "relu") %>%
    layer_dense(units = 46, activation = "softmax")
model %>% 
    compile(
        optimizer = "rmsprop",
        loss = "categorical_crossentropy",
        metrics = c("accuracy")
    )
history9 <- model %>% fit(
    partial_x_train,
    partial_y_train,
    epochs = 9,
    batch_size = 512,
    validation_data = list(x_val, y_val)
)
model %>% save_model_hdf5(here("datasets/reuters_model_epoch9.h5"))
history9 %>% write_rds(here("datasets/reuters_model_epoch9_fit.rds"))
results <- model %>% evaluate(x_test, one_hot_test_labels)
results %>% write_rds(here("datasets/reuters_model_epoch9_results.rds"))
results <- read_rds(here("datasets/reuters_model_epoch9_results.rds"))
results
set.seed(123)
test_labels_copy <- test_labels
test_labels_copy <- sample(test_labels_copy)
length(which(test_labels == test_labels_copy)) / length(test_labels)

A random baseline already reaches about 18% accuracy (the classes are imbalanced), so the model's much higher test accuracy shows it has learned something real.

model <- load_model_hdf5(here("datasets/reuters_model_epoch9.h5"))
x_test <- read_rds(here("datasets/reuters_x_test.rds"))
predictions <- model %>% predict(x_test)
dim(predictions)
sum(predictions[1,])

Each test sample gets 46 predicted values, one probability per topic, and each row sums to 1.

which.max(predictions[1,])

Take the topic with the highest probability as the prediction.

test_len
library(purrr)
pred <- rep(NA_integer_, test_len)
for (i in 1:test_len) {
    # 1-based column index of the most probable topic for sample i
    pred[i] <- which.max(predictions[i, ])
}

Alternatively, the same thing can be done with a map function; see the sketch after the check below.

pred %>% 
    head
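A sketch of the map-based version, using purrr::map_int over the row indices (it should match the loop's result):

# map_int returns an integer vector of argmax positions per row
pred_map <- map_int(seq_len(test_len), ~ which.max(predictions[.x, ]))
identical(pred_map, pred)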

Chollet, François, and J.J. Allaire. 2018. Deep Learning with R. Manning Publications.