Error: pandoc document conversion failed with error 5
Execution halted
2019-01-22 11:25:43.917382: I T:\src\github\tensorflow\tensorflow\core\platform\cpu_feature_guard.cc:141] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX2
Rendering raised the messages above, so the chunks in this section are not evaluated and their live output cannot be shown.
knitr::opts_chunk$set(eval = FALSE)
See Chollet and Allaire (2018, 74–80).
You’ll be working with the Reuters dataset, a set of short newswires and their topics, published by Reuters in 1986. It’s a simple, widely used toy dataset for text classification. There are 46 different topics; some topics are more represented than others, but each topic has at least 10 examples in the training set.
There are 11,228 newswires in total across the 46 topics. Each newswire is encoded as a sequence of integers, with every word mapped to its index in a dictionary.
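The cached copy loaded below was presumably produced with keras's built-in loader, along the lines of this sketch (num_words = 10000 matches the 10,000-column encoding used later):
# hypothetical reconstruction of how datasets/reuters.rds was created;
# dataset_reuters() downloads the data on first use
reuters <- keras::dataset_reuters(num_words = 10000)
readr::write_rds(reuters, here::here("datasets/reuters.rds"))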
library(keras)
library(readr)
library(here)
reuters <- read_rds(here("datasets/reuters.rds"))
c(c(train_data, train_labels), c(test_data, test_labels)) %<-% reuters
%<-% is the multi-assignment (destructuring) operator from the zeallot package, which keras re-exports; it unpacks the nested list returned above into the four objects train_data, train_labels, test_data, and test_labels.
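A minimal illustration of the destructuring assignment with toy values:
library(zeallot)
c(c(a, b), d) %<-% list(list(1, 2), 3)
a  # 1
b  # 2
d  # 3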
train_len <- length(train_data)
test_len <- length(test_data)
train_len;test_len
total_len <- train_len+test_len
library(magrittr)
(c(train_len, test_len) / total_len) %>% multiply_by(10) %>% round()
The training and test sets are split roughly 8:2.
train_data[1]
train_data[1][[1]] %>% length
Each integer stands for a word; the first newswire is 87 words long.
word_index <- read_rds(here("datasets/dataset_reuters_word_index.rds"))
word_index
word_index is the dictionary that maps each word to the integer code used in train_data.
word_index[1:3]
Note that the indices are offset by 3 because 0, 1, and 2 are reserved indices for “padding”, “start of sequence”, and “unknown”.
The first three indices therefore carry no word information, so the decoder below skips them by subtracting the offset of 3.
# str(word_index)
# flip the dictionary so integer codes map back to words
reverse_word_index <- names(word_index)
reverse_word_index %>% head
names(reverse_word_index) <- word_index
decoded_newswire <- sapply(train_data[[1]], function(index) {
  # undo the offset of 3 reserved indices; unmatched codes decode to "?"
  word <- if (index >= 3) reverse_word_index[[as.character(index - 3)]]
  if (!is.null(word)) word else "?"
})
decoded_newswire
reverse_word_index %>% head
This is the decoded text of the first training sample.
library(dplyr)
train_labels[[1]]
train_labels %>% n_distinct()
There are 46 distinct topics in total.
Next, vectorize the inputs x.
vectorize_sequences <- function(sequences, dimension = 10000) {
  results <- matrix(0, nrow = length(sequences), ncol = dimension)
  # multi-hot encoding: mark which of the `dimension` words occur in each sequence
  for (i in 1:length(sequences))
    results[i, sequences[[i]]] <- 1
  results
}
x_train <- vectorize_sequences(train_data)
x_train %>% write_rds(here("datasets/reuters_x_train.rds"))
# the saved matrix is >= 600 MB, hence the git-ignore calls below
x_test <- vectorize_sequences(test_data)
x_test %>% write_rds(here("datasets/reuters_x_test.rds"))
dim_x_train <- dim(x_train)
dim_x_train %>% write_rds(here("datasets/reuters_x_train_dim.rds"))
library(usethis)
use_git_ignore(here("datasets/reuters_x_train.rds"))
use_git_ignore(here("datasets/reuters_x_test.rds"))
dim_x_train <- read_rds(here("datasets/reuters_x_train_dim.rds"))
dim_x_train
The encoding simply records whether each of the 10,000 candidate words occurs in a newswire, ignoring frequency. Since keras handles a matrix of this size, the same approach should also work on the 融360 dataset.
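A toy check of the encoding, shrinking dimension to 8 for readability; note that the repeated 3 collapses to a single 1:
toy <- list(c(1, 3, 3, 7), c(2, 8))
vectorize_sequences(toy, dimension = 8)
# row 1 has 1s in columns 1, 3, and 7; row 2 in columns 2 and 8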
one_hot_train_labels <- to_categorical(train_labels)
one_hot_test_labels <- to_categorical(test_labels)
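to_categorical one-hot encodes the integer labels, mapping class i to column i + 1; a quick sketch of its behavior:
to_categorical(c(0, 2, 1), num_classes = 3)
# [1,] 1 0 0
# [2,] 0 0 1
# [3,] 0 1 0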
This topic-classification problem has 46 output classes, so the intermediate hidden layers must not be too small, or they will discard too much relevant information. Each layer can only work with what the previous layer passes on: once information is dropped, later layers can never recover it. The hidden layers are therefore widened to 64 units (Chollet and Allaire 2018).
model <- keras_model_sequential() %>%
layer_dense(units = 64, activation = "relu", input_shape = c(10000)) %>%
layer_dense(units = 64, activation = "relu") %>%
layer_dense(units = 46, activation = "softmax")
model %>% compile(
optimizer = "rmsprop",
loss = "categorical_crossentropy",
metrics = c("accuracy")
)
Set aside the first 1,000 training samples as a validation set.
val_indices <- 1:1000
x_val <- x_train[val_indices,]
partial_x_train <- x_train[-val_indices,]
y_val <- one_hot_train_labels[val_indices,]
partial_y_train <- one_hot_train_labels[-val_indices,]
history <- model %>% fit(
partial_x_train,
partial_y_train,
epochs = 20,
batch_size = 512,
validation_data = list(x_val, y_val)
)
Supplying validation_data here makes fit() report loss and accuracy on the held-out set after every epoch.
history2 <- read_rds(here("datasets/reuters_model.rds"))
summary(history2)
library(ggplot2)
plot(history2)
# ggsave(here("figure/reuters_pref.png"))
The validation curves show overfitting setting in around epoch 8.
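The best epoch can also be read straight off the history object (a sketch, assuming the per-epoch metrics are stored under history2$metrics, as keras records them):
which.min(history2$metrics$val_loss)  # epoch with the lowest validation loss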
model <-
keras_model_sequential() %>%
layer_dense(units = 64, activation = "relu", input_shape = c(10000)) %>%
layer_dense(units = 64, activation = "relu") %>%
layer_dense(units = 46, activation = "softmax")
model %>%
compile(
optimizer = "rmsprop",
loss = "categorical_crossentropy",
metrics = c("accuracy")
)
history9 <- model %>% fit(
partial_x_train,
partial_y_train,
epochs = 9,
batch_size = 512,
validation_data = list(x_val, y_val)
)
model %>% save_model_hdf5(here("datasets/reuters_model_epoch9.h5"))
history9 %>% write_rds(here("datasets/reuters_model_epoch9_fit.rds"))
results <- model %>% evaluate(x_test, one_hot_test_labels)
results %>% write_rds(here("datasets/reuters_model_epoch9_results.rds"))
results <- read_rds(here("datasets/reuters_model_epoch9_results.rds"))
results
set.seed(123)
test_labels_copy <- test_labels
test_labels_copy <- sample(test_labels_copy)
length(which(test_labels == test_labels_copy)) / length(test_labels)
Even a random baseline reaches about 18% accuracy (the topics are imbalanced), so the test accuracy reported above is well beyond chance and the model performs well.
model <- load_model_hdf5(here("datasets/reuters_model_epoch9.h5"))
x_test <- read_rds(here("datasets/reuters_x_test.rds"))
predictions <- model %>% predict(x_test)
dim(predictions)
sum(predictions[1,])
Each test sample gets 46 predicted values, one probability per topic; each softmax row sums to 1.
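A quick sanity check that every row is a proper probability distribution (sums to 1 up to floating-point tolerance):
all(abs(rowSums(predictions) - 1) < 1e-6)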
which.max(predictions[1,])
Take the topic with the largest predicted probability as the predicted class.
test_len
library(purrr)
# pre-allocate, then take the argmax topic for each test sample
pred <- rep(NA, test_len)
for (i in 1:test_len) {
  pred[i] <- which.max(predictions[i, ])
}
The same computation can be written with a map function; a sketch follows the preview below.
pred %>%
head
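A purrr version of the same argmax loop (a sketch; array_branch splits the prediction matrix into a list of rows):
pred_map <- predictions %>%
  array_branch(margin = 1) %>%
  map_int(which.max)
identical(pred_map, as.integer(pred))  # should be TRUE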
Chollet, François, and J.J. Allaire. 2018. Deep Learning with R. Manning Publications.