Reference: Chollet and Allaire (2018, 81–87)

This section works through the Boston Housing Price dataset, which has two notable characteristics:

  1. The dataset is small, only around 500 samples.
  2. The features are on very different scales, so they need to be standardized.
library(readr)
library(here)
## here() starts at D:/work/learn_nn
library(keras)
# Download the dataset once and cache it locally
dataset <- dataset_boston_housing()
dataset %>% write_rds(here("datasets/dataset_boston_housing.rds"))
dataset <- read_rds(here("datasets/dataset_boston_housing.rds"))
c(c(train_data, train_targets), c(test_data, test_targets)) %<-% dataset
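The `%<-%` multi-assignment operator comes from the zeallot package (re-exported by keras) and unpacks the nested list returned by `dataset_boston_housing()` in one step. For readers unfamiliar with it, an equivalent plain-R version (just a sketch for illustration) would be:

# Equivalent assignments without the zeallot %<-% operator
train_data    <- dataset$train$x
train_targets <- dataset$train$y
test_data     <- dataset$test$x
test_targets  <- dataset$test$y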
str(train_data)
dim(train_data)
str(test_data)
dim(test_data)

There are 13 feature columns in total.

str(train_targets)
range(train_targets)
str(test_targets)
range(test_targets)

The targets are house prices in units of \(10^3\) dollars.

Note that standardization must use the mean and sd computed on the training set; the same training-set statistics are then applied to the test set.

mean <- apply(train_data, 2, mean)
std <- apply(train_data, 2, sd)
# apply() is the usual tool for column-wise operations on a matrix
mean %>% str
train_data <- scale(train_data, center = mean, scale = std)
test_data <- scale(test_data, center = mean, scale = std)  # uses training-set statistics
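As a quick sanity check (not from the book), every scaled training column should now have mean roughly 0 and sd roughly 1, while the test columns will only be approximately centered, because they were scaled with the training-set statistics:

round(apply(train_data, 2, mean), 3)  # ~0 for every column
round(apply(train_data, 2, sd), 3)    # ~1 for every column
round(apply(test_data, 2, mean), 3)   # close to, but not exactly, 0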
build_model <- function() {
    model <- keras_model_sequential() %>% 
        layer_dense(units = 64
                    ,activation = "relu"
                    ,input_shape = dim(train_data)[[2]]
                    ) %>% 
        layer_dense(units = 64
                    ,activation = "relu"
                    ) %>% 
        # no activation on the last layer: a linear output for regression
        layer_dense(units = 1)
    
    model %>% 
        compile(
            optimizer = "rmsprop"
            ,loss = "mse"          # mean squared error, standard for regression
            ,metrics = c("mae")    # mean absolute error, easier to interpret
        )
}
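A quick usage sketch (not from the book), just to confirm the architecture:

model <- build_model()
summary(model)  # two hidden layers of 64 units plus one linear output unit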

Wrapping the model construction in a function prepares for the cross-validation (CV) loop below, which needs a freshly initialized model for every fold.

Below, the K-fold validation loop is written from scratch.

When you’re working with little data, K-fold validation can help reliably evaluate your model.

When the amount of data is small, K-fold validation makes more efficient use of it.

k <- 4
indices <- sample(1:nrow(train_data))              # random permutation of row indices
folds <- cut(indices, breaks = k, labels = FALSE)  # assign each row to one of k folds
num_epochs <- 100
all_scores <- c()
for (i in 1:k) {
    cat("processing fold #", i, "\n")
    # cat() is a simple way to log progress inside a loop
    
    val_indices <- which(folds == i, arr.ind = TRUE)
    val_data <- train_data[val_indices,]
    val_targets <- train_targets[val_indices]
    partial_train_data <- train_data[-val_indices,]
    partial_train_targets <- train_targets[-val_indices]
    
    model <- build_model()
    model %>% 
        fit(
            partial_train_data
            ,partial_train_targets
            ,epochs = num_epochs
            ,batch_size = 1
            ,verbose = 0
        )
    results <- model %>% evaluate(val_data, val_targets, verbose = 0)
    # depending on the keras version, the metric may be named "mae"
    # rather than "mean_absolute_error"
    all_scores <- c(all_scores, results$mean_absolute_error)
    # appending with c() is the base-R way to grow a vector
}
all_scores %>% 
    write_rds(here("datasets/bhp_all_scores.rds"))

Running this prints the following message from TensorFlow. Despite appearances, it is not an error but an informational note (log level `I`) that the prebuilt binary was not compiled with AVX2 support; it does not affect the results.

2019-01-22 17:13:09.289000: I T:\src\github\tensorflow\tensorflow\core\platform\cpu_feature_guard.cc:141] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX2
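If the message is distracting, TensorFlow's standard `TF_CPP_MIN_LOG_LEVEL` environment variable can hide informational logs. This is a general TensorFlow setting rather than anything from the book, and it has to be set before TensorFlow is initialized:

# 0 = all messages, 1 = hide INFO, 2 = also hide WARNING, 3 = also hide ERROR
Sys.setenv(TF_CPP_MIN_LOG_LEVEL = "1")
library(keras)  # load keras/TensorFlow only after setting the variable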
all_scores <- read_rds(here("datasets/bhp_all_scores.rds"))
all_scores
## [1] 2.557272 3.150928 2.372173 2.811112
all_scores %>% mean
## [1] 2.722871

The mean absolute error is about 2.72, i.e. roughly $2,700, since the targets are in units of $1,000.
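A quick back-of-the-envelope check (not in the book) of how large that error is relative to the target scale:

mean(all_scores) * 1000      # average validation MAE expressed in dollars, about 2,700
range(train_targets) * 1000  # span of house prices in dollars, for comparison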

Relative to house prices that mostly lie between $10,000 and $50,000, this is still fairly large, so we try training for 500 epochs and watch how the validation error evolves.

# Takes a long time to run (500 epochs with batch_size = 1)
num_epochs <- 500
all_mae_histories <- NULL
for (i in 1:k) {
    cat("processing fold #", i, "\n")
    val_indices <- which(folds == i, arr.ind = TRUE)
    val_data <- train_data[val_indices,]
    val_targets <- train_targets[val_indices]
    partial_train_data <- train_data[-val_indices,]
    partial_train_targets <- train_targets[-val_indices]
    model <- build_model()
    history <- model %>% fit(
        partial_train_data, partial_train_targets,
        validation_data = list(val_data, val_targets),
        epochs = num_epochs, batch_size = 1, verbose = 0
    )
    # depending on the keras version, this metric may be named val_mae instead
    mae_history <- history$metrics$val_mean_absolute_error
    all_mae_histories <- rbind(all_mae_histories, mae_history)
}
average_mae_history <- data.frame(
    epoch = seq_len(ncol(all_mae_histories)),
    validation_mae = apply(all_mae_histories, 2, mean)
)
average_mae_history %>% 
    write_rds(here("datasets/bhp_average_mae_history.rds"))
average_mae_history <- read_rds(here("datasets/bhp_average_mae_history.rds"))
library(ggplot2)
ggplot(average_mae_history, aes(x = epoch, y = validation_mae)) + 
    geom_line() +
    geom_smooth()
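One optional refinement (a sketch, not necessarily identical to the book's figure): the first few epochs have very large validation MAE and compress the rest of the curve, so dropping them makes the trend easier to read.

ggplot(average_mae_history[average_mae_history$epoch > 10, ],
       aes(x = epoch, y = validation_mae)) + 
    geom_line() +
    geom_smooth()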

From the plot we can see that validation MAE stops improving and the model starts to overfit before roughly epoch 100, so we fix the number of epochs at less than 100.
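With the epoch budget settled, the natural last step is to retrain a fresh model on all of the training data and evaluate it once on the held-out test set. A minimal sketch, assuming we settle on 80 epochs and a batch size of 16 (both values are assumptions to be read off the curve above):

model <- build_model()
model %>% fit(train_data, train_targets,
              epochs = 80, batch_size = 16, verbose = 0)
model %>% evaluate(test_data, test_targets, verbose = 0)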

When little training data is available, it's preferable to use a small network with few hidden layers (typically only one or two), in order to avoid severe overfitting.

In short: use only one or two hidden layers.
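To make that concrete, a hypothetical smaller variant of `build_model()` with a single hidden layer might look like this (an illustration of the advice, not code from the book):

build_small_model <- function() {
    model <- keras_model_sequential() %>% 
        layer_dense(units = 64
                    ,activation = "relu"
                    ,input_shape = dim(train_data)[[2]]
                    ) %>% 
        layer_dense(units = 1)
    
    model %>% 
        compile(
            optimizer = "rmsprop"
            ,loss = "mse"
            ,metrics = c("mae")
        )
}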

Chollet, François, and J.J. Allaire. 2018. Deep Learning with R. Manning Publications.