1 Introduction

The mlr package is a powerful tool for machine learning in R: it wraps a large number of R machine-learning packages under a unified syntax, ships with many convenience functions, and makes parameter tuning and model selection much easier.

2 Installing mlr

# install.packages("mlr", dependencies = TRUE)

3 Quick start

Before studying the mlr package systematically, let's run through a quick example to get a feel for its distinctive syntax. Below we build a classification model on the familiar iris data.
Note: this is only a bare-bones example; many details are left out.

## Load the mlr package and set up the data
library(mlr)
dat <- iris

## Define the task
task <- makeClassifTask(data = dat, target = "Species")

## Define the learner
lrn <- makeLearner("classif.lda")

## Create training and test sets
set.seed(45)
n <- nrow(dat)
train_set <- sample(n, size = 2/3*n)
test_set  <- setdiff(1:n, train_set)

## Train the model
model_lda <- train(learner = lrn, task = task, subset = train_set)

## Predict on the test set
pred_lda <- predict(model_lda, task = task, subset = test_set)

## Evaluate the model
lda_evaluate <- performance(pred_lda, measures = list(mmce, acc))
lda_evaluate
## mmce  acc 
## 0.04 0.96

4 Tasks

This section covers how to create the various task types, how to access a task, and how to modify a task; accessing a task includes extracting the data and other information it contains.

4.1 Task types and creation

mlr provides the following task types:

  • RegrTask for regression problems
  • ClassifTask for binary and multi-class classification problems
  • SurvTask for survival analysis
  • ClusterTask for cluster analysis
  • MultilabelTask for multilabel classification problems
  • CostSensTask for general cost-sensitive classification (with example-specific costs)

Short examples of each type follow:

4.1.1 Regression

data(BostonHousing, package = "mlbench")
regr_task <- makeRegrTask(data = BostonHousing, target = "medv")
regr_task
## Supervised task: BostonHousing
## Type: regr
## Target: medv
## Observations: 506
## Features:
##    numerics     factors     ordered functionals 
##          12           1           0           0 
## Missings: FALSE
## Has weights: FALSE
## Has blocking: FALSE
## Has coordinates: FALSE

4.1.2 Classification

data(BreastCancer, package = "mlbench")
df <- BreastCancer
df$Id <- NULL
classif_task <- makeClassifTask(data = df, target = "Class", positive = "malignant")
classif_task
## Supervised task: df
## Type: classif
## Target: Class
## Observations: 699
## Features:
##    numerics     factors     ordered functionals 
##           0           4           5           0 
## Missings: TRUE
## Has weights: FALSE
## Has blocking: FALSE
## Has coordinates: FALSE
## Classes: 2
##    benign malignant 
##       458       241 
## Positive class: malignant

4.1.3 Survival analysis

data(lung, package = "survival")
lung$status <- (lung$status == 2)  # convert status to a logical event indicator
surv_task <- makeSurvTask(data = lung, target = c("time", "status"))
surv_task
## Supervised task: lung
## Type: surv
## Target: time,status
## Events: 165
## Observations: 228
## Features:
##    numerics     factors     ordered functionals 
##           8           0           0           0 
## Missings: TRUE
## Has weights: FALSE
## Has blocking: FALSE
## Has coordinates: FALSE

4.1.4 Multilabel classification

yeast <- getTaskData(yeast.task)
labels <- colnames(yeast)[1:14]
yeast_task <- makeMultilabelTask(data = yeast, target = labels)
yeast_task
## Supervised task: yeast
## Type: multilabel
## Target: label1,label2,label3,label4,label5,label6,label7,label8,label9,label10,label11,label12,label13,label14
## Observations: 2417
## Features:
##    numerics     factors     ordered functionals 
##         103           0           0           0 
## Missings: FALSE
## Has weights: FALSE
## Has blocking: FALSE
## Has coordinates: FALSE
## Classes: 14
##  label1  label2  label3  label4  label5  label6  label7  label8  label9 
##     762    1038     983     862     722     597     428     480     178 
## label10 label11 label12 label13 label14 
##     253     289    1816    1799      34

4.1.5 Cluster analysis

data(mtcars, package = "datasets")
cluster_task <- makeClusterTask(data = mtcars)
cluster_task
## Unsupervised task: mtcars
## Type: cluster
## Observations: 32
## Features:
##    numerics     factors     ordered functionals 
##          11           0           0           0 
## Missings: FALSE
## Has weights: FALSE
## Has blocking: FALSE
## Has coordinates: FALSE

4.1.6 Cost-sensitive classification

df <- iris
## Random misclassification costs in [0, 2000]; (1 - diag(3)), indexed by the
## true class, zeroes out the cost of predicting the correct class
cost <- matrix(runif(150 * 3, 0, 2000), 150) * (1 - diag(3))[df$Species,]
df$Species <- NULL

costsens_task <- makeCostSensTask(data = df, cost = cost)
costsens_task
## Supervised task: df
## Type: costsens
## Observations: 150
## Features:
##    numerics     factors     ordered functionals 
##           4           0           0           0 
## Missings: FALSE
## Has blocking: FALSE
## Has coordinates: FALSE
## Classes: 3
## y1, y2, y3

4.2 Accessing a learning task

This subsection covers two things: accessing a task's description and extracting the data it contains.

4.2.1 Accessing the task

getTaskDesc(classif_task)           # return the complete task description
## $id
## [1] "df"
## 
## $type
## [1] "classif"
## 
## $target
## [1] "Class"
## 
## $size
## [1] 699
## 
## $n.feat
##    numerics     factors     ordered functionals 
##           0           4           5           0 
## 
## $has.missings
## [1] TRUE
## 
## $has.weights
## [1] FALSE
## 
## $has.blocking
## [1] FALSE
## 
## $has.coordinates
## [1] FALSE
## 
## $class.levels
## [1] "benign"    "malignant"
## 
## $positive
## [1] "malignant"
## 
## $negative
## [1] "benign"
## 
## $class.distribution
## 
##    benign malignant 
##       458       241 
## 
## attr(,"class")
## [1] "ClassifTaskDesc"    "SupervisedTaskDesc" "TaskDesc"
getTaskId(classif_task)             # return the task ID
## [1] "df"
getTaskType(classif_task)           # return the task type
## [1] "classif"
getTaskTargetNames(classif_task)    # return the target variable name(s)
## [1] "Class"
getTaskSize(classif_task)           # return the number of observations
## [1] 699
getTaskNFeats(classif_task)         # return the number of features
## [1] 9
getTaskClassLevels(classif_task)    # return the class levels of the classification task
## [1] "benign"    "malignant"

4.2.2 Extracting data

head(getTaskData(classif_task))     # extract the task data
##   Cl.thickness Cell.size Cell.shape Marg.adhesion Epith.c.size Bare.nuclei
## 1            5         1          1             1            2           1
## 2            5         4          4             5            7          10
## 3            3         1          1             1            2           2
## 4            6         8          8             1            3           4
## 5            4         1          1             3            2           1
## 6            8        10         10             8            7          10
##   Bl.cromatin Normal.nucleoli Mitoses     Class
## 1           3               1       1    benign
## 2           3               2       1    benign
## 3           3               1       1    benign
## 4           3               7       1    benign
## 5           3               1       1    benign
## 6           9               7       1 malignant
getTaskFeatureNames(classif_task)   # extract the feature names
## [1] "Cl.thickness"    "Cell.size"       "Cell.shape"      "Marg.adhesion"  
## [5] "Epith.c.size"    "Bare.nuclei"     "Bl.cromatin"     "Normal.nucleoli"
## [9] "Mitoses"
head(getTaskTargets(classif_task))  # extract the target values
## [1] benign    benign    benign    benign    benign    malignant
## Levels: benign malignant
head(getTaskCosts(costsens_task))   # extract the cost matrix
##      y1        y2        y3
## [1,]  0  707.3011  932.5762
## [2,]  0   77.8694 1531.8597
## [3,]  0  984.2454  753.4774
## [4,]  0  740.8373  797.5681
## [5,]  0 1457.0867 1711.5323
## [6,]  0 1129.0749 1249.6415

4.3 Modifying a learning task

Below we demonstrate a few of the task-modification operations; the full list is documented at https://www.rdocumentation.org/packages/mlr/versions/2.12.1/topics/capLargeValues. More will be added here later…
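
One operation from that page, capLargeValues(), is worth a quick sketch: it caps unusually large numeric feature values (the threshold of 300 below is arbitrary).

## Cap numeric feature values above 300 at 300 (impute defaults to the threshold)
capped_task <- capLargeValues(regr_task, threshold = 300)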

4.3.1 subsetTask

## Create a subtask (build a task from a subset of the observations)
cluster_task_sub <- subsetTask(cluster_task, subset = 4:17)
cluster_task_sub
## Unsupervised task: mtcars
## Type: cluster
## Observations: 14
## Features:
##    numerics     factors     ordered functionals 
##          11           0           0           0 
## Missings: FALSE
## Has weights: FALSE
## Has blocking: FALSE
## Has coordinates: FALSE

4.3.2 removeConstantFeatures

## Remove constant features
removeConstantFeatures(classif_task)
## Supervised task: df
## Type: classif
## Target: Class
## Observations: 699
## Features:
##    numerics     factors     ordered functionals 
##           0           4           5           0 
## Missings: TRUE
## Has weights: FALSE
## Has blocking: FALSE
## Has coordinates: FALSE
## Classes: 2
##    benign malignant 
##       458       241 
## Positive class: malignant

4.3.3 dropFeatures

## Drop the selected features
dropFeatures(classif_task, c("Cell.size", "Cell.shape"))
## Supervised task: df
## Type: classif
## Target: Class
## Observations: 699
## Features:
##    numerics     factors     ordered functionals 
##           0           4           3           0 
## Missings: TRUE
## Has weights: FALSE
## Has blocking: FALSE
## Has coordinates: FALSE
## Classes: 2
##    benign malignant 
##       458       241 
## Positive class: malignant

4.3.4 normalizeFeatures

## Normalize the numeric features to [0, 1]
task <- normalizeFeatures(cluster_task, method = "range")
summary(getTaskData(task))
##       mpg              cyl              disp              hp        
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.2138   1st Qu.:0.0000   1st Qu.:0.1240   1st Qu.:0.1572  
##  Median :0.3745   Median :0.5000   Median :0.3123   Median :0.2509  
##  Mean   :0.4124   Mean   :0.5469   Mean   :0.3982   Mean   :0.3346  
##  3rd Qu.:0.5277   3rd Qu.:1.0000   3rd Qu.:0.6358   3rd Qu.:0.4523  
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.0000   Max.   :1.0000  
##       drat              wt              qsec              vs        
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.1475   1st Qu.:0.2731   1st Qu.:0.2848   1st Qu.:0.0000  
##  Median :0.4309   Median :0.4633   Median :0.3821   Median :0.0000  
##  Mean   :0.3855   Mean   :0.4358   Mean   :0.3987   Mean   :0.4375  
##  3rd Qu.:0.5346   3rd Qu.:0.5362   3rd Qu.:0.5238   3rd Qu.:1.0000  
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.0000   Max.   :1.0000  
##        am              gear             carb       
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.1429  
##  Median :0.0000   Median :0.5000   Median :0.1429  
##  Mean   :0.4062   Mean   :0.3438   Mean   :0.2589  
##  3rd Qu.:1.0000   3rd Qu.:0.5000   3rd Qu.:0.4286  
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.0000

5 Learners

With tasks covered, it is time to look at learners. This section shows how to construct a learner, access its information, modify it, and list the available learners.

5.1 Constructing a learner

When constructing a learner you can specify not only its name but also hyperparameters, the prediction type (label or probability), and a custom ID. The five simple learners below illustrate this:

Three arguments are particularly important when constructing a learner:

  • The learner name, prefixed by the task type: classif/regr/surv/cluster/multilabel.
  • Hyperparameter values, set either via ... or via the par.vals argument.
  • fix.factors.prediction = TRUE, which avoids errors that occur when factors in the test set have fewer levels than in the training set.

Worth mentioning: mlr also provides makeLearners(), which constructs several learners of the same type in a single call.

5.1.1 makeLearner

## Random forest learner that predicts class probabilities
classif_lrn <- makeLearner("classif.randomForest", predict.type = "prob", fix.factors.prediction = TRUE)

## GBM regression learner with 500 boosting iterations (n.trees) and a maximum
## tree depth of 3 (interaction.depth)
regr_lrn <- makeLearner("regr.gbm", par.vals = list(n.trees = 500, interaction.depth = 3))

## Cox proportional hazards model with a custom ID
surv_lrn <- makeLearner("surv.coxph", id = "cph")

## K-means learner set to find 5 clusters
cluster_lrn <- makeLearner("cluster.kmeans", centers = 5)

## Multilabel random ferns learner
multilabel_lrn <- makeLearner("multilabel.rFerns")

5.1.2 makeLearners

## Create a classification tree and an LDA learner in one call, both predicting probabilities
rp_lda_lrn <- makeLearners(c("rpart", "lda"), type = "classif", predict.type = "prob")
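
makeLearners() returns a named list of Learner objects rather than a single learner; a quick way to confirm (output omitted):

## rp_lda_lrn is a named list with one entry per requested learner
names(rp_lda_lrn)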

5.2 Accessing a learner

You can access a learner's information in two ways: extract the slots directly with $, or use mlr's accessor functions:

  • Hyperparameter settings: getHyperPars()
  • Parameter set: getParamSet()
  • Learner ID: getLearnerId()
  • Short name: getLearnerShortName()
  • Learner type: getLearnerType()
  • Required packages: getLearnerPackages()

cluster_lrn$par.vals
## $centers
## [1] 5
getHyperPars(cluster_lrn)
## $centers
## [1] 5
getParamSet(cluster_lrn)
##               Type len           Def                             Constr
## centers    untyped   -             -                                  -
## iter.max   integer   -            10                           1 to Inf
## nstart     integer   -             1                           1 to Inf
## algorithm discrete   - Hartigan-Wong Hartigan-Wong,Lloyd,Forgy,MacQueen
## trace      logical   -             -                                  -
##           Req Tunable Trafo
## centers     -    TRUE     -
## iter.max    -    TRUE     -
## nstart      -    TRUE     -
## algorithm   -    TRUE     -
## trace       -   FALSE     -
getLearnerId(cluster_lrn)
## [1] "cluster.kmeans"
getLearnerShortName(cluster_lrn)
## [1] "kmeans"
getLearnerType(cluster_lrn)
## [1] "cluster"
getLearnerPackages(cluster_lrn)
## [1] "stats" "clue"

5.3 Modifying a learner

As with tasks, mlr provides functions for modifying an existing learner, so you can adjust it without recreating it from scratch.

Below we demonstrate how to change a learner's ID, change its predict type, set hyperparameters, and reset custom hyperparameters to their defaults.

5.3.1 setLearnerId

surv_lrn <- setLearnerId(surv_lrn, "CoxModel")
surv_lrn
## Learner CoxModel from package survival
## Type: surv
## Name: Cox Proportional Hazard Model; Short name: coxph
## Class: surv.coxph
## Properties: numerics,factors,weights
## Predict-Type: response
## Hyperparameters:

5.3.2 setPredictType

classif_lrn <- setPredictType(classif_lrn, "response")
classif_lrn
## Learner classif.randomForest from package randomForest
## Type: classif
## Name: Random Forest; Short name: rf
## Class: classif.randomForest
## Properties: twoclass,multiclass,numerics,factors,ordered,prob,class.weights,oobpreds,featimp
## Predict-Type: response
## Hyperparameters:

5.3.3 setHyperPars

cluster_lrn <- setHyperPars(cluster_lrn, centers = 4)
cluster_lrn
## Learner cluster.kmeans from package stats,clue
## Type: cluster
## Name: K-Means; Short name: kmeans
## Class: cluster.kmeans
## Properties: numerics,prob
## Predict-Type: response
## Hyperparameters: centers=4

5.3.4 removeHyperPars

regr_lrn <- removeHyperPars(regr_lrn, c("n.trees", "interaction.depth"))
regr_lrn
## Learner regr.gbm from package gbm
## Type: regr
## Name: Gradient Boosting Machine; Short name: gbm
## Class: regr.gbm
## Properties: missings,numerics,factors,weights,featimp
## Predict-Type: response
## Hyperparameters: distribution=gaussian,keep.data=FALSE

5.4 Listing learners

mlr provides well over a hundred learners; here are several ways to list them:

## List the learners in mlr together with their required packages
lrns <- listLearners()
head(lrns[c("class", "package")])
##                 class      package
## 1         classif.ada    ada,rpart
## 2  classif.adaboostm1        RWeka
## 3 classif.bartMachine  bartMachine
## 4    classif.binomial        stats
## 5  classif.blackboost mboost,party
## 6    classif.boosting adabag,rpart
## List the classification learners that can output probabilities
lrns <- listLearners("classif", properties = "prob")
head(lrns[c("class", "package")])
##                 class      package
## 1         classif.ada    ada,rpart
## 2  classif.adaboostm1        RWeka
## 3 classif.bartMachine  bartMachine
## 4    classif.binomial        stats
## 5  classif.blackboost mboost,party
## 6    classif.boosting adabag,rpart
## List the learners applicable to iris.task that can output probabilities
lrns <- listLearners(iris.task, properties = "prob")
head(lrns[c("class", "package")])
##                class      package
## 1 classif.adaboostm1        RWeka
## 2   classif.boosting adabag,rpart
## 3        classif.C50          C50
## 4    classif.cforest        party
## 5      classif.ctree        party
## 6   classif.cvglmnet       glmnet
## The calls above return a data frame; with create = TRUE, listLearners()
## returns actual learner objects instead
head(listLearners("cluster", create = TRUE), 2)
## [[1]]
## Learner cluster.cmeans from package e1071,clue
## Type: cluster
## Name: Fuzzy C-Means Clustering; Short name: cmeans
## Class: cluster.cmeans
## Properties: numerics,prob
## Predict-Type: response
## Hyperparameters: centers=2
## 
## 
## [[2]]
## Learner cluster.Cobweb from package RWeka
## Type: cluster
## Name: Cobweb Clustering Algorithm; Short name: cobweb
## Class: cluster.Cobweb
## Properties: numerics
## Predict-Type: response
## Hyperparameters:

6 Train

With tasks and learners in place, the next step is to train a model by combining the two.

6.1 Training a Learner

In general, there is no need to create a learner object explicitly: passing the learner's class name to train() is enough. You only need makeLearner() when you deviate from the defaults, e.g. to set hyperparameters or change the predict type.

## Define a task
task <- makeClassifTask(data = iris, target = "Species")

## Create a learner
lrn <- makeLearner("classif.lda")

## Train the model
mod <- train(lrn, task)
mod
## Model for learner.id=classif.lda; learner.class=classif.lda
## Trained on: task.id = iris; obs = 150; features = 4
## Hyperparameters:
## Train without explicitly creating a learner
mod <- train("classif.lda", task)
mod
## Model for learner.id=classif.lda; learner.class=classif.lda
## Trained on: task.id = iris; obs = 150; features = 4
## Hyperparameters:
mod <- train("surv.coxph", lung.task)
mod
## Model for learner.id=surv.coxph; learner.class=surv.coxph
## Trained on: task.id = lung-example; obs = 167; features = 8
## Hyperparameters:

6.2 Accessing learner models

A trained model bundles many components, including the learner, the task description, the features used, the number of observations, and the training time. The fitted model itself can be extracted with $learner.model or getLearnerModel(). Below we cluster the ruspini data set (two features) into K = 4 clusters.

## Load the data and plot its two-dimensional distribution
data(ruspini, package = "cluster")
plot(y ~ x, ruspini)

## Define a task
ruspini_task <- makeClusterTask(data = ruspini)

## Create the learner
lrn <- makeLearner("cluster.kmeans", centers = 4)

## Train the model
mod <- train(lrn, ruspini_task)
mod
## Model for learner.id=cluster.kmeans; learner.class=cluster.kmeans
## Trained on: task.id = ruspini; obs = 75; features = 2
## Hyperparameters: centers=4
## Inspect the components of the model object
names(mod)
## [1] "learner"       "learner.model" "task.desc"     "subset"       
## [5] "features"      "factor.levels" "time"          "dump"
mod$learner
## Learner cluster.kmeans from package stats,clue
## Type: cluster
## Name: K-Means; Short name: kmeans
## Class: cluster.kmeans
## Properties: numerics,prob
## Predict-Type: response
## Hyperparameters: centers=4
mod$features
## [1] "x" "y"
mod$time
## [1] 0
getLearnerModel(mod)
## K-means clustering with 4 clusters of sizes 20, 23, 17, 15
## 
## Cluster means:
##          x        y
## 1 20.15000  64.9500
## 2 43.91304 146.0435
## 3 98.17647 114.8824
## 4 68.93333  19.4000
## 
## Clustering vector:
##  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 
##  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  2  2  2  2  2 
## 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 
##  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  3  3  3  3  3  3  3 
## 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 
##  3  3  3  3  3  3  3  3  3  3  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4 
## 
## Within cluster sum of squares by cluster:
## [1] 3689.500 3176.783 4558.235 1456.533
##  (between_SS / total_SS =  94.7 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"    
## [5] "tot.withinss" "betweenss"    "size"         "iter"        
## [9] "ifault"

6.3 Further options & comments

Models are trained with train(), whose subset argument restricts training to a subset of the observations. In practice, though, you will usually rely on a resampling strategy rather than hand-picking a subset. For class-imbalance problems you can pass observation weights via the weights argument; mlr offers many other techniques for imbalanced data, described at the link below.
https://mlr-org.github.io/mlr-tutorial/release/html/over_and_undersampling/index.html
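
As a small taste of that page: mlr can rebalance a task directly with oversample() and undersample() (a sketch; the rates are illustrative).

## Double the size of the minority class by sampling with replacement
task_over <- oversample(bc.task, rate = 2)
table(getTaskTargets(task_over))

## Shrink the majority class to half its size
task_under <- undersample(bc.task, rate = 1/2)
table(getTaskTargets(task_under))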

6.3.1 subset

## Get the number of observations
n <- getTaskSize(bh.task)

## Use one third of the observations as the training set
set.seed(45)
train_set <- sample(n, size = n/3)

## Train the model
mod <- train("regr.lm", task = bh.task, subset = train_set)
mod
## Model for learner.id=regr.lm; learner.class=regr.lm
## Trained on: task.id = BostonHousing-example; obs = 168; features = 13
## Hyperparameters:

6.3.2 weights

## Compute inverse class-frequency weights to balance the classes
target <- getTaskTargets(bc.task)
tab <- as.numeric(table(target))
w <- 1/tab[target]

## Train the model
mod <- train("classif.rpart", task = bc.task, weights = w)
mod
## Model for learner.id=classif.rpart; learner.class=classif.rpart
## Trained on: task.id = BreastCancer-example; obs = 683; features = 9
## Hyperparameters: xval=0

7 Predict

Prediction boils down to passing new data to a trained model and letting it compute the outcomes. Having seen how to build a model, what remains is how to hand the new data to it. mlr offers two ways:

  • select a subset of the task's observations (subset)
  • pass a new data frame directly (newdata)

Note: when predicting with mlr you do not need to remove the target variable; it is dropped automatically.

7.1 Predicting Outcomes for New Data

Below, the BostonHousing and iris data demonstrate the two ways of passing data for prediction.

7.1.1 subset

n <- getTaskSize(bh.task)
train_set <- seq(1, n, by = 2)
test_set <- seq(2, n, by = 2)

lrn <- makeLearner("regr.gbm", n.trees = 100)
mod <- train(lrn, bh.task, subset = train_set)

task_pred <- predict(mod, task = bh.task, subset = test_set)
task_pred
## Prediction: 253 observations
## predict.type: response
## threshold: 
## time: 0.00
##    id truth response
## 2   2  21.6 22.22588
## 4   4  33.4 23.18013
## 6   6  28.7 22.37618
## 8   8  27.1 22.13908
## 10 10  18.9 22.13908
## 12 12  18.9 22.13908
## ... (#rows: 253, #cols: 3)

7.1.2 newdata

n <- nrow(iris)
iris_train <- iris[seq(1, n, by = 2), -5]
iris_test <- iris[seq(2, n, by = 2), -5]

task <- makeClusterTask(data = iris_train)
mod <- train("cluster.kmeans", task)

newdata_pred <- predict(mod, newdata = iris_test)
newdata_pred
## Prediction: 75 observations
## predict.type: response
## threshold: 
## time: 0.00
##    response
## 2         2
## 4         2
## 6         2
## 8         2
## 10        2
## 12        2
## ... (#rows: 75, #cols: 1)

7.2 Accessing the prediction

Below we access information from the two predictions above in different ways.

  • as.data.frame() returns not only the predicted response but also extra columns such as the row id and the ground truth.
  • getPredictionTruth() and getPredictionResponse() extract one specific piece of information.

7.2.1 getPrediction*

head(as.data.frame(task_pred))
##    id truth response
## 2   2  21.6 22.22588
## 4   4  33.4 23.18013
## 6   6  28.7 22.37618
## 8   8  27.1 22.13908
## 10 10  18.9 22.13908
## 12 12  18.9 22.13908
head(as.data.frame(newdata_pred))
##    response
## 2         2
## 4         2
## 6         2
## 8         2
## 10        2
## 12        2
head(getPredictionTruth(task_pred))
## [1] 21.6 33.4 28.7 27.1 18.9 18.9
head(getPredictionResponse(task_pred))
## [1] 22.22588 23.18013 22.37618 22.13908 22.13908 22.13908

7.2.2 Regression: standard errors

## List the regression learners that can estimate standard errors
head(listLearners("regr", check.packages = FALSE, properties = "se")[c("class", "name")])
##          class
## 1   regr.bcart
## 2     regr.bgp
## 3  regr.bgpllm
## 4     regr.blm
## 5    regr.btgp
## 6 regr.btgpllm
##                                                                      name
## 1                                                           Bayesian CART
## 2                                               Bayesian Gaussian Process
## 3       Bayesian Gaussian Process with jumps to the Limiting Linear Model
## 4                                                   Bayesian Linear Model
## 5                                         Bayesian Treed Gaussian Process
## 6 Bayesian Treed Gaussian Process with jumps to the Limiting Linear Model
## Create learner and specify predict.type
lrn_lm <- makeLearner("regr.lm", predict.type = 'se')
mod_lm <- train(lrn_lm, bh.task, subset = train_set)
task_pred_lm <- predict(mod_lm, task = bh.task, subset = test_set)
task_pred_lm
## Prediction: 253 observations
## predict.type: se
## threshold: 
## time: 0.00
##    id truth response        se
## 2   2  21.6 24.83734 0.7501615
## 4   4  33.4 28.38206 0.8742590
## 6   6  28.7 25.16725 0.8652139
## 8   8  27.1 19.38145 1.1963265
## 10 10  18.9 18.66449 1.1793944
## 12 12  18.9 21.25802 1.0727918
## ... (#rows: 253, #cols: 4)
getPredictionSE(task_pred_lm)
##   [1] 0.7501615 0.8742590 0.8652139 1.1963265 1.1793944 1.0727918 0.8429301
##   [8] 0.8501582 0.7679939 0.7978658 0.8242887 0.9518889 0.8870152 0.8855407
##  [15] 0.8701704 0.9574664 0.8840397 0.7264163 0.7792051 1.2116561 1.1769924
##  [22] 1.0915279 0.8271369 1.0202378 0.8842189 0.8842040 0.8469392 1.3276524
##  [29] 1.4106337 0.8445755 1.2408730 1.0284519 1.1041909 0.8291724 0.7543209
##  [36] 0.9929409 1.1375092 0.8367674 0.8402063 0.9082750 0.7022658 0.5771416
##  [43] 0.6308225 0.7278137 0.7811490 0.8189424 1.1384919 0.8335267 1.1364319
##  [50] 0.9138132 0.8760649 0.8655742 1.0203456 0.9359339 0.9109483 0.7508824
##  [57] 0.8448710 0.8043901 0.7906399 0.7786039 1.6494629 1.7380621 1.6491231
##  [64] 1.0193436 1.0088684 1.1054014 1.0286131 1.0927726 1.1071940 1.0774204
##  [71] 1.4149076 1.5083177 1.6783724 1.5859976 1.4461011 1.5548512 1.4818863
##  [78] 1.9595402 1.2321197 1.6034097 1.3099265 1.6743209 1.2322197 1.1830271
##  [85] 1.0915488 1.1511754 0.8948695 0.9734422 0.8509782 0.8854074 0.9652907
##  [92] 1.1272297 0.8477232 1.1092160 0.9952516 0.8699082 0.9101939 1.2047344
##  [99] 1.2692851 1.3234582 1.2451465 1.5187036 0.9656436 0.6745382 1.5195856
## [106] 1.4771975 0.8085293 0.6804735 0.8445263 1.2367161 1.4679595 0.7451217
## [113] 1.3120481 0.7550944 1.0506752 0.7824162 1.0591481 0.6721549 0.7658662
## [120] 0.5971137 0.8165095 0.8888276 1.1275579 1.2189724 0.9495364 0.9423921
## [127] 1.7165446 1.1507946 1.5170934 1.3085342 1.2224821 1.2380792 1.4748090
## [134] 1.3338453 1.3168927 0.9507626 1.3621199 0.7914587 1.3003635 0.9604363
## [141] 0.8560980 1.7185417 0.8309263 0.8365242 0.8283471 1.3929607 1.2917833
## [148] 1.2660946 1.2910508 1.0252428 0.6695624 0.8263094 1.0279423 1.0776391
## [155] 0.6124678 0.7961893 0.6542160 0.6485521 0.5305173 0.5148376 0.6159465
## [162] 0.7209050 1.0336628 0.6941150 1.3315840 0.7510469 1.0942469 1.0304272
## [169] 0.9124961 0.8779543 0.8502599 0.8013981 1.1782294 1.1445984 1.1966195
## [176] 1.4522210 1.9706832 1.6584676 1.4066579 1.0953827 0.9976729 1.3700237
## [183] 2.2591492 1.9818337 1.4634298 1.0009532 1.2710083 1.0686214 1.0153052
## [190] 0.8512834 0.9063260 0.8898067 1.0443378 1.1404651 0.8633209 0.8303382
## [197] 0.7996595 0.8201927 0.8099873 1.1723586 0.8322803 0.9681173 2.3119839
## [204] 0.9942718 1.0421645 1.3309447 1.1610612 1.3856653 0.9935274 1.3833241
## [211] 0.7574302 1.3899578 1.2231405 1.5140112 1.1854390 1.2557147 1.0655491
## [218] 1.1503370 1.3098914 0.8885734 0.9079921 0.9101242 1.2690583 0.8749875
## [225] 0.7832464 0.9013180 1.1767535 1.2340304 1.3179744 0.8855876 0.9116910
## [232] 0.9899714 1.0197644 0.9329738 0.9604394 1.1078087 0.9803479 1.0075409
## [239] 0.8705739 0.8214196 1.0967451 1.2078927 1.0442883 1.0051250 1.8768373
## [246] 1.8896669 0.8663897 1.2048017 0.6488765 0.7503097 0.9700472 1.1082776
## [253] 1.0145445

7.2.3 Clustering: probabilities

lrn <- makeLearner("cluster.cmeans", predict.type = "prob")
mod <- train(lrn, mtcars.task)

pred <- predict(mod, task = mtcars.task)
head(getPredictionProbabilities(pred))
##                            1           2
## Mazda RX4         0.97959271 0.020407288
## Mazda RX4 Wag     0.97963293 0.020367069
## Datsun 710        0.99266074 0.007339256
## Hornet 4 Drive    0.54290616 0.457093843
## Hornet Sportabout 0.01870482 0.981295175
## Valiant           0.75745403 0.242545967

7.2.4 Classification: probabilities

lrn <- makeLearner("classif.rpart", predict.type = "prob")

mod <- train(lrn, task = iris.task)

pred <- predict(mod, newdata = iris)
head(as.data.frame(pred))
##    truth prob.setosa prob.versicolor prob.virginica response
## 1 setosa           1               0              0   setosa
## 2 setosa           1               0              0   setosa
## 3 setosa           1               0              0   setosa
## 4 setosa           1               0              0   setosa
## 5 setosa           1               0              0   setosa
## 6 setosa           1               0              0   setosa
head(getPredictionProbabilities(pred))  # probabilities only
##   setosa versicolor virginica
## 1      1          0         0
## 2      1          0         0
## 3      1          0         0
## 4      1          0         0
## 5      1          0         0
## 6      1          0         0

7.2.5 Classification: Confusion matrix

calculateConfusionMatrix(pred)
##             predicted
## true         setosa versicolor virginica -err.-
##   setosa         50          0         0      0
##   versicolor      0         49         1      1
##   virginica       0          5        45      5
##   -err.-          0          5         1      6
conf_matrix <- calculateConfusionMatrix(pred, relative = TRUE); conf_matrix
## Relative confusion matrix (normalized by row/column):
##             predicted
## true         setosa    versicolor virginica -err.-   
##   setosa     1.00/1.00 0.00/0.00  0.00/0.00 0.00     
##   versicolor 0.00/0.00 0.98/0.91  0.02/0.02 0.02     
##   virginica  0.00/0.00 0.10/0.09  0.90/0.98 0.10     
##   -err.-          0.00      0.09       0.02 0.04     
## 
## 
## Absolute confusion matrix:
##             predicted
## true         setosa versicolor virginica -err.-
##   setosa         50          0         0      0
##   versicolor      0         49         1      1
##   virginica       0          5        45      5
##   -err.-          0          5         1      6
conf_matrix$relative.row
##            setosa versicolor virginica -err-
## setosa          1       0.00      0.00  0.00
## versicolor      0       0.98      0.02  0.02
## virginica       0       0.10      0.90  0.10

7.2.6 Classification: Adjusting the decision threshold

lrn <- makeLearner("classif.rpart", predict.type = "prob")
mod <- train(lrn, task = sonar.task)

## The positive class of this task
getTaskDesc(sonar.task)$positive
## [1] "M"
## The default threshold
pred1 <- predict(mod, sonar.task)
pred1$threshold
##   M   R 
## 0.5 0.5
## Set a custom threshold
pred2 <- setThreshold(pred1, 0.9)
pred2$threshold
##   M   R 
## 0.9 0.1
pred2
## Prediction: 208 observations
## predict.type: prob
## threshold: M=0.90,R=0.10
## time: 0.00
##   id truth    prob.M    prob.R response
## 1  1     R 0.1060606 0.8939394        R
## 2  2     R 0.7333333 0.2666667        R
## 3  3     R 0.0000000 1.0000000        R
## 4  4     R 0.1060606 0.8939394        R
## 5  5     R 0.9250000 0.0750000        M
## 6  6     R 0.0000000 1.0000000        R
## ... (#rows: 208, #cols: 5)
## Compare the confusion matrices under the two thresholds
calculateConfusionMatrix(pred1)
##         predicted
## true      M  R -err.-
##   M      95 16     16
##   R      10 87     10
##   -err.- 10 16     26
calculateConfusionMatrix(pred2)
##         predicted
## true      M  R -err.-
##   M      84 27     27
##   R       6 91      6
##   -err.-  6 27     33
## Probability of the positive class for each observation
head(getPredictionProbabilities(pred1))
## [1] 0.1060606 0.7333333 0.0000000 0.1060606 0.9250000 0.0000000
## Probabilities for both classes
head(getPredictionProbabilities(pred1, cl = c("M", "R")))
##           M         R
## 1 0.1060606 0.8939394
## 2 0.7333333 0.2666667
## 3 0.0000000 1.0000000
## 4 0.1060606 0.8939394
## 5 0.9250000 0.0750000
## 6 0.0000000 1.0000000
## Thresholds work for multi-class problems too: the class maximizing
## prob/threshold is predicted
lrn <- makeLearner("classif.rpart", predict.type = "prob")
mod <- train(lrn, iris.task)
pred <- predict(mod, newdata = iris)
pred$threshold
##     setosa versicolor  virginica 
##  0.3333333  0.3333333  0.3333333
table(as.data.frame(pred)$response)
## 
##     setosa versicolor  virginica 
##         50         54         46
pred <- setThreshold(pred, c(setosa = 0.01, versicolor = 50, virginica = 1))
pred$threshold
##     setosa versicolor  virginica 
##       0.01      50.00       1.00
table(as.data.frame(pred)$response)
## 
##     setosa versicolor  virginica 
##         50          0        100

7.2.7 Visualizing the prediction

plotLearnerPrediction() trains the given learner on two features of the task (by default the first two; other features can be specified), predicts, and visualizes the predictions. For regression tasks, choosing one feature versus two produces different kinds of plots.

## Classification task
lrn <- makeLearner("classif.rpart", id = "CART")
plotLearnerPrediction(lrn, task = iris.task)

## Clustering task
lrn <- makeLearner("cluster.kmeans")
plotLearnerPrediction(lrn, task = mtcars.task, features = c("disp", "drat"), cv = 0)

## Regression task
plotLearnerPrediction("regr.lm", features = "lstat", task = bh.task)          # one feature

plotLearnerPrediction("regr.lm", features = c("lstat", "rm"), task = bh.task) # 两个变量

8 Preprocessing (not covered)

9 Performance

9.1 Listing measures

Different tasks call for different evaluation measures; listMeasures() lists all measures applicable to a given task. For convenience, mlr assigns each task type a default measure, e.g. the mean squared error (mse) for regression and the mean misclassification error (mmce) for classification. To see the default for a specific task or learner, use getDefaultMeasure().

# Measures suitable for multi-class classification learners
listMeasures("classif", properties = "classif.multi")
##  [1] "featperc"         "mmce"             "lsr"             
##  [4] "bac"              "qsr"              "timeboth"        
##  [7] "multiclass.aunp"  "timetrain"        "multiclass.aunu" 
## [10] "ber"              "timepredict"      "multiclass.brier"
## [13] "ssr"              "acc"              "logloss"         
## [16] "wkappa"           "multiclass.au1p"  "multiclass.au1u" 
## [19] "kappa"
# Measures suitable for iris.task
listMeasures(iris.task)
##  [1] "featperc"         "mmce"             "lsr"             
##  [4] "bac"              "qsr"              "timeboth"        
##  [7] "multiclass.aunp"  "timetrain"        "multiclass.aunu" 
## [10] "ber"              "timepredict"      "multiclass.brier"
## [13] "ssr"              "acc"              "logloss"         
## [16] "wkappa"           "multiclass.au1p"  "multiclass.au1u" 
## [19] "kappa"
# The default measure of iris.task
getDefaultMeasure(iris.task)
## Name: Mean misclassification error
## Performance measure: mmce
## Properties: classif,classif.multi,req.pred,req.truth
## Minimize: TRUE
## Best: 0; Worst: 1
## Aggregated by: test.mean
## Arguments: 
## Note: Defined as: mean(response != truth)
# The default measure of a regr.lm learner
getDefaultMeasure(makeLearner("regr.lm"))
## Name: Mean of squared errors
## Performance measure: mse
## Properties: regr,req.pred,req.truth
## Minimize: TRUE
## Best: 0; Worst: Inf
## Aggregated by: test.mean
## Arguments: 
## Note: Defined as: mean((response - truth)^2)

9.2 Calculating performance measures

n <- getTaskSize(bh.task)
lrn <- makeLearner("regr.gbm", n.trees = 1000)
mod <- train(lrn, task = bh.task, subset = seq(1, n, 2))
pred <- predict(mod, task = bh.task, subset = seq(2, n, 2))

performance(pred)
##      mse 
## 42.83207
performance(pred, measures = medse)
##    medse 
## 9.315383
performance(pred, measures = list(mse, medse, mae))
##       mse     medse       mae 
## 42.832075  9.315383  4.557009

9.3 Requirements of performance measures

Some measures need more than the prediction object: timetrain requires the fitted model, the Dunn index (dunn) requires the task to access the data, and auc requires probability predictions.

performance(pred, measures = timetrain, model = mod)
## timetrain 
##      0.28
lrn <- makeLearner("cluster.kmeans", centers = 3)
mod <- train(lrn, mtcars.task)
pred <- predict(mod, task = mtcars.task)

performance(pred, measures = dunn, task = mtcars.task)
##      dunn 
## 0.1462919
lrn <- makeLearner("classif.rpart", predict.type = "prob")
mod <- train(lrn, task = sonar.task)
pred <- predict(mod, task = sonar.task)

performance(pred, measures = auc)
##       auc 
## 0.9224018

9.4 Access a performance measure

## Mean misclassification error
str(mmce)
## List of 10
##  $ id        : chr "mmce"
##  $ minimize  : logi TRUE
##  $ properties: chr [1:4] "classif" "classif.multi" "req.pred" "req.truth"
##  $ fun       :function (task, model, pred, feats, extra.args)  
##  $ extra.args: list()
##  $ best      : num 0
##  $ worst     : num 1
##  $ name      : chr "Mean misclassification error"
##  $ note      : chr "Defined as: mean(response != truth)"
##  $ aggr      :List of 4
##   ..$ id        : chr "test.mean"
##   ..$ name      : chr "Test mean"
##   ..$ fun       :function (task, perf.test, perf.train, measure, group, pred)  
##   ..$ properties: chr "req.test"
##   ..- attr(*, "class")= chr "Aggregation"
##  - attr(*, "class")= chr "Measure"

9.5 Plot performance versus threshold

## Train a model and predict
lrn <- makeLearner("classif.lda", predict.type = "prob")
n <- getTaskSize(sonar.task)
mod <- train(lrn, task = sonar.task, subset = seq(1, n, by = 2))
pred <- predict(mod, task = sonar.task, subset = seq(2, n, by = 2))

## Evaluate the model (at the default threshold of 0.5)
performance(pred, measures = list(fpr, fnr, mmce))
##       fpr       fnr      mmce 
## 0.2500000 0.3035714 0.2788462
## Inspect the measures across all thresholds
d <- generateThreshVsPerfData(pred, measures = list(fpr, fnr, mmce))
plotThreshVsPerf(d)

9.6 ROC measures

r <- calculateROCMeasures(pred); r
##     predicted
## true M         R                            
##    M 39        17        tpr: 0.7  fnr: 0.3 
##    R 12        36        fpr: 0.25 tnr: 0.75
##      ppv: 0.76 for: 0.32 lrp: 2.79 acc: 0.72
##      fdr: 0.24 npv: 0.68 lrm: 0.4  dor: 6.88
## 
## 
## Abbreviations:
## tpr - True positive rate (Sensitivity, Recall)
## fpr - False positive rate (Fall-out)
## fnr - False negative rate (Miss rate)
## tnr - True negative rate (Specificity)
## ppv - Positive predictive value (Precision)
## for - False omission rate
## lrp - Positive likelihood ratio (LR+)
## fdr - False discovery rate
## npv - Negative predictive value
## acc - Accuracy
## lrm - Negative likelihood ratio (LR-)
## dor - Diagnostic odds ratio
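
The same prediction can also be visualized as an ROC curve; a minimal sketch using generateThreshVsPerfData() and plotROCCurves() (plot not shown):

## ROC curve: false positive rate against true positive rate over all thresholds
roc_data <- generateThreshVsPerfData(pred, measures = list(fpr, tpr))
plotROCCurves(roc_data)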

10 Resampling

10.1 Defining the resampling strategy

mlr provides six resampling strategies (CV and Holdout are instantiated in the code below; the remaining four are sketched after it):

  • Cross-validation (“CV”)
  • Leave-one-out cross-validation (“LOO”)
  • Repeated cross-validation (“RepCV”)
  • Out-of-bag bootstrap and other variants like b632 (“Bootstrap”)
  • Subsampling, also called Monte-Carlo cross-validation (“Subsample”)
  • Holdout (training/test) (“Holdout”)

## 3-fold cross-validation
rdesc <- makeResampleDesc("CV", iters = 3); rdesc
## Resample description: cross-validation with 3 iterations.
## Predict: test
## Stratification: FALSE
## Holdout
rdesc <- makeResampleDesc("Holdout"); rdesc
## Resample description: holdout with 0.67 split rate.
## Predict: test
## Stratification: FALSE
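
The remaining strategies are described the same way; a sketch with illustrative parameter values:

## Leave-one-out, repeated CV, bootstrap and subsampling
rdesc_loo  <- makeResampleDesc("LOO")
rdesc_rep  <- makeResampleDesc("RepCV", reps = 3, folds = 10)
rdesc_boot <- makeResampleDesc("Bootstrap", iters = 10)
rdesc_sub  <- makeResampleDesc("Subsample", iters = 5, split = 0.8)

## Stratification keeps the class distribution roughly constant across folds
rdesc_strat <- makeResampleDesc("CV", iters = 3, stratify = TRUE)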

10.2 Performing the resampling

Below we fit medv (the target) of BostonHousing with a linear model, using 3-fold cross-validation as the resampling strategy. The aggregated performance (mse.test.mean) is simply the mean of the three fold models' MSE values.

rdesc <- makeResampleDesc("CV", iters = 3)

set.seed(45)
r <- resample("regr.lm", bh.task, rdesc); r
## Resample Result
## Task: BostonHousing-example
## Learner: regr.lm
## Aggr perf: mse.test.mean=24.0414320
## Runtime: 0.044003
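
To verify that reading, the per-fold and aggregated values are stored in the resample result; a short sketch (output omitted):

## One MSE per fold; their mean equals the aggregated mse.test.mean above
r$measures.test
mean(r$measures.test$mse)
r$aggr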