mlr包可谓是机器学习的一大利器,该包综合了诸多的R机器学习包,统一了语法,提供了大量便捷的工具,同时使调参和模型选择更加便捷。
# install.packages("mlr", dependencies = TRUE)
在系统的学习mlr包之前,我们先用其做一个简单的案例,先见识一下其独特的语法。下面我们就用大家所熟悉的iris数据开始做一个分类模型:
注:下面仅是一个简单的举例,很多地方并不完善。
## Load mlr and choose the data set
library(mlr)
dat <- iris
## Define the task: classify Species from the remaining columns
task <- makeClassifTask(data = dat, target = "Species")
## Define the learner (linear discriminant analysis)
lrn <- makeLearner("classif.lda")
## Split the row indices into a training set (2/3) and a test set (the rest)
set.seed(45)
n <- nrow(dat)
# round() turns the size into an explicit whole number; without it a product
# that lands just below an integer through floating-point error would be
# truncated by sample() and silently lose one training row
train_set <- sample(n, size = round(2 / 3 * n))
# seq_len() stays correct even when n is 0, unlike 1:n
test_set <- setdiff(seq_len(n), train_set)
## Train the model on the training rows only
model_lda <- train(learner = lrn, task = task, subset = train_set)
## Predict on the held-out rows
pred_lda <- predict(model_lda, task = task, subset = test_set)
## Evaluate with misclassification error (mmce) and accuracy (acc)
lda_evaluate <- performance(pred_lda, measures = list(mmce, acc))
lda_evaluate
## mmce acc
## 0.04 0.96
在本部分,主要总结了如何创建各类任务、如何访问任务和如何修改任务。其中如何访问任务中包含了如何获取任务中的各种数据信息。
mlr包中提供了以下几种任务:
下面我们分别用简短的例子去实现:
## Regression task: predict `medv` from the BostonHousing data (mlbench)
data("BostonHousing", package = "mlbench")
regr_task <- makeRegrTask(target = "medv", data = BostonHousing)
print(regr_task)
## Supervised task: BostonHousing
## Type: regr
## Target: medv
## Observations: 506
## Features:
## numerics factors ordered functionals
## 12 1 0 0
## Missings: FALSE
## Has weights: FALSE
## Has blocking: FALSE
## Has coordinates: FALSE
## Binary classification task on the BreastCancer data (mlbench)
data("BreastCancer", package = "mlbench")
df <- BreastCancer
# Drop the Id column so it is not part of the task data
df[["Id"]] <- NULL
classif_task <- makeClassifTask(
  data = df,
  target = "Class",
  positive = "malignant" # class to treat as the positive one
)
print(classif_task)
## Supervised task: df
## Type: classif
## Target: Class
## Observations: 699
## Features:
## numerics factors ordered functionals
## 0 4 5 0
## Missings: TRUE
## Has weights: FALSE
## Has blocking: FALSE
## Has coordinates: FALSE
## Classes: 2
## benign malignant
## 458 241
## Positive class: malignant
## Survival task on the lung data (survival package)
data("lung", package = "survival")
# Recode status as a logical: TRUE where the original code equals 2
lung[["status"]] <- lung[["status"]] == 2
surv_task <- makeSurvTask(target = c("time", "status"), data = lung)
print(surv_task)
## Supervised task: lung
## Type: surv
## Target: time,status
## Events: 165
## Observations: 228
## Features:
## numerics factors ordered functionals
## 8 0 0 0
## Missings: TRUE
## Has weights: FALSE
## Has blocking: FALSE
## Has coordinates: FALSE
## Multilabel task rebuilt from the data of yeast.task
## (yeast.task is not defined in this file; presumably an example task shipped with mlr)
yeast <- getTaskData(yeast.task)
# The first 14 columns are used as the label (target) columns
labels <- head(colnames(yeast), 14)
yeast_task <- makeMultilabelTask(target = labels, data = yeast)
print(yeast_task)
## Supervised task: yeast
## Type: multilabel
## Target: label1,label2,label3,label4,label5,label6,label7,label8,label9,label10,label11,label12,label13,label14
## Observations: 2417
## Features:
## numerics factors ordered functionals
## 103 0 0 0
## Missings: FALSE
## Has weights: FALSE
## Has blocking: FALSE
## Has coordinates: FALSE
## Classes: 14
## label1 label2 label3 label4 label5 label6 label7 label8 label9
## 762 1038 983 862 722 597 428 480 178
## label10 label11 label12 label13 label14
## 253 289 1816 1799 34
## Clustering task on mtcars; no target is given
data("mtcars", package = "datasets")
cluster_task <- makeClusterTask(data = mtcars)
print(cluster_task)
## Unsupervised task: mtcars
## Type: cluster
## Observations: 32
## Features:
## numerics factors ordered functionals
## 11 0 0 0
## Missings: FALSE
## Has weights: FALSE
## Has blocking: FALSE
## Has coordinates: FALSE
## Cost-sensitive task on iris with a random per-observation cost matrix
df <- iris
# 150 x 3 matrix of uniform costs in [0, 2000]. Multiplying by the rows of
# (1 - diag(3)) picked by each observation's Species sets one entry per row
# to zero (the column matching that observation's class).
cost <- matrix(runif(n = 150 * 3, min = 0, max = 2000), nrow = 150) *
  (1 - diag(3))[df$Species, ]
# Species is removed before the data is handed to the task
df[["Species"]] <- NULL
costsens_task <- makeCostSensTask(data = df, cost = cost)
print(costsens_task)
## Supervised task: df
## Type: costsens
## Observations: 150
## Features:
## numerics factors ordered functionals
## 4 0 0 0
## Missings: FALSE
## Has blocking: FALSE
## Has coordinates: FALSE
## Classes: 3
## y1, y2, y3
本小节涉及两个方面:访问任务和获取任务中的数据信息。
getTaskDesc(classif_task) # returns the task description object, the most detailed summary of the task
## $id
## [1] "df"
##
## $type
## [1] "classif"
##
## $target
## [1] "Class"
##
## $size
## [1] 699
##
## $n.feat
## numerics factors ordered functionals
## 0 4 5 0
##
## $has.missings
## [1] TRUE
##
## $has.weights
## [1] FALSE
##
## $has.blocking
## [1] FALSE
##
## $has.coordinates
## [1] FALSE
##
## $class.levels
## [1] "benign" "malignant"
##
## $positive
## [1] "malignant"
##
## $negative
## [1] "benign"
##
## $class.distribution
##
## benign malignant
## 458 241
##
## attr(,"class")
## [1] "ClassifTaskDesc" "SupervisedTaskDesc" "TaskDesc"
getTaskId(classif_task) # returns the task ID
## [1] "df"
getTaskType(classif_task) # returns the task type
## [1] "classif"
getTaskTargetNames(classif_task) # returns the name of the target variable
## [1] "Class"
getTaskSize(classif_task) # returns the number of observations
## [1] 699
getTaskNFeats(classif_task) # returns the number of features (a count, not the features themselves)
## [1] 9
getTaskClassLevels(classif_task) # returns the class levels of a classification task
## [1] "benign" "malignant"
head(getTaskData(classif_task)) # first rows of the task's data
## Cl.thickness Cell.size Cell.shape Marg.adhesion Epith.c.size Bare.nuclei
## 1 5 1 1 1 2 1
## 2 5 4 4 5 7 10
## 3 3 1 1 1 2 2
## 4 6 8 8 1 3 4
## 5 4 1 1 3 2 1
## 6 8 10 10 8 7 10
## Bl.cromatin Normal.nucleoli Mitoses Class
## 1 3 1 1 benign
## 2 3 2 1 benign
## 3 3 1 1 benign
## 4 3 7 1 benign
## 5 3 1 1 benign
## 6 9 7 1 malignant
getTaskFeatureNames(classif_task) # returns the feature names
## [1] "Cl.thickness" "Cell.size" "Cell.shape" "Marg.adhesion"
## [5] "Epith.c.size" "Bare.nuclei" "Bl.cromatin" "Normal.nucleoli"
## [9] "Mitoses"
head(getTaskTargets(classif_task)) # returns the target values (not the target's name)
## [1] benign benign benign benign benign malignant
## Levels: benign malignant
head(getTaskCosts(costsens_task)) # returns the cost matrix of the cost-sensitive task
## y1 y2 y3
## [1,] 0 707.3011 932.5762
## [2,] 0 77.8694 1531.8597
## [3,] 0 984.2454 753.4774
## [4,] 0 740.8373 797.5681
## [5,] 0 1457.0867 1711.5323
## [6,] 0 1129.0749 1249.6415
下面演示部分修改任务的操作,详细的操作请在https://www.rdocumentation.org/packages/mlr/versions/2.12.1/topics/capLargeValues中查看。后续我会补充进来…
## Build a sub-task from a subset of the observations (rows 4 to 17)
cluster_task_sub <- subsetTask(cluster_task, subset = seq(4, 17))
print(cluster_task_sub)
## Unsupervised task: mtcars
## Type: cluster
## Observations: 14
## Features:
## numerics factors ordered functionals
## 11 0 0 0
## Missings: FALSE
## Has weights: FALSE
## Has blocking: FALSE
## Has coordinates: FALSE
## Remove features whose value is constant
removeConstantFeatures(classif_task)
## Supervised task: df
## Type: classif
## Target: Class
## Observations: 699
## Features:
## numerics factors ordered functionals
## 0 4 5 0
## Missings: TRUE
## Has weights: FALSE
## Has blocking: FALSE
## Has coordinates: FALSE
## Classes: 2
## benign malignant
## 458 241
## Positive class: malignant
## Drop the selected features
dropFeatures(classif_task, c("Cell.size", "Cell.shape"))
## Supervised task: df
## Type: classif
## Target: Class
## Observations: 699
## Features:
## numerics factors ordered functionals
## 0 4 3 0
## Missings: TRUE
## Has weights: FALSE
## Has blocking: FALSE
## Has coordinates: FALSE
## Classes: 2
## benign malignant
## 458 241
## Positive class: malignant
## Rescale the numeric features with method = "range"
## (the summary printed below shows every column spanning 0 to 1)
# NOTE(review): this reassigns `task`, which earlier in the file held the iris
# classification task
task <- normalizeFeatures(cluster_task, method = "range")
summary(getTaskData(task))
## mpg cyl disp hp
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.2138 1st Qu.:0.0000 1st Qu.:0.1240 1st Qu.:0.1572
## Median :0.3745 Median :0.5000 Median :0.3123 Median :0.2509
## Mean :0.4124 Mean :0.5469 Mean :0.3982 Mean :0.3346
## 3rd Qu.:0.5277 3rd Qu.:1.0000 3rd Qu.:0.6358 3rd Qu.:0.4523
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.0000
## drat wt qsec vs
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.1475 1st Qu.:0.2731 1st Qu.:0.2848 1st Qu.:0.0000
## Median :0.4309 Median :0.4633 Median :0.3821 Median :0.0000
## Mean :0.3855 Mean :0.4358 Mean :0.3987 Mean :0.4375
## 3rd Qu.:0.5346 3rd Qu.:0.5362 3rd Qu.:0.5238 3rd Qu.:1.0000
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.0000
## am gear carb
## Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.1429
## Median :0.0000 Median :0.5000 Median :0.1429
## Mean :0.4062 Mean :0.3438 Mean :0.2589
## 3rd Qu.:1.0000 3rd Qu.:0.5000 3rd Qu.:0.4286
## Max. :1.0000 Max. :1.0000 Max. :1.0000
上面把有关task的内容介绍了一下,下面就该学习有关学习器的内容了。下面主要涉及到了如何构建学习器、获取学习器的信息、修改学习器和查询学习器列表。
在构建学习器时,不仅可以指定学习器的名称,还可以设置超参数、预测的输出类型(标签或概率)、设定学习器ID。下面简单地建立5个学习器展示一下如何构建学习器:
在构建学习器时,有以下三个比较重要的参数:
fix.factors.prediction = T
来避免因此导致的错误。值得一提的是,mlr包还提供了一个函数makeLearners()
,该函数可以同时对同一个数据建立多个模型。
## Random forest classifier that outputs class probabilities instead of labels
classif_lrn <- makeLearner(
  "classif.randomForest",
  predict.type = "prob",
  fix.factors.prediction = TRUE
)
## Gradient boosting regressor with two hyperparameters set up front:
## n.trees is the number of trees to fit and interaction.depth is the
## maximum depth of each tree
regr_lrn <- makeLearner(
  "regr.gbm",
  par.vals = list(n.trees = 500, interaction.depth = 3)
)
## Cox proportional hazards model with a custom learner ID
surv_lrn <- makeLearner("surv.coxph", id = "cph")
## K-means with 5 clusters
cluster_lrn <- makeLearner("cluster.kmeans", centers = 5)
## Multilabel random ferns
multilabel_lrn <- makeLearner("multilabel.rFerns")
## Several learners in one call: a classification tree (rpart) and LDA, both of
## type "classif" and both predicting probabilities
rp_lda_lrn <- makeLearners(c("rpart", "lda"), type = "classif", predict.type = "prob")
获取学习器的相关信息的时候有两种方法,第一种就是使用$
提取,第二种就是使用mlr包中的函数:
getHyperPars()
getParamSet()
getLearnerShortName()
getLearnerType()
getLearnerPackages()
## Hyperparameter values that were set explicitly, read with `$` ...
cluster_lrn$par.vals
## $centers
## [1] 5
## ... and read with the accessor function
getHyperPars(cluster_lrn)
## $centers
## [1] 5
## Full parameter set of the learner, read with `$` ...
cluster_lrn$par.set
## Type len Def Constr
## centers untyped - - -
## iter.max integer - 10 1 to Inf
## nstart integer - 1 1 to Inf
## algorithm discrete - Hartigan-Wong Hartigan-Wong,Lloyd,Forgy,MacQueen
## trace logical - - -
## Req Tunable Trafo
## centers - TRUE -
## iter.max - TRUE -
## nstart - TRUE -
## algorithm - TRUE -
## trace - FALSE -
## ... and read with the accessor function (prints the same table)
getParamSet(cluster_lrn)
## Type len Def Constr
## centers untyped - - -
## iter.max integer - 10 1 to Inf
## nstart integer - 1 1 to Inf
## algorithm discrete - Hartigan-Wong Hartigan-Wong,Lloyd,Forgy,MacQueen
## trace logical - - -
## Req Tunable Trafo
## centers - TRUE -
## iter.max - TRUE -
## nstart - TRUE -
## algorithm - TRUE -
## trace - FALSE -
getLearnerId(cluster_lrn) # learner ID
## [1] "cluster.kmeans"
getLearnerShortName(cluster_lrn) # short name of the learner
## [1] "kmeans"
getLearnerType(cluster_lrn) # learner type
## [1] "cluster"
getLearnerPackages(cluster_lrn) # packages the learner relies on
## [1] "stats" "clue"
与修改学习任务一样,在mlr包中也提供了修改学习器的函数,方便我们去修改某个学习器,而不用去重新创建,大大提高了工作效率。
下面主要演示了如何修改学习器ID、修改学习器返回目标变量的类型、修改参数、恢复自定义参数的默认值。
## Change the learner ID
surv_lrn <- setLearnerId(surv_lrn, "CoxModel")
surv_lrn
## Learner CoxModel from package survival
## Type: surv
## Name: Cox Proportional Hazard Model; Short name: coxph
## Class: surv.coxph
## Properties: numerics,factors,weights
## Predict-Type: response
## Hyperparameters:
## Change the predict type of the learner to "response"
classif_lrn <- setPredictType(classif_lrn, "response")
classif_lrn
## Learner classif.randomForest from package randomForest
## Type: classif
## Name: Random Forest; Short name: rf
## Class: classif.randomForest
## Properties: twoclass,multiclass,numerics,factors,ordered,prob,class.weights,oobpreds,featimp
## Predict-Type: response
## Hyperparameters:
## Change a hyperparameter value: set centers to 4
cluster_lrn <- setHyperPars(cluster_lrn, centers = 4)
cluster_lrn
## Learner cluster.kmeans from package stats,clue
## Type: cluster
## Name: K-Means; Short name: kmeans
## Class: cluster.kmeans
## Properties: numerics,prob
## Predict-Type: response
## Hyperparameters: centers=4
## Remove the explicitly set n.trees and interaction.depth values from the learner
## (the printout below no longer lists them among the hyperparameters)
regr_lrn <- removeHyperPars(regr_lrn, c("n.trees", "interaction.depth"))
regr_lrn
## Learner regr.gbm from package gbm
## Type: regr
## Name: Gradient Boosting Machine; Short name: gbm
## Class: regr.gbm
## Properties: missings,numerics,factors,weights,featimp
## Predict-Type: response
## Hyperparameters: distribution=gaussian,keep.data=FALSE
在mlr包中提供了上百种模型去供我们调用,如下图所示:
## List available learners together with their class and required packages
## NOTE(review): listLearners() is called without a type here, so the result is
## presumably not limited to classifiers even though the rows shown are — confirm
lrns <- listLearners()
head(lrns[c("class", "package")])
## class package
## 1 classif.ada ada,rpart
## 2 classif.adaboostm1 RWeka
## 3 classif.bartMachine bartMachine
## 4 classif.binomial stats
## 5 classif.blackboost mboost,party
## 6 classif.boosting adabag,rpart
## Classification learners that can predict probabilities, with their packages
lrns <- listLearners("classif", properties = "prob")
head(lrns[c("class", "package")])
## class package
## 1 classif.ada ada,rpart
## 2 classif.adaboostm1 RWeka
## 3 classif.bartMachine bartMachine
## 4 classif.binomial stats
## 5 classif.blackboost mboost,party
## 6 classif.boosting adabag,rpart
## Learners applicable to iris.task that can predict probabilities, with their packages
lrns <- listLearners(iris.task, properties = "prob")
head(lrns[c("class", "package")])
## class package
## 1 classif.adaboostm1 RWeka
## 2 classif.boosting adabag,rpart
## 3 classif.C50 C50
## 4 classif.cforest party
## 5 classif.ctree party
## 6 classif.cvglmnet glmnet
## With create = TRUE, listLearners() returns constructed learner objects
## (printed below) rather than a summary table; show the first two cluster learners
head(listLearners("cluster", create = TRUE), 2)
## [[1]]
## Learner cluster.cmeans from package e1071,clue
## Type: cluster
## Name: Fuzzy C-Means Clustering; Short name: cmeans
## Class: cluster.cmeans
## Properties: numerics,prob
## Predict-Type: response
## Hyperparameters: centers=2
##
##
## [[2]]
## Learner cluster.Cobweb from package RWeka
## Type: cluster
## Name: Cobweb Clustering Algorithm; Short name: cobweb
## Class: cluster.Cobweb
## Properties: numerics
## Predict-Type: response
## Hyperparameters:
上面介绍了有关任务和学习器的知识,接下来就是利用给定的任务和学习器进行训练建模了。
一般而言,额外地去创建一个学习器是没必要的。只有当你不使用默认参数,需要设定超参数、设定输出类型等时,才需要额外去创建一个学习器。
## Define the task
task <- makeClassifTask(target = "Species", data = iris)
## Build the learner explicitly
lrn <- makeLearner("classif.lda")
## Fit the learner on the task
mod <- train(learner = lrn, task = task)
print(mod)
## Model for learner.id=classif.lda; learner.class=classif.lda
## Trained on: task.id = iris; obs = 150; features = 4
## Hyperparameters:
## Fit again, this time passing the learner's name instead of a learner object
mod <- train(learner = "classif.lda", task = task)
print(mod)
## Model for learner.id=classif.lda; learner.class=classif.lda
## Trained on: task.id = iris; obs = 150; features = 4
## Hyperparameters:
## Fit a Cox model on lung.task, again naming the learner directly
mod <- train(learner = "surv.coxph", task = lung.task)
print(mod)
## Model for learner.id=surv.coxph; learner.class=surv.coxph
## Trained on: task.id = lung-example; obs = 167; features = 8
## Hyperparameters:
训练的模型中包括很多内容,比如有关学习器、任务、变量、观测数和训练时间等。返回的最终的模型可以用$learner.model
或者getLearnerModel()
来提取。下面使用Ruspini数据集(两个变量)进行聚类(K=4)。
## Load the data and look at its two-dimensional distribution
data(ruspini, package = "cluster")
plot(y ~ x, ruspini)
## Define the clustering task
ruspini_task <- makeClusterTask(data = ruspini)
## Build a k-means learner with 4 clusters
lrn <- makeLearner("cluster.kmeans", centers = 4)
## Train the model
mod <- train(lrn, ruspini_task)
mod
## Model for learner.id=cluster.kmeans; learner.class=cluster.kmeans
## Trained on: task.id = ruspini; obs = 75; features = 2
## Hyperparameters: centers=4
## Inspect what the trained model object contains
names(mod)
## [1] "learner" "learner.model" "task.desc" "subset"
## [5] "features" "factor.levels" "time" "dump"
mod$learner
## Learner cluster.kmeans from package stats,clue
## Type: cluster
## Name: K-Means; Short name: kmeans
## Class: cluster.kmeans
## Properties: numerics,prob
## Predict-Type: response
## Hyperparameters: centers=4
mod$features # names of the features the model was trained on
## [1] "x" "y"
mod$time # training time as recorded in the model object
## [1] 0
getLearnerModel(mod) # the underlying fitted model (the k-means fit printed below)
## K-means clustering with 4 clusters of sizes 20, 23, 17, 15
##
## Cluster means:
## x y
## 1 20.15000 64.9500
## 2 43.91304 146.0435
## 3 98.17647 114.8824
## 4 68.93333 19.4000
##
## Clustering vector:
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2
## 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50
## 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3 3 3 3 3 3 3
## 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75
## 3 3 3 3 3 3 3 3 3 3 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4
##
## Within cluster sum of squares by cluster:
## [1] 3689.500 3176.783 4558.235 1456.533
## (between_SS / total_SS = 94.7 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss"
## [5] "tot.withinss" "betweenss" "size" "iter"
## [9] "ifault"
训练模型使用的是train()
函数,里面有一个参数subset
可以选择所有观测的子集进行训练。然而,一般都会采用重抽样的策略,抽选子集不是一个好的选择。还有就是对于类别不平衡问题,可以设置weights
进行平衡,在mlr中提供了很多种处理不平衡数据的方法,详情请点击下方链接。
https://mlr-org.github.io/mlr-tutorial/release/html/over_and_undersampling/index.html
## Number of observations in the task
n <- getTaskSize(bh.task)
## Draw one third of the observations as the training set
## (n / 3 is fractional here; the model printout below reports 168 observations)
set.seed(45)
train_set <- sample(n, size = n / 3)
## Fit a linear model on that subset only
mod <- train(learner = "regr.lm", task = bh.task, subset = train_set)
print(mod)
## Model for learner.id=regr.lm; learner.class=regr.lm
## Trained on: task.id = BostonHousing-example; obs = 168; features = 13
## Hyperparameters:
## Compute one weight per observation: the inverse of its class frequency
target <- getTaskTargets(bc.task)
tab <- as.numeric(table(target))
# as.integer() makes explicit that the factor's level codes index the counts
w <- 1 / tab[as.integer(target)]
## Train with the observation weights
mod <- train("classif.rpart", task = bc.task, weights = w)
mod
## Model for learner.id=classif.rpart; learner.class=classif.rpart
## Trained on: task.id = BreastCancer-example; obs = 683; features = 9
## Hyperparameters: xval=0
预测大概就是把新数据传入训练好的模型中,经过模型的计算返回结果的过程。上面已经介绍过如何建立模型了,那么现在需要的就是如何将新数据传入模型。在mlr包中提供了两种方式:
注:使用mlr包进行预测的时候,不需要去剔除目标变量,它将会被自动删除。
下面使用BostonHousing和iris两个数据来演示两种传入数据进行预测的方式。
## Prediction on a subset of an existing task
n <- getTaskSize(bh.task)
# Odd rows are used for training, even rows for testing
train_set <- seq(from = 1, to = n, by = 2)
test_set <- seq(from = 2, to = n, by = 2)
lrn <- makeLearner("regr.gbm", n.trees = 100)
mod <- train(learner = lrn, task = bh.task, subset = train_set)
task_pred <- predict(mod, task = bh.task, subset = test_set)
print(task_pred)
## Prediction: 253 observations
## predict.type: response
## threshold:
## time: 0.00
## id truth response
## 2 2 21.6 22.22588
## 4 4 33.4 23.18013
## 6 6 28.7 22.37618
## 8 8 27.1 22.13908
## 10 10 18.9 22.13908
## 12 12 18.9 22.13908
## ... (#rows: 253, #cols: 3)
## Prediction on new data passed as a data frame
n <- nrow(iris)
# Column 5 is dropped from both halves; odd rows train, even rows are the new data
iris_train <- iris[seq(from = 1, to = n, by = 2), -5]
iris_test <- iris[seq(from = 2, to = n, by = 2), -5]
task <- makeClusterTask(data = iris_train)
mod <- train(learner = "cluster.kmeans", task = task)
newdata_pred <- predict(mod, newdata = iris_test)
print(newdata_pred)
## Prediction: 75 observations
## predict.type: response
## threshold:
## time: 0.00
## response
## 2 2
## 4 2
## 6 2
## 8 2
## 10 2
## 12 2
## ... (#rows: 75, #cols: 1)
下面呢,我们分别用不同的方法来获取上面的两个预测的相关信息。
getPredictionTruth()
和getPredictionResponse()
可以获取具体的信息。
head(as.data.frame(task_pred))
## id truth response
## 2 2 21.6 22.22588
## 4 4 33.4 23.18013
## 6 6 28.7 22.37618
## 8 8 27.1 22.13908
## 10 10 18.9 22.13908
## 12 12 18.9 22.13908
head(as.data.frame(newdata_pred)) # the prediction converted to a data frame
## response
## 2 2
## 4 2
## 6 2
## 8 2
## 10 2
## 12 2
head(getPredictionTruth(task_pred)) # true target values
## [1] 21.6 33.4 28.7 27.1 18.9 18.9
head(getPredictionResponse(task_pred)) # predicted values
## [1] 22.22588 23.18013 22.37618 22.13908 22.13908 22.13908
## List regression learners that can return standard errors (class and name columns)
head(listLearners("regr", check.packages = FALSE, properties = "se")[c("class", "name")])
## class
## 1 regr.bcart
## 2 regr.bgp
## 3 regr.bgpllm
## 4 regr.blm
## 5 regr.btgp
## 6 regr.btgpllm
## name
## 1 Bayesian CART
## 2 Bayesian Gaussian Process
## 3 Bayesian Gaussian Process with jumps to the Limiting Linear Model
## 4 Bayesian Linear Model
## 5 Bayesian Treed Gaussian Process
## 6 Bayesian Treed Gaussian Process with jumps to the Limiting Linear Model
## Build a linear-model learner whose predictions include standard errors
lrn_lm <- makeLearner("regr.lm", predict.type = "se")
mod_lm <- train(learner = lrn_lm, task = bh.task, subset = train_set)
task_pred_lm <- predict(mod_lm, task = bh.task, subset = test_set)
print(task_pred_lm)
## Prediction: 253 observations
## predict.type: se
## threshold:
## time: 0.00
## id truth response se
## 2 2 21.6 24.83734 0.7501615
## 4 4 33.4 28.38206 0.8742590
## 6 6 28.7 25.16725 0.8652139
## 8 8 27.1 19.38145 1.1963265
## 10 10 18.9 18.66449 1.1793944
## 12 12 18.9 21.25802 1.0727918
## ... (#rows: 253, #cols: 4)
getPredictionSE(task_pred_lm) # standard error of every prediction (all 253 values are printed)
## [1] 0.7501615 0.8742590 0.8652139 1.1963265 1.1793944 1.0727918 0.8429301
## [8] 0.8501582 0.7679939 0.7978658 0.8242887 0.9518889 0.8870152 0.8855407
## [15] 0.8701704 0.9574664 0.8840397 0.7264163 0.7792051 1.2116561 1.1769924
## [22] 1.0915279 0.8271369 1.0202378 0.8842189 0.8842040 0.8469392 1.3276524
## [29] 1.4106337 0.8445755 1.2408730 1.0284519 1.1041909 0.8291724 0.7543209
## [36] 0.9929409 1.1375092 0.8367674 0.8402063 0.9082750 0.7022658 0.5771416
## [43] 0.6308225 0.7278137 0.7811490 0.8189424 1.1384919 0.8335267 1.1364319
## [50] 0.9138132 0.8760649 0.8655742 1.0203456 0.9359339 0.9109483 0.7508824
## [57] 0.8448710 0.8043901 0.7906399 0.7786039 1.6494629 1.7380621 1.6491231
## [64] 1.0193436 1.0088684 1.1054014 1.0286131 1.0927726 1.1071940 1.0774204
## [71] 1.4149076 1.5083177 1.6783724 1.5859976 1.4461011 1.5548512 1.4818863
## [78] 1.9595402 1.2321197 1.6034097 1.3099265 1.6743209 1.2322197 1.1830271
## [85] 1.0915488 1.1511754 0.8948695 0.9734422 0.8509782 0.8854074 0.9652907
## [92] 1.1272297 0.8477232 1.1092160 0.9952516 0.8699082 0.9101939 1.2047344
## [99] 1.2692851 1.3234582 1.2451465 1.5187036 0.9656436 0.6745382 1.5195856
## [106] 1.4771975 0.8085293 0.6804735 0.8445263 1.2367161 1.4679595 0.7451217
## [113] 1.3120481 0.7550944 1.0506752 0.7824162 1.0591481 0.6721549 0.7658662
## [120] 0.5971137 0.8165095 0.8888276 1.1275579 1.2189724 0.9495364 0.9423921
## [127] 1.7165446 1.1507946 1.5170934 1.3085342 1.2224821 1.2380792 1.4748090
## [134] 1.3338453 1.3168927 0.9507626 1.3621199 0.7914587 1.3003635 0.9604363
## [141] 0.8560980 1.7185417 0.8309263 0.8365242 0.8283471 1.3929607 1.2917833
## [148] 1.2660946 1.2910508 1.0252428 0.6695624 0.8263094 1.0279423 1.0776391
## [155] 0.6124678 0.7961893 0.6542160 0.6485521 0.5305173 0.5148376 0.6159465
## [162] 0.7209050 1.0336628 0.6941150 1.3315840 0.7510469 1.0942469 1.0304272
## [169] 0.9124961 0.8779543 0.8502599 0.8013981 1.1782294 1.1445984 1.1966195
## [176] 1.4522210 1.9706832 1.6584676 1.4066579 1.0953827 0.9976729 1.3700237
## [183] 2.2591492 1.9818337 1.4634298 1.0009532 1.2710083 1.0686214 1.0153052
## [190] 0.8512834 0.9063260 0.8898067 1.0443378 1.1404651 0.8633209 0.8303382
## [197] 0.7996595 0.8201927 0.8099873 1.1723586 0.8322803 0.9681173 2.3119839
## [204] 0.9942718 1.0421645 1.3309447 1.1610612 1.3856653 0.9935274 1.3833241
## [211] 0.7574302 1.3899578 1.2231405 1.5140112 1.1854390 1.2557147 1.0655491
## [218] 1.1503370 1.3098914 0.8885734 0.9079921 0.9101242 1.2690583 0.8749875
## [225] 0.7832464 0.9013180 1.1767535 1.2340304 1.3179744 0.8855876 0.9116910
## [232] 0.9899714 1.0197644 0.9329738 0.9604394 1.1078087 0.9803479 1.0075409
## [239] 0.8705739 0.8214196 1.0967451 1.2078927 1.0442883 1.0051250 1.8768373
## [246] 1.8896669 0.8663897 1.2048017 0.6488765 0.7503097 0.9700472 1.1082776
## [253] 1.0145445
## Clustering with cluster.cmeans, predicting membership probabilities
lrn <- makeLearner("cluster.cmeans", predict.type = "prob")
mod <- train(learner = lrn, task = mtcars.task)
pred <- predict(mod, task = mtcars.task)
head(getPredictionProbabilities(pred))
## 1 2
## Mazda RX4 0.97959271 0.020407288
## Mazda RX4 Wag 0.97963293 0.020367069
## Datsun 710 0.99266074 0.007339256
## Hornet 4 Drive 0.54290616 0.457093843
## Hornet Sportabout 0.01870482 0.981295175
## Valiant 0.75745403 0.242545967
## Classification with rpart, predicting class probabilities
lrn <- makeLearner("classif.rpart", predict.type = "prob")
mod <- train(learner = lrn, task = iris.task)
pred <- predict(mod, newdata = iris)
head(as.data.frame(pred))
## truth prob.setosa prob.versicolor prob.virginica response
## 1 setosa 1 0 0 setosa
## 2 setosa 1 0 0 setosa
## 3 setosa 1 0 0 setosa
## 4 setosa 1 0 0 setosa
## 5 setosa 1 0 0 setosa
## 6 setosa 1 0 0 setosa
head(getPredictionProbabilities(pred)) # probabilities only
## setosa versicolor virginica
## 1 1 0 0
## 2 1 0 0
## 3 1 0 0
## 4 1 0 0
## 5 1 0 0
## 6 1 0 0
calculateConfusionMatrix(pred) # confusion matrix with absolute counts
## predicted
## true setosa versicolor virginica -err.-
## setosa 50 0 0 0
## versicolor 0 49 1 1
## virginica 0 5 45 5
## -err.- 0 5 1 6
## Confusion matrix with relative frequencies in addition to absolute counts
conf_matrix <- calculateConfusionMatrix(pred, relative = TRUE)
conf_matrix
## Relative confusion matrix (normalized by row/column):
## predicted
## true setosa versicolor virginica -err.-
## setosa 1.00/1.00 0.00/0.00 0.00/0.00 0.00
## versicolor 0.00/0.00 0.98/0.91 0.02/0.02 0.02
## virginica 0.00/0.00 0.10/0.09 0.90/0.98 0.10
## -err.- 0.00 0.09 0.02 0.04
##
##
## Absolute confusion matrix:
## predicted
## true setosa versicolor virginica -err.-
## setosa 50 0 0 0
## versicolor 0 49 1 1
## virginica 0 5 45 5
## -err.- 0 5 1 6
conf_matrix$relative.row # relative frequencies; presumably the row-normalized part of the matrix above
## setosa versicolor virginica -err-
## setosa 1 0.00 0.00 0.00
## versicolor 0 0.98 0.02 0.02
## virginica 0 0.10 0.90 0.10
lrn <- makeLearner("classif.rpart", predict.type = "prob")
mod <- train(lrn, task = sonar.task)
## Look up the positive class of this task
getTaskDesc(sonar.task)$positive
## [1] "M"
## Inspect the default threshold
pred1 <- predict(mod, sonar.task)
pred1$threshold
## M R
## 0.5 0.5
## Set a custom threshold (0.9 for the positive class)
pred2 <- setThreshold(pred1, 0.9)
pred2$threshold
## M R
## 0.9 0.1
pred2
## Prediction: 208 observations
## predict.type: prob
## threshold: M=0.90,R=0.10
## time: 0.00
## id truth prob.M prob.R response
## 1 1 R 0.1060606 0.8939394 R
## 2 2 R 0.7333333 0.2666667 R
## 3 3 R 0.0000000 1.0000000 R
## 4 4 R 0.1060606 0.8939394 R
## 5 5 R 0.9250000 0.0750000 M
## 6 6 R 0.0000000 1.0000000 R
## ... (#rows: 208, #cols: 5)
## Compare the confusion matrices under the two thresholds
calculateConfusionMatrix(pred1)
## predicted
## true M R -err.-
## M 95 16 16
## R 10 87 10
## -err.- 10 16 26
calculateConfusionMatrix(pred2)
## predicted
## true M R -err.-
## M 84 27 27
## R 6 91 6
## -err.- 6 27 33
## Probability of the positive class for each observation
head(getPredictionProbabilities(pred1))
## [1] 0.1060606 0.7333333 0.0000000 0.1060606 0.9250000 0.0000000
## Probabilities of both classes for each observation
head(getPredictionProbabilities(pred1, cl = c("M", "R")))
## M R
## 1 0.1060606 0.8939394
## 2 0.7333333 0.2666667
## 3 0.0000000 1.0000000
## 4 0.1060606 0.8939394
## 5 0.9250000 0.0750000
## 6 0.0000000 1.0000000
## Thresholds can be set for multiclass problems as well
lrn <- makeLearner("classif.rpart", predict.type = "prob")
mod <- train(lrn, iris.task)
pred <- predict(mod, newdata = iris)
pred$threshold
## setosa versicolor virginica
## 0.3333333 0.3333333 0.3333333
table(as.data.frame(pred)$response)
##
## setosa versicolor virginica
## 50 54 46
pred <- setThreshold(pred, c(setosa = 0.01, versicolor = 50, virginica = 1))
pred$threshold
## setosa versicolor virginica
## 0.01 50.00 1.00
table(as.data.frame(pred)$response)
##
## setosa versicolor virginica
## 50 0 100
对于不同的任务,会默认选择数据的前两个特征进行相应的学习器的训练建立模型(可以自定义两个变量),然后进行预测,把预测结果进行可视化。 对于回归任务,选择单一或者两个变量会产生不同的效果。
## Classification
lrn <- makeLearner("classif.rpart", id = "CART")
plotLearnerPrediction(learner = lrn, task = iris.task)
## Clustering, with two explicitly chosen features
lrn <- makeLearner("cluster.kmeans")
plotLearnerPrediction(
  learner = lrn,
  task = mtcars.task,
  features = c("disp", "drat"),
  cv = 0
)
## Regression
plotLearnerPrediction(learner = "regr.lm", task = bh.task, features = "lstat") # one feature
plotLearnerPrediction(learner = "regr.lm", task = bh.task, features = c("lstat", "rm")) # two features
不同的任务有不同的评估方法,可以使用listMeasures()
来获取指定任务的所有评估方法。为了方便,mlr包中会针对不同的任务指定一个默认的评估方法。比如说在回归任务中,默认使用均方误差(mse),在分类任务中使用误分类率(mmce)等。如果想知道具体的默认方法,可以使用getDefaultMeasure()
去查看各任务所对应的默认评估方法。
# List the measures suitable for multiclass classification
listMeasures("classif", properties = "classif.multi")
## [1] "featperc" "mmce" "lsr"
## [4] "bac" "qsr" "timeboth"
## [7] "multiclass.aunp" "timetrain" "multiclass.aunu"
## [10] "ber" "timepredict" "multiclass.brier"
## [13] "ssr" "acc" "logloss"
## [16] "wkappa" "multiclass.au1p" "multiclass.au1u"
## [19] "kappa"
# List the measures suitable for iris.task
listMeasures(iris.task)
## [1] "featperc" "mmce" "lsr"
## [4] "bac" "qsr" "timeboth"
## [7] "multiclass.aunp" "timetrain" "multiclass.aunu"
## [10] "ber" "timepredict" "multiclass.brier"
## [13] "ssr" "acc" "logloss"
## [16] "wkappa" "multiclass.au1p" "multiclass.au1u"
## [19] "kappa"
# Show the default measure for iris.task
getDefaultMeasure(iris.task)
## Name: Mean misclassification error
## Performance measure: mmce
## Properties: classif,classif.multi,req.pred,req.truth
## Minimize: TRUE
## Best: 0; Worst: 1
## Aggregated by: test.mean
## Arguments:
## Note: Defined as: mean(response != truth)
# Show the default measure for a regr.lm learner
getDefaultMeasure(makeLearner("regr.lm"))
## Name: Mean of squared errors
## Performance measure: mse
## Properties: regr,req.pred,req.truth
## Minimize: TRUE
## Best: 0; Worst: Inf
## Aggregated by: test.mean
## Arguments:
## Note: Defined as: mean((response - truth)^2)
n <- getTaskSize(bh.task)
lrn <- makeLearner("regr.gbm", n.trees = 1000)
# Train on the odd rows, predict on the even rows
mod <- train(lrn, task = bh.task, subset = seq(1, n, 2))
pred <- predict(mod, task = bh.task, subset = seq(2, n, 2))
performance(pred) # no measure given; the output below shows mse being used
## mse
## 42.83207
performance(pred, measures = medse) # a single explicitly chosen measure
## medse
## 9.315383
performance(pred, measures = list(mse, medse, mae)) # several measures at once
## mse medse mae
## 42.832075 9.315383 4.557009
performance(pred, measures = timetrain, model = mod) # the fitted model is passed in for timetrain
## timetrain
## 0.28
## Clustering: the task is passed to performance() along with the prediction
lrn <- makeLearner("cluster.kmeans", centers = 3)
mod <- train(learner = lrn, task = mtcars.task)
pred <- predict(mod, task = mtcars.task)
performance(pred, measures = dunn, task = mtcars.task)
## dunn
## 0.1462919
## Binary classification: AUC computed from probability predictions
lrn <- makeLearner("classif.rpart", predict.type = "prob")
mod <- train(learner = lrn, task = sonar.task)
pred <- predict(mod, task = sonar.task)
performance(pred, measures = auc)
## auc
## 0.9224018
## Inspect the structure of the mmce (mean misclassification error) measure object
str(mmce)
## List of 10
## $ id : chr "mmce"
## $ minimize : logi TRUE
## $ properties: chr [1:4] "classif" "classif.multi" "req.pred" "req.truth"
## $ fun :function (task, model, pred, feats, extra.args)
## $ extra.args: list()
## $ best : num 0
## $ worst : num 1
## $ name : chr "Mean misclassification error"
## $ note : chr "Defined as: mean(response != truth)"
## $ aggr :List of 4
## ..$ id : chr "test.mean"
## ..$ name : chr "Test mean"
## ..$ fun :function (task, perf.test, perf.train, measure, group, pred)
## ..$ properties: chr "req.test"
## ..- attr(*, "class")= chr "Aggregation"
## - attr(*, "class")= chr "Measure"
## Train on the odd rows and predict on the even rows
lrn <- makeLearner("classif.lda", predict.type = "prob")
n <- getTaskSize(sonar.task)
mod <- train(learner = lrn, task = sonar.task, subset = seq(from = 1, to = n, by = 2))
pred <- predict(mod, task = sonar.task, subset = seq(from = 2, to = n, by = 2))
## Evaluate the model (the threshold is 0.5)
performance(pred, measures = list(fpr, fnr, mmce))
## fpr fnr mmce
## 0.2500000 0.3035714 0.2788462
## Compute the measures across different thresholds and plot them
d <- generateThreshVsPerfData(pred, measures = list(fpr, fnr, mmce))
plotThreshVsPerf(d)
## ROC-related measures together with the confusion matrix
r <- calculateROCMeasures(pred)
r
## predicted
## true M R
## M 39 17 tpr: 0.7 fnr: 0.3
## R 12 36 fpr: 0.25 tnr: 0.75
## ppv: 0.76 for: 0.32 lrp: 2.79 acc: 0.72
## fdr: 0.24 npv: 0.68 lrm: 0.4 dor: 6.88
##
##
## Abbreviations:
## tpr - True positive rate (Sensitivity, Recall)
## fpr - False positive rate (Fall-out)
## fnr - False negative rate (Miss rate)
## tnr - True negative rate (Specificity)
## ppv - Positive predictive value (Precision)
## for - False omission rate
## lrp - Positive likelihood ratio (LR+)
## fdr - False discovery rate
## npv - Negative predictive value
## acc - Accuracy
## lrm - Negative likelihood ratio (LR-)
## dor - Diagnostic odds ratio
mlr包中提供了6种重采样策略:
## 3-fold cross-validation
rdesc <- makeResampleDesc(method = "CV", iters = 3)
rdesc
## Resample description: cross-validation with 3 iterations.
## Predict: test
## Stratification: FALSE
## Holdout split
rdesc <- makeResampleDesc(method = "Holdout")
rdesc
## Resample description: holdout with 0.67 split rate.
## Predict: test
## Stratification: FALSE
下面是对BostonHousing拟合其medv(因变量)值,使用了3折交叉验证的重采样策略。目前我对于最后返回的medv的理解就是三次结果取平均,因为最后的mse是三个模型mse的均值。
## Resample a linear model on bh.task with 3-fold cross-validation
rdesc <- makeResampleDesc(method = "CV", iters = 3)
set.seed(45)
r <- resample(learner = "regr.lm", task = bh.task, resampling = rdesc)
r
## Resample Result
## Task: BostonHousing-example
## Learner: regr.lm
## Aggr perf: mse.test.mean=24.0414320
## Runtime: 0.044003