使用 RMarkdown 的 child 参数，进行文档拼接。
这样拼接以后的笔记方便复习。
相关问题提交到 Issue

参考 https://github.com/lukasz-f/data-camp 整合代码。

1 notebook

附录

1.1 调参案例

%load_ext blackcellmagic

The blackcellmagic extension is already loaded. To reload it, use:
  %reload_ext blackcellmagic

import pandas as pd
import xgboost as xgb
import numpy as np

# Baseline XGBoost parameters. The tuning sweeps further down pass this dict
# to xgb.cv and overwrite individual entries in place.
default_params = {
    "eval_metric": "mae",
    "booster": "gbtree",
    "eta": 0.05,
    "subsample": 0.35,
    "colsample_bytree": 0.7,
    "num_parallel_tree": 3,
    "min_child_weight": 40,
    "gamma": 10,
    "max_depth": 3,
    "silent": 1,
}

# num_boost_round = 3000,

以上是目前默认超参数。

# 导入对象
def cauchyobj(preds, dtrain):
    """Custom Cauchy-loss objective for the native XGBoost API.

    Args:
        preds: current predictions.
        dtrain: object exposing ``get_label()`` that returns the targets
            (the notes pass an ``xgb.DMatrix``).

    Returns:
        Tuple ``(grad, hess)`` evaluated at the residual ``preds - labels``,
        using a fixed scale of 5000.
    """
    targets = dtrain.get_label()
    scale = 5000
    residual = preds - targets
    residual_sq = residual**2
    scale_sq = scale**2
    # First derivative of the loss: r / (1 + r^2 / c^2).
    gradient = residual / (residual_sq / scale_sq + 1)
    # Second derivative: c^2 * (c^2 - r^2) / (r^2 + c^2)^2.
    hessian = -scale_sq * (residual_sq - scale_sq) / (residual_sq + scale_sq) ** 2
    return gradient, hessian
# IPython magics: restore y and X that were saved earlier with `%store`.
%store -r y
%store -r X
# Wrap the features and labels in the DMatrix that xgb.cv / xgb.train consume.
dtrain = xgb.DMatrix(X, label = y)

D:\install\miniconda\lib\site-packages\xgboost\core.py:587: FutureWarning: Series.base is deprecated and will be removed in a future version
  if getattr(data, 'base', None) is not None and \
D:\install\miniconda\lib\site-packages\xgboost\core.py:588: FutureWarning: Series.base is deprecated and will be removed in a future version
  data.base is not None and isinstance(data, np.ndarray) \

# Baseline run: cross-validate the default parameters with the custom Cauchy
# objective, logging every 50 rounds.
untuned_cv_results_mae = xgb.cv(
    params=default_params,
    dtrain=dtrain,
    num_boost_round=3000,
    early_stopping_rounds=25,
    obj=cauchyobj,
    seed=123,
    as_pandas=True,
    verbose_eval=50,
)

[0] train-mae:11.2447+0.085538  test-mae:11.2587+0.16325
[50]    train-mae:7.43714+0.0840341 test-mae:7.93709+0.137656
[100]   train-mae:6.99125+0.0716425 test-mae:7.91611+0.143741

# The tracked metric is MAE (eval_metric="mae"), so label it as such, and take
# the last row as a scalar instead of %-formatting a one-element Series.
print("Untuned mae: %f" % untuned_cv_results_mae["test-mae-mean"].iloc[-1])

Untuned rmse: 7.899204

1.1.1 nrounds

# Candidate values for num_boost_round.
num_rounds = [50, 75, 100, 150, 200, 3000]

# Last-row CV test MAE for each candidate, in the same order as num_rounds.
final_mae_per_round = []

# Run one cross-validation per candidate number of boosting rounds.
for curr_num_rounds in num_rounds:
    cv_results = xgb.cv(
        obj=cauchyobj,
        dtrain=dtrain,
        params=default_params,
        early_stopping_rounds=25,
        num_boost_round=curr_num_rounds,
        as_pandas=True,
        seed=123,
    )

    # Record the test MAE from the last row of the returned history.
    final_mae_per_round.append(cv_results["test-mae-mean"].iloc[-1])

# Pair candidates with their scores once, after the sweep has finished
# (previously rebuilt on every iteration, and undefined for an empty list).
num_rounds_maes = list(zip(num_rounds, final_mae_per_round))

# Display the results as a table.
pd.DataFrame(num_rounds_maes, columns=["num_boosting_rounds", "mae"])

	num_boosting_rounds	mae
0	50	7.940525
1	75	7.901142
2	100	7.916022
3	150	7.899204
4	200	7.899204
5	3000	7.899204

1.1.2 early stopping

# Candidate values for early_stopping_rounds: 5, 10, ..., 95, plus None
# (no early stopping).
early_stopping_round_list = list(range(5, 100, 5))
early_stopping_round_list.append(None)

# Last-row CV test MAE for each candidate, in the same order as the list above.
final_mae_per_round = []

# Run one cross-validation per early_stopping_rounds candidate.
for curr_val in early_stopping_round_list:
    cv_results = xgb.cv(
        obj=cauchyobj,
        dtrain=dtrain,
        params=default_params,
        early_stopping_rounds=curr_val,
        num_boost_round=150,
        as_pandas=True,
        seed=123,
    )

    # Record the test MAE from the last row of the returned history.
    final_mae_per_round.append(cv_results["test-mae-mean"].iloc[-1])

# Pair candidates with their scores once, after the sweep has finished
# (previously rebuilt on every iteration).
early_stopping_round_maes = list(zip(early_stopping_round_list, final_mae_per_round))

# Display the results as a table.
pd.DataFrame(early_stopping_round_maes, columns=["early_stopping_rounds", "mae"])

	early_stopping_rounds	mae
0	5.0	7.899892
1	10.0	7.899892
2	15.0	7.899204
3	20.0	7.899204
4	25.0	7.899204
5	30.0	7.899204
6	35.0	7.899204
7	40.0	7.899204
8	45.0	7.899204
9	50.0	7.899204
10	55.0	7.899204
11	60.0	7.899204
12	65.0	7.899204
13	70.0	7.899204
14	75.0	7.939355
15	80.0	7.939355
16	85.0	7.939355
17	90.0	7.939355
18	95.0	7.939355
19	NaN	7.939355

1.1.3 learning rate

default_params

{'eval_metric': 'mae', 'booster': 'gbtree', 'eta': 0.05, 'subsample': 0.35, 'colsample_bytree': 0.7, 'num_parallel_tree': 3, 'min_child_weight': 40, 'gamma': 10, 'max_depth': 3, 'silent': 1}

# Learning rates to compare, and the CV test MAE recorded for each of them.
eta_vals = [0.001, 0.01, 0.1, 0.2, 0.3]
best_mae = []

for curr_val in eta_vals:
    # The shared dict is updated in place, so it keeps the last eta tried
    # once the loop ends.
    default_params["eta"] = curr_val

    cv_results = xgb.cv(
        params=default_params,
        dtrain=dtrain,
        num_boost_round=150,
        early_stopping_rounds=15,
        obj=cauchyobj,
        seed=123,
        as_pandas=True,
    )

    # Keep the test MAE from the last row of the CV history.
    best_mae.append(cv_results["test-mae-mean"].iloc[-1])

# Display one row per learning rate.
pd.DataFrame({"eta": eta_vals, "best_mae": best_mae})

	eta	best_mae
0	0.001	10.841469
1	0.010	8.241155
2	0.100	7.941816
3	0.200	7.941957
4	0.300	7.995030

# Keep the learning rate with the lowest best_mae in the table above.
default_params["eta"] = 0.100

1.1.4 max_depth

# Tree depths to compare, and the CV test MAE recorded for each of them.
max_depths = [2, 3, 5, 6, 10, 20]
best_mae = []

for curr_val in max_depths:
    # The shared dict is updated in place, so it keeps the last depth tried
    # once the loop ends.
    default_params["max_depth"] = curr_val

    cv_results = xgb.cv(
        params=default_params,
        dtrain=dtrain,
        num_boost_round=150,
        early_stopping_rounds=20,
        obj=cauchyobj,
        seed=123,
        as_pandas=True,
    )

    # Keep the test MAE from the last row of the CV history.
    best_mae.append(cv_results["test-mae-mean"].iloc[-1])

# Display one row per depth.
pd.DataFrame({"max_depth": max_depths, "best_mae": best_mae})

	max_depth	best_mae
0	2	7.926231
1	3	7.941816
2	5	7.916245
3	6	7.929146
4	10	7.937616
5	20	7.937616

# Keep the depth with the lowest best_mae in the table above.
default_params["max_depth"] = 5

1.1.5 colsample_bytree

# Candidate values for colsample_bytree, and the CV test MAE recorded for each.
colsample_bytree_vals = [0.1, 0.5, 0.8, 0.9, 0.95, 1]
best_mae = []

# Systematically vary the hyperparameter value
for curr_val in colsample_bytree_vals:

    # Updates the shared dict in place; the last candidate stays set after the loop.
    default_params["colsample_bytree"] = curr_val

    # Perform cross-validation
    # NOTE(review): unlike the nrounds / early-stopping / eta / max_depth sweeps,
    # this call does not pass obj=cauchyobj, and default_params has no
    # "objective" entry, so it presumably trains with XGBoost's default
    # objective. The MAE values are then not directly comparable with the
    # earlier tables -- confirm whether dropping the custom objective was intended.
    cv_results = xgb.cv(
        dtrain=dtrain,
        params=default_params,
        early_stopping_rounds=20,
        num_boost_round=150,
        metrics="mae",
        as_pandas=True,
        seed=123,
    )

    # Append the last-row test MAE to best_mae
    best_mae.append(cv_results["test-mae-mean"].tail().values[-1])

# Display the resultant DataFrame
pd.DataFrame(
    list(zip(colsample_bytree_vals, best_mae)),
    columns=["colsample_bytree", "best_mae"],
)

	colsample_bytree	best_mae
0	0.10	8.044577
1	0.50	7.795100
2	0.80	7.761571
3	0.90	7.761844
4	0.95	7.760438
5	1.00	7.826004

# NOTE(review): 0.95 scored marginally lower than 0.8 in the table above
# (7.760438 vs 7.761571); 0.8 is kept here -- confirm this is deliberate.
default_params["colsample_bytree"] = 0.8

1.1.6 subsample

# Candidate values for subsample, and the CV test MAE recorded for each.
subsample_vals = [0.1, 0.5, 0.8, 0.9, 0.95, 1]
best_mae = []

# Systematically vary the hyperparameter value
for curr_val in subsample_vals:

    # Updates the shared dict in place; the last candidate stays set after the loop.
    default_params["subsample"] = curr_val

    # Perform cross-validation
    # NOTE(review): this call does not pass obj=cauchyobj (the nrounds /
    # early-stopping / eta / max_depth sweeps do), so it presumably trains with
    # XGBoost's default objective -- confirm this is intended before comparing
    # these MAE values with the earlier tables.
    cv_results = xgb.cv(
        dtrain=dtrain,
        params=default_params,
        early_stopping_rounds=20,
        num_boost_round=150,
        metrics="mae",
        as_pandas=True,
        seed=123,
    )

    # Append the last-row test MAE to best_mae
    best_mae.append(cv_results["test-mae-mean"].tail().values[-1])

# Display the resultant DataFrame
pd.DataFrame(
    list(zip(subsample_vals, best_mae)),
    columns=["subsample", "best_mae"],
)

	subsample	best_mae
0	0.10	8.181179
1	0.50	7.829253
2	0.80	7.781617
3	0.90	7.745327
4	0.95	7.747116
5	1.00	7.761571

# Keep the subsample ratio with the lowest best_mae in the table above.
default_params["subsample"] = 0.90

已经降低了误差，因此最好的超参数是

default_params

{'eval_metric': 'mae', 'booster': 'gbtree', 'eta': 0.1, 'subsample': 0.9, 'colsample_bytree': 0.8, 'num_parallel_tree': 3, 'min_child_weight': 40, 'gamma': 10, 'max_depth': 5, 'silent': 1}

1.1.7 search

from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

X.__class__
X.values.__class__

numpy.ndarray

y.__class__
y.values.__class__

numpy.ndarray

default_params

def cauchyobj2(preds, labels):
    """Cauchy-loss objective for XGBoost's scikit-learn wrappers.

    ``XGBRegressor(objective=...)`` calls the objective positionally as
    ``objective(y_true, y_pred)``. The parameter names are kept unchanged for
    backward compatibility, but under that calling convention the first
    argument carries the true targets and the second the current predictions.

    Args:
        preds: first positional argument; receives the true targets (y_true).
        labels: second positional argument; receives the predictions (y_pred).

    Returns:
        Tuple ``(grad, hess)``: first and second derivative of the Cauchy
        loss with respect to the predictions, using a fixed scale of 5000.
    """
    # Re-bind to names that match what the wrapper actually passes in.
    y_true, y_pred = preds, labels
    c = 5000
    # The residual must be prediction minus target. Subtracting in the order
    # suggested by the parameter names flipped the sign of the gradient.
    x = y_pred - y_true
    grad = x / (x**2 / c**2 + 1)
    hess = -c**2 * (x**2 - c**2) / (x**2 + c**2) ** 2
    return grad, hess

# Search space: three candidates for each tuned parameter; the remaining
# entries are pinned to a single value.
# NOTE(review): num_boost_round and early_stopping_rounds are arguments of
# xgb.train / xgb.cv; the XGBRegressor repr in the output below shows
# n_estimators=100 -- confirm these two entries have any effect here.
gbm_param_grid = {
    "num_boost_round": [125, 150, 175],
    "early_stopping_rounds": [15, 20, 25],
    "eta": [0.05, 0.1, 0.15],
    "max_depth": [4, 5, 6],
    "colsample_bytree": [0.7, 0.8, 0.9],
    "subsample": [0.8, 0.9, 1],
    "gamma": [10],
    "silent": [1],
    "eval_metric": ["mae"],
    "booster": ["gbtree"],
    "num_parallel_tree": [3],
    "min_child_weight": [40],
}
# Instantiate the regressor with the custom objective: gbm
gbm = xgb.XGBRegressor(objective=cauchyobj2)
# Set up the randomized search (not a grid search): randomized_mse
# NOTE(review): no `scoring` is passed, so candidates are presumably ranked by
# the estimator's own score() rather than by MAE, and no random_state is set,
# so the sampled candidates differ between runs -- confirm both are intended.
randomized_mse = RandomizedSearchCV(
    estimator=gbm, param_distributions=gbm_param_grid, cv=3, verbose=1,
)

# Fit randomized_mse to the data
randomized_mse.fit(X.values, y.values)

Fitting 3 folds for each of 10 candidates, totalling 30 fits

D:\install\miniconda\lib\site-packages\sklearn\model_selection\_search.py:814: DeprecationWarning: The default of the `iid` parameter will change from True to False in version 0.22 and will be removed in 0.24. This will change numeric results when test-set sizes are unequal. DeprecationWarning)

RandomizedSearchCV(cv=3, error_score=‘raise-deprecating’, estimator=XGBRegressor(base_score=0.5, booster=‘gbtree’, colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1, gamma=0, importance_type=‘gain’, learning_rate=0.1, max_delta_step=0, max_depth=3, min_child_weight=1, missing=None, n_estimators=100, n_jobs=1, nthread=None, objective=<function cauchyobj2 at 0… ‘colsample_bytree’: [0.7, 0.8, 0.9], ‘early_stopping_rounds’: [15, 20, 25], ‘eta’: [0.05, 0.1, 0.15], ‘eval_metric’: [‘mae’], ‘gamma’: [10], ‘max_depth’: [4, 5, 6], ‘min_child_weight’: [40], ‘num_boost_round’: [125, 150, 175], ‘num_parallel_tree’: [3], ‘silent’: [1], ‘subsample’: [0.8, 0.9, 1]}, pre_dispatch=’2*n_jobs’, random_state=None, refit=True, return_train_score=False, scoring=None, verbose=1)

# Print the best parameters and the best cross-validated score.
# The search is built without `scoring`, so best_score_ is whatever the
# estimator's own score() returns (R^2 for scikit-learn style regressors),
# not an MAE. Report it as-is instead of taking sqrt(abs(...)) and calling it MAE.
print("Best parameters found: ", randomized_mse.best_params_)
print("Best CV score (estimator default score, R^2): ", randomized_mse.best_score_)

Best parameters found: {‘subsample’: 0.9, ‘silent’: 1, ‘num_parallel_tree’: 3, ‘num_boost_round’: 175, ‘min_child_weight’: 40, ‘max_depth’: 4, ‘gamma’: 10, ‘eval_metric’: ‘mae’, ‘eta’: 0.05, ‘early_stopping_rounds’: 25, ‘colsample_bytree’: 0.7, ‘booster’: ‘gbtree’} Lowest mae found: 858.5727168878916

# Re-score the configuration chosen by the randomized search with xgb.cv.
best_params = randomized_mse.best_params_

cv_results = xgb.cv(
    params=best_params,
    dtrain=dtrain,
    num_boost_round=150,
    early_stopping_rounds=20,
    metrics="mae",
    seed=123,
    as_pandas=True,
)

# One-element list holding the test MAE from the last row of the CV history.
best_mae = [cv_results["test-mae-mean"].iloc[-1]]

# Print the list.
print(best_mae)

[7.7172540000000005]

误差继续降低，最后的最优超参数

early_stopping_rounds=20, num_boost_round=150,

best_params

{'subsample': 0.9, 'silent': 1, 'num_parallel_tree': 3, 'num_boost_round': 175, 'min_child_weight': 40, 'max_depth': 4, 'gamma': 10, 'eval_metric': 'mae', 'eta': 0.05, 'early_stopping_rounds': 25, 'colsample_bytree': 0.7, 'booster': 'gbtree'}

1.2 自定义损失函数

参考 https://stackoverflow.com/a/59689299/8625228

def gradient_se(y_pred, y_true):
    """Return the gradient of the squared error ``(y_pred - y_true)**2``.

    The derivative is taken with respect to ``y_pred``.
    """
    residual = y_pred - y_true
    return 2 * residual

1.3 自定义评价函数

参考 https://xgboost.readthedocs.io/en/latest/tutorials/custom_metric_obj.html r2 如下，

from sklearn.metrics import r2_score
def r_square(y_pred, dtrain):
    """Custom evaluation function reporting R^2.

    Args:
        y_pred: predictions to score.
        dtrain: object exposing ``get_label()`` that returns the true targets.

    Returns:
        Tuple ``('r2', value)`` with ``value`` converted to a plain float.
    """
    score = r2_score(dtrain.get_label(), y_pred)
    return 'r2', float(score)

1.4 偶然性导致过拟合

偶然性导致过拟合。

假设一个二分类问题，label 为 0 和 1，特征有 100 维，共有 1w 个样本，但其中只有 10 个正样本（label 为 1），这些正样本的特征 f1 的值全为 1，而其余 9990 条样本的 f1 特征都为 0（在高维稀疏的情况下这种情况很常见）。

我们都知道在这种情况下，树模型很容易优化出一棵使用 f1 特征作为重要分裂节点的树，因为这个节点直接能够将训练数据划分得很好；但是当测试的时候，却会发现效果很差，因为特征 f1 只是偶然间和 y 拟合出了这个规律，这也是我们常说的过拟合。

那么这种情况下，如果采用 LR 的话，应该也会出现类似过拟合的情况：$y = W_1 f_1 + \dots + W_i f_i + \dots$，其中 $W_1$ 特别大以拟合这 10 个样本。为什么此时树模型就过拟合得更严重呢？

仔细想想发现，因为现在的模型普遍都会带着正则项，而 LR 等线性模型的正则项是对权重的惩罚，也就是 $W_1$ 一旦过大，惩罚就会很大，进一步压缩 $W_1$ 的值，使它不至于过大。但是，树模型则不一样，树模型的惩罚项通常为叶子节点数和深度等，而我们都知道，对于上面这种 case，树只需要一个节点就可以完美分割 9990 和 10 个样本，只有一个节点，最终产生的惩罚项极小。

对于稀疏变量的处理，正则化如果是通过树的深度来惩罚的话，就基本效果很差了。 这是 tree learner 和 linear learner 的差距。

一般来说，图像、NLP 等样本量远远小于特征数量的场景，XGBoost 都不适用。

样本量远远小于特征数量 => 数据本身就容易过拟合，很可能出现某个特征偶然和 y 变量拟合得很好的情况。一般来说，我们现在做的 CTR 任务上 XGB 效果好，就是因为样本量很大而已。

1.5 verbose 问题

参考 https://github.com/dmlc/xgboost/issues/2372 处理了 silent 等问题
参考 https://github.com/dmlc/xgboost/issues/4460#issuecomment-491551242 处理了 eval_metric 不生效的问题

这里需要定义 eval_set 如下，

# Train with explicit evaluation sets so the custom metric is reported for each.
# NOTE(review): dtrain_dev is not created anywhere in these notes; presumably a
# held-out DMatrix built elsewhere -- confirm.
model = xgb.train(
    default_params,
    dtrain=dtrain,
    num_boost_round=100,
    evals=[(dtrain, 'dtrain'), (dtrain_dev, 'dtrain_dev')],
    feval=r_square,
)

1.6 提取迭代过程

参考 https://xgboost.readthedocs.io/en/latest/python/python_api.html#module-xgboost.plotting xgboost.train加入参数evals_result = dict()

# Dict passed to xgb.train as evals_result to collect the evaluation history.
evals_result = {}

# Same training call as before, additionally capturing the history and
# logging every 10 rounds.
model = xgb.train(
    default_params,
    dtrain=dtrain,
    num_boost_round=100,
    evals=[(dtrain, "dtrain"), (dtrain_dev, "dtrain_dev")],
    feval=r_square,
    evals_result=evals_result,
    verbose_eval=10,
)

作图

参考 https://www.jianshu.com/p/05ec35a120b1

# Plot the per-round evaluation history as two charts (log loss and
# classification error), each with a train and a test curve.
# NOTE(review): this snippet appears to be copied from the referenced article
# and does not line up with the xgb.train run above:
#   - `pyplot` is never imported in the visible notes (presumably
#     `from matplotlib import pyplot`) -- confirm;
#   - `bst` is only assigned in the later "load model" section, and the
#     'validation_0' / 'validation_1' keys look like the scikit-learn wrapper's
#     naming, whereas the run above names its sets 'dtrain' / 'dtrain_dev' and
#     collects history in `evals_result` -- confirm which model this targets;
#   - 'error' / 'logloss' are classification metrics, while the params above
#     use eval_metric 'mae' -- confirm.

# retrieve performance metrics
results = bst.evals_result()
#print(results)


# number of recorded rounds, used as the x axis of both charts
epochs = len(results['validation_0']['error'])
x_axis = range(0, epochs)

# plot log loss
fig, ax = pyplot.subplots()
ax.plot(x_axis, results['validation_0']['logloss'], label='Train')
ax.plot(x_axis, results['validation_1']['logloss'], label='Test')
ax.legend()
pyplot.ylabel('Log Loss')
pyplot.title('XGBoost Log Loss')
pyplot.show()

# plot classification error
fig, ax = pyplot.subplots()
ax.plot(x_axis, results['validation_0']['error'], label='Train')
ax.plot(x_axis, results['validation_1']['error'], label='Test')
ax.legend()
pyplot.ylabel('Classification Error')
pyplot.title('XGBoost Classification Error')
pyplot.show()

1.7 load 模型

参考 https://xgboost.readthedocs.io/en/latest/python/python_intro.html#prediction

bst = xgb.Booster({'nthread': 4})  # create an empty Booster with nthread=4
bst.load_model('model.bin')  # load the saved model from 'model.bin'

Xgboost using Python 学习笔记