import time

import numpy as np
import pandas as pd
import lightgbm as lgb
train = pd.read_csv('input/train.csv')
test = pd.read_csv('input/test.csv')
# Reading the test set is slow because the table is large, larger than the earlier Rong360 table.
test.head()
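# A minimal sketch of one way to speed up the slow read noted above, assuming a pandas
# version with the optional pyarrow CSV engine available (pandas >= 1.4 plus the pyarrow
# package installed); this is an illustration, not part of the original pipeline:
#
#   test = pd.read_csv('input/test.csv', engine='pyarrow')
#
# Downcasting the mostly numeric columns afterwards also reduces memory, e.g.
#   test = test.astype({c: 'float32' for c in test.select_dtypes('float64').columns})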
print(train.shape)
print(test.shape)
train.head()
train.dtypes.describe()
# Train on log1p(target): RMSE on the transformed target corresponds to RMSLE on the original scale.
Y = np.log1p(train.target)
train.drop(['target'], axis=1, inplace=True)

# Keep the IDs for the submission file, then drop them from the feature matrices.
test_ID = test.ID
test.drop(['ID'], axis=1, inplace=True)
train_ID = train.ID
train.drop(['ID'], axis=1, inplace=True)
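# Simple 80/20 hold-out split; the validation part is used for early stopping below.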
from sklearn.model_selection import train_test_split
x_train, x_valid, y_train, y_valid = train_test_split(train, Y, test_size=0.2, random_state=42)
lgbm_params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': 'rmse',
    'max_depth': 8,
    'num_leaves': 32,          # also tried: 63, 127, 255
    'feature_fraction': 0.8,   # also tried: 0.1, 0.01
    'bagging_fraction': 0.8,
    'learning_rate': 0.001,    # also tried values such as 0.00625 and 0.025
    'verbose': 0
}
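# Wrap the numpy arrays in lgb.Dataset objects; passing feature_name keeps the
# original column names so feature importances remain interpretable.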
lgtrain = lgb.Dataset(x_train.values, y_train.values,
                      feature_name=train.columns.tolist())
lgvalid = lgb.Dataset(x_valid.values, y_valid.values,
                      feature_name=train.columns.tolist())
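# Train for up to 300 rounds, stopping early if the validation RMSE does not improve for 50 rounds.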
Y_target = []
modelstart = time.time()
lgb_clf = lgb.train(
    lgbm_params,
    lgtrain,
    valid_sets=[lgtrain, lgvalid],
    valid_names=['train', 'valid'],
    num_boost_round=300,
    early_stopping_rounds=50,   # LightGBM >= 4.0: use callbacks=[lgb.early_stopping(50)]
    verbose_eval=100            # LightGBM >= 4.0: use callbacks=[lgb.log_evaluation(100)]
)
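# Predict on the test set and invert the log1p transform back to the original target scale.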
test_pred = lgb_clf.predict(test.values, num_iteration=lgb_clf.best_iteration)
Y_target.append(np.expm1(test_pred))
print('fold finished after', time.time() - modelstart, 'seconds')
Y_target = np.array(Y_target)
sub = pd.read_csv('input/sample_submission.csv')
sub['target'] = Y_target.mean(axis=0)
sub.to_csv('sub_lgb_baseline.csv', index=False)
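# The Y_target list and the mean(axis=0) above are already shaped for averaging several
# folds, even though only one split is trained here. A minimal sketch of extending the
# same code to K-fold averaging, assuming the objects defined above (train, test, Y,
# lgbm_params, sub) and an illustrative N_FOLDS constant; kept commented out so the
# single-split script above is unchanged:
#
# from sklearn.model_selection import KFold
#
# N_FOLDS = 5
# Y_target = []
# for fold, (trn_idx, val_idx) in enumerate(
#         KFold(n_splits=N_FOLDS, shuffle=True, random_state=42).split(train)):
#     lgtrain = lgb.Dataset(train.values[trn_idx], Y.values[trn_idx],
#                           feature_name=train.columns.tolist())
#     lgvalid = lgb.Dataset(train.values[val_idx], Y.values[val_idx],
#                           feature_name=train.columns.tolist())
#     clf = lgb.train(lgbm_params, lgtrain,
#                     valid_sets=[lgtrain, lgvalid], valid_names=['train', 'valid'],
#                     num_boost_round=300, early_stopping_rounds=50, verbose_eval=100)
#     Y_target.append(np.expm1(clf.predict(test.values,
#                                           num_iteration=clf.best_iteration)))
# sub['target'] = np.mean(Y_target, axis=0)
# sub.to_csv('sub_lgb_kfold.csv', index=False)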