In [89]:
import time

import numpy as np
import pandas as pd
import lightgbm as lgb
In [90]:
train = pd.read_csv('input/train.csv')
In [93]:
test = pd.read_csv('input/test.csv')

Reading the test set is slow because the test table is very large — larger than the Rong360 table used previously.

EDA

In [13]:
# Row/column counts for both tables (train is printed first, then test).
for frame in (train, test):
    print(frame.shape)
(4459, 4993)
(49342, 4992)
In [85]:
train.head()
Out[85]:
48df886f9 0deb4b6a8 34b15f335 a8cb14b00 2f0771a37 30347e683 d08d1fbe3 6ee66e115 20aa07010 dc5a8f1d8 ... 3ecc09859 9281abeea 8675bec0b 3a13ed79a f677d4d13 71b203550 137efaa80 fb36b89d9 7e293fbaf 9fc776466
0 0.0 0 0.0 0 0 0 0 0 0.0 0.0 ... 0.0 0.0 0.0 0 0 0 0 0 0 0
1 0.0 0 0.0 0 0 0 0 0 2200000.0 0.0 ... 0.0 0.0 0.0 0 0 0 0 0 0 0
2 0.0 0 0.0 0 0 0 0 0 0.0 0.0 ... 0.0 0.0 0.0 0 0 0 0 0 0 0
3 0.0 0 0.0 0 0 0 0 0 0.0 0.0 ... 0.0 0.0 0.0 0 0 0 0 0 0 0
4 0.0 0 0.0 0 0 0 0 0 2000000.0 0.0 ... 0.0 0.0 0.0 0 0 0 0 0 0 0

5 rows × 4991 columns

In [86]:
train.dtypes.describe()
Out[86]:
count      4991
unique        2
top       int64
freq       3147
dtype: object

Data Preprocessing

In [91]:
# Model the target in log space (log1p handles zeros safely).
# pop() removes the 'target' column from `train` and returns it in one step,
# matching the original save-then-drop-inplace behavior.
Y = np.log1p(train.pop('target'))
In [96]:
# Set aside the ID columns — they are row identifiers, not features.
# pop() removes the column in place and returns it, equivalent to the
# original attribute-access-then-drop(inplace=True) pair.
test_ID = test.pop('ID')
train_ID = train.pop('ID')
In [97]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the training rows as a validation set; the fixed seed keeps
# the split reproducible across runs.
x_train, x_valid, y_train, y_valid = train_test_split(
    train, Y, test_size=0.2, random_state=42,
)

Modeling

In [98]:
# LightGBM hyper-parameters for the regression baseline.
# Conservative settings: moderate tree size, strong row/column subsampling,
# and a very small learning rate. RMSE is evaluated on the log1p-transformed
# target defined earlier.
lgbm_params = dict(
    task='train',
    boosting_type='gbdt',   # plain gradient-boosted decision trees
    objective='regression',
    metric='rmse',
    max_depth=8,
    num_leaves=32,          # alternatives tried: 63, 127, 255
    feature_fraction=0.8,   # alternatives tried: 0.1, 0.01
    bagging_fraction=0.8,
    learning_rate=0.001,    # alternatives tried: 0.00625, 0.025, 0.05
    verbose=0,
)
In [99]:
# Build LightGBM datasets for the train/validation split; feature names are
# taken from the (target- and ID-free) training frame's columns.
lgtrain = lgb.Dataset(
    x_train.values, y_train.values,
    feature_name=train.columns.tolist(),
)
lgvalid = lgb.Dataset(
    x_valid.values, y_valid.values,
    feature_name=train.columns.tolist(),
)

# Per-fold test predictions; kept as a list so additional folds could be
# appended and averaged at submission time (only one fold here).
Y_target = []

# NOTE: `time` must be imported at the top of the notebook — the original
# relied on a hidden earlier import and would NameError on Restart & Run All.
modelstart = time.time()
lgb_clf = lgb.train(
    lgbm_params,
    lgtrain,
    valid_sets=[lgtrain, lgvalid],
    valid_names=['train', 'valid'],
    num_boost_round=300,
    early_stopping_rounds=50,
    verbose_eval=100,
)

# Predictions are in log1p space (the target was transformed with np.log1p);
# invert with expm1, which is more accurate than exp(x) - 1 for values near 0.
test_pred = lgb_clf.predict(test.values)
Y_target.append(np.expm1(test_pred))
print('fold finish after', time.time() - modelstart)
Training until validation scores don't improve for 50 rounds.
[100]	train's rmse: 1.72155	valid's rmse: 1.66826
[200]	train's rmse: 1.68227	valid's rmse: 1.6427
[300]	train's rmse: 1.64642	valid's rmse: 1.62034
Did not meet early stopping. Best iteration is:
[300]	train's rmse: 1.64642	valid's rmse: 1.62034
fold finish after 20.004000186920166
In [100]:
Y_target = np.array(Y_target)

Submission

In [101]:
# Write the submission: average predictions across folds (axis 0 is the fold
# axis; with a single fold this is just that fold's predictions).
sub = pd.read_csv('input/sample_submission.csv')
sub = sub.assign(target=Y_target.mean(axis=0))
sub.to_csv('sub_lgb_baseline.csv', index=False)