import time
import numpy as np
import xgboost as xgb
from xgboost import plot_importance,plot_tree
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.metrics import mean_squared_error
%matplotlib inline
def dropun(X):
    """Drop the auto-generated 'Unnamed: N' index columns that
    pd.read_csv creates when a CSV was saved together with its index.

    Returns a new DataFrame; the input is not modified.
    """
    # Collect all matching columns first and drop them in a single call,
    # instead of rebinding X with one copy per dropped column.
    unnamed = [c for c in X.columns if str(c)[:7] == 'Unnamed']
    return X.drop(columns=unnamed)
def hist(L):
    """Overlay one translucent, normalized histogram per array in L
    on the current matplotlib axes."""
    opts = {'histtype': 'stepfilled', 'density': True, 'alpha': 0.3, 'bins': 40}
    for series in L:
        plt.hist(series, **opts)
def prediction(y_pred, y_test, plot=True):
    """Report the MSE between predictions and targets; optionally plot both.

    Parameters: y_pred / y_test are aligned 1-D arrays; when plot is True,
    the two series are drawn against their index on a new figure.
    Returns the mean squared error.
    """
    # https://scikit-learn.org/stable/modules/generated/sklearn.metrics.mean_squared_error.html#sklearn.metrics.mean_squared_error
    mse = mean_squared_error(y_pred, y_test)
    print ("MSE:", mse)
    if plot:
        # Predicted-vs-actual overlay (this is a regression task, not an
        # ROC curve as the old comment claimed).
        idx = range(len(y_pred))
        plt.figure()
        plt.plot(idx, y_pred, 'b', label="predict")
        plt.plot(idx, y_test, 'r', label="test")
        plt.legend(loc="upper right")  # show the two series labels
    return mse
# Load per-stock factor exposures (X) and daily returns (Y), then strip the
# 'Unnamed: N' index columns that read_csv imports from the saved CSVs.
X=pd.read_csv('data/factors2013-0-2-1.csv')
Y=pd.read_csv('data/daily2011-2017-1.csv')
X=dropun(X)
Y=dropun(Y)
# Factor names = every X column except the stock id and the trading date.
factors=list(X.columns)
factors.remove('ts_code')
factors.remove('trade_date')
# Sorted list of all trading dates present in the factor data.
days=set(X['trade_date'])
days=list(days)
days.sort()
def get_split_by_trade_date(date, state=0, remove_factors=None):
    """Build a train/test split from one trading day's merged factor/return data.

    Parameters
    ----------
    date : str
        Trading date matched against the 'trade_date' column, e.g. '2013-03-01'.
    state : int
        0 -- keep missing values as-is;
        1 -- drop rows (stocks) containing any missing value;
        2 -- fill missing values with that day's column means.
    remove_factors : list of str, optional
        Factor columns to exclude from the feature matrix.

    Returns (x_train, x_test, y_train, y_test) with 20% held out.
    """
    # Avoid the mutable-default-argument pitfall; also copy so we never
    # mutate a caller-owned list.
    remove_factors = [] if remove_factors is None else list(remove_factors)
    x = X[X['trade_date'] == date].drop(columns=['trade_date'] + remove_factors)
    y = Y[Y['trade_date'] == date].drop(columns=['trade_date'])
    z = pd.merge(x, y, on='ts_code')
    if state == 1:
        z.dropna(inplace=True)
    elif state == 2:
        # numeric_only=True: the string column 'ts_code' cannot be averaged
        # (modern pandas raises without it; older pandas silently skipped it).
        z.fillna(value=dict(z.mean(numeric_only=True)), inplace=True)
    # Index with a list, not a set: sets are rejected as indexers by newer
    # pandas and give a nondeterministic column order.  Keeping the original
    # 'factors' ordering makes the feature matrix reproducible.
    removed = set(remove_factors)
    keep = [f for f in factors if f not in removed]
    x = z[keep]
    y = z['yield'] * 100
    # 80/20 random split (pass random_state for reproducibility if needed).
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)  # random_state=0
    # print('x_train.shape={}\ny_train.shape ={}\nx_test.shape={}\ny_test.shape={}'.format(x_train.shape, y_train.shape, x_test.shape, y_test.shape))
    return x_train, x_test, y_train, y_test
XGBoost的SHAP解释
基于sklearn的接口的回归
# Train/test split for a single trading day.
x_train, x_test, y_train, y_test = get_split_by_trade_date('2013-03-01')
x_train.shape=(1812, 56)
y_train.shape =(1812,)
x_test.shape=(454, 56)
y_test.shape=(454,)
# Train the model: 50 boosted trees, depth 5, squared-error objective.
model = xgb.XGBRegressor(max_depth=5, learning_rate=0.1, n_estimators=50, objective='reg:squarederror')
model.fit(x_train, y_train)
XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
importance_type='gain', interaction_constraints='',
learning_rate=0.1, max_delta_step=0, max_depth=5,
min_child_weight=1, missing=nan, monotone_constraints='()',
n_estimators=50, n_jobs=12, num_parallel_tree=1, random_state=0,
reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
tree_method='exact', validate_parameters=1, verbosity=None)
# Predict on the held-out set; prediction() prints the MSE and draws the
# predicted-vs-actual overlay (plot defaults to True).
y_pred = model.predict(x_test)
prediction(y_pred, y_test.to_numpy())
MSE: 2.7456872833567583
Permutation Importance
import eli5
from eli5.sklearn import PermutationImportance
# Permutation importance: the score drop observed when each feature column
# is shuffled on the test set.  top=56 shows all features.
perm = PermutationImportance(model, random_state=1).fit(x_test, y_test)
eli5.show_weights(perm, feature_names = factors, top=56)
SHAP
import shap  # package used to calculate SHAP values
# Force plot for a single test row (kept as a 1-row DataFrame via 1:2 slice),
# then for the whole test set.
data_for_prediction = x_test.iloc[1:2, :]
explainer = shap.Explainer(model)
shap.initjs()
shap.force_plot(explainer.expected_value, explainer.shap_values(data_for_prediction), data_for_prediction)
data_for_prediction = x_test
shap.force_plot(explainer.expected_value, explainer.shap_values(data_for_prediction), data_for_prediction)
Summary Plots
# Make plot.  Regression has a single output, so no [1] index is needed
# (unlike the two-class classification case).
shap.summary_plot(explainer.shap_values(x_test), x_test)
XGBoost调参
官方文档:https://xgboost.readthedocs.io/en/latest/parameter.html
sklearn中文文档:https://www.scikitlearn.com.cn/0.21.3/12/
sklearn官方文档:https://scikit-learn.org/stable/modules/classes.html
在调参方面,有以下考量:
单天模型调参
- 为了高速训练,使用到gpu,并且不再使用默认的精确贪心,而是使用近似算法(直方图)
- 使用其他的损失函数,在评价指标处也添加相应的进行参考
- 修改n_estimators,learning_rate,max_depth,max_leaf_nodes,max_delta_step等参数
- 使用early stopping,防止过拟合,并且可以减少计算量
def XGBoost_train(date, state=0, remove_factors=[]):
    """Train an XGBoost regressor on one day's data and return its test MSE.

    state and remove_factors are forwarded to get_split_by_trade_date.
    Early stopping on validation RMSE caps the effective tree count.
    """
    x_train, x_test, y_train, y_test = get_split_by_trade_date(date, state, remove_factors)
    # Deeper trees / smaller learning rate than the first experiment.
    reg = xgb.XGBRegressor(max_depth=6, learning_rate=0.05, n_estimators=100,
                           objective='reg:squarederror')  # ,tree_method='gpu_hist')
    # NOTE(review): eval_metric/early_stopping_rounds as fit() kwargs are
    # deprecated in xgboost >= 1.6 (moved to the constructor) -- confirm the
    # pinned xgboost version before upgrading.
    reg.fit(x_train, y_train, eval_set=[(x_test, y_test)],
            eval_metric='rmse', early_stopping_rounds=5)
    # model.fit(x_train, y_train)
    return prediction(reg.predict(x_test), y_test.to_numpy(), False)
XGBoost模型理论上比线性模型在预测结果上更有力,以2013-01-04的数据为例,线性模型去除缺失值后的MSE达到3.618,需要通过调整参数使得XGBoost的MSE低于这一值:
# Single-day sanity check before running the full year.
XGBoost_train('2013-01-04')
x_train.shape=(1820, 56)
y_train.shape =(1820,)
x_test.shape=(455, 56)
y_test.shape=(455,)
[0] validation_0-rmse:2.71505
[1] validation_0-rmse:2.67176
[2] validation_0-rmse:2.63221
[3] validation_0-rmse:2.59650
[4] validation_0-rmse:2.56234
...
MSE: 4.962805064459501
# Daily MSEs over the whole year with state=0 (missing values left as-is).
XGBoost_err1 = []
for day in days:
    print(day)
    XGBoost_err1.append(XGBoost_train(day))
2013-01-04
x_train.shape=(1820, 56)
y_train.shape =(1820,)
x_test.shape=(455, 56)
y_test.shape=(455,)
[0] validation_0-rmse:2.71505
[1] validation_0-rmse:2.67176
...
2013-12-31
x_train.shape=(1781, 56)
y_train.shape =(1781,)
x_test.shape=(446, 56)
y_test.shape=(446,)
[0] validation_0-rmse:1.89066
[1] validation_0-rmse:1.87314
[2] validation_0-rmse:1.85777
[3] validation_0-rmse:1.84825
[4] validation_0-rmse:1.83475
[5] validation_0-rmse:1.82586
[6] validation_0-rmse:1.81955
[7] validation_0-rmse:1.81137
[8] validation_0-rmse:1.80614
[9] validation_0-rmse:1.80126
[10] validation_0-rmse:1.79343
[11] validation_0-rmse:1.79312
[12] validation_0-rmse:1.78894
[13] validation_0-rmse:1.78855
[14] validation_0-rmse:1.78387
[15] validation_0-rmse:1.77989
[16] validation_0-rmse:1.77749
[17] validation_0-rmse:1.77625
[18] validation_0-rmse:1.77587
[19] validation_0-rmse:1.77392
[20] validation_0-rmse:1.77337
[21] validation_0-rmse:1.77304
[22] validation_0-rmse:1.77251
[23] validation_0-rmse:1.76933
[24] validation_0-rmse:1.76907
[25] validation_0-rmse:1.76951
[26] validation_0-rmse:1.76602
[27] validation_0-rmse:1.76603
[28] validation_0-rmse:1.76556
[29] validation_0-rmse:1.76377
[30] validation_0-rmse:1.76443
[31] validation_0-rmse:1.76462
[32] validation_0-rmse:1.76322
[33] validation_0-rmse:1.76281
[34] validation_0-rmse:1.76222
[35] validation_0-rmse:1.76286
[36] validation_0-rmse:1.76353
[37] validation_0-rmse:1.76506
[38] validation_0-rmse:1.76757
MSE: 3.105432687887249
# Year-average MSE, state=0 (missing values kept).
np.array(XGBoost_err1).mean()
3.888354364794232
去除缺失值结果:
# Daily MSEs with state=1 (stocks with any missing value dropped).
XGBoost_err2 = []
for day in days:
    print(day)
    XGBoost_err2.append(XGBoost_train(day, 1))
2013-01-04
x_train.shape=(1185, 56)
y_train.shape =(1185,)
x_test.shape=(297, 56)
y_test.shape=(297,)
[0] validation_0-rmse:2.45329
[1] validation_0-rmse:2.40804
[2] validation_0-rmse:2.36358
[3] validation_0-rmse:2.32372
...
[51] validation_0-rmse:1.94131
[52] validation_0-rmse:1.94386
MSE: 3.7682869642648247
...
2013-12-31
x_train.shape=(1257, 56)
y_train.shape =(1257,)
x_test.shape=(315, 56)
y_test.shape=(315,)
[0] validation_0-rmse:1.69060
[1] validation_0-rmse:1.67843
[2] validation_0-rmse:1.67219
...
[44] validation_0-rmse:1.58519
[45] validation_0-rmse:1.58426
MSE: 2.505393251914421
# Year-average MSE, state=1 (rows with missing values dropped).
np.array(XGBoost_err2).mean()
3.741997772942808
时间序列上调参
除此之外,在宏观上,在时间序列上,前一天和后一天的XGBoost模型应该是较为类似的,是否可以采用前一天的超参数(甚至是参数)去训练下一天的,相当于finetune的过程。
- 不使用前一天的参数,但参照前一天的超参数去训练下一天的。借鉴遗传算法,第二天的超参数在第一天的基础上上下浮动训练,选择最好的一个
- 使用前一天的参数,但回归树需要有淘汰才能更新。可以借助XGBoost的dart方法,在每次训练时淘汰以前的一些树
使用dart方法
API:https://xgboost.readthedocs.io/en/latest/tutorials/dart.html
论文:http://sunie.top:9009/Server/files/论文/korlakaivinayak15.pdf
XGBoost主要结合了大量的回归树和较小的学习率。在这种情况下,早期添加的树木很重要,而后期添加的树木并不重要。Vinayak和Gilad-Bachrach提出了一种新的方法,将深度神经网络社区的dropout技术添加到增强的树木中,并在某些情况下报告了更好的结果。
随机丢弃生成的决策树,然后再从剩下的决策树集中迭代优化提升树,这是DART的主要思想。伪码DART的原理:
# Train an initial model with the DART booster: rate_drop is the probability
# that each existing tree is dropped during a boosting round, skip_drop the
# probability that a round skips dropout entirely.
state = 0
x_train, x_test, y_train, y_test = get_split_by_trade_date('2013-01-04', state)
model = xgb.XGBRegressor(max_depth=6, learning_rate=0.05, n_estimators=100,
                         objective='reg:squarederror',booster='dart',tree_method= 'gpu_hist',
                         rate_drop=0.1,skip_drop=0.5)
model.fit(x_train, y_train, eval_set=[(x_test, y_test)], eval_metric='rmse', early_stopping_rounds=5)
x_train.shape=(1820, 56)
y_train.shape =(1820,)
x_test.shape=(455, 56)
y_test.shape=(455,)
[0] validation_0-rmse:2.71505
[1] validation_0-rmse:2.67176
[2] validation_0-rmse:2.63221
[3] validation_0-rmse:2.59650
[4] validation_0-rmse:2.56234
[5] validation_0-rmse:2.54804
[6] validation_0-rmse:2.51640
[7] validation_0-rmse:2.49027
[8] validation_0-rmse:2.46852
[9] validation_0-rmse:2.44842
[10] validation_0-rmse:2.42765
[11] validation_0-rmse:2.40894
[12] validation_0-rmse:2.39110
[13] validation_0-rmse:2.37801
[14] validation_0-rmse:2.36158
[15] validation_0-rmse:2.34943
[16] validation_0-rmse:2.34672
[17] validation_0-rmse:2.34293
[18] validation_0-rmse:2.33335
[19] validation_0-rmse:2.32521
[20] validation_0-rmse:2.31349
[21] validation_0-rmse:2.30608
[22] validation_0-rmse:2.30415
[23] validation_0-rmse:2.30161
[24] validation_0-rmse:2.29694
[25] validation_0-rmse:2.28987
[26] validation_0-rmse:2.28165
[27] validation_0-rmse:2.27723
[28] validation_0-rmse:2.27351
[29] validation_0-rmse:2.26792
[30] validation_0-rmse:2.26717
[31] validation_0-rmse:2.26474
[32] validation_0-rmse:2.26086
[33] validation_0-rmse:2.25885
[34] validation_0-rmse:2.25811
[35] validation_0-rmse:2.25738
[36] validation_0-rmse:2.25292
[37] validation_0-rmse:2.25248
[38] validation_0-rmse:2.25169
[39] validation_0-rmse:2.24581
[40] validation_0-rmse:2.24643
[41] validation_0-rmse:2.24598
[42] validation_0-rmse:2.24540
[43] validation_0-rmse:2.24279
[44] validation_0-rmse:2.24284
[45] validation_0-rmse:2.24146
[46] validation_0-rmse:2.23933
[47] validation_0-rmse:2.23897
[48] validation_0-rmse:2.24002
[49] validation_0-rmse:2.24009
[50] validation_0-rmse:2.23750
[51] validation_0-rmse:2.23841
[52] validation_0-rmse:2.23786
[53] validation_0-rmse:2.23783
[54] validation_0-rmse:2.23655
[55] validation_0-rmse:2.23644
[56] validation_0-rmse:2.23723
[57] validation_0-rmse:2.23693
[58] validation_0-rmse:2.23136
[59] validation_0-rmse:2.23283
[60] validation_0-rmse:2.23281
[61] validation_0-rmse:2.22976
[62] validation_0-rmse:2.23159
[63] validation_0-rmse:2.23105
[64] validation_0-rmse:2.23125
[65] validation_0-rmse:2.23136
XGBRegressor(base_score=0.5, booster='dart', colsample_bylevel=1,
colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=0,
importance_type='gain', interaction_constraints='',
learning_rate=0.05, max_delta_step=0, max_depth=6,
min_child_weight=1, missing=nan, monotone_constraints='()',
n_estimators=100, n_jobs=12, num_parallel_tree=1, random_state=0,
rate_drop=0.1, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
skip_drop=0.5, subsample=1, tree_method='gpu_hist',
validate_parameters=1, verbosity=None)
# Daily MSEs when the same DART estimator is refit on each day's data.
XGBoost_err3 = []
for day in days:
    print(day)
    x_train, x_test, y_train, y_test = get_split_by_trade_date(day, state)
    # Refit on the new day (intended as a finetune).
    # NOTE(review): the sklearn-API fit() appears to retrain from scratch
    # unless an xgb_model is passed -- confirm this actually warm-starts.
    model.fit(x_train, y_train, eval_set=[(x_test, y_test)], eval_metric='rmse', early_stopping_rounds=5)
    # Evaluate on the held-out 20% without plotting.
    y_pred = model.predict(x_test)
    XGBoost_err3.append(prediction(y_pred, y_test.to_numpy(), False))
2013-01-04
x_train.shape=(1820, 56)
y_train.shape =(1820,)
x_test.shape=(455, 56)
y_test.shape=(455,)
[0] validation_0-rmse:2.71505
[1] validation_0-rmse:2.67176
[2] validation_0-rmse:2.63221
[3] validation_0-rmse:2.59650
[4] validation_0-rmse:2.56234
[5] validation_0-rmse:2.54804
[6] validation_0-rmse:2.51640
[7] validation_0-rmse:2.49027
[8] validation_0-rmse:2.46852
[9] validation_0-rmse:2.44842
[10] validation_0-rmse:2.42765
[11] validation_0-rmse:2.40894
[12] validation_0-rmse:2.39110
[13] validation_0-rmse:2.37801
[14] validation_0-rmse:2.36158
[15] validation_0-rmse:2.34943
[16] validation_0-rmse:2.34672
[17] validation_0-rmse:2.34293
[18] validation_0-rmse:2.33335
[19] validation_0-rmse:2.32521
[20] validation_0-rmse:2.31349
[21] validation_0-rmse:2.30608
[22] validation_0-rmse:2.30415
[23] validation_0-rmse:2.30161
[24] validation_0-rmse:2.29694
[25] validation_0-rmse:2.28987
[26] validation_0-rmse:2.28165
[27] validation_0-rmse:2.27723
[28] validation_0-rmse:2.27351
[29] validation_0-rmse:2.26792
[30] validation_0-rmse:2.26717
[31] validation_0-rmse:2.26474
[32] validation_0-rmse:2.26086
[33] validation_0-rmse:2.25885
[34] validation_0-rmse:2.25811
[35] validation_0-rmse:2.25738
[36] validation_0-rmse:2.25292
[37] validation_0-rmse:2.25248
[38] validation_0-rmse:2.25169
[39] validation_0-rmse:2.24581
[40] validation_0-rmse:2.24643
[41] validation_0-rmse:2.24598
[42] validation_0-rmse:2.24540
[43] validation_0-rmse:2.24279
[44] validation_0-rmse:2.24284
[45] validation_0-rmse:2.24146
[46] validation_0-rmse:2.23933
[47] validation_0-rmse:2.23897
[48] validation_0-rmse:2.24002
[49] validation_0-rmse:2.24009
[50] validation_0-rmse:2.23750
[51] validation_0-rmse:2.23841
[52] validation_0-rmse:2.23786
[53] validation_0-rmse:2.23783
[54] validation_0-rmse:2.23655
[55] validation_0-rmse:2.23644
[56] validation_0-rmse:2.23723
[57] validation_0-rmse:2.23693
[58] validation_0-rmse:2.23136
[59] validation_0-rmse:2.23283
[60] validation_0-rmse:2.23281
[61] validation_0-rmse:2.22976
[62] validation_0-rmse:2.23159
[63] validation_0-rmse:2.23105
[64] validation_0-rmse:2.23125
[65] validation_0-rmse:2.23136
[66] validation_0-rmse:2.23117
MSE: 4.972797932588777
...
2013-12-31
x_train.shape=(1781, 56)
y_train.shape =(1781,)
x_test.shape=(446, 56)
y_test.shape=(446,)
[0] validation_0-rmse:1.89066
[1] validation_0-rmse:1.87273
[2] validation_0-rmse:1.85728
[3] validation_0-rmse:1.84731
[4] validation_0-rmse:1.83396
[5] validation_0-rmse:1.83045
[6] validation_0-rmse:1.82118
[7] validation_0-rmse:1.81282
[8] validation_0-rmse:1.80430
[9] validation_0-rmse:1.79986
[10] validation_0-rmse:1.79477
[11] validation_0-rmse:1.78617
[12] validation_0-rmse:1.78131
[13] validation_0-rmse:1.77416
[14] validation_0-rmse:1.77229
[15] validation_0-rmse:1.76928
[16] validation_0-rmse:1.76798
[17] validation_0-rmse:1.76636
[18] validation_0-rmse:1.76145
[19] validation_0-rmse:1.76249
[20] validation_0-rmse:1.76086
[21] validation_0-rmse:1.76011
[22] validation_0-rmse:1.75982
[23] validation_0-rmse:1.75863
[24] validation_0-rmse:1.75799
[25] validation_0-rmse:1.75662
[26] validation_0-rmse:1.75613
[27] validation_0-rmse:1.75613
[28] validation_0-rmse:1.75578
[29] validation_0-rmse:1.75474
[30] validation_0-rmse:1.75479
[31] validation_0-rmse:1.75551
[32] validation_0-rmse:1.75698
[33] validation_0-rmse:1.75701
[34] validation_0-rmse:1.75652
MSE: 3.0796502006396387
# Year-average MSE for the daily-refit DART model.
np.array(XGBoost_err3).mean()
3.902909523475317
使用超参变换方法
后一天在前一天的基础上随机选择一个参数进行变换,因为有了early stopping,我们不再需要更改迭代次数,有以下参数可以尝试更改:
learning_rate,max_depth,max_leaf_nodes,max_delta_step
准确率对比
2013年全年MSE平均值 | Linear | XGBoost |
---|---|---|
使用平均值/未去除缺失值 | 3.921317593905477 | 3.888354364794232 |
去除缺失值 | 3.8005630796082426 | 3.741997772942808 |
使用Dart方法 | — | 3.902909523475317 |
XGBoost相对于Linear回归的准确率提升极其有限,每天的预测都有2个百分点的误差,使用XGBoost的情况下误差并没大幅下降。
需要考虑背后的原因是什么?模型的问题还是数据的问题,这样用当天的数据预测当天的收益率真的是一个好的选择吗?
线性模型的SHAP解释
from sklearn.linear_model import LinearRegression #线性回归
def linear_train(date, state, remove_factors=[]):
    """Fit ordinary least squares on one day's data and return its test MSE.

    state and remove_factors are forwarded to get_split_by_trade_date.
    """
    x_train, x_test, y_train, y_test = get_split_by_trade_date(date, state, remove_factors)
    # Fit and score in a compact chain; no plot for batch runs.
    reg = LinearRegression().fit(x_train, y_train)
    preds = reg.predict(x_test)
    return prediction(preds, y_test.to_numpy(), False)
同一天平均值策略填充
# Daily OLS MSEs with state=2 (missing values filled with same-day means).
linear_err2 = []
for day in days:
    print(day)
    linear_err2.append(linear_train(day, 2))
2013-01-04
x_train.shape=(1820, 56)
y_train.shape =(1820,)
x_test.shape=(455, 56)
y_test.shape=(455,)
MSE: 5.082764268029054
...
2013-12-31
x_train.shape=(1781, 56)
y_train.shape =(1781,)
x_test.shape=(446, 56)
y_test.shape=(446,)
MSE: 2.938150673006415
# Year-average OLS MSE, state=2 (mean-filled).
np.array(linear_err2).mean()
3.921317593905477
去除含缺失数据的股票
# Daily OLS MSEs with state=1 (stocks with any missing value dropped).
linear_err1 = []
for day in days:
    print(day)
    linear_err1.append(linear_train(day, 1))
2013-01-04
x_train.shape=(1185, 56)
y_train.shape =(1185,)
x_test.shape=(297, 56)
y_test.shape=(297,)
MSE: 3.617532364939729
...
2013-12-31
x_train.shape=(1257, 56)
y_train.shape =(1257,)
x_test.shape=(315, 56)
y_test.shape=(315,)
MSE: 2.6755668015230807
# Year-average OLS MSE, state=1 (rows with missing values dropped).
np.array(linear_err1).mean()
3.8005630796082426
# Overlay the MSE distributions of the four model/missing-value combinations.
# Fixed NameError: 'XGBoost_err' is never defined anywhere in this notebook;
# the four series computed above are XGBoost_err1, XGBoost_err2, linear_err2
# and linear_err1.
hist([XGBoost_err1, XGBoost_err2, linear_err2, linear_err1])
SHAP解释
# Refit OLS on 2013-03-01 with missing rows dropped, for the SHAP analysis.
x_train, x_test, y_train, y_test = get_split_by_trade_date('2013-03-01', 1)
# Train the linear model
linear_model = LinearRegression()
linear_model.fit(x_train, y_train)
# Predict and report the test MSE (with the predicted-vs-actual plot)
y_pred = linear_model.predict(x_test)
prediction(y_pred, y_test.to_numpy())
MSE: 2.4033171979775267
# SHAP attribution for one held-out observation under the linear explainer
# (x_train supplies the background distribution).
data_for_prediction = x_test.iloc[1, :]
explainer = shap.LinearExplainer(linear_model, x_train)
shap_values = explainer.shap_values(data_for_prediction)
shap.force_plot(explainer.expected_value, shap_values, data_for_prediction)
# Label each SHAP value with its factor name for readable printing.
shap_values=pd.Series(shap_values, index = data_for_prediction.index)
print(shap_values)
size -0.015262
beta 0.049783
betad 0.004743
idvol -0.289487
total_vol 0.345953
idskew -0.012252
skew 0.147472
coskew -0.033706
turn -0.020896
std_turn 0.027815
volumed 0.136912
std_dvol 0.050331
retnmax -0.192386
illq 0.000854
LM 0.075389
sharechg 0.003900
age 0.132466
mom12 0.226785
mom6 -0.090234
momchg -0.312232
imom 0.012036
lagretn -0.023846
BM -0.065847
AM -0.072066
LEV 0.137308
EP -0.128975
CFP -0.017867
OCFP -0.073490
DP 0.033503
SP -0.004204
AG -0.014497
LG 0.108457
BVEG -0.001503
INVG 0.047389
INVchg 0.029190
SG -0.548335
SgINVg 0.516971
PMG 0.000243
TAXchg 0.108613
ACC -0.028669
ACCP 0.000803
ROE 0.097101
ROA -0.042718
PA -0.177476
CT -0.015307
cash 0.007811
cashpr -0.011033
RD 0.046751
RDsales -0.037200
CR 0.014752
QR -0.006193
CFdebt 0.073267
salecash 0.010830
saleinv -0.005143
CRG -0.133693
QRG 0.049332
dtype: float64
# Check: base value plus the SHAP contributions reproduces the prediction.
print(explainer.expected_value, sum(shap_values)+explainer.expected_value)
1.0574470845321335 1.1796899805758478
# Raw per-feature contribution: coefficient * feature value, to compare with
# the SHAP attributions above.  NOTE(review): the individual terms differ from
# the SHAP values though the totals agree -- presumably because the linear
# explainer centers features by the background mean; verify against the paper.
linear_model.coef_ * data_for_prediction
size -0.009059
beta 0.029039
betad 0.001264
idvol -0.365071
total_vol 0.430522
idskew 0.027157
skew 0.058997
coskew -0.116983
turn -0.032765
std_turn 0.034614
volumed 0.086125
std_dvol 0.035373
retnmax -0.202701
illq 0.000672
LM 0.054087
sharechg 0.004187
age 0.047193
mom12 0.243860
mom6 0.024967
momchg -0.397633
imom -0.002186
lagretn -0.050609
BM -0.071573
AM -0.072236
LEV 0.128845
EP -0.121051
CFP -0.036522
OCFP -0.085358
DP 0.036700
SP 0.001477
AG 0.012922
LG 0.084886
BVEG -0.003271
INVG 0.058538
INVchg 0.035820
SG -0.579433
SgINVg 0.531597
PMG -0.000612
TAXchg 0.110552
ACC -0.063614
ACCP 0.002962
ROE 0.153399
ROA -0.058626
PA -0.222794
CT -0.031880
cash 0.001383
cashpr -0.010167
RD 0.056974
RDsales -0.004046
CR 0.177789
QR -0.111748
CFdebt 0.112152
salecash 0.002116
saleinv -0.002321
CRG -0.143290
QRG 0.056218
Name: 421, dtype: float64
# Check: intercept plus coefficient contributions also reproduces the prediction.
print(linear_model.intercept_, sum(linear_model.coef_ * data_for_prediction) + linear_model.intercept_)
1.3328549268786625 1.1796899805758476
SHAP的linear explainer 与线性模型中直接计算的权重不同,但相加的结果是相同的。如何解释这种现象?必须深入理解shap背后的原理
lightGBM的SHAP解释
官方文档:https://lightgbm.readthedocs.io/en/latest/
from lightgbm import LGBMRegressor
def lightGBM_train(date):
    """Train a LightGBM regressor on one day's data and return its test MSE.

    Bug fix: the prediction previously called the *global* XGBoost `model`
    instead of the freshly trained `gbm`, so the reported MSE measured the
    wrong estimator -- this is why the LightGBM MSE looked anomalously high.
    """
    # Train/test split for the requested day (state=0: missing values kept).
    x_train, x_test, y_train, y_test = get_split_by_trade_date(date)
    # Train with L1 as the monitored early-stopping metric.
    gbm = LGBMRegressor(objective='regression', num_leaves=31, learning_rate=0.05, n_estimators=50)
    gbm.fit(x_train, y_train, eval_set=[(x_test, y_test)], eval_metric='l1', early_stopping_rounds=5)
    # Predict with the estimator we just trained (was: model.predict).
    y_pred = gbm.predict(x_test)
    return prediction(y_pred, y_test.to_numpy(), False)
# Single-day LightGBM run for comparison with XGBoost and OLS.
lightGBM_train('2013-01-04')
x_train.shape=(1820, 56)
y_train.shape =(1820,)
x_test.shape=(455, 56)
y_test.shape=(455,)
[1] valid_0's l1: 1.65549 valid_0's l2: 5.77881
Training until validation scores don't improve for 5 rounds
[2] valid_0's l1: 1.64699 valid_0's l2: 5.71442
[3] valid_0's l1: 1.64027 valid_0's l2: 5.66422
[4] valid_0's l1: 1.63132 valid_0's l2: 5.60613
[5] valid_0's l1: 1.62461 valid_0's l2: 5.55388
[6] valid_0's l1: 1.62116 valid_0's l2: 5.52183
[7] valid_0's l1: 1.61607 valid_0's l2: 5.48249
[8] valid_0's l1: 1.61166 valid_0's l2: 5.44449
[9] valid_0's l1: 1.60571 valid_0's l2: 5.39704
[10] valid_0's l1: 1.60002 valid_0's l2: 5.36303
[11] valid_0's l1: 1.59866 valid_0's l2: 5.33837
[12] valid_0's l1: 1.59527 valid_0's l2: 5.31458
[13] valid_0's l1: 1.59401 valid_0's l2: 5.29896
[14] valid_0's l1: 1.59157 valid_0's l2: 5.27727
[15] valid_0's l1: 1.58855 valid_0's l2: 5.24443
[16] valid_0's l1: 1.58613 valid_0's l2: 5.22414
[17] valid_0's l1: 1.5817 valid_0's l2: 5.19576
[18] valid_0's l1: 1.57925 valid_0's l2: 5.17984
[19] valid_0's l1: 1.5779 valid_0's l2: 5.17388
[20] valid_0's l1: 1.57414 valid_0's l2: 5.15314
[21] valid_0's l1: 1.57326 valid_0's l2: 5.13235
[22] valid_0's l1: 1.56768 valid_0's l2: 5.09056
[23] valid_0's l1: 1.56799 valid_0's l2: 5.0894
[24] valid_0's l1: 1.56936 valid_0's l2: 5.08946
[25] valid_0's l1: 1.57107 valid_0's l2: 5.08751
[26] valid_0's l1: 1.5714 valid_0's l2: 5.08143
[27] valid_0's l1: 1.57045 valid_0's l2: 5.07271
Early stopping, best iteration is:
[22] valid_0's l1: 1.56768 valid_0's l2: 5.09056
MSE: 8.693856844554604
待解决的问题(4.8)
- XGBoost采用超参变换方法训练模型,如何提升准确率?(暂时不做)
- 阅读Shap论文,深入理解各种Kernel背后原理,分析为什么与线性模型分析的结果不一致
- 分析为什么lightGBM模型计算得出的MSE这么高,进行相应的调参(暂时不做)
- 根据permutation importance去除无用特征
- 分板块对股票分别进行回归,想想如何进行分板块可视化,如何跨板块对比? cross-sectional具体指的是什么?
评论区