import time
import numpy as np
import xgboost as xgb
from xgboost import plot_importance,plot_tree
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
%matplotlib inline

def dropun(X):
    """Return ``X`` without its ``Unnamed...`` columns.

    Columns whose label is a string starting with ``'Unnamed'`` are removed
    in a single ``drop`` call. Labels that are not strings (e.g. integers)
    are never treated as unnamed and are kept.

    Args:
        X: pandas DataFrame to clean.

    Returns:
        A new DataFrame without the unnamed columns; ``X`` is not modified.
    """
    unnamed = [
        label for label in X.columns
        if isinstance(label, str) and label.startswith('Unnamed')
    ]
    return X.drop(columns=unnamed)

def hist(L):
    """Draw one translucent, density-normalised histogram per item of ``L``.

    All histograms go onto the current matplotlib axes with the same
    settings (40 bins, filled steps, alpha 0.3).
    """
    style = {
        'histtype': 'stepfilled',
        'density': True,
        'alpha': 0.3,
        'bins': 40,
    }
    for sample in L:
        plt.hist(sample, **style)
        
def prediction(y_pred, y_test):
    """Print the RMSE of a prediction and plot it against the true values.

    Both inputs are converted to float arrays first, so any 1-D array-like
    works, including a pandas Series whose index is not 0..n-1.

    Args:
        y_pred: 1-D array-like of predicted values.
        y_test: 1-D array-like of true values, same length as ``y_pred``.

    Raises:
        ValueError: if the inputs are empty or their shapes differ.
    """
    predicted = np.asarray(y_pred, dtype=float)
    actual = np.asarray(y_test, dtype=float)
    if predicted.shape != actual.shape:
        raise ValueError(
            "y_pred and y_test must have the same shape, got "
            "{} and {}".format(predicted.shape, actual.shape)
        )
    if predicted.size == 0:
        raise ValueError("cannot compute RMSE of empty input")

    # Root mean squared error over the evaluated samples.
    rmse = np.sqrt(np.mean((predicted - actual) ** 2))
    print("RMSE by hand:", rmse)

    # Line plot of prediction vs. truth per sample position (not a ROC curve).
    positions = range(len(predicted))
    plt.figure()
    plt.plot(positions, predicted, 'b', label="predict")
    plt.plot(positions, actual, 'r', label="test")
    plt.legend(loc="upper right")  # show the series labels
# Load the factor table and the daily table, dropping "Unnamed" columns.
X = dropun(pd.read_csv('data/factors2013-0-2-1.csv'))
Y = dropun(pd.read_csv('data/daily2011-2017-1.csv'))

# Every column of X except the two key columns is a factor.
factors = X.columns.tolist()
factors.remove('ts_code')
factors.remove('trade_date')

# Sorted list of the distinct trade dates present in X.
days = sorted(set(X['trade_date']))

# Cross-section for one trading day, joined on the stock code.
x = X.loc[X['trade_date'] == '2013-03-01'].drop(columns=['trade_date'])
y = Y.loc[Y['trade_date'] == '2013-03-01'].drop(columns=['trade_date'])
z = pd.merge(x, y, on='ts_code')
x = z[factors]
y = z['yield']

# Split the data set
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=0
)
print(
    'x_train.shape={}\ny_train.shape ={}\nx_test.shape={}\ny_test.shape={}'.format(
        x_train.shape, y_train.shape, x_test.shape, y_test.shape
    )
)

基于XGBoost原生接口的回归

由于每天的都要生成一个因子到收益率的XGBoost,每个XGBoost由上百个回归树组成

# Split the data set (80% train / 20% test, fixed seed)
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=0
)
print(
    'x_train.shape={}\ny_train.shape ={}\nx_test.shape={}\ny_test.shape={}'.format(
        x_train.shape, y_train.shape, x_test.shape, y_test.shape
    )
)
x_train.shape=(1812, 56)
y_train.shape =(1812,)
x_test.shape=(454, 56)
y_test.shape=(454,)
# Training parameters.
# Disabled alternatives, kept for reference:
#   'objective': 'reg:gamma'  -- gamma regression needs non-negative labels
#                                (not suitable here)
#   'gamma': 0.1, 'silent': 1, 'nthread': 4
params = {
    'booster': 'gbtree',
    'objective': 'reg:squarederror',
    'max_depth': 5,
    'lambda': 3,
    'subsample': 0.7,
    'colsample_bytree': 0.7,
    'min_child_weight': 3,
    'eta': 0.1,
    'seed': 1000,
}
# The same settings as a list of (name, value) pairs.
plst = [*params.items()]

注意这里的损失函数!

$$RMSE = \sqrt{\frac{1}{n}\sum_{i=1}^{n}(y_i-\hat{y}_i)^2}$$

$$RMSLE = \sqrt{\frac{1}{n}\sum_{i=1}^{n}\left(\log(y_i+1)-\log(\hat{y}_i+1)\right)^2}$$

  1. RMSLE 惩罚欠预测大于过预测,适用于某些需要欠预测损失更大的场景
  2. 如果预测的值的范围很大,RMSE 会被一些大的值主导。这样即使你很多小的值预测准了,但是有一个非常大的值预测的不准确,RMSE 就会很大

XGBoost支持的回归损失函数如下:

reg:squarederror:损失平方回归。

reg:squaredlogerror:对数平方损失回归,损失函数为 $\frac{1}{2}\left[\log(pred+1)-\log(label+1)\right]^2$。所有输入标签都必须大于-1。另外,请参阅指标rmsle以了解此目标可能存在的问题。

reg:logistic:逻辑回归

reg:pseudohubererror:使用伪Huber损失进行回归,它是绝对损失的一种二阶可微的替代方案。

reg:gamma:使用对数链接进行伽马回归。输出是伽马分布的平均值。例如,对于建模保险索赔的严重性或对可能是伽马分布的任何结果,它可能是有用的。

reg:tweedie:使用对数链接进行Tweedie回归。它可能有用,例如,用于建模保险的总损失,或用于可能是Tweedie-distributed的任何结果。

# Convert the data into XGBoost DMatrix objects
dtrain = xgb.DMatrix(x_train, label=y_train, feature_names=factors)
dtest = xgb.DMatrix(x_test, feature_names=factors)
dx = xgb.DMatrix(x, feature_names=factors)
# Train the model
num_rounds = 50
model = xgb.train(plst, dtrain, num_boost_round=num_rounds)
# Predict on the held-out set and report the error
y_pred = model.predict(dtest)
prediction(y_pred, y_test.to_numpy())
RMSE by hand: 0.016748246844114766

png

# Show feature importance (importance_type="weight")
fig, ax = plt.subplots(figsize=(15, 15))
plot_importance(model, ax=ax, importance_type="weight")
<AxesSubplot:title={'center':'Feature importance'}, xlabel='F score', ylabel='Features'>


png

# Visualise some of the fitted trees; num_trees is the index of the tree

trees = num_rounds // 3
size = 10
fig = plt.figure(figsize=(size * trees, size))  # one square panel per tree
ax = [fig.add_subplot(1, trees, position) for position in range(1, trees + 1)]
plt.tight_layout()

# Panel k shows the tree with index trees + k.
for offset, panel in enumerate(ax):
    tree_index = trees + offset
    panel.set_title('tree %d' % tree_index, fontsize=size * 3, color='r')
    plot_tree(model, ax=panel, num_trees=tree_index)

plt.show()

png

# Write the base learners to a text file and save the model as JSON
import os

# Make sure the target directory exists before writing into it.
os.makedirs("output", exist_ok=True)
model.dump_model("output/model.txt")
model.save_model("output/model.json")

XGBoost预测过程分析与重现

使用平均值策略进行因子收益率计算

手动通过model.txt构造森林,将子节点的平均值当做中间节点的预测值

# Factor values indexed by stock code. The raw values are passed instead of
# the DataFrame itself: handing pandas a DataFrame together with a new index
# reindexes it, and x's 0..n-1 row labels never match the stock codes, so
# every cell would become NaN.
x_in = pd.DataFrame(np.asarray(x), columns=factors, index=z['ts_code'])
# Accumulator with the same shape and labels as x_in, filled by Node.predict.
x_out = pd.DataFrame(np.zeros(x_in.shape), index=x_in.index, columns=x_in.columns)
# Read the text dump; the context manager closes the file handle.
with open('output/model.txt', encoding='utf-8') as f:
    lines = f.read()
# One chunk per tree; the first chunk (before "booster[0]") is empty.
boosters = lines.split('booster')
class Node:
    """One node of a regression tree rebuilt from an XGBoost text dump.

    A leaf stores the dumped leaf value. An internal node stores the split
    feature (``key``), the threshold (``condition``), its ``yes`` / ``no``
    children, the child used for missing values (``missing``), and a
    ``value`` defined as the plain average of its two children's values.
    """

    def __init__(self, string):
        """Parse the dump text of a (sub)tree whose first line is this node.

        Args:
            string: text such as ``"0:[f<1.5] yes=1,no=2,missing=1\\n\\t1:leaf=..."``.

        Raises:
            ValueError: if the text does not have the expected layout.
        """
        self.yes = None
        self.no = None
        self.leaf = False

        # Strip first, then cut the "<node id>:" prefix. Doing both in one
        # expression would compute the ':' offset on the unstripped text.
        string = string.strip()
        string = string[string.find(':') + 1:]

        if string[:4] == 'leaf':
            self.leaf = True
            # Keep only the first field so "leaf=...,cover=..." also parses.
            self.value = float(string[5:].split(',')[0])
            return

        p1 = string.find(']')
        p2 = string.find('\n')
        if p1 == -1 or p2 == -1:
            raise ValueError('malformed split node: %r' % string[:80])

        self.key, _, condition = string[1:p1].rpartition('<')
        self.condition = float(condition)

        # "yes=1,no=2,missing=1[,gain=...,cover=...]" -> {'yes': '1', ...}
        try:
            fields = dict(
                item.split('=', 1) for item in string[p1 + 2:p2].split(',')
            )
            yes, no, missing = fields['yes'], fields['no'], fields['missing']
        except (KeyError, ValueError):
            raise ValueError('malformed split node: %r' % string[:p2])

        # Searching for "5:" alone could also hit "36:"; the leading tab
        # anchors the match to the start of a node id.
        p3 = string.find('\t' + yes + ':')
        p4 = string.find('\t' + no + ':')
        if p3 == -1 or p4 == -1:
            raise ValueError('children of split on %r not found' % self.key)

        self.yes = Node(string[p3:p4])
        self.no = Node(string[p4:])
        self.missing = self.yes if missing == yes else self.no

        # Both children are fully built (values included) at this point, so
        # the average can be taken directly without re-walking the subtree.
        self.value = (self.yes.value + self.no.value) / 2

    def print(self, table=0):
        """Print the subtree, one node per line, indented by ``table`` tabs."""
        print('\t' * table, end='')
        if self.leaf:
            print(self.value)
        else:
            print(self.key, '<', self.condition)

            self.yes.print(table + 1)
            self.no.print(table + 1)

    def calculate(self):
        """Recompute ``value`` bottom-up as the average of the two children."""
        if not self.leaf:
            self.yes.calculate()
            self.no.calculate()
            self.value = (self.yes.value + self.no.value) / 2

    def predict(self, x_in, x_out, mask=None):
        """Add this subtree's per-feature contributions into ``x_out``.

        For every row that reaches this node, ``child.value - self.value``
        of the child the row moves to is added to the column of the split
        feature. Rows only continue into the child they were routed to, so
        for each row the contributions of one tree add up to
        ``leaf value - root value``.

        Args:
            x_in: DataFrame of feature values (may contain NaN).
            x_out: DataFrame with the same columns and the same row order
                as ``x_in``; updated in place.
            mask: optional boolean array-like, one entry per row of
                ``x_in``, marking the rows that reach this node. ``None``
                means all rows.
        """
        if self.leaf:
            return

        if mask is None:
            mask = np.ones(len(x_in), dtype=bool)
        else:
            mask = np.asarray(mask, dtype=bool)

        column = x_in[self.key]
        is_missing = column.isnull().to_numpy()
        # Comparisons with NaN are False, so missing rows match neither side.
        goes_yes = (column < self.condition).to_numpy()
        goes_no = (column >= self.condition).to_numpy()
        if self.missing is self.yes:
            goes_yes = goes_yes | is_missing
        else:
            goes_no = goes_no | is_missing

        yes_rows = goes_yes & mask
        no_rows = goes_no & mask

        # .loc writes into x_out itself; x_out[col][rows] += ... is chained
        # assignment and may update a temporary copy instead.
        x_out.loc[yes_rows, self.key] += self.yes.value - self.value
        x_out.loc[no_rows, self.key] += self.no.value - self.value

        self.yes.predict(x_in, x_out, yes_rows)
        self.no.predict(x_in, x_out, no_rows)
# Build one Node tree per non-empty chunk of the dump.
forest = []
for booster in boosters:
    if not len(booster) > 0:
        continue
    # A chunk looks like "[<index>]:\n<tree text>".
    pre, suf = booster.split(":\n")
    order = int(pre[1:-1])
    forest.append(Node(suf))

# Print every reconstructed tree.
for tree in forest:
    tree.print()
-0.339819431
-0.213186726
retnmax < 0.118408218
	-0.161993876
	-0.137030452
age < -0.70818764
	-0.10697522
	std_turn < 2.68779325
		-0.126113519
		-0.0452805348
BM < -0.351566017
	-0.0852892846
	-0.0996474773
LEV < -0.188691437
	mom12 < 2.79354906
		std_turn < 1.37936687
			-0.0740301535
			mom12 < 1.57889247
				-0.0593126193
				-0.0126145324
		-0.0281936433
	-0.0852679908
retnmax < -0.37396872
	-0.0690938756
	beta < 1.34343338
		-0.0556405373
		-0.0766395554
size < 0.774105906
	mom12 < 3.12097263
		age < 0.192883372
			mom6 < 0.663088202
				SG < 3.347785
					-0.0508309864
					-0.011613722
				-0.0361237191
			-0.0586737283
		-0.00810176786
	LEV < -0.673452616
		-0.0240357164
		-0.0723168179
LEV < 0.595337152
	retnmax < 0.167033881
		mom12 < 0.808440983
			size < -0.418863624
				-0.0427704081
				PA < -1.88745999
					-0.0181995761
					-0.0514330529
			-0.0342022255
		age < 1.62103617
			age < 0.698003411
				saleinv < 0.350993812
					-0.0365095213
					-0.0186509993
				ROA < -1.5740701
					-0.0175514277
					-0.0503382087
			beta < 0.371157259
				-0.0196212586
				0.00141685177
	-0.057177186
size < 0.296797454
	age < -0.928444743
		CFP < 0.142466873
			SP < -0.667316198
				-0.00549633615
				-0.0290522128
			coskew < 0.810310841
				retnmax < 1.70913982
					-0.0253706817
					0.00163265876
				-0.0395192169
		retnmax < 1.69785523
			-0.0421992429
			momchg < -0.378071725
				SG < 0.00142598525
					-0.0153430644
					0.00567442458
				size < 0.12059468
					-0.0359715335
					-0.00470872456
	-0.0500659905
size < 0.819216728
	retnmax < 0.137323022
		LEV < -0.625233769
			-0.0272775721
			size < -0.294304788
				-0.0336558074
				-0.0410611555
		DP < -0.497044444
			DP < -0.497973949
				QRG < 0.401603341
					-0.0209637433
					-0.0348131545
				0.0035358686
			SP < -0.645963609
				-0.0136125823
				BVEG < -0.77666086
					-0.00697107334
					-0.0343943052
	-0.0441419296
...
PMG < 0.709713936
	retnmax < -0.514937043
		-0.00107836875
		size < -0.442181051
			LG < -1.01129258
				-0.00935130008
				total_vol < -0.373696357
					-0.00364476419
					0.00135123439
			beta < 0.801499724
				QR < -0.303542674
					0.00493162172
					0.000802078925
				mom6 < 1.02592838
					-0.0031901896
					0.00473507354
	PMG < 1.17458177
		total_vol < -0.700189233
			0.00561590493
			skew < 0.463339239
				-0.0109476913
				0.00587760611
		skew < -0.0456393585
			0.00385934417
			-0.00550035806
# Baseline of the decomposition: the root value of every tree. For a tree
# that is a single leaf this is the leaf value; for any other tree it is the
# averaged root value, which the per-feature contributions are measured
# against. Skipping those roots leaves contributions + bias short of the
# sum of the trees' outputs.
# NOTE(review): XGBoost's own prediction presumably also adds the model's
# base_score, which is not part of the text dump -- confirm before comparing
# against model.predict.
bias = 0
for tree in forest:
    bias += tree.value
    tree.predict(x_in, x_out)
bias
-0.553006157
# Total contribution of the factor columns only (bias excluded).
x_out['sum_nbias'] = x_out[factors].sum(axis=1)
x_out['bias'] = bias
# Add the two totals explicitly. Summing the whole row again would count the
# factor contributions twice, because 'sum_nbias' is itself a column by now.
x_out['sum'] = x_out['sum_nbias'] + x_out['bias']
# x_out.to_csv('output/x_out.csv')
x_out['pred'] = model.predict(dx)
# y carries the 0..n-1 row labels of z while x_out is labelled by stock code;
# assign by position so the values are not lost to index alignment.
x_out['real'] = y.to_numpy()
x_out
size beta betad idvol total_vol idskew skew coskew turn std_turn ... CFdebt salecash saleinv CRG QRG sum_nbias bias sum pred real
ts_code
000001.SZ -0.017619 -0.002868 -0.021627 -0.007686 -0.003678 -0.003118 -0.010287 0.024998 -0.019226 -0.049461 ... 0.0 -0.006322 0.025936 0.012437 0.009742 -0.439190 -0.553006 -1.431387 0.104325 0.105669
000002.SZ -0.017619 -0.002868 -0.021627 -0.007686 0.003733 -0.005892 -0.040134 -0.008461 -0.026985 -0.064061 ... 0.0 -0.013059 0.012374 0.016209 0.009742 -0.662169 -0.553006 -1.877344 0.100761 0.102498
000004.SZ 0.017619 -0.002868 -0.016653 -0.007686 0.003733 -0.003118 -0.018879 0.024998 -0.019390 -0.046201 ... 0.0 -0.013059 -0.002083 -0.020566 -0.016658 -0.345697 -0.553006 -1.244400 0.113855 0.103976
000005.SZ 0.049197 -0.002868 -0.021627 -0.007686 0.003733 -0.007874 -0.031936 0.024998 -0.026985 -0.064061 ... 0.0 -0.006322 0.007675 0.016209 0.000392 -0.443058 -0.553006 -1.439122 0.111119 0.103333
000006.SZ 0.018288 -0.002868 -0.021627 -0.001085 -0.018519 0.014659 -0.010287 0.024998 -0.033661 -0.064061 ... 0.0 -0.013059 -0.016981 -0.004281 -0.016658 -0.662001 -0.553006 -1.877009 0.100056 0.096154
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
603333.SH 0.041927 -0.007386 -0.016653 0.007686 -0.018519 0.014659 -0.015111 0.024998 -0.001425 -0.061444 ... 0.0 -0.001750 0.001713 -0.013561 -0.016658 -0.583327 -0.553006 -1.719660 0.103489 0.110969
603366.SH 0.018288 0.032375 -0.016653 -0.007686 0.003733 -0.003118 0.015111 -0.018324 0.012068 -0.049461 ... 0.0 -0.013059 0.001713 -0.020566 -0.016658 -0.576422 -0.553006 -1.705850 0.109348 0.093337
603399.SH 0.030195 0.004059 0.018798 0.007686 -0.018519 -0.003522 -0.004496 -0.016262 0.041272 0.064061 ... 0.0 -0.001750 0.009731 -0.020566 -0.016658 -0.364569 -0.553006 -1.282144 0.104743 0.097743
603766.SH 0.018288 -0.007386 0.004513 -0.001085 -0.018519 -0.003118 -0.010287 -0.018324 0.012068 -0.061444 ... 0.0 -0.001750 0.025936 -0.013561 -0.016658 -0.727473 -0.553006 -2.007952 0.104064 0.112458
603993.SH -0.017619 0.004059 0.018798 0.007686 -0.018519 -0.003522 -0.004496 -0.016262 0.041272 0.064061 ... 0.0 -0.013059 0.007675 0.004281 0.012854 -0.221143 -0.553006 -0.995292 0.105327 0.096750

2266 rows × 61 columns

# Horizontal bars of each factor's contribution for the first 20 stocks.
# Every factor is drawn on the same rows, so the bars overlap.
x_demo = x_out[:20]
fig = plt.figure(figsize=(20, 20))
for factor in factors:
    plt.barh(x_demo.index, x_demo[factor], align="center", label=factor)

# Axis labels and legend
plt.xlabel("yield")
plt.ylabel("stock")
plt.legend()
# Render the figure (plt.plot() without arguments draws nothing).
plt.show()

png

遇到且待解决的问题(3.25)

  1. 预处理中因变量可以尝试k日平均收益率

  2. 手动编程使用XGBoost回归树得到的结果为什么跟使用接口不一致

  3. 能否使用回归树分裂前的权值作为因子贡献的一部分,而不是使用平均值(必须要深入源码了解树的分裂过程,保存贡献值)

  4. 未来考虑使用lightGBM可以对因子进行聚合,但是能否保持模型中因子的可解释性

预测流程探索

XGBoost官方文档:https://xgboost.readthedocs.io/en/latest/

核心预测代码:

_check_call(_LIB.XGBoosterPredict(self.handle, data.handle,
                                  ctypes.c_int(option_mask),
                                  ctypes.c_uint(ntree_limit),
                                  ctypes.c_int(training),
                                  ctypes.byref(length),
                                  ctypes.byref(preds)))
preds = ctypes2numpy(preds, length.value, np.float32)

在predict函数中,调用了_LIB.XGBoosterPredict,其中_LIB = _load_lib(),而该库是从C++编译的dll中导入的:lib = ctypes.cdll.LoadLibrary(lib_path),因此这一函数的内部实现细节在Python层面是不可见的

先看看preds输出的结果是什么?

image-20210330205920250

一个297维的向量,为什么是297?因为预测的自变量是x_test,可以发现,这里的preds正是XGBoost对于自变量预测的结果。经过检验数值上是一致的。

x_train.shape=(1186, 56)
y_train.shape =(1186,)
x_test.shape=(297, 56)
y_test.shape=(297,)

参数pred_leaf

会跟着option_mask传入预测dll函数中,最后出来的结果也不再是297维变量,而是14850=297*50维,每个样例在每棵树都会预测一个叶结点编号出来(0-48的整数,不同树重新编号)

When this option is on, the output will be a matrix of (nsample, ntrees) with each record indicating the predicted leaf index of each sample in each tree. Note that the leaf index of a tree is unique per tree, so you may find leaf 1 in both tree 1 and tree 0.

# Leaf index reached by every sample in every tree
y_pred2 = model.predict(dx, pred_leaf=True)
df_pred2 = pd.DataFrame(y_pred2)
print(y_pred2.shape)
df_pred2
(2266, 50)
0 1 2 3 4 5 6 7 8 9 ... 40 41 42 43 44 45 46 47 48 49
0 0 0 2 3 2 2 3 6 2 2 ... 15 48 4 34 41 5 46 19 5 42
1 0 0 2 3 2 2 3 6 2 2 ... 15 12 4 39 41 5 27 10 5 42
2 0 0 2 3 1 9 3 8 15 27 ... 8 47 18 44 3 5 46 19 38 19
3 0 0 1 3 1 5 1 8 18 9 ... 8 6 17 17 3 5 27 3 37 3
4 0 0 1 3 2 2 3 8 18 9 ... 15 48 17 17 41 5 27 3 10 41
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
2261 0 0 1 1 2 2 1 11 18 21 ... 6 44 31 1 42 5 5 3 10 3
2262 0 0 1 1 2 5 1 11 18 16 ... 31 43 31 36 42 5 42 30 10 3
2263 0 0 2 1 1 2 3 11 21 16 ... 32 12 34 44 41 5 42 19 38 43
2264 0 0 1 1 2 2 1 11 2 16 ... 6 12 31 34 41 5 42 28 38 3
2265 0 0 2 1 1 2 3 6 21 2 ... 32 41 14 34 2 5 41 39 10 43

2266 rows × 50 columns

参数pred_contribs

对于每个样例的每个特征,都会输出一个名为SHAP value的东西,并且其加和正是预测值!(SHAP的目标就是通过计算x中每一个特征对prediction的贡献, 来对模型判断结果的解释)

When this is True the output will be a matrix of size (nsample, nfeats + 1) with each record indicating the feature contributions (SHAP values) for that prediction. The sum of all feature contributions is equal to the raw untransformed margin value of the prediction. Note the final column is the bias term.

# Per-feature contributions (plus a final bias column) for every sample
y_pred3 = model.predict(dx, pred_contribs=True)
df_pred3 = pd.DataFrame(y_pred3)
print(y_pred3.shape)
df_pred3
(2266, 57)
0 1 2 3 4 5 6 7 8 9 ... 47 48 49 50 51 52 53 54 55 56
0 -0.047273 0.001736 -0.000122 -0.003356 0.002849 0.002284 0.004681 0.000205 0.002105 0.002946 ... 0.000803 0.006890 -0.000339 0.009462 0.0 -0.001191 0.004691 0.000357 0.000622 -2.176252
1 -0.034986 0.002293 -0.002439 -0.000738 0.001690 -0.000475 -0.002560 -0.002315 0.001196 -0.008172 ... -0.000057 -0.008880 0.000033 0.011063 0.0 0.000058 -0.000520 0.000447 0.000232 -2.176252
2 0.001372 -0.001824 -0.000318 -0.000932 0.000668 0.000749 -0.002135 0.000687 0.006122 0.000180 ... -0.000265 0.004493 -0.000468 0.005895 0.0 0.000445 -0.003008 -0.004827 -0.014895 -2.176252
3 0.017375 0.000784 -0.000551 -0.000329 0.000815 -0.001845 -0.008944 0.001427 -0.001551 -0.005962 ... -0.000242 0.004187 0.000360 0.006285 0.0 -0.000337 0.002553 0.000571 0.000212 -2.176252
4 -0.005721 -0.002398 0.000193 0.001004 -0.001762 -0.012407 0.002866 0.000676 -0.001678 -0.005552 ... -0.000120 -0.014903 -0.000153 0.002838 0.0 0.000857 -0.025664 -0.001489 -0.007282 -2.176252
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
2261 0.007847 -0.020793 -0.002197 0.000430 -0.004791 -0.018859 -0.018088 0.008974 0.002019 -0.001337 ... 0.000113 0.001505 -0.000203 0.005854 0.0 -0.000238 -0.001581 -0.001401 -0.006518 -2.176252
2262 -0.003991 0.003878 -0.001931 -0.001025 0.000649 0.003373 0.008426 -0.001203 -0.001328 -0.001676 ... 0.000323 -0.003365 -0.000203 0.005460 0.0 -0.001744 0.001759 -0.005778 -0.005245 -2.176252
2263 0.003321 -0.019329 0.001792 0.001174 -0.000878 -0.005546 -0.004332 -0.004847 0.006301 0.033037 ... -0.000120 -0.010854 -0.000604 -0.000552 0.0 -0.000698 0.000789 -0.012464 -0.010693 -2.176252
2264 -0.002582 -0.018233 -0.000616 -0.000100 -0.000274 0.003815 0.005402 -0.000236 0.006145 -0.002779 ... 0.000176 -0.002101 -0.000203 -0.000215 0.0 -0.000672 0.003693 -0.000730 -0.006229 -2.176252
2265 -0.057220 -0.015599 0.002475 0.003488 -0.000898 -0.005099 -0.003772 0.001490 0.005088 0.029954 ... -0.000331 -0.000212 -0.000354 0.006528 0.0 -0.001124 0.000126 -0.001942 -0.004091 -2.176252

2266 rows × 57 columns

# The bias term is the column right after the feature columns, so its label
# is the number of factors; derive it instead of hard-coding 56.
bias_column = len(factors)
bias1 = df_pred3.loc[0, bias_column]
print(bias1)
df_pred3 = df_pred3.drop(columns=[bias_column])
-2.1762524
# Row-wise total of the remaining (feature) columns, then add the bias back.
df_pred3['sum_nbias'] = df_pred3.sum(axis=1)
df_pred3['sum'] = bias1 + df_pred3['sum_nbias']
df_pred3['pred'] = model.predict(dx)
df_pred3['real'] = y
df_pred3
0 1 2 3 4 5 6 7 8 9 ... 50 51 52 53 54 55 sum_nbias sum pred real
0 -0.047273 0.001736 -0.000122 -0.003356 0.002849 0.002284 0.004681 0.000205 0.002105 0.002946 ... 0.009462 0.0 -0.001191 0.004691 0.000357 0.000622 -0.083990 -2.260242 0.104325 0.105669
1 -0.034986 0.002293 -0.002439 -0.000738 0.001690 -0.000475 -0.002560 -0.002315 0.001196 -0.008172 ... 0.011063 0.0 0.000058 -0.000520 0.000447 0.000232 -0.118753 -2.295005 0.100761 0.102498
2 0.001372 -0.001824 -0.000318 -0.000932 0.000668 0.000749 -0.002135 0.000687 0.006122 0.000180 ... 0.005895 0.0 0.000445 -0.003008 -0.004827 -0.014895 0.003420 -2.172832 0.113855 0.103976
3 0.017375 0.000784 -0.000551 -0.000329 0.000815 -0.001845 -0.008944 0.001427 -0.001551 -0.005962 ... 0.006285 0.0 -0.000337 0.002553 0.000571 0.000212 -0.020898 -2.197151 0.111119 0.103333
4 -0.005721 -0.002398 0.000193 0.001004 -0.001762 -0.012407 0.002866 0.000676 -0.001678 -0.005552 ... 0.002838 0.0 0.000857 -0.025664 -0.001489 -0.007282 -0.125771 -2.302023 0.100056 0.096154
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
2261 0.007847 -0.020793 -0.002197 0.000430 -0.004791 -0.018859 -0.018088 0.008974 0.002019 -0.001337 ... 0.005854 0.0 -0.000238 -0.001581 -0.001401 -0.006518 -0.092035 -2.268287 0.103489 0.110969
2262 -0.003991 0.003878 -0.001931 -0.001025 0.000649 0.003373 0.008426 -0.001203 -0.001328 -0.001676 ... 0.005460 0.0 -0.001744 0.001759 -0.005778 -0.005245 -0.036967 -2.213220 0.109348 0.093337
2263 0.003321 -0.019329 0.001792 0.001174 -0.000878 -0.005546 -0.004332 -0.004847 0.006301 0.033037 ... -0.000552 0.0 -0.000698 0.000789 -0.012464 -0.010693 -0.079994 -2.256246 0.104743 0.097743
2264 -0.002582 -0.018233 -0.000616 -0.000100 -0.000274 0.003815 0.005402 -0.000236 0.006145 -0.002779 ... -0.000215 0.0 -0.000672 0.003693 -0.000730 -0.006229 -0.086498 -2.262750 0.104064 0.112458
2265 -0.057220 -0.015599 0.002475 0.003488 -0.000898 -0.005099 -0.003772 0.001490 0.005088 0.029954 ... 0.006528 0.0 -0.001124 0.000126 -0.001942 -0.004091 -0.074438 -2.250690 0.105327 0.096750

2266 rows × 60 columns

# Side-by-side view: the hand-built totals next to the pred_contribs totals.
df = x_out.drop(columns=factors)
df = df.assign(
    sum_nbias1=df_pred3['sum_nbias'].to_numpy(),
    bias1=bias1,
    sum1=df_pred3['sum'].to_numpy(),
)
df
sum_nbias bias sum pred real sum_nbias1 bias1 sum1
ts_code
000001.SZ -0.439190 -0.553006 -1.431387 0.104325 0.105669 -0.083990 -2.176252 -2.260242
000002.SZ -0.662169 -0.553006 -1.877344 0.100761 0.102498 -0.118753 -2.176252 -2.295005
000004.SZ -0.345697 -0.553006 -1.244400 0.113855 0.103976 0.003420 -2.176252 -2.172832
000005.SZ -0.443058 -0.553006 -1.439122 0.111119 0.103333 -0.020898 -2.176252 -2.197151
000006.SZ -0.662001 -0.553006 -1.877009 0.100056 0.096154 -0.125771 -2.176252 -2.302023
... ... ... ... ... ... ... ... ...
603333.SH -0.583327 -0.553006 -1.719660 0.103489 0.110969 -0.092035 -2.176252 -2.268287
603366.SH -0.576422 -0.553006 -1.705850 0.109348 0.093337 -0.036967 -2.176252 -2.213220
603399.SH -0.364569 -0.553006 -1.282144 0.104743 0.097743 -0.079994 -2.176252 -2.256246
603766.SH -0.727473 -0.553006 -2.007952 0.104064 0.112458 -0.086498 -2.176252 -2.262750
603993.SH -0.221143 -0.553006 -0.995292 0.105327 0.096750 -0.074438 -2.176252 -2.250690

2266 rows × 8 columns


使用RandomForest解释

参考链接:模型可解释性kaggle教程:https://www.kaggle.com/dansbecker/use-cases-for-model-insights

人们常说机器学习模型是“黑盒”:预测效果较好,却无法让人理解预测背后的原因。但是通过SHAP等方法可以从复杂机器学习模型中发掘一些insight:

  • 哪些特征模型认为最重要?Permutation Importance
  • 在一次预测中,单一特征如何影响预测的?Partial Plots
  • 宏观上,每个特征是怎么影响模型预测的?SHAP, Summary Plots
  • 特征之间是如何相互影响的? Dependence Contribution Plots

Feature Engineering: repeatedly creating new features using transformations of your raw data or features you have previously created.

Informing Human Decision-Making: insights > predictions

Building Trust: showing insights that fit their general understanding of the problem will help build trust

下面以RandomForest作为模型进行预测并解释,API:https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html

import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

# Keep complete rows only.
z1 = z.dropna()
x1 = z1[factors]
# Scale the yield by 100: larger values display better in the plots below.
y1 = z1['yield'] * 100
# Split the data set
x1_train, x1_test, y1_train, y1_test = train_test_split(
    x1, y1, test_size=0.2, random_state=1
)
print(
    'x1_train.shape={}\ny1_train.shape ={}\nx1_test.shape={}\ny1_test.shape={}'.format(
        x1_train.shape, y1_train.shape, x1_test.shape, y1_test.shape
    )
)

# n_estimators is left at the library default.
my_model = RandomForestRegressor(max_depth=5, random_state=1)
my_model = my_model.fit(x1_train, y1_train)
x1_train.shape=(1192, 56)
y1_train.shape =(1192,)
x1_test.shape=(298, 56)
y1_test.shape=(298,)
# Predict on the held-out set, then print the RMSE and plot the comparison.
y1_pred = my_model.predict(x1_test)
y1_true = y1_test.to_numpy()
prediction(y1_pred, y1_true)
RMSE by hand: 1.716301821536559

png

from sklearn import tree
import graphviz
from IPython.display import Image
import pydotplus

# Render one tree of the forest (estimator 3) as a PNG.
# A loop over my_model.estimators_[:3] would visualise three trees instead.
dot_data = tree.export_graphviz(
    my_model.estimators_[3],
    out_file=None,
    feature_names=factors,
    filled=True,
    rounded=True,
)
graph = pydotplus.graph_from_dot_data(dot_data)
Image(graph.create_png())

下载

Permutation Importance(排序重要性)

eli5官方API:https://eli5.readthedocs.io/en/latest/tutorials/index.html

在模型训练完成后进行,随机打乱某一特征各个样例的数据,再进行预测,准确率会降低吗?

Shuffle

import eli5
from eli5.sklearn import PermutationImportance

# Permutation importance on the held-out set. Every factor is listed, so the
# row limit follows the number of factors instead of a hard-coded 56.
perm = PermutationImportance(my_model, random_state=1).fit(x1_test, y1_test)
eli5.show_weights(perm, feature_names=factors, top=len(factors))
<tr style="background-color: hsl(120, 100.00%, 80.00%); border: none;">
    <td style="padding: 0 1em 0 0.5em; text-align: right; border: none;">
        0.0820

            &plusmn; 0.0320

    </td>
    <td style="padding: 0 0.5em 0 0.5em; text-align: left; border: none;">
        retnmax
    </td>
</tr>

<tr style="background-color: hsl(120, 100.00%, 90.36%); border: none;">
    <td style="padding: 0 1em 0 0.5em; text-align: right; border: none;">
        0.0289

            &plusmn; 0.0207

    </td>
    <td style="padding: 0 0.5em 0 0.5em; text-align: left; border: none;">
        beta
    </td>
</tr>

<tr style="background-color: hsl(120, 100.00%, 93.23%); border: none;">
    <td style="padding: 0 1em 0 0.5em; text-align: right; border: none;">
        0.0175

            &plusmn; 0.0067

    </td>
    <td style="padding: 0 0.5em 0 0.5em; text-align: left; border: none;">
        size
    </td>
</tr>

<tr style="background-color: hsl(120, 100.00%, 94.67%); border: none;">
    <td style="padding: 0 1em 0 0.5em; text-align: right; border: none;">
        0.0124

            &plusmn; 0.0181

    </td>
    <td style="padding: 0 0.5em 0 0.5em; text-align: left; border: none;">
        DP
    </td>
</tr>

<tr style="background-color: hsl(120, 100.00%, 95.10%); border: none;">
    <td style="padding: 0 1em 0 0.5em; text-align: right; border: none;">
        0.0110

            &plusmn; 0.0029

    </td>
    <td style="padding: 0 0.5em 0 0.5em; text-align: left; border: none;">
        LEV
    </td>
</tr>

<tr style="background-color: hsl(120, 100.00%, 95.26%); border: none;">
    <td style="padding: 0 1em 0 0.5em; text-align: right; border: none;">
        0.0105

            &plusmn; 0.0103

    </td>
    <td style="padding: 0 0.5em 0 0.5em; text-align: left; border: none;">
        cashpr
    </td>
</tr>

<tr style="background-color: hsl(120, 100.00%, 95.43%); border: none;">
    <td style="padding: 0 1em 0 0.5em; text-align: right; border: none;">
        0.0100

            &plusmn; 0.0091

    </td>
    <td style="padding: 0 0.5em 0 0.5em; text-align: left; border: none;">
        SP
    </td>
</tr>

<tr style="background-color: hsl(120, 100.00%, 95.98%); border: none;">
    <td style="padding: 0 1em 0 0.5em; text-align: right; border: none;">
        0.0083

            &plusmn; 0.0211

    </td>
    <td style="padding: 0 0.5em 0 0.5em; text-align: left; border: none;">
        BM
    </td>
</tr>

<tr style="background-color: hsl(120, 100.00%, 96.08%); border: none;">
    <td style="padding: 0 1em 0 0.5em; text-align: right; border: none;">
        0.0080

            &plusmn; 0.0041

    </td>
    <td style="padding: 0 0.5em 0 0.5em; text-align: left; border: none;">
        LG
    </td>
</tr>

<tr style="background-color: hsl(120, 100.00%, 96.13%); border: none;">
    <td style="padding: 0 1em 0 0.5em; text-align: right; border: none;">
        0.0079

            &plusmn; 0.0089

    </td>
    <td style="padding: 0 0.5em 0 0.5em; text-align: left; border: none;">
        CR
    </td>
</tr>

<tr style="background-color: hsl(120, 100.00%, 96.42%); border: none;">
    <td style="padding: 0 1em 0 0.5em; text-align: right; border: none;">
        0.0070

            &plusmn; 0.0084

    </td>
    <td style="padding: 0 0.5em 0 0.5em; text-align: left; border: none;">
        AM
    </td>
</tr>

<tr style="background-color: hsl(120, 100.00%, 96.47%); border: none;">
    <td style="padding: 0 1em 0 0.5em; text-align: right; border: none;">
        0.0069

            &plusmn; 0.0087

    </td>
    <td style="padding: 0 0.5em 0 0.5em; text-align: left; border: none;">
        mom12
    </td>
</tr>

<tr style="background-color: hsl(120, 100.00%, 96.58%); border: none;">
    <td style="padding: 0 1em 0 0.5em; text-align: right; border: none;">
        0.0066

            &plusmn; 0.0057

    </td>
    <td style="padding: 0 0.5em 0 0.5em; text-align: left; border: none;">
        AG
    </td>
</tr>

<tr style="background-color: hsl(120, 100.00%, 96.62%); border: none;">
    <td style="padding: 0 1em 0 0.5em; text-align: right; border: none;">
        0.0065

            &plusmn; 0.0097

    </td>
    <td style="padding: 0 0.5em 0 0.5em; text-align: left; border: none;">
        QRG
    </td>
</tr>

<tr style="background-color: hsl(120, 100.00%, 96.72%); border: none;">
    <td style="padding: 0 1em 0 0.5em; text-align: right; border: none;">
        0.0062

            &plusmn; 0.0119

    </td>
    <td style="padding: 0 0.5em 0 0.5em; text-align: left; border: none;">
        volumed
    </td>
</tr>

<tr style="background-color: hsl(120, 100.00%, 96.82%); border: none;">
    <td style="padding: 0 1em 0 0.5em; text-align: right; border: none;">
        0.0059

            &plusmn; 0.0072

    </td>
    <td style="padding: 0 0.5em 0 0.5em; text-align: left; border: none;">
        PMG
    </td>
</tr>

<tr style="background-color: hsl(120, 100.00%, 97.11%); border: none;">
    <td style="padding: 0 1em 0 0.5em; text-align: right; border: none;">
        0.0052

            &plusmn; 0.0078

    </td>
    <td style="padding: 0 0.5em 0 0.5em; text-align: left; border: none;">
        CFP
    </td>
</tr>

<tr style="background-color: hsl(120, 100.00%, 97.19%); border: none;">
    <td style="padding: 0 1em 0 0.5em; text-align: right; border: none;">
        0.0050

            &plusmn; 0.0116

    </td>
    <td style="padding: 0 0.5em 0 0.5em; text-align: left; border: none;">
        std_dvol
    </td>
</tr>

<tr style="background-color: hsl(120, 100.00%, 97.43%); border: none;">
    <td style="padding: 0 1em 0 0.5em; text-align: right; border: none;">
        0.0044

            &plusmn; 0.0219

    </td>
    <td style="padding: 0 0.5em 0 0.5em; text-align: left; border: none;">
        saleinv
    </td>
</tr>

<tr style="background-color: hsl(120, 100.00%, 97.86%); border: none;">
    <td style="padding: 0 1em 0 0.5em; text-align: right; border: none;">
        0.0034

            &plusmn; 0.0119

    </td>
    <td style="padding: 0 0.5em 0 0.5em; text-align: left; border: none;">
        ROA
    </td>
</tr>

<tr style="background-color: hsl(120, 100.00%, 97.99%); border: none;">
    <td style="padding: 0 1em 0 0.5em; text-align: right; border: none;">
        0.0031

            &plusmn; 0.0393

    </td>
    <td style="padding: 0 0.5em 0 0.5em; text-align: left; border: none;">
        age
    </td>
</tr>

<tr style="background-color: hsl(120, 100.00%, 98.06%); border: none;">
    <td style="padding: 0 1em 0 0.5em; text-align: right; border: none;">
        0.0029

            &plusmn; 0.0057

    </td>
    <td style="padding: 0 0.5em 0 0.5em; text-align: left; border: none;">
        lagretn
    </td>
</tr>

<tr style="background-color: hsl(120, 100.00%, 98.42%); border: none;">
    <td style="padding: 0 1em 0 0.5em; text-align: right; border: none;">
        0.0022

            &plusmn; 0.0039

    </td>
    <td style="padding: 0 0.5em 0 0.5em; text-align: left; border: none;">
        ROE
    </td>
</tr>

<tr style="background-color: hsl(120, 100.00%, 98.43%); border: none;">
    <td style="padding: 0 1em 0 0.5em; text-align: right; border: none;">
        0.0022

            &plusmn; 0.0069

    </td>
    <td style="padding: 0 0.5em 0 0.5em; text-align: left; border: none;">
        turn
    </td>
</tr>

<tr style="background-color: hsl(120, 100.00%, 98.47%); border: none;">
    <td style="padding: 0 1em 0 0.5em; text-align: right; border: none;">
        0.0021

            &plusmn; 0.0138

    </td>
    <td style="padding: 0 0.5em 0 0.5em; text-align: left; border: none;">
        momchg
    </td>
</tr>

<tr style="background-color: hsl(120, 100.00%, 98.48%); border: none;">
    <td style="padding: 0 1em 0 0.5em; text-align: right; border: none;">
        0.0021

            &plusmn; 0.0031

    </td>
    <td style="padding: 0 0.5em 0 0.5em; text-align: left; border: none;">
        ACC
    </td>
</tr>

<tr style="background-color: hsl(120, 100.00%, 98.65%); border: none;">
    <td style="padding: 0 1em 0 0.5em; text-align: right; border: none;">
        0.0017

            &plusmn; 0.0086

    </td>
    <td style="padding: 0 0.5em 0 0.5em; text-align: left; border: none;">
        EP
    </td>
</tr>

<tr style="background-color: hsl(120, 100.00%, 98.72%); border: none;">
    <td style="padding: 0 1em 0 0.5em; text-align: right; border: none;">
        0.0016

            &plusmn; 0.0096

    </td>
    <td style="padding: 0 0.5em 0 0.5em; text-align: left; border: none;">
        QR
    </td>
</tr>

<tr style="background-color: hsl(120, 100.00%, 98.78%); border: none;">
    <td style="padding: 0 1em 0 0.5em; text-align: right; border: none;">
        0.0015

            &plusmn; 0.0049

    </td>
    <td style="padding: 0 0.5em 0 0.5em; text-align: left; border: none;">
        CRG
    </td>
</tr>

<tr style="background-color: hsl(120, 100.00%, 99.39%); border: none;">
    <td style="padding: 0 1em 0 0.5em; text-align: right; border: none;">
        0.0006

            &plusmn; 0.0021

    </td>
    <td style="padding: 0 0.5em 0 0.5em; text-align: left; border: none;">
        PA
    </td>
</tr>

<tr style="background-color: hsl(0, 100.00%, 99.81%); border: none;">
    <td style="padding: 0 1em 0 0.5em; text-align: right; border: none;">
        -0.0001

            &plusmn; 0.0055

    </td>
    <td style="padding: 0 0.5em 0 0.5em; text-align: left; border: none;">
        illq
    </td>
</tr>

<tr style="background-color: hsl(0, 100.00%, 99.63%); border: none;">
    <td style="padding: 0 1em 0 0.5em; text-align: right; border: none;">
        -0.0003

            &plusmn; 0.0050

    </td>
    <td style="padding: 0 0.5em 0 0.5em; text-align: left; border: none;">
        sharechg
    </td>
</tr>

<tr style="background-color: hsl(0, 100.00%, 99.17%); border: none;">
    <td style="padding: 0 1em 0 0.5em; text-align: right; border: none;">
        -0.0009

            &plusmn; 0.0073

    </td>
    <td style="padding: 0 0.5em 0 0.5em; text-align: left; border: none;">
        RD
    </td>
</tr>

<tr style="background-color: hsl(0, 100.00%, 99.05%); border: none;">
    <td style="padding: 0 1em 0 0.5em; text-align: right; border: none;">
        -0.0011

            &plusmn; 0.0059

    </td>
    <td style="padding: 0 0.5em 0 0.5em; text-align: left; border: none;">
        std_turn
    </td>
</tr>

<tr style="background-color: hsl(0, 100.00%, 98.79%); border: none;">
    <td style="padding: 0 1em 0 0.5em; text-align: right; border: none;">
        -0.0015

            &plusmn; 0.0145

    </td>
    <td style="padding: 0 0.5em 0 0.5em; text-align: left; border: none;">
        INVG
    </td>
</tr>

<tr style="background-color: hsl(0, 100.00%, 98.77%); border: none;">
    <td style="padding: 0 1em 0 0.5em; text-align: right; border: none;">
        -0.0015

            &plusmn; 0.0056

    </td>
    <td style="padding: 0 0.5em 0 0.5em; text-align: left; border: none;">
        CFdebt
    </td>
</tr>

<tr style="background-color: hsl(0, 100.00%, 98.54%); border: none;">
    <td style="padding: 0 1em 0 0.5em; text-align: right; border: none;">
        -0.0020

            &plusmn; 0.0042

    </td>
    <td style="padding: 0 0.5em 0 0.5em; text-align: left; border: none;">
        imom
    </td>
</tr>

<tr style="background-color: hsl(0, 100.00%, 98.32%); border: none;">
    <td style="padding: 0 1em 0 0.5em; text-align: right; border: none;">
        -0.0024

            &plusmn; 0.0068

    </td>
    <td style="padding: 0 0.5em 0 0.5em; text-align: left; border: none;">
        TAXchg
    </td>
</tr>

<tr style="background-color: hsl(0, 100.00%, 98.30%); border: none;">
    <td style="padding: 0 1em 0 0.5em; text-align: right; border: none;">
        -0.0024

            &plusmn; 0.0087

    </td>
    <td style="padding: 0 0.5em 0 0.5em; text-align: left; border: none;">
        idskew
    </td>
</tr>

<tr style="background-color: hsl(0, 100.00%, 98.30%); border: none;">
    <td style="padding: 0 1em 0 0.5em; text-align: right; border: none;">
        -0.0024

            &plusmn; 0.0078

    </td>
    <td style="padding: 0 0.5em 0 0.5em; text-align: left; border: none;">
        OCFP
    </td>
</tr>

<tr style="background-color: hsl(0, 100.00%, 98.25%); border: none;">
    <td style="padding: 0 1em 0 0.5em; text-align: right; border: none;">
        -0.0025

            &plusmn; 0.0062

    </td>
    <td style="padding: 0 0.5em 0 0.5em; text-align: left; border: none;">
        idvol
    </td>
</tr>

<tr style="background-color: hsl(0, 100.00%, 97.50%); border: none;">
    <td style="padding: 0 1em 0 0.5em; text-align: right; border: none;">
        -0.0042

            &plusmn; 0.0045

    </td>
    <td style="padding: 0 0.5em 0 0.5em; text-align: left; border: none;">
        SgINVg
    </td>
</tr>

<tr style="background-color: hsl(0, 100.00%, 97.41%); border: none;">
    <td style="padding: 0 1em 0 0.5em; text-align: right; border: none;">
        -0.0044

            &plusmn; 0.0092

    </td>
    <td style="padding: 0 0.5em 0 0.5em; text-align: left; border: none;">
        LM
    </td>
</tr>

<tr style="background-color: hsl(0, 100.00%, 97.40%); border: none;">
    <td style="padding: 0 1em 0 0.5em; text-align: right; border: none;">
        -0.0044

            &plusmn; 0.0052

    </td>
    <td style="padding: 0 0.5em 0 0.5em; text-align: left; border: none;">
        CT
    </td>
</tr>

<tr style="background-color: hsl(0, 100.00%, 97.22%); border: none;">
    <td style="padding: 0 1em 0 0.5em; text-align: right; border: none;">
        -0.0049

            &plusmn; 0.0133

    </td>
    <td style="padding: 0 0.5em 0 0.5em; text-align: left; border: none;">
        INVchg
    </td>
</tr>

<tr style="background-color: hsl(0, 100.00%, 97.22%); border: none;">
    <td style="padding: 0 1em 0 0.5em; text-align: right; border: none;">
        -0.0049

            &plusmn; 0.0089

    </td>
    <td style="padding: 0 0.5em 0 0.5em; text-align: left; border: none;">
        skew
    </td>
</tr>

<tr style="background-color: hsl(0, 100.00%, 97.15%); border: none;">
    <td style="padding: 0 1em 0 0.5em; text-align: right; border: none;">
        -0.0051

            &plusmn; 0.0042

    </td>
    <td style="padding: 0 0.5em 0 0.5em; text-align: left; border: none;">
        cash
    </td>
</tr>

<tr style="background-color: hsl(0, 100.00%, 96.82%); border: none;">
    <td style="padding: 0 1em 0 0.5em; text-align: right; border: none;">
        -0.0059

            &plusmn; 0.0093

    </td>
    <td style="padding: 0 0.5em 0 0.5em; text-align: left; border: none;">
        total_vol
    </td>
</tr>

<tr style="background-color: hsl(0, 100.00%, 96.70%); border: none;">
    <td style="padding: 0 1em 0 0.5em; text-align: right; border: none;">
        -0.0063

            &plusmn; 0.0059

    </td>
    <td style="padding: 0 0.5em 0 0.5em; text-align: left; border: none;">
        SG
    </td>
</tr>

<tr style="background-color: hsl(0, 100.00%, 96.69%); border: none;">
    <td style="padding: 0 1em 0 0.5em; text-align: right; border: none;">
        -0.0063

            &plusmn; 0.0138

    </td>
    <td style="padding: 0 0.5em 0 0.5em; text-align: left; border: none;">
        betad
    </td>
</tr>

<tr style="background-color: hsl(0, 100.00%, 96.50%); border: none;">
    <td style="padding: 0 1em 0 0.5em; text-align: right; border: none;">
        -0.0068

            &plusmn; 0.0060

    </td>
    <td style="padding: 0 0.5em 0 0.5em; text-align: left; border: none;">
        coskew
    </td>
</tr>

<tr style="background-color: hsl(0, 100.00%, 96.40%); border: none;">
    <td style="padding: 0 1em 0 0.5em; text-align: right; border: none;">
        -0.0071

            &plusmn; 0.0134

    </td>
    <td style="padding: 0 0.5em 0 0.5em; text-align: left; border: none;">
        ACCP
    </td>
</tr>

<tr style="background-color: hsl(0, 100.00%, 95.04%); border: none;">
    <td style="padding: 0 1em 0 0.5em; text-align: right; border: none;">
        -0.0112

            &plusmn; 0.0162

    </td>
    <td style="padding: 0 0.5em 0 0.5em; text-align: left; border: none;">
        mom6
    </td>
</tr>

<tr style="background-color: hsl(0, 100.00%, 94.63%); border: none;">
    <td style="padding: 0 1em 0 0.5em; text-align: right; border: none;">
        -0.0125

            &plusmn; 0.0154

    </td>
    <td style="padding: 0 0.5em 0 0.5em; text-align: left; border: none;">
        BVEG
    </td>
</tr>

<tr style="background-color: hsl(0, 100.00%, 93.31%); border: none;">
    <td style="padding: 0 1em 0 0.5em; text-align: right; border: none;">
        -0.0172

            &plusmn; 0.0183

    </td>
    <td style="padding: 0 0.5em 0 0.5em; text-align: left; border: none;">
        salecash
    </td>
</tr>

<tr style="background-color: hsl(0, 100.00%, 90.58%); border: none;">
    <td style="padding: 0 1em 0 0.5em; text-align: right; border: none;">
        -0.0280

            &plusmn; 0.0044

    </td>
    <td style="padding: 0 0.5em 0 0.5em; text-align: left; border: none;">
        RDsales
    </td>
</tr>
Weight Feature
# Permutation importance of the fitted model on x1_test / y1_test: each factor
# column is shuffled in turn and the change in the model score is recorded.
perm = PermutationImportance(my_model, random_state=7).fit(x1_test, y1_test)
# Show one row per factor. Using len(factors) instead of a hard-coded 56 keeps
# the table complete if the factor list ever changes.
eli5.show_weights(perm, feature_names=factors, top=len(factors))
<tr style="background-color: hsl(120, 100.00%, 80.00%); border: none;">
    <td style="padding: 0 1em 0 0.5em; text-align: right; border: none;">
        0.0733

            &plusmn; 0.0779

    </td>
    <td style="padding: 0 0.5em 0 0.5em; text-align: left; border: none;">
        retnmax
    </td>
</tr>

<tr style="background-color: hsl(120, 100.00%, 88.19%); border: none;">
    <td style="padding: 0 1em 0 0.5em; text-align: right; border: none;">
        0.0345

            &plusmn; 0.0189

    </td>
    <td style="padding: 0 0.5em 0 0.5em; text-align: left; border: none;">
        beta
    </td>
</tr>

<tr style="background-color: hsl(120, 100.00%, 89.80%); border: none;">
    <td style="padding: 0 1em 0 0.5em; text-align: right; border: none;">
        0.0280

            &plusmn; 0.0409

    </td>
    <td style="padding: 0 0.5em 0 0.5em; text-align: left; border: none;">
        age
    </td>
</tr>

<tr style="background-color: hsl(120, 100.00%, 92.72%); border: none;">
    <td style="padding: 0 1em 0 0.5em; text-align: right; border: none;">
        0.0173

            &plusmn; 0.0097

    </td>
    <td style="padding: 0 0.5em 0 0.5em; text-align: left; border: none;">
        size
    </td>
</tr>

<tr style="background-color: hsl(120, 100.00%, 93.88%); border: none;">
    <td style="padding: 0 1em 0 0.5em; text-align: right; border: none;">
        0.0135

            &plusmn; 0.0175

    </td>
    <td style="padding: 0 0.5em 0 0.5em; text-align: left; border: none;">
        DP
    </td>
</tr>

<tr style="background-color: hsl(120, 100.00%, 94.68%); border: none;">
    <td style="padding: 0 1em 0 0.5em; text-align: right; border: none;">
        0.0111

            &plusmn; 0.0066

    </td>
    <td style="padding: 0 0.5em 0 0.5em; text-align: left; border: none;">
        LG
    </td>
</tr>

<tr style="background-color: hsl(120, 100.00%, 95.29%); border: none;">
    <td style="padding: 0 1em 0 0.5em; text-align: right; border: none;">
        0.0093

            &plusmn; 0.0077

    </td>
    <td style="padding: 0 0.5em 0 0.5em; text-align: left; border: none;">
        CR
    </td>
</tr>

<tr style="background-color: hsl(120, 100.00%, 95.56%); border: none;">
    <td style="padding: 0 1em 0 0.5em; text-align: right; border: none;">
        0.0085

            &plusmn; 0.0214

    </td>
    <td style="padding: 0 0.5em 0 0.5em; text-align: left; border: none;">
        mom12
    </td>
</tr>

<tr style="background-color: hsl(120, 100.00%, 95.64%); border: none;">
    <td style="padding: 0 1em 0 0.5em; text-align: right; border: none;">
        0.0083

            &plusmn; 0.0167

    </td>
    <td style="padding: 0 0.5em 0 0.5em; text-align: left; border: none;">
        EP
    </td>
</tr>

<tr style="background-color: hsl(120, 100.00%, 95.65%); border: none;">
    <td style="padding: 0 1em 0 0.5em; text-align: right; border: none;">
        0.0083

            &plusmn; 0.0120

    </td>
    <td style="padding: 0 0.5em 0 0.5em; text-align: left; border: none;">
        QR
    </td>
</tr>

<tr style="background-color: hsl(120, 100.00%, 96.02%); border: none;">
    <td style="padding: 0 1em 0 0.5em; text-align: right; border: none;">
        0.0073

            &plusmn; 0.0057

    </td>
    <td style="padding: 0 0.5em 0 0.5em; text-align: left; border: none;">
        SP
    </td>
</tr>

<tr style="background-color: hsl(120, 100.00%, 96.11%); border: none;">
    <td style="padding: 0 1em 0 0.5em; text-align: right; border: none;">
        0.0071

            &plusmn; 0.0105

    </td>
    <td style="padding: 0 0.5em 0 0.5em; text-align: left; border: none;">
        std_dvol
    </td>
</tr>

<tr style="background-color: hsl(120, 100.00%, 96.11%); border: none;">
    <td style="padding: 0 1em 0 0.5em; text-align: right; border: none;">
        0.0071

            &plusmn; 0.0064

    </td>
    <td style="padding: 0 0.5em 0 0.5em; text-align: left; border: none;">
        AG
    </td>
</tr>

<tr style="background-color: hsl(120, 100.00%, 96.11%); border: none;">
    <td style="padding: 0 1em 0 0.5em; text-align: right; border: none;">
        0.0071

            &plusmn; 0.0097

    </td>
    <td style="padding: 0 0.5em 0 0.5em; text-align: left; border: none;">
        cashpr
    </td>
</tr>

<tr style="background-color: hsl(120, 100.00%, 96.34%); border: none;">
    <td style="padding: 0 1em 0 0.5em; text-align: right; border: none;">
        0.0065

            &plusmn; 0.0094

    </td>
    <td style="padding: 0 0.5em 0 0.5em; text-align: left; border: none;">
        LEV
    </td>
</tr>

<tr style="background-color: hsl(120, 100.00%, 96.35%); border: none;">
    <td style="padding: 0 1em 0 0.5em; text-align: right; border: none;">
        0.0065

            &plusmn; 0.0151

    </td>
    <td style="padding: 0 0.5em 0 0.5em; text-align: left; border: none;">
        BM
    </td>
</tr>

<tr style="background-color: hsl(120, 100.00%, 96.67%); border: none;">
    <td style="padding: 0 1em 0 0.5em; text-align: right; border: none;">
        0.0057

            &plusmn; 0.0091

    </td>
    <td style="padding: 0 0.5em 0 0.5em; text-align: left; border: none;">
        momchg
    </td>
</tr>

<tr style="background-color: hsl(120, 100.00%, 97.01%); border: none;">
    <td style="padding: 0 1em 0 0.5em; text-align: right; border: none;">
        0.0048

            &plusmn; 0.0058

    </td>
    <td style="padding: 0 0.5em 0 0.5em; text-align: left; border: none;">
        volumed
    </td>
</tr>

<tr style="background-color: hsl(120, 100.00%, 97.03%); border: none;">
    <td style="padding: 0 1em 0 0.5em; text-align: right; border: none;">
        0.0048

            &plusmn; 0.0111

    </td>
    <td style="padding: 0 0.5em 0 0.5em; text-align: left; border: none;">
        QRG
    </td>
</tr>

<tr style="background-color: hsl(120, 100.00%, 97.03%); border: none;">
    <td style="padding: 0 1em 0 0.5em; text-align: right; border: none;">
        0.0048

            &plusmn; 0.0043

    </td>
    <td style="padding: 0 0.5em 0 0.5em; text-align: left; border: none;">
        CFP
    </td>
</tr>

<tr style="background-color: hsl(120, 100.00%, 97.45%); border: none;">
    <td style="padding: 0 1em 0 0.5em; text-align: right; border: none;">
        0.0039

            &plusmn; 0.0141

    </td>
    <td style="padding: 0 0.5em 0 0.5em; text-align: left; border: none;">
        saleinv
    </td>
</tr>

<tr style="background-color: hsl(120, 100.00%, 97.62%); border: none;">
    <td style="padding: 0 1em 0 0.5em; text-align: right; border: none;">
        0.0035

            &plusmn; 0.0059

    </td>
    <td style="padding: 0 0.5em 0 0.5em; text-align: left; border: none;">
        PMG
    </td>
</tr>

<tr style="background-color: hsl(120, 100.00%, 97.70%); border: none;">
    <td style="padding: 0 1em 0 0.5em; text-align: right; border: none;">
        0.0033

            &plusmn; 0.0123

    </td>
    <td style="padding: 0 0.5em 0 0.5em; text-align: left; border: none;">
        ROA
    </td>
</tr>

<tr style="background-color: hsl(120, 100.00%, 97.94%); border: none;">
    <td style="padding: 0 1em 0 0.5em; text-align: right; border: none;">
        0.0029

            &plusmn; 0.0096

    </td>
    <td style="padding: 0 0.5em 0 0.5em; text-align: left; border: none;">
        INVG
    </td>
</tr>

<tr style="background-color: hsl(120, 100.00%, 98.26%); border: none;">
    <td style="padding: 0 1em 0 0.5em; text-align: right; border: none;">
        0.0022

            &plusmn; 0.0081

    </td>
    <td style="padding: 0 0.5em 0 0.5em; text-align: left; border: none;">
        CRG
    </td>
</tr>

<tr style="background-color: hsl(120, 100.00%, 98.29%); border: none;">
    <td style="padding: 0 1em 0 0.5em; text-align: right; border: none;">
        0.0022

            &plusmn; 0.0054

    </td>
    <td style="padding: 0 0.5em 0 0.5em; text-align: left; border: none;">
        ACC
    </td>
</tr>

<tr style="background-color: hsl(120, 100.00%, 98.68%); border: none;">
    <td style="padding: 0 1em 0 0.5em; text-align: right; border: none;">
        0.0015

            &plusmn; 0.0058

    </td>
    <td style="padding: 0 0.5em 0 0.5em; text-align: left; border: none;">
        turn
    </td>
</tr>

<tr style="background-color: hsl(120, 100.00%, 98.82%); border: none;">
    <td style="padding: 0 1em 0 0.5em; text-align: right; border: none;">
        0.0013

            &plusmn; 0.0049

    </td>
    <td style="padding: 0 0.5em 0 0.5em; text-align: left; border: none;">
        lagretn
    </td>
</tr>

<tr style="background-color: hsl(120, 100.00%, 98.85%); border: none;">
    <td style="padding: 0 1em 0 0.5em; text-align: right; border: none;">
        0.0012

            &plusmn; 0.0044

    </td>
    <td style="padding: 0 0.5em 0 0.5em; text-align: left; border: none;">
        ROE
    </td>
</tr>

<tr style="background-color: hsl(120, 100.00%, 99.18%); border: none;">
    <td style="padding: 0 1em 0 0.5em; text-align: right; border: none;">
        0.0008

            &plusmn; 0.0153

    </td>
    <td style="padding: 0 0.5em 0 0.5em; text-align: left; border: none;">
        AM
    </td>
</tr>

<tr style="background-color: hsl(120, 100.00%, 99.41%); border: none;">
    <td style="padding: 0 1em 0 0.5em; text-align: right; border: none;">
        0.0005

            &plusmn; 0.0073

    </td>
    <td style="padding: 0 0.5em 0 0.5em; text-align: left; border: none;">
        std_turn
    </td>
</tr>

<tr style="background-color: hsl(0, 100.00%, 99.54%); border: none;">
    <td style="padding: 0 1em 0 0.5em; text-align: right; border: none;">
        -0.0003

            &plusmn; 0.0048

    </td>
    <td style="padding: 0 0.5em 0 0.5em; text-align: left; border: none;">
        imom
    </td>
</tr>

<tr style="background-color: hsl(0, 100.00%, 99.43%); border: none;">
    <td style="padding: 0 1em 0 0.5em; text-align: right; border: none;">
        -0.0005

            &plusmn; 0.0019

    </td>
    <td style="padding: 0 0.5em 0 0.5em; text-align: left; border: none;">
        sharechg
    </td>
</tr>

<tr style="background-color: hsl(0, 100.00%, 98.95%); border: none;">
    <td style="padding: 0 1em 0 0.5em; text-align: right; border: none;">
        -0.0011

            &plusmn; 0.0091

    </td>
    <td style="padding: 0 0.5em 0 0.5em; text-align: left; border: none;">
        TAXchg
    </td>
</tr>

<tr style="background-color: hsl(0, 100.00%, 98.62%); border: none;">
    <td style="padding: 0 1em 0 0.5em; text-align: right; border: none;">
        -0.0016

            &plusmn; 0.0113

    </td>
    <td style="padding: 0 0.5em 0 0.5em; text-align: left; border: none;">
        betad
    </td>
</tr>

<tr style="background-color: hsl(0, 100.00%, 98.39%); border: none;">
    <td style="padding: 0 1em 0 0.5em; text-align: right; border: none;">
        -0.0020

            &plusmn; 0.0067

    </td>
    <td style="padding: 0 0.5em 0 0.5em; text-align: left; border: none;">
        CFdebt
    </td>
</tr>

<tr style="background-color: hsl(0, 100.00%, 98.34%); border: none;">
    <td style="padding: 0 1em 0 0.5em; text-align: right; border: none;">
        -0.0021

            &plusmn; 0.0057

    </td>
    <td style="padding: 0 0.5em 0 0.5em; text-align: left; border: none;">
        skew
    </td>
</tr>

<tr style="background-color: hsl(0, 100.00%, 98.20%); border: none;">
    <td style="padding: 0 1em 0 0.5em; text-align: right; border: none;">
        -0.0024

            &plusmn; 0.0086

    </td>
    <td style="padding: 0 0.5em 0 0.5em; text-align: left; border: none;">
        RD
    </td>
</tr>

<tr style="background-color: hsl(0, 100.00%, 98.06%); border: none;">
    <td style="padding: 0 1em 0 0.5em; text-align: right; border: none;">
        -0.0026

            &plusmn; 0.0079

    </td>
    <td style="padding: 0 0.5em 0 0.5em; text-align: left; border: none;">
        idvol
    </td>
</tr>

<tr style="background-color: hsl(0, 100.00%, 97.86%); border: none;">
    <td style="padding: 0 1em 0 0.5em; text-align: right; border: none;">
        -0.0030

            &plusmn; 0.0073

    </td>
    <td style="padding: 0 0.5em 0 0.5em; text-align: left; border: none;">
        PA
    </td>
</tr>

<tr style="background-color: hsl(0, 100.00%, 97.27%); border: none;">
    <td style="padding: 0 1em 0 0.5em; text-align: right; border: none;">
        -0.0043

            &plusmn; 0.0044

    </td>
    <td style="padding: 0 0.5em 0 0.5em; text-align: left; border: none;">
        SG
    </td>
</tr>

<tr style="background-color: hsl(0, 100.00%, 97.27%); border: none;">
    <td style="padding: 0 1em 0 0.5em; text-align: right; border: none;">
        -0.0043

            &plusmn; 0.0021

    </td>
    <td style="padding: 0 0.5em 0 0.5em; text-align: left; border: none;">
        SgINVg
    </td>
</tr>

<tr style="background-color: hsl(0, 100.00%, 97.23%); border: none;">
    <td style="padding: 0 1em 0 0.5em; text-align: right; border: none;">
        -0.0044

            &plusmn; 0.0033

    </td>
    <td style="padding: 0 0.5em 0 0.5em; text-align: left; border: none;">
        idskew
    </td>
</tr>

<tr style="background-color: hsl(0, 100.00%, 97.18%); border: none;">
    <td style="padding: 0 1em 0 0.5em; text-align: right; border: none;">
        -0.0045

            &plusmn; 0.0092

    </td>
    <td style="padding: 0 0.5em 0 0.5em; text-align: left; border: none;">
        coskew
    </td>
</tr>

<tr style="background-color: hsl(0, 100.00%, 97.09%); border: none;">
    <td style="padding: 0 1em 0 0.5em; text-align: right; border: none;">
        -0.0047

            &plusmn; 0.0062

    </td>
    <td style="padding: 0 0.5em 0 0.5em; text-align: left; border: none;">
        CT
    </td>
</tr>

<tr style="background-color: hsl(0, 100.00%, 96.93%); border: none;">
    <td style="padding: 0 1em 0 0.5em; text-align: right; border: none;">
        -0.0050

            &plusmn; 0.0073

    </td>
    <td style="padding: 0 0.5em 0 0.5em; text-align: left; border: none;">
        total_vol
    </td>
</tr>

<tr style="background-color: hsl(0, 100.00%, 96.61%); border: none;">
    <td style="padding: 0 1em 0 0.5em; text-align: right; border: none;">
        -0.0058

            &plusmn; 0.0054

    </td>
    <td style="padding: 0 0.5em 0 0.5em; text-align: left; border: none;">
        cash
    </td>
</tr>

<tr style="background-color: hsl(0, 100.00%, 96.45%); border: none;">
    <td style="padding: 0 1em 0 0.5em; text-align: right; border: none;">
        -0.0062

            &plusmn; 0.0043

    </td>
    <td style="padding: 0 0.5em 0 0.5em; text-align: left; border: none;">
        INVchg
    </td>
</tr>

<tr style="background-color: hsl(0, 100.00%, 96.41%); border: none;">
    <td style="padding: 0 1em 0 0.5em; text-align: right; border: none;">
        -0.0063

            &plusmn; 0.0075

    </td>
    <td style="padding: 0 0.5em 0 0.5em; text-align: left; border: none;">
        LM
    </td>
</tr>

<tr style="background-color: hsl(0, 100.00%, 96.20%); border: none;">
    <td style="padding: 0 1em 0 0.5em; text-align: right; border: none;">
        -0.0068

            &plusmn; 0.0123

    </td>
    <td style="padding: 0 0.5em 0 0.5em; text-align: left; border: none;">
        OCFP
    </td>
</tr>

<tr style="background-color: hsl(0, 100.00%, 96.06%); border: none;">
    <td style="padding: 0 1em 0 0.5em; text-align: right; border: none;">
        -0.0072

            &plusmn; 0.0068

    </td>
    <td style="padding: 0 0.5em 0 0.5em; text-align: left; border: none;">
        illq
    </td>
</tr>

<tr style="background-color: hsl(0, 100.00%, 95.42%); border: none;">
    <td style="padding: 0 1em 0 0.5em; text-align: right; border: none;">
        -0.0089

            &plusmn; 0.0065

    </td>
    <td style="padding: 0 0.5em 0 0.5em; text-align: left; border: none;">
        mom6
    </td>
</tr>

<tr style="background-color: hsl(0, 100.00%, 95.03%); border: none;">
    <td style="padding: 0 1em 0 0.5em; text-align: right; border: none;">
        -0.0100

            &plusmn; 0.0061

    </td>
    <td style="padding: 0 0.5em 0 0.5em; text-align: left; border: none;">
        ACCP
    </td>
</tr>

<tr style="background-color: hsl(0, 100.00%, 94.98%); border: none;">
    <td style="padding: 0 1em 0 0.5em; text-align: right; border: none;">
        -0.0102

            &plusmn; 0.0131

    </td>
    <td style="padding: 0 0.5em 0 0.5em; text-align: left; border: none;">
        salecash
    </td>
</tr>

<tr style="background-color: hsl(0, 100.00%, 93.42%); border: none;">
    <td style="padding: 0 1em 0 0.5em; text-align: right; border: none;">
        -0.0150

            &plusmn; 0.0146

    </td>
    <td style="padding: 0 0.5em 0 0.5em; text-align: left; border: none;">
        BVEG
    </td>
</tr>

<tr style="background-color: hsl(0, 100.00%, 90.11%); border: none;">
    <td style="padding: 0 1em 0 0.5em; text-align: right; border: none;">
        -0.0268

            &plusmn; 0.0151

    </td>
    <td style="padding: 0 0.5em 0 0.5em; text-align: left; border: none;">
        RDsales
    </td>
</tr>
Weight Feature

最靠上的是最重要的特征,而底部的特征最不重要:

  1. retnmax是最重要的特征,打乱顺序对于结果的影响最大,回顾一下含义:最大日收益率 (maximum daily return, retnmax):t 月份的最大日收益率等于在 t 月份整个月中的日收益率中最大的日收益率
  2. RDsales重要性为负值,打乱后预测值反而变得更加准确。这说明这种特征完全不重要,随机的排序提供了luck/chance的空间

Partial Plots(部分依赖图)

PDPbox API:https://pdpbox.readthedocs.io/en/latest/

也是模型训练完成后进行的:

  1. 在预测时对于一个样例:选中某一变量,不停地修改其值进行一系列预测,trace所有的结果能得到一条线(特征-预测值)
  2. 重复以上步骤若干次,形成一个区域,最后做出区域的均值
# Partial dependence plots for the five factors that rank highest in the
# permutation-importance table.
from matplotlib import pyplot as plt
from pdpbox import pdp, get_dataset, info_plots

list_factor = ['retnmax', 'beta', 'age','size','DP']

for factor in list_factor:
    # Compute the partial dependence of the model output on this single factor,
    # sampled at 50 grid points.
    pdp_goals = pdp.pdp_isolate(model=my_model, dataset=x1_test, model_features=factors, 
                                feature=factor, num_grid_points=50)  # optional alternative: grid_type='equal'
    # Draw the curve, then flush the figure so every factor gets its own plot.
    pdp.pdp_plot(pdp_goals, factor)
    plt.show()

png

png

png

png

png

在看上面的部分依赖图的时候,有两点值得注意的地方:

  • y轴表示的是模型预测相较于基线值或最左边的值的变化
  • 蓝色阴影部分表示置信区间。

弱相关属性:RDsales排序重要性为负,对预测值的影响最小

# Same partial-dependence computation for 'RDsales', the factor with the most
# negative permutation-importance weight, again sampled at 50 grid points.
pdp_goals = pdp.pdp_isolate(model=my_model, dataset=x1_test, model_features=factors, feature='RDsales', num_grid_points=50)
# Render the curve for this factor.
pdp.pdp_plot(pdp_goals, 'RDsales')
plt.show()

png

# Mean yield (scaled by 100) for each distinct RDsales value, drawn as a scatter.
# The 'yield' column is selected *before* aggregating: z, as built by the merge
# on 'ts_code', still carries that string column, and averaging every column
# raises TypeError on non-numeric data in pandas >= 2.0 (and needlessly averages
# all the other factor columns on older versions).
# NOTE(review): the name ROA_yield is kept so any cell that reads it keeps
# working, but the grouping key here is RDsales, not ROA.
ROA_yield = z.groupby('RDsales')['yield'].mean() * 100
plt.scatter(ROA_yield.index, ROA_yield)
plt.show()

png

2D Partial Plots

对特征之间的相互作用进行描述

# 2D partial dependence: joint effect of 'size' and 'retnmax' on the model
# output, evaluated on a 20 x 20 grid and drawn as a contour plot.
features_to_plot = ['size', 'retnmax']
inter1  =  pdp.pdp_interact(model=my_model, dataset=x1_test, model_features=factors, features=features_to_plot, num_grid_points=[20,20])
pdp.pdp_interact_plot(pdp_interact_out=inter1, feature_names=features_to_plot, plot_type='contour', plot_pdp=True)
plt.show()

png

SHAP

SHAP官方文档:https://shap.readthedocs.io/en/latest/#

github链接:https://github.com/slundberg/shap#citations

SHAP(SHapley Additive exPlanations),是合作博弈论中的一个解决方案。每个因子是具体怎样影响预测的,影响了多少

有以下等式:

$$prediction^{(i)} = baseline + \sum_{j=1}^{|features|} SHAP^{(i)}_j$$

样例i的预测值等于一个baseline,加上每个特征的SHAP值,而这个baseline对于所有样例是一致的。SHAP技术比较复杂,原理见:

https://towardsdatascience.com/one-feature-attribution-method-to-supposedly-rule-them-all-shapley-values-f3e04534983d

import shap  # package used to calculate Shap values

# Positional index of the x1_test row that will be explained.
# NOTE(review): despite its name, feature_index selects a sample (row), not a feature.
feature_index=195
data_for_prediction = x1_test.iloc[feature_index,:]
# reshape(1, -1) turns the single row into a one-sample 2D array for predict().
my_model.predict(data_for_prediction.to_numpy().reshape(1, -1))
array([4.84538454])
# Observed target value for the same row, for comparison with the model's prediction.
y1_test.iloc[feature_index]
3.93613070924618
# Create the object that computes SHAP values for the fitted tree model.
explainer = shap.TreeExplainer(my_model)
# Load the JavaScript that the interactive force plots need inside the notebook.
shap.initjs()

注意,shap.waterfall_plot 和 shap.plots.bar 默认保留2位小数显示,但收益率是一个比较小的数值,这里调整到5位

包括 shap\plots\_waterfall.py 和 shap\plots\_bar.py,将所有0.02f替换成0.05f,linux下在/anaconda3/envs/jupyter/lib/python3.9/site-packages/shap/plots#文件夹下使用此命令

sed -i "s/0.02f/0.05f/g" `grep 0.02f -rl .`

然而修改之后还是无效,小数最多显示两位,因此最终将收益率统一改为了百分比的形式

# Explain the single selected row.
shap_value=explainer(data_for_prediction)
# Replace the nested base_values with its first scalar element before plotting.
# NOTE(review): presumably waterfall_plot needs a scalar base value here — confirm
# against the installed shap version.
shap_value.base_values = shap_value.base_values[0][0]
shap.waterfall_plot(shap_value)

png

以上展示了特征对于模型输出的影响,使结果增加的特征为红色,减少为蓝色,使用力图也能可视化:

# Force plot of the single-row explanation held in shap_value.
shap.plots.force(shap_value)

image-20210401122423790

将力图旋转90度,并且水平上堆积,可以看到整个数据集上的解释:

# Explain every row of x1_test, then draw the stacked force plot: a 2D matrix of
# SHAP values (samples x features) produces one stacked plot over all samples.
shap_values=explainer(x1_test)
shap.force_plot(explainer.expected_value, shap_values.values, x1_test)

image-20210401122603366

shap.force_plot参数:

base_value : float 对于所有样例都是一致的

This is the reference value that the feature contributions start from. For SHAP values it should be the value of explainer.expected_value.

shap_values : numpy.array

Matrix of SHAP values (# features) or (# samples x # features). If this is a 1D array then a single force plot will be drawn, if it is a 2D array then a stacked force plot will be drawn.

features : numpy.array

Matrix of feature values (# features) or (# samples x # features). This provides the values of all the features, and should be the same shape as the shap_values argument.

Summary Plots

为获取特征对结果影响的总览,可绘制每个样例每个特征的SHAP值。该图对SHAP影响大小进行了排序

# Make plot. For a regression problem there is no need to index the result
# with [1], because the model has only one predicted output.
shap.summary_plot(explainer.shap_values(x1_test), x1_test)


png

通过计算所有SHAP值绝对值的平均值,可以直观得到各种特征对结果的贡献,与排序重要性非常相似。

# Bar chart of per-factor SHAP magnitude over x1_test, limited to 30 bars.
shap.plots.bar(shap_values, max_display=30)


png

SHAP Dependence Contribution Plots

之前使用部分依赖图展示了单一因子对预测的影响,但没有因子的分布,Dependence Contribution Plots加入了更多的细节

正相关因子:

# SHAP dependence plot for 'idvol'; points are colored by 'total_vol'.
shap.dependence_plot('idvol', explainer.shap_values(x1_test), x1_test, interaction_index="total_vol")


png

负相关因子:

# SHAP dependence plot for 'mom12'; points are colored by 'momchg'.
shap.dependence_plot('mom12', explainer.shap_values(x1_test), x1_test, interaction_index="momchg")


png

Shapley自动寻找比较有趣的因子对:

# SHAP dependence plot for 'size'; no interaction_index is given, so shap
# chooses the coloring factor itself.
shap.dependence_plot('size', explainer.shap_values(x1_test), x1_test)


png

新的想法&其他问题(4.1):

  1. 对XGBoost进行调参,提高其准确率
  2. 数据不再采用一天的,全部用来训练
  3. 使用模型进行时间序列上的迁移学习:时间是连续的,因子对于收益率的影响也应该是连续的,前一天对后一天有影响
  4. 尝试其他机器学习(lightGBM),甚至深度学习模型(SHAP支持tensorflow/keras,甚至一些初步的pytorch)