import time
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
%matplotlib inline

def dropun(X):
    """Drop pandas' auto-generated ``Unnamed: N`` index columns from a DataFrame.

    These columns appear when a CSV was written with its index included and
    read back without ``index_col``.

    Parameters
    ----------
    X : pandas.DataFrame

    Returns
    -------
    pandas.DataFrame
        A copy of ``X`` without the ``Unnamed*`` columns.
    """
    # Collect first, drop once: the original dropped inside the loop, copying
    # the whole frame for every matching column (O(n^2) in columns).
    unnamed = [col for col in X.columns if str(col).startswith('Unnamed')]
    return X.drop(columns=unnamed)

def hist(L):
    """Overlay one translucent, normalized histogram per dataset in ``L``.

    Parameters
    ----------
    L : iterable of array-likes
        Each element is plotted as a 40-bin density histogram on the
        current matplotlib axes.
    """
    style = {'histtype': 'stepfilled', 'density': True, 'alpha': 0.3, 'bins': 40}
    for data in L:
        plt.hist(data, **style)
        
def prediction(y_pred, y_test):
    """Report test-set RMSE and plot predictions against ground truth.

    Parameters
    ----------
    y_pred : array-like
        Predicted values.
    y_test : array-like
        True values. May be a pandas Series straight out of
        ``train_test_split``; it is converted to a plain array because
        ``y_test[i]`` on a Series with a shuffled integer index is a
        label-based lookup and can raise ``KeyError``.
    """
    pred = np.asarray(y_pred, dtype=float)
    true = np.asarray(y_test, dtype=float)
    # calculate RMSE (vectorized; replaces the original element-wise loop)
    sum_erro = np.sqrt(np.mean((pred - true) ** 2))
    print("RMSE by hand:", sum_erro)

    # Plot predicted vs. actual values sample-by-sample.
    plt.figure()
    plt.plot(range(len(pred)), pred, 'b', label="predict")
    plt.plot(range(len(pred)), true, 'r', label="test")
    plt.legend(loc="upper right")  # show the curve labels

读入数据

每天都需要构建回归模型,并且计算56个因子对应的收益率,现在以2013-03-01为例构建:

# Load the factor table and the daily-return table, stripping the
# auto-generated index columns as we go.
X = dropun(pd.read_csv('data/factors2013-3-2-1.csv'))
Y = dropun(pd.read_csv('data/daily2011-2017-1.csv'))
# Every column except the two identifiers is a candidate factor
# (column order is preserved).
factors = [col for col in X.columns if col not in ('ts_code', 'trade_date')]
print(factors)
['size', 'beta', 'betad', 'idvol', 'total_vol', 'idskew', 'skew', 'coskew', 'turn', 'std_turn', 'volumed', 'std_dvol', 'retnmax', 'illq', 'LM', 'sharechg', 'age', 'mom12', 'mom6', 'momchg', 'imom', 'lagretn', 'BM', 'AM', 'LEV', 'EP', 'CFP', 'OCFP', 'DP', 'SP', 'AG', 'LG', 'BVEG', 'INVG', 'INVchg', 'SG', 'SgINVg', 'PMG', 'TAXchg', 'ACC', 'ACCP', 'ROE', 'ROA', 'PA', 'CT', 'cash', 'cashpr', 'RD', 'RDsales', 'CR', 'QR', 'CFdebt', 'salecash', 'saleinv', 'CRG', 'QRG']
# Distinct trading dates present in the factor table, in chronological order.
days = sorted(set(X['trade_date']))
days
['2013-01-04',
 '2013-01-07',
 '2013-01-08',
 '2013-01-09',
 '2013-01-10',
...
 '2013-12-10',
 '2013-12-11',
 '2013-12-12',
 '2013-12-13',
 '2013-12-16',
 '2013-12-17',
 '2013-12-18',
 '2013-12-19',
 '2013-12-20',
 '2013-12-23',
 '2013-12-24',
 '2013-12-25',
 '2013-12-26',
 '2013-12-27',
 '2013-12-30',
 '2013-12-31']
# Restrict both tables to a single trading day, then inner-join on the stock
# code; only stocks with both factor data and return data survive the merge.
day = '2013-03-01'
x = X.loc[X['trade_date'] == day].drop(columns=['trade_date'])
y = Y.loc[Y['trade_date'] == day].drop(columns=['trade_date'])
z = pd.merge(x, y, on='ts_code')

这里x有1590支股票,y有2276支股票,但是一经过merge,就只有1498支股票,说明x和y中存在互斥的。有因子数据但没有收益数据,或者有收益数据却没有因子

# Feature matrix: one column per factor; target: the daily return.
x = z.loc[:, factors]
y = z['yield']
# 划分数据集 — hold out 20% of the stocks; fixed seed for reproducibility.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)
print('x_train.shape={}\ny_train.shape ={}\nx_test.shape={}\ny_test.shape={}'.format(
    x_train.shape, y_train.shape, x_test.shape, y_test.shape))
x_train.shape=(1192, 56)
y_train.shape =(1192,)
x_test.shape=(298, 56)
y_test.shape=(298,)

可视化单因子对收益率的影响

import seaborn as sns
# One scatter panel per factor, each with a fitted regression line against
# the daily return ('yield').
# plt.rcParams['font.sans-serif'] = ['SimHei']  # enable Chinese glyphs (otherwise mojibake)
plt.rcParams['axes.unicode_minus'] = False  # render minus signs correctly
# seaborn renamed `size` to `height`; the old name only emitted a
# deprecation warning (see the UserWarning in the original output).
sns.pairplot(z, x_vars=factors, y_vars='yield', kind="reg", height=5, aspect=0.7)
plt.show()
/root/anaconda3/envs/jupyter/lib/python3.9/site-packages/seaborn/axisgrid.py:1969: UserWarning: The `size` parameter has been renamed to `height`; please update your code.
  warnings.warn(msg, UserWarning)

如图所示,很难看出因子对收益率的线性相关关系,可以进一步计算相关系数。

相关系数

在此计算所有因子,包含收益率之间的相关系数并绘制热力图。

样本是由多维特征的构成的,把每个特征维度都看成一个随机变量,为了考查两两特征间的关系,可以借助随机变量的协方差

image-20210401110416448

$$\mathrm{cov}(X_i,X_j)=E\big[(X_i-EX_i)(X_j-EX_j)\big],\qquad DX_i=E(X_i-EX_i)^2,\qquad DX_j=E(X_j-EX_j)^2$$

$$r_{ij}=\frac{\mathrm{cov}(X_i,X_j)}{\sqrt{DX_i\cdot DX_j}}$$

其中 $|r_{ij}|\leq 1$:1 表示随机变量 $X_i$ 与 $X_j$ 完全线性正相关;-1 表示完全线性负相关;0 表示线性无关,但是线性无关并不代表完全无关。
样本(共 $m$ 个)随机变量 $X=[X_1,X_2,\cdots,X_n]^T$,第 $k$ 个样本为 $\{x_{k\cdot}=[x_{k1},x_{k2},\cdots,x_{kn}]^T \mid 1\leq k\leq m\}$。则任意两个维度特征的协方差为

$$\mathrm{cov}(X_i,X_j)=\frac{\sum\limits_{k=1}^m(x_{ki}-\overline{x_{\cdot i}})(x_{kj}-\overline{x_{\cdot j}})}{m-1}$$

分母是 m-1是因为随机变量的数学期望未知,用“样本均值”代替,自由度减1

# Pairwise Pearson correlations over every factor plus the return column.
z1 = z.drop(columns=['ts_code'])
# rowvar=False: rows are observations (stocks), columns are variables.
cor = np.corrcoef(z1, rowvar=False)
print(cor, cor.shape)
[[ 1.          0.02862477 -0.12943443 ... -0.01320729 -0.00883679
  -0.16342838]
 [ 0.02862477  1.          0.63880103 ... -0.01009581 -0.02444542
  -0.17414441]
 [-0.12943443  0.63880103  1.         ...  0.02446409  0.00570809
  -0.11281865]
 ...
 [-0.01320729 -0.01009581  0.02446409 ...  1.          0.87023769
  -0.03211747]
 [-0.00883679 -0.02444542  0.00570809 ...  0.87023769  1.
  -0.04673449]
 [-0.16342838 -0.17414441 -0.11281865 ... -0.03211747 -0.04673449
   1.        ]] (57, 57)
# plt.subplots returns a (figure, axes) pair — the original bound the whole
# tuple to `fig` and relied on seaborn drawing on the implicit current axes.
# Unpack it and pass the axes explicitly. A large canvas keeps the 57x57
# annotated cells legible.
fig, ax = plt.subplots(figsize=(35, 35))
sns.heatmap(cor, cmap='rainbow', center=0, annot=True, fmt='.2f',
            xticklabels=z1.columns, yticklabels=z1.columns, ax=ax)
<AxesSubplot:>

png

如图所示,存在一些相关系数相当高的属性,比如turn和std_turn,系数高达0.99,而turn和LM负相关,系数达到-0.64。

同时,我们发现不是所有因子都与收益率有线性相关关系,有的相关性很低。如果能对因子降维,将会大大减少后续的计算复杂度。

降维方案

  1. 事先剔除相关性高的因子(如turn和std_turn中保留一个)
  2. 事先剔除与收益率线性关系小的因子(如saleinv,与收益率相关系数为0)
  3. 事先将因子聚合,通过两个强相关因子聚合成一个超因子。但是需要考虑计算超因子收益率后如何分配到每个单独因子上。
  4. 使用回归模型自带的因子聚合(如lightGBM本身就能聚合因子)

多维线性回归

利用sklearn的LinearRegression进行训练,训练集:测试集=8:2

from sklearn.linear_model import LinearRegression  # ordinary least squares
# Fit OLS on the training split and inspect the learned intercept and the
# per-factor coefficient weights.
linreg = LinearRegression()
model = linreg.fit(x_train, y_train)
print('模型参数:', model)
print('模型截距:', linreg.intercept_)
print('参数权重:', linreg.coef_)
模型参数: LinearRegression()
模型截距: 0.11192220049016646
参数权重: [ 3.80910899e-04 -1.10826314e-03 -3.62544858e-05  5.27961421e-03
 -6.51975573e-03  1.16303728e-03 -2.83646648e-03 -7.55972332e-04
  1.84340392e-03 -1.41950367e-03 -2.49677416e-03 -3.34463448e-04
  3.80587540e-03  1.20873821e-04 -1.66211798e-03 -3.55848990e-05
 -3.88057504e-03 -3.91722772e-03  6.07612845e-03 -4.10494162e-03
  4.33685598e-04 -5.80198206e-04 -1.08843493e-03  9.52434622e-04
 -2.15499834e-03 -2.53412796e-03 -9.54653610e-04 -1.17394586e-03
 -7.99239104e-04  3.53530120e-04 -1.41222415e-03  1.50707903e-03
 -2.87463811e-05 -8.95818662e-04 -7.41060691e-04  1.05584496e-02
 -1.09850193e-02  1.94008109e-05  3.15920133e-04 -1.41007496e-03
  1.27199199e-04  1.78311779e-03 -3.32877882e-04 -1.14674292e-03
 -4.65312324e-04  3.64669943e-04  4.07370182e-04  5.74168143e-04
 -1.28046310e-03 -4.83573632e-03  3.60953532e-03  1.82333274e-03
 -3.88094608e-04  3.39263999e-04  1.94555721e-03 -1.39805122e-03]
# Score the held-out stocks: report RMSE and plot predicted vs. actual returns.
y_pred = linreg.predict(x_test)
prediction(y_pred, y_test)
RMSE by hand: 0.01824543398776763

png