# Importing libraries
import pandas as pd
from datetime import datetime
import matplotlib.pyplot as plt
from yahoofinancials import YahooFinancials
# Creating the date range and adding it to the values table.
# NOTE: the original used curly quotes and en-dashes inside the date
# strings (copy-paste artifacts), which are syntax/parse errors.
end_date = "2020-03-01"
start_date = "2010-01-01"
# Business-day range only, since markets are closed on weekends.
date_range = pd.bdate_range(start=start_date, end=end_date)
values = pd.DataFrame({'Date': date_range})
values['Date'] = pd.to_datetime(values['Date'])
一旦我们在数据框中有了日期范围,就需要使用股票代码从 API 中拉取数据。yahoofinancials 以 JSON 格式返回输出。以下代码循环遍历股票代码列表,提取所有历史日期的收盘价,并按日期水平合并到数据框中。考虑到这些资产类别可能有不同的区域和交易假期,每次数据拉取的日期范围可能不同。通过合并,我们最终会有一些 NA,我们将在稍后进行前向填充(frontfill)。
# Extracting data from Yahoo Finance and adding it to the values table,
# using the date as the merge key. yahoofinancials returns JSON.
for i in ticker:
    raw_data = YahooFinancials(i)
    raw_data = raw_data.get_historical_price_data(start_date, end_date, "daily")
    df = pd.DataFrame(raw_data[i]['prices'])[['formatted_date', 'adjclose']]
    df.columns = ['Date1', i]
    df['Date1'] = pd.to_datetime(df['Date1'])
    # Left-join on the business-day calendar; instruments with different
    # regional trading holidays leave NaNs that are filled later.
    values = values.merge(df, how='left', left_on='Date', right_on='Date1')
    values = values.drop(labels='Date1', axis=1)
# Renaming columns to instrument names rather than ticker codes for readability
names.insert(0, 'Date')
values.columns = names
print(values.shape)
print(values.isna().sum())
# Forward-fill the NaNs left by differing trading holidays, then back-fill
# any NaNs remaining at the very start of the series.
# (fillna(method=...) is deprecated in modern pandas; use ffill/bfill.)
values = values.ffill(axis=0)
values = values.bfill(axis=0)
values.isna().sum()
# Coercing numeric type onto all columns except Date
cols = values.columns.drop('Date')
values[cols] = values[cols].apply(pd.to_numeric, errors='coerce').round(decimals=1)
values.tail()
# Key instruments that also get long-horizon return features.
imp = ['Gold', 'Silver', 'Crude Oil', 'S&P500', 'MSCI EM ETF']
# Calculating short-term historical returns
change_days = [1, 3, 5, 14, 21]
data = pd.DataFrame(data=values['Date'])
for i in change_days:
    print(data.shape)
    # Percentage change over the trailing i business days, suffixed "-T-i".
    x = values[cols].pct_change(periods=i).add_suffix("-T-" + str(i))
    data = pd.concat(objs=(data, x), axis=1)
    x = []
print(data.shape)
# Calculating long-term historical returns (key instruments only)
change_days = [60, 90, 180, 250]
for i in change_days:
    print(data.shape)
    x = values[imp].pct_change(periods=i).add_suffix("-T-" + str(i))
    data = pd.concat(objs=(data, x), axis=1)
    x = []
print(data.shape)
# Merging moving-average values into the feature space
data['Date'] = pd.to_datetime(data['Date'], format='%Y-%b-%d')
data = pd.merge(left=data, right=moving_avg, how='left', on='Date')
print(data.shape)
data.isna().sum()
# If you are importing the downloaded dataset
data = pd.read_csv("Training Data.csv")
from pycaret.regression import *
# We have two target columns; we remove the T+14 day target.
data_22 = data.drop(['Gold-T+14'], axis=1)
PyCaret 为每个算法都有一个预定义的网格,并且 tune_model() 函数使用随机网格搜索来寻找优化指标选择(此处为 Rsquare)的参数集,并显示优化模型的交叉验证得分。它不接受已训练好的模型,需要传入估计器的缩写字符串。我们将调优 Extra Tree (et)、K Nearest Neighbors(knn) 和 CatBoost (catboost) 回归器。
# tune_model() runs a random grid search over PyCaret's predefined grids,
# optimizing R-squared; it takes the estimator's string abbreviation.
et_tuned = tune_model('et')
catb_tuned = tune_model('catboost')
# More search iterations for knn: extra iterations have shown to help knn more
# than the other models here without significantly increasing training time.
knn_tuned = tune_model('knn', n_iter=150)
从上面我们可以看到,调优后 knn 的 R2 显著增加到 87.86%,远高于 et 和 catboost,它们在调优后没有改善。这可能是由于网格搜索过程中的随机性。在迭代次数非常高的情况下,它们可能会有所改善。
我还会创建一个基础 Extra Tree (et) 模型,因为其原始性能(调优前)与调优后的 knn 非常接近。我们将使用 PyCaret 的 create_model() 函数来创建模型。
# Base Extra Trees model: its untuned performance is close to the tuned knn.
et = create_model('et')
评估模型
对训练好的模型进行一些模型诊断非常重要。我们将使用 PyCaret 中的 evaluate_model() 函数来查看图表集合和其他诊断信息。它接受一个已训练好的模型,并返回一系列模型诊断图表和模型定义。我们将对我们的两个最佳模型 knn_tuned 和 et 进行模型诊断。
从上面我们可以清楚地看到,在前 500 个观察值中,存在许多异常值,这些异常值不仅影响模型性能,还可能影响模型未来的泛化能力。因此,移除这些异常值可能是值得的。但在这样做之前,我们将通过 et 查看特征重要性(knn 不提供特征重要性)。
# Importing libraries
import pandas as pd
from datetime import datetime
import matplotlib.pyplot as plt
from yahoofinancials import YahooFinancials
# Load the ticker list and friendly instrument names from the Excel sheet.
ticker_details = pd.read_excel("Ticker List.xlsx")
ticker = ticker_details['Ticker'].to_list()
names = ticker_details['Description'].to_list()
# Preparing the date range: business days from 2019-01-01 through today.
start_date = "2019-01-01"
end_date = datetime.today().strftime('%Y-%m-%d')
date_range = pd.bdate_range(start=start_date, end=end_date)
values = pd.DataFrame({'Date': date_range})
values['Date'] = pd.to_datetime(values['Date'])
# Extracting data from Yahoo Finance and adding it to the values table,
# using the date as the merge key. (The loop body had lost its indentation
# in the paste — restored here.)
for i in ticker:
    raw_data = YahooFinancials(i)
    raw_data = raw_data.get_historical_price_data(start_date, end_date, "daily")
    df = pd.DataFrame(raw_data[i]['prices'])[['formatted_date', 'adjclose']]
    df.columns = ['Date1', i]
    df['Date1'] = pd.to_datetime(df['Date1'])
    values = values.merge(df, how='left', left_on='Date', right_on='Date1')
    values = values.drop(labels='Date1', axis=1)
# Renaming columns to instrument names rather than ticker codes for readability
names.insert(0, 'Date')
values.columns = names
# Forward-fill, then back-fill, the NaNs left by differing trading holidays.
# (fillna(method=...) is deprecated in modern pandas; use ffill/bfill.)
values = values.ffill(axis=0)
values = values.bfill(axis=0)
# Coercing numeric type onto all columns except Date
cols = values.columns.drop('Date')
values[cols] = values[cols].apply(pd.to_numeric, errors='coerce').round(decimals=1)
# Key instruments that also get long-horizon return features.
imp = ['Gold', 'Silver', 'Crude Oil', 'S&P500', 'MSCI EM ETF']
# Calculating short-term historical returns (loop indentation restored).
change_days = [1, 3, 5, 14, 21]
data = pd.DataFrame(data=values['Date'])
for i in change_days:
    # Percentage change over the trailing i business days, suffixed "-T-i".
    x = values[cols].pct_change(periods=i).add_suffix("-T-" + str(i))
    data = pd.concat(objs=(data, x), axis=1)
    x = []
# Calculating long-term historical returns, key instruments only
# (loop indentation restored).
change_days = [60, 90, 180, 250]
for i in change_days:
    x = values[imp].pct_change(periods=i).add_suffix("-T-" + str(i))
    data = pd.concat(objs=(data, x), axis=1)
    x = []
# Calculating moving-average ratio features for Gold: price relative to its
# simple and exponential moving averages, expressed as (price / MA) - 1.
moving_avg = pd.DataFrame(values['Date'], columns=['Date'])
moving_avg['Date'] = pd.to_datetime(moving_avg['Date'], format='%Y-%b-%d')
gold = values['Gold']
for window in (15, 30, 60, 90, 180):
    moving_avg['Gold/%dSMA' % window] = gold.div(gold.rolling(window=window).mean()) - 1
for span in (90, 180):
    moving_avg['Gold/%dEMA' % span] = gold.div(gold.ewm(span=span, adjust=True, ignore_na=True).mean()) - 1
# Drop the leading rows where the longest window has no value yet.
moving_avg = moving_avg.dropna(axis=0)
# Merging moving-average values into the feature space, keyed on Date.
data['Date'] = pd.to_datetime(data['Date'], format='%Y-%b-%d')
data = data.merge(moving_avg, how='left', on='Date')
# Keep only rows where the longest-horizon return is available.
data = data[data['Gold-T-250'].notna()]
prediction_data = data.copy()