- Linear regression-based strategies
- Machine learning-based strategies
- Deep learning-based strategies
Using Linear Regression for Market Movement Prediction
import os
import random

import numpy as np
import pandas as pd  # added: `pd` is used below (DataFrame construction) but was never imported
from pylab import mpl, plt
# Global plotting configuration and a sample x-axis grid.
# The 'seaborn' style name was removed in matplotlib 3.6; fall back to the
# renamed 'seaborn-v0_8' style so the script works on both old and new versions.
try:
    plt.style.use('seaborn')
except OSError:
    plt.style.use('seaborn-v0_8')
mpl.rcParams['savefig.dpi'] = 300
mpl.rcParams['font.family'] = 'serif'
os.environ['PYTHONHASHSEED'] = '0'  # fix hash seed for reproducibility
x = np.linspace(0, 10)  # 50 evenly spaced points on [0, 10]
def set_seeds(seed=100):
    """Seed Python's and NumPy's random number generators for reproducibility.

    Parameters
    ----------
    seed : int
        Seed value applied to both `random` and `numpy.random` (default 100).
    """
    random.seed(seed)
    np.random.seed(seed)


set_seeds()
# Noisy linear data: unit-slope trend plus standard-normal noise.
y = x + np.random.standard_normal(len(x))
# First-degree polynomial fit -> [slope, intercept].
reg = np.polyfit(x, y, deg=1)
trend = np.poly1d(reg)  # callable form of the fitted line

plt.figure(figsize=(10, 6))
plt.plot(x, y, 'bo', label='data')
plt.plot(x, trend(x), 'r', lw=2.5, label='linear regression')
plt.legend(loc=0)
Using logistic regression to predict market direction
# Prepare the GLD price series: log returns plus three lagged-return features.
# NOTE(review): `raw` is assumed to be a DataFrame of price columns loaded
# earlier in the original workflow — confirm it is in scope before running.
symbol = 'GLD'
data = pd.DataFrame(raw[symbol])
data.rename(columns={symbol: 'price'}, inplace=True)
data['return'] = np.log(data['price'] / data['price'].shift(1))
data.dropna(inplace=True)
lags = 3
cols = []
for lag in range(1, lags + 1):  # fixed typo: loop variable was `lages`
    col = 'lag_{}'.format(lag)
    data[col] = data['return'].shift(lag)  # lagged log returns as features
    cols.append(col)
data.dropna(inplace=True)  # drop rows made incomplete by the shifts
from sklearn.metrics import accuracy_score
from sklearn import linear_model  # added: `linear_model` was used below but never imported

# Very large C effectively disables regularization, as in the original example.
# `multi_class='auto'` was removed (it is the default and is deprecated in
# recent scikit-learn releases).
lm = linear_model.LogisticRegression(C=1e7, solver='lbfgs', max_iter=1000)
# Classify the sign of the next return from the lagged returns.
lm.fit(data[cols], np.sign(data['return']))
data['prediction'] = lm.predict(data[cols])
data['prediction'].value_counts()
# Hit ratio: +1 where predicted and realized direction agree.
hits = np.sign(data['return'].iloc[lags:] * data['prediction'].iloc[lags:]).value_counts()
# accuracy_score convention is (y_true, y_pred); the metric is symmetric but
# the canonical order is clearer.
accuracy_score(np.sign(data['return']), data['prediction'])
# Vectorized strategy returns: position (+/-1) times realized log return.
data['strategy'] = data['prediction'] * data['return']
data[['return', 'strategy']].sum().apply(np.exp)  # gross performance
data[['return', 'strategy']].cumsum().apply(np.exp).plot(figsize=(10, 6))
# Variant: five lagged PRICE levels (not returns) as features.
data = pd.DataFrame(raw[symbol])
data.rename(columns={symbol: 'price'}, inplace=True)
data['return'] = np.log(data['price'] / data['price'].shift(1))
lags = 5
cols = []
for lag in range(1, lags + 1):
    col = 'lag_%d' % lag
    data[col] = data['price'].shift(lag)  # lagged price levels as features
    cols.append(col)
data.dropna(inplace=True)
lm.fit(data[cols], np.sign(data['return']))
data['prediction'] = lm.predict(data[cols])
data['prediction'].value_counts()  # fixed typo: was `vlaue_counts`
# fixed syntax: `iloc[lags:'` -> `iloc[lags:]`
hits = np.sign(data['return'].iloc[lags:] * data['prediction'].iloc[lags:]).value_counts()
# fixed unterminated string literal: `'return]` -> `'return'`
accuracy_score(np.sign(data['return']), data['prediction'])
data['strategy'] = data['prediction'] * data['return']
data[['return', 'strategy']].sum().apply(np.exp)
data[['return', 'strategy']].cumsum().apply(np.exp).plot(figsize=(10, 6))
Without considering transaction costs
import ScikitVectorBacktester as SCI

# fixed class name: was `ScikitVectorBacktest` (missing 'er'), which would
# raise AttributeError on the imported module
scibt = SCI.ScikitVectorBacktester('EUR=', '2010-1-1', '2019-12-31', 10000, 0.0, 'logistic')
# In-sample: train and test on the same 2015-2019 window.
scibt.run_strategy('2015-1-1', '2019-12-31', '2015-1-1', '2019-12-31', lags=15)
# Out-of-sample: train 2016-2018, test on 2019.
scibt.run_strategy('2016-1-1', '2018-12-31', '2019-1-1', '2019-12-31', lags=15)
scibt.plot_results()
Consider the same strategy applied to the GDX ETF, for which an out-of-sample outperformance (over the year 2018) is observed.
# fixed class name typo: was `ScikitVecotrBacktester`
scibt = SCI.ScikitVectorBacktester('GDX', '2010-1-1', '2019-12-31', 10000, 0.00, 'logistic')
scibt.run_strategy('2013-1-1', '2017-12-31', '2018-1-1', '2018-12-31', lags=10)
scibt.plot_results()
Taking transaction costs into account
# Same GDX backtest with 25 bps proportional transaction costs.
# fixed class name typo: was `ScibitVectorBacktester`
scibt = SCI.ScikitVectorBacktester('GDX', '2010-1-1', '2019-12-31', 10000, 0.0025, 'logistic')
scibt.run_strategy('2013-1-1', '2017-12-31', '2018-1-1', '2018-12-31', lags=10)
scibt.plot_results()
Using deep learning for market movement prediction
The simple classification problem revisited
# Toy classification data set: study hours vs. pass/fail outcome.
hours = np.array([0.5, 0.75, 1., 1.25, 1.5, 1.75, 1.75, 2., 2.25, 2.5, 2.75, 3., 3.25, 3.5, 4., 4.25, 4.5, 4.75, 5., 5.5])
success = np.array([0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1])
# fixed mismatched bracket: was `...success})]`
data = pd.DataFrame({'hours': hours, 'success': success})
data.info()
“MLP” stands for multi-layer perceptron, which is another expression for a dense neural network.
# fixed module name typo: was `sklearn.neural_netowrk`
from sklearn.neural_network import MLPClassifier

# Single hidden layer of 32 units; fixed random_state for reproducibility.
model = MLPClassifier(hidden_layer_sizes=[32], max_iter=1000, random_state=100)
# Generate the predictions and plot the results.
model.fit(data['hours'].values.reshape(-1, 1), data['success'])
data['prediction'] = model.predict(data['hours'].values.reshape(-1, 1))
data.tail()
# fixed: missing opening quote on 'b-' and missing closing parenthesis
data.plot(x='hours', y=['success', 'prediction'], style=['ro', 'b-'], ylim=[-.1, 1.1], figsize=(10, 6))
Using deep neural networks to predict market direction
Apply the approach to stock market data in the form of log returns from a financial time series
The data needs to be retrieved and prepared
# Prepare EUR= data: log returns, binary direction label, and five lagged
# return features for the DNN classifier.
# NOTE(review): `raw` is assumed to be loaded earlier — confirm it is in scope.
symbol = 'EUR='
data = pd.DataFrame(raw[symbol])
data.rename(columns={symbol: 'price'}, inplace=True)
data['return'] = np.log(data['price'] / data['price'].shift(1))
data['direction'] = np.where(data['return'] > 0, 1, 0)  # 1 = up day, 0 = down day
lags = 5  # fixed typo: was `lages =5`, but the loop below reads `lags`
cols = []
for lag in range(1, lags + 1):
    col = f'lag_{lag}'
    data[col] = data['return'].shift(lag)
    cols.append(col)
data.dropna(inplace=True)
data.round(4).tail()
Use a dense neural network (DNN) with the Keras package: define the training and test data subsets, define the feature columns and labels, and fit the classifier.
import tensorflow as tf
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam, RMSprop

optimizer = Adam(learning_rate=0.0001)


def set_seeds(seed=100):
    """Seed Python, NumPy, and TensorFlow RNGs for reproducible training.

    Parameters
    ----------
    seed : int
        Seed value for all three generators (default 100).
    """
    random.seed(seed)
    np.random.seed(seed)
    tf.random.set_seed(seed)  # fixed: was hard-coded 100, ignoring the parameter


set_seeds()
# Two hidden ReLU layers; sigmoid output for binary direction classification.
model = Sequential()
model.add(Dense(64, activation='relu', input_shape=(lags,)))
model.add(Dense(64, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])
# Chronological train/test split at the cutoff date; z-score normalization
# uses the TRAINING statistics only (no look-ahead into the test period).
cutoff = '2017-12-31'
training_data = data.loc[data.index < cutoff].copy()
mu = training_data.mean()
std = training_data.std()
training_data_ = (training_data - mu) / std
test_data = data.loc[data.index >= cutoff].copy()
test_data_ = (test_data - mu) / std
# %%time  -- IPython cell magic; not valid outside a notebook cell
# fixed: train on the NORMALIZED features (training_data_) — the evaluation
# below uses training_data_, so fitting on raw features was inconsistent
model.fit(training_data_[cols], training_data['direction'], epochs=50, verbose=False, validation_split=0.2, shuffle=False)
res = pd.DataFrame(model.history.history)
res[['accuracy', 'val_accuracy']].plot(figsize=(10, 6), style='--')
Equipped with the fitted classifier, the model can generate predictions on the training data set
model.evaluate(training_data_[cols], training_data['direction'])
# fixed unbalanced parentheses: `predict(...[cols] > 0.5, 1, 0)` mixed the
# np.where arguments into the predict call
pred = np.where(model.predict(training_data_[cols]) > 0.5, 1, 0)
pred[:30].flatten()
# Map the {0, 1} class prediction to a {-1, +1} trading position.
training_data['prediction'] = np.where(pred > 0, 1, -1)
training_data['strategy'] = (training_data['prediction'] * training_data['return'])
training_data[['return', 'strategy']].sum().apply(np.exp)  # gross performance
training_data[['return', 'strategy']].cumsum().apply(np.exp).plot(figsize=(10, 6))
How the strategy performs on the test data set (out-of-sample)
model.evaluate(test_data_[cols], test_data['direction'])  # fixed: missing closing parenthesis
# fixed: predict on the NORMALIZED features (test_data_), matching training
pred = np.where(model.predict(test_data_[cols]) > 0.5, 1, 0)
test_data['prediction'] = np.where(pred > 0, 1, -1)
test_data['prediction'].value_counts()
test_data['strategy'] = (test_data['prediction'] * test_data['return'])
test_data[['return', 'strategy']].sum().apply(np.exp)
test_data[['return', 'strategy']].cumsum().apply(np.exp).plot(figsize=(10, 6))
Adding different types of features
Add more classes/categories and add other types of features to the mix, such as ones based on momentum, volatility, or distance measures.
# Additional feature types: momentum, volatility, and distance from the
# 50-day moving average; all shifted by one day to avoid look-ahead bias.
data['momentum'] = data['return'].rolling(5).mean().shift(1)  # fixed typo: was `rolloing`
data['volatility'] = data['return'].rolling(20).std().shift(1)
data['distance'] = (data['price'] - data['price'].rolling(50).mean()).shift(1)
data.dropna(inplace=True)
# fixed typo: 'volatillity' did not match the 'volatility' column created above,
# which would raise a KeyError when the features are selected later
cols.extend(['momentum', 'volatility', 'distance'])
print(data.round(4).tail())
Redefine the training and test data sets, normalize the feature data, and update the model to reflect the new feature columns.
# Re-split and re-normalize after adding the new feature columns; statistics
# again come from the training window only.
training_data = data.loc[data.index < cutoff].copy()
mu = training_data.mean()
std = training_data.std()
training_data_ = (training_data - mu) / std
test_data = data.loc[data.index >= cutoff].copy()
test_data_ = (test_data - mu) / std
set_seeds()
# Fresh network sized to the enriched feature set (len(cols) inputs).
model = Sequential([
    Dense(32, activation='relu', input_shape=(len(cols),)),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid'),
])
model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])
Based on the enriched feature set, the classifier can be trained.
The in-sample performance of the strategy is quite a bit better than before.
# %%time  -- IPython cell magic; not valid outside a notebook cell
# fixed syntax: was `verbose=[False, epochs=25)`
model.fit(training_data_[cols], training_data['direction'], verbose=False, epochs=25)
model.evaluate(training_data_[cols], training_data['direction'])
# fixed: method is `predict`, not `prediction`, and the parentheses were
# unbalanced (np.where arguments were inside the predict call)
pred = np.where(model.predict(training_data_[cols]) > 0.5, 1, 0)
training_data['prediction'] = np.where(pred > 0, 1, -1)
training_data['strategy'] = (training_data['prediction'] * training_data['return'])
training_data[['return', 'strategy']].sum().apply(np.exp)
training_data[['return', 'strategy']].cumsum().apply(np.exp).plot(figsize=(10, 6))
The final step is the evaluation of the classifier and the derivation of the strategy performance out-of-sample.
model.evaluate(test_data_[cols], test_data['direction'])
# fixed: predict on the NORMALIZED features (test_data_), matching training
pred = np.where(model.predict(test_data_[cols]) > 0.5, 1, 0)
test_data['prediction'] = np.where(pred > 0, 1, -1)
test_data['prediction'].value_counts()
test_data['strategy'] = (test_data['prediction'] * test_data['return'])
test_data[['return', 'strategy']].sum().apply(np.exp)
test_data[['return', 'strategy']].cumsum().apply(np.exp).plot(figsize=(10, 6))









