Appendix 3. Consolidated Evaluation Of LSTM Models And Forecast Dataframe Creation
Key points:
- Summary of LSTM models' accuracy results
- Forecast dataframe creation for RL model
In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import r2_score
In [6]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras import backend as K
from tensorflow.keras.callbacks import EarlyStopping, TerminateOnNaN, ModelCheckpoint
import matplotlib.pyplot as plt
from tensorflow.keras.layers import Conv1D
from tensorflow.keras.layers import TimeDistributed, Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import BatchNormalization
from tensorflow import keras
from tensorflow.keras.models import Model
from tensorflow.keras.regularizers import l2
from tensorflow.keras.layers import LayerNormalization
from tensorflow.keras.utils import plot_model
from IPython.display import display
In [7]:
import json
import shutil
import os
import re
import glob
import time
import ta
from dbManager import DatabaseManager
import tempfile
import shutil
import warnings
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
warnings.filterwarnings('ignore', category=UserWarning, module='keras')
In [8]:
# Get data
# Clean price series, one column per ticker (".HE" symbols — see the forecast
# loop below), with the date column as the index.
df = pd.read_csv("clean_data_Nasdaq_Helsinki_new_new.csv", index_col=0)
# CSV index comes in as strings; convert to DatetimeIndex for weekday filtering later.
df.index = pd.to_datetime(df.index)
# Project-local SQLite wrapper; prints a confirmation line on successful connect.
db_manager = DatabaseManager("thesis_ESG_csc.db")
Successfully connected to the database: thesis_ESG_csc.db
Summary of LSTM models' accuracy results
- Distribution of R2
In [9]:
# Histogram of R2 scores across all per-symbol LSTM models.
# (Dropped the needless f-prefix: the SQL literal has no placeholders.)
output_r2 = pd.read_sql('SELECT Symbol, R2 FROM stock_data', db_manager.con)
# Bin edges 0.90, 0.91, ..., 1.00 (11 edges -> 10 bins).
# NOTE(review): symbols with R2 < 0.90 fall outside every bin and are silently
# excluded from the chart — confirm all stored R2 values are >= 0.90.
bins = [0.9 + i * 0.01 for i in range(11)]
labels = [f'{round(bins[i], 2)}-{round(bins[i] + 0.01, 2)}' for i in range(len(bins) - 1)]
# Assign each symbol's R2 to its bin (include_lowest keeps exact 0.90 scores).
output_r2['r2'] = pd.cut(output_r2['R2'], bins=bins, labels=labels, include_lowest=True)
# Count how many symbols fall into each range, in bin order.
range_counts = output_r2['r2'].value_counts().sort_index()
# Plot the bar chart
plt.figure(figsize=(8, 5))
range_counts.plot(kind='bar', color='skyblue', edgecolor='black')
plt.title('Distribution of R2', fontsize=14)
plt.xlabel('R2', fontsize=12)
plt.ylabel('Count of Symbols', fontsize=12)
plt.xticks(rotation=45)
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()  # render explicitly so the figure displays on Run All
- Average and median directional accuracy
In [10]:
# Average and (approximate) median directional accuracy over all LSTM models.
# The "median" subquery picks the row at OFFSET COUNT/2 of the ordered column,
# i.e. the upper-middle element when the row count is even — no interpolation
# between the two middle values, so it can differ slightly from a true median.
output_stats = pd.read_sql(f"""
SELECT
AVG(Directional_accuracy) AS average_directional_accuracy,
(
SELECT Directional_accuracy
FROM stock_data AS sub
ORDER BY Directional_accuracy
LIMIT 1
OFFSET (SELECT COUNT(*) FROM stock_data) / 2
) AS median_directional_accuracy
FROM
stock_data;
""", db_manager.con)
# Last expression of the cell: display the one-row summary frame.
output_stats
Out[10]:
| average_directional_accuracy | median_directional_accuracy | |
|---|---|---|
| 0 | 70.606 | 71.59 |
- Summary for LSTM models where the directional accuracy exceeds 70%
In [14]:
# Count and percentage of LSTM models whose directional accuracy exceeds 70 %.
# CAST(... AS REAL) forces floating-point division in SQLite so the percentage
# is not truncated to an integer.
output_high_accuracy = pd.read_sql(f"""
SELECT
SUM(CASE WHEN Directional_accuracy > 70 THEN 1 ELSE 0 END) AS count_greater_than_70,
COUNT(*) AS total_count,
CAST(SUM(CASE WHEN Directional_accuracy > 70 THEN 1 ELSE 0 END) AS REAL) * 100.0 / COUNT(*) AS percentage_greater_than_70
FROM
stock_data;
""", db_manager.con)
# Last expression of the cell: display the one-row summary frame.
output_high_accuracy
Out[14]:
| count_greater_than_70 | total_count | percentage_greater_than_70 | |
|---|---|---|---|
| 0 | 80 | 140 | 57.142857 |
- Distribution of Directional Accuracy
In [15]:
# Histogram of directional accuracy across all per-symbol LSTM models.
# (Dropped the needless f-prefix: the SQL literal has no placeholders.)
output_hm = pd.read_sql('SELECT Symbol, Directional_accuracy FROM stock_data', db_manager.con)
# Bin edges every `frequency` percentage points over the full 0-100 range.
frequency = 5
frequency_n = int(100 / frequency)
# BUG FIX: range(frequency_n) produced edges 0, 5, ..., 95 only, so the last
# bin was 90-95 and any symbol with accuracy above 95 % was silently dropped
# from the chart. range(frequency_n + 1) yields edges 0, 5, ..., 100.
bins = [i * frequency for i in range(frequency_n + 1)]
labels = [f'{bins[i]}-{bins[i + 1]}' for i in range(len(bins) - 1)]
# Create the 'range' column (include_lowest keeps exact-0 values in the first bin).
output_hm['accuracy_range'] = pd.cut(output_hm['Directional_accuracy'], bins=bins, labels=labels, include_lowest=True)
# Count how many symbols fall into each range, in bin order.
range_counts = output_hm['accuracy_range'].value_counts().sort_index()
# Plot the bar chart
plt.figure(figsize=(8, 5))
range_counts.plot(kind='bar', color='skyblue', edgecolor='black')
plt.title('Distribution of Directional Accuracy', fontsize=14)
plt.xlabel('Accuracy Range (%)', fontsize=12)
plt.ylabel('Count of Symbols', fontsize=12)
plt.xticks(rotation=45)
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()  # render explicitly so the figure displays on Run All
Forecast Dataframe Creation
- Creating a dataframe for the PPO model
- Showing random model architectures and visualizing forecasts
In [16]:
# Create sequences
def create_sequences(data, window_size):
    """Turn a 2-D array into overlapping input windows and next-step targets.

    Parameters
    ----------
    data : array-like of shape (T, F)
        Scaled feature matrix; the target series must be in column 0.
    window_size : int
        Number of consecutive rows per input window.

    Returns
    -------
    X : np.ndarray of shape (T - window_size, window_size, F)
        Sliding windows over `data`.
    y : np.ndarray of shape (T - window_size, 1)
        First-column value of the row immediately after each window.
    """
    n_samples = len(data) - window_size
    X = np.array([data[start:start + window_size] for start in range(n_samples)])
    # Target = first feature of the step right after the window.
    y = np.array([data[start + window_size][0] for start in range(n_samples)])
    return X, y.reshape(-1, 1)
In [18]:
# For every symbol: rebuild the exact preprocessing used at LSTM-training time,
# run the stored model over the full history, reconstruct price-level forecasts
# from the predicted diffs, and accumulate them into `df_forecast` (one
# `f_<symbol>` column per ticker) for the downstream PPO/RL stage.
examples_list=['GRK.HE', 'CONSTI.HE', 'SOSI1.HE', 'NOHO.HE']  # symbols to visualize
first_loop = True
for col in df.columns:
    target_col = col  # keep the symbol name; `col` gets shadowed further down
    df_selected = df[[target_col]]
    # get model parameters + hyperparameters (DB keys are symbols without ".HE")
    model_path = db_manager.get_lstm_model(col.replace(".HE",""))
    window_size = db_manager.get_lstm_model_window(col.replace(".HE",""))
    output_r2 = db_manager.get_r2_lstm_model(col.replace(".HE",""))
    output_da = db_manager.get_directional_accuracy_lstm_model(col.replace(".HE",""))
    output_hyperparameters = db_manager.get_hyperparameters_lstm_model(col.replace(".HE",""))
    output_hyperparameters=json.loads(output_hyperparameters)  # stored as JSON text
    # Resample weekly on Wednesdays (weekday == 2), then smooth with a 3-week mean
    df_weekly = df_selected[df_selected.index.weekday == 2].dropna().sort_index().copy()
    df_weekly[target_col] = df_weekly[target_col].rolling(window=3).mean()
    df_weekly['RSI'] = ta.momentum.RSIIndicator(close=df_weekly[target_col], window=14).rsi()
    # Add technical indicators
    macd = ta.trend.MACD(close=df_weekly[target_col])
    df_weekly['MACD'] = macd.macd()
    df_weekly['MACD_signal'] = macd.macd_signal()
    boll = ta.volatility.BollingerBands(close=df_weekly[target_col], window=20, window_dev=2)
    df_weekly['BB_high'] = boll.bollinger_hband()
    df_weekly['BB_low'] = boll.bollinger_lband()
    df_weekly['BB_width'] = df_weekly['BB_high'] - df_weekly['BB_low']
    # Update target timeseries: keep smoothed price levels aside, model the diffs
    df_original=pd.DataFrame(df_weekly[target_col].copy())
    df_weekly[target_col] = df_weekly[target_col].diff()
    df_weekly=df_weekly.dropna().sort_index()
    # Drop unnecessary features: start from all indicators, keep only the ones
    # the stored hyperparameters mark as necessary; if the key is absent, keep all.
    drop_features = ['RSI', 'MACD', 'MACD_signal', 'BB_high', 'BB_low', 'BB_width']
    if "necessary_features" in output_hyperparameters:
        necessary_features=output_hyperparameters["necessary_features"]
        for nf in necessary_features:
            drop_features.remove(nf)
    else:
        drop_features = []
    df_weekly=df_weekly.drop(columns=drop_features)
    # Standardize
    # Convert % to indices
    total_len = len(df_weekly)
    p = lambda x: int(total_len * x)  # fraction of the series -> row index
    # Define splits (must mirror the training notebook's split scheme)
    lstm_inx_dict = {0: [p(0.0), p(0.5)], 1: [p(0.7), p(0.8)]}
    rl_inx_dict = {0: [p(0.5), p(0.7)], 1: [p(0.8), p(0.9)]}
    test_inx_dict = {0: [p(0.9), total_len]}
    # Detect test_date_start
    test_date_start = df_weekly.index[test_inx_dict[0][0]].date()
    # Create index arrays: flatten {id: [start, end]} ranges into one index vector
    def merge_index_ranges(index_dict):
        return np.concatenate([np.arange(start, end) for start, end in index_dict.values()])
    lstm_indices = merge_index_ranges(lstm_inx_dict)
    # NOTE(review): rl_indices, test_indices, dates and target_index below are
    # computed but never used in this cell — presumably kept from the training
    # notebook; confirm before removing.
    rl_indices = merge_index_ranges(rl_inx_dict)
    test_indices = merge_index_ranges(test_inx_dict)
    dates = df_weekly.index
    scalers = {}
    data_scaled = pd.DataFrame(index=df_weekly.index)
    # NOTE(review): this inner loop reuses the name `col`, shadowing the outer
    # loop variable; everything after it must rely on `target_col`, which it does.
    for col in df_weekly.columns:
        # Per-column scaler fitted ONLY on the LSTM-training rows to avoid leakage.
        scalers[col] = StandardScaler()
        # Use consistent type (NumPy arrays) for fit and transform
        train_values = df_weekly.iloc[lstm_indices][[col]].values # NumPy
        scalers[col].fit(train_values)
        # Transform full column, also as NumPy
        transformed = scalers[col].transform(df_weekly[[col]].values)
        data_scaled[col] = transformed
    data_scaled = data_scaled.to_numpy()
    X,y=create_sequences(data_scaled, window_size)
    model = keras.models.load_model(model_path)
    target_index = df_weekly.index[window_size:]
    y_pred = model.predict(X)
    # Inverse transform predictions back to (unscaled) weekly diffs
    y_pred_inv = pd.DataFrame(columns=[target_col])
    y_pred_inv[target_col] = scalers[target_col].inverse_transform(y_pred).flatten()
    y_actual_inv = pd.DataFrame(columns=[target_col])
    y_actual_inv[target_col] = scalers[target_col].inverse_transform(y).flatten()
    # CALCULATE ORIGINAL TIME SERIES
    # Get last original item: the price level just before the first predicted step,
    # used to seed both the actual and the predicted reconstructions.
    last_original_item = df_original.iloc[-len(y_pred_inv)-1][target_col]
    original_list = [last_original_item]
    prediction_list = [last_original_item]
    # One-step-ahead reconstruction: each prediction is the PREVIOUS ACTUAL level
    # (original_list[-2], since [-1] was just appended) plus the predicted diff,
    # so forecast errors do not compound across steps.
    for index, row in y_actual_inv.iterrows():
        original_list.append(original_list[-1]+row[target_col])
        prediction_list.append(original_list[-2]+y_pred_inv.loc[index,target_col])
    # df_save has len(y_pred_inv)-1 rows; [2:] drops the seed and the first
    # reconstructed step so both lists line up with it exactly.
    df_save=df_original.iloc[-len(y_pred_inv):-1].copy()
    df_save['prediction'] = prediction_list[2:]
    df_save['original_reconstructed'] = original_list[2:]
    # Save to df_forecast: first symbol creates the frame, later ones outer-join
    if first_loop:
        first_loop = False
        df_forecast = pd.DataFrame({f'f_{target_col}': prediction_list[2:]}, index=df_save.index)
    else:
        new_col = df_save['prediction'].rename(f'f_{target_col}')
        df_forecast = df_forecast.join(new_col, how='outer')
    # Show examples: architecture diagram + actual-vs-predicted plot for a few symbols
    if target_col in examples_list:
        print(f'Symbol: {target_col}')
        print(f'Directional accuracy: {output_da} %')
        print(f'Model architecture:')
        model_plot = plot_model(
            model,
            show_shapes=True,
            show_layer_names=True,
            rankdir='TB',
            dpi=75
        )
        display(model_plot)
        plt.figure(figsize=(12, 5))
        plt.plot(df_save.iloc[-100:]['original_reconstructed'], label='Actual', linewidth=1, marker='o')
        plt.plot(df_save.iloc[-100:]['prediction'], label='Predicted', linewidth=1, marker='x')
        plt.title(f'ORIGINAL TIMESERIES: Prediction vs Actual - TEST for {target_col}')
        plt.xlabel('Time Step')
        plt.ylabel('Value')
        plt.legend()
        plt.grid(True)
        plt.tight_layout()
        plt.show()
        plt.close()  # free the figure; this loop creates one per example symbol
39/39 ━━━━━━━━━━━━━━━━━━━━ 4s 89ms/step 40/40 ━━━━━━━━━━━━━━━━━━━━ 1s 22ms/step 40/40 ━━━━━━━━━━━━━━━━━━━━ 1s 10ms/step 40/40 ━━━━━━━━━━━━━━━━━━━━ 0s 7ms/step 39/39 ━━━━━━━━━━━━━━━━━━━━ 1s 32ms/step 39/39 ━━━━━━━━━━━━━━━━━━━━ 1s 19ms/step 40/40 ━━━━━━━━━━━━━━━━━━━━ 2s 37ms/step 39/39 ━━━━━━━━━━━━━━━━━━━━ 1s 28ms/step 39/39 ━━━━━━━━━━━━━━━━━━━━ 1s 11ms/step 39/39 ━━━━━━━━━━━━━━━━━━━━ 1s 15ms/step 39/39 ━━━━━━━━━━━━━━━━━━━━ 1s 16ms/step 40/40 ━━━━━━━━━━━━━━━━━━━━ 1s 20ms/step 40/40 ━━━━━━━━━━━━━━━━━━━━ 0s 8ms/step 40/40 ━━━━━━━━━━━━━━━━━━━━ 1s 16ms/step 40/40 ━━━━━━━━━━━━━━━━━━━━ 0s 6ms/step 40/40 ━━━━━━━━━━━━━━━━━━━━ 0s 8ms/step Symbol: CONSTI.HE Directional accuracy: 77.97 % Model architecture:
40/40 ━━━━━━━━━━━━━━━━━━━━ 1s 10ms/step 39/39 ━━━━━━━━━━━━━━━━━━━━ 1s 17ms/step 39/39 ━━━━━━━━━━━━━━━━━━━━ 1s 29ms/step 40/40 ━━━━━━━━━━━━━━━━━━━━ 1s 19ms/step 40/40 ━━━━━━━━━━━━━━━━━━━━ 1s 13ms/step 39/39 ━━━━━━━━━━━━━━━━━━━━ 2s 39ms/step 40/40 ━━━━━━━━━━━━━━━━━━━━ 1s 26ms/step 40/40 ━━━━━━━━━━━━━━━━━━━━ 1s 13ms/step 40/40 ━━━━━━━━━━━━━━━━━━━━ 1s 13ms/step 40/40 ━━━━━━━━━━━━━━━━━━━━ 0s 9ms/step 40/40 ━━━━━━━━━━━━━━━━━━━━ 0s 7ms/step 40/40 ━━━━━━━━━━━━━━━━━━━━ 0s 10ms/step 38/38 ━━━━━━━━━━━━━━━━━━━━ 3s 70ms/step 40/40 ━━━━━━━━━━━━━━━━━━━━ 1s 13ms/step 39/39 ━━━━━━━━━━━━━━━━━━━━ 1s 12ms/step 40/40 ━━━━━━━━━━━━━━━━━━━━ 1s 12ms/step 39/39 ━━━━━━━━━━━━━━━━━━━━ 1s 29ms/step 39/39 ━━━━━━━━━━━━━━━━━━━━ 1s 11ms/step 40/40 ━━━━━━━━━━━━━━━━━━━━ 1s 23ms/step 39/39 ━━━━━━━━━━━━━━━━━━━━ 1s 26ms/step 40/40 ━━━━━━━━━━━━━━━━━━━━ 1s 14ms/step Symbol: GRK.HE Directional accuracy: 69.39 % Model architecture:
40/40 ━━━━━━━━━━━━━━━━━━━━ 0s 8ms/step 40/40 ━━━━━━━━━━━━━━━━━━━━ 0s 8ms/step 40/40 ━━━━━━━━━━━━━━━━━━━━ 1s 16ms/step 40/40 ━━━━━━━━━━━━━━━━━━━━ 0s 8ms/step 40/40 ━━━━━━━━━━━━━━━━━━━━ 1s 16ms/step 40/40 ━━━━━━━━━━━━━━━━━━━━ 1s 26ms/step 39/39 ━━━━━━━━━━━━━━━━━━━━ 1s 14ms/step 40/40 ━━━━━━━━━━━━━━━━━━━━ 0s 6ms/step 40/40 ━━━━━━━━━━━━━━━━━━━━ 1s 19ms/step 39/39 ━━━━━━━━━━━━━━━━━━━━ 1s 16ms/step 40/40 ━━━━━━━━━━━━━━━━━━━━ 1s 15ms/step 40/40 ━━━━━━━━━━━━━━━━━━━━ 1s 23ms/step 40/40 ━━━━━━━━━━━━━━━━━━━━ 1s 9ms/step 39/39 ━━━━━━━━━━━━━━━━━━━━ 1s 30ms/step 39/39 ━━━━━━━━━━━━━━━━━━━━ 1s 11ms/step 39/39 ━━━━━━━━━━━━━━━━━━━━ 1s 19ms/step 40/40 ━━━━━━━━━━━━━━━━━━━━ 1s 15ms/step 39/39 ━━━━━━━━━━━━━━━━━━━━ 1s 25ms/step 39/39 ━━━━━━━━━━━━━━━━━━━━ 1s 34ms/step 40/40 ━━━━━━━━━━━━━━━━━━━━ 0s 8ms/step 40/40 ━━━━━━━━━━━━━━━━━━━━ 1s 14ms/step 40/40 ━━━━━━━━━━━━━━━━━━━━ 1s 14ms/step 40/40 ━━━━━━━━━━━━━━━━━━━━ 1s 12ms/step 40/40 ━━━━━━━━━━━━━━━━━━━━ 1s 18ms/step 39/39 ━━━━━━━━━━━━━━━━━━━━ 1s 20ms/step 39/39 ━━━━━━━━━━━━━━━━━━━━ 1s 19ms/step 40/40 ━━━━━━━━━━━━━━━━━━━━ 0s 8ms/step 40/40 ━━━━━━━━━━━━━━━━━━━━ 1s 13ms/step 40/40 ━━━━━━━━━━━━━━━━━━━━ 1s 15ms/step 40/40 ━━━━━━━━━━━━━━━━━━━━ 1s 14ms/step 40/40 ━━━━━━━━━━━━━━━━━━━━ 1s 13ms/step 39/39 ━━━━━━━━━━━━━━━━━━━━ 1s 14ms/step 40/40 ━━━━━━━━━━━━━━━━━━━━ 1s 11ms/step 40/40 ━━━━━━━━━━━━━━━━━━━━ 0s 7ms/step 40/40 ━━━━━━━━━━━━━━━━━━━━ 0s 7ms/step 39/39 ━━━━━━━━━━━━━━━━━━━━ 1s 15ms/step 39/39 ━━━━━━━━━━━━━━━━━━━━ 1s 9ms/step 40/40 ━━━━━━━━━━━━━━━━━━━━ 1s 23ms/step Symbol: NOHO.HE Directional accuracy: 73.79 % Model architecture:
39/39 ━━━━━━━━━━━━━━━━━━━━ 1s 26ms/step 40/40 ━━━━━━━━━━━━━━━━━━━━ 0s 7ms/step 39/39 ━━━━━━━━━━━━━━━━━━━━ 1s 10ms/step 40/40 ━━━━━━━━━━━━━━━━━━━━ 1s 12ms/step 40/40 ━━━━━━━━━━━━━━━━━━━━ 1s 13ms/step 40/40 ━━━━━━━━━━━━━━━━━━━━ 1s 13ms/step 40/40 ━━━━━━━━━━━━━━━━━━━━ 1s 11ms/step 39/39 ━━━━━━━━━━━━━━━━━━━━ 1s 26ms/step 40/40 ━━━━━━━━━━━━━━━━━━━━ 0s 6ms/step 40/40 ━━━━━━━━━━━━━━━━━━━━ 1s 20ms/step 40/40 ━━━━━━━━━━━━━━━━━━━━ 0s 8ms/step 40/40 ━━━━━━━━━━━━━━━━━━━━ 0s 9ms/step 40/40 ━━━━━━━━━━━━━━━━━━━━ 0s 10ms/step 39/39 ━━━━━━━━━━━━━━━━━━━━ 1s 15ms/step 40/40 ━━━━━━━━━━━━━━━━━━━━ 0s 9ms/step 40/40 ━━━━━━━━━━━━━━━━━━━━ 1s 10ms/step 40/40 ━━━━━━━━━━━━━━━━━━━━ 1s 11ms/step 38/38 ━━━━━━━━━━━━━━━━━━━━ 1s 31ms/step 39/39 ━━━━━━━━━━━━━━━━━━━━ 3s 71ms/step 40/40 ━━━━━━━━━━━━━━━━━━━━ 1s 10ms/step 39/39 ━━━━━━━━━━━━━━━━━━━━ 1s 28ms/step 40/40 ━━━━━━━━━━━━━━━━━━━━ 1s 13ms/step 40/40 ━━━━━━━━━━━━━━━━━━━━ 1s 10ms/step 39/39 ━━━━━━━━━━━━━━━━━━━━ 1s 18ms/step 40/40 ━━━━━━━━━━━━━━━━━━━━ 1s 11ms/step 39/39 ━━━━━━━━━━━━━━━━━━━━ 1s 32ms/step 40/40 ━━━━━━━━━━━━━━━━━━━━ 1s 10ms/step 40/40 ━━━━━━━━━━━━━━━━━━━━ 1s 15ms/step 39/39 ━━━━━━━━━━━━━━━━━━━━ 1s 29ms/step 40/40 ━━━━━━━━━━━━━━━━━━━━ 1s 19ms/step 40/40 ━━━━━━━━━━━━━━━━━━━━ 0s 6ms/step 40/40 ━━━━━━━━━━━━━━━━━━━━ 0s 9ms/step 40/40 ━━━━━━━━━━━━━━━━━━━━ 0s 9ms/step 40/40 ━━━━━━━━━━━━━━━━━━━━ 0s 7ms/step Symbol: SOSI1.HE Directional accuracy: 61.86 % Model architecture:
40/40 ━━━━━━━━━━━━━━━━━━━━ 0s 8ms/step 40/40 ━━━━━━━━━━━━━━━━━━━━ 0s 8ms/step 39/39 ━━━━━━━━━━━━━━━━━━━━ 3s 63ms/step 40/40 ━━━━━━━━━━━━━━━━━━━━ 1s 24ms/step 39/39 ━━━━━━━━━━━━━━━━━━━━ 1s 21ms/step 39/39 ━━━━━━━━━━━━━━━━━━━━ 3s 63ms/step 39/39 ━━━━━━━━━━━━━━━━━━━━ 1s 31ms/step 40/40 ━━━━━━━━━━━━━━━━━━━━ 2s 32ms/step 40/40 ━━━━━━━━━━━━━━━━━━━━ 1s 13ms/step 40/40 ━━━━━━━━━━━━━━━━━━━━ 1s 11ms/step 40/40 ━━━━━━━━━━━━━━━━━━━━ 0s 6ms/step 40/40 ━━━━━━━━━━━━━━━━━━━━ 0s 7ms/step 40/40 ━━━━━━━━━━━━━━━━━━━━ 1s 12ms/step 40/40 ━━━━━━━━━━━━━━━━━━━━ 0s 10ms/step 40/40 ━━━━━━━━━━━━━━━━━━━━ 0s 6ms/step 40/40 ━━━━━━━━━━━━━━━━━━━━ 0s 7ms/step 40/40 ━━━━━━━━━━━━━━━━━━━━ 1s 18ms/step 40/40 ━━━━━━━━━━━━━━━━━━━━ 1s 16ms/step 40/40 ━━━━━━━━━━━━━━━━━━━━ 0s 9ms/step 39/39 ━━━━━━━━━━━━━━━━━━━━ 1s 28ms/step 40/40 ━━━━━━━━━━━━━━━━━━━━ 1s 29ms/step 40/40 ━━━━━━━━━━━━━━━━━━━━ 1s 13ms/step 40/40 ━━━━━━━━━━━━━━━━━━━━ 0s 7ms/step 40/40 ━━━━━━━━━━━━━━━━━━━━ 0s 6ms/step 40/40 ━━━━━━━━━━━━━━━━━━━━ 0s 9ms/step 40/40 ━━━━━━━━━━━━━━━━━━━━ 1s 13ms/step 39/39 ━━━━━━━━━━━━━━━━━━━━ 1s 12ms/step 40/40 ━━━━━━━━━━━━━━━━━━━━ 1s 11ms/step 40/40 ━━━━━━━━━━━━━━━━━━━━ 2s 45ms/step 40/40 ━━━━━━━━━━━━━━━━━━━━ 0s 7ms/step 40/40 ━━━━━━━━━━━━━━━━━━━━ 0s 9ms/step
In [114]:
# Order the forecast frame chronologically and keep only dates where every
# symbol has a prediction, then persist it for the downstream RL (PPO) stage.
df_forecast = df_forecast.sort_index().dropna()
print(df_forecast.shape)  # (rows, symbols) sanity check before writing
df_forecast.to_csv("forecasts.csv")
(1208, 140)
In [ ]: