Appendix 3. Consolidated Evaluation Of LSTM Models And Forecast Dataframe Creation

Key points:

  • Summary of LSTM models' accuracy results
  • Forecast dataframe creation for RL model
In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import r2_score
In [6]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras import backend as K
from tensorflow.keras.callbacks import EarlyStopping, TerminateOnNaN, ModelCheckpoint
import matplotlib.pyplot as plt
from tensorflow.keras.layers import Conv1D
from tensorflow.keras.layers import TimeDistributed, Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import BatchNormalization
from tensorflow import keras
from tensorflow.keras.models import Model
from tensorflow.keras.regularizers import l2
from tensorflow.keras.layers import LayerNormalization
from tensorflow.keras.utils import plot_model
from IPython.display import display
In [7]:
import json
import shutil
import os
import re
import glob
import time
import ta
from dbManager import DatabaseManager
import tempfile
import shutil
import warnings

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' 
warnings.filterwarnings('ignore', category=UserWarning, module='keras')
In [8]:
# Get data
df = pd.read_csv("clean_data_Nasdaq_Helsinki_new_new.csv", index_col=0)
df.index = pd.to_datetime(df.index)

db_manager = DatabaseManager("thesis_ESG_csc.db")
Successfully connected to the database: thesis_ESG_csc.db

Summary of LSTM models' accuracy results

  • Distribution of R2
In [9]:
output_r2 = pd.read_sql(f'SELECT Symbol, R2 FROM stock_data', db_manager.con)
# Bins
bins = [0.9+i*0.01 for i in range(11)]
labels = [f'{round(bins[i], 2)}-{round(bins[i]+0.01, 2)}' for i in range(len(bins) - 1)]

# Create the 'range' column
output_r2['r2'] = pd.cut(output_r2['R2'], bins=bins, labels=labels, include_lowest=True)

# Count how many symbols fall into each range
range_counts = output_r2['r2'].value_counts().sort_index()

# Plot the bar chart
plt.figure(figsize=(8, 5))
range_counts.plot(kind='bar', color='skyblue', edgecolor='black')

plt.title('Distribution of R2', fontsize=14)
plt.xlabel('R2', fontsize=12)
plt.ylabel('Count of Symbols', fontsize=12)
plt.xticks(rotation=45)
plt.grid(axis='y', linestyle='--', alpha=0.7)

plt.tight_layout()
No description has been provided for this image
  • Average and median directional accuracy
In [10]:
output_stats = pd.read_sql(f"""
SELECT
    AVG(Directional_accuracy) AS average_directional_accuracy,
    (
        SELECT Directional_accuracy
        FROM stock_data AS sub
        ORDER BY Directional_accuracy
        LIMIT 1
        OFFSET (SELECT COUNT(*) FROM stock_data) / 2
    ) AS median_directional_accuracy
FROM
    stock_data;
""", db_manager.con)
output_stats
Out[10]:
average_directional_accuracy median_directional_accuracy
0 70.606 71.59
  • Summary for LSTM models where the directional accuracy exceeds 70%
In [14]:
output_high_accuracy = pd.read_sql(f"""
SELECT
    SUM(CASE WHEN Directional_accuracy > 70 THEN 1 ELSE 0 END) AS count_greater_than_70,
    COUNT(*) AS total_count,
    CAST(SUM(CASE WHEN Directional_accuracy > 70 THEN 1 ELSE 0 END) AS REAL) * 100.0 / COUNT(*) AS percentage_greater_than_70
FROM
    stock_data;
""", db_manager.con)
output_high_accuracy
Out[14]:
count_greater_than_70 total_count percentage_greater_than_70
0 80 140 57.142857
  • Distribution of Directional Accuracy
In [15]:
output_hm = pd.read_sql(f'SELECT Symbol, Directional_accuracy FROM stock_data', db_manager.con)
# Bins
frequency = 5
frequency_n = int(100 / frequency)
bins = [i*frequency for i in range(frequency_n)]
labels = [f'{bins[i]}-{bins[i+1]}' for i in range(len(bins)-1)]

# Create the 'range' column
output_hm['accuracy_range'] = pd.cut(output_hm['Directional_accuracy'], bins=bins, labels=labels, include_lowest=True)

# Count how many symbols fall into each range
range_counts = output_hm['accuracy_range'].value_counts().sort_index()

# Plot the bar chart
plt.figure(figsize=(8, 5))
range_counts.plot(kind='bar', color='skyblue', edgecolor='black')

plt.title('Distribution of Directional Accuracy', fontsize=14)
plt.xlabel('Accuracy Range (%)', fontsize=12)
plt.ylabel('Count of Symbols', fontsize=12)
plt.xticks(rotation=45)
plt.grid(axis='y', linestyle='--', alpha=0.7)

plt.tight_layout()
No description has been provided for this image

Forecast Dataframe Creation

  • Creating a dataframe for the PPO model
  • Showing random model architectures and visualizing forecasts
In [16]:
# Create sequences
def create_sequences(data, window_size):
    X, y = [], []
    for i in range(len(data) - window_size):
        X.append(data[i:i + window_size])
        y.append(data[i + window_size][0])  # target is first column

    return np.array(X), np.array(y).reshape(-1, 1)
In [18]:
examples_list=['GRK.HE', 'CONSTI.HE', 'SOSI1.HE', 'NOHO.HE']
first_loop = True
for col in df.columns:
    target_col = col
    df_selected = df[[target_col]]

    # get model parametesr + hyperparameters
    model_path = db_manager.get_lstm_model(col.replace(".HE",""))
    window_size = db_manager.get_lstm_model_window(col.replace(".HE",""))
    output_r2 = db_manager.get_r2_lstm_model(col.replace(".HE",""))
    output_da = db_manager.get_directional_accuracy_lstm_model(col.replace(".HE",""))
    output_hyperparameters = db_manager.get_hyperparameters_lstm_model(col.replace(".HE",""))
    output_hyperparameters=json.loads(output_hyperparameters)
    
    # Resample weekly on Wednesdays
    df_weekly = df_selected[df_selected.index.weekday == 2].dropna().sort_index().copy()
    df_weekly[target_col] = df_weekly[target_col].rolling(window=3).mean()
    df_weekly['RSI'] = ta.momentum.RSIIndicator(close=df_weekly[target_col], window=14).rsi()

    # Add technical indicators
    macd = ta.trend.MACD(close=df_weekly[target_col])
    df_weekly['MACD'] = macd.macd()
    df_weekly['MACD_signal'] = macd.macd_signal()
    
    boll = ta.volatility.BollingerBands(close=df_weekly[target_col], window=20, window_dev=2)
    df_weekly['BB_high'] = boll.bollinger_hband()
    df_weekly['BB_low'] = boll.bollinger_lband()
    df_weekly['BB_width'] = df_weekly['BB_high'] - df_weekly['BB_low']

    # Update target timeseries
    df_original=pd.DataFrame(df_weekly[target_col].copy())
    df_weekly[target_col] = df_weekly[target_col].diff()
    df_weekly=df_weekly.dropna().sort_index()

    # Drop unnecessary features
    drop_features = ['RSI', 'MACD', 'MACD_signal', 'BB_high', 'BB_low', 'BB_width']
    if "necessary_features" in output_hyperparameters:
        necessary_features=output_hyperparameters["necessary_features"]
        for nf in necessary_features:
            drop_features.remove(nf)
    else:
        drop_features = []

    df_weekly=df_weekly.drop(columns=drop_features)

    # Standardize
    # Convert % to indices
    total_len = len(df_weekly)
    p = lambda x: int(total_len * x)
    
    # Define splits
    lstm_inx_dict = {0: [p(0.0), p(0.5)], 1: [p(0.7), p(0.8)]}
    rl_inx_dict = {0: [p(0.5), p(0.7)], 1: [p(0.8), p(0.9)]}
    test_inx_dict = {0: [p(0.9), total_len]}
    
    # Detect test_date_start
    test_date_start = df_weekly.index[test_inx_dict[0][0]].date()
    
    # Create index arrays
    def merge_index_ranges(index_dict):
        return np.concatenate([np.arange(start, end) for start, end in index_dict.values()])
    
    lstm_indices = merge_index_ranges(lstm_inx_dict)
    rl_indices = merge_index_ranges(rl_inx_dict)
    test_indices = merge_index_ranges(test_inx_dict)
    
    dates = df_weekly.index
    
    scalers = {}
    data_scaled = pd.DataFrame(index=df_weekly.index)

    for col in df_weekly.columns:
        scalers[col] = StandardScaler()
    
        # Use consistent type (NumPy arrays) for fit and transform
        train_values = df_weekly.iloc[lstm_indices][[col]].values  # NumPy
        scalers[col].fit(train_values)
    
        # Transform full column, also as NumPy
        transformed = scalers[col].transform(df_weekly[[col]].values)
    
        data_scaled[col] = transformed
        
    data_scaled = data_scaled.to_numpy()
    X,y=create_sequences(data_scaled, window_size)

    model = keras.models.load_model(model_path)
    
    target_index = df_weekly.index[window_size:]
    
    y_pred = model.predict(X)
    
    # Inverse transform predictions
    y_pred_inv = pd.DataFrame(columns=[target_col])
    y_pred_inv[target_col] = scalers[target_col].inverse_transform(y_pred).flatten()

    y_actual_inv = pd.DataFrame(columns=[target_col])
    y_actual_inv[target_col] = scalers[target_col].inverse_transform(y).flatten()

    
    # CALCULATE ORIGINAL TIME SERIES
    # Get last original item
    last_original_item = df_original.iloc[-len(y_pred_inv)-1][target_col]
    
    original_list = [last_original_item]
    prediction_list = [last_original_item]
    for index, row in y_actual_inv.iterrows():
        original_list.append(original_list[-1]+row[target_col])
        prediction_list.append(original_list[-2]+y_pred_inv.loc[index,target_col])

    df_save=df_original.iloc[-len(y_pred_inv):-1].copy()
    df_save['prediction'] = prediction_list[2:]
    df_save['original_reconstructed'] = original_list[2:]

    # Save to df_forecast
    if first_loop:
        first_loop = False
        df_forecast = pd.DataFrame({f'f_{target_col}': prediction_list[2:]}, index=df_save.index)
    else:
        new_col = df_save['prediction'].rename(f'f_{target_col}')
        df_forecast = df_forecast.join(new_col, how='outer')

    # Show examples
    if target_col in examples_list:
        print(f'Symbol: {target_col}')
        print(f'Directional accuracy: {output_da} %')
        print(f'Model architecture:')
        model_plot = plot_model(
            model, 
            show_shapes=True, 
            show_layer_names=True,
            rankdir='TB',
            dpi=75
        )
        display(model_plot)
    
        plt.figure(figsize=(12, 5))
        plt.plot(df_save.iloc[-100:]['original_reconstructed'], label='Actual', linewidth=1, marker='o')
        plt.plot(df_save.iloc[-100:]['prediction'], label='Predicted', linewidth=1, marker='x')
        plt.title(f'ORIGINAL TIMESERIES: Prediction vs Actual - TEST for {target_col}')
        plt.xlabel('Time Step')
        plt.ylabel('Value')
        plt.legend()
        plt.grid(True)
        plt.tight_layout()
        plt.show()
        plt.close() 
39/39 ━━━━━━━━━━━━━━━━━━━━ 4s 89ms/step
40/40 ━━━━━━━━━━━━━━━━━━━━ 1s 22ms/step
40/40 ━━━━━━━━━━━━━━━━━━━━ 1s 10ms/step
40/40 ━━━━━━━━━━━━━━━━━━━━ 0s 7ms/step
39/39 ━━━━━━━━━━━━━━━━━━━━ 1s 32ms/step
39/39 ━━━━━━━━━━━━━━━━━━━━ 1s 19ms/step
40/40 ━━━━━━━━━━━━━━━━━━━━ 2s 37ms/step
39/39 ━━━━━━━━━━━━━━━━━━━━ 1s 28ms/step
39/39 ━━━━━━━━━━━━━━━━━━━━ 1s 11ms/step
39/39 ━━━━━━━━━━━━━━━━━━━━ 1s 15ms/step
39/39 ━━━━━━━━━━━━━━━━━━━━ 1s 16ms/step
40/40 ━━━━━━━━━━━━━━━━━━━━ 1s 20ms/step
40/40 ━━━━━━━━━━━━━━━━━━━━ 0s 8ms/step
40/40 ━━━━━━━━━━━━━━━━━━━━ 1s 16ms/step
40/40 ━━━━━━━━━━━━━━━━━━━━ 0s 6ms/step
40/40 ━━━━━━━━━━━━━━━━━━━━ 0s 8ms/step
Symbol: CONSTI.HE
Directional accuracy: 77.97 %
Model architecture:
No description has been provided for this image
No description has been provided for this image
40/40 ━━━━━━━━━━━━━━━━━━━━ 1s 10ms/step
39/39 ━━━━━━━━━━━━━━━━━━━━ 1s 17ms/step
39/39 ━━━━━━━━━━━━━━━━━━━━ 1s 29ms/step
40/40 ━━━━━━━━━━━━━━━━━━━━ 1s 19ms/step
40/40 ━━━━━━━━━━━━━━━━━━━━ 1s 13ms/step
39/39 ━━━━━━━━━━━━━━━━━━━━ 2s 39ms/step
40/40 ━━━━━━━━━━━━━━━━━━━━ 1s 26ms/step
40/40 ━━━━━━━━━━━━━━━━━━━━ 1s 13ms/step
40/40 ━━━━━━━━━━━━━━━━━━━━ 1s 13ms/step
40/40 ━━━━━━━━━━━━━━━━━━━━ 0s 9ms/step
40/40 ━━━━━━━━━━━━━━━━━━━━ 0s 7ms/step
40/40 ━━━━━━━━━━━━━━━━━━━━ 0s 10ms/step
38/38 ━━━━━━━━━━━━━━━━━━━━ 3s 70ms/step
40/40 ━━━━━━━━━━━━━━━━━━━━ 1s 13ms/step
39/39 ━━━━━━━━━━━━━━━━━━━━ 1s 12ms/step
40/40 ━━━━━━━━━━━━━━━━━━━━ 1s 12ms/step
39/39 ━━━━━━━━━━━━━━━━━━━━ 1s 29ms/step
39/39 ━━━━━━━━━━━━━━━━━━━━ 1s 11ms/step
40/40 ━━━━━━━━━━━━━━━━━━━━ 1s 23ms/step
39/39 ━━━━━━━━━━━━━━━━━━━━ 1s 26ms/step
40/40 ━━━━━━━━━━━━━━━━━━━━ 1s 14ms/step
Symbol: GRK.HE
Directional accuracy: 69.39 %
Model architecture:
No description has been provided for this image
No description has been provided for this image
40/40 ━━━━━━━━━━━━━━━━━━━━ 0s 8ms/step
40/40 ━━━━━━━━━━━━━━━━━━━━ 0s 8ms/step
40/40 ━━━━━━━━━━━━━━━━━━━━ 1s 16ms/step
40/40 ━━━━━━━━━━━━━━━━━━━━ 0s 8ms/step
40/40 ━━━━━━━━━━━━━━━━━━━━ 1s 16ms/step
40/40 ━━━━━━━━━━━━━━━━━━━━ 1s 26ms/step
39/39 ━━━━━━━━━━━━━━━━━━━━ 1s 14ms/step
40/40 ━━━━━━━━━━━━━━━━━━━━ 0s 6ms/step
40/40 ━━━━━━━━━━━━━━━━━━━━ 1s 19ms/step
39/39 ━━━━━━━━━━━━━━━━━━━━ 1s 16ms/step
40/40 ━━━━━━━━━━━━━━━━━━━━ 1s 15ms/step
40/40 ━━━━━━━━━━━━━━━━━━━━ 1s 23ms/step
40/40 ━━━━━━━━━━━━━━━━━━━━ 1s 9ms/step
39/39 ━━━━━━━━━━━━━━━━━━━━ 1s 30ms/step
39/39 ━━━━━━━━━━━━━━━━━━━━ 1s 11ms/step
39/39 ━━━━━━━━━━━━━━━━━━━━ 1s 19ms/step
40/40 ━━━━━━━━━━━━━━━━━━━━ 1s 15ms/step
39/39 ━━━━━━━━━━━━━━━━━━━━ 1s 25ms/step
39/39 ━━━━━━━━━━━━━━━━━━━━ 1s 34ms/step
40/40 ━━━━━━━━━━━━━━━━━━━━ 0s 8ms/step
40/40 ━━━━━━━━━━━━━━━━━━━━ 1s 14ms/step
40/40 ━━━━━━━━━━━━━━━━━━━━ 1s 14ms/step
40/40 ━━━━━━━━━━━━━━━━━━━━ 1s 12ms/step
40/40 ━━━━━━━━━━━━━━━━━━━━ 1s 18ms/step
39/39 ━━━━━━━━━━━━━━━━━━━━ 1s 20ms/step
39/39 ━━━━━━━━━━━━━━━━━━━━ 1s 19ms/step
40/40 ━━━━━━━━━━━━━━━━━━━━ 0s 8ms/step
40/40 ━━━━━━━━━━━━━━━━━━━━ 1s 13ms/step
40/40 ━━━━━━━━━━━━━━━━━━━━ 1s 15ms/step
40/40 ━━━━━━━━━━━━━━━━━━━━ 1s 14ms/step
40/40 ━━━━━━━━━━━━━━━━━━━━ 1s 13ms/step
39/39 ━━━━━━━━━━━━━━━━━━━━ 1s 14ms/step
40/40 ━━━━━━━━━━━━━━━━━━━━ 1s 11ms/step
40/40 ━━━━━━━━━━━━━━━━━━━━ 0s 7ms/step
40/40 ━━━━━━━━━━━━━━━━━━━━ 0s 7ms/step
39/39 ━━━━━━━━━━━━━━━━━━━━ 1s 15ms/step
39/39 ━━━━━━━━━━━━━━━━━━━━ 1s 9ms/step
40/40 ━━━━━━━━━━━━━━━━━━━━ 1s 23ms/step
Symbol: NOHO.HE
Directional accuracy: 73.79 %
Model architecture:
No description has been provided for this image
No description has been provided for this image
39/39 ━━━━━━━━━━━━━━━━━━━━ 1s 26ms/step
40/40 ━━━━━━━━━━━━━━━━━━━━ 0s 7ms/step
39/39 ━━━━━━━━━━━━━━━━━━━━ 1s 10ms/step
40/40 ━━━━━━━━━━━━━━━━━━━━ 1s 12ms/step
40/40 ━━━━━━━━━━━━━━━━━━━━ 1s 13ms/step
40/40 ━━━━━━━━━━━━━━━━━━━━ 1s 13ms/step
40/40 ━━━━━━━━━━━━━━━━━━━━ 1s 11ms/step
39/39 ━━━━━━━━━━━━━━━━━━━━ 1s 26ms/step
40/40 ━━━━━━━━━━━━━━━━━━━━ 0s 6ms/step
40/40 ━━━━━━━━━━━━━━━━━━━━ 1s 20ms/step
40/40 ━━━━━━━━━━━━━━━━━━━━ 0s 8ms/step
40/40 ━━━━━━━━━━━━━━━━━━━━ 0s 9ms/step
40/40 ━━━━━━━━━━━━━━━━━━━━ 0s 10ms/step
39/39 ━━━━━━━━━━━━━━━━━━━━ 1s 15ms/step
40/40 ━━━━━━━━━━━━━━━━━━━━ 0s 9ms/step
40/40 ━━━━━━━━━━━━━━━━━━━━ 1s 10ms/step
40/40 ━━━━━━━━━━━━━━━━━━━━ 1s 11ms/step
38/38 ━━━━━━━━━━━━━━━━━━━━ 1s 31ms/step
39/39 ━━━━━━━━━━━━━━━━━━━━ 3s 71ms/step
40/40 ━━━━━━━━━━━━━━━━━━━━ 1s 10ms/step
39/39 ━━━━━━━━━━━━━━━━━━━━ 1s 28ms/step
40/40 ━━━━━━━━━━━━━━━━━━━━ 1s 13ms/step
40/40 ━━━━━━━━━━━━━━━━━━━━ 1s 10ms/step
39/39 ━━━━━━━━━━━━━━━━━━━━ 1s 18ms/step
40/40 ━━━━━━━━━━━━━━━━━━━━ 1s 11ms/step
39/39 ━━━━━━━━━━━━━━━━━━━━ 1s 32ms/step
40/40 ━━━━━━━━━━━━━━━━━━━━ 1s 10ms/step
40/40 ━━━━━━━━━━━━━━━━━━━━ 1s 15ms/step
39/39 ━━━━━━━━━━━━━━━━━━━━ 1s 29ms/step
40/40 ━━━━━━━━━━━━━━━━━━━━ 1s 19ms/step
40/40 ━━━━━━━━━━━━━━━━━━━━ 0s 6ms/step
40/40 ━━━━━━━━━━━━━━━━━━━━ 0s 9ms/step
40/40 ━━━━━━━━━━━━━━━━━━━━ 0s 9ms/step
40/40 ━━━━━━━━━━━━━━━━━━━━ 0s 7ms/step
Symbol: SOSI1.HE
Directional accuracy: 61.86 %
Model architecture:
No description has been provided for this image
No description has been provided for this image
40/40 ━━━━━━━━━━━━━━━━━━━━ 0s 8ms/step
40/40 ━━━━━━━━━━━━━━━━━━━━ 0s 8ms/step
39/39 ━━━━━━━━━━━━━━━━━━━━ 3s 63ms/step
40/40 ━━━━━━━━━━━━━━━━━━━━ 1s 24ms/step
39/39 ━━━━━━━━━━━━━━━━━━━━ 1s 21ms/step
39/39 ━━━━━━━━━━━━━━━━━━━━ 3s 63ms/step
39/39 ━━━━━━━━━━━━━━━━━━━━ 1s 31ms/step
40/40 ━━━━━━━━━━━━━━━━━━━━ 2s 32ms/step
40/40 ━━━━━━━━━━━━━━━━━━━━ 1s 13ms/step
40/40 ━━━━━━━━━━━━━━━━━━━━ 1s 11ms/step
40/40 ━━━━━━━━━━━━━━━━━━━━ 0s 6ms/step
40/40 ━━━━━━━━━━━━━━━━━━━━ 0s 7ms/step
40/40 ━━━━━━━━━━━━━━━━━━━━ 1s 12ms/step
40/40 ━━━━━━━━━━━━━━━━━━━━ 0s 10ms/step
40/40 ━━━━━━━━━━━━━━━━━━━━ 0s 6ms/step
40/40 ━━━━━━━━━━━━━━━━━━━━ 0s 7ms/step
40/40 ━━━━━━━━━━━━━━━━━━━━ 1s 18ms/step
40/40 ━━━━━━━━━━━━━━━━━━━━ 1s 16ms/step
40/40 ━━━━━━━━━━━━━━━━━━━━ 0s 9ms/step
39/39 ━━━━━━━━━━━━━━━━━━━━ 1s 28ms/step
40/40 ━━━━━━━━━━━━━━━━━━━━ 1s 29ms/step
40/40 ━━━━━━━━━━━━━━━━━━━━ 1s 13ms/step
40/40 ━━━━━━━━━━━━━━━━━━━━ 0s 7ms/step
40/40 ━━━━━━━━━━━━━━━━━━━━ 0s 6ms/step
40/40 ━━━━━━━━━━━━━━━━━━━━ 0s 9ms/step
40/40 ━━━━━━━━━━━━━━━━━━━━ 1s 13ms/step
39/39 ━━━━━━━━━━━━━━━━━━━━ 1s 12ms/step
40/40 ━━━━━━━━━━━━━━━━━━━━ 1s 11ms/step
40/40 ━━━━━━━━━━━━━━━━━━━━ 2s 45ms/step
40/40 ━━━━━━━━━━━━━━━━━━━━ 0s 7ms/step
40/40 ━━━━━━━━━━━━━━━━━━━━ 0s 9ms/step
In [114]:
df_forecast=df_forecast.dropna().sort_index()
print(df_forecast.shape)
df_forecast.to_csv("forecasts.csv")
(1208, 140)
In [ ]: