Simple LSTM Model
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, Sequential

# Disable logging
tf.keras.utils.disable_interactive_logging()

window_size = 7

# Create train and test data for LSTM
lstm_train_features = np.array(train_features)
lstm_train_targets = np.array(train_targets)

lstm_test_features = np.array(test_features)
lstm_test_targets = np.array(test_targets)

# Reshape train and test features to the 3-D shape (samples, timesteps, features) expected by RNNs
lstm_train_features = lstm_train_features.reshape((lstm_train_features.shape[0], 1, lstm_train_features.shape[1]))
lstm_test_features = lstm_test_features.reshape((lstm_test_features.shape[0], 1, lstm_test_features.shape[1]))

# Implement LSTM
lstm_model = Sequential()
lstm_model.add(layers.LSTM(50, activation="relu"))
lstm_model.add(layers.Dense(1))
lstm_model.compile(loss="mape", optimizer="Adam")

# Fit and Forecast
lstm_model.fit(lstm_train_features, lstm_train_targets, batch_size=1, epochs=5, verbose=0)
lstm_forecast = lstm_model.predict(lstm_test_features)
1 Introduction
This post is the continuation of the book sales forecasting case. In this second part we will create deep learning models and perform hyperparameter tuning according to the business case assumed in Part 1.
2 Deep Learning
Now we will create the forecasts with deep learning methods: LSTM and CNN. For these models we have to convert the feature and target DataFrames to NumPy arrays and reshape the features into the 3-dimensional shape (samples, timesteps, features). After creating a base model for each and comparing them visually by plotting the test targets together with the forecasts, we will create our custom model-building function in the next section and do hyperparameter tuning.
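As a minimal illustration of that reshape (using a made-up 2-D feature array rather than the actual data):
import numpy as np

dummy_features = np.random.rand(365, 10)  # (samples, features)
dummy_features_3d = dummy_features.reshape((dummy_features.shape[0], 1, dummy_features.shape[1]))
print(dummy_features_3d.shape)  # (365, 1, 10) -> (samples, timesteps, features), with a single timestep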
2.1 LSTM
Firstly, we will create a simple network.
Metal device set to: Apple M1
WARNING:tensorflow:Layer lstm will not use cuDNN kernels since it doesn't meet the criteria. It will use a generic GPU kernel as fallback when running on GPU.
For easier comparison we will compute the previous benchmarks again.
LSTM Benchmark Scores
print(f"MAPE for LSTM model is: {mean_absolute_percentage_error(lstm_test_targets, lstm_forecast)}")
print(f"R2 Score for LSTM model is: {r2_score(lstm_test_targets, lstm_forecast)}")
MAPE for LSTM model is: 0.11684010922908783
R2 Score for LSTM model is: 0.24530407619175054
Now let’s create a simple CNN network and plot its forecasts.
2.2 CNN
Simple CNN Model
# Disable logging
tf.keras.utils.disable_interactive_logging()

# Create train and test data for CNN
cnn_train_features = np.array(train_features)
cnn_train_targets = np.array(train_targets)

cnn_test_features = np.array(test_features)
cnn_test_targets = np.array(test_targets)

# Reshape train and test features to the 3-D shape (samples, timesteps, features) expected by the network
cnn_train_features = cnn_train_features.reshape((cnn_train_features.shape[0], 1, cnn_train_features.shape[1]))
cnn_test_features = cnn_test_features.reshape((cnn_test_features.shape[0], 1, cnn_test_features.shape[1]))

# Implement CNN
cnn_model = Sequential()
cnn_model.add(layers.Conv1D(50, 1, activation="relu"))
cnn_model.add(layers.Flatten())
cnn_model.add(layers.Dense(1))
cnn_model.compile(loss="mape", optimizer="Adam")

# Fit and Forecast
cnn_model.fit(cnn_train_features, cnn_train_targets, batch_size=1, epochs=5, verbose=0)
cnn_forecast = cnn_model.predict(cnn_test_features)

# Plot both test data and forecast from the model to compare them visually.
plot_forecasts(cnn_test_targets, cnn_forecast, title="Simple CNN Model Forecasts")
The plots show comparable performance for the simple LSTM and CNN models. Let's quantify the comparison.
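The CNN scores below are computed the same way as the LSTM benchmarks above; a sketch mirroring those print statements (the original print code is not shown in the post):
print(f"MAPE for CNN model is: {mean_absolute_percentage_error(cnn_test_targets, cnn_forecast)}")
print(f"R2 Score for CNN model is: {r2_score(cnn_test_targets, cnn_forecast)}")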
MAPE for CNN model is: 0.10994397103786469
R2 Score for CNN model is: 0.3166595057021674
3 RNN Model Tuning
In this deep learning tuning part, we will combine the tuning of the LSTM and CNN models, along with several other network types, under the single category of an RNN model. The code will compare pure LSTM, pure CNN, stacked LSTM, bidirectional LSTM, and CNN-LSTM models and select the best-performing one.
RNN Model Tuning Code
import optuna
import warnings
from tensorflow.keras.layers import Bidirectional

warnings.filterwarnings("ignore")

# Disable logging
tf.keras.utils.disable_interactive_logging()

# Define model creation function
def create_rnn_model(trial):
    # Define trial variables
    model_type = trial.suggest_categorical(
        "model_type",
        ["vanilla_lstm", "stacked_lstm", "bidirectional_lstm", "cnn", "cnn_lstm"],
    )
    dropout = trial.suggest_categorical("dropout", [True, False])

    if model_type == "vanilla_lstm":
        # Trial variables
        units = trial.suggest_int("units", 50, 200)
        dense_layers = trial.suggest_int("dense_layers", 0, 2)
        activation = trial.suggest_categorical("activation", ["relu", "tanh"])
        optimizer = trial.suggest_categorical("optimizer", ["Adam", "RMSprop"])

        # Define model with trial variables
        rnn_model = Sequential()
        rnn_model.add(layers.LSTM(units, activation=activation))
        for layer in range(dense_layers):
            rnn_model.add(layers.Dense(units, activation="relu"))
        if dropout:
            rnn_model.add(layers.Dropout(rate=0.25))
        rnn_model.add(layers.Dense(1))
        rnn_model.compile(loss="mae", optimizer=optimizer)

    elif model_type == "stacked_lstm":
        # Trial variables
        units = trial.suggest_int("units", 50, 200)
        dense_layers = trial.suggest_int("dense_layers", 0, 2)
        lstm_layers = trial.suggest_int("lstm_layers", 2, 3)
        activation = trial.suggest_categorical("activation", ["relu", "tanh"])
        optimizer = trial.suggest_categorical("optimizer", ["Adam", "RMSprop"])

        # Define model with trial variables
        rnn_model = Sequential()
        for layer in range(lstm_layers):
            if layer == lstm_layers - 1:
                rnn_model.add(layers.LSTM(units, activation=activation))
            else:
                rnn_model.add(layers.LSTM(units, activation=activation, return_sequences=True))
        for layer in range(dense_layers):
            rnn_model.add(layers.Dense(units, activation="relu"))
        if dropout:
            rnn_model.add(layers.Dropout(rate=0.25))
        rnn_model.add(layers.Dense(1))
        rnn_model.compile(loss="mae", optimizer=optimizer)

    elif model_type == "bidirectional_lstm":
        # Trial variables
        units = trial.suggest_int("units", 50, 200)
        dense_layers = trial.suggest_int("dense_layers", 0, 2)
        activation = trial.suggest_categorical("activation", ["relu", "tanh"])
        optimizer = trial.suggest_categorical("optimizer", ["Adam", "RMSprop"])

        # Define model with trial variables
        rnn_model = Sequential()
        rnn_model.add(Bidirectional(layers.LSTM(units, activation=activation)))
        for layer in range(dense_layers):
            rnn_model.add(layers.Dense(units, activation="relu"))
        if dropout:
            rnn_model.add(layers.Dropout(rate=0.25))
        rnn_model.add(layers.Dense(1))
        rnn_model.compile(loss="mae", optimizer=optimizer)

    elif model_type == "cnn":
        # Trial variables
        units = trial.suggest_int("units", 50, 200)
        dense_layers = trial.suggest_int("dense_layers", 0, 2)
        activation = trial.suggest_categorical("activation", ["relu", "tanh"])
        optimizer = trial.suggest_categorical("optimizer", ["Adam", "RMSprop"])

        # Define model with trial variables
        rnn_model = Sequential()
        rnn_model.add(layers.Conv1D(units, 1, activation=activation))
        for layer in range(dense_layers):
            rnn_model.add(layers.Dense(units, activation="relu"))
        rnn_model.add(layers.Flatten())
        if dropout:
            rnn_model.add(layers.Dropout(rate=0.25))
        rnn_model.add(layers.Dense(1))
        rnn_model.compile(loss="mae", optimizer=optimizer)

    elif model_type == "cnn_lstm":
        # Trial variables
        units = trial.suggest_int("units", 50, 200)
        dense_layers = trial.suggest_int("dense_layers", 0, 2)
        activation = trial.suggest_categorical("activation", ["relu", "tanh"])
        optimizer = trial.suggest_categorical("optimizer", ["Adam", "RMSprop"])

        # Define model with trial variables
        rnn_model = Sequential()
        rnn_model.add(layers.Conv1D(units, 1, activation=activation))
        for layer in range(dense_layers):
            rnn_model.add(layers.Dense(units, activation="relu"))
        rnn_model.add(layers.LSTM(units, activation=activation))
        for layer in range(dense_layers):
            rnn_model.add(layers.Dense(units, activation="relu"))
        if dropout:
            rnn_model.add(layers.Dropout(rate=0.25))
        rnn_model.add(layers.Dense(1))
        rnn_model.compile(loss="mae", optimizer=optimizer)

    return rnn_model

# Define Optuna Objective
def rnn_objective(trial):
    import tensorflow as tf

    # Define training variables
    batch_size = trial.suggest_int("batch_size", 1, 100, log=True)
    epochs = trial.suggest_int("epochs", 10, 100)

    # Call model
    rnn_model = create_rnn_model(trial)

    # Fit the model
    rnn_model.fit(lstm_train_features, lstm_train_targets, batch_size, epochs, verbose=0)

    # Forecast for the test data
    rnn_forecast = rnn_model.predict(lstm_test_features)

    # Create a loop to calculate cumulative cost of the forecast
    storage = 0
    cumulative_cost = 0
    book_price = 20
    monthly_storage_cost = 100
    for step in range(len(lstm_test_targets)):
        # Get the cost and difference for storage for the current step
        cost, to_storage = cost_function(
            storage,
            lstm_test_targets[step],
            rnn_forecast[step][0],
            book_price,
            monthly_storage_cost)
        # Add cost to cumulative cost and storage difference to storage
        cumulative_cost += int(cost)
        storage += int(to_storage)

    total_cost = cumulative_cost + storage * book_price
    return total_cost

# Create Optuna Study and Minimize total_cost
rnn_study = optuna.create_study(direction="minimize", sampler=optuna.samplers.QMCSampler())
rnn_study.optimize(rnn_objective, n_trials=20)
WARNING:tensorflow:Layer lstm_2 will not use cuDNN kernels since it doesn't meet the criteria. It will use a generic GPU kernel as fallback when running on GPU. (This warning repeats for each LSTM layer built across the trials.)
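Before rebuilding the winner, it is worth checking what the study actually selected; Optuna exposes this directly on the study object (a quick inspection snippet, not from the original post):
print(rnn_study.best_value)   # lowest total cost found across the 20 trials
print(rnn_study.best_params)  # the winning model_type and its hyperparameters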
RNN Model with Best Parameters
# Disable logging
tf.keras.utils.disable_interactive_logging()

# Define best model parameters
best_rnn_model_parameters = rnn_study.best_params
model_type = best_rnn_model_parameters["model_type"]
units = best_rnn_model_parameters["units"]
activation = best_rnn_model_parameters["activation"]
dense_layers = best_rnn_model_parameters["dense_layers"]
lstm_layers = best_rnn_model_parameters["lstm_layers"] if "lstm_layers" in best_rnn_model_parameters.keys() else 0
optimizer = best_rnn_model_parameters["optimizer"]
batch_size = best_rnn_model_parameters["batch_size"]
epochs = best_rnn_model_parameters["epochs"]
dropout = best_rnn_model_parameters["dropout"]

test_subset = subset[730:1095]
val_subset = subset[1095:1461]

test_targets = np.array(test_subset["num_sold"])
test_features = np.array(test_subset.drop(columns=["num_sold"])).reshape((test_subset.shape[0], 1, test_subset.shape[1] - 1))
val_targets = np.array(val_subset["num_sold"])
val_features = np.array(val_subset.drop(columns=["num_sold"])).reshape((val_subset.shape[0], 1, val_subset.shape[1] - 1))

if model_type == "vanilla_lstm":
    # Define model with best parameters
    rnn_model = Sequential()
    rnn_model.add(layers.LSTM(units, activation=activation))
    for layer in range(dense_layers):
        rnn_model.add(layers.Dense(units, activation="relu"))
    if dropout:
        rnn_model.add(layers.Dropout(rate=0.25))
    rnn_model.add(layers.Dense(1))
    rnn_model.compile(loss="mae", optimizer=optimizer)

elif model_type == "stacked_lstm":
    # Define model with best parameters
    rnn_model = Sequential()
    for layer in range(lstm_layers):
        if layer == lstm_layers - 1:
            rnn_model.add(layers.LSTM(units, activation=activation))
        else:
            rnn_model.add(layers.LSTM(units, activation=activation, return_sequences=True))
    for layer in range(dense_layers):
        rnn_model.add(layers.Dense(units, activation="relu"))
    if dropout:
        rnn_model.add(layers.Dropout(rate=0.25))
    rnn_model.add(layers.Dense(1))
    rnn_model.compile(loss="mae", optimizer=optimizer)

elif model_type == "bidirectional_lstm":
    # Define model with best parameters
    rnn_model = Sequential()
    rnn_model.add(Bidirectional(layers.LSTM(units, activation=activation)))
    for layer in range(dense_layers):
        rnn_model.add(layers.Dense(units, activation="relu"))
    if dropout:
        rnn_model.add(layers.Dropout(rate=0.25))
    rnn_model.add(layers.Dense(1))
    rnn_model.compile(loss="mae", optimizer=optimizer)

elif model_type == "cnn":
    # Define model with best parameters
    rnn_model = Sequential()
    rnn_model.add(layers.Conv1D(units, 1, activation=activation))
    for layer in range(dense_layers):
        rnn_model.add(layers.Dense(units, activation="relu"))
    rnn_model.add(layers.Flatten())
    if dropout:
        rnn_model.add(layers.Dropout(rate=0.25))
    rnn_model.add(layers.Dense(1))
    rnn_model.compile(loss="mae", optimizer=optimizer)

elif model_type == "cnn_lstm":
    # Define model with best parameters
    rnn_model = Sequential()
    rnn_model.add(layers.Conv1D(units, 1, activation=activation))
    for layer in range(dense_layers):
        rnn_model.add(layers.Dense(units, activation="relu"))
    rnn_model.add(layers.LSTM(units, activation=activation))
    for layer in range(dense_layers):
        rnn_model.add(layers.Dense(units, activation="relu"))
    if dropout:
        rnn_model.add(layers.Dropout(rate=0.25))
    rnn_model.add(layers.Dense(1))
    rnn_model.compile(loss="mae", optimizer=optimizer)

# Fit the model
rnn_model.fit(test_features,
              test_targets,
              batch_size,
              epochs,
              verbose=1,
              validation_data=(val_features, val_targets),
              callbacks=[tf.keras.callbacks.EarlyStopping(patience=7, min_delta=0.1)])

# Forecast for the test data
rnn_forecast = rnn_model.predict(val_features)

# Calculate total cost
storage = 0
cumulative_cost = 0
book_price = 20
monthly_storage_cost = 100
for step in range(len(val_targets)):
    # Get the cost and difference for storage for the current step
    cost, to_storage = cost_function(
        storage,
        val_targets[step],
        rnn_forecast[step][0],
        book_price,
        monthly_storage_cost)
    # Add cost to cumulative cost and storage difference to storage
    cumulative_cost += int(cost)
    storage += int(to_storage)

rnn_model_total_cost = cumulative_cost + storage * book_price

# Plot both test data and forecast from the model to compare them visually.
plot_forecasts(val_targets, rnn_forecast, title="Tuned RNN Model Forecasts")
Model Type of the Tuned RNN Model: bidirectional_lstm
MAPE for Tuned RNN Model is: 0.16543614864349365
R2 Score for Tuned RNN Model is: -0.19266240917966937
Total Cost for Tuned RNN Model is: €84,989.00
4 Results and Conclusion
We have tuned all models and calculated the total costs in terms of our cost function, and the winner is XGBoost! XGBoost's performance is not surprising given that it is usually the winning model in competitions, but STL-ARIMA also performed very successfully. The RNN models were, for me, somewhat disappointing; however, it should be stated that we didn't fully utilize the LSTM cells' memory because we only used one timestep in the input. Increasing the number of timesteps could potentially improve their performance. ARIMAX was the worst performer even with tuning. This could be because the feature matrix we created couldn't fully capture seasonality and trend.
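As a rough sketch of that idea (not used in this post), the flat feature matrix could be turned into sliding windows of several timesteps so the LSTM actually sees a sequence; make_windows below is a hypothetical helper:
import numpy as np

def make_windows(features_2d, targets, window_size=7):
    # Stack window_size consecutive rows so each sample has shape (window_size, n_features)
    X, y = [], []
    for i in range(window_size, len(features_2d)):
        X.append(features_2d[i - window_size:i])
        y.append(targets[i])
    return np.array(X), np.array(y)

# e.g. windowed_features.shape -> (n_samples - 7, 7, n_features)
# windowed_features, windowed_targets = make_windows(np.array(train_features), np.array(train_targets))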
Even though we have added calendar features, they might not be enough to capture seasonality and trend. Therefore, a decomposition approach, where a model similar to STL decomposes the time series into trend, seasonality, and residuals, and a separate regression model is then trained only on the residuals, could potentially be more accurate; but that's all for this post.
Thank you for reading this post and see you on the next one!