I am trying to train my first LSTM regression model on global average temperature data. The temperature is available for every month since January 1st, 1850.
From what I've learned online, I feed 12 consecutive months into the LSTM and let it predict the next month, and I do this for all the sequences generated from the data (all data except the last 30 years).
At first I took only the last output value from the LSTM and passed it into the final linear layer, but I noticed that it does not converge very well. Then I passed all of the LSTM's output to the linear layers (so for every month I get a hidden-state vector: 12 x hidden_size), and it works much better.
So with the second solution I can't put in variable length sequences but I won't do this anyway - right?
What would be the best approach here?
import torch
import torch.nn as nn
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from torch.utils.data import DataLoader, Dataset
class LSTMDataset(Dataset):
    """Index-aligned pairs of input windows and target values.

    ``x`` and ``y`` are stored exactly as passed in; reshaping happens
    lazily, one item at a time, in ``__getitem__``.
    """

    def __init__(self, x, y):
        self.x, self.y = x, y

    def __len__(self):
        # The number of targets defines how many samples exist.
        return len(self.y)

    def __getitem__(self, idx):
        window = self.x[idx]
        target = self.y[idx]
        # The window becomes one row per element with a single column;
        # the target becomes a length-1 vector.
        features = window.reshape((-1, 1))
        return features, target.reshape(1)
class LSTMNet(nn.Module):
    """Single-layer LSTM followed by an MLP head over all timestep outputs.

    The head flattens the LSTM output of every timestep into one vector, so
    the model only accepts windows of exactly ``sequence_length`` steps.

    Args:
        hidden_size: Width of the LSTM hidden state.
        sequence_length: Number of timesteps in every input window.
    """

    def __init__(self, hidden_size: int = 24, sequence_length: int = 12):
        super().__init__()
        self.hidden_size = hidden_size
        self.sequence_length = sequence_length
        self.lstm = nn.LSTM(
            input_size=1,
            hidden_size=hidden_size,
            num_layers=1,
            batch_first=True,
        )
        # One hidden vector per timestep, concatenated by nn.Flatten.
        flat_features = hidden_size * sequence_length
        self.net = nn.Sequential(
            nn.Flatten(),
            nn.Linear(flat_features, flat_features),
            nn.ReLU(),
            nn.Linear(flat_features, 1),
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Map a batch of windows to one prediction per window.

        Args:
            x: Batch-first input with one feature per timestep, i.e.
                ``(batch, sequence_length, 1)``.

        Returns:
            Tensor of shape ``(batch, 1)``.
        """
        # Keep the output of every timestep (not just the last one); the
        # head consumes all of them after flattening.
        per_step_out, _ = self.lstm(x)
        return self.net(per_step_out)
# ---------------------------------------------------------------------------
# Data loading
# ---------------------------------------------------------------------------
df = pd.read_csv("globalTemperatures.csv")
# .copy() so the column assignment below writes to an independent frame.
df = df[["dt", "LandAverageTemperature"]].copy()
df["dt"] = pd.to_datetime(df["dt"], format="%Y-%m-%d")

forecastMonths = 12 * 30  # forecast 30 years
sequenceLength = 12  # 12 months are fed into LSTM one after another

# ---------------------------------------------------------------------------
# Sliding windows: `sequenceLength` consecutive months -> the following month.
# A window goes to the test split when its *target* month lies within the
# last `forecastMonths` rows, otherwise to the training split.
# ---------------------------------------------------------------------------
trainX = []
trainY = []
testX = []
testY = []

# Convert the column once instead of slicing the DataFrame on every iteration.
# NOTE(review): missing values in the CSV would propagate as NaN - confirm the
# column is complete for the rows used.
temperatures = df["LandAverageTemperature"].to_numpy(dtype=np.float32)
firstTestTarget = len(temperatures) - forecastMonths

for i in range(len(temperatures) - sequenceLength):
    targetIdx = i + sequenceLength
    x = temperatures[i:targetIdx]
    y = np.asarray(temperatures[targetIdx])  # 0-d array, reshaped by the dataset
    if targetIdx >= firstTestTarget:
        testX.append(x)
        testY.append(y)
    else:
        trainX.append(x)
        trainY.append(y)

trainingSet = LSTMDataset(trainX, trainY)
testSet = LSTMDataset(testX, testY)
training_loader = DataLoader(trainingSet, batch_size=1, shuffle=True)
# No shuffling: predictions must stay in chronological order for the plot.
test_loader = DataLoader(testSet, batch_size=1, shuffle=False)

# ---------------------------------------------------------------------------
# Training
# ---------------------------------------------------------------------------
model = LSTMNet()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
loss_fn = torch.nn.MSELoss()
accuracies = []  # not used below; kept in case later code refers to it
epochs = 2

for epoch in range(epochs):
    losses = []
    for inputs, labels in training_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = loss_fn(outputs, labels)
        loss.backward()
        optimizer.step()
        losses.append(loss.item())
    print(f"Epoch [{epoch + 1}/{epochs}] Loss: {np.mean( losses ):.2f}")

# ---------------------------------------------------------------------------
# Evaluation
# ---------------------------------------------------------------------------
predictedTemperatures = []
# Separate list so the reported test loss is not averaged together with the
# training losses of the final epoch.
testLosses = []
model.eval()
with torch.no_grad():  # no gradients needed for inference
    for inputs, labels in test_loader:
        output = model(inputs)
        # Score the current prediction, not the stale `outputs` tensor left
        # over from the training loop.
        testLosses.append(loss_fn(output, labels).item())
        predictedTemperatures.append(output.item())
print(f"Test Loss: {np.mean( testLosses ):.2f}")

# ---------------------------------------------------------------------------
# Plot
# ---------------------------------------------------------------------------
plt.figure(figsize=(18, 2))
plt.plot(df["dt"], df["LandAverageTemperature"], label="True Temperatures")
plt.plot(
    df["dt"].iloc[-forecastMonths:],
    predictedTemperatures,
    label="Predicted Temperatures",
)
plt.legend()  # the labels above are only shown once a legend is drawn
plt.savefig("temperatures.png")
The input is `batch_size x timesteps x features`. Then the output of the LSTM is `batch_size x timesteps x hidden_size`. The linear layer will be applied to all timesteps, it can be 1 or any other number. It does not matter.
Why use `x[:, -1, :]` when taking the whole `x` converges so much better?
If `x_full` is size `(bs, sl)`, you have `x = x_full[:, :-1]` and `y = x_full[:, 1:]`. When doing next month prediction, you wouldn't do any pooling. Each timestep output would go through a linear layer so you have one output per input timestep.