I am trying to build a custom LSTM cell. I found many snippets online, but none of them produces the same results as torch.nn.LSTM. For testing, I fix the input size and the number of layers to 1. Is there a known way to customise the weights inside an LSTM cell, or a Pythonic way to write one, even if it is not efficient? Here is what I tried.
First, the LSTM cell:
import torch
import torch.nn as nn

class LSTMCell(nn.Module):
    def __init__(self, hidden_size, input_size=1):
        super().__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.weight_ih = nn.Parameter(torch.randn(4 * hidden_size, input_size))
        self.weight_hh = nn.Parameter(torch.randn(4 * hidden_size, hidden_size))
        self.bias = nn.Parameter(torch.zeros(4 * hidden_size))

    def forward(self, x, hx=None, cx=None):
        batch_size, seq_len, _ = x.size()
        # Default to zero states, as nn.LSTM does when no state is passed
        if hx is None:
            hx = x.new_zeros(batch_size, self.hidden_size)
        if cx is None:
            cx = x.new_zeros(batch_size, self.hidden_size)
        outputs = []
        for t in range(seq_len):
            x_t = x[:, t, :]  # extract the t-th time step
            # Compute all four gates in one affine map (PyTorch gate order: i, f, g, o)
            gates = (x_t @ self.weight_ih.T) + (hx @ self.weight_hh.T) + self.bias
            i_gate, f_gate, g_gate, o_gate = gates.chunk(4, dim=1)
            # Activation functions for the gates
            input_gate = torch.sigmoid(i_gate)
            forget_gate = torch.sigmoid(f_gate)
            cell_gate = torch.tanh(g_gate)
            output_gate = torch.sigmoid(o_gate)
            # Update cell state and hidden state
            cx = (forget_gate * cx) + (input_gate * cell_gate)
            hx = output_gate * torch.tanh(cx)
            outputs.append(hx.unsqueeze(1))  # add the time dimension back
        # Concatenate outputs along the time dimension
        outputs = torch.cat(outputs, dim=1)
        return outputs, (hx, cx)
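For the equivalence test, one known way to customise the weights is to copy nn.LSTM's layer-0 parameters into the cell. Note that nn.LSTM stores two bias vectors (bias_ih_l0 and bias_hh_l0); since they are simply added inside the gate computation, their sum plays the role of the single bias above. A minimal sketch of such a check, assuming the fixed cell above (which zero-initializes the states when none are passed):

torch.manual_seed(0)
ref = nn.LSTM(input_size=1, hidden_size=8, num_layers=1, batch_first=True)
cell = LSTMCell(hidden_size=8, input_size=1)
with torch.no_grad():
    cell.weight_ih.copy_(ref.weight_ih_l0)            # shape (4*hidden, input)
    cell.weight_hh.copy_(ref.weight_hh_l0)            # shape (4*hidden, hidden)
    cell.bias.copy_(ref.bias_ih_l0 + ref.bias_hh_l0)  # nn.LSTM splits the bias in two

x = torch.randn(4, 5, 1)  # (batch, seq_len, input_size)
out_ref, _ = ref(x)
out_custom, _ = cell(x)
print(torch.allclose(out_ref, out_custom, atol=1e-6))  # expected: True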
Second, here is how it is combined with a dense layer:
class LSTMModel(nn.Module):
    def __init__(self, hidden_size, output_size, num_layers=1):
        super().__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.lstm = LSTMCell(hidden_size=self.hidden_size)
        # nn.LSTM(1, hidden_size, 1, batch_first=True, bias=True)  # uncomment to compare with nn.LSTM
        self.fc = nn.Linear(self.hidden_size, self.output_size, bias=False)
        # Initialize tracked states
        self.tracked_h = None
        self.tracked_c = None

    def reset_states(self, batch_size):
        """Reset the hidden and cell states to zero."""
        self.tracked_c = torch.zeros(batch_size, self.hidden_size)
        self.tracked_h = torch.zeros(batch_size, self.hidden_size)

    def adjust_states_for_batch_size(self, batch_size):
        """Adjust the tracked states if the batch size changes."""
        if self.tracked_h is not None:
            if self.tracked_h.size(0) < batch_size:    # batch size increased: reset states
                self.reset_states(batch_size)
            elif self.tracked_h.size(0) > batch_size:  # batch size decreased: slice states
                self.tracked_h = self.tracked_h[:batch_size, :]
                self.tracked_c = self.tracked_c[:batch_size, :]
        else:
            self.reset_states(batch_size)

    def forward(self, x):
        batch_size = x.size(0)
        self.adjust_states_for_batch_size(batch_size)
        out, (hn, cn) = self.lstm(x, hx=self.tracked_h, cx=self.tracked_c)
        # Detach so the carried-over state does not backpropagate across batches
        self.tracked_h = hn.detach()
        self.tracked_c = cn.detach()
        output = torch.tanh(self.fc(hn))
        return output
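A quick usage check of the combined model (hypothetical shapes: batch of 4, sequence length 5, one input feature):

model = LSTMModel(hidden_size=8, output_size=1)
x = torch.randn(4, 5, 1)
y = model(x)           # (4, 1): tanh of the dense layer applied to the final hidden state
model.reset_states(4)  # e.g. between epochs, to drop the carried-over state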
Alternatively, use nn.LSTMCell, write an LSTM class that loops over the sequence length in Python, and torch.jit-compile it for performance. Note that PyTorch's nn.LSTMCell only processes a single time step at a time, in contrast to your implementation.
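A minimal sketch of that suggestion (the name ScriptedLSTM and the sizes are illustrative, not from the original post):

from typing import List

import torch
import torch.nn as nn

class ScriptedLSTM(nn.Module):
    def __init__(self, input_size: int, hidden_size: int):
        super().__init__()
        self.hidden_size = hidden_size
        self.cell = nn.LSTMCell(input_size, hidden_size)

    def forward(self, x: torch.Tensor):
        # x: (batch, seq_len, input_size)
        h = torch.zeros([x.size(0), self.hidden_size], dtype=x.dtype, device=x.device)
        c = torch.zeros([x.size(0), self.hidden_size], dtype=x.dtype, device=x.device)
        outputs: List[torch.Tensor] = []  # explicit type so TorchScript can compile the loop
        for t in range(x.size(1)):
            h, c = self.cell(x[:, t, :], (h, c))  # nn.LSTMCell handles one time step per call
            outputs.append(h)
        return torch.stack(outputs, dim=1), (h, c)

model = torch.jit.script(ScriptedLSTM(input_size=1, hidden_size=8))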