Deep Learning for Time Series
When classical methods struggle -- with complex nonlinear patterns, high-dimensional inputs, or very long sequences -- deep learning offers powerful alternatives.
Why Deep Learning for Time Series?
LSTM for Forecasting
LSTMs (Long Short-Term Memory) are the most popular RNN variant for time series. They solve the vanishing gradient problem with a gating mechanism:
Windowed Input Format
Time series data must be transformed into windows for LSTM input:
1import numpy as np
2
def create_sequences(data, seq_length, forecast_horizon=1):
    """
    Create windowed (X, y) pairs for supervised sequence-model training.

    Args:
        data: 1D or 2D array-like of time series data; rows are time steps.
            For 2D input, column 0 is treated as the target variable.
        seq_length: Number of past steps to use as input per window.
        forecast_horizon: Number of future steps to predict.

    Returns:
        X: array of shape (n_samples, seq_length, n_features)
        y: array of shape (n_samples, forecast_horizon), drawn from
            feature column 0.

    Raises:
        ValueError: if the series is too short to form a single window
            (previously this surfaced as a confusing "negative dimensions"
            error from np.zeros).
    """
    data = np.asarray(data)  # accept plain lists as well as ndarrays
    if data.ndim == 1:
        data = data.reshape(-1, 1)

    n_samples = len(data) - seq_length - forecast_horizon + 1
    if n_samples <= 0:
        raise ValueError(
            f"series length {len(data)} is too short for "
            f"seq_length={seq_length} + forecast_horizon={forecast_horizon}"
        )

    n_features = data.shape[1]
    X = np.zeros((n_samples, seq_length, n_features))
    y = np.zeros((n_samples, forecast_horizon))

    for i in range(n_samples):
        X[i] = data[i:i + seq_length]
        # Targets come from the first feature column only.
        y[i] = data[i + seq_length:i + seq_length + forecast_horizon, 0]

    return X, y
30
# Demo: build supervised windows from a noisy sine wave
np.random.seed(42)
t = np.linspace(0, 20, 200)
series = np.sin(t) + np.random.randn(200) * 0.1

X, y = create_sequences(series, seq_length=30, forecast_horizon=5)
print(f"Input shape: {X.shape}")   # (n, 30, 1)
print(f"Target shape: {y.shape}")  # (n, 5)
print(f"First window: [{X[0, 0, 0]:.3f}, ..., {X[0, -1, 0]:.3f}]")
print(f"First target: [{y[0, 0]:.3f}, ..., {y[0, -1]:.3f}]")

GRU (Gated Recurrent Unit)
GRU is a simplified LSTM with only two gates:
GRU has fewer parameters than LSTM and often performs comparably. Use GRU when:
Sequence-to-Sequence (Seq2Seq)
For multi-step forecasting, the encoder-decoder architecture is powerful:
1. Encoder: Processes the input sequence and compresses it into a context vector
2. Decoder: Takes the context vector and autoregressively generates the forecast
This decouples input length from output length, allowing flexible forecast horizons.
1import numpy as np
2
3# LSTM cell implementation (forward pass only)
4class LSTMCell:
5 """Single LSTM cell for understanding the mechanics."""
6
7 def __init__(self, input_dim, hidden_dim):
8 self.hidden_dim = hidden_dim
9 scale = np.sqrt(2.0 / (input_dim + hidden_dim))
10
11 # Forget gate
12 self.Wf = np.random.randn(input_dim + hidden_dim, hidden_dim) * scale
13 self.bf = np.zeros(hidden_dim)
14
15 # Input gate
16 self.Wi = np.random.randn(input_dim + hidden_dim, hidden_dim) * scale
17 self.bi = np.zeros(hidden_dim)
18
19 # Cell candidate
20 self.Wc = np.random.randn(input_dim + hidden_dim, hidden_dim) * scale
21 self.bc = np.zeros(hidden_dim)
22
23 # Output gate
24 self.Wo = np.random.randn(input_dim + hidden_dim, hidden_dim) * scale
25 self.bo = np.zeros(hidden_dim)
26
27 def forward(self, x, h_prev, c_prev):
28 """Single step forward pass."""
29 combined = np.concatenate([x, h_prev])
30
31 # Gates
32 f = self._sigmoid(combined @ self.Wf + self.bf) # Forget
33 i = self._sigmoid(combined @ self.Wi + self.bi) # Input
34 c_hat = np.tanh(combined @ self.Wc + self.bc) # Candidate
35 o = self._sigmoid(combined @ self.Wo + self.bo) # Output
36
37 # Update cell state and hidden state
38 c = f * c_prev + i * c_hat
39 h = o * np.tanh(c)
40
41 return h, c
42
43 def _sigmoid(self, x):
44 return 1 / (1 + np.exp(-np.clip(x, -500, 500)))
45
46
# Demo forward pass
cell = LSTMCell(input_dim=1, hidden_dim=16)
h, c = np.zeros(16), np.zeros(16)

# Feed a short 5-step sequence through the cell, one step at a time.
inputs = [0.5, 0.8, 1.2, 0.3, 0.9]
print("Processing sequence through LSTM:")
for t, x in enumerate(inputs):
    h, c = cell.forward(np.array([x]), h, c)
    print(f" t={t}: input={x}, h_norm={np.linalg.norm(h):.4f}, c_norm={np.linalg.norm(c):.4f}")

Temporal Convolutional Networks (TCN)
TCNs use 1D convolutions with causal padding (no future information leaks) and dilated convolutions (exponentially increasing receptive field).
Advantages over RNNs:
Architecture
WaveNet-Style Models
WaveNet (originally for audio) uses dilated causal convolutions and has been adapted for time series. It can model very long-range dependencies efficiently.

import numpy as np
2
class CausalConv1D:
    """Causal dilated 1D convolution: output at time t sees only inputs <= t.

    Left-pads by (kernel_size - 1) * dilation so the output length equals
    the input length and no future information leaks into the past.
    """

    def __init__(self, in_channels, out_channels, kernel_size, dilation=1):
        self.kernel_size = kernel_size
        self.dilation = dilation
        self.padding = (kernel_size - 1) * dilation  # Causal: pad only the left

        # He-style initialization based on fan-in.
        scale = np.sqrt(2.0 / (kernel_size * in_channels))
        self.weight = np.random.randn(out_channels, in_channels, kernel_size) * scale
        self.bias = np.zeros(out_channels)

    def forward(self, x):
        """
        Apply the causal dilated convolution.

        Args:
            x: array of shape (batch, in_channels, time)

        Returns:
            Array of shape (batch, out_channels, time).
        """
        batch, channels, time = x.shape

        # Pad the left side only (causal).
        x_padded = np.pad(x, ((0, 0), (0, 0), (self.padding, 0)))

        out = np.zeros((batch, self.weight.shape[0], time))

        # Tap k looks k * dilation steps into the past. Because padding ==
        # (kernel_size - 1) * dilation, every slice below is guaranteed in
        # bounds, so the whole time axis is handled with one vectorized
        # einsum per tap instead of a Python loop over every time step.
        for k in range(self.kernel_size):
            start = self.padding - k * self.dilation
            out += np.einsum(
                'bct,oc->bot',
                x_padded[:, :, start:start + time],
                self.weight[:, :, k],
            )

        out += self.bias.reshape(1, -1, 1)
        return out

    def receptive_field(self):
        """Number of input time steps that influence one output step."""
        return (self.kernel_size - 1) * self.dilation + 1
44
45
# Show how dilated convolutions expand the receptive field
print("TCN Receptive Field Growth:")
total_rf = 1
dilations = [1, 2, 4, 8, 16]
for layer, dilation in enumerate(dilations):
    conv = CausalConv1D(1, 1, kernel_size=3, dilation=dilation)
    rf = conv.receptive_field()
    # Each new layer extends the total field by its own field minus the
    # one step that overlaps the previous layer's output.
    total_rf += rf - 1
    print(f" Layer {layer}: dilation={dilation}, layer_rf={rf}, total_rf={total_rf}")

print(f"\nWith 5 layers of kernel_size=3: total receptive field = {total_rf} time steps")