Upload 7 files

Browse files

Files changed (6) hide show

README.md +64 -3
config.json +10 -0
configuration_super_linear.py +20 -0
example_usage.py +46 -0
modeling_super_linear.py +339 -144
requirements.txt +9 -0

README.md CHANGED Viewed

@@ -1,3 +1,64 @@
----
-license: mit
----

+# SuperLinear: A Mixture of Experts Time Series Forecasting Model
+SuperLinear is a novel time series forecasting model that employs a Mixture of Experts (MoE) architecture to achieve superior performance across various forecasting tasks. The model routes inputs to the most relevant experts based on frequency-domain analysis using FFT-based gating networks.
+## Model Architecture
+The SuperLinear model consists of:
+- **Sparse Mixture of Experts (MoE)**: Routes inputs to the top-k most relevant experts
+- **FFT-based Gating Network**: Uses frequency domain analysis to determine expert routing
+- **Frequency-specific Experts**: Pre-trained experts specialized for different temporal patterns
+## Key Features
+- **Adaptive Expert Selection**: Dynamic routing based on input characteristics
+- **Frequency-aware Processing**: Leverages FFT analysis for intelligent expert selection
+- **Auto-regressive Capabilities**: Supports forecast horizons longer than the training prediction length by feeding predictions back in as input
+- **Multi-scale Processing**: Handles various sequence lengths through resampling
+## Usage
+```python
+from transformers import AutoModelForCausalLM, AutoConfig
+import torch
+# Load the model
+model = AutoModelForCausalLM.from_pretrained("path/to/superlinear", trust_remote_code=True)
+# Prepare input time series data
+# Shape: [batch_size, sequence_length, features]
+input_data = torch.randn(1, 512, 1)
+# Generate predictions
+with torch.no_grad():
+    outputs = model(inputs_embeds=input_data, pred_len=96)
+    predictions = outputs.logits  # Shape: [batch_size, prediction_length, features]
+```
+## Configuration
+Key configuration parameters:
+- `train_seq_len`: Training sequence length (default: 512)
+- `train_pred_len`: Training prediction length (default: 96)
+- `top_k_experts`: Number of experts to use (default: 12)
+- `use_fft`: Whether to use FFT-based gating (default: True)
+- `freq_experts`: Frequency-specific expert configuration
+- `moe_temp`: Temperature for expert selection during inference (default: 1)
+## Citation
+If you use SuperLinear in your research, please cite:
+```bibtex
+@article{superlinear2024,
+  title={SuperLinear: Mixture of Experts for Time Series Forecasting},
+  author={Your Name},
+  year={2024}
+}
+```
+## License
+This model is released under the MIT License.

config.json CHANGED Viewed

@@ -31,6 +31,16 @@
   "_comment_training": "Training parameters",
   "resample_long_lookback": false,
   "_comment_system": "System and framework parameters",
   "model_type": "super_linear",
   "torch_dtype": "float32",

   "_comment_training": "Training parameters",
   "resample_long_lookback": false,
+  "_comment_horizon": "Auto-regressive and horizon parameters",
+  "long_horizon_scaling": 1,
+  "_comment_resampling": "Resampling and lookback-based parameters",
+  "lookback_resampling": 1,
+  "scale_list": "2,4,6",
+  "threshold": 0.2,
+  "freq_bound": 0.25,
+  "penalty_scale": 2.0,
   "_comment_system": "System and framework parameters",
   "model_type": "super_linear",
   "torch_dtype": "float32",

configuration_super_linear.py CHANGED Viewed

@@ -41,6 +41,16 @@ class SuperLinearConfig(PretrainedConfig):
         # Training parameters
         resample_long_lookback=False,
         **kwargs,
     ):
         # Model architecture parameters
@@ -66,4 +76,14 @@ class SuperLinearConfig(PretrainedConfig):
         # Training parameters
         self.resample_long_lookback = resample_long_lookback
         super().__init__(**kwargs)

         # Training parameters
         resample_long_lookback=False,
+        # Auto-regressive and horizon parameters
+        long_horizon_scaling=1,
+        # Resampling and lookback-based parameters
+        lookback_resampling=1,
+        scale_list=[2,4,6],
+        threshold=0.2,
+        freq_bound=0.25,
+        penalty_scale=2.0,
         **kwargs,
     ):
         # Model architecture parameters
         # Training parameters
         self.resample_long_lookback = resample_long_lookback
+        # Auto-regressive and horizon parameters
+        self.long_horizon_scaling = long_horizon_scaling
+        # Resampling and lookback-based parameters
+        self.lookback_resampling = lookback_resampling
+        self.scale_list = scale_list
+        self.threshold = threshold
+        self.freq_bound = freq_bound
+        self.penalty_scale = penalty_scale
         super().__init__(**kwargs)

example_usage.py ADDED Viewed

	@@ -0,0 +1,46 @@

+#!/usr/bin/env python3
+"""
+Example usage of SuperLinear model for time series forecasting.
+"""
+import torch
+from transformers import AutoModelForCausalLM, AutoConfig
+def main():
+    # Load model configuration and model
+    config = AutoConfig.from_pretrained("./", trust_remote_code=True)
+    model = AutoModelForCausalLM.from_pretrained("./", trust_remote_code=True)
+    # Set model to evaluation mode
+    model.eval()
+    # Create sample time series data
+    # Shape: [batch_size, sequence_length, features]
+    batch_size = 4
+    sequence_length = 512
+    num_features = 1
+    prediction_length = 96
+    # Generate synthetic time series data
+    t = torch.linspace(0, 10, sequence_length)
+    sample_data = torch.sin(t).unsqueeze(0).unsqueeze(-1).repeat(batch_size, 1, num_features)
+    print(f"Input shape: {sample_data.shape}")
+    # Generate predictions
+    with torch.no_grad():
+        outputs = model(inputs_embeds=sample_data, pred_len=prediction_length)
+        predictions = outputs.logits
+    print(f"Prediction shape: {predictions.shape}")
+    print(f"Sample predictions: {predictions[0, :5, 0]}")  # First 5 predictions of first batch
+    # Demonstrate with different prediction lengths
+    for pred_len in [24, 48, 96, 192]:
+        with torch.no_grad():
+            outputs = model(inputs_embeds=sample_data, pred_len=pred_len)
+            predictions = outputs.logits
+        print(f"Prediction length {pred_len}: output shape {predictions.shape}")
+if __name__ == "__main__":
+    main()

modeling_super_linear.py CHANGED Viewed

@@ -1,17 +1,19 @@
-from typing import Optional, Tuple
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
-import numpy as np
-from transformers import PreTrainedModel, GenerationMixin
 from transformers.modeling_outputs import CausalLMOutputWithCrossAttentions
 from .configuration_super_linear import SuperLinearConfig
 "-------------------------------------------------------------------------------------------------------------------"
 class RevIN(nn.Module):
-    def __init__(self, num_features: int, eps=1e-5, affine=True, norm_type = None, subtract_last = False):
         """
         :param num_features: the number of features or channels
         :param eps: a value added for numerical stability
@@ -26,13 +28,14 @@ class RevIN(nn.Module):
         if self.affine:
             self._init_params()
-    def forward(self, x, mode:str):
         if mode == 'norm':
             self._get_statistics(x)
             x = self._normalize(x)
         elif mode == 'denorm':
             x = self._denormalize(x)
-        else: raise NotImplementedError
         return x
     def _init_params(self):
@@ -44,18 +47,19 @@ class RevIN(nn.Module):
         dim2reduce = tuple(range(1, x.ndim-1))
         if self.subtract_last:
-            self.last = x[:,-1,:].unsqueeze(1)
         else:
             self.mean = torch.mean(x, dim=dim2reduce, keepdim=True).detach()
         self.stdev = torch.sqrt(torch.var(x, dim=dim2reduce, keepdim=True, unbiased=False) + self.eps).detach()
-        if  self.norm_type == "l1":
-            self.denom = torch.sum(torch.abs(x), dim=dim2reduce, keepdim=True).detach()
-        elif  self.norm_type == "l2":
-            self.denom = torch.sqrt(torch.sum(x**2, dim=dim2reduce, keepdim=True)).detach()
     def _normalize(self, x):
         if self.subtract_last:
             x = x - self.last
         else:
@@ -63,7 +67,7 @@ class RevIN(nn.Module):
         x = x / self.stdev
         if self.norm_type in ["l1", "l2"]:
-            x = x / self.denom
         if self.affine:
             x = x * self.affine_weight
@@ -74,8 +78,10 @@ class RevIN(nn.Module):
         if self.affine:
             x = x - self.affine_bias
             x = x / (self.affine_weight + self.eps*self.eps)
         if self.norm_type in ["l1", "l2"]:
-            x = x * self.denom
         x = x * self.stdev
         if self.subtract_last:
             x = x + self.last
@@ -173,7 +179,7 @@ class SparseMoE(nn.Module):
             self.gating_network = nn.Linear(configs.train_seq_len, self.num_experts, bias=True)
         if self.moe_norm:
-            self.gate_norm = nn.BatchNorm1d(self.num_experts)
     def get_periodogram(self, inputs, n=10000):
         """
@@ -189,38 +195,28 @@ class SparseMoE(nn.Module):
         Returns:
             Normalized periodogram of the input signals
         """
-        if inputs.dim() == 2:
-            x_0 = inputs.unsqueeze(2)
-        else:
-            x_0 = inputs
-        x_0 = x_0 - torch.mean(x_0, dim=1, keepdim=True)  # Remove mean (DC component)
         # Compute FFT and normalize
         dft = torch.fft.fft(x_0, dim=1, n=n) / np.sqrt(n)
-        dft = dft[:, :n//2, :]  # Keep only positive frequencies
         I = torch.abs(dft) ** 2  # Power spectral density
         # Normalize periodogram
         I_sum = torch.sum(I, dim=1, keepdim=True)
         I_sum[I_sum == 0] = 1  # Avoid division by zero
         I = I / I_sum
-        if torch.any(I_sum == 0):
-            print("Zeros in the sum")
-            raise ValueError
-        if inputs.dim() == 2:
-            I = I.squeeze(2)
         return I
-    def forward(self, x, get_prob=False):
         """
         Forward pass through the Mixture of Experts.
         Args:
             x: Input tensor of shape [batch_size, sequence_length]
             get_prob: Whether to return expert selection probabilities
         Returns:
             - Output tensor from the selected experts
@@ -233,27 +229,30 @@ class SparseMoE(nn.Module):
             x_0 = x
         # Get gating logits
-        self.gate_outputs = self.gating_network(x_0)  # Raw gating scores
         if self.moe_norm:
-            self.gate_outputs = self.gate_norm(self.gate_outputs)
         # Apply temperature scaling during inference
         if not self.training:
-            self.gate_outputs = self.gate_outputs / self.moe_temp
         # Add noise to gating logits during training (for exploration)
-        noise = torch.randn_like(self.gate_outputs).to(x.device) * self.noise_std
         if self.training:
-            noisy_gate_outputs = self.gate_outputs + noise
-            self.topk_values, topk_indices = torch.topk(noisy_gate_outputs, self.k, dim=1)
         else:
-            self.topk_values, topk_indices = torch.topk(self.gate_outputs, self.k, dim=1)
         # Normalize the gate values with softmax
-        self.topk_gates = F.softmax(self.topk_values, dim=1)
-        batch_size = x.size(0)
         # Get outputs from all experts
         expert_outputs = torch.stack([self.experts[i](x) for i in range(self.num_experts)], dim=1)
@@ -262,10 +261,10 @@ class SparseMoE(nn.Module):
         sparse_expert_outputs = torch.gather(expert_outputs, 1, topk_indices_expanded)
         # Combine expert outputs using the gate values
-        output = torch.sum(self.topk_gates.unsqueeze(2) * sparse_expert_outputs, dim=1)
         if get_prob:
-            expert_probs = F.softmax(self.gate_outputs, dim=1)
             return output, expert_probs
         return output
@@ -283,19 +282,36 @@ class Model(nn.Module):
     """
     def __init__(self, configs):
         super(Model, self).__init__()
-        self.configs = configs
-        self.model_name = "SuperLinear"
         self.train_pred_len = configs.train_pred_len
         self.train_seq_len = configs.train_seq_len
-        self.resample_long_lookback = configs.resample_long_lookback
         self.layer_type = configs.layer_type
         # Parse frequency experts from configuration
-        if configs.freq_experts == "":
             self.freq_experts = None
         else:
-            self.freq_experts = configs.freq_experts.split('_')
         self.top_k_experts = configs.top_k_experts
         self.freeze_experts = configs.freeze_experts
@@ -303,88 +319,222 @@ class Model(nn.Module):
         self.experts = {}
         if self.freq_experts is not None:
             for expert_freq in self.freq_experts:
-                if expert_freq == "naive" or expert_freq == "Naive":
                     self.experts[expert_freq] = Naive(self.train_seq_len, self.train_pred_len)
-                elif expert_freq == "mean" or expert_freq == "Mean":
                     self.experts[expert_freq] = Mean(self.train_seq_len, self.train_pred_len)
                 else:
-                    # Use the appropriate expert class based on layer_type
-                    expert_classes = {'Linear': Linear, 'RLinear': RLinear}
-                    if self.layer_type in expert_classes:
-                        expert_class = expert_classes[self.layer_type]
-                        self.experts[expert_freq] = expert_class(self.train_seq_len, self.train_pred_len)
-                    else:
-                        # Default to RLinear if unknown layer type
-                        self.experts[expert_freq] = RLinear(self.train_seq_len, self.train_pred_len)
         else:
-            raise ValueError("No frequency experts specified in configuration.")
         # Create additional complementary experts if specified
-        if configs.comp_moe > 0:
-            for i in range(configs.comp_moe):
-                expert_classes = {'Linear': Linear, 'RLinear': RLinear}
-                if self.layer_type in expert_classes:
-                    expert_class = expert_classes[self.layer_type]
-                    self.experts[f"comp_{i}"] = expert_class(self.train_seq_len, self.train_pred_len)
-                else:
-                    # Default to RLinear if unknown layer type
-                    self.experts[f"comp_{i}"] = RLinear(self.train_seq_len, self.train_pred_len)
-        # Initialize the MoE layer
         self.moe = SparseMoE(configs, experts=self.experts.values())
         print("Experts:", self.experts.keys())
-    def add_experts(self, experts: dict):
         """
         Add new experts to the model.
         Args:
             experts: Dictionary of expert instances to add
         """
         for name, expert in experts.items():
-            self.experts[name] = expert
         # Reinitialize the MoE layer with the updated experts
         self.moe = SparseMoE(self.configs, experts=self.experts.values())
         return self.moe
-    def resample_seq_len(self, x, pred_len, inverse=False, orig_pred_len=None):
         """
-        Resample sequence length for handling inputs shorter than expected training length.
         Args:
-            x: Input tensor
-            pred_len: Prediction length
-            inverse: If True, downsample back to original scale; if False, upsample
-            orig_pred_len: Original prediction length (required for inverse=True)
         Returns:
-            Tuple of (resampled_tensor, updated_pred_len, scale_factor, orig_pred_len)
-            For inverse=True: returns (resampled_tensor, None, None, None)
         """
-        if not inverse:
-            # Upsample if input is shorter than training length
-            if x.size(-1) < self.train_seq_len:
-                scale_factor = self.train_seq_len / x.size(-1)
-                x_resampled = F.interpolate(x.unsqueeze(1), size=self.train_seq_len, mode='linear', align_corners=False).squeeze(1)
-                pred_len_resampled = int(pred_len * scale_factor)
-                return x_resampled, pred_len_resampled, scale_factor, pred_len
-            else:
-                return x, pred_len, None, None
-        else:
-            # Downsample back to original scale
-            if orig_pred_len is not None:
-                x_resampled = F.interpolate(x.unsqueeze(1), size=orig_pred_len, mode='linear', align_corners=False).squeeze(1)
-                return x_resampled, None, None, None
-            else:
-                return x, None, None, None
-    def forward(self, x_in, get_prob=False, pred_len=None):
         """
         Forward pass through the model.
         Args:
-            x_in: Encoder input tensor
             get_prob: Whether to return expert selection probabilities
             pred_len: Override for prediction length
@@ -398,97 +548,142 @@ class Model(nn.Module):
         x = x_in
         # If input is 2D, add a channel dimension
         if x_in.dim() == 2:
-            x = x.unsqueeze(-1)
-        # Permute to shape [batch_size, features, sequence_length]
-        x = x.permute(0, 2, 1)
         B, V, L = x.shape
-        scale_factor = None
-        orig_pred_len = None
-        # Handle resampling if input is shorter than training length
-        if self.resample_long_lookback and L < self.train_seq_len:
-            x, pred_len, scale_factor, orig_pred_len = self.resample_seq_len(x, pred_len, inverse=False)
-        # Reshape for MoE processing
-        x = x.reshape(B * V, x.size(-1))
-        # Forward through MoE
         if get_prob:
             out, expert_probs = self.moe(x, get_prob=True)
         else:
             out = self.moe(x)
         if self.train_pred_len < pred_len:
             outputs = [out]
             ar_x = torch.cat([x, out], dim=1)[:, -self.train_seq_len:]
             for i in range(0, pred_len, self.train_pred_len):
                 ar_out = self.moe(ar_x)
                 outputs.append(ar_out)
                 ar_x = torch.cat([ar_x, ar_out], dim=1)[:, -self.train_seq_len:]
             out = torch.cat(outputs, dim=1)[:, :pred_len]
-        # Reshape back
-        out = out.reshape(B, V, out.size(-1))
-        # Handle resampling back to original scale if needed
-        if scale_factor is not None:
-            out, _, _, _ = self.resample_seq_len(out, None, inverse=True, orig_pred_len=orig_pred_len)
-        # Return to original shape conventions
-        result = out.permute(0, 2, 1)
-        if x_in.dim() == 2:
-            result = result.squeeze(-1)
         if get_prob:
             expert_probs = expert_probs.reshape(B, V, expert_probs.shape[-1])
-            return result, expert_probs
-        return result
 "-------------------------------------------------------------------------------------------------------------------"
-class SuperLinearForCausalLM(PreTrainedModel, GenerationMixin):
     config_class = SuperLinearConfig
     def __init__(self, config: SuperLinearConfig):
         super().__init__(config)
-        # the backbone keeps its own Config dataclass, so build one on‑the‑fly:
-        backbone_cfg   = type("Cfg", (), config.to_dict())()
-        self.args      = backbone_cfg
-        self.backbone  = Model(backbone_cfg)
-        self.post_init()
     # ------------------------------------------------------------------
     # Forward pass expected by AutoModelForCausalLM
     # ------------------------------------------------------------------
     def forward(self,
-                inputs_embeds: torch.Tensor = None,
-                attention_mask: Optional[torch.Tensor] = None,
-                past_key_values: Optional[Tuple] = None,
-                use_cache: bool = True,
-                labels: Optional[torch.Tensor] = None,
-                **kwargs,) -> CausalLMOutputWithCrossAttentions:
         if inputs_embeds is None:
-            raise ValueError("Pass the time‑series as `inputs_embeds`")
-        # backbone expects (B, C, L)
         x_enc = inputs_embeds
         # backbone returns (B, pred_len, C)
-        preds = self.backbone(x_enc, pred_len=kwargs.get("pred_len", None))
-        return CausalLMOutputWithCrossAttentions(loss=None,logits=preds,past_key_values=None,hidden_states=None,attentions=None,)
-    def prepare_inputs_for_generation(self, inputs_embeds, past_key_values=None, **kwargs):
-        if past_key_values is not None:
-            # only feed the last new step
-            inputs_embeds = inputs_embeds[:, -1:, :]
-        return {"inputs_embeds": inputs_embeds, "past_key_values": past_key_values}
-    def _reorder_cache(self, past, beam_idx, **kwargs):
-        return past  # backbone keeps no KV cache

+from typing import Optional, Tuple, Dict, List, Union
+import copy
+import numpy as np
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
+from torch.nn.functional import interpolate
+from transformers import PreTrainedModel
 from transformers.modeling_outputs import CausalLMOutputWithCrossAttentions
 from .configuration_super_linear import SuperLinearConfig
 "-------------------------------------------------------------------------------------------------------------------"
 class RevIN(nn.Module):
+    def __init__(self, num_features: int, eps=1e-5, affine=True, norm_type=None, subtract_last=False):
         """
         :param num_features: the number of features or channels
         :param eps: a value added for numerical stability
         if self.affine:
             self._init_params()
+    def forward(self, x, mode: str):
         if mode == 'norm':
             self._get_statistics(x)
             x = self._normalize(x)
         elif mode == 'denorm':
             x = self._denormalize(x)
+        else:
+            raise NotImplementedError
         return x
     def _init_params(self):
         dim2reduce = tuple(range(1, x.ndim-1))
         if self.subtract_last:
+            self.last = x[:, -1:, :].detach()
+            self.mean = torch.mean(x[:, :-1, :], dim=dim2reduce, keepdim=True).detach()
         else:
             self.mean = torch.mean(x, dim=dim2reduce, keepdim=True).detach()
         self.stdev = torch.sqrt(torch.var(x, dim=dim2reduce, keepdim=True, unbiased=False) + self.eps).detach()
+        if self.norm_type == "l1":
+            self.stdev = torch.mean(torch.abs(x - self.mean), dim=dim2reduce, keepdim=True).detach()
+        elif self.norm_type == "l2":
+            self.stdev = torch.sqrt(torch.mean((x - self.mean) ** 2, dim=dim2reduce, keepdim=True) + self.eps).detach()
     def _normalize(self, x):
         if self.subtract_last:
             x = x - self.last
         else:
         x = x / self.stdev
         if self.norm_type in ["l1", "l2"]:
+            x = x / self.stdev
         if self.affine:
             x = x * self.affine_weight
         if self.affine:
             x = x - self.affine_bias
             x = x / (self.affine_weight + self.eps*self.eps)
         if self.norm_type in ["l1", "l2"]:
+            x = x * self.stdev
         x = x * self.stdev
         if self.subtract_last:
             x = x + self.last
             self.gating_network = nn.Linear(configs.train_seq_len, self.num_experts, bias=True)
         if self.moe_norm:
+            self.batch_norm = nn.BatchNorm1d(self.num_experts)
     def get_periodogram(self, inputs, n=10000):
         """
         Returns:
             Normalized periodogram of the input signals
         """
+        x_0 = inputs - torch.mean(inputs, dim=1, keepdim=True)  # Remove mean (DC component)
         # Compute FFT and normalize
         dft = torch.fft.fft(x_0, dim=1, n=n) / np.sqrt(n)
+        dft = dft[:, :n//2]  # Keep only positive frequencies
         I = torch.abs(dft) ** 2  # Power spectral density
         # Normalize periodogram
         I_sum = torch.sum(I, dim=1, keepdim=True)
         I_sum[I_sum == 0] = 1  # Avoid division by zero
         I = I / I_sum
         return I
+    def forward(self, x, get_prob=False, get_prob_only=False):
         """
         Forward pass through the Mixture of Experts.
         Args:
             x: Input tensor of shape [batch_size, sequence_length]
             get_prob: Whether to return expert selection probabilities
+            get_prob_only: If True, return only the expert selection probabilities and skip running the experts
         Returns:
             - Output tensor from the selected experts
             x_0 = x
         # Get gating logits
+        gate_outputs = self.gating_network(x_0)  # Raw gating scores
         if self.moe_norm:
+            gate_outputs = self.batch_norm(gate_outputs)
         # Apply temperature scaling during inference
         if not self.training:
+            gate_outputs = gate_outputs / self.moe_temp
+        if get_prob_only:
+            expert_probs = F.softmax(gate_outputs, dim=1)
+            return expert_probs
         # Add noise to gating logits during training (for exploration)
         if self.training:
+            noise = torch.randn_like(gate_outputs).to(x.device) * self.noise_std
+            noisy_gate_outputs = gate_outputs + noise
+            topk_values, topk_indices = torch.topk(noisy_gate_outputs, self.k, dim=1)
         else:
+            topk_values, topk_indices = torch.topk(gate_outputs, self.k, dim=1)
         # Normalize the gate values with softmax
+        topk_gates = F.softmax(topk_values, dim=1)
         # Get outputs from all experts
         expert_outputs = torch.stack([self.experts[i](x) for i in range(self.num_experts)], dim=1)
         sparse_expert_outputs = torch.gather(expert_outputs, 1, topk_indices_expanded)
         # Combine expert outputs using the gate values
+        output = torch.sum(topk_gates.unsqueeze(2) * sparse_expert_outputs, dim=1)
         if get_prob:
+            expert_probs = F.softmax(gate_outputs, dim=1)
             return output, expert_probs
         return output
     """
     def __init__(self, configs):
         super(Model, self).__init__()
+        self.configs = copy.deepcopy(configs)
+        # Core model configuration
         self.train_pred_len = configs.train_pred_len
         self.train_seq_len = configs.train_seq_len
         self.layer_type = configs.layer_type
+        # Read additional configuration attributes (taken directly from configs; no fallback defaults are applied here)
+        self.long_horizon_scaling = configs.long_horizon_scaling
+        self.lookback_resampling = configs.lookback_resampling
+        lookback_scale_str = configs.scale_list
+        if isinstance(lookback_scale_str, str):
+            self.scale_list = [float(x.strip()) for x in lookback_scale_str.split(',')]
+        else:
+            self.scale_list = lookback_scale_str  # Already a list
+        self.threshold = configs.threshold
+        self.freq_bound = configs.freq_bound
+        self.penalty_scale = configs.penalty_scale
+        self.fft_len = configs.fft_len
         # Parse frequency experts from configuration
+        freq_experts_str = configs.freq_experts
+        if freq_experts_str == "":
             self.freq_experts = None
         else:
+            self.freq_experts = freq_experts_str.split('_')
+        # Expert configuration
         self.top_k_experts = configs.top_k_experts
         self.freeze_experts = configs.freeze_experts
         self.experts = {}
         if self.freq_experts is not None:
             for expert_freq in self.freq_experts:
+                if expert_freq.lower() == "naive":
                     self.experts[expert_freq] = Naive(self.train_seq_len, self.train_pred_len)
+                elif expert_freq.lower() == "mean":
                     self.experts[expert_freq] = Mean(self.train_seq_len, self.train_pred_len)
                 else:
+                    self.experts[expert_freq] = RLinear(self.train_seq_len, self.train_pred_len)
+            self.n_experts = len(self.experts)
         else:
+            raise ValueError("Please specify experts in the configuration.")
         # Create additional complementary experts if specified
+        comp_moe = configs.comp_moe
+        if comp_moe > 0:
+            if comp_moe == 1:
+                print("Creating complementary expert")
+                self.experts["comp"] = RLinear(self.train_seq_len, self.train_pred_len)
+            else:
+                for i in range(comp_moe):
+                    print(f"Creating complementary expert {i}")
+                    self.experts["comp_"+str(i)] = RLinear(self.train_seq_len, self.train_pred_len)
+        # Initialize the MoE layer
         self.moe = SparseMoE(configs, experts=self.experts.values())
         print("Experts:", self.experts.keys())
+    def add_experts(self, experts: Dict[str, nn.Module]) -> nn.Module:
         """
         Add new experts to the model.
         Args:
             experts: Dictionary of expert instances to add
+        Returns:
+            Updated MoE layer
         """
         for name, expert in experts.items():
+            if name not in self.experts:
+                self.experts[name] = expert
+                print(f"Added expert: {name}")
+            else:
+                print(f"Expert {name} already exists. Skipping addition.")
         # Reinitialize the MoE layer with the updated experts
         self.moe = SparseMoE(self.configs, experts=self.experts.values())
         return self.moe
+    def apply_long_horizon_scaling(self, ar_out: torch.Tensor, ar_x: torch.Tensor) -> torch.Tensor:
         """
+        Apply scaling to auto-regressive outputs to maintain statistical properties during long horizon prediction.
+        This function identifies cases where the variance of the new predictions exceeds the variance
+        of the input sequence and applies scaling to maintain consistent statistical properties.
         Args:
+            ar_out: Auto-regressive output tensor of shape [batch_size * features, pred_len]
+            ar_x: Input sequence tensor of shape [batch_size * features, seq_len]
         Returns:
+            Scaled auto-regressive output tensor
         """
+        if not (self.long_horizon_scaling and not self.training):
+            return ar_out
+        # Calculate statistics for scaling
+        std_new = torch.std(ar_out, dim=1, keepdim=True)
+        mean_new = torch.mean(ar_out, dim=1, keepdim=True)
+        std_old = torch.std(ar_x, dim=1, keepdim=True)
+        # Find rows whose new standard deviation exceeds the old standard deviation
+        inds = torch.where(std_new / std_old > 1)[0]
+        if len(inds) > 0:
+            # Center the outputs around their mean
+            ar_out_centered = ar_out[inds] - mean_new[inds]
+            # Calculate scaling factor to match the old standard deviation
+            scaling = std_old[inds] / (std_new[inds] + 1e-8)
+            # Scale and shift back to mean_new
+            ar_out_adjusted = ar_out_centered * scaling + mean_new[inds]
+            ar_out[inds] = ar_out_adjusted
+        return ar_out
+    def lookback_resample_search(self, x, scale_list=[2,4,6], min_lookback=512):
+        """
+        Search for optimal resampling scale based on lookback analysis of expert selection.
+        This function analyzes the frequency content and expert selection lookback to determine
+        the best resampling scale for each input sequence, potentially improving model performance
+        by matching input characteristics to expert capabilities.
+        Args:
+            x: Input tensor of shape [batch_size, features, sequence_length]
+            scale_list: List of potential downsampling scales to evaluate
+            min_lookback: Minimum sequence length required after resampling
+        Returns:
+            Tuple of (resampled_input, final_scales) where:
+            - resampled_input: Optimally resampled input tensor
+            - final_scales: Scale factors used for each sample
+        """
+        B, V, L = x.shape
+        lookback = self.train_seq_len
+        x_0 = x.reshape(B*V, L)[:, -lookback:]
+        output_x = x_0.clone()[:, -lookback:]
+        x_reshape = x.reshape(B*V, L)
+        x_fft_init = self.moe.get_periodogram(x_reshape, n=self.fft_len)
+        right_cumsum = torch.cumsum(x_fft_init, dim=-1)
+        mask = right_cumsum > 1-self.threshold
+        j_threshold = mask.float().argmax(dim=-1)
+        freqs = np.array([np.linspace(0, 0.5, self.fft_len//2)])
+        threshhold_freqs = np.take_along_axis(freqs, j_threshold.unsqueeze(-1).detach().cpu().numpy(), axis=1)
+        # where threshhold_freqs is 0, set to a small value to avoid division by zero
+        threshhold_freqs[threshhold_freqs == 0] = self.freq_bound
+        max_scale_factor = (self.freq_bound/ threshhold_freqs).astype(int).flatten()
+        if self.threshold==0:
+            max_scale_factor = np.inf * np.ones(B*V, dtype=int)
+        # Compute energy loss penalty for each potential scale
+        energy_loss_penalties = {}
+        total_energy = torch.sum(x_fft_init, dim=-1)  # Total energy per sample
+        for scale in scale_list:
+            if scale <= 1:
+                continue  # No penalty for upsampling or no scaling
+            # Calculate Nyquist frequency after downsampling
+            nyquist_after_downsample = 0.5 / scale
+            # Find frequency bins that will be lost (above new Nyquist)
+            freq_bins = torch.linspace(0, 0.5, self.fft_len//2, device=x_fft_init.device)
+            lost_freq_mask = freq_bins > nyquist_after_downsample
+            # Calculate energy that will be lost
+            lost_energy = torch.sum(x_fft_init[:, lost_freq_mask], dim=-1)
+            # Energy loss fraction (0 = no loss, 1 = all energy lost)
+            energy_loss_fraction = lost_energy / (total_energy + 1e-10)
+            energy_loss_penalties[scale] = energy_loss_fraction
+        # Get initial entropy
+        prob = self.moe(x_0, get_prob_only=True)
+        best_scores = -torch.sum(prob * torch.log(prob + 1e-10), dim=-1)
+        final_scales = torch.ones(B*V, device=x.device)
+        for scale in scale_list:
+            x_interp = torch.nn.functional.interpolate(
+                x, scale_factor=1/scale, mode='linear', align_corners=True
+            )
+            if x_interp.shape[2] >= min_lookback:
+                x_interp_reshaped = x_interp.reshape(B*V, x_interp.shape[-1])
+                x_interp_reshaped = x_interp_reshaped[:, -lookback:]
+                prob = self.moe(x_interp_reshaped, get_prob_only=True)
+                scores = -torch.sum(prob * torch.log(prob + 1e-10), dim=-1)
+                # Add energy loss penalty
+                if scale in energy_loss_penalties:
+                    energy_penalty = energy_loss_penalties[scale]
+                    scores = scores + energy_penalty*self.penalty_scale
+                idx = np.where((scores < best_scores).cpu() & torch.tensor(max_scale_factor >= scale))[0]
+                if len(idx) > 0:
+                    output_x[idx] = x_interp_reshaped[idx]
+                    best_scores[idx] = scores[idx]
+                    final_scales[idx] = scale
+        return output_x.reshape(B, V, output_x.shape[-1]), final_scales
+    def lookback_resample_reverse(self, y, final_scales, inf_pred_len=None):
+        """
+        Reverse the resampling operation on the output.
+        This function upsamples the model outputs back to the original scale
+        based on the resampling factors used during input processing.
+        Args:
+            y: Output tensor from model of shape [batch_size, features, pred_len]
+            final_scales: Scale factors used during input resampling
+            inf_pred_len: Target prediction length
+        Returns:
+            Upsampled output tensor of shape [batch_size, features, inf_pred_len]
+        """
+        B, V, L = y.shape
+        y_reshaped = y.view(B*V, L)
+        y_out = y_reshaped[:, :inf_pred_len]
+        unique_scales = torch.unique(final_scales)
+        for scale in unique_scales:
+            scale_val = scale.item()  # Convert tensor to scalar
+            if scale_val > 1:
+                idx = torch.where(final_scales == scale)[0]
+                if len(idx) > 0:
+                    y_interp = torch.nn.functional.interpolate(
+                        y_reshaped[idx].unsqueeze(1), scale_factor=scale_val, mode='linear', align_corners=True
+                    )
+                    y_out[idx] = y_interp.reshape(len(idx), y_interp.shape[-1])[:, :inf_pred_len]
+        return y_out.reshape(B, V, inf_pred_len)
+    def forward(self, x_in: torch.Tensor, get_prob: bool = False, pred_len: Optional[int] = None) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
         """
         Forward pass through the model.
         Args:
+            x_in: Encoder input tensor of shape [batch_size, sequence_length] or [batch_size, features, sequence_length]
             get_prob: Whether to return expert selection probabilities
             pred_len: Override for prediction length
         x = x_in
         # If input is 2D, add a channel dimension
         if x_in.dim() == 2:
+            x = x.unsqueeze(1)
         B, V, L = x.shape
+        short_lookback = False
+        # Remember the caller's horizon: pred_len itself is stretched below for short inputs.
+        orig_pred_len = pred_len
+        # NOTE(review): pred_len is multiplied and compared with ints below, so it must already
+        # be an int here; resolving the None default presumably happens in lines not visible in
+        # this view -- confirm.
+        if L < self.train_seq_len:
+            # Handle case where input sequence is shorter than expected
+            # by interpolating to the required length
+            scale_factor = self.train_seq_len / L
+            # Round up to an integer factor so the upsampled series is at least train_seq_len long.
+            scale_factor = int(np.ceil(scale_factor))
+            # The horizon is stretched by the same factor; it is shrunk back after prediction.
+            pred_len = pred_len * scale_factor
+            x = interpolate(x, scale_factor=scale_factor, mode='linear')
+            # Keep only the most recent train_seq_len points of the upsampled series.
+            x = x[:, :, -self.train_seq_len:]
+            L = self.train_seq_len
+            short_lookback = True
+        # lookback resampling logic
+        final_scales = None
+        # Only inputs longer than the training lookback are candidates for downsampling.
+        if self.lookback_resampling and L > self.train_seq_len:
+            x_resampled, final_scales = self.lookback_resample_search(
+                x, self.scale_list, self.train_seq_len
+            )
+            # Update x and L for the resampled input
+            x = x_resampled
+            L = x.shape[-1]
+        # Reshape to process each feature independently
+        x = x.reshape(B * V, L)
+        expert_probs = None
+        # Forward pass through MoE
         if get_prob:
             out, expert_probs = self.moe(x, get_prob=True)
         else:
             out = self.moe(x)
+        # Auto-regressive prediction for long horizons
         if self.train_pred_len < pred_len:
             outputs = [out]
+            # The next window is the tail of input + prediction, clipped to train_seq_len.
             ar_x = torch.cat([x, out], dim=1)[:, -self.train_seq_len:]
+            # NOTE(review): the loop runs ceil(pred_len / train_pred_len) times in addition to
+            # the initial `out`; any surplus steps are discarded by the [:, :pred_len] slice below.
             for i in range(0, pred_len, self.train_pred_len):
                 ar_out = self.moe(ar_x)
+                ar_out = self.apply_long_horizon_scaling(ar_out, ar_x)
                 outputs.append(ar_out)
                 ar_x = torch.cat([ar_x, ar_out], dim=1)[:, -self.train_seq_len:]
             out = torch.cat(outputs, dim=1)[:, :pred_len]
+        # Reshape back to batch format
+        out = out.reshape(B, V, out.shape[-1])
+        # Apply lookback resampling reverse if it was used
+        # NOTE(review): final_scales is only set when L > train_seq_len, which never holds after
+        # the short-lookback branch, so `not short_lookback` looks redundant -- confirm.
+        if self.lookback_resampling and final_scales is not None and not short_lookback:
+            out = self.lookback_resample_reverse(out, final_scales, orig_pred_len)
+        # If we used interpolation earlier, now downsample back to original scale
+        if short_lookback:
+            out = interpolate(out, scale_factor=1/scale_factor, mode='linear')
+        # Trim to the horizon the caller asked for.
+        out = out[:, :, :orig_pred_len]
+        # Drop the channel axis that was added for 2D input.
+        if x_in.dim() == 2:
+            out = out.squeeze(1)
         if get_prob:
+            # expert_probs comes from the first MoE call only; auto-regressive steps do not update it.
             expert_probs = expert_probs.reshape(B, V, expert_probs.shape[-1])
+          #  expert_probs = expert_probs.permute(0, 2, 1)  # [batch_size, num_experts, sequence_length]
+            # NOTE(review): with the permute above commented out, the last axis is the expert
+            # axis, so squeeze(-1) is a no-op unless there is a single expert; `out` uses
+            # squeeze(1) to drop the channel axis -- confirm which axis is intended here.
+            if x_in.dim() == 2:
+                expert_probs = expert_probs.squeeze(-1)
+            return out, expert_probs
+        return out
+    def map_to_cycle(self, freq: str) -> int:
+        """
+        Map frequency string notation to cycle length (number of periods).
+        Args:
+            freq: String representing a time frequency (e.g., "h" for hourly, "D" for daily)
+        Returns:
+            Integer representing the number of periods in the cycle
+        """
+        cycle = int(freq.split("/")[1])
+        return cycle
 "-------------------------------------------------------------------------------------------------------------------"
+class SuperLinearForCausalLM(PreTrainedModel):
+    """
+    PreTrainedModel wrapper that exposes the forecasting backbone through a causal-LM style forward().
+    The backbone forecast is returned in the ``logits`` field of the output object, and the expert
+    probabilities (when requested) in its ``attentions`` field.
+    """
     config_class = SuperLinearConfig
     def __init__(self, config: SuperLinearConfig):
         super().__init__(config)
+        # the backbone keeps its own Config dataclass, so build one on-the-fly:
+        # type(...) creates a throwaway class whose attributes are the config's dict entries.
+        backbone_cfg = type("Cfg", (), config.to_dict())()
+        self.args = backbone_cfg
+        self.backbone = Model(backbone_cfg)
+        self.post_init()
     # ------------------------------------------------------------------
     # Forward pass expected by AutoModelForCausalLM
     # ------------------------------------------------------------------
     def forward(self,
+                inputs_embeds: torch.Tensor = None,
+                pred_len: Optional[int] = None,
+                get_prob: bool = False,
+                **kwargs) -> CausalLMOutputWithCrossAttentions:
+        """
+        Run the backbone on ``inputs_embeds`` and wrap the result.
+        Args:
+            inputs_embeds: Input series tensor, passed to the backbone unchanged
+            pred_len: Prediction length forwarded to the backbone
+            get_prob: Whether to also request expert probabilities from the backbone
+            **kwargs: Accepted and ignored
+        Returns:
+            CausalLMOutputWithCrossAttentions with ``logits`` set to the backbone forecast,
+            ``attentions`` set to the expert probabilities (None unless ``get_prob``), and
+            ``hidden_states`` set to None
+        Raises:
+            ValueError: If ``inputs_embeds`` is None
+        """
         if inputs_embeds is None:
+            raise ValueError("inputs_embeds must be provided")
+        # backbone expects (B, C, L) or (B, L)
+        # NOTE(review): no transpose happens here, so callers must already supply that layout.
         x_enc = inputs_embeds
         # backbone returns (B, pred_len, C)
+        # NOTE(review): the backbone forward shown in this file reshapes its output to
+        # (B, V, pred_len) and squeezes the channel axis for 2D input, which disagrees with
+        # the comment above -- confirm the actual layout.
+        if get_prob:
+            preds, probs = self.backbone(x_enc, pred_len=pred_len, get_prob=True)
+        else:
+            preds = self.backbone(x_enc, pred_len=pred_len, get_prob=False)
+            probs = None
+        return CausalLMOutputWithCrossAttentions(
+            logits=preds,
+            hidden_states=None,
+            attentions=probs
+        )

requirements.txt ADDED Viewed

	@@ -0,0 +1,9 @@

+pyyaml
+numpy
+pandas
+torch
+scikit-learn
+ipykernel
+transformers>=4.40.1
+datasets>=2.18.0
+accelerate>=0.28.0