Update modeling_super_linear.py
modeling_super_linear.py  CHANGED  (+9 -68)
@@ -1,12 +1,12 @@
-from
-import torch
+from typing import Optional, Tuple
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
 import numpy as np
-import matplotlib.pyplot as plt
-import os
 
-from transformers
-from transformers.modeling_outputs
-from .configuration_super_linear
+from transformers import PreTrainedModel, GenerationMixin
+from transformers.modeling_outputs import CausalLMOutputWithCrossAttentions
+from .configuration_super_linear import SuperLinearConfig
 
 
 "-------------------------------------------------------------------------------------------------------------------"
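The import change drops the unused matplotlib/os imports and pulls in PreTrainedModel, GenerationMixin, and SuperLinearConfig, which is what the Transformers remote-code loading path expects. A minimal usage sketch, assuming the repo ships this file alongside configuration_super_linear.py and maps the classes in its config; the repo id below is a placeholder, not confirmed by this commit:

import torch
from transformers import AutoConfig, AutoModelForCausalLM

repo_id = "org/super-linear"  # placeholder repo id

# trust_remote_code tells Transformers to import modeling_super_linear.py from the repo
config = AutoConfig.from_pretrained(repo_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(repo_id, trust_remote_code=True)
model.eval()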
@@ -170,7 +170,7 @@ class SparseMoE(nn.Module):
         if self.use_fft:
             self.gating_network = nn.Linear(self.fft_len//2, self.num_experts, bias=True)
         else:
-            self.gating_network = nn.Linear(configs.
+            self.gating_network = nn.Linear(configs.train_seq_len, self.num_experts, bias=True)
 
         if self.moe_norm:
             self.gate_norm = nn.BatchNorm1d(self.num_experts)
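With FFT gating disabled, the changed line sizes the gate to score experts directly from the raw lookback window (configs.train_seq_len inputs) rather than from its spectrum. A standalone sketch of how such a linear gate typically feeds a sparse top-k mixture; the sizes and names are illustrative, not the exact SparseMoE.forward in this file:

import torch
import torch.nn as nn
import torch.nn.functional as F

seq_len, num_experts, top_k = 96, 4, 2          # illustrative sizes
gate = nn.Linear(seq_len, num_experts, bias=True)

x = torch.randn(8, seq_len)                     # (batch, lookback window)
logits = gate(x)                                # (batch, num_experts)
top_val, top_idx = logits.topk(top_k, dim=-1)   # keep the k best experts per sample
weights = F.softmax(top_val, dim=-1)            # renormalize over the kept experts
# output = sum over k of weights[:, k] * expert_{top_idx[:, k]}(x)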
@@ -289,11 +289,6 @@ class Model(nn.Module):
         self.train_seq_len = configs.train_seq_len
         self.resample_long_lookback = configs.resample_long_lookback
         self.layer_type = configs.layer_type
-        self.load_weights_full = configs.load_weights_full
-        self.load_linear = configs.load_linear
-
-        if self.load_weights_full:
-            pass  # TODO: implement full weight loading
 
         # Parse frequency experts from configuration
         if configs.freq_experts == "":
@@ -303,9 +298,6 @@ class Model(nn.Module):
 
         self.top_k_experts = configs.top_k_experts
         self.freeze_experts = configs.freeze_experts
-        path = configs.linear_freq_weights_path
-        linear_freq_dirs = os.listdir(path) if os.path.exists(path) else []
-        checkpoints_paths = [path + "/" + d + "/" + "checkpoint.pth" for d in linear_freq_dirs]
 
         # Initialize experts based on frequency specification or create generic experts
         self.experts = {}
@@ -324,19 +316,6 @@ class Model(nn.Module):
                 else:
                     # Default to RLinear if unknown layer type
                     self.experts[expert_freq] = RLinear(self.train_seq_len, self.train_pred_len)
-
-                if self.load_linear and checkpoints_paths:
-                    cycle = self.map_to_cycle(expert_freq)
-                    cycle_str = f'cycle_{cycle}/'
-                    cycle_checkpoint_path = [cp for cp in checkpoints_paths if (cycle_str in cp and self.layer_type in cp)]
-                    if len(cycle_checkpoint_path) > 0:
-                        cycle_checkpoint_path = cycle_checkpoint_path[0]
-                        print(f'Loading checkpoint: {cycle_checkpoint_path}')
-                        self.experts[expert_freq].load_state_dict(torch.load(cycle_checkpoint_path))
-
-                if self.freeze_experts:
-                    for param in self.experts[expert_freq].parameters():
-                        param.requires_grad = False
         else:
             # Create generic experts
             for i in range(configs.n_experts):
@@ -359,12 +338,8 @@ class Model(nn.Module):
                     # Default to RLinear if unknown layer type
                     self.experts[f"comp_{i}"] = RLinear(self.train_seq_len, self.train_pred_len)
 
-        # Initialize the MoE layer
+        # Initialize the MoE layer
         self.moe = SparseMoE(configs, experts=self.experts.values())
-
-        # Load pre-trained weights if specified
-        if configs.load_weights_full:
-            pass  # TODO: implement full weight loading
 
         print("Experts:", self.experts.keys())
 
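With the checkpoint scanning, per-expert load_state_dict calls, and the freeze_experts loop removed from __init__ in the hunks above, equivalent behavior would have to be applied after construction if it is still wanted. A hedged sketch, assuming model is an instance of the Model class above and model.experts is the dict built in __init__:

import torch

# Freeze every expert after the model is built, mirroring the removed freeze_experts branch
for expert in model.experts.values():
    for param in expert.parameters():
        param.requires_grad = False

# Optionally restore one expert from a checkpoint (path and key are placeholders)
# state = torch.load("checkpoint.pth", map_location="cpu")
# model.experts["h"].load_state_dict(state)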
@@ -470,40 +445,6 @@ class Model(nn.Module):
             expert_probs = expert_probs.reshape(B, V, expert_probs.shape[-1])
             return result, expert_probs
         return result
-
-    def map_to_cycle(self, freq):
-        """Map frequency string to cycle length for expert loading."""
-        if "/" in freq:
-            cycle = int(freq.split("/")[1])
-        elif "h" in freq:
-            cycle = 24
-        elif "2h" in freq:
-            cycle = 12
-        elif "3h" in freq:
-            cycle = 8
-        elif "4h" in freq:
-            cycle = 6
-        elif "D" in freq:
-            cycle = 7
-        elif "DM" in freq:
-            cycle = 30
-        elif "W" in freq:
-            cycle = 52
-        elif "M" in freq:
-            cycle = 12
-        elif "min" in freq:
-            cycle = 1440
-        elif "5min" in freq:
-            cycle = 288
-        elif "10min" in freq:
-            cycle = 144
-        elif "15min" in freq:
-            cycle = 96
-        elif "30min" in freq:
-            cycle = 48
-        else:
-            cycle = int(freq)
-        return cycle
 "-------------------------------------------------------------------------------------------------------------------"
 class SuperLinearForCausalLM(PreTrainedModel, GenerationMixin):
     config_class = SuperLinearConfig
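For reference, the removed map_to_cycle helper resolved frequencies with a substring elif chain, so the earlier "h", "D", and "min" branches shadowed the more specific "2h", "DM", "5min", and similar entries. A hedged external replacement using exact-match lookup; this is a sketch, not part of this commit:

CYCLE_BY_FREQ = {
    "h": 24, "2h": 12, "3h": 8, "4h": 6,
    "D": 7, "DM": 30, "W": 52, "M": 12,
    "min": 1440, "5min": 288, "10min": 144, "15min": 96, "30min": 48,
}

def map_to_cycle(freq: str) -> int:
    if "/" in freq:                  # "freq/<cycle>" encodes the cycle explicitly
        return int(freq.split("/")[1])
    if freq in CYCLE_BY_FREQ:
        return CYCLE_BY_FREQ[freq]
    return int(freq)                 # fall back to a plain integer string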