SequentialLearning
/

SuperLinear

@@ -1,19 +1,22 @@
 from   typing import Optional, Tuple
 import torch, torch.nn as nn, torch.nn.functional as F
 from transformers                          import (PreTrainedModel,GenerationMixin,AutoConfig,AutoModelForCausalLM,)
 from transformers.modeling_outputs         import CausalLMOutputWithCrossAttentions
 from .configuration_super_linear           import SuperLinearConfig
-from torch.nn.functional import interpolate
-import datetime
 import numpy as np
 import matplotlib.pyplot as plt
 import os
-import numpy as np
 "-------------------------------------------------------------------------------------------------------------------"
 class RevIN(nn.Module):
@@ -99,7 +102,6 @@ class moving_avg(nn.Module):
         self.kernel_size = kernel_size
         self.avg = nn.AvgPool1d(kernel_size=kernel_size, stride=stride, padding=0)
     def forward(self, x):
         # x: [Batch, Input length]
         # padding on the both ends of time series
@@ -147,8 +149,12 @@ class Linear(nn.Module):
     def forward(self, x):
         # x: [Batch*Channel, Input length]
-        x = x.clone()
-        x = self.Linear(x).clone()
         return x # to [Batch, Output length, Channel]
 class Naive(nn.Module):
@@ -159,7 +165,11 @@ class Naive(nn.Module):
     def forward(self, x):
         # x: [Batch*Channel, Input length]
         x =  x[:,-1].unsqueeze(1).repeat(1, self.output_len)
         return x # to [Batch, Output length, Channel]
 class Mean(nn.Module):
@@ -169,7 +179,9 @@ class Mean(nn.Module):
     def forward(self, x):
         # x: [Batch*Channel, Input length]
         x =  x.mean(dim=1).unsqueeze(1).repeat(1, self.output_len)
         return x # to [Batch, Output length, Channel]
@@ -179,74 +191,31 @@ class NLinear(nn.Module):
         self.Linear = nn.Linear(input_len, output_len)
     def forward(self, x):
-        # x: [Batch, Input length,Channel]
         seq_last = x[:,-1:].detach()
         x = x - seq_last
         x = self.Linear(x)
-        return x+seq_last # to [Batch, Output length, Channel]
 class RLinear(nn.Module):
     def __init__(self, input_len, output_len):
         super(RLinear, self).__init__()
-        self.Linear              = nn.Linear(input_len, output_len)
-        self.seq_len             = input_len
-        self.horizon             = output_len
-        self.revin_layer         = RevIN(num_features = None, affine=False, norm_type = None, subtract_last = False)
-        self.zero_shot_Linear    = None
-    def transform_model(self,new_lookback,mode):
-        if mode == 1:
-            W              = self.Linear.weight.detach()
-            new_W          = W[:, -new_lookback:]
-            original_norm  = torch.norm(W, p=2)
-            new_norm       = torch.norm(new_W, p=2)
-            final_scaling  = original_norm / new_norm if new_norm.item() != 0 else 1.0
-            new_W          = new_W * final_scaling
-            self.zero_shot_Linear        = new_W
-        elif mode ==2:
-            W = self.Linear.weight.detach()
-            W4d = W.unsqueeze(0).unsqueeze(0)            # (1, 1, out, in)
-            # resize   H → self.horizon   and   W → new_lookback
-            new_W = F.interpolate(
-                        W4d,
-                        size=(self.horizon, new_lookback),   # (H_out, W_out)
-                        mode='bilinear',
-                        align_corners=False
-                    )[0, 0]                                 # drop the two singleton dims
-            self.zero_shot_Linear = new_W                  # shape (self.horizon, new_lookback)
-        else:
-            W = self.Linear.weight.detach()
-            m      = nn.AdaptiveAvgPool1d(new_lookback)
-            self.zero_shot_Linear = m(W)
     def forward(self, x):
         # x: [Batch, Input length,Channel]
         x_shape = x.shape
-        ''''if x.shape[1] < self.seq_len:
-            #if self.zero_shot_Linear is None:
-                #print(F"new Lookkback : {x.shape[1]}")
-            self.transform_model(x.shape[1],1)
-            x = self.revin_layer(x, 'norm')
-            x = F.linear(x, self.zero_shot_Linear)
-            x = self.revin_layer(x, 'denorm')
-            return x'''
         if len(x_shape) == 2:
             x = x.unsqueeze(-1)
         x = x.clone()
         x = self.revin_layer(x, 'norm')
         x = self.Linear(x.permute(0,2,1)).permute(0,2,1).clone()
         x = self.revin_layer(x, 'denorm')
         if len(x_shape) == 2:
             x = x.squeeze(-1)
         return x # to [Batch, Output length, Channel]
@@ -254,23 +223,27 @@ class RLinear(nn.Module):
 "-------------------------------------------------------------------------------------------------------------------"
 class SparseNoisyMoE(nn.Module):
     def __init__(self, configs, experts=None):
-        self.i = 0
         super(SparseNoisyMoE, self).__init__()
         input_dim = configs.seq_len
-        self.lookback = configs.seq_len
         output_dim = configs.pred_len
-        self.k = configs.top_k_experts
         self.noise_std = configs.noisy_gating_std
         self.noise_std_decay = configs.noisy_gating_std_decay
         self.experts = nn.ModuleList(experts)
         self.num_experts = len(experts)
-        self.ker_len = configs.ker_len
-        self.con = configs.con
         self.d_model = configs.d_model
         self.mlp_gating = configs.mlp_gating
         self.moe_temp = configs.moe_temp
         self.use_fft = configs.use_fft
         self.fft_len = configs.fft_len
         if self.use_fft:
             if self.mlp_gating:
@@ -279,12 +252,18 @@ class SparseNoisyMoE(nn.Module):
                     nn.ReLU(),
                     nn.Linear(self.d_model, self.num_experts)
                 )
             else:
                 self.gating_network = nn.Linear(self.fft_len//2, self.num_experts, bias=True)
         else:
             self.gating_network = nn.Linear(input_dim, self.num_experts, bias=True)
-    def get_periodogram(self, inputs, ker_len=50, con=1, n=10000):
         if inputs.dim() == 2:
             x_0 = inputs.unsqueeze(2)
         else:
@@ -292,19 +271,6 @@ class SparseNoisyMoE(nn.Module):
         x_0 = x_0 - torch.mean(x_0, dim=1, keepdim=True)
         v = torch.arange(0, n) / n
-        if con:
-            if ker_len is None:
-                ker_len = n // 4
-                ker_len = min(ker_len, 50)
-            x_0 = x_0.permute(0, 2, 1)
-            ker = (torch.ones(1, 1, ker_len) / ker_len).to(x_0.device)
-            x_c = F.conv1d(x_0, ker, padding="same")
-            x_c[:, :, :ker_len // 2] = x_c[:, :, ker_len // 2:ker_len // 2 + 1]
-            x_c[:, :, -ker_len // 2:] = x_c[:, :, -ker_len // 2 - 1:-ker_len // 2]
-            x_0 = x_0 - x_c
-            x_0 = x_0.permute(0, 2, 1)
         dft = torch.fft.fft(x_0, dim=1, n=n) / np.sqrt(n)
         dft = dft[:, :n//2, :]
         I = torch.abs(dft) ** 2
@@ -322,40 +288,35 @@ class SparseNoisyMoE(nn.Module):
         return I
-    def fourier_interp_dim1(self,x, target_len: int = 512):
-        L = x.size(1)
-        X      = torch.fft.rfft(x, dim=1)                   # (..., 25, ...)
-        pad    = target_len // 2 + 1 - X.size(1)
-        X_pad  = torch.cat([X, X.new_zeros(*X.shape[:-1], pad)], dim=1)
-        y      = torch.fft.irfft(X_pad, n=target_len, dim=1)
-        return y
     def forward(self, x, get_prob=False):
         if self.use_fft:
-            x_0 = self.get_periodogram(x, ker_len=self.ker_len, n=self.fft_len, con=self.con)
         else:
             x_0 = x
-        self.gate_outputs = self.gating_network(x_0)
-        #print(self.gate_outputs.shape)
         if not self.training:
             self.gate_outputs = self.gate_outputs / self.moe_temp
         noise = torch.randn_like(self.gate_outputs).to(x.device) * self.noise_std
         if self.training:
             noisy_gate_outputs = self.gate_outputs + noise
-            self.topk_values, topk_indices = torch.topk(noisy_gate_outputs, self.k, dim=1)
         else:
             self.topk_values, topk_indices = torch.topk(self.gate_outputs, self.k, dim=1)
         self.topk_gates = F.softmax(self.topk_values, dim=1)
         batch_size = x.size(0)
-        '''if x.shape[1] < 512:
-            x = self.fourier_interp_dim1(x)'''
         expert_outputs = torch.stack([self.experts[i](x) for i in range(self.num_experts)], dim=1)
         topk_indices_expanded = topk_indices.unsqueeze(-1).expand(-1, -1, expert_outputs.size(2))
@@ -364,10 +325,9 @@ class SparseNoisyMoE(nn.Module):
         output = torch.sum(self.topk_gates.unsqueeze(2) * sparse_expert_outputs, dim=1)
         load_balancing_loss = self.calculate_load_balancing_loss(self.gate_outputs, batch_size)
         if get_prob:
             expert_probs = F.softmax(self.gate_outputs, dim=1)
-            print(expert_probs.shape)
             return output, load_balancing_loss, expert_probs
         return output, load_balancing_loss
@@ -387,6 +347,7 @@ class SparseNoisyMoE(nn.Module):
         return load_balancing_loss
 class superLinear(nn.Module):
     def __init__(self, configs):
         super(superLinear, self).__init__()
@@ -399,13 +360,14 @@ class superLinear(nn.Module):
         self.auto_regressive = configs.auto_regressive
         self.n_experts = configs.moe_n_experts
         self.moe = configs.moe
         if configs.freq_experts == "":
             self.freq_experts = None
         else:
             self.freq_experts = configs.freq_experts.split('_')
         self.moe_loss = None
         self.top_k_experts = configs.top_k_experts
@@ -415,9 +377,9 @@ class superLinear(nn.Module):
         self.layer_type = configs.layer_type
         self.model_name = "SuperLinear"
         self.layer_dict = {'DLinear': DLinear, 'Linear': Linear, 'NLinear': NLinear, 'RLinear': RLinear}
-        path = configs.linear_checkpoints_path + configs.linear_checkpoints_dir + "/"
         dirs = os.listdir(path)
         checkpoints_paths = [path + "/" + d + "/" + "checkpoint.pth" for d in dirs]
@@ -462,12 +424,38 @@ class superLinear(nn.Module):
         self.manual_moe = configs.manual_moe
-        if configs.misc_moe == 1:
-            self.experts["misc"] = self.layer_dict[self.layer_type](self.seq_len, self.pred_len)
         self.moe = SparseNoisyMoE(configs, experts=self.experts.values())
         self.dropout = nn.Dropout(configs.dropout)
     def map_to_cycle(self, freq):
         if "/" in freq:
             cycle = int(freq.split("/")[1])
@@ -502,7 +490,7 @@ class superLinear(nn.Module):
         return cycle
-    def forward(self, x_enc, x_mark_enc=None, x_dec=None, x_mark_dec=None, mask=None, freq=[None], get_prob=False, inf_pred_len=None):
         if inf_pred_len is  None:
             inf_pred_len = self.inf_pred_len
@@ -511,13 +499,12 @@ class superLinear(nn.Module):
             x = x_enc.permute(0, 2, 1)
             B, V, L = x.shape
         else:
-            x     = x_enc
-            x_enc =  x_enc.unsqueeze(-1)
             B, L = x.shape
             V    = 1
         short_lookback = False
-        if L < self.seq_len:
           #  print("test!")
             #ceil - very bad heuristic!
             scale_factor = self.seq_len / L
@@ -527,13 +514,12 @@ class superLinear(nn.Module):
             inf_pred_len = inf_pred_len*scale_factor
             x = interpolate(x_enc.permute(0, 2, 1), scale_factor=scale_factor, mode='linear')
             x = x[:,: , -self.seq_len:]
             orig_L = L
             L = self.seq_len
             short_lookback  = True
         x = x.reshape(B * V, L)
         expert_probs = None
@@ -556,8 +542,7 @@ class superLinear(nn.Module):
         if short_lookback:
             out = interpolate(out, scale_factor=1/scale_factor, mode='linear')
-           # print(out.shape)
-            out  = out[:, :,:orig_pred_len]
         result = out.permute(0, 2, 1)
         if get_prob:
             expert_probs = expert_probs.reshape(B, V, expert_probs.shape[-1])
@@ -576,64 +561,9 @@ class SuperLinearForCausalLM(PreTrainedModel, GenerationMixin):
         backbone_cfg     = type("Cfg", (), config.to_dict())()
         self.args        = backbone_cfg
         self.backbone    = superLinear(backbone_cfg)
-        self.revin_layer = RevIN(num_features = None, affine=False, norm_type = None, subtract_last = False)
         self.post_init()
-    def fourier_interp_dim1(self,x, target_len: int = 512):
-        L = x.size(1)
-        X      = torch.fft.rfft(x, dim=1)                   # (..., 25, ...)
-        pad    = target_len // 2 + 1 - X.size(1)
-        X_pad  = torch.cat([X, X.new_zeros(*X.shape[:-1], pad)], dim=1)
-        y      = torch.fft.irfft(X_pad, n=target_len, dim=1)
-        return y
-    def fourier_downsample_dim1(self,x,target_len: int):
-        L = x.size(1)
-        # 1. Forward real FFT along dim-1
-        X = torch.fft.rfft(x, dim=1)                    # shape (..., L//2 + 1, ...)
-        # 2. Keep only the low-frequency bins needed for the shorter series
-        keep = target_len // 2 + 1                      # rfft size for the target grid
-        X_crop = X[..., :keep]                          # ideal brick-wall low-pass
-        # 3. Inverse FFT to the shorter grid
-        y = torch.fft.irfft(X_crop, n=target_len, dim=1)
-        return y
-    def upsample_interpolate(self, x, scale_factor, target_len: int = 512, mode='bicubic'):
-        was_2d = x.dim() == 2
-        if was_2d:                                  # [B, L] -> [B, 1, L]
-            x = x.unsqueeze(1)
-        else:                                       # [B, L, C] -> [B, C, L]
-            x = x.permute(0, 2, 1)
-        # Add support for bicubic interpolation by adding an extra dimension
-        if mode == 'bicubic':
-            x = x.unsqueeze(2)  # [B, C, 1, L]
-            x_up = F.interpolate(x, size=(1, target_len), mode='bicubic', align_corners=False)
-            x_up = x_up.squeeze(2)  # [B, C, L]
-        else:
-            x_up = F.interpolate(x, size=target_len, mode=mode, align_corners=False)
-        x_up = x_up * scale_factor
-        # Restore original layout
-        if was_2d:                                  # back to [B, target_len]
-            return x_up.squeeze(1).float()
-        else:                                       # back to [B, target_len, C]
-            return x_up.permute(0, 2, 1).float()
     def forward(self,
                 inputs_embeds: torch.Tensor = None,
                 attention_mask: Optional[torch.Tensor] = None,
@@ -647,19 +577,7 @@ class SuperLinearForCausalLM(PreTrainedModel, GenerationMixin):
             raise ValueError("Pass the time‑series as `inputs_embeds`")
         # backbone expects (B, C, L)
-        x_enc = inputs_embeds
-        if x_enc.shape[1] < 512:
-            '''scale_factor = int(np.ceil(512/x_enc.shape[-1]))
-            x_enc = self.upsample_interpolate(x_enc,scale_factor,512)
-            self.backbone.inf_pred_len = 96*scale_factor
-            preds = self.backbone(x_enc)
-            preds = self.upsample_interpolate(preds,1/scale_factor,96)'''
-            preds = self.backbone(x_enc)
-        else:
-            preds = self.backbone(x_enc)
         return CausalLMOutputWithCrossAttentions(loss=None,logits=preds,past_key_values=None,hidden_states=None,attentions=None,)
@@ -673,3 +591,4 @@ class SuperLinearForCausalLM(PreTrainedModel, GenerationMixin):
         return past  # backbone keeps no KV cache

 from   typing import Optional, Tuple
 import torch, torch.nn as nn, torch.nn.functional as F
 from transformers                          import (PreTrainedModel,GenerationMixin,AutoConfig,AutoModelForCausalLM,)
 from transformers.modeling_outputs         import CausalLMOutputWithCrossAttentions
 from .configuration_super_linear           import SuperLinearConfig
+from layers.Linear_layers import DLinear, Linear, NLinear, RLinear, Naive, Mean
+import math
+import torch
 import numpy as np
+import torch.nn as nn
+import torch.nn.functional as F
 import matplotlib.pyplot as plt
 import os
+from torch.nn.functional import interpolate
+import datetime
 "-------------------------------------------------------------------------------------------------------------------"
 class RevIN(nn.Module):
         self.kernel_size = kernel_size
         self.avg = nn.AvgPool1d(kernel_size=kernel_size, stride=stride, padding=0)
     def forward(self, x):
         # x: [Batch, Input length]
         # padding on the both ends of time series
     def forward(self, x):
         # Apply self.Linear to x and return the result.
         # x: [Batch*Channel, Input length]
         # A 2-D input gets a trailing singleton dim before the layer is applied;
         # squeeze(-1) afterwards only removes the last dim if it has size 1.
         # NOTE(review): self.Linear is defined outside this view. nn.Linear acts
         # on the LAST dim, which has size 1 after unsqueeze(-1); RLinear.forward
         # wraps the same call in permute(0,2,1) -- confirm whether that is
         # missing here.
+        x_shape = x.shape
+        if len(x_shape) == 2:
+            x = x.unsqueeze(-1)
+        x = self.Linear(x)
+        if len(x_shape) == 2:
+            x = x.squeeze(-1)
         return x # presumably [Batch*Channel, Output length] for 2-D input -- TODO confirm
 class Naive(nn.Module):
     def forward(self, x):
         # x: [Batch*Channel, Input length]
         x =  x[:,-1].unsqueeze(1).repeat(1, self.output_len)
         return x # to [Batch, Output length, Channel]
 class Mean(nn.Module):
     def forward(self, x):
         # x: [Batch*Channel, Input length]
         x =  x.mean(dim=1).unsqueeze(1).repeat(1, self.output_len)
         return x # to [Batch, Output length, Channel]
         self.Linear = nn.Linear(input_len, output_len)
     def forward(self, x):
         # Forecast relative to the last observed value: subtract it, apply the
         # linear layer, then add it back.
+        # x: presumably [Batch*Channel, Input length] (self.Linear maps the last dim) -- TODO confirm
         # Index -1 along dim 1, kept as a length-1 slice; detach() keeps this
         # offset out of the autograd graph.
         seq_last = x[:,-1:].detach()
         x = x - seq_last
         x = self.Linear(x)
         # Restore the offset removed above; seq_last has length 1 along dim 1
         # and is broadcast along it.
+        x = x + seq_last
+        return x
 class RLinear(nn.Module):
     # Single nn.Linear projection wrapped by a RevIN layer: the input goes
     # through self.revin_layer in 'norm' mode, is projected, and then goes
     # through the same layer in 'denorm' mode.
     def __init__(self, input_len, output_len):
         # input_len / output_len: in/out feature sizes of the linear projection.
         super(RLinear, self).__init__()
+        self.Linear      = nn.Linear(input_len, output_len)
         # RevIN is defined elsewhere in this file; forward() calls it with the
         # modes 'norm' and 'denorm'.
+        self.revin_layer = RevIN(num_features = None, affine=False, norm_type = None, subtract_last = False)
     def forward(self, x):
         # x: [Batch, Input length,Channel]; a 2-D [Batch, Input length] input is
         # also accepted and handled as a single channel.
         x_shape = x.shape
         if len(x_shape) == 2:
             x = x.unsqueeze(-1)
         # clone(): the following steps operate on a copy, not the caller's tensor.
         x = x.clone()
         x = self.revin_layer(x, 'norm')
         # nn.Linear acts on the last dim, so move dim 1 last, project, move it back.
         x = self.Linear(x.permute(0,2,1)).permute(0,2,1).clone()
         x = self.revin_layer(x, 'denorm')
         if len(x_shape) == 2:
             # Drop the trailing dim that was added above for 2-D input.
             x = x.squeeze(-1)
         return x # [Batch, Output length, Channel], or [Batch, Output length] for 2-D input, assuming RevIN preserves shape -- TODO confirm
 "-------------------------------------------------------------------------------------------------------------------"
 class SparseNoisyMoE(nn.Module):
     def __init__(self, configs, experts=None):
         super(SparseNoisyMoE, self).__init__()
         input_dim = configs.seq_len
         output_dim = configs.pred_len
         self.noise_std = configs.noisy_gating_std
         self.noise_std_decay = configs.noisy_gating_std_decay
         self.experts = nn.ModuleList(experts)
         self.num_experts = len(experts)
+        self.k = configs.top_k_experts
+        if self.k > self.num_experts:
+            print(f"Warning: k ({self.k}) is greater than the number of experts ({self.num_experts}). Setting k to {self.num_experts}.")
+            self.k = self.num_experts
+       # self.ker_len = configs.ker_len
+        #self.con = configs.con
         self.d_model = configs.d_model
         self.mlp_gating = configs.mlp_gating
         self.moe_temp = configs.moe_temp
         self.use_fft = configs.use_fft
         self.fft_len = configs.fft_len
+        self.moe_norm = configs.moe_norm
         if self.use_fft:
             if self.mlp_gating:
                     nn.ReLU(),
                     nn.Linear(self.d_model, self.num_experts)
                 )
             else:
                 self.gating_network = nn.Linear(self.fft_len//2, self.num_experts, bias=True)
         else:
             self.gating_network = nn.Linear(input_dim, self.num_experts, bias=True)
+        if self.moe_norm:
+            self.batch_norm = nn.BatchNorm1d(self.num_experts)
+    def get_periodogram(self, inputs,   n=10000):
         if inputs.dim() == 2:
             x_0 = inputs.unsqueeze(2)
         else:
         x_0 = x_0 - torch.mean(x_0, dim=1, keepdim=True)
         v = torch.arange(0, n) / n
         dft = torch.fft.fft(x_0, dim=1, n=n) / np.sqrt(n)
         dft = dft[:, :n//2, :]
         I = torch.abs(dft) ** 2
         return I
     def forward(self, x, get_prob=False):
         if self.use_fft:
+           # x_0 = self.get_periodogram(x, ker_len=self.ker_len, n=self.fft_len, con=self.con)
+           x_0 = self.get_periodogram(x,  n=self.fft_len)
         else:
             x_0 = x
+        self.gate_outputs = self.gating_network(x_0) # g(X)
+        if self.moe_norm:
+          #  self.gate_outputs = self.batch_norm(self.gate_outputs)
+          self.gate_outputs = self.batch_norm(self.gate_outputs)
+        #
         if not self.training:
             self.gate_outputs = self.gate_outputs / self.moe_temp
+        # original
         noise = torch.randn_like(self.gate_outputs).to(x.device) * self.noise_std
         if self.training:
             noisy_gate_outputs = self.gate_outputs + noise
+            self.topk_values, topk_indices = torch.topk(noisy_gate_outputs, self.k, dim=1) # N = 35, k=6,12,20
         else:
             self.topk_values, topk_indices = torch.topk(self.gate_outputs, self.k, dim=1)
         self.topk_gates = F.softmax(self.topk_values, dim=1)
         batch_size = x.size(0)
         expert_outputs = torch.stack([self.experts[i](x) for i in range(self.num_experts)], dim=1)
         topk_indices_expanded = topk_indices.unsqueeze(-1).expand(-1, -1, expert_outputs.size(2))
         output = torch.sum(self.topk_gates.unsqueeze(2) * sparse_expert_outputs, dim=1)
         load_balancing_loss = self.calculate_load_balancing_loss(self.gate_outputs, batch_size)
         if get_prob:
             expert_probs = F.softmax(self.gate_outputs, dim=1)
             return output, load_balancing_loss, expert_probs
         return output, load_balancing_loss
         return load_balancing_loss
 class superLinear(nn.Module):
     def __init__(self, configs):
         super(superLinear, self).__init__()
         self.auto_regressive = configs.auto_regressive
         self.n_experts = configs.moe_n_experts
         self.moe = configs.moe
+        self.model_name = "SuperLinear"
         if configs.freq_experts == "":
             self.freq_experts = None
         else:
             self.freq_experts = configs.freq_experts.split('_')
+        print("self.freq_experts:", self.freq_experts)
         self.moe_loss = None
         self.top_k_experts = configs.top_k_experts
         self.layer_type = configs.layer_type
         self.model_name = "SuperLinear"
+        print("self.layer_type", self.layer_type)
         self.layer_dict = {'DLinear': DLinear, 'Linear': Linear, 'NLinear': NLinear, 'RLinear': RLinear}
+        path = configs.linear_checkpoints_path + configs.linear_checkpoints_dir
         dirs = os.listdir(path)
         checkpoints_paths = [path + "/" + d + "/" + "checkpoint.pth" for d in dirs]
         self.manual_moe = configs.manual_moe
+        if configs.misc_moe>0:
+            if configs.misc_moe == 1:
+                print("Creating misc expert")
+                self.experts["misc"] = self.layer_dict[self.layer_type](self.seq_len, self.pred_len)
+            else:
+                for i in range(configs.misc_moe):
+                    print(f"Creating misc expert {i}")
+                    self.experts["misc_"+str(i)] = self.layer_dict[self.layer_type](self.seq_len, self.pred_len)
+        if configs.misc_moe2==1:
+                print("Creating misc expert")
+                self.experts["misc2"] = self.layer_dict[self.layer_type](self.seq_len, self.pred_len)
         self.moe = SparseNoisyMoE(configs, experts=self.experts.values())
         self.dropout = nn.Dropout(configs.dropout)
+        if configs.load_weights:
+            print(f"Loading weights from {path}")
+            path = configs.load_weights_path + "" + configs.load_weights_dir + "/" + "checkpoint.pth"
+            if os.path.exists(path):
+              #  print(f"Loading weights from {path}")
+                checkpoint = torch.load(path)
+                print(len(self.experts.keys()))
+                print(self.experts.keys())
+                print(self.state_dict().keys())
+                print(checkpoint.keys())
+                self.load_state_dict(checkpoint)
+            else:
+                print(f"Path {path} does not exist. Skipping loading weights.")
     def map_to_cycle(self, freq):
         if "/" in freq:
             cycle = int(freq.split("/")[1])
         return cycle
+    def forward(self, x_enc, x_mark_enc=None, x_dec=None, x_mark_dec=None, mask=None, freq=[None], get_prob=False):
         if inf_pred_len is  None:
             inf_pred_len = self.inf_pred_len
             x = x_enc.permute(0, 2, 1)
             B, V, L = x.shape
         else:
+            x    = x_enc
             B, L = x.shape
             V    = 1
         short_lookback = False
+        if L<self.seq_len:
           #  print("test!")
             #ceil - very bad heuristic!
             scale_factor = self.seq_len / L
             inf_pred_len = inf_pred_len*scale_factor
             x = interpolate(x_enc.permute(0, 2, 1), scale_factor=scale_factor, mode='linear')
             x = x[:,: , -self.seq_len:]
             orig_L = L
             L = self.seq_len
             short_lookback  = True
         x = x.reshape(B * V, L)
         expert_probs = None
         if short_lookback:
             out = interpolate(out, scale_factor=1/scale_factor, mode='linear')
+            out = out[:, :,:orig_pred_len]
         result = out.permute(0, 2, 1)
         if get_prob:
             expert_probs = expert_probs.reshape(B, V, expert_probs.shape[-1])
         backbone_cfg     = type("Cfg", (), config.to_dict())()
         self.args        = backbone_cfg
         self.backbone    = superLinear(backbone_cfg)
         self.post_init()
     def forward(self,
                 inputs_embeds: torch.Tensor = None,
                 attention_mask: Optional[torch.Tensor] = None,
             raise ValueError("Pass the time‑series as `inputs_embeds`")
         # backbone expects (B, C, L)
+        preds = self.backbone(inputs_embeds)
         return CausalLMOutputWithCrossAttentions(loss=None,logits=preds,past_key_values=None,hidden_states=None,attentions=None,)
         return past  # backbone keeps no KV cache