import math

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import StandardScaler
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, Dataset
| """ | |
| Concrete Creep Prediction Model with LLM-Style Full History Processing | |
| This model uses an LLM-style approach for predicting concrete creep, where the entire history | |
| of creep measurements is processed using transformer architecture. | |
| Key improvements: | |
| 1. Full token utilization - instead of only using the last token, the model leverages all tokens | |
| in the creep history sequence using a hybrid pooling method that combines: | |
| - Mean pooling: Average of all sequence tokens | |
| - Attention pooling: Weighted sum based on learned attention | |
| - Last token: Traditional approach (which worked well in previous versions) | |
| This hybrid approach provides a richer representation of the sequence history, | |
| allowing the model to better capture both overall patterns and recent trends. | |
| """ | |
# Set random seeds for reproducibility
torch.manual_seed(42)
np.random.seed(42)

# Define the file paths
EXCEL_FEATURE_FILE = 'data_r28april.xlsx'
EXCEL_CREEP_FILE = 'creep_predictions.xlsx'

# Loader tailored to the creep predictions workbook (EXCEL_CREEP_FILE)
def load_creep_prediction_file():
    """
    Load the creep predictions file, whose structure is:
    - Columns represent samples
    - Rows represent time points
    """
    try:
        # Load the file
        df_creep = pd.read_excel(EXCEL_CREEP_FILE)
        print(f"Loaded creep file with shape: {df_creep.shape}")
        # Check whether the first column holds time values
        first_col = df_creep.columns[0]
        if first_col in ['time', 'Time', 'TIME', 't', 'T', 'day', 'Day', 'DAY', 'd', 'D'] or str(first_col).lower().startswith(('time', 'day')):
            print(f"First column '{first_col}' recognized as time values")
            # Extract time values as an array to preserve for later use
            time_values = df_creep.iloc[:, 0].values
            # Remove the time column to keep only sample data
            df_creep = df_creep.iloc[:, 1:]
            # Store time values in the DataFrame attributes for reference
            df_creep.attrs['time_values'] = time_values
        else:
            print(f"First column '{first_col}' not recognized as time; treating rows as time points")
            # Generate sequential time values if not provided
            time_values = np.arange(1, len(df_creep) + 1)
            df_creep.attrs['time_values'] = time_values
        print(f"DataFrame processed: {df_creep.shape[1]} samples across {df_creep.shape[0]} time points")
        return df_creep
    except Exception as e:
        print(f"Error loading creep prediction file: {str(e)}")
        # Return an empty DataFrame as a fallback
        return pd.DataFrame()

# load_data uses the specialized loader above
def load_data():
    # Read creep predictions using the specialized loader
    df_creep = load_creep_prediction_file()
    # Read features from the original file
    df_features = pd.read_excel(EXCEL_FEATURE_FILE, sheet_name='Sheet2')
    # Ensure we have the same number of samples in both dataframes
    # (samples are columns in the creep data and rows in the feature data)
    if df_creep.shape[1] != len(df_features):
        print(f"Warning: Creep data has {df_creep.shape[1]} samples (columns) but features data has {len(df_features)} rows")
        # Keep only the samples present in both datasets
        min_samples = min(df_creep.shape[1], len(df_features))
        df_creep = df_creep.iloc[:, :min_samples]
        df_features = df_features.iloc[:min_samples]
        print(f"Using only {min_samples} samples that match between datasets")
    return df_creep, df_features

# Custom Dataset class for full-history prediction (LLM-style)
class LLMConcreteCreepDataset(Dataset):
    def __init__(self, creep_data, time_data, features, target_len=1):
        """
        Args:
            creep_data: List of variable-length time series [sample_idx][time_idx]
            time_data: List of time points [sample_idx][time_idx]
            features: Feature matrix [n_samples, n_features]
            target_len: Number of values to predict
        """
        self.creep_data = creep_data    # List of time series
        self.time_data = time_data      # List of time points
        self.features = features        # Feature data
        self.target_len = target_len    # Number of values to predict
        # Create samples
        self.samples = self._prepare_samples()

    def _prepare_samples(self):
        """
        Prepare samples for LLM-style prediction.
        Each sample includes all previous time steps up to time t
        and targets the next target_len values.
        """
        samples = []
        for i in range(len(self.creep_data)):
            time_series = self.creep_data[i]
            time_points = self.time_data[i] if self.time_data is not None else None
            feature_vec = self.features[i]
            # For each time step (excluding the last target_len steps)
            for t in range(1, len(time_series) - self.target_len + 1):
                # Input: all previous values up to t
                history = time_series[:t]
                # Matching time points, if available
                time_history = time_points[:t] if time_points is not None else None
                # Target: the next target_len values
                targets = time_series[t:t + self.target_len]
                samples.append((history, targets, feature_vec, time_history))
        return samples

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        history, targets, features, time_history = self.samples[idx]
        # Convert to tensors
        history_tensor = torch.FloatTensor(history)
        targets_tensor = torch.FloatTensor(targets)
        features_tensor = torch.FloatTensor(features)
        if time_history is not None:
            time_tensor = torch.FloatTensor(time_history)
            return history_tensor, targets_tensor, features_tensor, time_tensor, len(history)
        return history_tensor, targets_tensor, features_tensor, len(history)
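
# Example (illustrative): with target_len=1, a series [c1, c2, c3, c4] expands
# into three training samples, mirroring next-token prediction in an LLM:
#   history=[c1]          -> target=[c2]
#   history=[c1, c2]      -> target=[c3]
#   history=[c1, c2, c3]  -> target=[c4]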

# Custom collate function to handle variable-length sequences
def collate_fn(batch):
    """
    Pad variable-length sequences for batched processing.
    """
    # Sort by sequence length (descending)
    if len(batch[0]) > 4:  # With time data
        batch.sort(key=lambda x: x[4], reverse=True)
        histories, targets, features, times, lengths = zip(*batch)
        # Pad sequences - keep all tensors on CPU; they are moved to the right device later
        padded_histories = pad_sequence(histories, batch_first=True)
        padded_targets = torch.stack(targets)
        padded_features = torch.stack(features)
        padded_times = pad_sequence(times, batch_first=True)
        return padded_histories, padded_targets, padded_features, padded_times, torch.tensor(lengths, dtype=torch.int64)
    else:  # Without time data
        batch.sort(key=lambda x: x[3], reverse=True)
        histories, targets, features, lengths = zip(*batch)
        padded_histories = pad_sequence(histories, batch_first=True)
        padded_targets = torch.stack(targets)
        padded_features = torch.stack(features)
        return padded_histories, padded_targets, padded_features, torch.tensor(lengths, dtype=torch.int64)
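
# Minimal usage sketch for collate_fn (hypothetical values, without time data):
#   batch = [
#       (torch.FloatTensor([0.1, 0.2, 0.3]), torch.FloatTensor([0.4]),
#        torch.FloatTensor([1.0, 2.0]), 3),
#       (torch.FloatTensor([0.5]), torch.FloatTensor([0.6]),
#        torch.FloatTensor([3.0, 4.0]), 1),
#   ]
#   histories, targets, features, lengths = collate_fn(batch)
#   # histories.shape == (2, 3) with zero padding; lengths == tensor([3, 1])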

# Prepare data for the LLM-style model
def prepare_llm_data(target_len=1, test_size=0.05, val_size=0.05):
    # Load data from files
    df_creep, df_features = load_data()
    # Prepare variable-length sequences and time points
    creep_sequences = []
    time_points = []
    # Report the format of the creep data file
    print(f"Creep data has {df_creep.shape[1]} samples across {df_creep.shape[0]} time points")
    # Get time values if available from the data loading step
    if hasattr(df_creep, 'attrs') and 'time_values' in df_creep.attrs:
        time_values = df_creep.attrs['time_values']
        print(f"Found time values with shape: {time_values.shape}")
        # Make sure time_values matches the number of rows in df_creep
        if len(time_values) != df_creep.shape[0]:
            print(f"Warning: Time values length ({len(time_values)}) doesn't match data rows ({df_creep.shape[0]})")
            # Truncate or extend time_values to match
            if len(time_values) > df_creep.shape[0]:
                time_values = time_values[:df_creep.shape[0]]
            else:
                # Extend with sequential values
                additional = np.arange(len(time_values) + 1, df_creep.shape[0] + 1)
                time_values = np.append(time_values, additional)
            print(f"Adjusted time values to length: {len(time_values)}")
    else:
        # Generate sequential time values if not available
        time_values = np.arange(1, df_creep.shape[0] + 1)
        print("Using generated sequential time values")
    # Process each column (sample) in the creep data
    for col_idx in range(df_creep.shape[1]):
        try:
            # Extract the column as a sample time series
            sample_series = df_creep.iloc[:, col_idx].values
            # Filter out any NaN values
            valid_indices = ~np.isnan(sample_series)
            if not np.any(valid_indices):
                print(f"Skipping column {col_idx} - no valid data")
                continue
            # Keep only valid data and the corresponding time points
            valid_series = sample_series[valid_indices]
            valid_times = time_values[valid_indices]
            # Keep sequences with more than target_len + 1 points, which
            # guarantees at least two history/target training pairs
            if len(valid_series) > target_len + 1:
                creep_sequences.append(valid_series)
                time_points.append(valid_times)
            else:
                print(f"Skipping column {col_idx} - insufficient data points ({len(valid_series)})")
        except Exception as e:
            print(f"Error processing column {col_idx}: {str(e)}")
            continue
    # Log data shape
    print(f"Extracted {len(creep_sequences)} valid creep sequences")
    # Ensure we have the same number of feature rows as creep sequences
    if len(creep_sequences) != len(df_features):
        print(f"Warning: Number of valid sequences ({len(creep_sequences)}) doesn't match feature count ({len(df_features)})")
        if len(creep_sequences) < len(df_features):
            # More features than sequences: truncate features
            df_features = df_features.iloc[:len(creep_sequences)]
            print(f"Truncated features to {len(df_features)} rows")
        else:
            # More sequences than features: truncate sequences
            creep_sequences = creep_sequences[:len(df_features)]
            time_points = time_points[:len(df_features)]
            print(f"Truncated sequences to {len(creep_sequences)}")
    # Check that we have at least one sequence
    if len(creep_sequences) == 0:
        raise ValueError("No valid sequences extracted. Check data format and filtering.")
    # Normalize features
    feature_scaler = StandardScaler()
    normalized_features = feature_scaler.fit_transform(df_features)

    # Define the CreepScaler class for consistency with llm_predict.py
    class CreepScaler:
        def __init__(self, factor=1000):
            self.factor = factor
            self.mean_ = 0        # Default to no mean shift
            self.scale_ = factor  # Use factor as scale
            self.is_standard_scaler = False

        def transform(self, X):
            if isinstance(X, np.ndarray):
                if self.is_standard_scaler:
                    return (X - self.mean_) / self.scale_
                return X / self.factor
            return np.array(X) / self.factor

        def inverse_transform(self, X):
            if isinstance(X, np.ndarray):
                if self.is_standard_scaler:
                    return (X * self.scale_) + self.mean_
                return X * self.factor
            return np.array(X) * self.factor

    # Create a creep scaler that divides by 1000
    creep_scaler = CreepScaler(factor=1000)
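    # Example of the scaling round trip (illustrative): with factor=1000,
    # creep_scaler.transform(np.array([1500.0])) -> array([1.5]) and
    # creep_scaler.inverse_transform(np.array([1.5])) -> array([1500.0]).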
    # Apply normalization to sequences
    normalized_creep_sequences = []
    for seq in creep_sequences:
        normalized_seq = creep_scaler.transform(np.array(seq).reshape(-1, 1)).flatten()
        normalized_creep_sequences.append(normalized_seq)
    # Normalize time points (log scale to handle large time values)
    normalized_time_points = []
    for seq in time_points:
        normalized_seq = np.log1p(np.array(seq))  # log1p to handle zeros
        normalized_time_points.append(normalized_seq)
    # Print validation information
    print(f"Final dataset: {len(normalized_creep_sequences)} sequences")
    print(f"First sequence length: {len(normalized_creep_sequences[0])} time points")
    # Create dataset
    dataset = LLMConcreteCreepDataset(
        normalized_creep_sequences,
        normalized_time_points,
        normalized_features,
        target_len
    )
    # If the dataset is empty, raise an error
    if len(dataset) == 0:
        raise ValueError("Dataset is empty. Check the data preparation process.")
    # Calculate split sizes
    train_ratio = 1.0 - (test_size + val_size)
    train_size = int(len(dataset) * train_ratio)
    val_size_samples = int(len(dataset) * val_size)
    test_size_samples = len(dataset) - train_size - val_size_samples
    # Split into train, validation, and test sets using random_split
    print(f"Splitting dataset into {train_ratio*100:.1f}% train, {val_size*100:.1f}% validation, {test_size*100:.1f}% test")
    print(f"Train: {train_size} samples, Validation: {val_size_samples} samples, Test: {test_size_samples} samples")
    train_dataset, val_dataset, test_dataset = torch.utils.data.random_split(
        dataset, [train_size, val_size_samples, test_size_samples]
    )
    return train_dataset, val_dataset, test_dataset, feature_scaler, creep_scaler

# Positional Encoding
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=5000):
        super(PositionalEncoding, self).__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe)

    def forward(self, x):
        # x: [batch_size, seq_len, d_model]
        return x + self.pe[:x.size(1), :].unsqueeze(0)
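
# The buffer above follows the standard sinusoidal scheme from
# "Attention Is All You Need":
#   PE(pos, 2i)   = sin(pos / 10000^(2i / d_model))
#   PE(pos, 2i+1) = cos(pos / 10000^(2i / d_model))
# so each position maps to a unique, smoothly varying d_model-dimensional code.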

# Feature Encoder for static features
class FeatureEncoder(nn.Module):
    def __init__(self, input_dim, hidden_dim, dropout=0.1):
        super(FeatureEncoder, self).__init__()
        # Original encoding path
        self.fc1 = nn.Linear(input_dim, hidden_dim * 2)
        self.ln1 = nn.LayerNorm(hidden_dim * 2)
        self.fc2 = nn.Linear(hidden_dim * 2, hidden_dim)
        self.ln2 = nn.LayerNorm(hidden_dim)
        # Feature-wise projection (each scalar feature to dimension 16)
        self.feature_projection = nn.Linear(1, 16)
        # Feature attention configuration: embed_dim must be divisible by num_heads
        feature_embed_dim = 16
        feature_heads = 4  # 16 is divisible by 4
        # Attention across the feature dimension
        self.feature_attention = nn.MultiheadAttention(
            embed_dim=feature_embed_dim,
            num_heads=feature_heads,
            dropout=dropout,
            batch_first=True
        )
        # Batch attention embedding dimension: a fixed value divisible by many head counts
        batch_embed_dim = 16
        batch_heads = 4  # 16 is divisible by 4
        # Project the input to the fixed batch_embed_dim
        self.batch_projection = nn.Linear(input_dim, batch_embed_dim)
        # Batch-wise attention
        self.batch_attention = nn.MultiheadAttention(
            embed_dim=batch_embed_dim,
            num_heads=batch_heads,
            dropout=dropout,
            batch_first=True
        )
        # Layer norms for the attention outputs
        self.feature_ln = nn.LayerNorm(feature_embed_dim)
        self.batch_ln = nn.LayerNorm(batch_embed_dim)
        # Integration layer - combines the original and new paths
        self.integration = nn.Linear(hidden_dim + 16 * input_dim + batch_embed_dim, hidden_dim)
        self.integration_ln = nn.LayerNorm(hidden_dim)
        self.dropout = nn.Dropout(dropout)
        self.relu = nn.ReLU()
        # Store dimensions for debugging
        self.input_dim = input_dim
        self.batch_embed_dim = batch_embed_dim
        self.batch_heads = batch_heads
        print(f"FeatureEncoder initialized with: input_dim={input_dim}, batch_embed_dim={batch_embed_dim}, batch_heads={batch_heads}")

    def forward(self, x):
        # x: [batch_size, input_dim]
        batch_size, input_dim = x.size()
        # Original path
        original = self.fc1(x)
        original = self.ln1(original)
        original = self.relu(original)
        original = self.dropout(original)
        original = self.fc2(original)
        original = self.ln2(original)
        original = self.relu(original)
        # Feature-wise projection path: treat each feature as a token
        features = x.view(batch_size, input_dim, 1)             # [batch_size, input_dim, 1]
        features_projected = self.feature_projection(features)  # [batch_size, input_dim, 16]
        # Feature-wise attention
        feature_attn_out, _ = self.feature_attention(
            features_projected,
            features_projected,
            features_projected
        )  # [batch_size, input_dim, 16]
        feature_attn_out = self.feature_ln(feature_attn_out + features_projected)  # Add & Norm
        # Project the input so its dimension is compatible with the batch attention
        x_proj = self.batch_projection(x)
        # Batch-wise attention
        batch_attn_out, _ = self.batch_attention(
            x_proj.unsqueeze(1),  # [batch_size, 1, batch_embed_dim]
            x_proj.unsqueeze(1),
            x_proj.unsqueeze(1)
        )  # [batch_size, 1, batch_embed_dim]
        batch_attn_out = self.batch_ln(batch_attn_out.squeeze(1) + x_proj)  # Add & Norm
        # Flatten the feature-attention output for concatenation
        feature_attn_flat = feature_attn_out.reshape(batch_size, -1)  # [batch_size, input_dim * 16]
        # Concatenate all processed representations
        combined = torch.cat([original, feature_attn_flat, batch_attn_out], dim=1)
        # Final integration
        output = self.integration(combined)
        output = self.integration_ln(output)
        output = self.relu(output)
        return output

# Self-Attention Block
class SelfAttention(nn.Module):
    def __init__(self, d_model, num_heads, dropout=0.1):
        super(SelfAttention, self).__init__()
        self.d_model = d_model
        self.num_heads = num_heads
        self.head_dim = d_model // num_heads
        assert self.head_dim * num_heads == d_model, "d_model must be divisible by num_heads"
        # Multi-head attention
        self.attention = nn.MultiheadAttention(
            embed_dim=d_model,
            num_heads=num_heads,
            dropout=dropout,
            batch_first=True
        )
        # Layer normalization and dropout
        self.layer_norm = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, attention_mask=None, key_padding_mask=None):
        # x: [batch_size, seq_len, d_model]
        # Self-attention with residual connection
        attn_output, _ = self.attention(
            query=x,
            key=x,
            value=x,
            attn_mask=attention_mask,
            key_padding_mask=key_padding_mask
        )
        # Add & Norm
        x = x + self.dropout(attn_output)
        x = self.layer_norm(x)
        return x

# Feed-Forward Block
class FeedForward(nn.Module):
    def __init__(self, d_model, d_ff, dropout=0.1):
        super(FeedForward, self).__init__()
        self.linear1 = nn.Linear(d_model, d_ff)
        self.linear2 = nn.Linear(d_ff, d_model)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(dropout)
        self.layer_norm = nn.LayerNorm(d_model)

    def forward(self, x):
        # x: [batch_size, seq_len, d_model]
        # FFN with residual connection
        ff_output = self.linear1(x)
        ff_output = self.relu(ff_output)
        ff_output = self.dropout(ff_output)
        ff_output = self.linear2(ff_output)
        # Add & Norm
        x = x + self.dropout(ff_output)
        x = self.layer_norm(x)
        return x


# Transformer Encoder Layer
class EncoderLayer(nn.Module):
    def __init__(self, d_model, num_heads, d_ff, dropout=0.1):
        super(EncoderLayer, self).__init__()
        self.self_attention = SelfAttention(d_model, num_heads, dropout)
        self.feed_forward = FeedForward(d_model, d_ff, dropout)

    def forward(self, x, attention_mask=None, key_padding_mask=None):
        # x: [batch_size, seq_len, d_model]
        # Self-attention block
        x = self.self_attention(x, attention_mask, key_padding_mask)
        # Feed-forward block
        x = self.feed_forward(x)
        return x

# LLM-Style Concrete Creep Transformer
class LLMConcreteModel(nn.Module):
    def __init__(
        self,
        feature_dim,
        d_model=128,
        num_layers=6,
        num_heads=8,
        d_ff=512,
        dropout=0.1,
        target_len=1,
        pooling_method='attention'  # Options: 'mean', 'max', 'attention', 'weighted', 'hybrid'
    ):
        super(LLMConcreteModel, self).__init__()
        # Model dimensions
        self.d_model = d_model
        self.target_len = target_len
        self.pooling_method = pooling_method
        # Input embedding layers
        self.creep_embedding = nn.Linear(1, d_model)
        self.time_embedding = nn.Linear(1, d_model)  # Optional time embedding, used when time_history is given
        self.feature_encoder = FeatureEncoder(feature_dim, d_model, dropout)
        # Positional encoding
        self.positional_encoding = PositionalEncoding(d_model)
        # Encoder layers
        self.encoder_layers = nn.ModuleList([
            EncoderLayer(d_model, num_heads, d_ff, dropout)
            for _ in range(num_layers)
        ])
        # Attention pooling layer for sequence tokens
        self.attention_pooling = nn.Sequential(
            nn.Linear(d_model, 1),
            nn.Softmax(dim=1)
        )
        # Weighted pooling parameters
        self.weighted_pool = nn.Linear(d_model, 1, bias=False)
        # Hybrid pooling integration layer
        self.hybrid_pooling_integration = nn.Linear(d_model * 3, d_model)
        # Output layers for prediction
        self.predictor = nn.Sequential(
            nn.Linear(d_model, d_model),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(d_model, target_len)
        )
        # Integration of static features with the sequence context
        self.feature_integration = nn.Linear(d_model * 2, d_model)
        # Layer normalization
        self.layer_norm = nn.LayerNorm(d_model)
        # Dropout
        self.dropout = nn.Dropout(dropout)

    def forward(self, creep_history, features, lengths, time_history=None):
        # creep_history: [batch_size, max_seq_len]
        # features: [batch_size, feature_dim]
        # lengths: [batch_size] - actual sequence lengths
        # time_history: [batch_size, max_seq_len] (optional)
        # Get the device from the input tensors to ensure consistent device usage
        device = creep_history.device
        batch_size, max_seq_len = creep_history.size()
        # Create the padding mask (True for padding, False for actual values);
        # padding is handled via key_padding_mask in the encoder layers
        padding_mask = torch.arange(max_seq_len, device=device).unsqueeze(0) >= lengths.unsqueeze(1)
        # Embed creep values
        creep_embedded = self.creep_embedding(creep_history.unsqueeze(-1))
        # Add time embedding if provided
        if time_history is not None and self.time_embedding is not None:
            time_embedded = self.time_embedding(time_history.unsqueeze(-1))
            # Combine creep and time embeddings
            embedded = creep_embedded + time_embedded
        else:
            embedded = creep_embedded
        # Add positional encoding
        embedded = self.positional_encoding(embedded)
        # Apply dropout
        embedded = self.dropout(embedded)
        # Process static feature data
        feature_encoded = self.feature_encoder(features)  # [batch_size, d_model]
        # Pass through the encoder layers
        encoder_output = embedded
        for layer in self.encoder_layers:
            encoder_output = layer(encoder_output, key_padding_mask=padding_mask)
        # USE ALL TOKENS: apply pooling to aggregate information from all tokens
        # Mask for real tokens (True for real tokens, False for padding)
        mask = ~padding_mask  # [batch_size, seq_len]
        if self.pooling_method == 'mean':
            # Mean pooling with mask to handle variable sequence lengths:
            # sum the non-padding token embeddings and divide by the true length
            mask_expanded = mask.unsqueeze(-1).float()  # [batch_size, seq_len, 1]
            context_vectors = torch.sum(encoder_output * mask_expanded, dim=1) / torch.sum(mask_expanded, dim=1)
        elif self.pooling_method == 'max':
            # Max pooling with mask: use -inf for padding tokens
            masked_output = encoder_output.clone()
            masked_output[padding_mask.unsqueeze(-1).expand_as(masked_output)] = float('-inf')
            context_vectors = torch.max(masked_output, dim=1)[0]
        elif self.pooling_method == 'attention':
            # Attention pooling: compute a weight for each token
            attn_weights = self.attention_pooling(encoder_output)  # [batch_size, seq_len, 1]
            # Zero out attention for padding tokens
            attn_weights = attn_weights.masked_fill(padding_mask.unsqueeze(-1), 0)
            # Renormalize weights to sum to 1 per sequence
            attn_weights = attn_weights / (attn_weights.sum(dim=1, keepdim=True) + 1e-8)
            # Weighted sum of token embeddings
            context_vectors = torch.sum(encoder_output * attn_weights, dim=1)
        elif self.pooling_method == 'weighted':
            # Weighted pooling over sequence positions
            position_weights = self.weighted_pool(encoder_output)  # [batch_size, seq_len, 1]
            # Softmax for normalized weights
            position_weights = torch.softmax(position_weights, dim=1)
            # Zero out weights for padding tokens
            position_weights = position_weights.masked_fill(padding_mask.unsqueeze(-1), 0)
            # Weighted sum of token embeddings
            context_vectors = torch.sum(encoder_output * position_weights, dim=1)
        elif self.pooling_method == 'hybrid':
            # Hybrid pooling: combine multiple pooling methods
            # 1. Mean pooling
            mask_expanded = mask.unsqueeze(-1).float()
            mean_vectors = torch.sum(encoder_output * mask_expanded, dim=1) / torch.sum(mask_expanded, dim=1)
            # 2. Attention pooling
            attn_weights = self.attention_pooling(encoder_output)
            attn_weights = attn_weights.masked_fill(padding_mask.unsqueeze(-1), 0)
            attn_weights = attn_weights / (attn_weights.sum(dim=1, keepdim=True) + 1e-8)
            attn_vectors = torch.sum(encoder_output * attn_weights, dim=1)
            # 3. Last-token pooling (the traditional approach)
            last_indices = (lengths - 1).clamp(min=0)
            batch_indices = torch.arange(batch_size, device=device)
            last_vectors = encoder_output[batch_indices, last_indices]
            # Combine all pooling methods with a learnable integration
            combined_vectors = torch.cat([mean_vectors, attn_vectors, last_vectors], dim=1)
            context_vectors = self.hybrid_pooling_integration(combined_vectors)
            context_vectors = torch.tanh(context_vectors)
        else:
            # Default: average of mean pooling and attention pooling
            mask_expanded = mask.unsqueeze(-1).float()
            mean_vectors = torch.sum(encoder_output * mask_expanded, dim=1) / torch.sum(mask_expanded, dim=1)
            attn_weights = self.attention_pooling(encoder_output)
            attn_weights = attn_weights.masked_fill(padding_mask.unsqueeze(-1), 0)
            attn_weights = attn_weights / (attn_weights.sum(dim=1, keepdim=True) + 1e-8)
            attn_vectors = torch.sum(encoder_output * attn_weights, dim=1)
            context_vectors = (mean_vectors + attn_vectors) / 2
        # Combine the sequence context with the static features
        combined = torch.cat([context_vectors, feature_encoded], dim=1)  # [batch_size, d_model*2]
        integrated = self.feature_integration(combined)  # [batch_size, d_model]
        integrated = torch.tanh(integrated)
        # Final layer normalization
        integrated = self.layer_norm(integrated)
        # Generate predictions
        predictions = self.predictor(integrated)  # [batch_size, target_len]
        return predictions
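
# Minimal instantiation sketch (hypothetical dimensions; the tuned values used
# for training live in main()):
#   model = LLMConcreteModel(feature_dim=10, d_model=128, num_layers=4,
#                            num_heads=4, d_ff=512, target_len=1,
#                            pooling_method='hybrid')
#   preds = model(creep_history, features, lengths, time_history)
#   # preds: [batch_size, target_len]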

# Utility to create a padding mask for variable-length sequences
def create_padding_mask(lengths, max_len):
    """
    Create a mask for padding tokens (True for padding, False for actual values).

    Args:
        lengths: Tensor of sequence lengths [batch_size]
        max_len: Maximum sequence length
    Returns:
        Padding mask [batch_size, max_len]
    """
    mask = torch.arange(max_len, device=lengths.device).unsqueeze(0) >= lengths.unsqueeze(1)
    return mask
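
# Example: create_padding_mask(torch.tensor([3, 1]), max_len=4) returns
#   tensor([[False, False, False,  True],
#           [False,  True,  True,  True]])
# i.e. True marks padded positions beyond each sequence's true length.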

# Train the model
def train_model(model, train_loader, optimizer, criterion, device, clip=1.0):
    model.train()
    epoch_loss = 0
    num_batches = 0
    for batch_idx, batch in enumerate(train_loader):
        try:
            if len(batch) == 5:  # With time data
                histories, targets, features, times, lengths = [item.to(device) for item in batch]
                # Forward pass
                optimizer.zero_grad()
                outputs = model(histories, features, lengths, times)
            else:  # Without time data
                histories, targets, features, lengths = [item.to(device) for item in batch]
                # Forward pass
                optimizer.zero_grad()
                outputs = model(histories, features, lengths)
            # Calculate loss
            loss = criterion(outputs, targets)
            # Backward pass
            loss.backward()
            # Clip gradients
            torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
            optimizer.step()
            epoch_loss += loss.item()
            num_batches += 1
        except Exception as e:
            print(f"Error in batch {batch_idx}: {str(e)}")
            continue
    return epoch_loss / max(1, num_batches)

# Evaluate the model
def evaluate_model(model, test_loader, criterion, device):
    model.eval()
    epoch_loss = 0
    num_batches = 0
    # For calculating MAPE and MAE
    all_targets = []
    all_outputs = []
    with torch.no_grad():
        for batch_idx, batch in enumerate(test_loader):
            try:
                if len(batch) == 5:  # With time data
                    histories, targets, features, times, lengths = [item.to(device) for item in batch]
                    outputs = model(histories, features, lengths, times)
                else:  # Without time data
                    histories, targets, features, lengths = [item.to(device) for item in batch]
                    outputs = model(histories, features, lengths)
                # Calculate loss
                loss = criterion(outputs, targets)
                epoch_loss += loss.item()
                num_batches += 1
                # Store targets and outputs for the MAPE calculation
                all_targets.append(targets.cpu())
                all_outputs.append(outputs.cpu())
            except Exception as e:
                print(f"Error in evaluation batch {batch_idx}: {str(e)}")
                continue
    # Calculate MAPE and MAE if we have data
    mape = None
    mae = None
    if all_targets and all_outputs:
        try:
            # Concatenate all batches
            all_targets = torch.cat(all_targets)
            all_outputs = torch.cat(all_outputs)
            # Mean Absolute Error
            mae = torch.abs(all_targets - all_outputs).mean().item()
            # MAPE, with a small epsilon to avoid division by zero
            epsilon = 1e-8
            abs_percentage_errors = torch.abs((all_targets - all_outputs) / (all_targets + epsilon)) * 100
            # Exclude entries where the target is very close to zero
            valid_indices = torch.abs(all_targets) > epsilon
            if valid_indices.sum() > 0:
                mape = abs_percentage_errors[valid_indices].mean().item()
            else:
                mape = float('nan')
        except Exception as e:
            print(f"Error calculating metrics: {str(e)}")
            mape = float('nan')
            mae = float('nan')
    return epoch_loss / max(1, num_batches), mape, mae

# Predict the next values from the full history
def predict_with_full_history(model, creep_history, features, creep_scaler, device, time_history=None):
    model.eval()
    with torch.no_grad():
        # Convert inputs to tensors
        creep_tensor = torch.FloatTensor(creep_history).unsqueeze(0).to(device)  # [1, seq_len]
        features_tensor = torch.FloatTensor(features).unsqueeze(0).to(device)    # [1, feature_dim]
        lengths = torch.tensor([len(creep_history)]).to(device)                  # [1]
        if time_history is not None:
            time_tensor = torch.FloatTensor(time_history).unsqueeze(0).to(device)  # [1, seq_len]
            predictions = model(creep_tensor, features_tensor, lengths, time_tensor)
        else:
            predictions = model(creep_tensor, features_tensor, lengths)
        # Convert predictions to numpy and denormalize
        predictions_np = predictions.cpu().numpy()[0]  # [target_len]
        predictions_denorm = creep_scaler.inverse_transform(
            predictions_np.reshape(-1, 1)
        ).flatten()
        return predictions_denorm
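
# Hypothetical extension (a sketch, not used by the training pipeline): the
# model predicts target_len steps at a time, so longer horizons can be covered
# autoregressively by re-normalizing each prediction and appending it to the
# history, LLM-style. Assumes target_len == 1 and, when time data is used,
# a fixed increment on the normalized (log1p) time axis.
def rollout_predictions(model, creep_history, features, creep_scaler, device,
                        n_steps, time_history=None, time_step=0.1):
    history = list(creep_history)
    times = list(time_history) if time_history is not None else None
    rollout = []
    for _ in range(n_steps):
        pred_denorm = predict_with_full_history(
            model, history, features, creep_scaler, device, times
        )
        rollout.extend(pred_denorm.tolist())
        # Re-normalize the prediction before feeding it back as history
        pred_norm = creep_scaler.transform(pred_denorm.reshape(-1, 1)).flatten()
        history.extend(pred_norm.tolist())
        if times is not None:
            # Extend the normalized time axis by the assumed fixed increment
            times.append(times[-1] + time_step)
    return np.array(rollout)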

# Visualize predictions for a test sample
def visualize_predictions(model, test_loader, creep_scaler, device, sample_idx=0,
                          save_path='llm_prediction_results.png'):
    # Find the batch that contains the requested sample
    for i, batch in enumerate(test_loader):
        if i == sample_idx // test_loader.batch_size:
            idx_in_batch = sample_idx % test_loader.batch_size
            if len(batch) == 5:  # With time data
                histories, targets, features, times, lengths = batch
                history = histories[idx_in_batch, :lengths[idx_in_batch]].numpy()
                time_history = times[idx_in_batch, :lengths[idx_in_batch]].numpy()
                feature = features[idx_in_batch].numpy()
                target = targets[idx_in_batch].numpy()
                # Get predictions
                predictions = predict_with_full_history(
                    model, history, feature, creep_scaler, device, time_history
                )
                # Recover actual time values (reverse of log1p)
                time_values = np.expm1(time_history)
            else:  # Without time data
                histories, targets, features, lengths = batch
                history = histories[idx_in_batch, :lengths[idx_in_batch]].numpy()
                feature = features[idx_in_batch].numpy()
                target = targets[idx_in_batch].numpy()
                # Get predictions
                predictions = predict_with_full_history(
                    model, history, feature, creep_scaler, device
                )
                # Create sequential time steps for plotting
                time_values = np.arange(1, len(history) + 1)
            # Denormalize target and history
            target_denorm = creep_scaler.inverse_transform(
                target.reshape(-1, 1)
            ).flatten()
            history_denorm = creep_scaler.inverse_transform(
                history.reshape(-1, 1)
            ).flatten()
            # Time axes for predictions and targets: extrapolate from the last
            # observed time point using the most recent time step
            history_time = time_values
            if len(time_values) > 0:
                time_step = 1.0
                if len(time_values) > 1:
                    # Estimate the time step from the last two points
                    time_step = time_values[-1] - time_values[-2]
                # Generate future time points for predictions/targets
                target_time = np.array([time_values[-1] + time_step * (i + 1) for i in range(len(target))])
                pred_time = np.array([time_values[-1] + time_step * (i + 1) for i in range(len(predictions))])
            else:
                # No time data: use sequential indices
                target_time = np.arange(len(history) + 1, len(history) + len(target) + 1)
                pred_time = np.arange(len(history) + 1, len(history) + len(predictions) + 1)
            # Plot results
            plt.figure(figsize=(10, 6))
            plt.plot(history_time, history_denorm, 'b-', label='Historical Data')
            plt.plot(target_time, target_denorm, 'g-', label='Actual Future')
            plt.plot(pred_time, predictions, 'r--', label='Predictions')
            plt.legend()
            plt.title('Concrete Creep Prediction with Full History')
            plt.xlabel('Time')
            plt.ylabel('Creep Value')
            plt.grid(True)
            plt.savefig(save_path)
            plt.close()
            return history_denorm, target_denorm, predictions
    print("Sample index out of range")
    return None, None, None

# Utility function to examine data structure
def examine_data_structure():
    """
    Examine the structure of the creep and feature files
    to help with debugging and data understanding.
    """
    print("Examining data structure...")
    # Load the creep file
    try:
        df_creep = pd.read_excel(EXCEL_CREEP_FILE)
        print(f"\nCreep file shape: {df_creep.shape}")
        print(f"Format: {df_creep.shape[0]} time points (rows) × {df_creep.shape[1]} samples (columns)")
        # Check whether the first column holds time values
        first_col = df_creep.columns[0]
        if first_col in ['time', 'Time', 'TIME', 't', 'T', 'day', 'Day', 'DAY', 'd', 'D'] or str(first_col).lower().startswith(('time', 'day')):
            print(f"First column '{first_col}' recognized as time values")
            print(f"Time values sample: {df_creep.iloc[:5, 0].tolist()}")
            print("Actual samples start from column 1")
        else:
            print(f"First column '{first_col}' not recognized as time; treating rows as time points")
            print("Assuming all columns are samples")
        # Show a sample of the data
        print("First 5 rows (time points) and 3 columns (samples):")
        print(df_creep.iloc[:5, :3])
        # Count NaN values
        nan_count = df_creep.isna().sum().sum()
        print(f"Total NaN values: {nan_count}")
    except Exception as e:
        print(f"Error examining creep file: {str(e)}")
    # Load the feature file
    try:
        df_features = pd.read_excel(EXCEL_FEATURE_FILE, sheet_name='Sheet2')
        print(f"\nFeature file shape: {df_features.shape}")
        print(f"Feature file columns: {df_features.columns.tolist()}")
        print("Feature sample (first 3 rows):")
        print(df_features.iloc[:3])
        # Check that the row count matches the creep file's sample count
        if df_features.shape[0] != df_creep.shape[1]:
            print(f"WARNING: Feature count ({df_features.shape[0]} rows) does not match sample count in creep file ({df_creep.shape[1]} columns)")
        else:
            print(f"Feature rows ({df_features.shape[0]}) match the sample count in the creep file ({df_creep.shape[1]} columns)")
    except Exception as e:
        print(f"Error examining feature file: {str(e)}")
    print("\nData examination complete.")

# Calculate detailed performance metrics on the test data
def calculate_detailed_metrics(model, test_loader, creep_scaler, device):
    """
    Calculate detailed performance metrics on the test dataset.
    Returns actual and predicted values in their original scale along with metrics.
    """
    model.eval()
    all_targets_norm = []
    all_outputs_norm = []
    all_targets_denorm = []
    all_outputs_denorm = []
    with torch.no_grad():
        for batch_idx, batch in enumerate(test_loader):
            try:
                if len(batch) == 5:  # With time data
                    histories, targets, features, times, lengths = [item.to(device) for item in batch]
                    outputs = model(histories, features, lengths, times)
                else:  # Without time data
                    histories, targets, features, lengths = [item.to(device) for item in batch]
                    outputs = model(histories, features, lengths)
                # Store normalized values
                all_targets_norm.append(targets.cpu())
                all_outputs_norm.append(outputs.cpu())
                # Denormalize for metrics in the original scale
                for i in range(len(targets)):
                    target = targets[i].cpu().numpy()
                    output = outputs[i].cpu().numpy()
                    # Reshape for inverse_transform
                    target_denorm = creep_scaler.inverse_transform(target.reshape(-1, 1)).flatten()
                    output_denorm = creep_scaler.inverse_transform(output.reshape(-1, 1)).flatten()
                    all_targets_denorm.extend(target_denorm)
                    all_outputs_denorm.extend(output_denorm)
            except Exception as e:
                print(f"Error in batch {batch_idx}: {str(e)}")
                continue
    # Convert to numpy arrays
    all_targets_denorm = np.array(all_targets_denorm)
    all_outputs_denorm = np.array(all_outputs_denorm)
    # Calculate metrics on the denormalized data
    mse = np.mean((all_targets_denorm - all_outputs_denorm) ** 2)
    rmse = np.sqrt(mse)
    mae = np.mean(np.abs(all_targets_denorm - all_outputs_denorm))
    # MAPE, avoiding division by zero
    epsilon = 1e-8
    mask = np.abs(all_targets_denorm) > epsilon
    mape = np.mean(np.abs((all_targets_denorm[mask] - all_outputs_denorm[mask]) / all_targets_denorm[mask])) * 100
    # R²
    ss_total = np.sum((all_targets_denorm - np.mean(all_targets_denorm)) ** 2)
    ss_residual = np.sum((all_targets_denorm - all_outputs_denorm) ** 2)
    r_squared = 1 - (ss_residual / ss_total) if ss_total > 0 else 0
    # Print detailed metrics
    print("\n===== Detailed Performance Metrics =====")
    print(f"MSE: {mse:.6f}")
    print(f"RMSE: {rmse:.6f}")
    print(f"MAE: {mae:.6f}")
    print(f"MAPE: {mape:.2f}%")
    print(f"R²: {r_squared:.6f}")
    return {
        "targets": all_targets_denorm,
        "predictions": all_outputs_denorm,
        "mse": mse,
        "rmse": rmse,
        "mae": mae,
        "mape": mape,
        "r_squared": r_squared
    }

# Main function
def main():
    print("\n" + "="*80)
    print("CONCRETE CREEP PREDICTION MODEL WITH LLM-STYLE FULL HISTORY PROCESSING")
    print("="*80 + "\n")
    # Parameters - updated with Bayesian optimization results
    TARGET_LEN = 1                          # Length of prediction horizon
    D_MODEL = 192                           # Model dimension (was 128)
    NUM_LAYERS = 4                          # Number of transformer layers (was 6)
    NUM_HEADS = 4                           # Number of attention heads (was 8)
    BATCH_SIZE = 128                        # Batch size (was 200)
    LEARNING_RATE = 0.0001897931493931044   # Learning rate (was 0.001)
    WEIGHT_DECAY = 5.552376124031933e-06    # Weight decay (was 1e-5)
    DROPOUT = 0.056999223340150215          # Dropout rate for model initialization
    NUM_EPOCHS = 200
    POOLING_METHOD = 'hybrid'               # Hybrid pooling combines multiple approaches
    # Set device
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Using device: {device}")
    # Set memory handling for GPU if available
    if device.type == 'cuda':
        print("Managing GPU memory settings...")
        # Empty the cache to start fresh
        torch.cuda.empty_cache()
        # Report GPU memory info
        if hasattr(torch.cuda, 'get_device_properties'):
            prop = torch.cuda.get_device_properties(device)
            print(f"GPU: {prop.name} with {prop.total_memory / 1024**3:.2f} GB memory")
    try:
        # Examine the data structure first for debugging
        examine_data_structure()
        # Prepare data
        print("\nPreparing data...")
        train_dataset, val_dataset, test_dataset, feature_scaler, creep_scaler = prepare_llm_data(
            target_len=TARGET_LEN
        )
        print(f"Training samples: {len(train_dataset)}")
        print(f"Validation samples: {len(val_dataset)}")
        print(f"Testing samples: {len(test_dataset)}")
        # Adjust the batch size if needed
        adjusted_batch_size = min(BATCH_SIZE, len(train_dataset), len(val_dataset), len(test_dataset))
        if adjusted_batch_size < BATCH_SIZE:
            print(f"Adjusting batch size from {BATCH_SIZE} to {adjusted_batch_size} due to small dataset")
            BATCH_SIZE = adjusted_batch_size
        # Create data loaders
        print(f"Creating dataloaders with batch size {BATCH_SIZE}...")
        train_loader = DataLoader(
            train_dataset,
            batch_size=BATCH_SIZE,
            shuffle=True,
            collate_fn=collate_fn,
            drop_last=False,
            pin_memory=(device.type == 'cuda')  # Faster data transfer to GPU
        )
        val_loader = DataLoader(
            val_dataset,
            batch_size=BATCH_SIZE,
            shuffle=False,
            collate_fn=collate_fn,
            drop_last=False,
            pin_memory=(device.type == 'cuda')
        )
        test_loader = DataLoader(
            test_dataset,
            batch_size=BATCH_SIZE,
            shuffle=False,
            collate_fn=collate_fn,
            drop_last=False,
            pin_memory=(device.type == 'cuda')
        )
        # Get the feature dimension
        feature_dim = train_dataset[0][2].shape[0]
        print(f"Feature dimension: {feature_dim}")
        # Initialize the model
        print("\nInitializing model...")
        print(f"Using pooling method: {POOLING_METHOD}")
        model = LLMConcreteModel(
            feature_dim=feature_dim,
            d_model=D_MODEL,
            num_layers=NUM_LAYERS,
            num_heads=NUM_HEADS,
            d_ff=D_MODEL * 4,
            dropout=DROPOUT,  # Using the optimized dropout value
            target_len=TARGET_LEN,
            pooling_method=POOLING_METHOD
        )
        # Move the model to the device
        model = model.to(device)
        print(f"Model parameters: {sum(p.numel() for p in model.parameters() if p.requires_grad)}")
        # Define optimizer and loss
        optimizer = optim.AdamW(
            model.parameters(),
            lr=LEARNING_RATE,
            weight_decay=WEIGHT_DECAY
        )
        criterion = nn.MSELoss()
        # Learning rate scheduler
        scheduler = optim.lr_scheduler.ReduceLROnPlateau(
            optimizer,
            mode='min',
            factor=0.5,
            patience=5,
            verbose=True
        )
        # Training loop
        print("\nStarting training...")
        train_losses = []
        val_losses = []
        val_mapes = []  # Track MAPE values
        best_val_loss = float('inf')
        for epoch in range(NUM_EPOCHS):
            try:
                # Train
                train_loss = train_model(model, train_loader, optimizer, criterion, device)
                train_losses.append(train_loss)
                # Evaluate
                val_loss, val_mape, val_mae = evaluate_model(model, val_loader, criterion, device)
                val_losses.append(val_loss)
                # Guard against None metrics (e.g. when no batch evaluated successfully)
                val_mape = val_mape if val_mape is not None else float('nan')
                val_mae = val_mae if val_mae is not None else float('nan')
                val_mapes.append(val_mape)
                # Update the learning rate
                scheduler.step(val_loss)
                # Print progress
                print(f"Epoch {epoch+1}/{NUM_EPOCHS}, Train Loss: {train_loss:.6f}, Val Loss: {val_loss:.6f}, MAPE: {val_mape:.2f}%, MAE: {val_mae:.6f}")
                # Save the best model
                if val_loss < best_val_loss:
                    best_val_loss = val_loss
                    torch.save(model.state_dict(), 'best_llm_model.pt')
                    print(f"Best model saved (Epoch {epoch+1})")
                # Periodically clear the GPU cache
                if device.type == 'cuda' and (epoch + 1) % 5 == 0:
                    torch.cuda.empty_cache()
            except RuntimeError as e:
                if 'out of memory' in str(e).lower():
                    print(f"WARNING: GPU out of memory at epoch {epoch+1}. Attempting to recover...")
                    if device.type == 'cuda':
                        torch.cuda.empty_cache()
                    # Try reducing the batch size
                    if BATCH_SIZE > 1:
                        BATCH_SIZE = BATCH_SIZE // 2
                        print(f"Reducing batch size to {BATCH_SIZE}")
                        # Recreate the dataloaders with the new batch size
                        train_loader = DataLoader(
                            train_dataset,
                            batch_size=BATCH_SIZE,
                            shuffle=True,
                            collate_fn=collate_fn,
                            drop_last=False,
                            pin_memory=True
                        )
                        val_loader = DataLoader(
                            val_dataset,
                            batch_size=BATCH_SIZE,
                            shuffle=False,
                            collate_fn=collate_fn,
                            drop_last=False,
                            pin_memory=True
                        )
                        test_loader = DataLoader(
                            test_dataset,
                            batch_size=BATCH_SIZE,
                            shuffle=False,
                            collate_fn=collate_fn,
                            drop_last=False,
                            pin_memory=True
                        )
                        # Continue with the reduced batch size
                        continue
                    else:
                        print("ERROR: Batch size already at minimum. Cannot recover.")
                        break
                else:
                    print(f"ERROR during training: {str(e)}")
                    break
        # Save the final model from the last epoch
        torch.save(model.state_dict(), 'final_llm_model.pt')
        print(f"Final model saved at epoch {NUM_EPOCHS}")
        # Plot loss curves with MAPE
        print("\nPlotting loss curves and MAPE...")
        # Create a figure with two subplots
        fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(10, 12))
        # Plot losses on the first subplot
        ax1.plot(train_losses, label='Training Loss')
        ax1.plot(val_losses, label='Validation Loss')
        ax1.set_xlabel('Epoch')
        ax1.set_ylabel('Loss (MSE)')
        ax1.set_title('Training and Validation Loss')
        ax1.legend()
        ax1.grid(True)
        # Plot MAPE on the second subplot
        ax2.plot(val_mapes, 'r-', label='Validation MAPE')
        ax2.set_xlabel('Epoch')
        ax2.set_ylabel('MAPE (%)')
        ax2.set_title('Validation Mean Absolute Percentage Error')
        ax2.legend()
        ax2.grid(True)
        plt.tight_layout()
        plt.savefig('llm_loss_and_mape_curves.png')
        plt.close()
        # Also save the traditional loss curve plot
        plt.figure(figsize=(10, 6))
        plt.plot(train_losses, label='Training Loss')
        plt.plot(val_losses, label='Validation Loss')
        plt.xlabel('Epoch')
        plt.ylabel('Loss')
        plt.title('Training and Validation Loss (LLM Model)')
        plt.legend()
        plt.grid(True)
        plt.savefig('llm_loss_curves.png')
        plt.close()
        # ==================================================
        # COMPREHENSIVE EVALUATION ON TEST SET
        # ==================================================
        print("\n" + "="*80)
        print("COMPREHENSIVE EVALUATION ON TEST SET")
        print("="*80)
        # Load the final model
        print("\nEvaluating final model...")
        model.load_state_dict(torch.load('final_llm_model.pt', map_location=device))
        # Calculate metrics for the final model
        final_test_loss, final_test_mape, final_test_mae = evaluate_model(model, test_loader, criterion, device)
        print(f"Final model metrics - MSE: {final_test_loss:.6f}, MAPE: {final_test_mape:.2f}%, MAE: {final_test_mae:.6f}")
        # Calculate detailed metrics for the final model
        print("\nDetailed metrics for final model:")
        final_metrics = calculate_detailed_metrics(model, test_loader, creep_scaler, device)
        # Visualize predictions for the final model; each sample gets its own
        # plot file via the save_path argument
        for sample_idx in range(min(3, len(test_loader.dataset))):
            history, target, predictions = visualize_predictions(
                model, test_loader, creep_scaler, device, sample_idx=sample_idx,
                save_path=f'final_model_prediction_sample_{sample_idx+1}.png'
            )
            if history is not None:
                print(f"\nSample {sample_idx+1} (Final Model):")
                print(f"Target values: {target}")
                print(f"Predictions: {predictions}")
        # Load the best model
        print("\nEvaluating best model...")
        model.load_state_dict(torch.load('best_llm_model.pt', map_location=device))
        # Calculate metrics for the best model
        best_test_loss, best_test_mape, best_test_mae = evaluate_model(model, test_loader, criterion, device)
        print(f"Best model metrics - MSE: {best_test_loss:.6f}, MAPE: {best_test_mape:.2f}%, MAE: {best_test_mae:.6f}")
        # Calculate detailed metrics for the best model
        print("\nDetailed metrics for best model:")
        best_metrics = calculate_detailed_metrics(model, test_loader, creep_scaler, device)
        # Visualize predictions for the best model
        for sample_idx in range(min(3, len(test_loader.dataset))):
            history, target, predictions = visualize_predictions(
                model, test_loader, creep_scaler, device, sample_idx=sample_idx,
                save_path=f'best_model_prediction_sample_{sample_idx+1}.png'
            )
            if history is not None:
                print(f"\nSample {sample_idx+1} (Best Model):")
                print(f"Target values: {target}")
                print(f"Predictions: {predictions}")
        # Compare the models
        print("\n" + "="*50)
        print("MODEL COMPARISON")
        print("="*50)
        print("          Final Model    Best Model")
        print(f"MSE:      {final_metrics['mse']:.6f}     {best_metrics['mse']:.6f}")
        print(f"RMSE:     {final_metrics['rmse']:.6f}     {best_metrics['rmse']:.6f}")
        print(f"MAE:      {final_metrics['mae']:.6f}     {best_metrics['mae']:.6f}")
        print(f"MAPE:     {final_metrics['mape']:.2f}%        {best_metrics['mape']:.2f}%")
        print(f"R²:       {final_metrics['r_squared']:.6f}     {best_metrics['r_squared']:.6f}")
        print("\nTraining and evaluation complete!")
    except Exception as e:
        print(f"\nERROR: {str(e)}")
        import traceback
        traceback.print_exc()
        print("\nExiting due to error.")


if __name__ == "__main__":
    main()