# MethSurvPredictor / methane.py
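# Trains a small feed-forward network to predict overall survival time
# (OS.time) from the feature columns in data.csv, checkpoints the best model
# by test loss, and writes a set of diagnostic plots.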
import os
import copy

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.cm as cm
from scipy.stats import pearsonr
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
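
# Resolve paths relative to this script so data.csv and the output plots land
# next to methane.py regardless of the launch directory.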
script_path = os.path.abspath(__file__)
script_dir = os.path.dirname(script_path)
os.chdir(script_dir)
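
# data.csv is expected to hold one row per sample: feature columns (presumably
# methylation values, given the project name) plus an OS.time column, the
# overall-survival regression target. The prints below are a quick sanity
# check for missing or non-finite values.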
data = pd.read_csv('data.csv')
X = data.drop(columns=['OS.time']).values
y = data['OS.time'].values
print('NaNs in X / y:', np.isnan(X).sum(), np.isnan(y).sum())
print('Infs in X / y:', np.isinf(X).sum(), np.isinf(y).sum())
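
# Standardize features. Note the scaler is fit on the full matrix before the
# train/test split, so test-set statistics leak into the scaling.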
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32).view(-1, 1)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.float32).view(-1, 1)
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)
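
# Feed-forward regressor: two hidden layers of 100 ReLU units, each followed
# by 50% dropout, and a single linear output for predicted survival time.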
class SimpleNN(nn.Module):
    def __init__(self, input_dim):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, 100)
        self.dropout1 = nn.Dropout(0.5)
        self.fc2 = nn.Linear(100, 100)
        self.dropout2 = nn.Dropout(0.5)
        self.fc3 = nn.Linear(100, 1)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = self.dropout1(x)
        x = torch.relu(self.fc2(x))
        x = self.dropout2(x)
        x = self.fc3(x)
        return x


def weights_init(m):
    # He (Kaiming) initialization suits the ReLU activations above.
    if isinstance(m, nn.Linear):
        nn.init.kaiming_uniform_(m.weight)
        nn.init.zeros_(m.bias)
model = SimpleNN(X_train.shape[1])
model.apply(weights_init)
criterion = nn.MSELoss()
optimizer = optim.SGD(model.parameters(), lr=0.0001, momentum=0.9)
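
# Plain SGD with momentum and a small learning rate; gradient norms are
# clipped to 1.0 inside the training loop to keep updates stable.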
best_test_loss = float('inf')
best_model_state = None
num_epochs = 10000
train_losses = []
test_losses = []
all_predictions = []
gradients = []
r2_scores = []
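
# Training loop: per epoch, record mean absolute gradients per batch, clip
# gradient norms, then evaluate on the held-out split and checkpoint whenever
# the test loss improves.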
for epoch in range(num_epochs):
    model.train()
    train_loss = 0.0
    epoch_gradients = []
    for inputs, targets in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        # Record raw gradient magnitudes before clipping.
        for param in model.parameters():
            epoch_gradients.append(param.grad.abs().mean().item())
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        train_loss += loss.item()
    train_loss /= len(train_loader)
    train_losses.append(train_loss)
    gradients.append(epoch_gradients)
    print(f'Epoch {epoch+1}, Train Loss: {train_loss}')

    model.eval()
    test_loss = 0.0
    predictions = []
    with torch.no_grad():
        for inputs, targets in test_loader:
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            test_loss += loss.item()
            predictions.append(outputs.numpy())
    test_loss /= len(test_loader)
    test_losses.append(test_loss)
    all_predictions.append(predictions)
    predictions_flat = np.concatenate(predictions).flatten()
    r2 = r2_score(y_test, predictions_flat)
    r2_scores.append(r2)
    print(f'Epoch {epoch+1}, R^2: {r2}')

    if test_loss < best_test_loss:
        best_test_loss = test_loss
        # state_dict() returns live references; deep-copy so the snapshot
        # does not change as training continues.
        best_model_state = copy.deepcopy(model.state_dict())
        torch.save(best_model_state, 'best_model.pth')
        print(f'Saved new best model at epoch {epoch+1} with test loss {test_loss}')
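
# The per-parameter gradient magnitudes collected above are never plotted in
# the original script; the block below is a minimal sketch of one way to
# visualize them (mean over all parameters and batches per epoch).
mean_grads = [float(np.mean(g)) for g in gradients]
plt.figure(figsize=(10, 5))
plt.plot(range(1, num_epochs + 1), mean_grads, label='Mean |grad|')
plt.xlabel('Epoch')
plt.ylabel('Mean absolute gradient')
plt.title('Gradient Magnitude over Epochs')
plt.legend()
plt.savefig('gradient_magnitude.png')
plt.close()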
plt.figure(figsize=(10, 5))
plt.plot(range(1, num_epochs + 1), train_losses, label='Train Loss')
plt.plot(range(1, num_epochs + 1), test_losses, label='Test Loss')
window_size = 50
train_losses_ma = pd.Series(train_losses).rolling(window=window_size).mean()
test_losses_ma = pd.Series(test_losses).rolling(window=window_size).mean()
plt.plot(range(1, num_epochs + 1), train_losses_ma, label='Train Loss (MA)', linestyle='--')
plt.plot(range(1, num_epochs + 1), test_losses_ma, label='Test Loss (MA)', linestyle='--')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('Train and Test Loss with Moving Average')
plt.legend()
plt.savefig('train_test_loss.png')
plt.close()
# Concatenate per-batch outputs; batches can differ in size, so np.array on
# the raw list of arrays would fail.
final_predictions = np.concatenate(all_predictions[-1]).flatten()
actuals = y_test_tensor.numpy().flatten()
correlation, p_value = pearsonr(actuals, final_predictions)
print(f'Pearson Correlation: {correlation}')
print(f'P-value: {p_value}')
plt.figure(figsize=(10, 5))
plt.scatter(actuals, final_predictions, color='blue', label=f'Predictions vs Actuals (r={correlation:.2f}, p={p_value:.2g})')
plt.plot([min(actuals), max(actuals)], [min(actuals), max(actuals)], color='red', linestyle='--', label='Ideal Fit')
plt.xlabel('Actual OS.time')
plt.ylabel('Predicted OS.time')
plt.title('Predictions vs Actuals')
plt.legend()
plt.savefig('predictions_vs_actuals.png')
plt.close()
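
# Residuals (predicted minus actual) for the final epoch's predictions.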
errors = final_predictions - actuals
plt.figure(figsize=(10, 5))
plt.hist(errors, bins=30, color='purple', alpha=0.7)
plt.xlabel('Prediction Error')
plt.ylabel('Frequency')
plt.title('Error Distribution')
plt.savefig('error_distribution.png')
plt.close()
actuals = y_test_tensor.numpy().flatten()
colors = cm.viridis(np.linspace(0, 1, num_epochs))
plt.figure(figsize=(10, 5))
plt.plot(actuals, label='Actual Values', color='b', marker='o', linestyle='-')
for i in range(0, num_epochs, max(1, num_epochs // 100)):
    predictions = np.concatenate(all_predictions[i]).flatten()
    plt.plot(predictions, label=f'Epoch {i+1}', color=colors[i], linestyle='--')
plt.xlabel('Sample Index')
plt.ylabel('OS.time')
plt.title('Actual vs Predicted Values Over Time')
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
plt.savefig('actual_vs_predicted_over_time.png')
plt.close()
# Iterate named children so the plots are labeled fc1/fc2/fc3 rather than raw
# child indices (which would skip numbers because of the dropout layers).
for name, layer in model.named_children():
    if isinstance(layer, nn.Linear):
        plt.figure(figsize=(10, 5))
        plt.hist(layer.weight.detach().numpy().flatten(), bins=30, alpha=0.6, color='blue')
        plt.xlabel(f'{name} Weights')
        plt.ylabel('Frequency')
        plt.title(f'Weight Distribution of {name}')
        plt.savefig(f'{name}_weight_distribution.png')
        plt.close()
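
# Proxy importance per input feature: column-wise sum of absolute first-layer
# weights. This reflects the last-epoch model, not the saved best checkpoint.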
importances = np.abs(model.fc1.weight.detach().numpy()).sum(axis=0)
indices = np.argsort(importances)
plt.figure(figsize=(10, 5))
plt.barh(range(X_train.shape[1]), importances[indices], align='center')
plt.yticks(range(X_train.shape[1]), indices)  # bars are sorted, so label them with the original feature indices
plt.xlabel('Importance')
plt.ylabel('Feature Index')
plt.title('Feature Importances in the First Layer')
plt.savefig('feature_importances.png')
plt.close()
for name, layer in model.named_children():
    if isinstance(layer, nn.Linear):
        plt.figure(figsize=(10, 5))
        plt.imshow(layer.weight.detach().numpy(), aspect='auto', cmap='viridis')
        plt.colorbar()
        plt.title(f'Weight Heatmap of {name}')
        plt.xlabel('Input Features')
        plt.ylabel('Neurons')
        plt.savefig(f'{name}_weight_heatmap.png')
        plt.close()
plt.figure(figsize=(10, 5))
plt.plot(range(1, num_epochs + 1), r2_scores, label='R^2 Score')
r2_scores_ma = pd.Series(r2_scores).rolling(window=window_size).mean()
plt.plot(range(1, num_epochs + 1), r2_scores_ma, label='R^2 Score (MA)', linestyle='--')
plt.xlabel('Epoch')
plt.ylabel('R^2 Score')
plt.title('R^2 Score over Epochs')
plt.legend()
plt.savefig('r2_over_epochs.png')
plt.close()
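
# --- Usage sketch: reloading the best checkpoint for inference ---
# A minimal example, not part of the original pipeline. It assumes new samples
# arrive as a NumPy array with the same column order as the training features;
# `X_new` below is hypothetical.
def predict_os_time(X_new):
    inference_model = SimpleNN(X_new.shape[1])
    inference_model.load_state_dict(torch.load('best_model.pth'))
    inference_model.eval()  # disable dropout for deterministic predictions
    with torch.no_grad():
        inputs = torch.tensor(scaler.transform(X_new), dtype=torch.float32)
        return inference_model(inputs).numpy().flatten()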