# MethSurvPredictor / methane.py
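# Trains a small feed-forward network to predict overall survival time
# (OS.time) from the feature columns in data.csv, checkpoints the best model
# by test loss, and writes a set of diagnostic plots.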
import os
import copy

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.cm as cm
from scipy.stats import pearsonr
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
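
# Resolve paths relative to this script so data.csv and the output plots land
# next to methane.py regardless of the launch directory.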
script_path = os.path.abspath(__file__)
script_dir = os.path.dirname(script_path)
os.chdir(script_dir)
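
# data.csv is expected to hold one row per sample: feature columns (presumably
# methylation values, given the project name) plus an OS.time column, the
# overall-survival regression target. The prints below are a quick sanity
# check for missing or non-finite values.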
data = pd.read_csv('data.csv')
X = data.drop(columns=['OS.time']).values
y = data['OS.time'].values
print('NaNs in X / y:', np.isnan(X).sum(), np.isnan(y).sum())
print('Infs in X / y:', np.isinf(X).sum(), np.isinf(y).sum())
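
# Standardize features. Note the scaler is fit on the full matrix before the
# train/test split, so test-set statistics leak into the scaling.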
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32).view(-1, 1)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.float32).view(-1, 1)
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)
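
# Feed-forward regressor: two hidden layers of 100 ReLU units, each followed
# by 50% dropout, and a single linear output for predicted survival time.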
class SimpleNN(nn.Module):
    def __init__(self, input_dim):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, 100)
        self.dropout1 = nn.Dropout(0.5)
        self.fc2 = nn.Linear(100, 100)
        self.dropout2 = nn.Dropout(0.5)
        self.fc3 = nn.Linear(100, 1)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = self.dropout1(x)
        x = torch.relu(self.fc2(x))
        x = self.dropout2(x)
        x = self.fc3(x)
        return x


def weights_init(m):
    # He (Kaiming) initialization suits the ReLU activations above.
    if isinstance(m, nn.Linear):
        nn.init.kaiming_uniform_(m.weight)
        nn.init.zeros_(m.bias)
model = SimpleNN(X_train.shape[1])
model.apply(weights_init)
criterion = nn.MSELoss()
optimizer = optim.SGD(model.parameters(), lr=0.0001, momentum=0.9)
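
# Plain SGD with momentum and a small learning rate; gradient norms are
# clipped to 1.0 inside the training loop to keep updates stable.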
best_test_loss = float('inf')
best_model_state = None
num_epochs = 10000
train_losses = []
test_losses = []
all_predictions = []
gradients = []
r2_scores = []
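
# Training loop: per epoch, record mean absolute gradients per batch, clip
# gradient norms, then evaluate on the held-out split and checkpoint whenever
# the test loss improves.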
for epoch in range(num_epochs):
    model.train()
    train_loss = 0.0
    epoch_gradients = []
    for inputs, targets in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        # Record raw gradient magnitudes before clipping.
        for param in model.parameters():
            epoch_gradients.append(param.grad.abs().mean().item())
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        train_loss += loss.item()
    train_loss /= len(train_loader)
    train_losses.append(train_loss)
    gradients.append(epoch_gradients)
    print(f'Epoch {epoch+1}, Train Loss: {train_loss}')

    model.eval()
    test_loss = 0.0
    predictions = []
    with torch.no_grad():
        for inputs, targets in test_loader:
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            test_loss += loss.item()
            predictions.append(outputs.numpy())
    test_loss /= len(test_loader)
    test_losses.append(test_loss)
    all_predictions.append(predictions)
    predictions_flat = np.concatenate(predictions).flatten()
    r2 = r2_score(y_test, predictions_flat)
    r2_scores.append(r2)
    print(f'Epoch {epoch+1}, R^2: {r2}')

    if test_loss < best_test_loss:
        best_test_loss = test_loss
        # state_dict() returns live references; deep-copy so the snapshot
        # does not change as training continues.
        best_model_state = copy.deepcopy(model.state_dict())
        torch.save(best_model_state, 'best_model.pth')
        print(f'Saved new best model at epoch {epoch+1} with test loss {test_loss}')
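
# The per-parameter gradient magnitudes collected above are never plotted in
# the original script; the block below is a minimal sketch of one way to
# visualize them (mean over all parameters and batches per epoch).
mean_grads = [float(np.mean(g)) for g in gradients]
plt.figure(figsize=(10, 5))
plt.plot(range(1, num_epochs + 1), mean_grads, label='Mean |grad|')
plt.xlabel('Epoch')
plt.ylabel('Mean absolute gradient')
plt.title('Gradient Magnitude over Epochs')
plt.legend()
plt.savefig('gradient_magnitude.png')
plt.close()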
plt.figure(figsize=(10, 5))
plt.plot(range(1, num_epochs + 1), train_losses, label='Train Loss')
plt.plot(range(1, num_epochs + 1), test_losses, label='Test Loss')
window_size = 50
train_losses_ma = pd.Series(train_losses).rolling(window=window_size).mean()
test_losses_ma = pd.Series(test_losses).rolling(window=window_size).mean()
plt.plot(range(1, num_epochs + 1), train_losses_ma, label='Train Loss (MA)', linestyle='--')
plt.plot(range(1, num_epochs + 1), test_losses_ma, label='Test Loss (MA)', linestyle='--')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('Train and Test Loss with Moving Average')
plt.legend()
plt.savefig('train_test_loss.png')
plt.close()
# Concatenate per-batch outputs; batches can differ in size, so np.array on
# the raw list of arrays would fail.
final_predictions = np.concatenate(all_predictions[-1]).flatten()
actuals = y_test_tensor.numpy().flatten()
correlation, p_value = pearsonr(actuals, final_predictions)
print(f'Pearson Correlation: {correlation}')
print(f'P-value: {p_value}')
plt.figure(figsize=(10, 5))
plt.scatter(actuals, final_predictions, color='blue', label=f'Predictions vs Actuals (r={correlation:.2f}, p={p_value:.2g})')
plt.plot([min(actuals), max(actuals)], [min(actuals), max(actuals)], color='red', linestyle='--', label='Ideal Fit')
plt.xlabel('Actual OS.time')
plt.ylabel('Predicted OS.time')
plt.title('Predictions vs Actuals')
plt.legend()
plt.savefig('predictions_vs_actuals.png')
plt.close()
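
# Residuals (predicted minus actual) for the final epoch's predictions.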
errors = final_predictions - actuals
plt.figure(figsize=(10, 5))
plt.hist(errors, bins=30, color='purple', alpha=0.7)
plt.xlabel('Prediction Error')
plt.ylabel('Frequency')
plt.title('Error Distribution')
plt.savefig('error_distribution.png')
plt.close()
actuals = y_test_tensor.numpy().flatten()
colors = cm.viridis(np.linspace(0, 1, num_epochs))
plt.figure(figsize=(10, 5))
plt.plot(actuals, label='Actual Values', color='b', marker='o', linestyle='-')
for i in range(0, num_epochs, max(1, num_epochs // 100)):
    predictions = np.concatenate(all_predictions[i]).flatten()
    plt.plot(predictions, label=f'Epoch {i+1}', color=colors[i], linestyle='--')
plt.xlabel('Sample Index')
plt.ylabel('OS.time')
plt.title('Actual vs Predicted Values Over Time')
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
plt.savefig('actual_vs_predicted_over_time.png')
plt.close()
# Iterate named children so the plots are labeled fc1/fc2/fc3 rather than raw
# child indices (which would skip numbers because of the dropout layers).
for name, layer in model.named_children():
    if isinstance(layer, nn.Linear):
        plt.figure(figsize=(10, 5))
        plt.hist(layer.weight.detach().numpy().flatten(), bins=30, alpha=0.6, color='blue')
        plt.xlabel(f'{name} Weights')
        plt.ylabel('Frequency')
        plt.title(f'Weight Distribution of {name}')
        plt.savefig(f'{name}_weight_distribution.png')
        plt.close()
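
# Proxy importance per input feature: column-wise sum of absolute first-layer
# weights. This reflects the last-epoch model, not the saved best checkpoint.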
importances = np.abs(model.fc1.weight.detach().numpy()).sum(axis=0)
indices = np.argsort(importances)
plt.figure(figsize=(10, 5))
plt.barh(range(X_train.shape[1]), importances[indices], align='center')
plt.yticks(range(X_train.shape[1]), indices)  # bars are sorted, so label them with the original feature indices
plt.xlabel('Importance')
plt.ylabel('Feature Index')
plt.title('Feature Importances in the First Layer')
plt.savefig('feature_importances.png')
plt.close()
for name, layer in model.named_children():
    if isinstance(layer, nn.Linear):
        plt.figure(figsize=(10, 5))
        plt.imshow(layer.weight.detach().numpy(), aspect='auto', cmap='viridis')
        plt.colorbar()
        plt.title(f'Weight Heatmap of {name}')
        plt.xlabel('Input Features')
        plt.ylabel('Neurons')
        plt.savefig(f'{name}_weight_heatmap.png')
        plt.close()
plt.figure(figsize=(10, 5))
plt.plot(range(1, num_epochs + 1), r2_scores, label='R^2 Score')
r2_scores_ma = pd.Series(r2_scores).rolling(window=window_size).mean()
plt.plot(range(1, num_epochs + 1), r2_scores_ma, label='R^2 Score (MA)', linestyle='--')
plt.xlabel('Epoch')
plt.ylabel('R^2 Score')
plt.title('R^2 Score over Epochs')
plt.legend()
plt.savefig('r2_over_epochs.png')
plt.close()
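
# --- Usage sketch: reloading the best checkpoint for inference ---
# A minimal example, not part of the original pipeline. It assumes new samples
# arrive as a NumPy array with the same column order as the training features;
# `X_new` below is hypothetical.
def predict_os_time(X_new):
    inference_model = SimpleNN(X_new.shape[1])
    inference_model.load_state_dict(torch.load('best_model.pth'))
    inference_model.eval()  # disable dropout for deterministic predictions
    with torch.no_grad():
        inputs = torch.tensor(scaler.transform(X_new), dtype=torch.float32)
        return inference_model(inputs).numpy().flatten()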