import os

import joblib
import matplotlib.pyplot as plt
import pandas as pd
# Kept even though the name is unused: on older Matplotlib releases this
# import is what registers the '3d' projection used below.
from mpl_toolkits.mplot3d import Axes3D  # noqa: F401
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Upper bound on the number of principal components to extract.
MAX_COMPONENTS = 50

# Work relative to the script's own directory so the input file is found and
# every output lands next to the script, whatever the caller's cwd is.
script_path = os.path.abspath(__file__)
script_dir = os.path.dirname(script_path)
os.chdir(script_dir)

# Load the tab-separated matrix; the first column becomes the row index.
# The code below treats rows as features and columns as samples.
file_path = 'TCGA-LGG.methylation450.tsv'
df = pd.read_csv(file_path, sep='\t', index_col=0)
# Drop every row (feature) that has at least one missing value.
df.dropna(inplace=True)

# Standardise each feature across samples (samples become rows after .T).
scaler = StandardScaler()
scaled_data = scaler.fit_transform(df.T)

# PCA cannot extract more components than min(n_samples, n_features), so cap
# the requested count instead of failing on small inputs.
n_components = min(MAX_COMPONENTS, *scaled_data.shape)
pca = PCA(n_components=n_components)
principal_components = pca.fit_transform(scaled_data)

# Persist the fitted model for later reuse.
pca_model_path = 'pca_model.pkl'
joblib.dump(pca, pca_model_path)
print(f"PCA模型已保存为 {pca_model_path}")

# Loadings: one row per input feature, one column per component.
loadings = pd.DataFrame(
    pca.components_.T,
    columns=[f'PC{i+1}' for i in range(pca.n_components_)],
    index=df.index,
)
loadings.to_csv('pca_loadings.csv')
print("主成分载荷矩阵已保存为 pca_loadings.csv")

# Scores: one row per sample, one column per component. The column count
# follows the fitted model so it always matches the score matrix width.
sample_ids = df.columns
principal_df = pd.DataFrame(
    data=principal_components,
    columns=[f'Principal Component {i+1}' for i in range(pca.n_components_)],
    index=sample_ids,
)

# Write the results before opening the (blocking) plot window, so a GUI
# failure or an interrupted session cannot lose them.
output_file_path = 'pca_principal_components.csv'
principal_df.to_csv(output_file_path)
explained_variance = pca.explained_variance_ratio_
print(f"Explained variance by each component: {explained_variance}")
print(f"{pca.n_components_}个主成分已保存为 {output_file_path}")

# 3D scatter of the first three components, labelled with the sample IDs.
if pca.n_components_ >= 3:
    fig = plt.figure(figsize=(10, 8))
    ax = fig.add_subplot(111, projection='3d')
    xs = principal_df['Principal Component 1'].to_numpy()
    ys = principal_df['Principal Component 2'].to_numpy()
    zs = principal_df['Principal Component 3'].to_numpy()
    ax.scatter(xs, ys, zs)
    # Iterate the coordinate arrays directly: indexing a label-indexed Series
    # with a bare integer relies on a deprecated positional fallback.
    for sample_id, x, y, z in zip(sample_ids, xs, ys, zs):
        ax.text(x, y, z, sample_id)
    ax.set_xlabel('Principal Component 1')
    ax.set_ylabel('Principal Component 2')
    ax.set_zlabel('Principal Component 3')
    ax.set_title('3D PCA of Methylation Data')
    plt.show()
else:
    print(
        f"Skipping 3D plot: only {pca.n_components_} principal component(s) "
        "available, 3 are required."
    )