Spaces:

HUBioDataLab
/

ProtHGT

Sleeping

Erva Ulusoy committed on Feb 3

Commit

c86e7b2

1 Parent(s): 8f00c3f

updated _create_prediction_df function

Files changed (1) hide show

run_prothgt_app.py CHANGED Viewed

@@ -88,30 +88,41 @@ def _create_prediction_df(predictions, heterodata, protein_ids, go_category):
         'GO_term_P': 'Biological Process',
         'GO_term_C': 'Cellular Component'
     }
-    # Create a list to store individual protein predictions
-    all_predictions = []
-    # Number of GO terms for this category
     n_go_terms = len(heterodata[go_category]['id_mapping'])
     # Process predictions for each protein
     for i, protein_id in enumerate(protein_ids):
-        # Get the slice of predictions for this protein
-        protein_predictions = predictions[i * n_go_terms:(i + 1) * n_go_terms]
-        prediction_df = pd.DataFrame({
-            'Protein': protein_id,
-            'GO_category': go_category_dict[go_category],
-            'GO_term': list(heterodata[go_category]['id_mapping'].keys()),
-            'Probability': protein_predictions.tolist()
-        })
-        all_predictions.append(prediction_df)
-    # Combine all predictions
-    combined_df = pd.concat(all_predictions, ignore_index=True)
-    combined_df.sort_values(by=['Protein', 'Probability'], ascending=[True, False], inplace=True)
-    combined_df.reset_index(drop=True, inplace=True)
-    return combined_df
 def generate_prediction_df(protein_ids, model_paths, model_config_paths, go_category):
     all_predictions = []

         'GO_term_P': 'Biological Process',
         'GO_term_C': 'Cellular Component'
     }
+    # Get number of GO terms for this category
     n_go_terms = len(heterodata[go_category]['id_mapping'])
+    # Create lists to store the data
+    all_proteins = []
+    all_go_terms = []
+    all_categories = []
+    all_probabilities = []
+    # Get list of GO terms once
+    go_terms = list(heterodata[go_category]['id_mapping'].keys())
     # Process predictions for each protein
     for i, protein_id in enumerate(protein_ids):
+        # Get predictions for this protein
+        start_idx = i * n_go_terms
+        end_idx = (i + 1) * n_go_terms
+        protein_predictions = predictions[start_idx:end_idx]
+        # Extend the lists
+        all_proteins.extend([protein_id] * n_go_terms)
+        all_go_terms.extend(go_terms)
+        all_categories.extend([go_category_dict[go_category]] * n_go_terms)
+        all_probabilities.extend(protein_predictions.tolist())
+    # Create DataFrame
+    prediction_df = pd.DataFrame({
+        'Protein': all_proteins,
+        'GO_term': all_go_terms,
+        'GO_category': all_categories,
+        'Probability': all_probabilities
+    })
+    return prediction_df
 def generate_prediction_df(protein_ids, model_paths, model_config_paths, go_category):
     all_predictions = []