Spaces:

tahoebio
/

tx1-demo

Running on A10G

App Files Files Community

Umair Khan committed 21 days ago

Commit

0811027

1 Parent(s): 237ece6

update UI and reformat parquet output

Browse files

Files changed (2) hide show

app.py +38 -15
requirements.txt +2 -1

app.py CHANGED Viewed

@@ -14,6 +14,8 @@ import anndata as ad
 import pandas as pd
 import numpy as np
 import scanpy as sc
 from pathlib import Path
 from composer import Trainer, Callback
 from tahoex.model.model import ComposerTX
@@ -24,10 +26,11 @@ EMB_KEY = "X_tx1-70m"
 APP_TITLE = "Tx1-70M Embeddings"
 APP_DESC = """
 Upload an AnnData, compute Tx1-70M embeddings,
-preview a UMAP, and download the results. Files are
-limited to 5GB / 50K cells. If a file is less than 5GB but
-contains more than 50K cells, embeddings will be
-computed only for the first 50K cells.
 """
 # set up directories
@@ -39,10 +42,14 @@ with open("./symbol-to-ensembl.json", "r") as f:
     SYMBOL_TO_ENSEMBL = json.load(f)
     SYMBOL_TO_ENSEMBL_UCASE = {str(k).upper(): v for k, v in SYMBOL_TO_ENSEMBL.items()}
 # helper to read AnnData header
 def read_anndata_header(fileobj):
     adata = sc.read_h5ad(fileobj.name, backed="r")
-    layers = ["<use .X>"] + list(adata.layers.keys())
     var_cols = list(adata.var.columns)
     obs_cols = list(adata.obs.columns)
     del adata
@@ -72,11 +79,27 @@ def _unique_output(name):
 # helper to save outputs
 def _save_outputs(adata, emb):
-    emb_df = pd.DataFrame(emb, index=adata.obs_names)
     parquet_path = _unique_output("embs.parquet")
-    emb_df.to_parquet(parquet_path)
     out_h5ad = _unique_output("adata_with_embs.h5ad")
     adata.write(out_h5ad)
     return parquet_path, out_h5ad
 # refresh dropdowns given a file object
@@ -103,7 +126,7 @@ def ensure_dropdowns(fileobj):
 # custom callback to report progress to Gradio
 class GradioProgressCallback(Callback):
-    def __init__(self, progress, total_batches, start=0.1, end=0.6):
         self.progress = progress
         self.total = max(1, int(total_batches))
         self.seen = 0
@@ -119,6 +142,7 @@ class GradioProgressCallback(Callback):
 def _embed(adata_bytes, layer_name, feature_col, use_symbols, progress):
     # retrieve AnnData from bytes
     with tempfile.TemporaryDirectory() as td:
         # persist to a temporary file
@@ -206,7 +230,7 @@ def _embed(adata_bytes, layer_name, feature_col, use_symbols, progress):
         raise gr.Error(f"Feature column '{feature_col}' does not appear to contain Ensembl gene IDs. If the column contains gene symbols, use the checkbox.")
     # load model
-    print("loading model")
     model, vocab, _, collator_config = ComposerTX.from_hf(
         "tahoebio/TahoeX1",
         "70m",
@@ -214,7 +238,7 @@ def _embed(adata_bytes, layer_name, feature_col, use_symbols, progress):
     )
     # prepare AnnData
-    print("preparing AnnData")
     gene_id_key = feature_col
     adata.var["id_in_vocab"] = [vocab[gene] if gene in vocab else -1 for gene in adata.var[gene_id_key]]
     gene_ids_in_vocab = np.array(adata.var["id_in_vocab"])
@@ -228,7 +252,7 @@ def _embed(adata_bytes, layer_name, feature_col, use_symbols, progress):
     gene_ids = np.array([vocab[gene] for gene in genes], dtype=int)
     # create data loader
-    print("creating data loader")
     count_matrix = _pick_layer(adata, layer_name)
     dataset = CountDataset(
         count_matrix,
@@ -279,7 +303,7 @@ def _embed(adata_bytes, layer_name, feature_col, use_symbols, progress):
     predictions = trainer.predict(loader, return_outputs=True)
     # aggregate embeddings
-    print("aggregating embeddings")
     cell_embs = []
     for out in predictions:
         cell_embs.append(out["cell_emb"].cpu())
@@ -330,7 +354,6 @@ def run_pipeline(fileobj, layer_choice, var_choice, obs_choice, use_symbols, pro
         adata_bytes = f.read()
     # compute embeddings on GPU
-    progress(0.10, desc="computing Tx1 embeddings")
     E, layers, var_cols, obs_cols, adata_with_emb_bytes = _embed(
         adata_bytes=adata_bytes,
         layer_name=(None if layer_choice in [None, "", "<use .X>"] else layer_choice),
@@ -347,13 +370,13 @@ def run_pipeline(fileobj, layer_choice, var_choice, obs_choice, use_symbols, pro
         adata = sc.read_h5ad(tmp_in, backed=None)
     # compute UMAP
-    progress(0.60, desc="computing UMAP")
     color_series = adata.obs[obs_choice] if (obs_choice and obs_choice in adata.obs) else None
     coords = _compute_umap_from_emb(E)
     adata.obsm["X_umap"] = coords
     # plot UMAP
-    progress(0.80, desc="plotting UMAP")
     import matplotlib.pyplot as plt
     fig = plt.figure(figsize=(5.5, 5.0))
     ax = fig.add_subplot(111)

 import pandas as pd
 import numpy as np
 import scanpy as sc
+import pyarrow as pa
+import pyarrow.parquet as pq
 from pathlib import Path
 from composer import Trainer, Callback
 from tahoex.model.model import ComposerTX
 APP_TITLE = "Tx1-70M Embeddings"
 APP_DESC = """
 Upload an AnnData, compute Tx1-70M embeddings,
+preview a UMAP, and download the results.
+**Limits:** Files up to 5GB. If an AnnData contains more
+than 50K cells, embeddings will be computed **only
+for the first 50K cells**.
 """
 # set up directories
     SYMBOL_TO_ENSEMBL = json.load(f)
     SYMBOL_TO_ENSEMBL_UCASE = {str(k).upper(): v for k, v in SYMBOL_TO_ENSEMBL.items()}
+# set up parquet outputs
+PARQUET_INDEX_COL = "index"
+PARQUET_EMB_COL = "tx1-70m"
 # helper to read AnnData header
 def read_anndata_header(fileobj):
     adata = sc.read_h5ad(fileobj.name, backed="r")
+    layers = list(adata.layers.keys())
     var_cols = list(adata.var.columns)
     obs_cols = list(adata.obs.columns)
     del adata
 # helper to save outputs
 def _save_outputs(adata, emb):
+    # save parquet
+    d_model = int(emb.shape[1])
+    index_arr = pa.array(adata.obs_names.astype(str).tolist(), type=pa.string())
+    emb_arr = pa.array(emb.tolist(), type=pa.list_(pa.float32(), d_model))
+    table = pa.Table.from_arrays(
+        [index_arr, emb_arr],
+        names=[PARQUET_INDEX_COL, PARQUET_EMB_COL],
+        schema=pa.schema([
+            pa.field(PARQUET_INDEX_COL, pa.string()),
+            pa.field(PARQUET_EMB_COL, pa.list_(pa.float32(), d_model)),
+        ]),
+    )
     parquet_path = _unique_output("embs.parquet")
+    pq.write_table(table, parquet_path, compression="zstd", use_dictionary=True)
+    # save AnnData
     out_h5ad = _unique_output("adata_with_embs.h5ad")
     adata.write(out_h5ad)
+    # return paths
     return parquet_path, out_h5ad
 # refresh dropdowns given a file object
 # custom callback to report progress to Gradio
 class GradioProgressCallback(Callback):
+    def __init__(self, progress, total_batches, start=0.35, end=0.75):
         self.progress = progress
         self.total = max(1, int(total_batches))
         self.seen = 0
 def _embed(adata_bytes, layer_name, feature_col, use_symbols, progress):
     # retrieve AnnData from bytes
+    progress(0.12, desc="loading AnnData")
     with tempfile.TemporaryDirectory() as td:
         # persist to a temporary file
         raise gr.Error(f"Feature column '{feature_col}' does not appear to contain Ensembl gene IDs. If the column contains gene symbols, use the checkbox.")
     # load model
+    progress(0.22, desc="loading model")
     model, vocab, _, collator_config = ComposerTX.from_hf(
         "tahoebio/TahoeX1",
         "70m",
     )
     # prepare AnnData
+    progress(0.30, desc="preparing AnnData")
     gene_id_key = feature_col
     adata.var["id_in_vocab"] = [vocab[gene] if gene in vocab else -1 for gene in adata.var[gene_id_key]]
     gene_ids_in_vocab = np.array(adata.var["id_in_vocab"])
     gene_ids = np.array([vocab[gene] for gene in genes], dtype=int)
     # create data loader
+    progress(0.35, desc="creating data loader")
     count_matrix = _pick_layer(adata, layer_name)
     dataset = CountDataset(
         count_matrix,
     predictions = trainer.predict(loader, return_outputs=True)
     # aggregate embeddings
+    progress(0.78, desc="aggregating embeddings")
     cell_embs = []
     for out in predictions:
         cell_embs.append(out["cell_emb"].cpu())
         adata_bytes = f.read()
     # compute embeddings on GPU
     E, layers, var_cols, obs_cols, adata_with_emb_bytes = _embed(
         adata_bytes=adata_bytes,
         layer_name=(None if layer_choice in [None, "", "<use .X>"] else layer_choice),
         adata = sc.read_h5ad(tmp_in, backed=None)
     # compute UMAP
+    progress(0.85, desc="computing UMAP")
     color_series = adata.obs[obs_choice] if (obs_choice and obs_choice in adata.obs) else None
     coords = _compute_umap_from_emb(E)
     adata.obsm["X_umap"] = coords
     # plot UMAP
+    progress(0.90, desc="plotting UMAP")
     import matplotlib.pyplot as plt
     fig = plt.figure(figsize=(5.5, 5.0))
     ax = fig.add_subplot(111)

requirements.txt CHANGED Viewed

@@ -17,4 +17,5 @@ scanpy
 pynndescent
 umap-learn
 anndata
-h5py

 pynndescent
 umap-learn
 anndata
+h5py
+pyarrow