Spaces:

Zaixi
/

FoldMark

Running on Zero

App Files Community

Zaixi committed on Feb 8

Commit

287a06f

1 Parent(s): c6eeef3

fix bug

Browse files

Files changed (3) hide show

configs/configs_data.py +1 -1
protenix/data/data_pipeline.py +30 -30
runner/inference.py +8 -1

configs/configs_data.py CHANGED Viewed

@@ -60,7 +60,7 @@ default_weighted_pdb_configs = {
     "shuffle_sym_ids": GlobalConfigValue("train_shuffle_sym_ids"),
 }
-DATA_ROOT_DIR = "./"
 # Use CCD cache created by scripts/gen_ccd_cache.py priority. (without date in filename)
 # See: docs/prepare_data.md

     "shuffle_sym_ids": GlobalConfigValue("train_shuffle_sym_ids"),
 }
+DATA_ROOT_DIR = "./release_data/ccd_cache"
 # Use CCD cache created by scripts/gen_ccd_cache.py priority. (without date in filename)
 # See: docs/prepare_data.md

protenix/data/data_pipeline.py CHANGED Viewed

@@ -57,41 +57,41 @@ class DataPipeline(object):
                 sample_indices_list (list[dict[str, Any]]): The sample indices list (each one is a chain or an interface).
                 bioassembly_dict (dict[str, Any]): The bioassembly dict with sequence, atom_array, and token_array.
         """
-        try:
-            if dataset == "WeightedPDB":
-                parser = MMCIFParser(mmcif_file=mmcif)
-                bioassembly_dict = parser.get_bioassembly()
-            elif dataset == "Distillation":
-                parser = DistillationMMCIFParser(mmcif_file=mmcif)
-                bioassembly_dict = parser.get_structure_dict()
-            else:
-                raise NotImplementedError(
-                    'Unsupported "dataset", please input either "WeightedPDB" or "Distillation".'
-                )
-            sample_indices_list = parser.make_indices(
-                bioassembly_dict=bioassembly_dict, pdb_cluster_file=pdb_cluster_file
             )
-            if len(sample_indices_list) == 0:
-                # empty indices and AtomArray
-                return [], bioassembly_dict
-            atom_array = bioassembly_dict["atom_array"]
-            atom_array.set_annotation(
-                "resolution", [parser.resolution] * len(atom_array)
-            )
-            tokenizer = AtomArrayTokenizer(atom_array)
-            token_array = tokenizer.get_token_array()
-            bioassembly_dict["msa_features"] = None
-            bioassembly_dict["template_features"] = None
-            bioassembly_dict["token_array"] = token_array
-            return sample_indices_list, bioassembly_dict
-        except Exception as e:
-            logging.warning("Gen data failed for %s due to %s", mmcif, e)
-            return [], {}
     @staticmethod
     def get_label_entity_id_to_asym_id_int(atom_array: AtomArray) -> dict[str, int]:

                 sample_indices_list (list[dict[str, Any]]): The sample indices list (each one is a chain or an interface).
                 bioassembly_dict (dict[str, Any]): The bioassembly dict with sequence, atom_array, and token_array.
         """
+        #try:
+        if dataset == "WeightedPDB":
+            parser = MMCIFParser(mmcif_file=mmcif)
+            bioassembly_dict = parser.get_bioassembly()
+        elif dataset == "Distillation":
+            parser = DistillationMMCIFParser(mmcif_file=mmcif)
+            bioassembly_dict = parser.get_structure_dict()
+        else:
+            raise NotImplementedError(
+                'Unsupported "dataset", please input either "WeightedPDB" or "Distillation".'
             )
+        sample_indices_list = parser.make_indices(
+            bioassembly_dict=bioassembly_dict, pdb_cluster_file=pdb_cluster_file
+        )
+        if len(sample_indices_list) == 0:
+            # empty indices and AtomArray
+            return [], bioassembly_dict
+        atom_array = bioassembly_dict["atom_array"]
+        atom_array.set_annotation(
+            "resolution", [parser.resolution] * len(atom_array)
+        )
+        tokenizer = AtomArrayTokenizer(atom_array)
+        token_array = tokenizer.get_token_array()
+        bioassembly_dict["msa_features"] = None
+        bioassembly_dict["template_features"] = None
+        bioassembly_dict["token_array"] = token_array
+        return sample_indices_list, bioassembly_dict
+        # except Exception as e:
+        #     logging.warning("Gen data failed for %s due to %s", mmcif, e)
+        #     return [], {}
     @staticmethod
     def get_label_entity_id_to_asym_id_int(atom_array: AtomArray) -> dict[str, int]:

runner/inference.py CHANGED Viewed

@@ -208,12 +208,19 @@ def download_infercence_cache(configs: Any, model_version: str = "v0.2.0") -> No
     os.makedirs(data_cache_dir, exist_ok=True)
     for cache_name, fname in [
         ("ccd_components_file", "components.v20240608.cif"),
-        ("ccd_components_rdkit_mol_file", "components.v20240608.cif.rdkit_mol.pkl"),
     ]:
         if not opexists(cache_path := os.path.abspath(opjoin(data_cache_dir, fname))):
             tos_url = URL[cache_name]
             logger.info(f"Downloading data cache from\n {tos_url}...")
             urllib.request.urlretrieve(tos_url, cache_path)
     if not os.path.exists('./checkpoint.pt'):
         # Google Drive file ID

     os.makedirs(data_cache_dir, exist_ok=True)
     for cache_name, fname in [
         ("ccd_components_file", "components.v20240608.cif"),
     ]:
         if not opexists(cache_path := os.path.abspath(opjoin(data_cache_dir, fname))):
             tos_url = URL[cache_name]
             logger.info(f"Downloading data cache from\n {tos_url}...")
             urllib.request.urlretrieve(tos_url, cache_path)
+    if not os.path.exists('./release_data/ccd_cache/components.v20240608.cif.rdkit_mol.pkl'):
+        file_id = '1R9d678aBfQwTd0Rh15doRmW-fETNdeWf'
+        # Construct the download URL
+        download_url = f'https://drive.google.com/uc?id={file_id}'
+        # Specify the output file name
+        output_file = './release_data/ccd_cache/components.v20240608.cif.rdkit_mol.pkl'
+        gdown.download(download_url, output_file, quiet=False)
     if not os.path.exists('./checkpoint.pt'):
         # Google Drive file ID