Spaces:

ml-jku
/

mhnfs

Sleeping

App Files Files Community

Tschoui commited on Mar 15, 2024

Commit

cf004a6

1 Parent(s): 7971ae3

move project from private to public space

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

.gitattributes +2 -0
.streamlit/config.toml +19 -0
README.md +108 -8
app.py +65 -0
assets/data_preprocessing_objects/ecdfs.pkl +3 -0
assets/data_preprocessing_objects/scaler_fitted.pkl +3 -0
assets/example_csv/.~lock.known_inactive_molecules.csv# +1 -0
assets/example_csv/known_active_molecules.csv +3 -0
assets/example_csv/known_inactive_molecules.csv +3 -0
assets/example_csv/molecules_for_prediction.csv +3 -0
assets/example_csv/predictions/nottrustworthy_example.csv +3 -0
assets/example_csv/predictions/nottrustworthy_example.png +3 -0
assets/example_csv/predictions/trustworthy_example.csv +3 -0
assets/example_csv/predictions/trustworthy_example.png +3 -0
assets/header.png +3 -0
assets/logo.png +3 -0
assets/mhnfs_data/cfg.yaml +42 -0
assets/mhnfs_data/full_context_set.npy +3 -0
assets/mhnfs_data/mhnfs_checkpoint.ckpt +3 -0
assets/mhnfs_overview.png +3 -0
assets/test_reference_data/ecfps.npy +3 -0
assets/test_reference_data/model_input_query.pt +3 -0
assets/test_reference_data/model_input_support_actives.pt +3 -0
assets/test_reference_data/model_input_support_inactives.pt +3 -0
assets/test_reference_data/model_predictions.pt +3 -0
assets/test_reference_data/preprocessed_features.npy +3 -0
assets/test_reference_data/rdkit_descr_quantils.npy +3 -0
assets/test_reference_data/rdkit_descrs.npy +3 -0
assets/test_reference_data/smiles.pkl +3 -0
requirements.txt +10 -0
src/__init__.py +0 -0
src/__pycache__/__init__.cpython-37.pyc +0 -0
src/__pycache__/prediction_pipeline.cpython-37.pyc +0 -0
src/app/__pycache__/constants.cpython-37.pyc +0 -0
src/app/__pycache__/layout.cpython-37.pyc +0 -0
src/app/__pycache__/prediction_utils.cpython-37.pyc +0 -0
src/app/constants.py +269 -0
src/app/layout.py +439 -0
src/app/prediction_utils.py +33 -0
src/data_preprocessing/__init__.py +0 -0
src/data_preprocessing/__pycache__/__init__.cpython-36.pyc +0 -0
src/data_preprocessing/__pycache__/__init__.cpython-37.pyc +0 -0
src/data_preprocessing/__pycache__/constants.cpython-37.pyc +0 -0
src/data_preprocessing/__pycache__/create_descriptors.cpython-36.pyc +0 -0
src/data_preprocessing/__pycache__/create_descriptors.cpython-37.pyc +0 -0
src/data_preprocessing/__pycache__/create_model_inputs.cpython-37.pyc +0 -0
src/data_preprocessing/__pycache__/utils.cpython-37.pyc +0 -0
src/data_preprocessing/constants.py +11 -0
src/data_preprocessing/create_descriptors.py +148 -0
src/data_preprocessing/create_model_inputs.py +46 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+*.csv filter=lfs diff=lfs merge=lfs -text
+*.png filter=lfs diff=lfs merge=lfs -text

.streamlit/config.toml ADDED Viewed

	@@ -0,0 +1,19 @@

+[theme]
+base="light"
+# Primary accent for interactive elements
+primaryColor = '#0078aa'
+# Background color for the main content area
+# backgroundColor = '#273346'
+# Background color for sidebar and most interactive widgets
+# secondaryBackgroundColor = '#7d828c'
+# Color used for almost all text
+# textColor = '#4bc9ff'
+# Font family for all text in the app, except code blocks
+# Accepted values (serif | sans serif | monospace)
+# Default: "sans serif"
+# font = "sans serif"

README.md CHANGED Viewed

@@ -1,13 +1,113 @@
 ---
-title: Mhnfs
-emoji: 🚀
-colorFrom: yellow
-colorTo: purple
 sdk: streamlit
-sdk_version: 1.32.2
 app_file: app.py
-pinned: false
-license: gpl-3.0
 ---
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
+title: MHNfs
+emoji: 🔬
+short_description: Activity prediction for low-data scenarios
+colorFrom: gray
+colorTo: gray
 sdk: streamlit
+sdk_version: 1.29.0
 app_file: app.py
+pinned: true
 ---
+# Activity Predictions with MHNfs for low-data scenarios
+## ⚙️ Under the hood
+<div style="text-align: justify">
+    The predictive model (MHNfs) used in this application was specifically designed and
+    trained for low-data scenarios. The model predicts whether a molecule is active or
+    inactive. The predicted activity value is a continuous value between 0 and 1, and,
+    similar to a probability, the higher/lower the value, the more confident the model
+    is that the molecule is active/inactive.<br>
+    <br>
+    The model was trained on the FS-Mol dataset which
+    includes 5120 tasks (roughly 5000 tasks were used for training, rest for evaluation).
+    The training tasks are listed here:
+    <a href="https://github.com/microsoft/FS-Mol/tree/main/datasets/targets"
+    target="_blank">https://github.com/microsoft/FS-Mol/tree/main/datasets/targets</a>.
+</div>
+## 🎯 About few-shot learning and the model MHNfs
+<div style="text-align: justify">
+    <b>Few-shot learning</b> is a machine learning sub-field which aims to provide
+    predictive models for scenarios in which only little data is known/available.<br>
+    <br>
+    <b>MHNfs</b> is a few-shot learning model which is specifically designed for drug
+    discovery applications. It is built to use the input prompts in a way such that
+    the provided available knowledge, i.e. the known active and inactive molecules,
+    functions as context to predict the activity of the new requested molecules.
+    Precisely, the provided active and inactive molecules are associated with a
+    large set of general molecules - called context molecules - to enrich the
+    provided information and to remove spurious correlations arising from the
+    decoration of molecules. This is analogous to a Large Language Model which would
+    not only use the provided information in the current prompt as context but would
+    also have access to way more information, e.g., a prompting history.
+    </div>
+## 💻 Run the prediction pipeline locally for larger screening chunks
+### Get started:
+```bash
+# Copied from hugging face
+# Make sure you have git-lfs installed (https://git-lfs.com)
+git lfs install
+# Clone repo
+git clone https://huggingface.co/spaces/tschouis/mhnfs
+# Alternatively, if you want to clone without large files
+GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/spaces/tschouis/mhnfs
+```
+### Install requirements
+```bash
+pip install -r requirements.txt
+```
+Notably, this command was tested inside a conda environment with python 3.7.
+### Run the prediction pipeline:
+For your screening, load the model, i.e. the **Activity Predictor** into your python file or notebook and simply run it:
+```python
+from src.prediction_pipeline import ActivityPredictor
+# Define inputs
+query_smiles = ["C1CCCCC1", "C1CCCCC1", "C1CCCCC1", "C1CCCCC1"] # Replace with your data
+support_actives_smiles = ["C1CCCCC1", "C1CCCCC1"] # Replace with your data
+support_inactives_smiles = ["C1CCCCC1", "C1CCCCC1"] # Replace with your data
+# Make predictions
+predictor = ActivityPredictor()
+predictions = predictor.predict(query_smiles, support_actives_smiles, support_inactives_smiles)
+```
+* Provide molecules in SMILES notation.
+* Make sure that the inputs to the Activity Predictor are either comma separated lists, or flattened numpy arrays, or pandas DataFrames. In the latter case, there should be a "smiles" column (both upper and lower case "SMILES" are accepted). All other columns are ignored.
+### Run the app locally with streamlit:
+```bash
+# Navigate into root directory of this project
+cd .../whatever_your_dir_name_is/ # Replace with your path
+# Run streamlit app
+python -m streamlit run app.py
+```
+## 🤗 Hugging face app
+Explore our Hugging Face app here: https://huggingface.co/spaces/ml-jku/mhnfs
+## 📚 Cite us
+```
+@inproceedings{
+    schimunek2023contextenriched,
+    title={Context-enriched molecule representations improve few-shot drug discovery},
+    author={Johannes Schimunek and Philipp Seidl and Lukas Friedrich and Daniel Kuhn and Friedrich Rippmann and Sepp Hochreiter and Günter Klambauer},
+    booktitle={The Eleventh International Conference on Learning Representations},
+    year={2023},
+    url={https://openreview.net/forum?id=XrMWUuEevr}
+}
+```

app.py ADDED Viewed

	@@ -0,0 +1,65 @@

+"""
+This script runs the streamlit app for MHNfs
+MHNfs: Few-shot method for drug discovery activity predictions
+       (https://openreview.net/pdf?id=XrMWUuEevr)
+"""
+# --------------------------------------------------------------------------------------
+# Imports
+import streamlit as st
+from src.app.layout import LayoutMaker
+from src.app.prediction_utils import (create_prediction_df,
+                                      create_molecule_grid_plot)
+from src.prediction_pipeline import ActivityPredictor
+# --------------------------------------------------------------------------------------
+# Functions
+class App():
+    """
+    Streamlit application wrapper for MHNfs. It configures the page, creates the
+    layout helper, loads the activity predictor, and wires them together.
+    """
+    def __init__(self):
+        """
+        Configure the streamlit page and create the layout maker and the predictor.
+        """
+        # Set page configuration to wide
+        st.set_page_config(layout="wide", page_title="MHNfs", page_icon="🔬")
+        # Layout maker
+        self.layoutMaker = LayoutMaker()
+        # Load mhnfs model
+        self.predictor = ActivityPredictor()
+    def define_layout(self):
+        """
+        Build the page: inject the sidebar CSS, then create the sidebar, the header,
+        and the main content area. The inputs and buttons returned by the sidebar are
+        stored on the instance and handed to the main content area together with the
+        predictor and the two prediction helper functions.
+        """
+        # Define Sidebar width (min and max are both 500px, i.e. a fixed width)
+        css = '''
+        <style>
+            [data-testid="stSidebar"]{
+                min-width: 500px;
+                max-width: 500px;
+            }
+        </style>
+        '''
+        # unsafe_allow_html is needed so that the <style> tag is not escaped
+        st.markdown(css, unsafe_allow_html=True)
+        # Sidebar
+        self.inputs, self.buttons = self.layoutMaker.make_sidebar()
+        # Main page
+        # - header
+        self.layoutMaker.make_header()
+        # - main body
+        self.layoutMaker.make_main_content_area(self.predictor,
+                                                self.inputs,
+                                                self.buttons,
+                                                create_prediction_df,
+                                                create_molecule_grid_plot)
+def run_app():
+    """
+    Create the App object and build its layout. This is the script entry point.
+    """
+    app = App()
+    app.define_layout()
+# --------------------------------------------------------------------------------------
+# Run script
+if __name__ == "__main__":
+    run_app()

assets/data_preprocessing_objects/ecdfs.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:eeec12688fd9e0bb0bbd68d5203e2fb46c45d30a07417f0883adbfc133d48e9f
+size 520417347

assets/data_preprocessing_objects/scaler_fitted.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:4538c1c1d9b5b50d29a14c14134f66a563c3a0f4022ce77b8eb2959c3eff51ea
+size 54501

assets/example_csv/.~lock.known_inactive_molecules.csv# ADDED Viewed

	@@ -0,0 +1 @@


1	+ ,johannes,Latitude-5501,02.01.2024 15:57,file:///home/johannes/.config/libreoffice/4;

assets/example_csv/known_active_molecules.csv ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:bc98c05246b42d84c6833d191efa32c7c6473d76c5f2719c8ff3310cfe22df04
+size 353

assets/example_csv/known_inactive_molecules.csv ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a6e183c33b7445ae0c00bea4a7cdae52bfce14da2829f6827e20dda162df23af
+size 363

assets/example_csv/molecules_for_prediction.csv ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:497adfdbd026c7ab7d1564b685a246fcb7eb6eabb2442918862b31ccd0b32369
+size 460

assets/example_csv/predictions/nottrustworthy_example.csv ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f3f8b5e017175b8d62982b1fc4138a4348f51b6a0469c32df991f5d2576a679d
+size 588

assets/example_csv/predictions/nottrustworthy_example.png ADDED Viewed

Git LFS Details

SHA256: ae7aff2e2cd2e68bdcb4a5563be38c13d7780453443657b36f01333ab57a949c
Pointer size: 130 Bytes
Size of remote file: 25.5 kB

assets/example_csv/predictions/trustworthy_example.csv ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3517bcef4a9998975b031d1b4f2b4aa29679669079100230f84e27bc06f80c02
+size 889

assets/example_csv/predictions/trustworthy_example.png ADDED Viewed

Git LFS Details

SHA256: df2a73cdf527546e8b078cb45618b4554a77f11fdd48367ef25939e0a6a2b518
Pointer size: 130 Bytes
Size of remote file: 28.3 kB

assets/header.png ADDED Viewed

Git LFS Details

SHA256: 1d355c5fc158281371a09759584110e611c810d2442e8aad30551998aa728f0a
Pointer size: 131 Bytes
Size of remote file: 123 kB

assets/logo.png ADDED Viewed

Git LFS Details

SHA256: 505cc795dcaac622e2af6bf2ed118d7ab28d3eab27fd421755844c042ed7646a
Pointer size: 130 Bytes
Size of remote file: 40.9 kB

assets/mhnfs_data/cfg.yaml ADDED Viewed

	@@ -0,0 +1,42 @@

+model:
+  encoder:
+    activation: selu
+    input_dim: 2248
+    number_hidden_layers: 0
+    number_hidden_neurons: 1024
+    regularization:
+      input_dropout: 0.1
+      dropout: 0.5
+  layerNormBlock:
+    affine: False
+    usage: True
+  transformer:
+    activity_embedding_dim: 64
+    number_heads: 8
+    dim_forward: 567
+    dropout: 0.5
+    num_layers: 1
+    ss_dropout: 0.1
+  hopfield:
+    dim_QK: 512
+    heads: 8
+    beta: 0.044194173824159216
+    dropout: 0.5
+  prediction_scaling: 0.044194173824159216
+  associationSpace_dim: 1024
+  similarityModule:
+    type: cosineSim
+    l2Norm: False
+    scaling: 1/N
+  training:
+    optimizer: AdamW
+    batch_size: 512
+    lr: 0.0001
+    weightDecay: 0.0
+    lrScheduler:
+      usage: True
+  context:
+    ratio_training_molecules: 0.05
+system:
+  ressources:
+    device: cpu

assets/mhnfs_data/full_context_set.npy ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1ed40b8d9cc39859772af0d32ed69c7f2467b9235f83f37ff42611bc22828e52
+size 3899416896

assets/mhnfs_data/mhnfs_checkpoint.ckpt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:25fcfdb7c6355b7781edaefc9ec56351f012356b17e4087f72b0a78c6d8e2300
+size 313588174

assets/mhnfs_overview.png ADDED Viewed

Git LFS Details

SHA256: f89731eaf842e6018b4153d60193ea57442fb5933774135a653d4b70ac48afe2
Pointer size: 131 Bytes
Size of remote file: 467 kB

assets/test_reference_data/ecfps.npy ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:056a628c308cf69e647f2c86090f8f93c2aedcd719845f57f11e653ce6d9d70b
+size 24704

assets/test_reference_data/model_input_query.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e889558eb3300355b5c6ea0ce1518bb949141238b8d26b257ec1bd496baeda18
+size 36715

assets/test_reference_data/model_input_support_actives.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e5e55816e09597d267fb91297a56f58a4f4420ed32340650be4c1dd37efe1656
+size 72683

assets/test_reference_data/model_input_support_inactives.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e62e8b18da47d1c9475c18bc2ad50a563f10f0d0bced247d848e453321a13ced
+size 72683

assets/test_reference_data/model_predictions.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a7e63ad2ad9b664e3301479427f8d5cf005c979d7cc9e4bce033f18640eb4df0
+size 747

assets/test_reference_data/preprocessed_features.npy ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1e97dc7eb85509c6b07156292b57a1bee4eaa8d60fbdb40c7e2e5738c8c6a460
+size 54080

assets/test_reference_data/rdkit_descr_quantils.npy ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:cde4d2fd8658cdbcd55e75f14cb360cfa1b239f99d281c1f7296449636e94c6a
+size 4928

assets/test_reference_data/rdkit_descrs.npy ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d1b06153004b3f2ac02f0cefd16b0f17225527bbf53f8efe6e43c035b3d21690
+size 2528

assets/test_reference_data/smiles.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e0168a7aaa6f7f3eca611a42d70782bae9eb970194449320d37b64f5a8c264f9
+size 179

requirements.txt ADDED Viewed

	@@ -0,0 +1,10 @@

+rdkit==2022.3.3
+pytorch-lightning==1.6.1
+torch==1.13.1
+numpy==1.21.5
+pandas==1.3.5
+omegaconf==2.1.2
+mols2grid==1.1.1
+scikit-learn
+statsmodels==0.13.5
+streamlit

src/__init__.py ADDED Viewed

File without changes

src/__pycache__/__init__.cpython-37.pyc ADDED Viewed

Binary file (154 Bytes). View file

src/__pycache__/prediction_pipeline.cpython-37.pyc ADDED Viewed

Binary file (2.73 kB). View file

src/app/__pycache__/constants.cpython-37.pyc ADDED Viewed

Binary file (13.1 kB). View file

src/app/__pycache__/layout.cpython-37.pyc ADDED Viewed

Binary file (13.3 kB). View file

src/app/__pycache__/prediction_utils.cpython-37.pyc ADDED Viewed

Binary file (1.05 kB). View file

src/app/constants.py ADDED Viewed

	@@ -0,0 +1,269 @@

+"""
+This file includes all the constant content shown in the app
+"""
+# --------------------------------------------------------------------------------------
+summary_text = ('''
+                This application allows you to make **activity predictions** for
+                **biological targets** for which you have only a **little knowledge** in
+                terms of known active and inactive molecules.
+                **Provide** via the sidebar:\n
+                - some active molecules,
+                - some inactive molecules, and
+                - molecules you want to predict.
+                Hit **Predict** and explore the predictions!
+                For more **information** about the **model** and **how to provide the
+                molecules**, please visit the **Additional Information** tab.
+                ''')
+mhnfs_text =('''
+        <div style="text-align: justify">
+        <b>MHNfs</b> is a few-shot drug discovery model which consists of a <b>context
+        module</b> , a <b>cross-attention module</b> , and a <b>similarity module</b>
+        as described here: <a href="https://openreview.net/pdf?id=XrMWUuEevr"
+        target="_blank">https://openreview.net/pdf?id=XrMWUuEevr</a>.
+        </div>
+        <br>
+        <div style="text-align: justify">
+        <b>Abstract</b>. A central task in computational drug discovery is to construct
+        models from known active molecules to find further promising molecules for
+        subsequent screening. However, typically only very few active molecules are
+        known. Therefore, few-shot learning methods have the potential to improve the
+        effectiveness of this critical phase of the drug discovery process. We introduce
+        a new method for few-shot drug discovery. Its main idea is to enrich a molecule
+        representation by knowledge about known context or reference molecules. Our
+        novel concept for molecule representation enrichment is to associate molecules
+        from both the support set and the query set with a large set of reference
+        (context) molecules through a modern Hopfield network. Intuitively, this
+        enrichment step is analogous to a human expert who would associate a given
+        molecule with familiar molecules whose properties are known. The enrichment step
+        reinforces and amplifies the covariance structure of the data, while
+        simultaneously removing spurious correlations arising from the decoration of
+        molecules. Our approach is compared with other few-shot methods for drug
+        discovery on the FS-Mol benchmark dataset. On FS-Mol, our approach outperforms
+        all compared methods and therefore sets a new state-of-the-art for few-shot
+        learning in drug discovery. An ablation study shows that the enrichment step of
+        our method is the key to improve the predictive quality. In a domain shift
+        experiment, we further demonstrate the robustness of our method. Code is
+        available at <a href="https://github.com/ml-jku/MHNfs"
+        target="_blank">https://github.com/ml-jku/MHNfs</a>.
+        </div>
+        <br>
+        <br>
+        ''')
+citation_text = '''
+        ###
+            @inproceedings{
+                schimunek2023contextenriched,
+                title={Context-enriched molecule representations improve few-shot drug discovery},
+                author={Johannes Schimunek and Philipp Seidl and Lukas Friedrich and Daniel Kuhn and Friedrich Rippmann and Sepp Hochreiter and Günter
+                Klambauer},
+                booktitle={The Eleventh International Conference on Learning Representations},
+                year={2023},
+                url={https://openreview.net/forum?id=XrMWUuEevr}
+            }
+        '''
+few_shot_learning_text = (
+    '''
+    <div style="text-align: justify">
+    <b>Few-shot learning</b> is a machine learning sub-field which aims to provide
+    predictive models for scenarios in which only little data is known/available.<br>
+    <br>
+    <b>MHNfs</b> is a few-shot learning model which is specifically designed for drug
+    discovery applications. It is built to use the input prompts in a way such that
+    the provided available knowledge, i.e. the known active and inactive molecules,
+    functions as context to predict the activity of the new requested molecules.
+    Precisely, the provided active and inactive molecules are associated with a
+    large set of general molecules - called context molecules - to enrich the
+    provided information and to remove spurious correlations arising from the
+    decoration of molecules. This is analogous to a Large Language Model which would
+    not only use the provided information in the current prompt as context but would
+    also have access to way more information, e.g., a prompting history.
+    </div>
+    ''')
+under_the_hood_text = ('''
+    <div style="text-align: justify">
+    The predictive model (MHNfs) used in this application was specifically designed and
+    trained for low-data scenarios. The model predicts whether a molecule is active or
+    inactive. The predicted activity value is a continuous value between 0 and 1, and,
+    similar to a probability, the higher/lower the value, the more confident the model
+    is that the molecule is active/inactive.
+    The model was trained on the FS-Mol dataset which
+    includes 5120 tasks (roughly 5000 tasks were used for training, rest for evaluation).
+    The training tasks are listed here:
+    <a href="https://github.com/microsoft/FS-Mol/tree/main/datasets/targets"
+    target="_blank">https://github.com/microsoft/FS-Mol/tree/main/datasets/targets</a>.
+    </div>
+    ''')
+usage_text = ('''
+    <div style="text-align: justify">
+    To use this application, you need to provide <b>3 different sets of molecules</b>:
+    <ol>
+        <li><b>active</b> molecules: set of known active molecules,</li>
+        <li><b>inactive</b> molecules: set of known inactive molecules, and</li>
+        <li>molecules to <b>predict</b>: set of molecules you want to predict.</li>
+    </ol>
+    These three sets can be provided via the <b>sidebar</b>. The sidebar also includes two
+    buttons <b>predict</b> and <b>reset</b> to run the prediction pipeline and to
+    reset it.
+    </div>
+    ''')
+data_text = ('''
+    <div style="text-align: justify">
+    <ul>
+        <li> Molecules have to be provided in SMILES format</li>
+        <li> For each input, the maximum number of molecules which can be provided is
+        restricted to 20 </li>
+        <li> You can provide the molecules via the text boxes or via CSV upload
+            <ul>
+                <li> Text box
+                    <ul>
+                        <li> Replace the pseudo input by directly typing your molecules
+                        into
+                        the text box </li>
+                        <li> Separate the molecules by comma </li>
+                    </ul>
+                </li>
+                <li> CSV upload
+                    <ul>
+                        <li> The CSV file should include a "smiles" column (both upper
+                        and lower case "SMILES" are accepted) </li>
+                        <li> All other columns will be ignored </li>
+                        <li> Examples are provided here:
+                            <div style="background-color: #efefef">
+                            assets/example_csv/
+                            </div> </li>
+                    </ul>
+                </li>
+            </ul>
+        </li>
+    </ul>
+    </div>
+    ''')
+trust_text = ('''
+    <div style="text-align: justify">
+    Just like all other machine learning models, the performance of MHNfs varies
+    and, generally, the model works well if the task is somehow close to tasks which
+    were used to train the model. The model performance for very different tasks is
+    unclear and might be poor.<br>
+    <br>
+    MHNfs was trained on the FS-Mol dataset which includes 5120 tasks (roughly
+    5000 tasks were used for training, rest for evaluation). The training tasks are
+    listed here: <a href= https://github.com/microsoft/FS-Mol/tree/main/datasets/targets
+    target="_blank">https://github.com/microsoft/FS-Mol/tree/main/datasets/targets</a>.
+    </div>
+    ''')
+example_trustworthy_text = ('''
+    <div style="text-align: justify">
+    Since the predictive model has seen a lot of kinase related tasks during training,
+    the model is expected to generally perform well on kinase targets. For this example,
+    we use data for the target
+    <a href=https://www.ebi.ac.uk/chembl/target_report_card/CHEMBL5914/
+    target="_blank">CHEMBL5914</a>. Notably, this specific kinase has not been seen
+    during training. Precisely, we use the available inhibition data while molecules
+    with an inhibition value greater (smaller) than 50 % are considered as active
+    (inactive).<br>
+    From the known available data, we have selected 4 "known" active molecules,
+    8 "known" inactive molecules, and 11 molecules to predict.<br>
+    <b>Molecules to predict</b>:
+    <div style="background-color: #efefef">
+    FC(F)(F)c1ccc(Cl)cc1CN1CCNc2ncc(-c3ccnc(N4CCNCC4)c3)cc21,<br>
+    CS(=O)(=O)c1ccc(-n2nc(-c3cnc4[nH]ccc4c3)c3c(N)ncnc32)cc1,<br>
+    O=C(Nc1ccccc1Cl)c1cnc2ccc(C3CCNCC3)cn12.O=C(O)C(=O)O,<br>
+    CC(C)n1cnc2c(Nc3cccc(Cl)c3)nc(N[C@@H]3CCCC[C@@H]3N)nc21,<br>
+    Nc1ncc(-c2ccc(NS(=O)(=O)C3CC3)cc2F)cc1-c1ccc2c(c1)CCNC2=O,<br>
+    CCN1CCN(Cc2ccc(NC(=O)c3ccc(C)c(C#Cc4cccnc4)c3)cc2C(F)(F)F)CC1,<br>
+    CN1CCN(c2ccc(-c3cnc4c(c3)N(Cc3cc(Cl)ccc3C(F)(F)F)CCN4)cn2)CC1,<br>
+    CC(C)n1nc(-c2cnc(N)c(OC(F)(F)F)c2)cc1[C@H]1[C@@H]2CN(C3COC3)C[C@@H]21,<br>
+    Nc1ncc(-c2cc([C@H]3[C@@H]4CN(C5COC5)C[C@@H]43)n(CC3CC3)n2)cc1C(F)(F)F,<br>
+    Cc1ccc(NC(=O)C2(C(=O)Nc3ccc(Nc4ncc(F)c(-c5cc(F)c6nc(C)n(C(C)C)c6c5)n4)cc3)CC2)cc1,<br>
+    C[C@@H](Oc1cc(-c2cnn(C3CCNCC3)c2)cnc1N)c1c(Cl)ccc(F)c1Cl
+    </div><br>
+    <b>Known active molecules</b>:
+    <div style="background-color: #efefef">
+    CC(=O)N1CCN(c2cc(-c3cnc4c(c3)N(Cc3cc(Cl)ccc3C(F)(F)F)CCN4)ccn2)CC1,<br>
+    CS(=O)(=O)c1cccc(Nc2nccc(-c3sc(N4CCOCC4)nc3-c3cccc(NS(=O)(=O)c4c(F)cccc4F)c3)n2)c1,<br>
+    COc1cnccc1Nc1nc(-c2nn(Cc3c(F)cc(OCCO)cc3F)c3ccccc23)ncc1OC,<br>
+    CN(C)[C@@H]1CC[C@@]2(C)[C@@H](CC[C@@H]3[C@@H]2CC[C@]2(C)C(c4cccc5cnccc45)=CC[C@@H]32)C1<br>
+    </div><br>
+    <b>Known inactive molecules</b>:
+    <div style="background-color: #efefef">
+    c1cc(-c2c[nH]c3cnccc23)ccn1,<br>
+    COc1ccc2c3ccnc(C(F)(F)F)c3n(CCCCN)c2c1,<br>
+    CNS(=O)(=O)c1ccc(N(C)C)c(Nc2ncnc3cc(OC)c(OC)cc23)c1,<br>
+    CN(C1CC1)S(=O)(=O)c1ccc(-c2cnc(N)c(-c3ccc4c(c3)CCNC4=O)c2)c(F)c1,<br>
+    CCN1CCN(Cc2ccc(NC(=O)c3ccc(C)c(C#Cc4cnc5[nH]ccc5c4)c3)cc2C(F)(F)F)CC1,<br>
+    CC(C)n1cc(-c2cc(-c3ccc(CN4CCOCC4)cc3)cnc2N)nn1,<br>
+    CC(C)(O)[C@H](F)CN1Cc2cc(NC(=O)c3cnn4cccnc34)c(N3CCOCC3)cc2C1=O,<br>
+    [2H]C([2H])([2H])C1(C([2H])([2H])[2H])Cn2nc(-c3ccc(F)cn3)c(-c3ccnc4[nH]ncc34)c2CO1<br>
+    </div><br>
+    <b>Predictions</b>:<br>
+    </div>
+    ''')
+example_nottrustworthy_text = ('''
+    <div style="text-align: justify">
+    For this example, we use data for the auxiliary transport protein target
+    <a href=https://www.ebi.ac.uk/chembl/target_report_card/CHEMBL5738/
+    target="_blank">CHEMBL5738</a>. Precisely, we use the available Ki data
+    while molecules with a pCHEMBL value greater (smaller) than 5 are considered
+    as active (inactive).<br>
+    From the known available data, we have selected 4 "known" active molecules,
+    3 "known" inactive molecules, and 10 molecules to predict.<br>
+    <b>Molecules to predict</b>:
+    <div style="background-color: #efefef">
+    CC(C(=O)O)c1ccc(-c2ccccc2)c(F)c1,<br>
+    O=S(=O)(O)Oc1cccc2cccc(Nc3ccccc3)c12,<br>
+    CCCCCCCC/C=C\CCCCCCCC(=O)O,<br>
+    C[C@]12C=CC(=O)C=C1CC[C@@H]1[C@@H]2[C@@H](O)C[C@@]2(C)[C@H]1CC[C@]2(O)C(=O)CO,<br>
+    CCOC(=O)C(C)(C)Oc1ccc(Cl)cc1,<br>
+    Cc1ccc(Cl)c(Nc2ccccc2C(=O)O)c1Cl,<br>
+    O=C(O)Cc1ccccc1Nc1c(Cl)cccc1Cl,<br>
+    CC(C)(Oc1ccc(CCNC(=O)c2ccc(Cl)cc2)cc1)C(=O)O,<br>
+    O=C(c1ccccc1)c1ccc2n1CCC2C(=O)O,<br>
+    CC(C)OC(=O)C(C)(C)Oc1ccc(C(=O)c2ccc(Cl)cc2)cc1<br>
+    </div><br>
+    <b>Known active molecules</b>:
+    <div style="background-color: #efefef">
+    CC(C(=O)O)c1ccc(N2Cc3ccccc3C2=O)cc1,<br>
+    CN1C(=O)CN=C(c2ccccc2)c2cc(Cl)ccc21,<br>
+    CC(C)(Oc1ccc(C(=O)c2ccc(Cl)cc2)cc1)C(=O)O,<br>
+    CC(=O)[C@H]1CC[C@H]2[C@@H]3CCC4=CC(=O)CC[C@]4(C)[C@H]3CC[C@]12C
+    </div><br>
+    <b>Known inactive molecules</b>:
+    <div style="background-color: #efefef">
+    CC(C)Cc1ccc(C(C)C(=O)O)cc1,<br>
+    O=C1Nc2ccc(Cl)cc2C(c2ccccc2Cl)=NC1O,<br>
+    C[C@@H]1C[C@H]2[C@@H]3CCC4=CC(=O)C=C[C@]4(C)[C@@]3(F)[C@@H](O)C[C@]2(C)[C@@]1(O)C(=O)CO
+    </div><br>
+    <b>Predictions</b>:<br>
+    </div>
+    ''')

src/app/layout.py ADDED Viewed

	@@ -0,0 +1,439 @@

+"""
+This file defines the layout of the app including the header, sidebar, and tabs in the
+main content area.
+"""
+#---------------------------------------------------------------------------------------
+# Imports
+import streamlit as st
+import streamlit.components.v1 as components
+from PIL import Image
+import pandas as pd
+import yaml
+from src.data_preprocessing.create_descriptors import handle_inputs
+from src.app.constants import (summary_text,
+                               mhnfs_text,
+                               citation_text,
+                               few_shot_learning_text,
+                               under_the_hood_text,
+                               usage_text,
+                               data_text,
+                               trust_text,
+                               example_trustworthy_text,
+                               example_nottrustworthy_text)
+#---------------------------------------------------------------------------------------
+# Global variables
+MAX_INPUT_LENGTH = 20
+#---------------------------------------------------------------------------------------
+# Functions
+class LayoutMaker():
+    """
+    This class includes all the design choices regarding the layout of the app. This
+    class can be used in the main file to define header, sidebar, and main content area.
+    """
+    def __init__(self):
+        # Initialize the inputs dictionary
+        self.inputs = dict() # this will be the storage for query and support set inputs
+        self.inputs_lists = dict()
+        # Initialize prediction storage
+        self.predictions = None
+        # Buttons
+        self.buttons = dict() # this will be the storage for buttons
+        # content
+        self.summary_text = summary_text
+        self.mhnfs_text = mhnfs_text
+        self.citation_text = citation_text
+        self.few_shot_learning_text = few_shot_learning_text
+        self.under_the_hood_text = under_the_hood_text
+        self.usage_text = usage_text
+        self.data_text = data_text
+        self.trust_text = trust_text
+        self.example_trustworthy_text = example_trustworthy_text
+        self.example_nottrustworthy_text = example_nottrustworthy_text
+        self.df_trustworthy = pd.read_csv("./assets/example_csv/predictions/"
+                                          "trustworthy_example.csv")
+        self.df_nottrustworthy = pd.read_csv("./assets/example_csv/predictions/"
+                                            "nottrustworthy_example.csv")
+        self.max_input_length = MAX_INPUT_LENGTH
+    def make_sidebar(self):
+        """
+        This function defines the sidebar of the app. It includes the logo, query box,
+        support set boxes, and predict buttons.
+        It returns the stored inputs (for query and support set) and the buttons which
+        allow for user interactions.
+        """
+        with st.sidebar:
+            # Logo
+            logo = Image.open("./assets/logo.png")
+            st.image(logo)
+            st.divider()
+            # Query box
+            self._make_query_box()
+            st.divider()
+            # Support set actives box
+            self._make_active_support_set_box()
+            st.divider()
+            # Support set inactives box
+            self._make_inactive_support_set_box()
+            st.divider()
+            # Predict buttons
+            self.buttons["predict"] = st.button("Predict...")
+            self.buttons["reset"] = st.button("Reset")
+        return self.inputs, self.buttons
+    def make_header(self):
+        """
+        This function defines the header of the app. It consists only of a png image
+        in which the title and an overview is given.
+        """
+        header_container = st.container()
+        with header_container:
+            header = Image.open("./assets/header.png")
+            st.image(header)
+    def make_main_content_area(self,
+                               predictor,
+                               inputs,
+                               buttons,
+                               create_prediction_df: callable,
+                               create_molecule_grid_plot: callable):
+        tab1, tab2, tab3, tab4 = st.tabs(["Predictions",
+                                    "Paper / Cite",
+                                    "Additional Information",
+                                    "Examples"])
+        # Results tab
+        with tab1:
+            self._fill_tab_with_results_content(predictor,
+                                                inputs,
+                                                buttons,
+                                                create_prediction_df,
+                                                create_molecule_grid_plot)
+        # Paper tab
+        with tab2:
+            self._fill_paper_and_citation_tab()
+        # More explanations tab
+        with tab3:
+            self._fill_more_explanations_tab()
+        with tab4:
+            self._fill_examples_tab()
+    def _make_query_box(self):
+        """
+        This function
+        a) defines the query box and
+        b) stores the query input in the inputs dictionary
+        """
+        st.info(":blue[Molecules to predict:]", icon="❓")
+        query_container = st.container()
+        with query_container:
+            input_choice = st.radio(
+                "Input your data in SMILES notation via:", ["Text box", "CSV upload"]
+            )
+            if input_choice == "Text box":
+                query_input = st.text_area(
+                    label="SMILES input for query molecules",
+                    label_visibility="hidden",
+                    key="query_textbox",
+                    value="CC(C)Sc1nc(C(C)(C)C)nc(OCC(=O)O)c1C#N, "
+                            "Cc1nc(NCc2cccnc2)cc(=O)n1CC(=O)O",
+                )
+            elif input_choice == "CSV upload":
+                query_file = st.file_uploader(key="query_csv",
+                                              label = "CSV upload for query mols",
+                                              label_visibility="hidden")
+                if query_file is not None:
+                    query_input = pd.read_csv(query_file)
+                else: query_input = None
+        # Update storage
+        self.inputs["query"] = query_input
+    def _make_active_support_set_box(self):
+        """
+        This function
+        a) defines the active support set box and
+        b) stores the active support set input in the inputs dictionary
+        """
+        st.info(":blue[Known active molecules:]", icon="✨")
+        active_container = st.container()
+        with active_container:
+            active_input_choice = st.radio(
+                "Input your data in SMILES notation via:",
+                ["Text box", "CSV upload"],
+                key="active_input_choice",
+            )
+            if active_input_choice == "Text box":
+                support_active_input = st.text_area(
+                    label="SMILES input for active support set molecules",
+                    label_visibility="hidden",
+                    key="active_textbox",
+                    value="Cc1nc(NCC2CCCCC2)c(C#N)c(=O)n1CC(=O)O, "
+                            "CSc1nc(C(C)C)nc(OCC(=O)O)c1C#N"
+                )
+            elif active_input_choice == "CSV upload":
+                support_active_file = st.file_uploader(
+                    key="support_active_csv",
+                    label = "CSV upload for active support set molecules",
+                    label_visibility="hidden"
+                    )
+                if support_active_file is not None:
+                    support_active_input  = pd.read_csv(support_active_file)
+                else: support_active_input = None
+        # Update storage
+        self.inputs["support_active"] = support_active_input
+    def _make_inactive_support_set_box(self):
+        st.info(":blue[Known inactive molecules:]", icon="✨")
+        inactive_container = st.container()
+        with inactive_container:
+            inactive_input_choice = st.radio(
+                "Input your data in SMILES notation via:",
+                ["Text box", "CSV upload"],
+                key="inactive_input_choice",
+            )
+            if inactive_input_choice == "Text box":
+                support_inactive_input  = st.text_area(
+                    label="SMILES input for inactive support set molecules",
+                    label_visibility="hidden",
+                    key="inactive_textbox",
+                    value="CSc1nc(C)nc(OCC(=O)O)c1C#N, "
+                            "CSc1nc(C)n(CC(=O)O)c(=O)c1C#N"
+                )
+            elif inactive_input_choice == "CSV upload":
+                support_inactive_file  = st.file_uploader(
+                    key="support_inactive_csv",
+                    label = "CSV upload for inactive support set molecules",
+                    label_visibility="hidden"
+                    )
+                if support_inactive_file is not None:
+                    support_inactive_input  = pd.read_csv(
+                        support_inactive_file
+                        )
+                else: support_inactive_input = None
+        # Update storage
+        self.inputs["support_inactive"] = support_inactive_input
+    def _fill_tab_with_results_content(self, predictor, inputs, buttons,
+                                       create_prediction_df, create_molecule_grid_plot):
+        tab_container = st.container()
+        with tab_container:
+            # Info
+            st.info(":blue[Summary:]", icon="🚀")
+            st.markdown(self.summary_text)
+            # Results
+            st.info(":blue[Results:]",icon="👨‍💻")
+            if buttons['predict']:
+                # Check 1: Are all inputs provided?
+                if (inputs['query'] is None or
+                    inputs['support_active'] is None or
+                    inputs['support_inactive'] is None):
+                        st.error("You didn't provide all necessary inputs.\n\n"
+                                 "Please provide all three necessary inputs via the "
+                                 "sidebar and hit the predict button again.")
+                else:
+                    # Check 2: Less than max allowed molecules provided?
+                    max_input_length = 0
+                    for key, input in inputs.items():
+                            input_list = handle_inputs(input)
+                            self.inputs_lists[key] = input_list
+                            max_input_length = max(max_input_length, len(input_list))
+                    if max_input_length > self.max_input_length:
+                        st.error("You provided too many molecules. The number of "
+                                 "molecules for each input is restricted to "
+                                f"{self.max_input_length}.\n\n"
+                                "For larger screenings, we suggest to clone the repo "
+                                "and to run the model locally.")
+                    else:
+                        # Progress bar
+                        progress_bar_text = ("I'm predicting activities. This might "
+                                                "need some minutes. Please wait...")
+                        progress_bar = st.progress(50, text=progress_bar_text)
+                        # Results table
+                        df = self._predict_and_create_results_table(predictor,
+                                                                    inputs,
+                                                                    create_prediction_df)
+                        progress_bar_text = ("Done. Here are the results:")
+                        progress_bar = progress_bar.progress(100, text=progress_bar_text)
+                        st.dataframe(df, use_container_width=True)
+                        col1, col2, col3, col4 = st.columns([1,1,1,1])
+                        # Provide download button for predictions
+                        with col2:
+                            self.buttons["download_results"] = st.download_button(
+                                "Download predictions as CSV",
+                                self._convert_df_to_binary(df),
+                                file_name="predictions.csv",
+                            )
+                        # Provide download button for inputs
+                        with col3:
+                            with open("inputs.yml", 'w') as fl:
+                                self.buttons["download_inputs"] = st.download_button(
+                                    "Download inputs as YML",
+                                    self._convert_to_yml(self.inputs_lists),
+                                    file_name="inputs.yml",
+                                )
+                        st.divider()
+                        # Results grid
+                        st.info(":blue[Grid plot of the predicted molecules:]",
+                                icon="📊")
+                        mol_html_grid = create_molecule_grid_plot(df)
+                        components.html(mol_html_grid, height=1000, scrolling=True)
+            elif buttons['reset']:
+                self._reset()
+    def _fill_paper_and_citation_tab(self):
+        st.info(":blue[**Paper: Context-enriched molecule representations improve "
+                "few-shot drug discovery**]", icon="📄")
+        st.markdown(self.mhnfs_text, unsafe_allow_html=True)
+        st.image("./assets/mhnfs_overview.png")
+        st.write("")
+        st.write("")
+        st.write("")
+        st.info(":blue[**Cite us / BibTex**]", icon="📚")
+        st.markdown(self.citation_text)
+    def _fill_more_explanations_tab(self):
+        st.info(":blue[**Under the hood**]", icon="⚙️")
+        st.markdown(self.under_the_hood_text, unsafe_allow_html=True)
+        st.write("")
+        st.write("")
+        st.info(":blue[**About few-shot learning and the model MHNfs**]", icon="🎯")
+        st.markdown(self.few_shot_learning_text, unsafe_allow_html=True)
+        st.write("")
+        st.write("")
+        st.info(":blue[**Usage**]", icon="🎛️")
+        st.markdown(self.usage_text, unsafe_allow_html=True)
+        st.write("")
+        st.write("")
+        st.info(":blue[**How to provide the data**]", icon="📀")
+        st.markdown(self.data_text, unsafe_allow_html=True)
+        st.write("")
+        st.write("")
+        st.info(":blue[**When to trust the predictions**]", icon="🔍")
+        st.markdown(self.trust_text, unsafe_allow_html=True)
+    def _fill_examples_tab(self):
+        st.info(":blue[**Example for trustworthy predictions**]", icon="✅")
+        st.markdown(self.example_trustworthy_text, unsafe_allow_html=True)
+        st.dataframe(self.df_trustworthy, use_container_width=True)
+        st.markdown("**Plot: Predictions for active and inactive molecules (model AUC="
+                    "0.96**)")
+        prediction_plot_tw = Image.open("./assets/example_csv/predictions/"
+                                        "trustworthy_example.png")
+        st.image(prediction_plot_tw)
+        st.write("")
+        st.write("")
+        st.info(":blue[**Example for not trustworthy predictions**]", icon="⛔️")
+        st.markdown(self.example_nottrustworthy_text, unsafe_allow_html=True)
+        st.dataframe(self.df_nottrustworthy, use_container_width=True)
+        st.markdown("**Plot: Predictions for active and inactive molecules (model AUC="
+                    "0.42**)")
+        prediction_plot_ntw = Image.open("./assets/example_csv/predictions/"
+                                        "nottrustworthy_example.png")
+        st.image(prediction_plot_ntw)
+    def _predict_and_create_results_table(self,
+                                          predictor,
+                                          inputs,
+                                          create_prediction_df: callable):
+            df = create_prediction_df(predictor,
+                                      inputs['query'],
+                                      inputs['support_active'],
+                                      inputs['support_inactive'])
+            return df
+    def _reset(self):
+        keys = list(st.session_state.keys())
+        for key in keys:
+            st.session_state.pop(key)
+    def _convert_df_to_binary(_self, df):
+        return df.to_csv(index=False).encode('utf-8')
+    def _convert_to_yml(_self, inputs):
+        return yaml.dump(inputs)
+        content = """
+        # Usage
+        As soon as you have a few active and inactive molecules for your task, you can
+        provide them here and make predictions for new molecules.
+        ## About few-shot learning and the model MHNfs
+        **Few-shot learning** is a machine learning sub-field which aims to provide
+        predictive models for scenarios in which only little data is known/available.
+        **MHNfs** is a few-shot learning model which is specifically designed for drug
+        discovery applications. It is built to use the input prompts in a way such that
+        the provided available knowledge - i.e. the known active and inactive molecules -
+        functions as context to predict the activity of the new requested molecules.
+        Precisely, the provided active and inactive molecules are associated with a
+        large set of general molecules - called context molecules - to enrich the
+        provided information and to remove spurious correlations arising from the
+        decoration of molecules. This is analogous to a Large Language Model which would
+        not only use the provided information in the current prompt as context but would
+        also have access to way more information, e.g. a prompting history.
+        ## How to provide the data
+        * Molecules have to be provided in SMILES format.
+        * You can provide the molecules via the text boxes or via CSV upload.
+            - Text box: Replace the pseudo input by directly typing your molecules into
+            the text box. Please separate the molecules by comma.
+            - CSV upload: Upload a CSV file with the molecules.
+                * The CSV file should include a smiles column (both upper and lower
+                case "SMILES" are accepted).
+                * All other columns will be ignored.
+        ## When to trust the predictions
+        Just like all other machine learning models, the performance of MHNfs varies
+        and, generally, the model works well if the task is somehow close to tasks which
+        were used to train the model. The model performance for very different tasks is
+        unclear and might be poor.
+        MHNfs was trained on a the FS-Mol dataset which includes 5120 tasks (Roughly
+        5000 tasks were used for training, rest for evaluation). The training tasks are
+        listed here: https://github.com/microsoft/FS-Mol/tree/main/datasets/targets.
+        """
+        return content

src/app/prediction_utils.py ADDED Viewed

	@@ -0,0 +1,33 @@

+"""
+This module includes all functions which are called from the main app and are needed to
+make activity predictions and to output the results.
+"""
+#---------------------------------------------------------------------------------------
+# Dependencies
+import pandas as pd
+import mols2grid
+#---------------------------------------------------------------------------------------
+# Define functions
+def create_prediction_df(predictor, query_smiles, support_activces_smiles,
+                         support_inactives_smiles):
+    """
+    This function creates a dataframe with the query molecules and the corresponding
+    predictions.
+    """
+    # Make predictions
+    predictions = predictor.predict(query_smiles, support_activces_smiles,
+                                    support_inactives_smiles)
+    smiles = predictor._return_query_mols_as_list()
+    # Create dataframe
+    prediction_df = pd.DataFrame({"Molecule": smiles,
+                                  "Predicted activity": predictions.astype('str')})
+    return prediction_df
+def create_molecule_grid_plot(df, smiles_col="Molecule"):
+    mol_html_grid = mols2grid.display(df,smiles_col=smiles_col)._repr_html_()
+    return mol_html_grid

src/data_preprocessing/__init__.py ADDED Viewed

File without changes

src/data_preprocessing/__pycache__/__init__.cpython-36.pyc ADDED Viewed

Binary file (167 Bytes). View file

src/data_preprocessing/__pycache__/__init__.cpython-37.pyc ADDED Viewed

Binary file (173 Bytes). View file

src/data_preprocessing/__pycache__/constants.cpython-37.pyc ADDED Viewed

Binary file (1.61 kB). View file

src/data_preprocessing/__pycache__/create_descriptors.cpython-36.pyc ADDED Viewed

Binary file (2.39 kB). View file

src/data_preprocessing/__pycache__/create_descriptors.cpython-37.pyc ADDED Viewed

Binary file (4.19 kB). View file

src/data_preprocessing/__pycache__/create_model_inputs.cpython-37.pyc ADDED Viewed

Binary file (1.29 kB). View file

src/data_preprocessing/__pycache__/utils.cpython-37.pyc ADDED Viewed

Binary file (7.49 kB). View file

src/data_preprocessing/constants.py ADDED Viewed

	@@ -0,0 +1,11 @@

+# Indices of the RDKit descriptors which are used as model features (0-16 and 25-207,
+# i.e. 200 entries). In create_descriptors.py the list is used to select these
+# positions from the values computed with the functions in Descriptors._descList.
+USED_200_DESCR = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,25,26,27,28,29,30, 31,32,33,
+                  34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,
+                  57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,
+                  80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,
+                  102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,
+                  119,120,121,122,123,124,125,126,127,128,129,130,131,132,133,134,135,
+                  136,137,138,139,140,141,142,143,144,145,146,147,148,149,150,151,152,
+                  153,154,155,156,157,158,159,160,161,162,163,164,165,166,167,168,169,
+                  170,171,172,173,174,175,176,177,178,179,180,181,182,183,184,185,186,
+                  187,188,189,190,191,192,193,194,195,196,197,198,199,200,201,202,203,
+                  204,205,206,207]

src/data_preprocessing/create_descriptors.py ADDED Viewed

	@@ -0,0 +1,148 @@

+"""
+This file includes all necessary code to preprocess molecules (assumed to be in SMILES
+format) and create descriptors which can be fed into MHNfs.
+"""
+#---------------------------------------------------------------------------------------
+# Dependencies
+import numpy as np
+import pandas as pd
+import pickle
+from typing import List
+from rdkit import Chem, DataStructs
+from rdkit.Chem.rdchem import Mol
+from rdkit.Chem import Descriptors, rdFingerprintGenerator
+from src.data_preprocessing.constants import USED_200_DESCR
+from src.data_preprocessing.utils import Standardizer
+#---------------------------------------------------------------------------------------
+# Define main function
+def preprocess_molecules(input_molecules: [str, List[str], pd.DataFrame]):
+    """
+    This function preprocesses molecules (assumed to be in SMILES format) and creates
+    descriptors which can be fed into MHNfs.
+    """
+    # Load needed objects
+    current_loc = __file__.rsplit("/",3)[0]
+    with open(current_loc + "/assets/data_preprocessing_objects/scaler_fitted.pkl",
+              "rb") as fl:
+        scaler = pickle.load(fl)
+    with open(current_loc + "/assets/data_preprocessing_objects/ecdfs.pkl", "rb") as fl:
+        ecdfs = pickle.load(fl)
+    # Ensure that input_molecules is an Iterable with strs
+    input_smiles = handle_inputs(input_molecules)
+    # Create cleanded rdkit mol objects
+    input_molecules = create_cleaned_mol_objects(input_smiles)
+    # Create fingerprints and descriptors
+    ecfps = create_ecfp_fps(input_molecules)
+    rdkit_descrs = create_rdkit_descriptors(input_molecules)
+    # Create quantils
+    rdkit_descr_quantils = create_quantils(rdkit_descrs, ecdfs)
+    # Concatenate features
+    raw_features = np.concatenate((ecfps, rdkit_descr_quantils), axis=1)
+    # Normalize feature vectors
+    normalized_features = scaler.transform(raw_features)
+    # Return feature vectors
+    return normalized_features
+#---------------------------------------------------------------------------------------
+# Define helper functions
+def handle_inputs(input_molecules: [str, List[str], pd.DataFrame]):
+    """
+    This function handles the input molecules.
+    """
+    if isinstance(input_molecules, list):
+        return input_molecules
+    elif isinstance(input_molecules, pd.DataFrame):
+        input_molecules.columns =  [c.lower() for c in input_molecules.columns]
+        if "smiles" not in input_molecules.columns:
+            raise ValueError(("Input DataFrame must have a column named 'Smiles'."))
+        iterable = list(input_molecules["smiles"].values)
+        return iterable
+    elif isinstance(input_molecules, str):
+        smiles_list = input_molecules.split(",")
+        smiles_list_cleaned = [smiles.strip() for smiles in smiles_list]
+        smiles_list_cleaned = [smiles for smiles in smiles_list_cleaned if smiles != ""]
+        return smiles_list_cleaned
+    else:
+        raise TypeError(("Input molecules must be a string,a list of strings or a "
+                         "pandas DataFrame."))
+def create_ecfp_fps(mols: List[Mol]) -> np.ndarray:
+    """
+    This function ECFP fingerprints for a list of molecules.
+    """
+    ecfps = list()
+    for mol in mols:
+        fp_sparse_vec = rdFingerprintGenerator.GetCountFPs(
+            [mol], fpType=rdFingerprintGenerator.MorganFP
+        )[0]
+        fp = np.zeros((0,), np.int8)
+        DataStructs.ConvertToNumpyArray(fp_sparse_vec, fp)
+        ecfps.append(fp)
+    return np.array(ecfps)
+def create_rdkit_descriptors(mols: List[Mol]) -> np.ndarray:
+    """
+    This function creates RDKit descriptors for a list of molecules.
+    """
+    rdkit_descriptors = list()
+    for mol in mols:
+        descrs = []
+        for _, descr_calc_fn in Descriptors._descList:
+            descrs.append(descr_calc_fn(mol))
+        descrs = np.array(descrs)
+        descrs = descrs[USED_200_DESCR]
+        rdkit_descriptors.append(descrs)
+    return np.array(rdkit_descriptors)
+def create_quantils(raw_features: np.ndarray, ecdfs: list) -> np.ndarray:
+    quantils = np.zeros_like(raw_features)
+    for column in range(raw_features.shape[1]):
+        raw_values = raw_features[:, column].reshape(-1)
+        ecdf = ecdfs[column]
+        q = ecdf(raw_values)
+        quantils[:, column] = q
+    return quantils
+def create_cleaned_mol_objects(smiles: List[str]) -> List[Mol]:
+    """
+    This function creates cleaned RDKit mol objects from a list of SMILES.
+    """
+    sm = Standardizer(canon_taut=True)
+    mols = list()
+    for smile in smiles:
+        #try:
+        mol = Chem.MolFromSmiles(smile)
+        standardized_mol, _ = sm.standardize_mol(mol)
+        can_mol = Chem.MolFromSmiles(Chem.MolToSmiles(standardized_mol))
+        mols.append(can_mol)
+    return mols
+#---------------------------------------------------------------------------------------

src/data_preprocessing/create_model_inputs.py ADDED Viewed

	@@ -0,0 +1,46 @@

+"""
+In this file, the input functions for query and support set molecules are defined.
+Input is assumed to be either a SMILES string, a list of SMILES strings, or a pandas
+dataframe.
+"""
+#---------------------------------------------------------------------------------------
+# Dependencies
+import pandas as pd
+from typing import List
+import torch
+from src.data_preprocessing.create_descriptors import preprocess_molecules
+#---------------------------------------------------------------------------------------
+# Define main functions
+def create_query_input(smiles_input: [str, List[str], pd.DataFrame]):
+    """
+    This function creates the input for the query molecules.
+    """
+    # Create vector representation
+    numpy_vector_representation = preprocess_molecules(smiles_input)
+    assert len(numpy_vector_representation.shape) == 2
+    # Create pytorch tensor
+    tensor = torch.from_numpy(numpy_vector_representation).unsqueeze(1).float()
+    return tensor
+def create_support_set_input(smiles_input: [str, List[str], pd.DataFrame]):
+    """
+    This function creates the input for the support set molecules.
+    """
+    # Create vector representation
+    numpy_vector_representation = preprocess_molecules(smiles_input)
+    assert len(numpy_vector_representation.shape) == 2
+    size = numpy_vector_representation.shape[0]
+    # Create pytorch tensors
+    tensor = torch.from_numpy(numpy_vector_representation).unsqueeze(0).float()
+    size = torch.tensor(size)
+    return tensor, size