Spaces · Build error
Commit e0cf56d · Parent(s): 199b9a9

Fix Space

Files changed:
- app.py +36 -18
- examples/femur-right.mp4 +0 -0
- examples/kidney-left.mp4 +0 -0
- requirements.txt +5 -1
- utils/__init__.py +0 -0
- utils/constants.py +27 -0
- utils/predict.py +84 -0
app.py
CHANGED
@@ -1,26 +1,44 @@
+import glob
+
 import gradio as gr
 import tensorflow as tf
-from huggingface_hub import from_pretrained_keras
 
-…
-…
-…
+from utils.predict import predict_label
+
+##Create list of examples to be loaded
+example_list = glob.glob("examples/*.mp4")
+example_list = list(map(lambda el:[el], example_list))
+
+demo = gr.Blocks()
+
+with demo:
+    gr.Markdown("# **<p align='center'>Video Vision Transformer on medmnist</p>**")
+
+    with gr.Tab("Upload & Predict"):
+        with gr.Box():
+            with gr.Row():
+                input_video = gr.Video(label="Input Video", show_label=True)
+                output_label = gr.Label(label="Model Output", show_label=True)
+
+        gr.Markdown("**Predict**")
+
+        with gr.Box():
+            with gr.Row():
+                submit_button = gr.Button("Submit")
 
-…
-    return model.predict(tf.expand_dims(video, axis=0))[0]
+        gr.Markdown("The model is trained to classify videos belonging to the following classes: liver, kidney-right, kidney-left, femur-right, femur-left, bladder, heart, lung-right, lung-left, spleen and pancreas")
 
-…
+        gr.Examples(example_list, [input_video], [output_label], predict_label, cache_examples=True)
 
-…
+        submit_button.click(predict_label, inputs=input_video, outputs=output_label)
 
-…
-    fn = infer,
-    inputs = "video",
-    outputs = "number",
-    description = description,
-    title = title,
-    article = article,
-    examples=["example_1.mp4", "example_2.mp4"]
-)
+    gr.Markdown('\n Demo created by: <a href=\"https://huggingface.co/pablorodriper\"> Pablo Rodríguez</a> <br> Based on the Keras example by <a href=\"https://keras.io/examples/vision/vivit/\">Aritra Roy Gosthipaty and Ayush Thakur</a>')
 
-…
+demo.launch()
+
+
+
+
+
+
+
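For orientation (not part of the commit): both the cached gr.Examples and the Submit button route through predict_label, so the Space can be smoke-tested locally with something like the sketch below, assuming the repo layout from this commit and its pinned dependencies:

    # Not in the commit: a minimal local smoke test of the prediction path.
    from utils.predict import predict_label

    for path in ["examples/femur-right.mp4", "examples/kidney-left.mp4"]:
        print(path, "->", predict_label(path))  # prints one of the eleven class names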
examples/femur-right.mp4
ADDED

Binary file (7.7 kB)

examples/kidney-left.mp4
ADDED

Binary file (7.19 kB)
requirements.txt
CHANGED
@@ -1 +1,5 @@
-…
+transformers==4.23
+huggingface_hub>0.10
+tensorflow>2.6
+gradio
+opencv-python
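For orientation (not part of the commit), each pin maps onto one piece of the Space; the mapping below is inferred from the imports elsewhere in this commit rather than stated anywhere in it:

    transformers==4.23    # not imported directly by any file in this commit
    huggingface_hub>0.10  # from_pretrained_keras in utils/predict.py
    tensorflow>2.6        # model inference and the tf.data pipeline
    gradio                # the Blocks UI in app.py
    opencv-python         # cv2 video decoding in utils/predict.py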
utils/__init__.py
ADDED
File without changes
utils/constants.py
ADDED
@@ -0,0 +1,27 @@
+import tensorflow as tf
+
+# DATA
+DATASET_NAME = "organmnist3d"
+BATCH_SIZE = 32
+AUTO = tf.data.AUTOTUNE
+INPUT_SHAPE = (28, 28, 28, 1)
+NUM_CLASSES = 11
+
+# OPTIMIZER
+LEARNING_RATE = 1e-4
+WEIGHT_DECAY = 1e-5
+
+# TRAINING
+EPOCHS = 80
+
+# TUBELET EMBEDDING
+PATCH_SIZE = (8, 8, 8)
+NUM_PATCHES = (INPUT_SHAPE[0] // PATCH_SIZE[0]) ** 2
+
+# ViViT ARCHITECTURE
+LAYER_NORM_EPS = 1e-6
+PROJECTION_DIM = 128
+NUM_HEADS = 8
+NUM_LAYERS = 8
+
+labels = ['liver', 'kidney-right', 'kidney-left', 'femur-right', 'femur-left', 'bladder', 'heart', 'lung-right', 'lung-left', 'spleen', 'pancreas']
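As a sanity check on the derived constant (not part of the commit): with 28-pixel spatial dimensions and 8-pixel tubelets, integer division gives 3 tubelets per axis, so the formula evaluates to 9. Note the exponent of 2 rather than 3; this matches the formula in the upstream Keras ViViT example this Space is based on.

    # Not in the commit: evaluating NUM_PATCHES by hand.
    INPUT_SHAPE = (28, 28, 28, 1)
    PATCH_SIZE = (8, 8, 8)
    assert (INPUT_SHAPE[0] // PATCH_SIZE[0]) ** 2 == 9  # 28 // 8 == 3, squared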
utils/predict.py
ADDED
@@ -0,0 +1,84 @@
+import cv2
+import numpy as np
+import tensorflow as tf
+from huggingface_hub import from_pretrained_keras
+from tensorflow.keras.optimizers import Adam
+
+from .constants import LEARNING_RATE
+
+def get_model():
+    """
+    Download the model from the Hugging Face Hub and compile it.
+    """
+    model = from_pretrained_keras("pablorodriper/video-vision-transformer")
+
+    model.compile(
+        optimizer=Adam(learning_rate=LEARNING_RATE),
+        loss="sparse_categorical_crossentropy",
+        # metrics=[
+        #     keras.metrics.SparseCategoricalAccuracy(name="accuracy"),
+        #     keras.metrics.SparseTopKCategoricalAccuracy(5, name="top-5-accuracy"),
+        # ],
+    )
+
+    return model
+
+
+model = get_model()
+labels = ['liver', 'kidney-right', 'kidney-left', 'femur-right', 'femur-left', 'bladder', 'heart', 'lung-right', 'lung-left', 'spleen', 'pancreas']
+
+
+def predict_label(path):
+    frames = load_video(path)
+    dataloader = prepare_dataloader(frames)
+    prediction = model.predict(dataloader)[0]
+    label = np.argmax(prediction, axis=0)
+    label = labels[label]
+
+    return label
+
+
+def load_video(path):
+    """
+    Load video from path and return a list of frames.
+    The video is converted to grayscale because it is the format expected by the model.
+    """
+    cap = cv2.VideoCapture(path)
+    frames = []
+    try:
+        while True:
+            ret, frame = cap.read()
+            if not ret:
+                break
+            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
+            frames.append(frame)
+    finally:
+        cap.release()
+    return np.array(frames)
+
+
+def prepare_dataloader(video):
+    video = tf.expand_dims(video, axis=0)
+    dataset = tf.data.Dataset.from_tensor_slices((video, np.array([0])))
+
+    dataloader = (
+        dataset.map(preprocess, num_parallel_calls=tf.data.AUTOTUNE)
+        .batch(1)
+        .prefetch(tf.data.AUTOTUNE)
+    )
+    return dataloader
+
+
+@tf.function
+def preprocess(frames: tf.Tensor, label: tf.Tensor):
+    """Preprocess the frames tensors and parse the labels."""
+    # Preprocess images
+    frames = tf.image.convert_image_dtype(
+        frames[
+            ..., tf.newaxis
+        ],  # The new axis is to help for further processing with Conv3D layers
+        tf.float32,
+    )
+    # Parse label
+    label = tf.cast(label, tf.float32)
+    return frames, label
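To make the data flow concrete (not part of the commit): load_video returns a (num_frames, 28, 28) grayscale array, prepare_dataloader wraps it in a single-element dataset with a dummy label, preprocess appends a channel axis, and batch(1) restores the leading batch dimension, so the model receives a (1, num_frames, 28, 28, 1) tensor. A minimal shape check, assuming this repo layout (note that importing utils.predict downloads and compiles the model at import time):

    # Not in the commit: tracing shapes through prepare_dataloader with a
    # synthetic stand-in for a decoded example video.
    import numpy as np
    from utils.predict import prepare_dataloader

    fake_video = np.zeros((28, 28, 28), dtype=np.uint8)  # (num_frames, H, W) grayscale
    frames, label = next(iter(prepare_dataloader(fake_video)))
    print(frames.shape)  # (1, 28, 28, 28, 1): batch, frames, height, width, channel
    print(label.shape)   # (1,): the dummy label added by prepare_dataloader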