Spaces · Build error
Commit e0cf56d · Parent(s): 199b9a9

Fix Space

Files changed:
- app.py +36 -18
- examples/femur-right.mp4 +0 -0
- examples/kidney-left.mp4 +0 -0
- requirements.txt +5 -1
- utils/__init__.py +0 -0
- utils/constants.py +27 -0
- utils/predict.py +84 -0
app.py
CHANGED
@@ -1,26 +1,44 @@
+import glob
+
 import gradio as gr
 import tensorflow as tf
-from huggingface_hub import from_pretrained_keras
 
-…
-…
-…
+from utils.predict import predict_label
+
+##Create list of examples to be loaded
+example_list = glob.glob("examples/*.mp4")
+example_list = list(map(lambda el:[el], example_list))
+
+demo = gr.Blocks()
+
+with demo:
+    gr.Markdown("# **<p align='center'>Video Vision Transformer on medmnist</p>**")
+
+    with gr.Tab("Upload & Predict"):
+        with gr.Box():
+            with gr.Row():
+                input_video = gr.Video(label="Input Video", show_label=True)
+                output_label = gr.Label(label="Model Output", show_label=True)
+
+        gr.Markdown("**Predict**")
+
+        with gr.Box():
+            with gr.Row():
+                submit_button = gr.Button("Submit")
 
-…
-    return model.predict(tf.expand_dims(video, axis=0))[0]
+        gr.Markdown("The model is trained to classify videos belonging to the following classes: liver, kidney-right, kidney-left, femur-right, femur-left, bladder, heart, lung-right, lung-left, spleen and pancreas")
 
-…
+        gr.Examples(example_list, [input_video], [output_label], predict_label, cache_examples=True)
 
-…
+        submit_button.click(predict_label, inputs=input_video, outputs=output_label)
 
-…
-    fn = infer,
-    inputs = "video",
-    outputs = "number",
-    description = description,
-    title = title,
-    article = article,
-    examples=["example_1.mp4", "example_2.mp4"]
-)
+    gr.Markdown('\n Demo created by: <a href=\"https://huggingface.co/pablorodriper\"> Pablo Rodríguez</a> <br> Based on the Keras example by <a href=\"https://keras.io/examples/vision/vivit/\">Aritra Roy Gosthipaty and Ayush Thakur</a>')
 
-…
+demo.launch()
+
+
+
+
+
+
+
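For orientation (not part of the commit): both the cached gr.Examples and the Submit button route through predict_label, so the Space can be smoke-tested locally with something like the sketch below, assuming the repo layout from this commit and its pinned dependencies:

    # Not in the commit: a minimal local smoke test of the prediction path.
    from utils.predict import predict_label

    for path in ["examples/femur-right.mp4", "examples/kidney-left.mp4"]:
        print(path, "->", predict_label(path))  # prints one of the eleven class names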
examples/femur-right.mp4
ADDED

Binary file (7.7 kB)

examples/kidney-left.mp4
ADDED

Binary file (7.19 kB)
requirements.txt
CHANGED
@@ -1 +1,5 @@
-…
+transformers==4.23
+huggingface_hub>0.10
+tensorflow>2.6
+gradio
+opencv-python
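For orientation (not part of the commit), each pin maps onto one piece of the Space; the mapping below is inferred from the imports elsewhere in this commit rather than stated anywhere in it:

    transformers==4.23    # not imported directly by any file in this commit
    huggingface_hub>0.10  # from_pretrained_keras in utils/predict.py
    tensorflow>2.6        # model inference and the tf.data pipeline
    gradio                # the Blocks UI in app.py
    opencv-python         # cv2 video decoding in utils/predict.py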
utils/__init__.py
ADDED
File without changes
utils/constants.py
ADDED
@@ -0,0 +1,27 @@
+import tensorflow as tf
+
+# DATA
+DATASET_NAME = "organmnist3d"
+BATCH_SIZE = 32
+AUTO = tf.data.AUTOTUNE
+INPUT_SHAPE = (28, 28, 28, 1)
+NUM_CLASSES = 11
+
+# OPTIMIZER
+LEARNING_RATE = 1e-4
+WEIGHT_DECAY = 1e-5
+
+# TRAINING
+EPOCHS = 80
+
+# TUBELET EMBEDDING
+PATCH_SIZE = (8, 8, 8)
+NUM_PATCHES = (INPUT_SHAPE[0] // PATCH_SIZE[0]) ** 2
+
+# ViViT ARCHITECTURE
+LAYER_NORM_EPS = 1e-6
+PROJECTION_DIM = 128
+NUM_HEADS = 8
+NUM_LAYERS = 8
+
+labels = ['liver', 'kidney-right', 'kidney-left', 'femur-right', 'femur-left', 'bladder', 'heart', 'lung-right', 'lung-left', 'spleen', 'pancreas']
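As a sanity check on the derived constant (not part of the commit): with 28-pixel spatial dimensions and 8-pixel tubelets, integer division gives 3 tubelets per axis, so the formula evaluates to 9. Note the exponent of 2 rather than 3; this matches the formula in the upstream Keras ViViT example this Space is based on.

    # Not in the commit: evaluating NUM_PATCHES by hand.
    INPUT_SHAPE = (28, 28, 28, 1)
    PATCH_SIZE = (8, 8, 8)
    assert (INPUT_SHAPE[0] // PATCH_SIZE[0]) ** 2 == 9  # 28 // 8 == 3, squared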
utils/predict.py
ADDED
@@ -0,0 +1,84 @@
+import cv2
+import numpy as np
+import tensorflow as tf
+from huggingface_hub import from_pretrained_keras
+from tensorflow.keras.optimizers import Adam
+
+from .constants import LEARNING_RATE
+
+def get_model():
+    """
+    Download the model from the Hugging Face Hub and compile it.
+    """
+    model = from_pretrained_keras("pablorodriper/video-vision-transformer")
+
+    model.compile(
+        optimizer=Adam(learning_rate=LEARNING_RATE),
+        loss="sparse_categorical_crossentropy",
+        # metrics=[
+        #     keras.metrics.SparseCategoricalAccuracy(name="accuracy"),
+        #     keras.metrics.SparseTopKCategoricalAccuracy(5, name="top-5-accuracy"),
+        # ],
+    )
+
+    return model
+
+
+model = get_model()
+labels = ['liver', 'kidney-right', 'kidney-left', 'femur-right', 'femur-left', 'bladder', 'heart', 'lung-right', 'lung-left', 'spleen', 'pancreas']
+
+
+def predict_label(path):
+    frames = load_video(path)
+    dataloader = prepare_dataloader(frames)
+    prediction = model.predict(dataloader)[0]
+    label = np.argmax(prediction, axis=0)
+    label = labels[label]
+
+    return label
+
+
+def load_video(path):
+    """
+    Load video from path and return a list of frames.
+    The video is converted to grayscale because it is the format expected by the model.
+    """
+    cap = cv2.VideoCapture(path)
+    frames = []
+    try:
+        while True:
+            ret, frame = cap.read()
+            if not ret:
+                break
+            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
+            frames.append(frame)
+    finally:
+        cap.release()
+    return np.array(frames)
+
+
+def prepare_dataloader(video):
+    video = tf.expand_dims(video, axis=0)
+    dataset = tf.data.Dataset.from_tensor_slices((video, np.array([0])))
+
+    dataloader = (
+        dataset.map(preprocess, num_parallel_calls=tf.data.AUTOTUNE)
+        .batch(1)
+        .prefetch(tf.data.AUTOTUNE)
+    )
+    return dataloader
+
+
+@tf.function
+def preprocess(frames: tf.Tensor, label: tf.Tensor):
+    """Preprocess the frames tensors and parse the labels."""
+    # Preprocess images
+    frames = tf.image.convert_image_dtype(
+        frames[
+            ..., tf.newaxis
+        ],  # The new axis is to help for further processing with Conv3D layers
+        tf.float32,
+    )
+    # Parse label
+    label = tf.cast(label, tf.float32)
+    return frames, label
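To make the data flow concrete (not part of the commit): load_video returns a (num_frames, 28, 28) grayscale array, prepare_dataloader wraps it in a single-element dataset with a dummy label, preprocess appends a channel axis, and batch(1) restores the leading batch dimension, so the model receives a (1, num_frames, 28, 28, 1) tensor. A minimal shape check, assuming this repo layout (note that importing utils.predict downloads and compiles the model at import time):

    # Not in the commit: tracing shapes through prepare_dataloader with a
    # synthetic stand-in for a decoded example video.
    import numpy as np
    from utils.predict import prepare_dataloader

    fake_video = np.zeros((28, 28, 28), dtype=np.uint8)  # (num_frames, H, W) grayscale
    frames, label = next(iter(prepare_dataloader(fake_video)))
    print(frames.shape)  # (1, 28, 28, 28, 1): batch, frames, height, width, channel
    print(label.shape)   # (1,): the dummy label added by prepare_dataloader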