Spaces:

owinymarvin
/

SW_AI_deployment

Sleeping

App Files Files Community

owinymarvin committed on May 24

Commit

aedc519

1 Parent(s): 11e2014

latest changes

Browse files

Files changed (1) hide show

app.py +80 -46

app.py CHANGED Viewed

@@ -1,7 +1,5 @@
 import torch
-# Choose the `slowfast_r50` model
-model = torch.hub.load('facebookresearch/pytorchvideo', 'slowfast_r50', pretrained=True)
-from typing import Dict
 import json
 import urllib
 from torchvision.transforms import Compose, Lambda
@@ -15,24 +13,33 @@ from pytorchvideo.transforms import (
     ShortSideScale,
     UniformTemporalSubsample,
     UniformCropVideo
-)
-import gradio as gr
-# Set to GPU or CPU
 device = "cpu"
 model = model.eval()
 model = model.to(device)
 json_url = "https://dl.fbaipublicfiles.com/pyslowfast/dataset/class_names/kinetics_classnames.json"
 json_filename = "kinetics_classnames.json"
-try: urllib.URLopener().retrieve(json_url, json_filename)
-except: urllib.request.urlretrieve(json_url, json_filename)
 with open(json_filename, "r") as f:
     kinetics_classnames = json.load(f)
-# Create an id to label name mapping
 kinetics_id_to_classname = {}
 for k, v in kinetics_classnames.items():
     kinetics_id_to_classname[v] = str(k).replace('"', "")
 side_size = 256
 mean = [0.45, 0.45, 0.45]
 std = [0.225, 0.225, 0.225]
@@ -41,19 +48,18 @@ num_frames = 32
 sampling_rate = 2
 frames_per_second = 30
 slowfast_alpha = 4
-num_clips = 10
-num_crops = 3
 class PackPathway(torch.nn.Module):
     """
-    Transform for converting video frames as a list of tensors.
     """
     def __init__(self):
         super().__init__()
     def forward(self, frames: torch.Tensor):
         fast_pathway = frames
-        # Perform temporal sampling from the fast pathway.
         slow_pathway = torch.index_select(
             frames,
             1,
@@ -64,7 +70,7 @@ class PackPathway(torch.nn.Module):
         frame_list = [slow_pathway, fast_pathway]
         return frame_list
-transform =  ApplyTransformToKey(
     key="video",
     transform=Compose(
         [
@@ -79,53 +85,81 @@ transform =  ApplyTransformToKey(
         ]
     ),
 )
-# The duration of the input clip is also specific to the model.
 clip_duration = (num_frames * sampling_rate)/frames_per_second
 url_link = "https://dl.fbaipublicfiles.com/pytorchvideo/projects/archery.mp4"
 video_path = 'archery.mp4'
 try: urllib.URLopener().retrieve(url_link, video_path)
 except: urllib.request.urlretrieve(url_link, video_path)
-# Select the duration of the clip to load by specifying the start and end duration
-# The start_sec should correspond to where the action occurs in the video
 def inference(in_vid):
-    start_sec = 0
-    end_sec = start_sec + clip_duration
-    # Initialize an EncodedVideo helper class and load the video
-    video = EncodedVideo.from_path(in_vid)
-    # Load the desired clip
-    video_data = video.get_clip(start_sec=start_sec, end_sec=end_sec)
-    # Apply a transform to normalize the video input
-    video_data = transform(video_data)
-    # Move the inputs to the desired device
-    inputs = video_data["video"]
-    inputs = [i.to(device)[None, ...] for i in inputs]
-    # Pass the input clip through the model
-    preds = model(inputs)
-    # Get the predicted classes
-    post_act = torch.nn.Softmax(dim=1)
-    preds = post_act(preds)
-    pred_classes = preds.topk(k=5).indices[0]
-    # Map the predicted classes to the label names
-    pred_class_names = [kinetics_id_to_classname[int(i)] for i in pred_classes]
-    return "%s" % ", ".join(pred_class_names)
-inputs = gr.inputs.Video(label="Input Video")
-outputs = gr.outputs.Textbox(label="Top 5 predicted labels")
-title = "SLOWFAST"
-description = "demo for SLOWFAST, SlowFast networks pretrained on the Kinetics 400 dataset. To use it, simply upload your video, or click one of the examples to load them. Read more at the links below."
-article = "<p style='text-align: center'><a href='https://arxiv.org/abs/1812.03982'>SlowFast Networks for Video Recognition</a> | <a href='https://github.com/facebookresearch/pytorchvideo'>Github Repo</a></p>"
 examples = [
-    ['archery.mp4']
 ]
-gr.Interface(inference, inputs, outputs, title=title, description=description, article=article, examples=examples, analytics_enabled=False).launch(debug=True)

 import torch
+import gradio as gr
 import json
 import urllib
 from torchvision.transforms import Compose, Lambda
     ShortSideScale,
     UniformTemporalSubsample,
     UniformCropVideo
+)
+import numpy as np # Explicitly add numpy import
+# Choose the `slowfast_r50` model
+model = torch.hub.load('facebookresearch/pytorchvideo', 'slowfast_r50', pretrained=True)
+# Run inference on the CPU (no GPU is assumed to be available)
 device = "cpu"
 model = model.eval()
 model = model.to(device)
+# --- Class Name Loading (from notebook) ---
 json_url = "https://dl.fbaipublicfiles.com/pyslowfast/dataset/class_names/kinetics_classnames.json"
 json_filename = "kinetics_classnames.json"
+try:
+    urllib.URLopener().retrieve(json_url, json_filename)
+except:
+    urllib.request.urlretrieve(json_url, json_filename)
 with open(json_filename, "r") as f:
     kinetics_classnames = json.load(f)
 kinetics_id_to_classname = {}
 for k, v in kinetics_classnames.items():
     kinetics_id_to_classname[v] = str(k).replace('"', "")
+# --- Define Input Transform (from notebook) ---
 side_size = 256
 mean = [0.45, 0.45, 0.45]
 std = [0.225, 0.225, 0.225]
 sampling_rate = 2
 frames_per_second = 30
 slowfast_alpha = 4
+# num_clips = 10 # Not used in inference function
+# num_crops = 3 # Not used in inference function
 class PackPathway(torch.nn.Module):
     """
+    Transform for converting video frames as a list of tensors.
     """
     def __init__(self):
         super().__init__()
     def forward(self, frames: torch.Tensor):
         fast_pathway = frames
         slow_pathway = torch.index_select(
             frames,
             1,
         frame_list = [slow_pathway, fast_pathway]
         return frame_list
+transform = ApplyTransformToKey(
     key="video",
     transform=Compose(
         [
         ]
     ),
 )
 clip_duration = (num_frames * sampling_rate)/frames_per_second
+# Download example video (for local testing and for Gradio examples)
 url_link = "https://dl.fbaipublicfiles.com/pytorchvideo/projects/archery.mp4"
 video_path = 'archery.mp4'
 try: urllib.URLopener().retrieve(url_link, video_path)
 except: urllib.request.urlretrieve(url_link, video_path)
 def inference(in_vid):
+    if in_vid is None:
+        return "Please upload a video or use the webcam."
+    try:
+        # Initialize an EncodedVideo helper class and load the video
+        video = EncodedVideo.from_path(in_vid)
+        # Ensure we have enough frames for the clip duration
+        if video.duration < clip_duration:
+            return f"Video is too short. Minimum duration is {clip_duration:.2f} seconds."
+        # Select the duration of the clip to load by specifying the start and end duration
+        start_sec = 0
+        end_sec = start_sec + clip_duration
+        # Load the desired clip
+        video_data = video.get_clip(start_sec=start_sec, end_sec=end_sec)
+        # Apply a transform to normalize the video input
+        video_data = transform(video_data)
+        # Move the inputs to the desired device
+        inputs = video_data["video"]
+        inputs = [i.to(device)[None, ...] for i in inputs]
+        # Pass the input clip through the model
+        with torch.no_grad(): # Ensure no gradient computation for inference
+            preds = model(inputs)
+        # Get the predicted classes
+        post_act = torch.nn.Softmax(dim=1)
+        preds = post_act(preds)
+        pred_classes = preds.topk(k=5).indices[0]
+        # Map the predicted classes to the label names
+        pred_class_names = [kinetics_id_to_classname[int(i)] for i in pred_classes]
+        return "Top 5 predicted labels: %s" % ", ".join(pred_class_names)
+    except Exception as e:
+        # Catch common errors like video decoding issues or insufficient frames
+        return f"An error occurred during inference: {e}"
+# --- Gradio interface ---
+# Components are created as gr.Video / gr.Textbox; the gr.inputs and gr.outputs namespaces are no longer used
+inputs_gradio = gr.Video(label="Upload Video or Use Webcam", sources=["upload", "webcam"], format="mp4")
+outputs_gradio = gr.Textbox(label="Top 5 Predicted Labels")
+title = "PyTorchVideo SlowFast Action Recognition"
+description = """
+Demo for PyTorchVideo's SlowFast model, pretrained on the Kinetics 400 dataset for action recognition.
+Upload your video or use your webcam to classify the action.
+"""
+article = "<p style='text-align: center'><a href='https://arxiv.org/abs/1812.03982' target='_blank'>SlowFast Networks for Video Recognition</a> | <a href='https://github.com/facebookresearch/pytorchvideo' target='_blank'>PyTorchVideo GitHub Repo</a></p>"
 examples = [
+    [video_path] # Use the downloaded archery.mp4 as an example
 ]
+gr.Interface(
+    fn=inference,
+    inputs=inputs_gradio,
+    outputs=outputs_gradio,
+    title=title,
+    description=description,
+    article=article,
+    examples=examples,
+    analytics_enabled=False
+).launch()