Spaces:

akhaliq
/

FastVLM-7B

Runtime error

App Files Files Community

akhaliq HF Staff committed on Sep 2

Commit

224eae3

verified ·

1 Parent(s): f93f80b

Update app.py

Browse files

Files changed (1) hide show

app.py +23 -22

app.py CHANGED Viewed

@@ -7,20 +7,28 @@ import numpy as np
 from typing import Optional
 import tempfile
 import os
 MID = "apple/FastVLM-7B"
 IMAGE_TOKEN_INDEX = -200
-# Load model and tokenizer
-print("Loading FastVLM model...")
-tok = AutoTokenizer.from_pretrained(MID, trust_remote_code=True)
-model = AutoModelForCausalLM.from_pretrained(
-    MID,
-    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
-    device_map="auto",
-    trust_remote_code=True,
-)
-print("Model loaded successfully!")
 def extract_frames(video_path: str, num_frames: int = 8, sampling_method: str = "uniform"):
     """Extract frames from video"""
@@ -59,8 +67,11 @@ def extract_frames(video_path: str, num_frames: int = 8, sampling_method: str =
     cap.release()
     return frames
 def caption_frame(image: Image.Image, prompt: str) -> str:
     """Generate caption for a single frame"""
     # Build chat with custom prompt
     messages = [
         {"role": "user", "content": f"<image>\n{prompt}"}
@@ -155,15 +166,7 @@ def process_video(
     return video_summary, frame_previews, video_path
 # Create the Gradio interface
-with gr.Blocks(css="""
-    .video-container {
-        height: calc(100vh - 100px) !important;
-    }
-    .sidebar {
-        height: calc(100vh - 100px) !important;
-        overflow-y: auto;
-    }
-""") as demo:
     gr.Markdown("# 🎬 FastVLM Video Captioning")
     with gr.Row():
@@ -171,14 +174,12 @@ with gr.Blocks(css="""
         with gr.Column(scale=7):
             video_display = gr.Video(
                 label="Video Input",
-                height=600,
-                elem_classes=["video-container"],
                 autoplay=True,
                 loop=True
             )
         # Sidebar with controls
-        with gr.Sidebar(width=400, elem_classes=["sidebar"]):
             gr.Markdown("## ⚙️ Settings")
             with gr.Group():

 from typing import Optional
 import tempfile
 import os
+import spaces
 MID = "apple/FastVLM-7B"
 IMAGE_TOKEN_INDEX = -200
+# Initialize model variables
+tok = None
+model = None
+def load_model():
+    global tok, model
+    if tok is None or model is None:
+        print("Loading FastVLM model...")
+        tok = AutoTokenizer.from_pretrained(MID, trust_remote_code=True)
+        model = AutoModelForCausalLM.from_pretrained(
+            MID,
+            torch_dtype=torch.float16,
+            device_map="cuda",
+            trust_remote_code=True,
+        )
+        print("Model loaded successfully!")
+    return tok, model
 def extract_frames(video_path: str, num_frames: int = 8, sampling_method: str = "uniform"):
     """Extract frames from video"""
     cap.release()
     return frames
+@spaces.GPU(duration=60)
 def caption_frame(image: Image.Image, prompt: str) -> str:
     """Generate caption for a single frame"""
+    # Load model on GPU
+    tok, model = load_model()
     # Build chat with custom prompt
     messages = [
         {"role": "user", "content": f"<image>\n{prompt}"}
     return video_summary, frame_previews, video_path
 # Create the Gradio interface
+with gr.Blocks() as demo:
     gr.Markdown("# 🎬 FastVLM Video Captioning")
     with gr.Row():
         with gr.Column(scale=7):
             video_display = gr.Video(
                 label="Video Input",
                 autoplay=True,
                 loop=True
             )
         # Sidebar with controls
+        with gr.Sidebar(width=400):
             gr.Markdown("## ⚙️ Settings")
             with gr.Group():