akhaliq HF Staff commited on
Commit
224eae3
·
verified ·
1 Parent(s): f93f80b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +23 -22
app.py CHANGED
@@ -7,20 +7,28 @@ import numpy as np
7
  from typing import Optional
8
  import tempfile
9
  import os
 
10
 
11
  MID = "apple/FastVLM-7B"
12
  IMAGE_TOKEN_INDEX = -200
13
 
14
- # Load model and tokenizer
15
- print("Loading FastVLM model...")
16
- tok = AutoTokenizer.from_pretrained(MID, trust_remote_code=True)
17
- model = AutoModelForCausalLM.from_pretrained(
18
- MID,
19
- torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
20
- device_map="auto",
21
- trust_remote_code=True,
22
- )
23
- print("Model loaded successfully!")
 
 
 
 
 
 
 
24
 
25
  def extract_frames(video_path: str, num_frames: int = 8, sampling_method: str = "uniform"):
26
  """Extract frames from video"""
@@ -59,8 +67,11 @@ def extract_frames(video_path: str, num_frames: int = 8, sampling_method: str =
59
  cap.release()
60
  return frames
61
 
 
62
  def caption_frame(image: Image.Image, prompt: str) -> str:
63
  """Generate caption for a single frame"""
 
 
64
  # Build chat with custom prompt
65
  messages = [
66
  {"role": "user", "content": f"<image>\n{prompt}"}
@@ -155,15 +166,7 @@ def process_video(
155
  return video_summary, frame_previews, video_path
156
 
157
  # Create the Gradio interface
158
- with gr.Blocks(css="""
159
- .video-container {
160
- height: calc(100vh - 100px) !important;
161
- }
162
- .sidebar {
163
- height: calc(100vh - 100px) !important;
164
- overflow-y: auto;
165
- }
166
- """) as demo:
167
  gr.Markdown("# 🎬 FastVLM Video Captioning")
168
 
169
  with gr.Row():
@@ -171,14 +174,12 @@ with gr.Blocks(css="""
171
  with gr.Column(scale=7):
172
  video_display = gr.Video(
173
  label="Video Input",
174
- height=600,
175
- elem_classes=["video-container"],
176
  autoplay=True,
177
  loop=True
178
  )
179
 
180
  # Sidebar with controls
181
- with gr.Sidebar(width=400, elem_classes=["sidebar"]):
182
  gr.Markdown("## ⚙️ Settings")
183
 
184
  with gr.Group():
 
7
  from typing import Optional
8
  import tempfile
9
  import os
10
+ import spaces
11
 
12
  MID = "apple/FastVLM-7B"
13
  IMAGE_TOKEN_INDEX = -200
14
 
15
# Lazily created tokenizer/model pair, cached at module scope so repeated
# GPU-decorated calls do not reload the 7B checkpoint.
tok = None
model = None

def load_model():
    """Return the cached (tokenizer, model) pair, loading them on first use.

    The pair is stored in the module-level globals ``tok`` and ``model``;
    any call after the first simply returns the cached objects.
    """
    global tok, model
    # Fast path: both pieces already loaded.
    if tok is not None and model is not None:
        return tok, model
    print("Loading FastVLM model...")
    tok = AutoTokenizer.from_pretrained(MID, trust_remote_code=True)
    # fp16 weights placed directly on the GPU (ZeroGPU provides CUDA
    # inside @spaces.GPU-decorated calls).
    model = AutoModelForCausalLM.from_pretrained(
        MID,
        torch_dtype=torch.float16,
        device_map="cuda",
        trust_remote_code=True,
    )
    print("Model loaded successfully!")
    return tok, model
32
 
33
  def extract_frames(video_path: str, num_frames: int = 8, sampling_method: str = "uniform"):
34
  """Extract frames from video"""
 
67
  cap.release()
68
  return frames
69
 
70
+ @spaces.GPU(duration=60)
71
  def caption_frame(image: Image.Image, prompt: str) -> str:
72
  """Generate caption for a single frame"""
73
+ # Load model on GPU
74
+ tok, model = load_model()
75
  # Build chat with custom prompt
76
  messages = [
77
  {"role": "user", "content": f"<image>\n{prompt}"}
 
166
  return video_summary, frame_previews, video_path
167
 
168
  # Create the Gradio interface
169
+ with gr.Blocks() as demo:
 
 
 
 
 
 
 
 
170
  gr.Markdown("# 🎬 FastVLM Video Captioning")
171
 
172
  with gr.Row():
 
174
  with gr.Column(scale=7):
175
  video_display = gr.Video(
176
  label="Video Input",
 
 
177
  autoplay=True,
178
  loop=True
179
  )
180
 
181
  # Sidebar with controls
182
+ with gr.Sidebar(width=400):
183
  gr.Markdown("## ⚙️ Settings")
184
 
185
  with gr.Group():