Spaces:

codelion
/

videoanalysis

Sleeping

App Files Files Community

codelion committed on Apr 2

Commit

7c2c622

verified ·

1 Parent(s): 0f96bc2

Update app.py

Browse files

Files changed (1) hide show

app.py +101 -41

app.py CHANGED Viewed

@@ -1,6 +1,8 @@
 import os
 import gradio as gr
-import matplotlib.pyplot as plt
 from collections import Counter
 from google import genai
 from google.genai import types
@@ -16,13 +18,13 @@ if not GOOGLE_API_KEY:
 client = genai.Client(api_key=GOOGLE_API_KEY)
 # Use the Gemini 2.0 Flash model.
-MODEL_NAME = "gemini-2.0-flash-001"
 @retry(wait=wait_random_exponential(multiplier=1, max=60), stop=stop_after_attempt(3))
 def call_gemini(video_url: str, prompt: str) -> str:
     """
     Call the Gemini model with the provided video URL and prompt.
-    The video URL is passed as a URI part with MIME type "video/webm".
     """
     response = client.models.generate_content(
         model=MODEL_NAME,
@@ -33,48 +35,100 @@ def call_gemini(video_url: str, prompt: str) -> str:
     )
     return response.text
-def generate_chart(analysis_text: str) -> plt.Figure:
     """
-    Create a simple bar chart based on the frequency of selected keywords in the analysis.
     """
-    # Define keywords of interest
-    keywords = ["suspicious", "anomaly", "incident", "alert", "object", "movement"]
-    # Lowercase the analysis text and split into words
-    words = analysis_text.lower().split()
-    # Count occurrences for each keyword
-    counter = Counter({kw: words.count(kw) for kw in keywords})
-    # Create a bar chart using matplotlib
-    fig, ax = plt.subplots(figsize=(6, 4))
-    ax.bar(counter.keys(), counter.values(), color="skyblue")
-    ax.set_title("Keyword Frequency in Analysis")
-    ax.set_ylabel("Count")
-    ax.set_xlabel("Keyword")
-    plt.tight_layout()
-    return fig
-def analyze_video(video_url: str, user_query: str) -> (str, plt.Figure):
     """
-    Perform iterative (agentic) video analysis.
-    The analysis is refined over several iterations, incorporating the user query if provided.
-    Returns a Markdown report and a matplotlib chart.
     """
     analysis = ""
     num_iterations = 3
     for i in range(num_iterations):
-        base_prompt = "You are a video analysis agent focusing on security and surveillance. Provide a detailed summary of the video, highlighting key events, suspicious activities, or anomalies."
         if user_query:
             base_prompt += f" Also, focus on the following query: {user_query}"
         if i == 0:
             prompt = base_prompt
         else:
-            prompt = (f"Based on the previous analysis: \"{analysis}\". "
-                      "Provide further elaboration and refined insights, focusing on potential security threats, anomalous events, "
-                      "and details that would help a security team understand the situation better. ")
             if user_query:
-                prompt += f"Remember to focus on: {user_query}"
         try:
             analysis = call_gemini(video_url, prompt)
@@ -82,39 +136,45 @@ def analyze_video(video_url: str, user_query: str) -> (str, plt.Figure):
             analysis += f"\n[Error during iteration {i+1}: {e}]"
             break
-    # Create a Markdown report (adding headings and bullet points if desired)
     markdown_report = f"## Video Analysis Report\n\n**Summary:**\n\n{analysis}\n"
-    # Generate a chart visualization based on the analysis text.
-    chart_fig = generate_chart(analysis)
-    return markdown_report, chart_fig
-def gradio_interface(video_url: str, user_query: str) -> (str, any):
     """
-    Gradio interface function that takes a video URL and an optional query,
-    then returns a Markdown report and a visualization chart.
     """
     if not video_url:
-        return "Please provide a valid video URL.", None
     return analyze_video(video_url, user_query)
 # Define the Gradio interface with two inputs and two outputs.
 iface = gr.Interface(
     fn=gradio_interface,
     inputs=[
-        gr.Textbox(label="Video URL (publicly accessible, e.g., YouTube link)"),
         gr.Textbox(label="Analysis Query (optional): guide the focus of the analysis", placeholder="e.g., focus on unusual movements near the entrance")
     ],
     outputs=[
         gr.Markdown(label="Security & Surveillance Analysis Report"),
-        gr.Plot(label="Visualization: Keyword Frequency")
     ],
     title="AI Video Analysis and Summariser Agent",
     description=(
         "This agentic video analysis tool uses Google's Gemini 2.0 Flash model via AI Studio "
         "to iteratively analyze a video for security and surveillance insights. Provide a video URL and, optionally, "
-        "a query to guide the analysis. The tool returns a detailed Markdown report along with a bar chart visualization "
-        "of keyword frequency."
     )
 )

 import os
+import json
 import gradio as gr
+import cv2
+import matplotlib.pyplot as plt  # imported for compatibility if needed later
 from collections import Counter
 from google import genai
 from google.genai import types
 client = genai.Client(api_key=GOOGLE_API_KEY)
 # Use the Gemini 2.0 Flash model.
+MODEL_NAME = "gemini-2.0-flash"
 @retry(wait=wait_random_exponential(multiplier=1, max=60), stop=stop_after_attempt(3))
 def call_gemini(video_url: str, prompt: str) -> str:
     """
     Call the Gemini model with the provided video URL and prompt.
+    The video is passed as a URI part with MIME type "video/webm".
+
+    The call is wrapped in ``@retry`` configured with a randomized
+    exponential wait (multiplier 1, capped at 60) and a stop condition of
+    3 attempts.
+
+    Args:
+        video_url: URL of the video; how it is attached to the request is
+            part of the arguments not visible in this view.
+        prompt: Prompt text for the model.
+
+    Returns:
+        The ``text`` attribute of the response returned by
+        ``client.models.generate_content``.
     """
     response = client.models.generate_content(
         model=MODEL_NAME,
+        # NOTE(review): the remaining call arguments fall between two diff
+        # hunks and are not shown here — confirm against the full app.py.
     )
     return response.text
+def hhmmss_to_seconds(time_str: str) -> float:
+    """Convert an ``HH:MM:SS``, ``MM:SS`` or ``SS`` timestamp into seconds.
+
+    Every field may be fractional (for example ``"00:01:02.5"``).
+
+    Args:
+        time_str: Colon-separated timestamp; surrounding whitespace is ignored.
+
+    Returns:
+        The offset in seconds as a float.
+
+    Raises:
+        ValueError: If a field is not a number, is negative or non-finite,
+            or if the timestamp has more than three fields.
+    """
+    fields = [float(part) for part in time_str.strip().split(":")]
+    if len(fields) > 3:
+        # Returning only the first field here (the old fallback) silently
+        # produced a wrong offset; reject the malformed timestamp instead.
+        raise ValueError(f"Invalid timestamp: {time_str!r}")
+    # The chained comparison is False for NaN, infinities and negatives alike.
+    if not all(0 <= value < float("inf") for value in fields):
+        raise ValueError(f"Invalid timestamp: {time_str!r}")
+    seconds = 0.0
+    for value in fields:
+        # Each further field is one unit smaller by a factor of 60.
+        seconds = seconds * 60 + value
+    return seconds
+def _parse_key_frames(raw_response: str) -> list:
+    """Turn the model's key-frame reply into a list of dicts (possibly empty)."""
+    text = (raw_response or "").strip()
+    # The reply may be wrapped in a Markdown code fence (```json ... ```);
+    # drop the fence lines so json.loads only sees the payload.
+    if text.startswith("```"):
+        lines = text.splitlines()[1:]
+        if lines and lines[-1].strip() == "```":
+            lines = lines[:-1]
+        text = "\n".join(lines)
+    try:
+        parsed = json.loads(text)
+    except ValueError as exc:  # json.JSONDecodeError is a ValueError
+        print(f"Error: Could not parse key frame JSON: {exc}")
+        return []
+    if not isinstance(parsed, list):
+        return []
+    # Keep only objects; anything else has no timestamp/description to read.
+    return [entry for entry in parsed if isinstance(entry, dict)]
+def get_key_frames(video_url: str, analysis: str, user_query: str) -> list:
+    """
+    Prompt Gemini to return key frame timestamps (in HH:MM:SS) with descriptions,
+    then extract those frames from the video using OpenCV.
+
+    This is best effort: a failed model call, an unparsable reply, a video
+    that cannot be opened, or an unusable timestamp reduces the result
+    instead of raising.
+
+    Args:
+        video_url: URL passed both to the model and to ``cv2.VideoCapture``.
+        analysis: Analysis text embedded in the prompt.
+        user_query: Optional extra focus appended to the prompt when non-empty.
+
+    Returns:
+        A list of tuples ``(frame converted to RGB, caption)``, where the
+        caption is ``"<timestamp>: <description>"``. Empty when nothing
+        could be extracted.
+    """
+    prompt = (
+        "Based on the following video analysis, identify key frames that best illustrate "
+        "the important events or anomalies. Return a JSON array where each element is an object "
+        "with two keys: 'timestamp' (in HH:MM:SS format) and 'description' (a brief explanation of why "
+        "this frame is important)."
+    )
+    prompt += f" Video Analysis: {analysis}"
+    if user_query:
+        prompt += f" Additional focus: {user_query}"
+    try:
+        key_frames = _parse_key_frames(call_gemini(video_url, prompt))
+    except Exception as e:
+        # Deliberately broad: key frames are optional, so a failed model call
+        # must not abort the report — but say why nothing was extracted.
+        print(f"Error: Key frame request failed: {e}")
+        key_frames = []
+    extracted_frames = []
+    if not key_frames:
+        # Nothing to seek to, so do not open the video at all.
+        return extracted_frames
+    cap = cv2.VideoCapture(video_url)
+    try:
+        if not cap.isOpened():
+            print("Error: Could not open video.")
+            return extracted_frames
+        for frame_obj in key_frames:
+            ts = frame_obj.get("timestamp")
+            description = frame_obj.get("description", "")
+            try:
+                seconds = hhmmss_to_seconds(ts)
+            except (AttributeError, TypeError, ValueError):
+                # Missing or malformed timestamp: skip just this frame.
+                continue
+            # Set video position (in milliseconds)
+            cap.set(cv2.CAP_PROP_POS_MSEC, seconds * 1000)
+            ret, frame = cap.read()
+            if ret:
+                # Convert BGR to RGB
+                frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
+                caption = f"{ts}: {description}"
+                extracted_frames.append((frame_rgb, caption))
+    finally:
+        # Release the capture even if seeking or decoding raises.
+        cap.release()
+    return extracted_frames
+def analyze_video(video_url: str, user_query: str) -> (str, list):
     """
+    Perform iterative, agentic video analysis.
+    First, refine the video analysis over several iterations.
+    Then, prompt the model to identify key frames.
+
+    Args:
+      - video_url: URL handed to call_gemini and get_key_frames.
+      - user_query: Optional focus text; appended to the prompts when non-empty.
+
+    Returns:
+      - A Markdown report as a string.
+      - A gallery list of key frames (each as a tuple of (image, caption)).
     """
     analysis = ""
     num_iterations = 3
     for i in range(num_iterations):
+        base_prompt = (
+            "You are a video analysis agent focusing on security and surveillance. "
+            "Provide a detailed summary of the video, highlighting key events, suspicious activities, or anomalies."
+        )
         if user_query:
             base_prompt += f" Also, focus on the following query: {user_query}"
+        # The first pass uses the base prompt; later passes feed the previous
+        # answer back to the model and ask for refinement.
         if i == 0:
             prompt = base_prompt
         else:
+            prompt = (
+                f"Based on the previous analysis: \"{analysis}\". "
+                "Provide further elaboration and refined insights, focusing on potential security threats, anomalous events, "
+                "and details that would help a security team understand the situation better."
+            )
             if user_query:
+                prompt += f" Remember to focus on: {user_query}"
         try:
+            # Each successful pass replaces the previous analysis text.
             analysis = call_gemini(video_url, prompt)
+            # NOTE(review): one line sits between two diff hunks here and is
+            # not shown — presumably the `except Exception as e:` header that
+            # binds `e` for the two statements below. Confirm against app.py.
             analysis += f"\n[Error during iteration {i+1}: {e}]"
             break
+    # Create a Markdown report
     markdown_report = f"## Video Analysis Report\n\n**Summary:**\n\n{analysis}\n"
+    # Get key frames based on the analysis and optional query.
+    key_frames_gallery = get_key_frames(video_url, analysis, user_query)
+    if not key_frames_gallery:
+        markdown_report += "\n*No key frames were extracted.*\n"
+    else:
+        markdown_report += "\n**Key Frames Extracted:**\n"
+        # List the captions in the report; the images themselves are only
+        # returned through the gallery list.
+        for idx, (img, caption) in enumerate(key_frames_gallery, start=1):
+            markdown_report += f"- **Frame {idx}:** {caption}\n"
+    return markdown_report, key_frames_gallery
+def gradio_interface(video_url: str, user_query: str) -> tuple[str, list]:
+    """
+    Gradio interface function that accepts a video URL and an optional query,
+    then returns a Markdown report and a gallery of key frame images with captions.
+
+    Args:
+        video_url: URL typed by the user; surrounding whitespace is ignored.
+        user_query: Optional focus text, forwarded unchanged.
+
+    Returns:
+        ``(markdown_report, gallery_items)``. When no URL was given, the
+        report is a short prompt asking for one and the gallery is empty.
+    """
+    # Textbox values can be None or whitespace-only; treat both as "no URL"
+    # rather than forwarding them to the model and to OpenCV.
+    video_url = (video_url or "").strip()
+    if not video_url:
+        return "Please provide a valid video URL.", []
+    return analyze_video(video_url, user_query)
 # Define the Gradio interface with two inputs and two outputs.
 iface = gr.Interface(
     fn=gradio_interface,
     inputs=[
+        gr.Textbox(label="Video URL (publicly accessible, e.g., YouTube direct link or video file URL)"),
         gr.Textbox(label="Analysis Query (optional): guide the focus of the analysis", placeholder="e.g., focus on unusual movements near the entrance")
     ],
     outputs=[
         gr.Markdown(label="Security & Surveillance Analysis Report"),
+        # Layout options go to the constructor: the chained `.style(...)`
+        # method (and its `grid` argument) was removed from Gradio components
+        # in 4.x, where calling it raises AttributeError at import time.
+        gr.Gallery(label="Extracted Key Frames", columns=2, height="auto")
     ],
     title="AI Video Analysis and Summariser Agent",
     description=(
         "This agentic video analysis tool uses Google's Gemini 2.0 Flash model via AI Studio "
         "to iteratively analyze a video for security and surveillance insights. Provide a video URL and, optionally, "
+        "a query to guide the analysis. The tool returns a detailed Markdown report along with a gallery of key frame images."
     )
 )