PHI4-Multimodal

Runtime error

App Files Files Community

prithivMLmods committed on Feb 26

Commit

a165b0a

verified ·

1 Parent(s): e685e73

Update app.py

Browse files

Files changed (1) hide show

app.py +7 -90

app.py CHANGED Viewed

@@ -5,11 +5,10 @@ import json
 import time
 import asyncio
 import tempfile
 import base64
 import shutil
 import re
-import gc
-from threading import Thread
 import gradio as gr
 import spaces
@@ -34,10 +33,7 @@ from transformers.image_utils import load_image
 from diffusers import StableDiffusionXLPipeline, EulerAncestralDiscreteScheduler
 from diffusers import ShapEImg2ImgPipeline, ShapEPipeline
-from diffusers.utils import export_to_ply, export_to_video
-# NEW IMPORTS FOR TEXT-TO-VIDEO
-from diffusers import LTXPipeline, LTXImageToVideoPipeline
 # Global constants and helper functions
@@ -92,7 +88,7 @@ class Model:
         return mesh_path.name
     def run_text(self, prompt: str, seed: int = 0, guidance_scale: float = 15.0, num_steps: int = 64) -> str:
-        generator = torch.Generator(device=self.pipe.device).manual_seed(seed)
         images = self.pipe(
             prompt,
             generator=generator,
@@ -105,7 +101,7 @@ class Model:
         return self.to_glb(ply_path.name)
     def run_image(self, image: Image.Image, seed: int = 0, guidance_scale: float = 3.0, num_steps: int = 64) -> str:
-        generator = torch.Generator(device=self.pipe.device).manual_seed(seed)
         images = self.pipe_img(
             image,
             generator=generator,
@@ -239,9 +235,7 @@ def ragent_reasoning(prompt: str, history: list[dict], max_tokens: int = 2048, t
 # Gradio UI configuration
 DESCRIPTION = """
-# Agent Dino 🌠
-Your multimodal chatbot supporting text, image, 3D, web search, object detection, reasoning, and now text-to-video generation.
-"""
 css = '''
 h1 {
@@ -410,64 +404,6 @@ def generate_3d_fn(
     glb_path = model3d.run_text(prompt, seed=seed, guidance_scale=guidance_scale, num_steps=num_steps)
     return glb_path, seed
-# ---------------------------
-# NEW: Text-to-Video Generation
-# ---------------------------
-# Initialize text-to-video pipeline
-t2v_pipe = LTXPipeline.from_pretrained("Skywork/SkyReels-V1-Hunyuan-T2V", torch_dtype=torch.bfloat16)
-t2v_pipe.to(device)
-def get_time_cost(run_task_time, time_cost_str):
-    now_time = int(time.time() * 1000)
-    if run_task_time == 0:
-        time_cost_str = 'start'
-    else:
-        if time_cost_str != '':
-            time_cost_str += f'-->'
-        time_cost_str += f'{now_time - run_task_time}'
-    run_task_time = now_time
-    return run_task_time, time_cost_str
-@spaces.GPU(duration=60)
-def text_to_video(
-    prompt: str,
-    negative_prompt: str,
-    width: int = 768,
-    height: int = 512,
-    num_frames: int = 121,
-    frame_rate: int = 25,
-    num_inference_steps: int = 30,
-    seed: int = 8,
-    progress: gr.Progress = gr.Progress(),
-):
-    generator = torch.Generator(device=device).manual_seed(seed)
-    run_task_time = 0
-    time_cost_str = ''
-    run_task_time, time_cost_str = get_time_cost(run_task_time, time_cost_str)
-    try:
-        with torch.no_grad():
-            video = t2v_pipe(
-                prompt=prompt,
-                negative_prompt=negative_prompt,
-                generator=generator,
-                width=width,
-                height=height,
-                num_frames=num_frames,
-                num_inference_steps=num_inference_steps,
-            ).frames[0]
-    finally:
-        torch.cuda.empty_cache()
-        gc.collect()
-    run_task_time, time_cost_str = get_time_cost(run_task_time, time_cost_str)
-    output_path = tempfile.mktemp(suffix=".mp4")
-    export_to_video(video, output_path, fps=frame_rate)
-    del video
-    torch.cuda.empty_cache()
-    return output_path, time_cost_str
 # YOLO Object Detection Setup
 YOLO_MODEL_REPO = "strangerzonehf/Flux-Ultimate-LoRA-Collection"
 YOLO_CHECKPOINT_NAME = "images/demo.pt"
@@ -488,7 +424,7 @@ def detect_objects(image: np.ndarray):
     return Image.fromarray(annotated_image)
-# Chat Generation Function with support for @tts, @image, @3d, @web, @rAgent, @yolo, and now @text2video commands
 @spaces.GPU
 def generate(
@@ -508,7 +444,6 @@ def generate(
       - "@web": triggers a web search or webpage visit.
       - "@rAgent": initiates a reasoning chain using Llama mode OpenAI.
       - "@yolo": triggers object detection using YOLO.
-      - "@text2video": triggers text-to-video generation.
     """
     text = input_dict["text"]
     files = input_dict.get("files", [])
@@ -604,23 +539,6 @@ def generate(
         yield gr.Image(result_img)
         return
-    # --- Text-to-Video Generation branch ---
-    if text.strip().lower().startswith("@text2video"):
-        # Expect the command to be: "@text2video <prompt> [|| <negative prompt>]"
-        command_body = text[len("@text2video"):].strip()
-        if "||" in command_body:
-            prompt_text, negative_prompt = command_body.split("||", 1)
-            prompt_text = prompt_text.strip()
-            negative_prompt = negative_prompt.strip()
-        else:
-            prompt_text = command_body
-            negative_prompt = "low quality, worst quality, deformed, distorted, disfigured, motion smear, motion artifacts, fused fingers, bad anatomy, weird hand, ugly"
-        yield "🎞️ Generating video..."
-        video_path, time_cost_str = text_to_video(prompt_text, negative_prompt)
-        yield gr.Video(video_path)
-        yield f"Time cost by step (ms): {time_cost_str}"
-        return
     # --- Text and TTS branch ---
     tts_prefix = "@tts"
     is_tts = any(text.strip().lower().startswith(f"{tts_prefix}{i}") for i in range(1, 3))
@@ -717,14 +635,13 @@ demo = gr.ChatInterface(
         ["@rAgent Explain how a binary search algorithm works."],
         ["@web Is Grok-3 Beats DeepSeek-R1 at Reasoning ?"],
         ["@tts1 Explain Tower of Hanoi"],
-        ["@text2video A futuristic cityscape at dusk"],
     ],
     cache_examples=False,
     type="messages",
     description=DESCRIPTION,
     css=css,
     fill_height=True,
-    textbox=gr.MultimodalTextbox(label="Query Input", file_types=["image"], file_count="multiple", placeholder="@tts1-♀, @tts2-♂, @image-image gen, @3d-3d mesh gen, @rAgent-coding, @web-websearch, @yolo-object detection, @text2video-video gen, default-{text gen}{image-text-text}"),
     stop_btn="Stop Generation",
     multimodal=True,
 )

 import time
 import asyncio
 import tempfile
+from threading import Thread
 import base64
 import shutil
 import re
 import gradio as gr
 import spaces
 from diffusers import StableDiffusionXLPipeline, EulerAncestralDiscreteScheduler
 from diffusers import ShapEImg2ImgPipeline, ShapEPipeline
+from diffusers.utils import export_to_ply
 # Global constants and helper functions
         return mesh_path.name
     def run_text(self, prompt: str, seed: int = 0, guidance_scale: float = 15.0, num_steps: int = 64) -> str:
+        generator = torch.Generator(device=self.device).manual_seed(seed)
         images = self.pipe(
             prompt,
             generator=generator,
         return self.to_glb(ply_path.name)
     def run_image(self, image: Image.Image, seed: int = 0, guidance_scale: float = 3.0, num_steps: int = 64) -> str:
+        generator = torch.Generator(device=self.device).manual_seed(seed)
         images = self.pipe_img(
             image,
             generator=generator,
 # Gradio UI configuration
 DESCRIPTION = """
+# Agent Dino 🌠 """
 css = '''
 h1 {
     glb_path = model3d.run_text(prompt, seed=seed, guidance_scale=guidance_scale, num_steps=num_steps)
     return glb_path, seed
 # YOLO Object Detection Setup
 YOLO_MODEL_REPO = "strangerzonehf/Flux-Ultimate-LoRA-Collection"
 YOLO_CHECKPOINT_NAME = "images/demo.pt"
     return Image.fromarray(annotated_image)
+# Chat Generation Function with support for @tts, @image, @3d, @web, @rAgent, and @yolo commands
 @spaces.GPU
 def generate(
       - "@web": triggers a web search or webpage visit.
       - "@rAgent": initiates a reasoning chain using Llama mode OpenAI.
       - "@yolo": triggers object detection using YOLO.
     """
     text = input_dict["text"]
     files = input_dict.get("files", [])
         yield gr.Image(result_img)
         return
     # --- Text and TTS branch ---
     tts_prefix = "@tts"
     is_tts = any(text.strip().lower().startswith(f"{tts_prefix}{i}") for i in range(1, 3))
         ["@rAgent Explain how a binary search algorithm works."],
         ["@web Is Grok-3 Beats DeepSeek-R1 at Reasoning ?"],
         ["@tts1 Explain Tower of Hanoi"],
     ],
     cache_examples=False,
     type="messages",
     description=DESCRIPTION,
     css=css,
     fill_height=True,
+    textbox=gr.MultimodalTextbox(label="Query Input", file_types=["image"], file_count="multiple", placeholder="@tts1-♀, @tts2-♂, @image-image gen, @3d-3d mesh gen, @rAgent-coding, @web-websearch, @yolo-object detection, default-{text gen}{image-text-text}"),
     stop_btn="Stop Generation",
     multimodal=True,
 )