Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
|
@@ -5,11 +5,10 @@ import json
|
|
| 5 |
import time
|
| 6 |
import asyncio
|
| 7 |
import tempfile
|
|
|
|
| 8 |
import base64
|
| 9 |
import shutil
|
| 10 |
import re
|
| 11 |
-
import gc
|
| 12 |
-
from threading import Thread
|
| 13 |
|
| 14 |
import gradio as gr
|
| 15 |
import spaces
|
|
@@ -34,10 +33,7 @@ from transformers.image_utils import load_image
|
|
| 34 |
|
| 35 |
from diffusers import StableDiffusionXLPipeline, EulerAncestralDiscreteScheduler
|
| 36 |
from diffusers import ShapEImg2ImgPipeline, ShapEPipeline
|
| 37 |
-
from diffusers.utils import export_to_ply
|
| 38 |
-
|
| 39 |
-
# NEW IMPORTS FOR TEXT-TO-VIDEO
|
| 40 |
-
from diffusers import LTXPipeline, LTXImageToVideoPipeline
|
| 41 |
|
| 42 |
# Global constants and helper functions
|
| 43 |
|
|
@@ -92,7 +88,7 @@ class Model:
|
|
| 92 |
return mesh_path.name
|
| 93 |
|
| 94 |
def run_text(self, prompt: str, seed: int = 0, guidance_scale: float = 15.0, num_steps: int = 64) -> str:
|
| 95 |
-
generator = torch.Generator(device=self.
|
| 96 |
images = self.pipe(
|
| 97 |
prompt,
|
| 98 |
generator=generator,
|
|
@@ -105,7 +101,7 @@ class Model:
|
|
| 105 |
return self.to_glb(ply_path.name)
|
| 106 |
|
| 107 |
def run_image(self, image: Image.Image, seed: int = 0, guidance_scale: float = 3.0, num_steps: int = 64) -> str:
|
| 108 |
-
generator = torch.Generator(device=self.
|
| 109 |
images = self.pipe_img(
|
| 110 |
image,
|
| 111 |
generator=generator,
|
|
@@ -239,9 +235,7 @@ def ragent_reasoning(prompt: str, history: list[dict], max_tokens: int = 2048, t
|
|
| 239 |
# Gradio UI configuration
|
| 240 |
|
| 241 |
DESCRIPTION = """
|
| 242 |
-
# Agent Dino 🌠
|
| 243 |
-
Your multimodal chatbot supporting text, image, 3D, web search, object detection, reasoning, and now text-to-video generation.
|
| 244 |
-
"""
|
| 245 |
|
| 246 |
css = '''
|
| 247 |
h1 {
|
|
@@ -410,64 +404,6 @@ def generate_3d_fn(
|
|
| 410 |
glb_path = model3d.run_text(prompt, seed=seed, guidance_scale=guidance_scale, num_steps=num_steps)
|
| 411 |
return glb_path, seed
|
| 412 |
|
| 413 |
-
# ---------------------------
|
| 414 |
-
# NEW: Text-to-Video Generation
|
| 415 |
-
# ---------------------------
|
| 416 |
-
|
| 417 |
-
# Initialize text-to-video pipeline
|
| 418 |
-
t2v_pipe = LTXPipeline.from_pretrained("Skywork/SkyReels-V1-Hunyuan-T2V", torch_dtype=torch.bfloat16)
|
| 419 |
-
t2v_pipe.to(device)
|
| 420 |
-
|
| 421 |
-
def get_time_cost(run_task_time, time_cost_str):
|
| 422 |
-
now_time = int(time.time() * 1000)
|
| 423 |
-
if run_task_time == 0:
|
| 424 |
-
time_cost_str = 'start'
|
| 425 |
-
else:
|
| 426 |
-
if time_cost_str != '':
|
| 427 |
-
time_cost_str += f'-->'
|
| 428 |
-
time_cost_str += f'{now_time - run_task_time}'
|
| 429 |
-
run_task_time = now_time
|
| 430 |
-
return run_task_time, time_cost_str
|
| 431 |
-
|
| 432 |
-
@spaces.GPU(duration=60)
|
| 433 |
-
def text_to_video(
|
| 434 |
-
prompt: str,
|
| 435 |
-
negative_prompt: str,
|
| 436 |
-
width: int = 768,
|
| 437 |
-
height: int = 512,
|
| 438 |
-
num_frames: int = 121,
|
| 439 |
-
frame_rate: int = 25,
|
| 440 |
-
num_inference_steps: int = 30,
|
| 441 |
-
seed: int = 8,
|
| 442 |
-
progress: gr.Progress = gr.Progress(),
|
| 443 |
-
):
|
| 444 |
-
generator = torch.Generator(device=device).manual_seed(seed)
|
| 445 |
-
run_task_time = 0
|
| 446 |
-
time_cost_str = ''
|
| 447 |
-
run_task_time, time_cost_str = get_time_cost(run_task_time, time_cost_str)
|
| 448 |
-
try:
|
| 449 |
-
with torch.no_grad():
|
| 450 |
-
video = t2v_pipe(
|
| 451 |
-
prompt=prompt,
|
| 452 |
-
negative_prompt=negative_prompt,
|
| 453 |
-
generator=generator,
|
| 454 |
-
width=width,
|
| 455 |
-
height=height,
|
| 456 |
-
num_frames=num_frames,
|
| 457 |
-
num_inference_steps=num_inference_steps,
|
| 458 |
-
).frames[0]
|
| 459 |
-
finally:
|
| 460 |
-
torch.cuda.empty_cache()
|
| 461 |
-
gc.collect()
|
| 462 |
-
run_task_time, time_cost_str = get_time_cost(run_task_time, time_cost_str)
|
| 463 |
-
|
| 464 |
-
output_path = tempfile.mktemp(suffix=".mp4")
|
| 465 |
-
export_to_video(video, output_path, fps=frame_rate)
|
| 466 |
-
|
| 467 |
-
del video
|
| 468 |
-
torch.cuda.empty_cache()
|
| 469 |
-
return output_path, time_cost_str
|
| 470 |
-
|
| 471 |
# YOLO Object Detection Setup
|
| 472 |
YOLO_MODEL_REPO = "strangerzonehf/Flux-Ultimate-LoRA-Collection"
|
| 473 |
YOLO_CHECKPOINT_NAME = "images/demo.pt"
|
|
@@ -488,7 +424,7 @@ def detect_objects(image: np.ndarray):
|
|
| 488 |
|
| 489 |
return Image.fromarray(annotated_image)
|
| 490 |
|
| 491 |
-
# Chat Generation Function with support for @tts, @image, @3d, @web, @rAgent, @yolo, and @text2video commands
|
| 492 |
|
| 493 |
@spaces.GPU
|
| 494 |
def generate(
|
|
@@ -508,7 +444,6 @@ def generate(
|
|
| 508 |
- "@web": triggers a web search or webpage visit.
|
| 509 |
- "@rAgent": initiates a reasoning chain using Llama mode OpenAI.
|
| 510 |
- "@yolo": triggers object detection using YOLO.
|
| 511 |
-
- "@text2video": triggers text-to-video generation.
|
| 512 |
"""
|
| 513 |
text = input_dict["text"]
|
| 514 |
files = input_dict.get("files", [])
|
|
@@ -604,23 +539,6 @@ def generate(
|
|
| 604 |
yield gr.Image(result_img)
|
| 605 |
return
|
| 606 |
|
| 607 |
-
# --- Text-to-Video Generation branch ---
|
| 608 |
-
if text.strip().lower().startswith("@text2video"):
|
| 609 |
-
# Expect the command to be: "@text2video <prompt> [|| <negative prompt>]"
|
| 610 |
-
command_body = text[len("@text2video"):].strip()
|
| 611 |
-
if "||" in command_body:
|
| 612 |
-
prompt_text, negative_prompt = command_body.split("||", 1)
|
| 613 |
-
prompt_text = prompt_text.strip()
|
| 614 |
-
negative_prompt = negative_prompt.strip()
|
| 615 |
-
else:
|
| 616 |
-
prompt_text = command_body
|
| 617 |
-
negative_prompt = "low quality, worst quality, deformed, distorted, disfigured, motion smear, motion artifacts, fused fingers, bad anatomy, weird hand, ugly"
|
| 618 |
-
yield "🎞️ Generating video..."
|
| 619 |
-
video_path, time_cost_str = text_to_video(prompt_text, negative_prompt)
|
| 620 |
-
yield gr.Video(video_path)
|
| 621 |
-
yield f"Time cost by step (ms): {time_cost_str}"
|
| 622 |
-
return
|
| 623 |
-
|
| 624 |
# --- Text and TTS branch ---
|
| 625 |
tts_prefix = "@tts"
|
| 626 |
is_tts = any(text.strip().lower().startswith(f"{tts_prefix}{i}") for i in range(1, 3))
|
|
@@ -717,14 +635,13 @@ demo = gr.ChatInterface(
|
|
| 717 |
["@rAgent Explain how a binary search algorithm works."],
|
| 718 |
["@web Is Grok-3 Beats DeepSeek-R1 at Reasoning ?"],
|
| 719 |
["@tts1 Explain Tower of Hanoi"],
|
| 720 |
-
["@text2video A futuristic cityscape at dusk"],
|
| 721 |
],
|
| 722 |
cache_examples=False,
|
| 723 |
type="messages",
|
| 724 |
description=DESCRIPTION,
|
| 725 |
css=css,
|
| 726 |
fill_height=True,
|
| 727 |
-
textbox=gr.MultimodalTextbox(label="Query Input", file_types=["image"], file_count="multiple", placeholder="@tts1-♀, @tts2-♂, @image-image gen, @3d-3d mesh gen, @rAgent-coding, @web-websearch, @yolo-object detection,
|
| 728 |
stop_btn="Stop Generation",
|
| 729 |
multimodal=True,
|
| 730 |
)
|
|
|
|
| 5 |
import time
|
| 6 |
import asyncio
|
| 7 |
import tempfile
|
| 8 |
+
from threading import Thread
|
| 9 |
import base64
|
| 10 |
import shutil
|
| 11 |
import re
|
|
|
|
|
|
|
| 12 |
|
| 13 |
import gradio as gr
|
| 14 |
import spaces
|
|
|
|
| 33 |
|
| 34 |
from diffusers import StableDiffusionXLPipeline, EulerAncestralDiscreteScheduler
|
| 35 |
from diffusers import ShapEImg2ImgPipeline, ShapEPipeline
|
| 36 |
+
from diffusers.utils import export_to_ply
|
|
|
|
|
|
|
|
|
|
| 37 |
|
| 38 |
# Global constants and helper functions
|
| 39 |
|
|
|
|
| 88 |
return mesh_path.name
|
| 89 |
|
| 90 |
def run_text(self, prompt: str, seed: int = 0, guidance_scale: float = 15.0, num_steps: int = 64) -> str:
|
| 91 |
+
generator = torch.Generator(device=self.device).manual_seed(seed)
|
| 92 |
images = self.pipe(
|
| 93 |
prompt,
|
| 94 |
generator=generator,
|
|
|
|
| 101 |
return self.to_glb(ply_path.name)
|
| 102 |
|
| 103 |
def run_image(self, image: Image.Image, seed: int = 0, guidance_scale: float = 3.0, num_steps: int = 64) -> str:
|
| 104 |
+
generator = torch.Generator(device=self.device).manual_seed(seed)
|
| 105 |
images = self.pipe_img(
|
| 106 |
image,
|
| 107 |
generator=generator,
|
|
|
|
| 235 |
# Gradio UI configuration
|
| 236 |
|
| 237 |
DESCRIPTION = """
|
| 238 |
+
# Agent Dino 🌠 """
|
|
|
|
|
|
|
| 239 |
|
| 240 |
css = '''
|
| 241 |
h1 {
|
|
|
|
| 404 |
glb_path = model3d.run_text(prompt, seed=seed, guidance_scale=guidance_scale, num_steps=num_steps)
|
| 405 |
return glb_path, seed
|
| 406 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 407 |
# YOLO Object Detection Setup
|
| 408 |
YOLO_MODEL_REPO = "strangerzonehf/Flux-Ultimate-LoRA-Collection"
|
| 409 |
YOLO_CHECKPOINT_NAME = "images/demo.pt"
|
|
|
|
| 424 |
|
| 425 |
return Image.fromarray(annotated_image)
|
| 426 |
|
| 427 |
+
# Chat Generation Function with support for @tts, @image, @3d, @web, @rAgent, and @yolo commands
|
| 428 |
|
| 429 |
@spaces.GPU
|
| 430 |
def generate(
|
|
|
|
| 444 |
- "@web": triggers a web search or webpage visit.
|
| 445 |
- "@rAgent": initiates a reasoning chain using Llama mode OpenAI.
|
| 446 |
- "@yolo": triggers object detection using YOLO.
|
|
|
|
| 447 |
"""
|
| 448 |
text = input_dict["text"]
|
| 449 |
files = input_dict.get("files", [])
|
|
|
|
| 539 |
yield gr.Image(result_img)
|
| 540 |
return
|
| 541 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 542 |
# --- Text and TTS branch ---
|
| 543 |
tts_prefix = "@tts"
|
| 544 |
is_tts = any(text.strip().lower().startswith(f"{tts_prefix}{i}") for i in range(1, 3))
|
|
|
|
| 635 |
["@rAgent Explain how a binary search algorithm works."],
|
| 636 |
["@web Is Grok-3 Beats DeepSeek-R1 at Reasoning ?"],
|
| 637 |
["@tts1 Explain Tower of Hanoi"],
|
|
|
|
| 638 |
],
|
| 639 |
cache_examples=False,
|
| 640 |
type="messages",
|
| 641 |
description=DESCRIPTION,
|
| 642 |
css=css,
|
| 643 |
fill_height=True,
|
| 644 |
+
textbox=gr.MultimodalTextbox(label="Query Input", file_types=["image"], file_count="multiple", placeholder="@tts1-♀, @tts2-♂, @image-image gen, @3d-3d mesh gen, @rAgent-coding, @web-websearch, @yolo-object detection, default-{text gen}{image-text-text}"),
|
| 645 |
stop_btn="Stop Generation",
|
| 646 |
multimodal=True,
|
| 647 |
)
|