Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
|
@@ -9,6 +9,7 @@ from threading import Thread
|
|
| 9 |
import base64
|
| 10 |
import shutil
|
| 11 |
import re
|
|
|
|
| 12 |
|
| 13 |
import gradio as gr
|
| 14 |
import spaces
|
|
@@ -38,8 +39,8 @@ from diffusers.utils import export_to_ply
|
|
| 38 |
# Additional import for Phi-4 multimodality (audio support)
|
| 39 |
import soundfile as sf
|
| 40 |
|
| 41 |
-
|
| 42 |
os.system('pip install backoff')
|
|
|
|
| 43 |
# Global constants and helper functions
|
| 44 |
|
| 45 |
MAX_SEED = np.iinfo(np.int32).max
|
|
@@ -59,6 +60,17 @@ def glb_to_data_url(glb_path: str) -> str:
|
|
| 59 |
b64_data = base64.b64encode(data).decode("utf-8")
|
| 60 |
return f"data:model/gltf-binary;base64,{b64_data}"
|
| 61 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 62 |
# Model class for Text-to-3D Generation (ShapE)
|
| 63 |
|
| 64 |
class Model:
|
|
@@ -458,11 +470,13 @@ def process_phi4(input_type: str, file, question: str, max_new_tokens: int = 200
|
|
| 458 |
|
| 459 |
if input_type.lower() == "image":
|
| 460 |
prompt = f'{user_prompt}<|image_1|>{question}{prompt_suffix}{assistant_prompt}'
|
| 461 |
-
|
|
|
|
| 462 |
inputs = phi4_processor(text=prompt, images=image, return_tensors='pt').to(phi4_model.device)
|
| 463 |
elif input_type.lower() == "audio":
|
| 464 |
prompt = f'{user_prompt}<|audio_1|>{question}{prompt_suffix}{assistant_prompt}'
|
| 465 |
-
|
|
|
|
| 466 |
inputs = phi4_processor(text=prompt, audios=[(audio, samplerate)], return_tensors='pt').to(phi4_model.device)
|
| 467 |
else:
|
| 468 |
yield "Invalid input type selected."
|
|
@@ -719,7 +733,7 @@ demo = gr.ChatInterface(
|
|
| 719 |
description=DESCRIPTION,
|
| 720 |
css=css,
|
| 721 |
fill_height=True,
|
| 722 |
-
textbox=gr.MultimodalTextbox(label="Query Input", file_types=["image", "audio"], file_count="multiple", placeholder="@tts1, @tts2, @image, @3d, @ragent, @web, @yolo, @phi4, or plain text"),
|
| 723 |
stop_btn="Stop Generation",
|
| 724 |
multimodal=True,
|
| 725 |
)
|
|
|
|
| 9 |
import base64
|
| 10 |
import shutil
|
| 11 |
import re
|
| 12 |
+
from io import BytesIO
|
| 13 |
|
| 14 |
import gradio as gr
|
| 15 |
import spaces
|
|
|
|
| 39 |
# Additional import for Phi-4 multimodality (audio support)
|
| 40 |
import soundfile as sf
|
| 41 |
|
|
|
|
| 42 |
os.system('pip install backoff')
|
| 43 |
+
|
| 44 |
# Global constants and helper functions
|
| 45 |
|
| 46 |
MAX_SEED = np.iinfo(np.int32).max
|
|
|
|
| 60 |
b64_data = base64.b64encode(data).decode("utf-8")
|
| 61 |
return f"data:model/gltf-binary;base64,{b64_data}"
|
| 62 |
|
| 63 |
+
def load_audio_file(file):
    """
    Load audio data and its sample rate from a path or a file-like object.

    Args:
        file: Either a filesystem path (``str`` or any ``os.PathLike`` such as
            ``pathlib.Path``) or a binary file-like object exposing ``read()``.

    Returns:
        The ``(audio, samplerate)`` pair produced by ``sf.read``.

    Raises:
        TypeError: If ``file`` is neither a path nor an object with ``read()``.
    """
    if isinstance(file, (str, os.PathLike)):
        # os.fspath turns pathlib.Path-style objects into a plain path, so
        # they take the same route as string paths instead of hitting .read().
        return sf.read(os.fspath(file))

    if hasattr(file, "read"):
        # Read the whole stream into an in-memory buffer before decoding,
        # exactly as the previous implementation did for file-like inputs.
        return sf.read(BytesIO(file.read()))

    raise TypeError(
        "load_audio_file expected a path or a file-like object with read(), "
        f"got {type(file).__name__}"
    )
|
| 73 |
+
|
| 74 |
# Model class for Text-to-3D Generation (ShapE)
|
| 75 |
|
| 76 |
class Model:
|
|
|
|
| 470 |
|
| 471 |
if input_type.lower() == "image":
|
| 472 |
prompt = f'{user_prompt}<|image_1|>{question}{prompt_suffix}{assistant_prompt}'
|
| 473 |
+
# Use load_image (as in Qwen2-VL-OCR-2B-Instruct) to handle image file input
|
| 474 |
+
image = load_image(file)
|
| 475 |
inputs = phi4_processor(text=prompt, images=image, return_tensors='pt').to(phi4_model.device)
|
| 476 |
elif input_type.lower() == "audio":
|
| 477 |
prompt = f'{user_prompt}<|audio_1|>{question}{prompt_suffix}{assistant_prompt}'
|
| 478 |
+
# Use load_audio_file to handle audio file input
|
| 479 |
+
audio, samplerate = load_audio_file(file)
|
| 480 |
inputs = phi4_processor(text=prompt, audios=[(audio, samplerate)], return_tensors='pt').to(phi4_model.device)
|
| 481 |
else:
|
| 482 |
yield "Invalid input type selected."
|
|
|
|
| 733 |
description=DESCRIPTION,
|
| 734 |
css=css,
|
| 735 |
fill_height=True,
|
| 736 |
+
textbox=gr.MultimodalTextbox(label="Query Input", file_types=["image", "audio"], file_count="multiple", placeholder="@tts1, @tts2, @image, @3d, @ragent, @web, @yolo, @phi4 - audio, image, or plain text"),
|
| 737 |
stop_btn="Stop Generation",
|
| 738 |
multimodal=True,
|
| 739 |
)
|