PHI4-Multimodal

Runtime error

App Files Community

prithivMLmods committed on Feb 28

Commit

ace15c9

verified ·

1 Parent(s): 3541fa7

Update app.py

Browse files

Files changed (1) hide show

app.py +19 -18

app.py CHANGED Viewed

@@ -60,16 +60,19 @@ def glb_to_data_url(glb_path: str) -> str:
     b64_data = base64.b64encode(data).decode("utf-8")
     return f"data:model/gltf-binary;base64,{b64_data}"
-def load_audio_file(file):
     """
-    Loads an audio file. If file is a string path, it reads directly.
-    Otherwise, it assumes file is a file-like object.
     """
     if isinstance(file, str):
-        audio, samplerate = sf.read(file)
     else:
-        audio, samplerate = sf.read(BytesIO(file.read()))
-    return audio, samplerate
 # Model class for Text-to-3D Generation (ShapE)
@@ -468,15 +471,18 @@ def process_phi4(input_type: str, file, question: str, max_new_tokens: int = 200
         yield "Please upload a file and provide a question."
         return
     if input_type.lower() == "image":
         prompt = f'{user_prompt}<|image_1|>{question}{prompt_suffix}{assistant_prompt}'
-        # Use load_image (as in Qwen2-VL-OCR-2B-Instruct) to handle image file input
-        image = load_image(file)
         inputs = phi4_processor(text=prompt, images=image, return_tensors='pt').to(phi4_model.device)
     elif input_type.lower() == "audio":
         prompt = f'{user_prompt}<|audio_1|>{question}{prompt_suffix}{assistant_prompt}'
-        # Use load_audio_file to handle audio file input
-        audio, samplerate = load_audio_file(file)
         inputs = phi4_processor(text=prompt, audios=[(audio, samplerate)], return_tensors='pt').to(phi4_model.device)
     else:
         yield "Invalid input type selected."
@@ -565,7 +571,6 @@ def generate(
     # --- Web Search/Visit branch ---
     if text.strip().lower().startswith("@web"):
         web_command = text[len("@web"):].strip()
-        # If the command starts with "visit", then treat the rest as a URL
         if web_command.lower().startswith("visit"):
             url = web_command[len("visit"):].strip()
             yield "🌍 Visiting webpage..."
@@ -573,7 +578,6 @@ def generate(
             content = visitor.forward(url)
             yield content
         else:
-            # Otherwise, treat the rest as a search query.
             query = web_command
             yield "🧤 Performing a web search ..."
             searcher = DuckDuckGoSearchTool()
@@ -585,7 +589,6 @@ def generate(
     if text.strip().lower().startswith("@ragent"):
         prompt = text[len("@ragent"):].strip()
         yield "📝 Initiating reasoning chain using Llama mode..."
-        # Pass the current chat history (cleaned) to help inform the chain.
         for partial in ragent_reasoning(prompt, clean_chat_history(chat_history)):
             yield partial
         return
@@ -596,13 +599,12 @@ def generate(
         if not files or len(files) == 0:
             yield "Error: Please attach an image for YOLO object detection."
             return
-        # Use the first attached image
         input_file = files[0]
         try:
             if isinstance(input_file, str):
                 pil_image = Image.open(input_file)
             else:
-                pil_image = input_file
         except Exception as e:
             yield f"Error loading image: {str(e)}"
             return
@@ -613,7 +615,6 @@ def generate(
     # --- Phi-4 Multimodal branch with text streaming ---
     if text.strip().lower().startswith("@phi4"):
-        # Expected format: "@phi4 [image|audio] <your question>"
         parts = text.strip().split(maxsplit=2)
         if len(parts) < 3:
             yield "Error: Please provide input type and a question. Format: '@phi4 [image|audio] <your question>'"
@@ -646,9 +647,9 @@ def generate(
     if files:
         if len(files) > 1:
-            images = [load_image(image) for image in files]
         elif len(files) == 1:
-            images = [load_image(files[0])]
         else:
             images = []
         messages = [{

     b64_data = base64.b64encode(data).decode("utf-8")
     return f"data:model/gltf-binary;base64,{b64_data}"
+def get_file_path(file):
     """
+    Normalize a file input. If the input is a string, assume it is a file path.
+    Otherwise, if the object has a 'name' attribute or key, return that.
     """
     if isinstance(file, str):
+        return file
+    elif hasattr(file, "name"):
+        return file.name
+    elif isinstance(file, dict) and "name" in file:
+        return file["name"]
     else:
+        return None
 # Model class for Text-to-3D Generation (ShapE)
         yield "Please upload a file and provide a question."
         return
+    file_path = get_file_path(file)
+    if file_path is None:
+        yield "Could not determine the file path."
+        return
     if input_type.lower() == "image":
         prompt = f'{user_prompt}<|image_1|>{question}{prompt_suffix}{assistant_prompt}'
+        image = Image.open(file_path)
         inputs = phi4_processor(text=prompt, images=image, return_tensors='pt').to(phi4_model.device)
     elif input_type.lower() == "audio":
         prompt = f'{user_prompt}<|audio_1|>{question}{prompt_suffix}{assistant_prompt}'
+        audio, samplerate = sf.read(file_path)
         inputs = phi4_processor(text=prompt, audios=[(audio, samplerate)], return_tensors='pt').to(phi4_model.device)
     else:
         yield "Invalid input type selected."
     # --- Web Search/Visit branch ---
     if text.strip().lower().startswith("@web"):
         web_command = text[len("@web"):].strip()
         if web_command.lower().startswith("visit"):
             url = web_command[len("visit"):].strip()
             yield "🌍 Visiting webpage..."
             content = visitor.forward(url)
             yield content
         else:
             query = web_command
             yield "🧤 Performing a web search ..."
             searcher = DuckDuckGoSearchTool()
     if text.strip().lower().startswith("@ragent"):
         prompt = text[len("@ragent"):].strip()
         yield "📝 Initiating reasoning chain using Llama mode..."
         for partial in ragent_reasoning(prompt, clean_chat_history(chat_history)):
             yield partial
         return
         if not files or len(files) == 0:
             yield "Error: Please attach an image for YOLO object detection."
             return
         input_file = files[0]
         try:
             if isinstance(input_file, str):
                 pil_image = Image.open(input_file)
             else:
+                pil_image = Image.open(get_file_path(input_file))
         except Exception as e:
             yield f"Error loading image: {str(e)}"
             return
     # --- Phi-4 Multimodal branch with text streaming ---
     if text.strip().lower().startswith("@phi4"):
         parts = text.strip().split(maxsplit=2)
         if len(parts) < 3:
             yield "Error: Please provide input type and a question. Format: '@phi4 [image|audio] <your question>'"
     if files:
         if len(files) > 1:
+            images = [load_image(get_file_path(image)) for image in files]
         elif len(files) == 1:
+            images = [load_image(get_file_path(files[0]))]
         else:
             images = []
         messages = [{