Update app.py

app.py CHANGED
@@ -31,9 +31,7 @@ from diffusers import StableDiffusionXLPipeline, EulerAncestralDiscreteScheduler
 from diffusers import ShapEImg2ImgPipeline, ShapEPipeline
 from diffusers.utils import export_to_ply
 
-# -----------------------------------------------------------------------------
 # Global constants and helper functions
-# -----------------------------------------------------------------------------
 
 MAX_SEED = np.iinfo(np.int32).max
 
@@ -52,9 +50,7 @@ def glb_to_data_url(glb_path: str) -> str:
     b64_data = base64.b64encode(data).decode("utf-8")
     return f"data:model/gltf-binary;base64,{b64_data}"
 
-# -----------------------------------------------------------------------------
 # Model class for Text-to-3D Generation (ShapE)
-# -----------------------------------------------------------------------------
 
 class Model:
     def __init__(self):
@@ -113,9 +109,7 @@ class Model:
         export_to_ply(images[0], ply_path.name)
         return self.to_glb(ply_path.name)
 
-# -----------------------------------------------------------------------------
 # New Tools for Web Functionality using DuckDuckGo and smolagents
-# -----------------------------------------------------------------------------
 
 from typing import Any, Optional
 from smolagents.tools import Tool
@@ -186,14 +180,68 @@ class VisitWebpageTool(Tool):
             return f"Error fetching the webpage: {str(e)}"
         except Exception as e:
             return f"An unexpected error occurred: {str(e)}"
+
+# New Feature: rAgent Reasoning using Llama mode OpenAI
+
+from openai import OpenAI
+
+ACCESS_TOKEN = os.getenv("HF_TOKEN")
+ragent_client = OpenAI(
+    base_url="https://api-inference.huggingface.co/v1/",
+    api_key=ACCESS_TOKEN,
+)
+
+SYSTEM_PROMPT = """You are an expert assistant who can solve any task using code blobs. You will be given a task to solve as best you can.
+
+To do so, you must follow a structured reasoning process in a cycle of:
+
+1. **Thought:**
+   - Analyze the problem and explain your reasoning.
+   - Identify any necessary tools or techniques.
+
+2. **Code:**
+   - Implement the solution using Python.
+   - Enclose the code block with `<end_code>`.
+
+3. **Observation:**
+   - Explain the output and verify correctness.
+
+4. **Final Answer:**
+   - Summarize the solution clearly.
+
+Always adhere to the **Thought → Code → Observation → Final Answer** structure.
+"""
+
+def ragent_reasoning(prompt: str, history: list[dict], max_tokens: int = 512, temperature: float = 0.7, top_p: float = 0.95):
+    """
+    Uses the Llama mode OpenAI model to perform a structured reasoning chain.
+    """
+    messages = [{"role": "system", "content": SYSTEM_PROMPT}]
+    # Incorporate conversation history (if any)
+    for msg in history:
+        if msg.get("role") == "user":
+            messages.append({"role": "user", "content": msg["content"]})
+        elif msg.get("role") == "assistant":
+            messages.append({"role": "assistant", "content": msg["content"]})
+    messages.append({"role": "user", "content": prompt})
+    response = ""
+    stream = ragent_client.chat.completions.create(
+        model="meta-llama/Meta-Llama-3.1-8B-Instruct",
+        max_tokens=max_tokens,
+        stream=True,
+        temperature=temperature,
+        top_p=top_p,
+        messages=messages,
+    )
+    for message in stream:
+        token = message.choices[0].delta.content
+        response += token
+        yield response
 
-# -----------------------------------------------------------------------------
 # Gradio UI configuration
-# -----------------------------------------------------------------------------
 
 DESCRIPTION = """
-# Agent Dino 🌠
-"""
+# Agent Dino 🌠 """
 
 css = '''
 h1 {
@@ -215,11 +263,9 @@ MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
 
 device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
 
-# -----------------------------------------------------------------------------
 # Load Models and Pipelines for Chat, Image, and Multimodal Processing
-# -----------------------------------------------------------------------------
-
 # Load the text-only model and tokenizer (for pure text chat)
+
 model_id = "prithivMLmods/FastThink-0.5B-Tiny"
 tokenizer = AutoTokenizer.from_pretrained(model_id)
 model = AutoModelForCausalLM.from_pretrained(
@@ -244,9 +290,7 @@ model_m = Qwen2VLForConditionalGeneration.from_pretrained(
     torch_dtype=torch.float16
 ).to("cuda").eval()
 
-# -----------------------------------------------------------------------------
 # Asynchronous text-to-speech
-# -----------------------------------------------------------------------------
 
 async def text_to_speech(text: str, voice: str, output_file="output.mp3"):
     """Convert text to speech using Edge TTS and save as MP3"""
@@ -254,9 +298,7 @@ async def text_to_speech(text: str, voice: str, output_file="output.mp3"):
     await communicate.save(output_file)
     return output_file
 
-# -----------------------------------------------------------------------------
 # Utility function to clean conversation history
-# -----------------------------------------------------------------------------
 
 def clean_chat_history(chat_history):
     """
@@ -269,9 +311,7 @@ def clean_chat_history(chat_history):
             cleaned.append(msg)
     return cleaned
 
-# -----------------------------------------------------------------------------
 # Stable Diffusion XL Pipeline for Image Generation
-# -----------------------------------------------------------------------------
 
 MODEL_ID_SD = os.getenv("MODEL_VAL_PATH") # SDXL Model repository path via env variable
 MAX_IMAGE_SIZE = int(os.getenv("MAX_IMAGE_SIZE", "4096"))
@@ -350,9 +390,7 @@ def generate_image_fn(
     image_paths = [save_image(img) for img in images]
     return image_paths, seed
 
-# -----------------------------------------------------------------------------
 # Text-to-3D Generation using the ShapE Pipeline
-# -----------------------------------------------------------------------------
 
 @spaces.GPU(duration=120, enable_queue=True)
 def generate_3d_fn(
@@ -371,9 +409,7 @@ def generate_3d_fn(
     glb_path = model3d.run_text(prompt, seed=seed, guidance_scale=guidance_scale, num_steps=num_steps)
     return glb_path, seed
 
-# -----------------------------------------------------------------------------
-# Chat Generation Function with support for @tts, @image, @3d, and now @web commands
-# -----------------------------------------------------------------------------
+# Chat Generation Function with support for @tts, @image, @3d, @web, and @rAgent commands
 
 @spaces.GPU
 def generate(
@@ -386,14 +422,12 @@ def generate(
     repetition_penalty: float = 1.2,
 ):
     """
-    Generates chatbot responses with support for multimodal input
-    3D model generation, and web search/visit.
-
-    Special commands:
+    Generates chatbot responses with support for multimodal input and special commands:
     - "@tts1" or "@tts2": triggers text-to-speech.
    - "@image": triggers image generation using the SDXL pipeline.
     - "@3d": triggers 3D model generation using the ShapE pipeline.
-    - "@web": triggers a web search or webpage visit.
+    - "@web": triggers a web search or webpage visit.
+    - "@rAgent": initiates a reasoning chain using Llama mode OpenAI.
     """
     text = input_dict["text"]
     files = input_dict.get("files", [])
@@ -401,7 +435,7 @@ def generate(
     # --- 3D Generation branch ---
     if text.strip().lower().startswith("@3d"):
         prompt = text[len("@3d"):].strip()
-        yield "Hold tight, generating a 3D mesh GLB file....."
+        yield "🌀 Hold tight, generating a 3D mesh GLB file....."
         glb_path, used_seed = generate_3d_fn(
             prompt=prompt,
             seed=1,
@@ -423,7 +457,7 @@
     # --- Image Generation branch ---
     if text.strip().lower().startswith("@image"):
         prompt = text[len("@image"):].strip()
-        yield "Generating image..."
+        yield "🪧 Generating image..."
        image_paths, used_seed = generate_image_fn(
            prompt=prompt,
            negative_prompt="",
@@ -446,19 +480,28 @@
         # If the command starts with "visit", then treat the rest as a URL
         if web_command.lower().startswith("visit"):
             url = web_command[len("visit"):].strip()
-            yield "Visiting webpage..."
+            yield "🌍 Visiting webpage..."
             visitor = VisitWebpageTool()
             content = visitor.forward(url)
             yield content
         else:
             # Otherwise, treat the rest as a search query.
             query = web_command
-            yield "Performing a web search ..."
+            yield "🧤 Performing a web search ..."
             searcher = DuckDuckGoSearchTool()
             results = searcher.forward(query)
             yield results
         return
 
+    # --- rAgent Reasoning branch ---
+    if text.strip().lower().startswith("@ragent"):
+        prompt = text[len("@ragent"):].strip()
+        yield "Initiating reasoning chain using Llama mode..."
+        # Pass the current chat history (cleaned) to help inform the chain.
+        for partial in ragent_reasoning(prompt, clean_chat_history(chat_history)):
+            yield partial
+        return
+
     # --- Text and TTS branch ---
     tts_prefix = "@tts"
     is_tts = any(text.strip().lower().startswith(f"{tts_prefix}{i}") for i in range(1, 3))
@@ -496,7 +539,7 @@
     thread.start()
 
     buffer = ""
-    yield "Thinking..."
+    yield "🤔 Thinking..."
     for new_text in streamer:
         buffer += new_text
         buffer = buffer.replace("<|im_end|>", "")
@@ -535,9 +578,7 @@
         output_file = asyncio.run(text_to_speech(final_response, voice))
         yield gr.Audio(output_file, autoplay=True)
 
-# -----------------------------------------------------------------------------
 # Gradio Chat Interface Setup and Launch
-# -----------------------------------------------------------------------------
 
 demo = gr.ChatInterface(
     fn=generate,
@@ -553,8 +594,9 @@ demo = gr.ChatInterface(
         ["@3d A birthday cupcake with cherry"],
         [{"text": "summarize the letter", "files": ["examples/1.png"]}],
         ["@image Chocolate dripping from a donut against a yellow background, in the style of brocore, hyper-realistic"],
-        ["
+        ["@rAgent Explain how a binary search algorithm works."],
         ["@web latest breakthroughs in renewable energy"],
+
     ],
     cache_examples=False,
     type="messages",
@@ -570,10 +612,8 @@
 if not os.path.exists("static"):
     os.makedirs("static")
 
-# Mount the static folder onto the FastAPI app so that GLB files are served.
 from fastapi.staticfiles import StaticFiles
 demo.app.mount("/static", StaticFiles(directory="static"), name="static")
 
 if __name__ == "__main__":
-    # Launch without the unsupported static_dirs parameter.
     demo.queue(max_size=20).launch(share=True)