grounded-vqa

Runtime error

App Files Community

vikhyatk committed on Oct 28, 2024

Commit

abc934b

verified ·

1 Parent(s): c57ffa7

Update app.py

Browse files

Files changed (1) hide show

app.py +118 -15

app.py CHANGED Viewed

@@ -4,25 +4,41 @@ import re
 import os
 import gradio as gr
 from threading import Thread
-from transformers import TextIteratorStreamer, AutoTokenizer, AutoModelForCausalLM
 from PIL import ImageDraw
 from torchvision.transforms.v2 import Resize
 import subprocess
-subprocess.run('pip install flash-attn --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)
 auth_token = os.environ.get("TOKEN_FROM_SECRET") or True
 tokenizer = AutoTokenizer.from_pretrained("vikhyatk/moondream2")
 moondream = AutoModelForCausalLM.from_pretrained(
-    "vikhyatk/moondream-next", trust_remote_code=True,
-    torch_dtype=torch.bfloat16, device_map={"": "cuda"},
-    attn_implementation="flash_attention_2", use_auth_token=auth_token
 )
 moondream.eval()
 @spaces.GPU(duration=10)
 def answer_question(img, prompt):
     image_embeds = moondream.encode_image(img)
     streamer = TextIteratorStreamer(tokenizer, skip_special_tokens=True)
     thread = Thread(
@@ -41,6 +57,30 @@ def answer_question(img, prompt):
         buffer += new_text
         yield buffer.strip()
 def extract_floats(text):
     # Regular expression to match an array of four floating point numbers
     pattern = r"\[\s*(-?\d+\.\d+)\s*,\s*(-?\d+\.\d+)\s*,\s*(-?\d+\.\d+)\s*,\s*(-?\d+\.\d+)\s*\]"
@@ -58,6 +98,7 @@ def extract_bbox(text):
         bbox = (x1, y1, x2, y2)
     return bbox
 def process_answer(img, answer):
     if extract_bbox(answer) is not None:
         x1, y1, x2, y2 = extract_bbox(answer)
@@ -71,7 +112,41 @@ def process_answer(img, answer):
     return gr.update(visible=False, value=None)
-with gr.Blocks() as demo:
     gr.Markdown(
         """
         # 🌔 moondream vl (new)
@@ -79,16 +154,44 @@ with gr.Blocks() as demo:
         """
     )
     with gr.Row():
-        prompt = gr.Textbox(label="Input", value="Describe this image.", scale=4)
-        submit = gr.Button("Submit")
-    with gr.Row():
-        img = gr.Image(type="pil", label="Upload an Image")
         with gr.Column():
-            output = gr.Markdown(label="Response")
             ann = gr.Image(visible=False, label="Annotated Image")
-     submit.click(answer_question, [img, prompt], output)
-    prompt.submit(answer_question, [img, prompt], output)
-    output.change(process_answer, [img, output], ann, show_progress=False)
-demo.queue().launch()

 import os
 import gradio as gr
 from threading import Thread
+from transformers import (
+    TextIteratorStreamer,
+    AutoTokenizer,
+    AutoModelForCausalLM,
+    StaticCache,
+)
 from PIL import ImageDraw
 from torchvision.transforms.v2 import Resize
 import subprocess
+subprocess.run(
+    "pip install flash-attn --no-build-isolation",
+    env={"FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"},
+    shell=True,
+)
 auth_token = os.environ.get("TOKEN_FROM_SECRET") or True
 tokenizer = AutoTokenizer.from_pretrained("vikhyatk/moondream2")
 moondream = AutoModelForCausalLM.from_pretrained(
+    "vikhyatk/moondream-next",
+    trust_remote_code=True,
+    torch_dtype=torch.float16,
+    device_map={"": "cuda"},
+    attn_implementation="flash_attention_2",
+    token=auth_token,
 )
 moondream.eval()
 @spaces.GPU(duration=10)
 def answer_question(img, prompt):
+    if img is None:
+        return
     image_embeds = moondream.encode_image(img)
     streamer = TextIteratorStreamer(tokenizer, skip_special_tokens=True)
     thread = Thread(
         buffer += new_text
         yield buffer.strip()
+@spaces.GPU(duration=10)
+def caption(img, mode):
+    if img is None:
+        return
+    streamer = TextIteratorStreamer(tokenizer, skip_special_tokens=True)
+    thread = Thread(
+        target=moondream.caption,
+        kwargs={
+            "images": [img],
+            "length": "short" if mode == "Short" else None,
+            "tokenizer": tokenizer,
+            "streamer": streamer,
+        },
+    )
+    thread.start()
+    buffer = ""
+    for new_text in streamer:
+        buffer += new_text
+        yield buffer.strip()
 def extract_floats(text):
     # Regular expression to match an array of four floating point numbers
     pattern = r"\[\s*(-?\d+\.\d+)\s*,\s*(-?\d+\.\d+)\s*,\s*(-?\d+\.\d+)\s*,\s*(-?\d+\.\d+)\s*\]"
         bbox = (x1, y1, x2, y2)
     return bbox
 def process_answer(img, answer):
     if extract_bbox(answer) is not None:
         x1, y1, x2, y2 = extract_bbox(answer)
     return gr.update(visible=False, value=None)
+with gr.Blocks(title="moondream vl (new)") as demo:
+    gr.HTML(
+        """
+        <script>
+            window.addEventListener('load', function () {
+                gradioURL = window.location.href;
+                if (!gradioURL.endsWith('?__theme=dark')) {
+                    window.location.replace(gradioURL + '?__theme=dark');
+                }
+            });
+        </script>
+        <style type="text/css">
+            .output-text span p { font-size: 1.4rem !important; }
+            /* Add a beautiful dark background animation for space theme */
+            body gradio-app {
+                background: linear-gradient(to right, #0c0d21, #1f1e33) !important;
+                animation: gradientBG 15s ease infinite;
+                background-size: 400% 400%;
+            }
+            @keyframes gradientBG {
+                0% {
+                    background-position: 0% 50%;
+                }
+                50% {
+                    background-position: 100% 50%;
+                }
+                100% {
+                    background-position: 0% 50%;
+                }
+            }
+        </style>
+        """
+    )
     gr.Markdown(
         """
         # 🌔 moondream vl (new)
         """
     )
     with gr.Row():
         with gr.Column():
+            mode_radio = gr.Radio(
+                ["Caption", "Query", "Detect"],
+                show_label=False,
+                value=lambda: "Caption",
+            )
+            @gr.render(inputs=[mode_radio])
+            def show_inputs(mode):
+                if mode == "Query":
+                    with gr.Group():
+                        with gr.Row():
+                            prompt = gr.Textbox(
+                                label="Input",
+                                value="How many people are in this image?",
+                                scale=4,
+                            )
+                            submit = gr.Button("Submit")
+                        img = gr.Image(type="pil", label="Upload an Image")
+                    submit.click(answer_question, [img, prompt], output)
+                    prompt.submit(answer_question, [img, prompt], output)
+                    img.change(answer_question, [img, prompt], output)
+                elif mode == "Caption":
+                    with gr.Group():
+                        caption_mode = gr.Radio(
+                            ["Short", "Normal"],
+                            show_label=False,
+                            value=lambda: "Normal",
+                        )
+                        img = gr.Image(type="pil", label="Upload an Image")
+                    caption_mode.change(caption, [img, caption_mode], output)
+                    img.change(caption, [img, caption_mode], output)
+                else:
+                    gr.Markdown("Coming soon!")
+        with gr.Column():
+            output = gr.Markdown(label="Response", elem_classes=["output-text"])
             ann = gr.Image(visible=False, label="Annotated Image")
+demo.queue().launch()