grounded-vqa

Runtime error

App Files Files Community

vikhyatk committed on Oct 30, 2024

Commit

e05052e

verified ·

1 Parent(s): 42586be

Update app.py

Browse files

Files changed (1) hide show

app.py +81 -31

app.py CHANGED Viewed

@@ -1,5 +1,32 @@
-import spaces
 import torch
 import os
 import gradio as gr
 from threading import Thread
@@ -11,24 +38,24 @@ from transformers import (
 from PIL import ImageDraw
 from torchvision.transforms.v2 import Resize
-import subprocess
-subprocess.run(
-    "pip install flash-attn --no-build-isolation",
-    env={"FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"},
-    shell=True,
-)
 auth_token = os.environ.get("TOKEN_FROM_SECRET") or True
 tokenizer = AutoTokenizer.from_pretrained("vikhyatk/moondream2")
 moondream = AutoModelForCausalLM.from_pretrained(
     "vikhyatk/moondream-next",
-    revision="591ff5569240caf61126be6b080ff5c9370b87d4",
     trust_remote_code=True,
     torch_dtype=torch.float16,
     device_map={"": "cuda"},
     attn_implementation="flash_attention_2",
-    token=auth_token,
 )
 moondream.eval()
@@ -36,17 +63,20 @@ moondream.eval()
 @spaces.GPU(duration=10)
 def answer_question(img, prompt):
     if img is None:
-        yield ""
         return
     image_embeds = moondream.encode_image(img)
     streamer = TextIteratorStreamer(tokenizer, skip_special_tokens=True)
     thread = Thread(
         target=moondream.answer_question,
         kwargs={
             "image_embeds": image_embeds,
             "question": prompt,
             "tokenizer": tokenizer,
             "streamer": streamer,
         },
     )
@@ -55,7 +85,11 @@ def answer_question(img, prompt):
     buffer = ""
     for new_text in streamer:
         buffer += new_text
-        yield buffer.strip()
 @spaces.GPU(duration=10)
@@ -84,6 +118,10 @@ def caption(img, mode):
 @spaces.GPU(duration=10)
 def detect(img, object):
     w, h = img.size
     if w > 768 or h > 768:
         img = Resize(768)(img)
@@ -97,7 +135,7 @@ def detect(img, object):
             width=3,
         )
-    return gr.update(visible=True, value=img)
 js = """
@@ -173,22 +211,27 @@ js = """
             // Dark mode colors
             var darkColors = {
                 1: '#4a5788', // Deep blue-grey
                 2: '#4c5a8d',
                 3: '#4e5d92',
                 4: '#506097',
                 5: '#52639c'  // Brighter blue-grey
             };
             return isDarkMode ? darkColors[age] : lightColors[age];
         }
         function draw() {
-            // var isDarkMode = document.body.classList.contains('dark');
-            var isDarkMode = false;
-            ctx.fillStyle = isDarkMode ? '#333' : '#f0f0f0';
             ctx.fillRect(0, 0, canvas.width, canvas.height);
             for (var i = 0; i < cols; i++) {
                 for (var j = 0; j < rows; j++) {
                     if (grid[i][j]) {
@@ -220,6 +263,10 @@ css = """
         font-size: 1.4rem !important;
     }
     #life-canvas {
         position: fixed;
         top: 0;
@@ -262,9 +309,9 @@ with gr.Blocks(title="moondream vl (new)", css=css, js=js) as demo:
                             )
                             submit = gr.Button("Submit")
                         img = gr.Image(type="pil", label="Upload an Image")
-                    submit.click(answer_question, [img, prompt], output)
-                    prompt.submit(answer_question, [img, prompt], output)
-                    img.change(answer_question, [img, prompt], output)
                 elif mode == "Caption":
                     with gr.Group():
                         with gr.Row():
@@ -278,7 +325,7 @@ with gr.Blocks(title="moondream vl (new)", css=css, js=js) as demo:
                         img = gr.Image(type="pil", label="Upload an Image")
                     submit.click(caption, [img, caption_mode], output)
                     img.change(caption, [img, caption_mode], output)
-                else:
                     with gr.Group():
                         with gr.Row():
                             prompt = gr.Textbox(
@@ -288,18 +335,21 @@ with gr.Blocks(title="moondream vl (new)", css=css, js=js) as demo:
                             )
                             submit = gr.Button("Submit")
                         img = gr.Image(type="pil", label="Upload an Image")
-                    submit.click(detect, [img, prompt], ann)
-                    prompt.submit(detect, [img, prompt], ann)
-                    img.change(detect, [img, prompt], ann)
         with gr.Column():
-            output = gr.Markdown(
-                label="Response",
-                elem_classes=["output-text"],
-            )
-            ann = gr.Image(visible=False, show_label=False)
-    mode_radio.change(lambda: "", [], output)
-    mode_radio.change(lambda: gr.update(visible=False, value=None), [], ann)
 demo.queue().launch()

try:
    import spaces

    IN_SPACES = True
except ImportError:
    import inspect
    from functools import wraps

    class spaces:
        """Fallback used when the ``spaces`` package cannot be imported.

        Provides a ``GPU`` decorator factory with the same call shape as the
        one used in this file (``@spaces.GPU(duration=...)``) so the decorated
        functions can run unchanged.
        """

        @staticmethod
        def GPU(duration):
            """Return a pass-through decorator; ``duration`` is ignored here."""

            def decorator(func):
                # Decide the wrapper's shape once, at decoration time. A single
                # wrapper containing ``yield from`` is *always* a generator
                # function, so regular functions wrapped by it would return a
                # generator object instead of their result.
                if inspect.isgeneratorfunction(func):

                    @wraps(func)  # Preserves the original function's metadata
                    def wrapper(*args, **kwargs):
                        yield from func(*args, **kwargs)

                else:

                    @wraps(func)  # Preserves the original function's metadata
                    def wrapper(*args, **kwargs):
                        return func(*args, **kwargs)

                return wrapper

            return decorator

    IN_SPACES = False
 import torch
+from queue import Queue
 import os
 import gradio as gr
 from threading import Thread
 from PIL import ImageDraw
 from torchvision.transforms.v2 import Resize
+if IN_SPACES:
+    import subprocess
+    subprocess.run(
+        "pip install flash-attn --no-build-isolation",
+        env={"FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"},
+        shell=True,
+    )
 auth_token = os.environ.get("TOKEN_FROM_SECRET") or True
 tokenizer = AutoTokenizer.from_pretrained("vikhyatk/moondream2")
 moondream = AutoModelForCausalLM.from_pretrained(
     "vikhyatk/moondream-next",
     trust_remote_code=True,
     torch_dtype=torch.float16,
     device_map={"": "cuda"},
     attn_implementation="flash_attention_2",
+    token=auth_token if IN_SPACES else None,
 )
 moondream.eval()
 @spaces.GPU(duration=10)
 def answer_question(img, prompt):
     if img is None:
+        yield "", ""
         return
     image_embeds = moondream.encode_image(img)
     streamer = TextIteratorStreamer(tokenizer, skip_special_tokens=True)
+    queue = Queue()
     thread = Thread(
         target=moondream.answer_question,
         kwargs={
             "image_embeds": image_embeds,
             "question": prompt,
             "tokenizer": tokenizer,
+            "allow_cot": True,
+            "result_queue": queue,
             "streamer": streamer,
         },
     )
     buffer = ""
     for new_text in streamer:
         buffer += new_text
+        yield buffer.strip(), "Thinking..."
+    answer = queue.get()
+    # yield answer["answer"], answer["thought"]
+    yield answer["answer"], ""
 @spaces.GPU(duration=10)
 @spaces.GPU(duration=10)
 def detect(img, object):
+    if img is None:
+        yield "", gr.update(visible=False, value=None)
+        return
     w, h = img.size
     if w > 768 or h > 768:
         img = Resize(768)(img)
             width=3,
         )
+    yield f"{len(objs)} detected", gr.update(visible=True, value=img)
 js = """
             // Dark mode colors
             var darkColors = {
+                /*
                 1: '#4a5788', // Deep blue-grey
                 2: '#4c5a8d',
                 3: '#4e5d92',
                 4: '#506097',
                 5: '#52639c'  // Brighter blue-grey
+                */
+                1: 'rgb(16, 20, 32)',
+                2: 'rgb(21, 25, 39)',
+                3: 'rgb(26, 30, 46)',
+                4: 'rgb(31, 35, 53)',
+                5: 'rgb(36, 40, 60)'
             };
             return isDarkMode ? darkColors[age] : lightColors[age];
         }
         function draw() {
+            var isDarkMode = document.body.classList.contains('dark');
+            ctx.fillStyle = isDarkMode ? '#0b0f19' : '#f0f0f0';
             ctx.fillRect(0, 0, canvas.width, canvas.height);
             for (var i = 0; i < cols; i++) {
                 for (var j = 0; j < rows; j++) {
                     if (grid[i][j]) {
         font-size: 1.4rem !important;
     }
+    .chain-of-thought span p {
+        opacity: 0.7 !important;
+    }
     #life-canvas {
         position: fixed;
         top: 0;
                             )
                             submit = gr.Button("Submit")
                         img = gr.Image(type="pil", label="Upload an Image")
+                    submit.click(answer_question, [img, prompt], [output, thought])
+                    prompt.submit(answer_question, [img, prompt], [output, thought])
+                    img.change(answer_question, [img, prompt], [output, thought])
                 elif mode == "Caption":
                     with gr.Group():
                         with gr.Row():
                         img = gr.Image(type="pil", label="Upload an Image")
                     submit.click(caption, [img, caption_mode], output)
                     img.change(caption, [img, caption_mode], output)
+                elif mode == "Detect":
                     with gr.Group():
                         with gr.Row():
                             prompt = gr.Textbox(
                             )
                             submit = gr.Button("Submit")
                         img = gr.Image(type="pil", label="Upload an Image")
+                    submit.click(detect, [img, prompt], [thought, ann])
+                    prompt.submit(detect, [img, prompt], [thought, ann])
+                    img.change(detect, [img, prompt], [thought, ann])
+                else:
+                    gr.Markdown("Coming soon!")
         with gr.Column():
+            thought = gr.Markdown(elem_classes=["chain-of-thought"])
+            output = gr.Markdown(label="Response", elem_classes=["output-text"])
+            ann = gr.Image(visible=False)
+    mode_radio.change(
+        lambda: ("", "", gr.update(visible=False, value=None)),
+        [],
+        [output, thought, ann],
+    )
 demo.queue().launch()