vikhyatk committed (verified)
Commit df5f888 · 1 Parent(s): dcacf66

Update app.py

Files changed (1):
  1. app.py +124 -6
app.py CHANGED
@@ -26,9 +26,11 @@ except ImportError:
     IN_SPACES = False
 
 import torch
-from queue import Queue
 import os
 import gradio as gr
+import json
+
+from queue import Queue
 from threading import Thread
 from transformers import (
     TextIteratorStreamer,
@@ -48,7 +50,7 @@ if IN_SPACES:
 )
 
 auth_token = os.environ.get("TOKEN_FROM_SECRET") or True
-tokenizer = AutoTokenizer.from_pretrained("vikhyatk/moondream2")
+tokenizer = AutoTokenizer.from_pretrained("vikhyatk/moondream-next")
 moondream = AutoModelForCausalLM.from_pretrained(
     "vikhyatk/moondream-next",
     trust_remote_code=True,
@@ -57,9 +59,79 @@ moondream = AutoModelForCausalLM.from_pretrained(
     attn_implementation="flash_attention_2",
     token=auth_token if IN_SPACES else None,
 )
+
+# CKPT_DIRS = ["/tmp/md-ckpt/ckpt/ft/song-moon-4c-s15/s72001/"]
+# def get_ckpt(filename):
+#     ckpts = [
+#         torch.load(os.path.join(dir, filename), map_location="cpu") for dir in CKPT_DIRS
+#     ]
+#     avg_ckpt = {
+#         key.replace("._orig_mod", ""): sum(ckpt[key] for ckpt in ckpts) / len(ckpts)
+#         for key in ckpts[0]
+#     }
+#     return avg_ckpt
+# moondream.load_state_dict(get_ckpt("model.pt"))
 moondream.eval()
 
 
+def convert_to_entities(text, coords):
+    """
+    Converts a string with special markers into an entity representation.
+    Markers:
+    - <|coord|> pairs indicate coordinate markers
+    - <|start_ground|> indicates the start of a ground term
+    - <|end_ground|> indicates the end of a ground term
+
+    Returns:
+    - Dictionary with cleaned text and entities with their character positions
+    """
+    # Initialize variables
+    cleaned_text = ""
+    entities = []
+    entity = []
+
+    # Track current position in cleaned text
+    current_pos = 0
+    # Track if we're currently processing an entity
+    in_entity = False
+    entity_start = 0
+
+    i = 0
+    while i < len(text):
+        # Check for markers
+        if text[i : i + 9] == "<|coord|>":
+            i += 9
+            entity.append(coords.pop(0))
+            continue
+
+        elif text[i : i + 16] == "<|start_ground|>":
+            in_entity = True
+            entity_start = current_pos
+            i += 16
+            continue
+
+        elif text[i : i + 14] == "<|end_ground|>":
+            # Store entity position
+            entities.append(
+                {
+                    "entity": json.dumps(entity),
+                    "start": entity_start,
+                    "end": current_pos,
+                }
+            )
+            entity = []
+            in_entity = False
+            i += 14
+            continue
+
+        # Add character to cleaned text
+        cleaned_text += text[i]
+        current_pos += 1
+        i += 1
+
+    return {"text": cleaned_text, "entities": entities}
+
+
 @spaces.GPU(duration=10)
 def answer_question(img, prompt):
     if img is None:
@@ -85,10 +157,12 @@ def answer_question(img, prompt):
     buffer = ""
     for new_text in streamer:
         buffer += new_text
-        yield buffer.strip(), "Thinking..."
+        yield buffer.strip(), {"text": "Thinking...", "entities": []}
 
     answer = queue.get()
-    yield answer["answer"], answer["thought"]
+    thought = convert_to_entities(answer["thought"], answer["coords"])
+
+    yield answer["answer"], thought
 
 
 @spaces.GPU(duration=10)
@@ -135,7 +209,9 @@ def detect(img, object):
         width=3,
     )
 
-    yield f"{len(objs)} detected", gr.update(visible=True, value=img)
+    yield {"text": f"{len(objs)} detected", "entities": []}, gr.update(
+        visible=True, value=img
+    )
 
 
 js = """
@@ -266,6 +342,12 @@ css = """
 .chain-of-thought {
     opacity: 0.7 !important;
 }
+.chain-of-thought span.label {
+    display: none;
+}
+.chain-of-thought span.textspan {
+    padding-right: 0;
+}
 
 #life-canvas {
     position: fixed;
@@ -294,6 +376,9 @@ with gr.Blocks(title="moondream vl (new)", css=css, js=js) as demo:
         show_label=False,
         value=lambda: "Caption",
     )
+
+    input_image = gr.State(None)
+
     with gr.Row():
         with gr.Column():
 
@@ -312,6 +397,7 @@ with gr.Blocks(title="moondream vl (new)", css=css, js=js) as demo:
             submit.click(answer_question, [img, prompt], [output, thought])
             prompt.submit(answer_question, [img, prompt], [output, thought])
             img.change(answer_question, [img, prompt], [output, thought])
+            img.change(lambda img: img, [img], [input_image])
         elif mode == "Caption":
             with gr.Group():
                 with gr.Row():
@@ -342,10 +428,42 @@ with gr.Blocks(title="moondream vl (new)", css=css, js=js) as demo:
             gr.Markdown("Coming soon!")
 
         with gr.Column():
-            thought = gr.Markdown(elem_classes=["chain-of-thought"], line_breaks=True)
+            thought = gr.HighlightedText(
+                elem_classes=["chain-of-thought"],
+                label="Thinking tokens",
+                interactive=False,
+            )
             output = gr.Markdown(label="Response", elem_classes=["output-text"])
             ann = gr.Image(visible=False)
 
+    def on_select(img, evt: gr.SelectData):
+        if img is None or evt.value[1] is None:
+            return gr.update(visible=False, value=None)
+
+        w, h = img.size
+        if w > 768 or h > 768:
+            img = Resize(768)(img)
+            w, h = img.size
+
+        coords = json.loads(evt.value[1])
+        if len(coords) != 2:
+            raise ValueError("Only points supported right now.")
+        coords[0] = int(coords[0] * w)
+        coords[1] = int(coords[1] * h)
+
+        img_clone = img.copy()
+        draw = ImageDraw.Draw(img_clone)
+        draw.ellipse(
+            (coords[0] - 3, coords[1] - 3, coords[0] + 3, coords[1] + 3),
+            fill="red",
+            outline="red",
+        )
+
+        return gr.update(visible=True, value=img_clone)
+
+    thought.select(on_select, [input_image], [ann])
+    input_image.change(lambda: gr.update(visible=False), [], [ann])
+
     mode_radio.change(
         lambda: ("", "", gr.update(visible=False, value=None)),
        [],
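For reference, a minimal sketch of what the new convert_to_entities helper produces. The marker layout in the sample string is an assumption about how the model interleaves <|coord|> tokens inside grounded spans; the parser itself is the one added in this commit:

sample = "The <|start_ground|>red ball<|coord|><|coord|><|end_ground|> is visible."
coords = [0.41, 0.73]  # normalized x, y consumed by the two <|coord|> markers

result = convert_to_entities(sample, coords)
# result["text"]     == "The red ball is visible."
# result["entities"] == [{"entity": "[0.41, 0.73]", "start": 4, "end": 12}]

This {"text": ..., "entities": [...]} dictionary is the value format gr.HighlightedText accepts, which is why answer_question and detect now yield dicts instead of plain strings for the thought output.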
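The on_select handler relies on the select event of gr.HighlightedText delivering the clicked span as a (text, label) pair in evt.value, with the label carrying the JSON-encoded normalized coordinates. A standalone sketch of that wiring (the demo names here are hypothetical, not from app.py):

import gradio as gr

with gr.Blocks() as demo:
    ht = gr.HighlightedText(
        value={
            "text": "The red ball is visible.",
            "entities": [{"entity": "[0.41, 0.73]", "start": 4, "end": 12}],
        },
        interactive=False,
    )
    out = gr.Textbox()

    def show(evt: gr.SelectData):
        # evt.value[1] is the entity label, i.e. the JSON-encoded
        # normalized coordinates, mirroring app.py's on_select
        return f"span {evt.value[0]!r} -> coords {evt.value[1]}"

    ht.select(show, None, out)

demo.launch()

on_select then scales the normalized point to pixels on the (possibly resized) image: for [0.41, 0.73] on a 768x512 image, the dot is drawn at (int(0.41 * 768), int(0.73 * 512)) = (314, 373).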