vikhyatk committed on
Commit 9a9a80e · verified · 1 Parent(s): f67c206

Update app.py

Files changed (1): app.py (+173 -43)
app.py CHANGED
@@ -1,6 +1,5 @@
 import spaces
 import torch
-import re
 import os
 import gradio as gr
 from threading import Thread
@@ -8,7 +7,6 @@ from transformers import (
     TextIteratorStreamer,
     AutoTokenizer,
     AutoModelForCausalLM,
-    StaticCache,
 )
 from PIL import ImageDraw
 from torchvision.transforms.v2 import Resize
@@ -38,6 +36,7 @@ moondream.eval()
 def answer_question(img, prompt):
     if img is None:
         yield ""
+        return
 
     image_embeds = moondream.encode_image(img)
     streamer = TextIteratorStreamer(tokenizer, skip_special_tokens=True)
@@ -62,6 +61,7 @@ def answer_question(img, prompt):
 def caption(img, mode):
     if img is None:
         yield ""
+        return
 
     streamer = TextIteratorStreamer(tokenizer, skip_special_tokens=True)
     thread = Thread(
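The two added `return` statements fix a real bug: `answer_question` and `caption` are generator functions, and `yield ""` alone does not end them, so a `None` image previously fell through to `moondream.encode_image(img)`. A bare `return` inside a generator stops iteration after the empty yield. A minimal sketch of the pattern, with a hypothetical `encode` standing in for the model call:

```python
def encode(img):  # hypothetical stand-in for moondream.encode_image
    return f"<embeds for {img}>"

def answer(img):
    if img is None:
        yield ""
        return  # ends the generator; without it, encode(None) below still runs
    yield encode(img)

print(list(answer(None)))   # ['']
print(list(answer("cat")))  # ['<embeds for cat>']
```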
@@ -81,59 +81,172 @@ def caption(img, mode):
         yield buffer.strip()
 
 
-def extract_floats(text):
-    # Regular expression to match an array of four floating point numbers
-    pattern = r"\[\s*(-?\d+\.\d+)\s*,\s*(-?\d+\.\d+)\s*,\s*(-?\d+\.\d+)\s*,\s*(-?\d+\.\d+)\s*\]"
-    match = re.search(pattern, text)
-    if match:
-        # Extract the numbers and convert them to floats
-        return [float(num) for num in match.groups()]
-    return None  # Return None if no match is found
-
-
-def extract_bbox(text):
-    bbox = None
-    if extract_floats(text) is not None:
-        x1, y1, x2, y2 = extract_floats(text)
-        bbox = (x1, y1, x2, y2)
-    return bbox
-
-
-def process_answer(img, answer):
-    if extract_bbox(answer) is not None:
-        x1, y1, x2, y2 = extract_bbox(answer)
-        draw_image = Resize(768)(img)
-        width, height = draw_image.size
-        x1, x2 = int(x1 * width), int(x2 * width)
-        y1, y2 = int(y1 * height), int(y2 * height)
-        bbox = (x1, y1, x2, y2)
-        ImageDraw.Draw(draw_image).rectangle(bbox, outline="red", width=3)
-        return gr.update(visible=True, value=draw_image)
-
-    return gr.update(visible=False, value=None)
-
-
-with gr.Blocks(title="moondream vl (new)") as demo:
-    gr.HTML(
-        """
-        <style type="text/css">
-            .output-text span p { font-size: 1.4rem !important; }
-        </style>
-        """
-    )
+@spaces.GPU(duration=10)
+def detect(img, object):
+    w, h = img.size
+    if w > 768 or h > 768:
+        img = Resize(768)(img)
+
+    objs = moondream.detect(img, object, tokenizer)
+    draw_image = ImageDraw.Draw(img)
+    for o in objs:
+        draw_image.rectangle(
+            (o["x_min"] * w, o["y_min"] * h, o["x_max"] * w, o["y_max"] * h),
+            outline="red",
+            width=3,
+        )
+
+    return gr.update(visible=True, value=img)
+
+
+js = """
+function createBgAnimation() {
+  var canvas = document.createElement('canvas');
+  canvas.id = 'life-canvas';
+  document.body.appendChild(canvas);
+
+  var canvas = document.getElementById('life-canvas');
+  var ctx = canvas.getContext('2d');
+
+  function resizeCanvas() {
+    canvas.width = window.innerWidth;
+    canvas.height = window.innerHeight;
+  }
+  resizeCanvas();
+  window.addEventListener('resize', resizeCanvas);
+
+  var cellSize = 8;
+  var cols = Math.ceil(canvas.width / cellSize);
+  var rows = Math.ceil(canvas.height / cellSize);
+
+  // Track cell age for color variation
+  var grid = new Array(cols).fill(null)
+    .map(() => new Array(rows).fill(null)
+    .map(() => Math.random() > 0.8 ? 1 : 0)); // If alive, start with age 1
+
+  function countNeighbors(grid, x, y) {
+    var sum = 0;
+    for (var i = -1; i < 2; i++) {
+      for (var j = -1; j < 2; j++) {
+        var col = (x + i + cols) % cols;
+        var row = (y + j + rows) % rows;
+        sum += grid[col][row] ? 1 : 0;
+      }
+    }
+    sum -= grid[x][y] ? 1 : 0;
+    return sum;
+  }
+
+  function computeNextGeneration() {
+    var next = grid.map(arr => [...arr]);
+
+    for (var i = 0; i < cols; i++) {
+      for (var j = 0; j < rows; j++) {
+        var neighbors = countNeighbors(grid, i, j);
+        var state = grid[i][j];
+
+        if (state) {
+          if (neighbors < 2 || neighbors > 3) {
+            next[i][j] = 0; // Cell dies
+          } else {
+            next[i][j] = Math.min(state + 1, 5); // Age the cell, max age of 5
+          }
+        } else if (neighbors === 3) {
+          next[i][j] = 1; // New cell born
+        }
+      }
+    }
+
+    grid = next;
+  }
+
+  function getColor(age, isDarkMode) {
+    // Light mode colors
+    var lightColors = {
+      1: '#dae1f5', // Light blue-grey
+      2: '#d3e0f4',
+      3: '#ccdff3',
+      4: '#c5def2',
+      5: '#beddf1' // Slightly deeper blue-grey
+    };
+
+    // Dark mode colors
+    var darkColors = {
+      1: '#4a5788', // Deep blue-grey
+      2: '#4c5a8d',
+      3: '#4e5d92',
+      4: '#506097',
+      5: '#52639c' // Brighter blue-grey
+    };
+
+    return isDarkMode ? darkColors[age] : lightColors[age];
+  }
+
+  function draw() {
+    var isDarkMode = document.body.classList.contains('dark');
+    ctx.fillStyle = isDarkMode ? '#333' : '#f0f0f0';
+    ctx.fillRect(0, 0, canvas.width, canvas.height);
+
+    for (var i = 0; i < cols; i++) {
+      for (var j = 0; j < rows; j++) {
+        if (grid[i][j]) {
+          ctx.fillStyle = getColor(grid[i][j], isDarkMode);
+          ctx.fillRect(i * cellSize, j * cellSize, cellSize - 1, cellSize - 1);
+        }
+      }
+    }
+  }
+
+  var lastFrame = 0;
+  var frameInterval = 300;
+
+  function animate(timestamp) {
+    if (timestamp - lastFrame >= frameInterval) {
+      draw();
+      computeNextGeneration();
+      lastFrame = timestamp;
+    }
+    requestAnimationFrame(animate);
+  }
+
+  animate(0);
+}
+"""
+
+css = """
+.output-text span p {
+    font-size: 1.4rem !important;
+}
+
+#life-canvas {
+    position: fixed;
+    top: 0;
+    left: 0;
+    width: 100%;
+    height: 100%;
+    z-index: -1;
+    opacity: 0.3;
+}
+
+body gradio-app {
+    background: none !important;
+}
+"""
+
+with gr.Blocks(title="moondream vl (new)", css=css, js=js) as demo:
     gr.Markdown(
         """
         # 🌔 moondream vl (new)
         A tiny vision language model. [GitHub](https://github.com/vikhyat/moondream)
         """
     )
+    mode_radio = gr.Radio(
+        ["Caption", "Query", "Detect"],
+        show_label=False,
+        value=lambda: "Caption",
+    )
     with gr.Row():
         with gr.Column():
-            mode_radio = gr.Radio(
-                ["Caption", "Query", "Detect"],
-                show_label=False,
-                value=lambda: "Caption",
-            )
 
             @gr.render(inputs=[mode_radio])
             def show_inputs(mode):
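This hunk replaces the old prompt-then-regex pipeline (`extract_floats` / `extract_bbox` / `process_answer`) with a dedicated `detect` entry point: `moondream.detect(img, object, tokenizer)` returns structured boxes, and the multiplications by `w` and `h` suggest the coordinates come back normalized to [0, 1]. Note that `w, h = img.size` is captured before the optional `Resize(768)`, so the pixel boxes are computed against the original dimensions. Below is a minimal standalone sketch of just the scale-and-draw step, measuring the size on the image actually being drawn on; the `boxes` data is hypothetical, shaped like the dicts the loop above consumes:

```python
from PIL import Image, ImageDraw

def draw_boxes(img: Image.Image, boxes: list[dict]) -> Image.Image:
    """Draw normalized [0, 1] boxes onto an image, in that image's own pixel space."""
    w, h = img.size  # measured after any resizing, on the drawing target
    draw = ImageDraw.Draw(img)
    for box in boxes:
        draw.rectangle(
            (box["x_min"] * w, box["y_min"] * h, box["x_max"] * w, box["y_max"] * h),
            outline="red",
            width=3,
        )
    return img

# Hypothetical detection result in the shape the diff's loop expects.
boxes = [{"x_min": 0.1, "y_min": 0.2, "x_max": 0.5, "y_max": 0.8}]
img = draw_boxes(Image.new("RGB", (640, 480), "white"), boxes)
```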
@@ -157,17 +270,34 @@ with gr.Blocks(title="moondream vl (new)") as demo:
                                 ["Short", "Normal"],
                                 label="Caption Length",
                                 value=lambda: "Normal",
+                                scale=4,
                             )
                             submit = gr.Button("Submit")
                         img = gr.Image(type="pil", label="Upload an Image")
                     submit.click(caption, [img, caption_mode], output)
                     img.change(caption, [img, caption_mode], output)
                 else:
-                    gr.Markdown("Coming soon!")
+                    with gr.Group():
+                        with gr.Row():
+                            prompt = gr.Textbox(
+                                label="Object",
+                                value="Cat",
+                                scale=4,
+                            )
+                            submit = gr.Button("Submit")
+                        img = gr.Image(type="pil", label="Upload an Image")
+                    submit.click(detect, [img, prompt], ann)
+                    prompt.submit(detect, [img, prompt], ann)
+                    img.change(detect, [img, prompt], ann)
 
         with gr.Column():
-            output = gr.Markdown(label="Response", elem_classes=["output-text"])
-            ann = gr.Image(visible=False, label="Annotated Image")
+            output = gr.Markdown(
+                label="Response",
+                elem_classes=["output-text"],
+            )
+            ann = gr.Image(visible=False, show_label=False)
 
+    mode_radio.change(lambda: "", [], output)
+    mode_radio.change(lambda: gr.update(visible=False, value=None), [], ann)
 
 demo.queue().launch()
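Both branches above live inside `show_inputs`, which is decorated with `@gr.render(inputs=[mode_radio])`, Gradio's dynamic-UI hook: the decorated function re-runs and rebuilds its components whenever `mode_radio` changes. That is also why the two new `mode_radio.change(...)` lines are needed, to clear stale output and hide the annotated image when the mode flips. A minimal self-contained sketch of the pattern, with hypothetical component names rather than the Space's actual layout:

```python
import gradio as gr

with gr.Blocks() as demo:
    mode = gr.Radio(["Caption", "Detect"], value="Caption", show_label=False)
    out = gr.Markdown()

    @gr.render(inputs=[mode])
    def build_inputs(m):
        # Re-executed on every change to `mode`; components created here
        # replace the ones from the previous render.
        box = gr.Textbox(label=f"{m} prompt")
        box.submit(lambda s: f"{m}: {s}", [box], out)

    mode.change(lambda: "", [], out)  # clear stale output when the mode flips

demo.queue().launch()
```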
 
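One last detail of the new handler: `@spaces.GPU(duration=10)` is the decorator from the Hugging Face `spaces` package used on ZeroGPU Spaces. The process starts without a GPU, and the decorator attaches one only for the duration of each call (here capped at roughly 10 seconds). A minimal sketch of the pattern, runnable on a ZeroGPU Space:

```python
import spaces
import torch

@spaces.GPU(duration=10)  # a GPU is attached only while this function runs
def double_on_gpu(x: torch.Tensor) -> torch.Tensor:
    # Inside the decorated call, CUDA is available even though the
    # Space process itself starts on CPU.
    return (x.to("cuda") * 2).cpu()

result = double_on_gpu(torch.ones(4))
```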