vikhyatk committed on
Commit 9a9a80e · verified · 1 Parent(s): f67c206

Update app.py

Files changed (1): app.py (+173 -43)
app.py CHANGED
@@ -1,6 +1,5 @@
 import spaces
 import torch
-import re
 import os
 import gradio as gr
 from threading import Thread
@@ -8,7 +7,6 @@ from transformers import (
     TextIteratorStreamer,
     AutoTokenizer,
     AutoModelForCausalLM,
-    StaticCache,
 )
 from PIL import ImageDraw
 from torchvision.transforms.v2 import Resize
@@ -38,6 +36,7 @@ moondream.eval()
 def answer_question(img, prompt):
     if img is None:
         yield ""
+        return
 
     image_embeds = moondream.encode_image(img)
     streamer = TextIteratorStreamer(tokenizer, skip_special_tokens=True)
@@ -62,6 +61,7 @@ def answer_question(img, prompt):
 def caption(img, mode):
     if img is None:
         yield ""
+        return
 
     streamer = TextIteratorStreamer(tokenizer, skip_special_tokens=True)
     thread = Thread(
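The two added `return` statements fix a real bug: `answer_question` and `caption` are generator functions, and `yield ""` alone does not end them, so a `None` image previously fell through to `moondream.encode_image(img)`. A bare `return` inside a generator stops iteration after the empty yield. A minimal sketch of the pattern, with a hypothetical `encode` standing in for the model call:

```python
def encode(img):  # hypothetical stand-in for moondream.encode_image
    return f"<embeds for {img}>"

def answer(img):
    if img is None:
        yield ""
        return  # ends the generator; without it, encode(None) below still runs
    yield encode(img)

print(list(answer(None)))   # ['']
print(list(answer("cat")))  # ['<embeds for cat>']
```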
@@ -81,59 +81,172 @@ def caption(img, mode):
         yield buffer.strip()
 
 
-def extract_floats(text):
-    # Regular expression to match an array of four floating point numbers
-    pattern = r"\[\s*(-?\d+\.\d+)\s*,\s*(-?\d+\.\d+)\s*,\s*(-?\d+\.\d+)\s*,\s*(-?\d+\.\d+)\s*\]"
-    match = re.search(pattern, text)
-    if match:
-        # Extract the numbers and convert them to floats
-        return [float(num) for num in match.groups()]
-    return None  # Return None if no match is found
-
-
-def extract_bbox(text):
-    bbox = None
-    if extract_floats(text) is not None:
-        x1, y1, x2, y2 = extract_floats(text)
-        bbox = (x1, y1, x2, y2)
-    return bbox
-
-
-def process_answer(img, answer):
-    if extract_bbox(answer) is not None:
-        x1, y1, x2, y2 = extract_bbox(answer)
-        draw_image = Resize(768)(img)
-        width, height = draw_image.size
-        x1, x2 = int(x1 * width), int(x2 * width)
-        y1, y2 = int(y1 * height), int(y2 * height)
-        bbox = (x1, y1, x2, y2)
-        ImageDraw.Draw(draw_image).rectangle(bbox, outline="red", width=3)
-        return gr.update(visible=True, value=draw_image)
-
-    return gr.update(visible=False, value=None)
-
-
-with gr.Blocks(title="moondream vl (new)") as demo:
-    gr.HTML(
-        """
-        <style type="text/css">
-            .output-text span p { font-size: 1.4rem !important; }
-        </style>
-        """
-    )
+@spaces.GPU(duration=10)
+def detect(img, object):
+    w, h = img.size
+    if w > 768 or h > 768:
+        img = Resize(768)(img)
+
+    objs = moondream.detect(img, object, tokenizer)
+    draw_image = ImageDraw.Draw(img)
+    for o in objs:
+        draw_image.rectangle(
+            (o["x_min"] * w, o["y_min"] * h, o["x_max"] * w, o["y_max"] * h),
+            outline="red",
+            width=3,
+        )
+
+    return gr.update(visible=True, value=img)
+
+
+js = """
+function createBgAnimation() {
+  var canvas = document.createElement('canvas');
+  canvas.id = 'life-canvas';
+  document.body.appendChild(canvas);
+
+  var canvas = document.getElementById('life-canvas');
+  var ctx = canvas.getContext('2d');
+
+  function resizeCanvas() {
+    canvas.width = window.innerWidth;
+    canvas.height = window.innerHeight;
+  }
+  resizeCanvas();
+  window.addEventListener('resize', resizeCanvas);
+
+  var cellSize = 8;
+  var cols = Math.ceil(canvas.width / cellSize);
+  var rows = Math.ceil(canvas.height / cellSize);
+
+  // Track cell age for color variation
+  var grid = new Array(cols).fill(null)
+    .map(() => new Array(rows).fill(null)
+    .map(() => Math.random() > 0.8 ? 1 : 0)); // If alive, start with age 1
+
+  function countNeighbors(grid, x, y) {
+    var sum = 0;
+    for (var i = -1; i < 2; i++) {
+      for (var j = -1; j < 2; j++) {
+        var col = (x + i + cols) % cols;
+        var row = (y + j + rows) % rows;
+        sum += grid[col][row] ? 1 : 0;
+      }
+    }
+    sum -= grid[x][y] ? 1 : 0;
+    return sum;
+  }
+
+  function computeNextGeneration() {
+    var next = grid.map(arr => [...arr]);
+
+    for (var i = 0; i < cols; i++) {
+      for (var j = 0; j < rows; j++) {
+        var neighbors = countNeighbors(grid, i, j);
+        var state = grid[i][j];
+
+        if (state) {
+          if (neighbors < 2 || neighbors > 3) {
+            next[i][j] = 0; // Cell dies
+          } else {
+            next[i][j] = Math.min(state + 1, 5); // Age the cell, max age of 5
+          }
+        } else if (neighbors === 3) {
+          next[i][j] = 1; // New cell born
+        }
+      }
+    }
+
+    grid = next;
+  }
+
+  function getColor(age, isDarkMode) {
+    // Light mode colors
+    var lightColors = {
+      1: '#dae1f5', // Light blue-grey
+      2: '#d3e0f4',
+      3: '#ccdff3',
+      4: '#c5def2',
+      5: '#beddf1' // Slightly deeper blue-grey
+    };
+
+    // Dark mode colors
+    var darkColors = {
+      1: '#4a5788', // Deep blue-grey
+      2: '#4c5a8d',
+      3: '#4e5d92',
+      4: '#506097',
+      5: '#52639c' // Brighter blue-grey
+    };
+
+    return isDarkMode ? darkColors[age] : lightColors[age];
+  }
+
+  function draw() {
+    var isDarkMode = document.body.classList.contains('dark');
+    ctx.fillStyle = isDarkMode ? '#333' : '#f0f0f0';
+    ctx.fillRect(0, 0, canvas.width, canvas.height);
+
+    for (var i = 0; i < cols; i++) {
+      for (var j = 0; j < rows; j++) {
+        if (grid[i][j]) {
+          ctx.fillStyle = getColor(grid[i][j], isDarkMode);
+          ctx.fillRect(i * cellSize, j * cellSize, cellSize - 1, cellSize - 1);
+        }
+      }
+    }
+  }
+
+  var lastFrame = 0;
+  var frameInterval = 300;
+
+  function animate(timestamp) {
+    if (timestamp - lastFrame >= frameInterval) {
+      draw();
+      computeNextGeneration();
+      lastFrame = timestamp;
+    }
+    requestAnimationFrame(animate);
+  }
+
+  animate(0);
+}
+"""
+
+css = """
+.output-text span p {
+    font-size: 1.4rem !important;
+}
+
+#life-canvas {
+    position: fixed;
+    top: 0;
+    left: 0;
+    width: 100%;
+    height: 100%;
+    z-index: -1;
+    opacity: 0.3;
+}
+
+body gradio-app {
+    background: none !important;
+}
+"""
+
+with gr.Blocks(title="moondream vl (new)", css=css, js=js) as demo:
     gr.Markdown(
         """
         # 🌔 moondream vl (new)
         A tiny vision language model. [GitHub](https://github.com/vikhyat/moondream)
         """
     )
+    mode_radio = gr.Radio(
+        ["Caption", "Query", "Detect"],
+        show_label=False,
+        value=lambda: "Caption",
+    )
     with gr.Row():
         with gr.Column():
-            mode_radio = gr.Radio(
-                ["Caption", "Query", "Detect"],
-                show_label=False,
-                value=lambda: "Caption",
-            )
 
             @gr.render(inputs=[mode_radio])
             def show_inputs(mode):
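This hunk replaces the old prompt-then-regex pipeline (`extract_floats` / `extract_bbox` / `process_answer`) with a dedicated `detect` entry point: `moondream.detect(img, object, tokenizer)` returns structured boxes, and the multiplications by `w` and `h` suggest the coordinates come back normalized to [0, 1]. Note that `w, h = img.size` is captured before the optional `Resize(768)`, so the pixel boxes are computed against the original dimensions. Below is a minimal standalone sketch of just the scale-and-draw step, measuring the size on the image actually being drawn on; the `boxes` data is hypothetical, shaped like the dicts the loop above consumes:

```python
from PIL import Image, ImageDraw

def draw_boxes(img: Image.Image, boxes: list[dict]) -> Image.Image:
    """Draw normalized [0, 1] boxes onto an image, in that image's own pixel space."""
    w, h = img.size  # measured after any resizing, on the drawing target
    draw = ImageDraw.Draw(img)
    for box in boxes:
        draw.rectangle(
            (box["x_min"] * w, box["y_min"] * h, box["x_max"] * w, box["y_max"] * h),
            outline="red",
            width=3,
        )
    return img

# Hypothetical detection result in the shape the diff's loop expects.
boxes = [{"x_min": 0.1, "y_min": 0.2, "x_max": 0.5, "y_max": 0.8}]
img = draw_boxes(Image.new("RGB", (640, 480), "white"), boxes)
```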
@@ -157,17 +270,34 @@ with gr.Blocks(title="moondream vl (new)") as demo:
                                 ["Short", "Normal"],
                                 label="Caption Length",
                                 value=lambda: "Normal",
+                                scale=4,
                             )
                             submit = gr.Button("Submit")
                         img = gr.Image(type="pil", label="Upload an Image")
                     submit.click(caption, [img, caption_mode], output)
                     img.change(caption, [img, caption_mode], output)
                 else:
-                    gr.Markdown("Coming soon!")
+                    with gr.Group():
+                        with gr.Row():
+                            prompt = gr.Textbox(
+                                label="Object",
+                                value="Cat",
+                                scale=4,
+                            )
+                            submit = gr.Button("Submit")
+                        img = gr.Image(type="pil", label="Upload an Image")
+                    submit.click(detect, [img, prompt], ann)
+                    prompt.submit(detect, [img, prompt], ann)
+                    img.change(detect, [img, prompt], ann)
 
         with gr.Column():
-            output = gr.Markdown(label="Response", elem_classes=["output-text"])
-            ann = gr.Image(visible=False, label="Annotated Image")
+            output = gr.Markdown(
+                label="Response",
+                elem_classes=["output-text"],
+            )
+            ann = gr.Image(visible=False, show_label=False)
 
+    mode_radio.change(lambda: "", [], output)
+    mode_radio.change(lambda: gr.update(visible=False, value=None), [], ann)
 
 demo.queue().launch()
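Both branches above live inside `show_inputs`, which is decorated with `@gr.render(inputs=[mode_radio])`, Gradio's dynamic-UI hook: the decorated function re-runs and rebuilds its components whenever `mode_radio` changes. That is also why the two new `mode_radio.change(...)` lines are needed, to clear stale output and hide the annotated image when the mode flips. A minimal self-contained sketch of the pattern, with hypothetical component names rather than the Space's actual layout:

```python
import gradio as gr

with gr.Blocks() as demo:
    mode = gr.Radio(["Caption", "Detect"], value="Caption", show_label=False)
    out = gr.Markdown()

    @gr.render(inputs=[mode])
    def build_inputs(m):
        # Re-executed on every change to `mode`; components created here
        # replace the ones from the previous render.
        box = gr.Textbox(label=f"{m} prompt")
        box.submit(lambda s: f"{m}: {s}", [box], out)

    mode.change(lambda: "", [], out)  # clear stale output when the mode flips

demo.queue().launch()
```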
 
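One last detail of the new handler: `@spaces.GPU(duration=10)` is the decorator from the Hugging Face `spaces` package used on ZeroGPU Spaces. The process starts without a GPU, and the decorator attaches one only for the duration of each call (here capped at roughly 10 seconds). A minimal sketch of the pattern, runnable on a ZeroGPU Space:

```python
import spaces
import torch

@spaces.GPU(duration=10)  # a GPU is attached only while this function runs
def double_on_gpu(x: torch.Tensor) -> torch.Tensor:
    # Inside the decorated call, CUDA is available even though the
    # Space process itself starts on CPU.
    return (x.to("cuda") * 2).cpu()

result = double_on_gpu(torch.ones(4))
```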