KarthiEz committed
Commit a2938a5 · verified · 1 Parent(s): b84a459

Upload 12 files

Files changed (12)
  1. .gitattributes +52 -40
  2. README.md +14 -13
  3. app.py +440 -0
  4. examples/1.jpg +0 -0
  5. examples/2.jpg +3 -0
  6. examples/3.jpg +3 -0
  7. md3/1.jpg +0 -0
  8. md3/2.jpg +3 -0
  9. md3/3.png +3 -0
  10. md3/4.jpeg +3 -0
  11. pre-requirements.txt +1 -0
  12. requirements.txt +25 -0
.gitattributes CHANGED
@@ -1,40 +1,52 @@
- *.7z filter=lfs diff=lfs merge=lfs -text
- *.arrow filter=lfs diff=lfs merge=lfs -text
- *.bin filter=lfs diff=lfs merge=lfs -text
- *.bz2 filter=lfs diff=lfs merge=lfs -text
- *.ckpt filter=lfs diff=lfs merge=lfs -text
- *.ftz filter=lfs diff=lfs merge=lfs -text
- *.gz filter=lfs diff=lfs merge=lfs -text
- *.h5 filter=lfs diff=lfs merge=lfs -text
- *.joblib filter=lfs diff=lfs merge=lfs -text
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
- *.model filter=lfs diff=lfs merge=lfs -text
- *.msgpack filter=lfs diff=lfs merge=lfs -text
- *.npy filter=lfs diff=lfs merge=lfs -text
- *.npz filter=lfs diff=lfs merge=lfs -text
- *.onnx filter=lfs diff=lfs merge=lfs -text
- *.ot filter=lfs diff=lfs merge=lfs -text
- *.parquet filter=lfs diff=lfs merge=lfs -text
- *.pb filter=lfs diff=lfs merge=lfs -text
- *.pickle filter=lfs diff=lfs merge=lfs -text
- *.pkl filter=lfs diff=lfs merge=lfs -text
- *.pt filter=lfs diff=lfs merge=lfs -text
- *.pth filter=lfs diff=lfs merge=lfs -text
- *.rar filter=lfs diff=lfs merge=lfs -text
- *.safetensors filter=lfs diff=lfs merge=lfs -text
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
- *.tar.* filter=lfs diff=lfs merge=lfs -text
- *.tar filter=lfs diff=lfs merge=lfs -text
- *.tflite filter=lfs diff=lfs merge=lfs -text
- *.tgz filter=lfs diff=lfs merge=lfs -text
- *.wasm filter=lfs diff=lfs merge=lfs -text
- *.xz filter=lfs diff=lfs merge=lfs -text
- *.zip filter=lfs diff=lfs merge=lfs -text
- *.zst filter=lfs diff=lfs merge=lfs -text
- *tfevents* filter=lfs diff=lfs merge=lfs -text
- moondream3pre/examples/2.jpg filter=lfs diff=lfs merge=lfs -text
- moondream3pre/examples/3.jpg filter=lfs diff=lfs merge=lfs -text
- moondream3pre/md3/2.jpg filter=lfs diff=lfs merge=lfs -text
- moondream3pre/md3/3.png filter=lfs diff=lfs merge=lfs -text
- moondream3pre/md3/4.jpeg filter=lfs diff=lfs merge=lfs -text
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tar filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
+ images/1.png filter=lfs diff=lfs merge=lfs -text
+ images/4.png filter=lfs diff=lfs merge=lfs -text
+ object/1.png filter=lfs diff=lfs merge=lfs -text
+ object/2.png filter=lfs diff=lfs merge=lfs -text
+ videos/1.mp4 filter=lfs diff=lfs merge=lfs -text
+ videos/2.mp4 filter=lfs diff=lfs merge=lfs -text
+ images/11.png filter=lfs diff=lfs merge=lfs -text
+ images/22.png filter=lfs diff=lfs merge=lfs -text
+ images/5.jpg filter=lfs diff=lfs merge=lfs -text
+ videos/a.mp4 filter=lfs diff=lfs merge=lfs -text
+ videos/b.mp4 filter=lfs diff=lfs merge=lfs -text
+ examples/sample_image1.png filter=lfs diff=lfs merge=lfs -text
+ md3/2.jpg filter=lfs diff=lfs merge=lfs -text
+ md3/3.png filter=lfs diff=lfs merge=lfs -text
+ md3/4.jpeg filter=lfs diff=lfs merge=lfs -text
+ examples/2.jpg filter=lfs diff=lfs merge=lfs -text
+ examples/3.jpg filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -1,13 +1,14 @@
- ---
- title: Stampdetection
- emoji: 📉
- colorFrom: yellow
- colorTo: purple
- sdk: gradio
- sdk_version: 5.49.1
- app_file: app.py
- pinned: false
- short_description: It is for detection sign/stamp detection
- ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ ---
+ title: Multimodal VLM v1.0
+ emoji:
+ colorFrom: blue
+ colorTo: gray
+ sdk: gradio
+ sdk_version: 5.49.1
+ app_file: app.py
+ pinned: true
+ license: apache-2.0
+ short_description: OCR, VQA, Thinking and Object Detection.
+ ---
+
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,440 @@
+ import spaces
+ import json
+ import math
+ import os
+ import traceback
+ from io import BytesIO
+ from typing import Any, Dict, List, Optional, Tuple, Iterable
+ import re
+ import time
+ from threading import Thread
+ from io import BytesIO
+ import uuid
+ import tempfile
+
+ import gradio as gr
+ import numpy as np
+ import torch
+ from PIL import Image
+ import supervision as sv
+
+
+ from transformers import (
+     Qwen2_5_VLForConditionalGeneration,
+     Glm4vForConditionalGeneration,
+     Qwen2VLForConditionalGeneration,
+     AutoModelForCausalLM,
+     AutoProcessor,
+     TextIteratorStreamer,
+ )
+ from gradio.themes import Soft
+ from gradio.themes.utils import colors, fonts, sizes
+
+ # --- Theme Definition ---
+
+ # Define a new color palette for Blue
+ colors.blue_theme_color = colors.Color(
+     name="blue_theme_color",
+     c50="#E6E6FF",
+     c100="#CCCCFF",
+     c200="#9999FF",
+     c300="#6666FF",
+     c400="#3333FF",
+     c500="#0000FF",  # Base Blue color
+     c600="#0000D9",
+     c700="#0000B3",
+     c800="#000080",
+     c900="#000066",
+     c950="#000033",
+ )
+
+ class BlueTheme(Soft):
+     def __init__(
+         self,
+         *,
+         primary_hue: colors.Color | str = colors.gray,
+         secondary_hue: colors.Color | str = colors.blue_theme_color,
+         neutral_hue: colors.Color | str = colors.slate,
+         text_size: sizes.Size | str = sizes.text_lg,
+         font: fonts.Font | str | Iterable[fonts.Font | str] = (
+             fonts.GoogleFont("Outfit"), "Arial", "sans-serif",
+         ),
+         font_mono: fonts.Font | str | Iterable[fonts.Font | str] = (
+             fonts.GoogleFont("IBM Plex Mono"), "ui-monospace", "monospace",
+         ),
+     ):
+         super().__init__(
+             primary_hue=primary_hue,
+             secondary_hue=secondary_hue,
+             neutral_hue=neutral_hue,
+             text_size=text_size,
+             font=font,
+             font_mono=font_mono,
+         )
+         super().set(
+             background_fill_primary="*primary_50",
+             background_fill_primary_dark="*primary_900",
+             body_background_fill="linear-gradient(135deg, *primary_200, *primary_100)",
+             body_background_fill_dark="linear-gradient(135deg, *primary_900, *primary_800)",
+             button_primary_text_color="white",
+             button_primary_text_color_hover="white",
+             button_primary_background_fill="linear-gradient(90deg, *secondary_500, *secondary_600)",
+             button_primary_background_fill_hover="linear-gradient(90deg, *secondary_600, *secondary_700)",
+             button_primary_background_fill_dark="linear-gradient(90deg, *secondary_600, *secondary_700)",
+             button_primary_background_fill_hover_dark="linear-gradient(90deg, *secondary_500, *secondary_600)",
+             slider_color="*secondary_500",
+             slider_color_dark="*secondary_600",
+             block_title_text_weight="600",
+             block_border_width="2px",
+             block_shadow="*shadow_drop_lg",
+             button_primary_shadow="*shadow_drop_lg",
+             button_large_padding="12px",
+             block_label_background_fill="*primary_200",
+         )
+
+ # Instantiate the theme
+ blue_theme = BlueTheme()
+
+ # --- Constants and Model Setup ---
+ MAX_INPUT_TOKEN_LENGTH = 4096
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+ print("--- System Information ---")
+ print("CUDA_VISIBLE_DEVICES=", os.environ.get("CUDA_VISIBLE_DEVICES"))
+ print("torch.__version__ =", torch.__version__)
+ print("torch.version.cuda =", torch.version.cuda)
+ print("CUDA available:", torch.cuda.is_available())
+ print("CUDA device count:", torch.cuda.device_count())
+ if torch.cuda.is_available():
+     print("Current device:", torch.cuda.current_device())
+     print("Device name:", torch.cuda.get_device_name(torch.cuda.current_device()))
+ print("Using device:", device)
+ print("--------------------------")
+
+
+ # --- Model Loading ---
+
+ # Load Camel-Doc-OCR-062825
+ print("Loading Camel-Doc-OCR-062825...")
+ MODEL_ID_M = "prithivMLmods/Camel-Doc-OCR-062825"
+ processor_m = AutoProcessor.from_pretrained(MODEL_ID_M, trust_remote_code=True)
+ model_m = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+     MODEL_ID_M,
+     trust_remote_code=True,
+     torch_dtype=torch.float16
+ ).to(device).eval()
+ print("Camel-Doc-OCR-062825 loaded.")
+
+ # GLM-4.1V-9B-Thinking
+ print("Loading GLM-4.1V-9B-Thinking")
+ MODEL_ID_T = "zai-org/GLM-4.1V-9B-Thinking"
+ processor_t = AutoProcessor.from_pretrained(MODEL_ID_T, trust_remote_code=True)
+ model_t = Glm4vForConditionalGeneration.from_pretrained(
+     MODEL_ID_T,
+     trust_remote_code=True,
+     torch_dtype=torch.float16
+ ).to(device).eval()
+ print("GLM-4.1V-9B-Thinking loaded.")
+
+ # Load moondream3
+ print("Loading moondream3-preview...")
+ MODEL_ID_MD3 = "moondream/moondream3-preview"
+ model_md3 = AutoModelForCausalLM.from_pretrained(
+     MODEL_ID_MD3,
+     trust_remote_code=True,
+     torch_dtype=torch.bfloat16,
+     device_map={"": "cuda"},
+ )
+ model_md3.compile()
+ print("moondream3-preview loaded and compiled.")
+
+
+ # --- Moondream3 Utility Functions ---
+
+ def create_annotated_image(image, detection_result, object_name="Object"):
+     if not isinstance(detection_result, dict) or "objects" not in detection_result:
+         return image
+
+     original_width, original_height = image.size
+     annotated_image = np.array(image.convert("RGB"))
+
+     bboxes = []
+     labels = []
+
+     for i, obj in enumerate(detection_result["objects"]):
+         x_min = int(obj["x_min"] * original_width)
+         y_min = int(obj["y_min"] * original_height)
+         x_max = int(obj["x_max"] * original_width)
+         y_max = int(obj["y_max"] * original_height)
+
+         x_min = max(0, min(x_min, original_width))
+         y_min = max(0, min(y_min, original_height))
+         x_max = max(0, min(x_max, original_width))
+         y_max = max(0, min(y_max, original_height))
+
+         if x_max > x_min and y_max > y_min:
+             bboxes.append([x_min, y_min, x_max, y_max])
+             labels.append(f"{object_name} {i+1}")
+
+     if not bboxes:
+         return image
+
+     detections = sv.Detections(
+         xyxy=np.array(bboxes, dtype=np.float32),
+         class_id=np.arange(len(bboxes))
+     )
+
+     bounding_box_annotator = sv.BoxAnnotator(
+         thickness=3,
+         color_lookup=sv.ColorLookup.INDEX
+     )
+     label_annotator = sv.LabelAnnotator(
+         text_thickness=2,
+         text_scale=0.6,
+         color_lookup=sv.ColorLookup.INDEX
+     )
+
+     annotated_image = bounding_box_annotator.annotate(
+         scene=annotated_image, detections=detections
+     )
+     annotated_image = label_annotator.annotate(
+         scene=annotated_image, detections=detections, labels=labels
+     )
+
+     return Image.fromarray(annotated_image)
+
+ def create_point_annotated_image(image, point_result):
+     if not isinstance(point_result, dict) or "points" not in point_result:
+         return image
+
+     original_width, original_height = image.size
+     annotated_image = np.array(image.convert("RGB"))
+
+     points = []
+     for point in point_result["points"]:
+         x = int(point["x"] * original_width)
+         y = int(point["y"] * original_height)
+         points.append([x, y])
+
+     if points:
+         points_array = np.array(points).reshape(1, -1, 2)
+         key_points = sv.KeyPoints(xy=points_array)
+         vertex_annotator = sv.VertexAnnotator(radius=8, color=sv.Color.RED)
+         annotated_image = vertex_annotator.annotate(
+             scene=annotated_image, key_points=key_points
+         )
+
+     return Image.fromarray(annotated_image)
+
+ @spaces.GPU()
+ def detect_objects_md3(image, prompt, task_type, max_objects):
+     STANDARD_SIZE = (1024, 1024)
+     if image is None:
+         raise gr.Error("Please upload an image.")
+     image.thumbnail(STANDARD_SIZE)
+
+     t0 = time.perf_counter()
+
+     if task_type == "Object Detection":
+         settings = {"max_objects": max_objects} if max_objects > 0 else {}
+         result = model_md3.detect(image, prompt, settings=settings)
+         annotated_image = create_annotated_image(image, result, prompt)
+     elif task_type == "Point Detection":
+         result = model_md3.point(image, prompt)
+         annotated_image = create_point_annotated_image(image, result)
+     elif task_type == "Caption":
+         result = model_md3.caption(image, length="normal")
+         annotated_image = image
+     else:
+         result = model_md3.query(image=image, question=prompt, reasoning=True)
+         annotated_image = image
+
+     elapsed_ms = (time.perf_counter() - t0) * 1_000
+
+     if isinstance(result, dict):
+         if "objects" in result:
+             output_text = f"Found {len(result['objects'])} objects:\n"
+             for i, obj in enumerate(result['objects'], 1):
+                 output_text += f"\n{i}. Bounding box: ({obj['x_min']:.3f}, {obj['y_min']:.3f}, {obj['x_max']:.3f}, {obj['y_max']:.3f})"
+         elif "points" in result:
+             output_text = f"Found {len(result['points'])} points:\n"
+             for i, point in enumerate(result['points'], 1):
+                 output_text += f"\n{i}. Point: ({point['x']:.3f}, {point['y']:.3f})"
+         elif "caption" in result:
+             output_text = result['caption']
+         elif "answer" in result:
+             output_text = f"Reasoning: {result.get('reasoning', 'N/A')}\n\nAnswer: {result['answer']}"
+         else:
+             output_text = json.dumps(result, indent=2)
+     else:
+         output_text = str(result)
+
+     timing_text = f"Inference time: {elapsed_ms:.0f} ms"
+
+     return annotated_image, output_text, timing_text
+
+
+ # --- Core Application Logic (for other models) ---
+ @spaces.GPU
+ def process_document_stream(
+     model_name: str,
+     image: Image.Image,
+     prompt_input: str,
+     max_new_tokens: int,
+     temperature: float,
+     top_p: float,
+     top_k: int,
+     repetition_penalty: float
+ ):
+     """
+     Main generator function for models other than Moondream3.
+     """
+     if image is None:
+         yield "Please upload an image."
+         return
+     if not prompt_input or not prompt_input.strip():
+         yield "Please enter a prompt."
+         return
+
+     # Select processor and model based on dropdown choice
+     if model_name == "Camel-Doc-OCR-062825 (OCR)":
+         processor, model = processor_m, model_m
+     elif model_name == "GLM-4.1V-9B (Thinking)":
+         processor, model = processor_t, model_t
+     else:
+         yield "Invalid model selected."
+         return
+
+     messages = [{"role": "user", "content": [{"type": "image"}, {"type": "text", "text": prompt_input}]}]
+     prompt_full = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+     inputs = processor(text=[prompt_full], images=[image], return_tensors="pt", padding=True, truncation=True, max_length=MAX_INPUT_TOKEN_LENGTH).to(device)
+
+     streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
+
+     generation_kwargs = {
+         **inputs,
+         "streamer": streamer,
+         "max_new_tokens": max_new_tokens,
+         "temperature": temperature,
+         "top_p": top_p,
+         "top_k": top_k,
+         "repetition_penalty": repetition_penalty,
+         "do_sample": True if temperature > 0 else False
+     }
+
+     thread = Thread(target=model.generate, kwargs=generation_kwargs)
+     thread.start()
+
+     buffer = ""
+     for new_text in streamer:
+         buffer += new_text
+         # Clean up potential model-specific tokens
+         buffer = buffer.replace("<|im_end|>", "").replace("</s>", "")
+         time.sleep(0.01)
+         yield buffer
+
+ def create_gradio_interface():
+     """Builds and returns the Gradio web interface."""
+     css = """
+     .main-container { max-width: 1400px; margin: 0 auto; }
+     #main-title h1 {font-size: 2.2em !important;}
+     """
+     with gr.Blocks(theme=blue_theme, css=css) as demo:
+         gr.Markdown("# **Multimodal VLM v1.0**", elem_id="main-title")
+         gr.Markdown("Explore the capabilities of various Vision Language Models for tasks like OCR, VQA, and Object Detection.")
+
+         with gr.Tabs():
+             # --- TAB 1: Document and General VLMs ---
+             with gr.TabItem("📄 Document & General VLM"):
+                 with gr.Row():
+                     with gr.Column(scale=1):
+                         #gr.Markdown("### 1. Configure Inputs")
+                         model_choice = gr.Dropdown(
+                             choices=["Camel-Doc-OCR-062825 (OCR)", "GLM-4.1V-9B (Thinking)"],
+                             label="Select Model", value="Camel-Doc-OCR-062825 (OCR)"
+                         )
+                         image_input_doc = gr.Image(label="Upload Image", type="pil", sources=['upload'], height=280)
+                         prompt_input_doc = gr.Textbox(label="Query Input", placeholder="e.g., 'Transcribe the text in this document.'")
+
+                         with gr.Accordion("Advanced Generation Settings", open=False):
+                             max_new_tokens = gr.Slider(minimum=256, maximum=4096, value=2048, step=128, label="Max New Tokens")
+                             temperature = gr.Slider(label="Temperature", minimum=0.1, maximum=2.0, step=0.1, value=0.7)
+                             top_p = gr.Slider(label="Top-p", minimum=0.1, maximum=1.0, step=0.05, value=0.9)
+                             top_k = gr.Slider(label="Top-k", minimum=1, maximum=100, step=1, value=40)
+                             repetition_penalty = gr.Slider(label="Repetition Penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.1)
+
+                         process_btn = gr.Button("Submit", variant="primary")
+                         clear_btn = gr.Button("Clear", variant="secondary")
+
+                     with gr.Column(scale=2):
+                         #gr.Markdown("### 2. View Output")
+                         with gr.Tab("Output Stream"):
+                             output_stream = gr.Textbox(label="Model Output", interactive=False, lines=24, show_copy_button=True)
+
+                 gr.Examples(
+                     examples=[
+                         ["examples/1.jpg", "Transcribe this receipt."],
+                         ["examples/2.jpg", "Extract the content."],
+                         ["examples/3.jpg", "OCR the image."],
+                     ],
+                     inputs=[image_input_doc, prompt_input_doc]
+                 )
+
+             # --- TAB 2: Moondream3 Lab ---
+             with gr.TabItem("🌝 Moondream3"):
+                 with gr.Row():
+                     with gr.Column(scale=1):
+                         md3_image_input = gr.Image(label="Upload an image", type="pil", height=400)
+                         md3_task_type = gr.Radio(
+                             choices=["Object Detection", "Point Detection", "Caption", "Visual Question Answering"],
+                             label="Task Type", value="Object Detection"
+                         )
+                         md3_prompt_input = gr.Textbox(
+                             label="Prompt (object to detect/question to ask)",
+                             placeholder="e.g., 'car', 'person', 'What's in this image?'"
+                         )
+                         md3_max_objects = gr.Number(
+                             label="Max Objects (for Object Detection only)",
+                             value=10, minimum=1, maximum=50, step=1, visible=True
+                         )
+                         md3_generate_btn = gr.Button(value="Submit", variant="primary")
+                     with gr.Column(scale=1):
+                         md3_output_image = gr.Image(type="pil", label="Result", height=400)
+                         md3_output_textbox = gr.Textbox(label="Model Response", lines=10, show_copy_button=True)
+                         md3_output_time = gr.Markdown()
+
+                 gr.Examples(
+                     examples=[
+                         ["md3/1.jpg", "Object Detection", "boats", 7],
+                         ["md3/2.jpg", "Point Detection", "children", 7],
+                         ["md3/3.png", "Caption", "", 5],
+                         ["md3/4.jpeg", "Visual Question Answering", "Analyze the GDP trend over the years.", 5],
+                     ],
+                     inputs=[md3_image_input, md3_task_type, md3_prompt_input, md3_max_objects],
+                     label="Click an example to populate inputs"
+                 )
+
+         process_btn.click(
+             fn=process_document_stream,
+             inputs=[model_choice, image_input_doc, prompt_input_doc, max_new_tokens, temperature, top_p, top_k, repetition_penalty],
+             outputs=[output_stream]
+         )
+         clear_btn.click(lambda: (None, "", ""), outputs=[image_input_doc, prompt_input_doc, output_stream])
+
+         # Moondream3 Tab
+         def update_max_objects_visibility(task):
+             return gr.update(visible=(task == "Object Detection"))
+
+         md3_task_type.change(fn=update_max_objects_visibility, inputs=[md3_task_type], outputs=[md3_max_objects])
+
+         md3_generate_btn.click(
+             fn=detect_objects_md3,
+             inputs=[md3_image_input, md3_prompt_input, md3_task_type, md3_max_objects],
+             outputs=[md3_output_image, md3_output_textbox, md3_output_time]
+         )
+
+     return demo
+
+ if __name__ == "__main__":
+     demo = create_gradio_interface()
+     demo.queue(max_size=50).launch(ssr_mode=False, mcp_server=True, show_error=True)
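
Note: app.py exposes two entry points, process_document_stream for the streaming OCR/VQA models and detect_objects_md3 for the Moondream3 tasks. The sketch below is a minimal local smoke test, not part of the commit; it assumes the Space repository is checked out, requirements.txt is installed, and a GPU with enough memory for all three models is available (importing app loads them eagerly). The prompts are placeholders only.

from PIL import Image
import app  # assumption: run from the Space checkout; importing app loads all three models

image = Image.open("examples/1.jpg")

# Stream an OCR transcription from the Camel-Doc model; each yield is the accumulated text.
text = ""
for partial in app.process_document_stream(
    "Camel-Doc-OCR-062825 (OCR)", image, "Transcribe this receipt.",
    max_new_tokens=1024, temperature=0.7, top_p=0.9, top_k=40, repetition_penalty=1.1,
):
    text = partial
print(text)

# Run Moondream3 object detection on the same image (prompt is illustrative).
annotated, summary, timing = app.detect_objects_md3(image, "text block", "Object Detection", 10)
print(summary, timing)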
examples/1.jpg ADDED
examples/2.jpg ADDED

Git LFS Details

  • SHA256: 742a756ea115d793979fd29b726dacd1c341aeabb47f20561d2ba4513dff3ad3
  • Pointer size: 131 Bytes
  • Size of remote file: 135 kB
examples/3.jpg ADDED

Git LFS Details

  • SHA256: 02e9aa9ccdfe57430119b7ae7dc7a2d9967df58450c059bac795ac32aecf5900
  • Pointer size: 131 Bytes
  • Size of remote file: 332 kB
md3/1.jpg ADDED
md3/2.jpg ADDED

Git LFS Details

  • SHA256: 61bcb1c013f1efc0f1c3bf9a7ff23854f004e658ae64ada79c5f0bbf7d8e70ea
  • Pointer size: 131 Bytes
  • Size of remote file: 101 kB
md3/3.png ADDED

Git LFS Details

  • SHA256: 10b27eea33a5271baed5bf6c7f36477840d5501025a1f0b0503eeaeff9101876
  • Pointer size: 131 Bytes
  • Size of remote file: 147 kB
md3/4.jpeg ADDED

Git LFS Details

  • SHA256: 54908e3a1ebfdae3553658bb5d370b2fca66a40d1c9977f9ee9cf9df116d8372
  • Pointer size: 131 Bytes
  • Size of remote file: 426 kB
pre-requirements.txt ADDED
@@ -0,0 +1 @@
+ pip>=23.0.0
requirements.txt ADDED
@@ -0,0 +1,25 @@
+ git+https://github.com/huggingface/accelerate.git
+ git+https://github.com/huggingface/peft.git
+ transformers-stream-generator
+ huggingface_hub
+ albumentations
+ qwen-vl-utils
+ sentencepiece
+ opencv-python
+ transformers
+ torchvision
+ supervision
+ matplotlib
+ num2words
+ reportlab
+ xformers
+ markdown
+ requests
+ hf_xet
+ spaces
+ pillow
+ gradio
+ einops
+ torch
+ timm
+ av