yuhangzang committed
Commit 74cb695 · 1 Parent(s): 583c33f
Files changed (2)
  1. README.md +2 -2
  2. app.py +80 -15
README.md CHANGED
@@ -1,6 +1,6 @@
 ---
 title: CapRL
-emoji: 🌖
+emoji: 🚀
 colorFrom: purple
 colorTo: green
 sdk: gradio
@@ -8,7 +8,7 @@ sdk_version: 5.49.1
 app_file: app.py
 pinned: false
 license: apache-2.0
-short_description: Generate captions for images with CapRL (CPU-only)
+short_description: Generate captions for images with CapRL
 ---

 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py CHANGED
@@ -7,6 +7,52 @@ MODEL_ID = "internlm/CapRL-3B"
 DEFAULT_PROMPT = "Describe the image in detail."
 MAX_NEW_TOKENS = 4096

+# Defaults for UI
+DEFAULT_IMAGE_PATH = "./examples/example_chinese.png"
+DEFAULT_CAPTION = """The image depicts a text excerpt, likely from an ancient or classical Chinese document, written in traditional script. Here is a detailed description:
+
+---
+
+**Image Description:**
+
+The image shows a page from what appears to be a historical or literary text, specifically related to ancient Chinese poetry or prose. The text is titled "序" (preface), indicating it's an introductory section. The content is structured as follows:
+
+1. **Header and Title:**
+   - The top line reads "古人云五百年萬六千日何況人生不閨雅佳" which translates to "Ancient people say that even five hundred years and six thousand days are nothing compared to life, not to mention being unreservedly beautiful."
+
+2. **Main Text Content:**
+   - The passage begins with "詎醒世之人萬六千日謂生人一事無成" suggesting a reflection on the fleeting nature of life over a vast period.
+
+3. **Key Phrases and Names:
+   - "過道還書身易張日夕思慕最凡在秋神冷" refers to someone who frequently thinks about books and autumn, possibly a scholar or writer.
+
+4. **Detailed Descriptions:
+   - "道士六魌急力問夢至春田處於筆頭翻得詩" describes a道士在六个地狱中匆忙询问梦境,最终在春天的田野处从笔下翻出诗句。 This indicates a dream or mystical experience leading to poetic inspiration.
+
+5. **Cultural References:
+   - "曲數本其間四時風景幽蘭處蜘蛛繚繞之思應如知" mentions counting the four seasons and the beauty of幽兰,with spiders weaving around, suggesting a vivid, natural scene.
+
+6. **Time and Weather:
+   - "在日前不眠眼中多釃精橋琥珀色忘冰奚與異" indicates someone who was awake all night, observing and noting the color of a bridge made of精 (possibly a type of stone or material) and ice, perhaps reflecting on a specific moment or event.
+
+7. **Emotional and Personal Touch:
+   - "悉心之樂不禁語話諸兄以初意手" expresses deep personal joy and initial intentions, likely referring to a heartfelt message or letter from someone named "初意手."
+
+8. **Additional Notes:
+   - "錄載曰亦自作不由得清退法這法後合因人省" suggests that the text is a record of someone's own writing, perhaps a self-reflection or diary entry, mentioning a need for self-retirement or withdrawal from certain matters.
+
+9. **Numerical References:
+   - "萬人云五百年" emphasizes the vastness of time, stating five hundred years.
+
+10. **Characters Mentioned:**
+    - The text mentions "鍾戴" (Zhòng dài), likely a person or place name, who is described as not leaving at a certain time, implying they were engaged in some activity until late.
+
+11. **Visual Elements:**
+    - The text is written in traditional Chinese characters, with a formal and classical style typical of ancient literary works.
+
+This detailed description should provide a pure text model with sufficient context to answer any related questions about the image."""
+DEFAULT_TOKENS = 674
+

 def load_model():
     device = "cpu"
@@ -14,7 +60,7 @@ def load_model():

     model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
         MODEL_ID,
-        torch_dtype=dtype,
+        dtype=dtype,
         device_map="cpu",
         trust_remote_code=True,
         low_cpu_mem_usage=True,
@@ -28,7 +74,7 @@ MODEL, PROCESSOR = load_model()


 @torch.inference_mode()
-def generate_caption(image: Image.Image):
+def generate_caption(image: Image.Image, max_new_tokens: int = MAX_NEW_TOKENS):
     if image is None:
         return "", 0

@@ -61,9 +107,15 @@ def generate_caption(image: Image.Image):
         return_tensors="pt",
     ).to(device)

+    # Ensure slider value is an integer within bounds
+    try:
+        max_tokens = int(max(32, min(4096, int(max_new_tokens))))
+    except Exception:
+        max_tokens = MAX_NEW_TOKENS
+
     generated_ids = MODEL.generate(
         **inputs,
-        max_new_tokens=MAX_NEW_TOKENS,
+        max_new_tokens=max_tokens,
         do_sample=False,
     )

@@ -102,43 +154,56 @@ with gr.Blocks(title="CapRL Image Captioning (CPU)") as demo:
     )

     gr.Markdown(
-        "👉 Prefer faster inference? Try the GPU Space: "
-        "<a href=\"https://huggingface.co/spaces/yuhangzang/caprl\">yuhangzang/caprl</a>"
+        """
+        <div style="font-size: 1.2rem; font-weight: 800; color: #e67300;">
+            👉 Prefer faster inference? Try the GPU Space:
+            <a href="https://huggingface.co/spaces/yuhangzang/caprl" style="color: #e67300; text-decoration: underline; font-weight: 900;">
+                caprl (GPU Space)
+            </a>
+        </div>
+        """
     )

     with gr.Row():
         with gr.Column():
-            image_input = gr.Image(type="pil", label="Input Image")
+            image_input = gr.Image(value=DEFAULT_IMAGE_PATH, type="pil", label="Input Image")
+            max_new_tokens_slider = gr.Slider(
+                minimum=32,
+                maximum=4096,
+                step=1,
+                value=MAX_NEW_TOKENS,
+                label="Max New Tokens (32–4096)",
+            )
             generate_button = gr.Button("Generate Caption")
         with gr.Column():
-            caption_output = gr.Textbox(label="Caption", lines=6)
-            token_output = gr.Number(label="Generated Tokens", precision=0)
+            caption_output = gr.Textbox(value=DEFAULT_CAPTION, label="Caption", lines=6)
+            token_output = gr.Number(value=DEFAULT_TOKENS, label="Generated Tokens", precision=0)

     generate_button.click(
         fn=generate_caption,
-        inputs=image_input,
+        inputs=[image_input, max_new_tokens_slider],
         outputs=[caption_output, token_output],
         show_progress=True,
     )

     image_input.upload(
         fn=generate_caption,
-        inputs=image_input,
+        inputs=[image_input, max_new_tokens_slider],
         outputs=[caption_output, token_output],
         show_progress=True,
     )

     gr.Examples(
         examples=[
-            ["./examples/example_chinese.png"],
-            ["./examples/example_receipt.jpg"],
-            ["./examples/example_table.png"],
+            ["./examples/example_chinese.png", MAX_NEW_TOKENS],
+            ["./examples/example_receipt.jpg", MAX_NEW_TOKENS],
+            ["./examples/example_table.png", MAX_NEW_TOKENS],
         ],
-        inputs=image_input,
+        inputs=[image_input, max_new_tokens_slider],
         outputs=[caption_output, token_output],
         fn=generate_caption,
         cache_examples=True,
-        label="📸 Example Images"
+        label="📸 Example Images",
     )

     gr.Markdown("### Citation")