yuhangzang
committed on
Commit
·
74cb695
1
Parent(s):
583c33f
update
Browse files
README.md
CHANGED
|
@@ -1,6 +1,6 @@
|
|
| 1 |
---
|
| 2 |
title: CapRL
|
| 3 |
-
emoji:
|
| 4 |
colorFrom: purple
|
| 5 |
colorTo: green
|
| 6 |
sdk: gradio
|
|
@@ -8,7 +8,7 @@ sdk_version: 5.49.1
|
|
| 8 |
app_file: app.py
|
| 9 |
pinned: false
|
| 10 |
license: apache-2.0
|
| 11 |
-
short_description: Generate captions for images with CapRL
|
| 12 |
---
|
| 13 |
|
| 14 |
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
|
|
|
| 1 |
---
|
| 2 |
title: CapRL
|
| 3 |
+
emoji: 🚀
|
| 4 |
colorFrom: purple
|
| 5 |
colorTo: green
|
| 6 |
sdk: gradio
|
|
|
|
| 8 |
app_file: app.py
|
| 9 |
pinned: false
|
| 10 |
license: apache-2.0
|
| 11 |
+
short_description: Generate captions for images with CapRL
|
| 12 |
---
|
| 13 |
|
| 14 |
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
app.py
CHANGED
|
@@ -7,6 +7,52 @@ MODEL_ID = "internlm/CapRL-3B"
|
|
| 7 |
DEFAULT_PROMPT = "Describe the image in detail."
|
| 8 |
MAX_NEW_TOKENS = 4096
|
| 9 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10 |
|
| 11 |
def load_model():
|
| 12 |
device = "cpu"
|
|
@@ -14,7 +60,7 @@ def load_model():
|
|
| 14 |
|
| 15 |
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
|
| 16 |
MODEL_ID,
|
| 17 |
-
|
| 18 |
device_map="cpu",
|
| 19 |
trust_remote_code=True,
|
| 20 |
low_cpu_mem_usage=True,
|
|
@@ -28,7 +74,7 @@ MODEL, PROCESSOR = load_model()
|
|
| 28 |
|
| 29 |
|
| 30 |
@torch.inference_mode()
|
| 31 |
-
def generate_caption(image: Image.Image):
|
| 32 |
if image is None:
|
| 33 |
return "", 0
|
| 34 |
|
|
@@ -61,9 +107,15 @@ def generate_caption(image: Image.Image):
|
|
| 61 |
return_tensors="pt",
|
| 62 |
).to(device)
|
| 63 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 64 |
generated_ids = MODEL.generate(
|
| 65 |
**inputs,
|
| 66 |
-
max_new_tokens=
|
| 67 |
do_sample=False,
|
| 68 |
)
|
| 69 |
|
|
@@ -102,43 +154,56 @@ with gr.Blocks(title="CapRL Image Captioning (CPU)") as demo:
|
|
| 102 |
)
|
| 103 |
|
| 104 |
gr.Markdown(
|
| 105 |
-
"
|
| 106 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 107 |
)
|
| 108 |
|
| 109 |
with gr.Row():
|
| 110 |
with gr.Column():
|
| 111 |
-
image_input = gr.Image(type="pil", label="Input Image")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 112 |
generate_button = gr.Button("Generate Caption")
|
| 113 |
with gr.Column():
|
| 114 |
-
caption_output = gr.Textbox(label="Caption", lines=6)
|
| 115 |
-
token_output = gr.Number(label="Generated Tokens", precision=0)
|
| 116 |
|
| 117 |
generate_button.click(
|
| 118 |
fn=generate_caption,
|
| 119 |
-
inputs=image_input,
|
| 120 |
outputs=[caption_output, token_output],
|
| 121 |
show_progress=True,
|
| 122 |
)
|
| 123 |
|
| 124 |
image_input.upload(
|
| 125 |
fn=generate_caption,
|
| 126 |
-
inputs=image_input,
|
| 127 |
outputs=[caption_output, token_output],
|
| 128 |
show_progress=True,
|
| 129 |
)
|
| 130 |
|
| 131 |
gr.Examples(
|
| 132 |
examples=[
|
| 133 |
-
["./examples/example_chinese.png"],
|
| 134 |
-
["./examples/example_receipt.jpg"],
|
| 135 |
-
["./examples/example_table.png"],
|
| 136 |
],
|
| 137 |
-
inputs=image_input,
|
| 138 |
outputs=[caption_output, token_output],
|
| 139 |
fn=generate_caption,
|
| 140 |
cache_examples=True,
|
| 141 |
-
label="📸 Example Images"
|
| 142 |
)
|
| 143 |
|
| 144 |
gr.Markdown("### Citation")
|
|
|
|
| 7 |
DEFAULT_PROMPT = "Describe the image in detail."
|
| 8 |
MAX_NEW_TOKENS = 4096
|
| 9 |
|
| 10 |
+
# Defaults for UI
|
| 11 |
+
DEFAULT_IMAGE_PATH = "./examples/example_chinese.png"
|
| 12 |
+
DEFAULT_CAPTION = """The image depicts a text excerpt, likely from an ancient or classical Chinese document, written in traditional script. Here is a detailed description:
|
| 13 |
+
|
| 14 |
+
---
|
| 15 |
+
|
| 16 |
+
**Image Description:**
|
| 17 |
+
|
| 18 |
+
The image shows a page from what appears to be a historical or literary text, specifically related to ancient Chinese poetry or prose. The text is titled "序" (preface), indicating it's an introductory section. The content is structured as follows:
|
| 19 |
+
|
| 20 |
+
1. **Header and Title:**
|
| 21 |
+
- The top line reads "古人云五百年萬六千日何況人生不閨雅佳" which translates to "Ancient people say that even five hundred years and six thousand days are nothing compared to life, not to mention being unreservedly beautiful."
|
| 22 |
+
|
| 23 |
+
2. **Main Text Content:**
|
| 24 |
+
- The passage begins with "詎醒世之人萬六千日謂生人一事無成" suggesting a reflection on the fleeting nature of life over a vast period.
|
| 25 |
+
|
| 26 |
+
3. **Key Phrases and Names:
|
| 27 |
+
- "過道還書身易張日夕思慕最凡在秋神冷" refers to someone who frequently thinks about books and autumn, possibly a scholar or writer.
|
| 28 |
+
|
| 29 |
+
4. **Detailed Descriptions:
|
| 30 |
+
- "道士六魌急力問夢至春田處於筆頭翻得詩" describes a道士在六个地狱中匆忙询问梦境,最终在春天的田野处从笔下翻出诗句。 This indicates a dream or mystical experience leading to poetic inspiration.
|
| 31 |
+
|
| 32 |
+
5. **Cultural References:
|
| 33 |
+
- "曲數本其間四時風景幽蘭處蜘蛛繚繞之思應如知" mentions counting the four seasons and the beauty of幽兰,with spiders weaving around, suggesting a vivid, natural scene.
|
| 34 |
+
|
| 35 |
+
6. **Time and Weather:
|
| 36 |
+
- "在日前不眠眼中多釃精橋琥珀色忘冰奚與異" indicates someone who was awake all night, observing and noting the color of a bridge made of精 (possibly a type of stone or material) and ice, perhaps reflecting on a specific moment or event.
|
| 37 |
+
|
| 38 |
+
7. **Emotional and Personal Touch:
|
| 39 |
+
- "悉心之樂不禁語話諸兄以初意手" expresses deep personal joy and initial intentions, likely referring to a heartfelt message or letter from someone named "初意手."
|
| 40 |
+
|
| 41 |
+
8. **Additional Notes:
|
| 42 |
+
- "錄載曰亦自作不由得清退法這法後合因人省" suggests that the text is a record of someone's own writing, perhaps a self-reflection or diary entry, mentioning a need for self-retirement or withdrawal from certain matters.
|
| 43 |
+
|
| 44 |
+
9. **Numerical References:
|
| 45 |
+
- "萬人云五百年" emphasizes the vastness of time, stating five hundred years.
|
| 46 |
+
|
| 47 |
+
10. **Characters Mentioned:**
|
| 48 |
+
- The text mentions "鍾戴" (Zhòng dài), likely a person or place name, who is described as not leaving at a certain time, implying they were engaged in some activity until late.
|
| 49 |
+
|
| 50 |
+
11. **Visual Elements:**
|
| 51 |
+
- The text is written in traditional Chinese characters, with a formal and classical style typical of ancient literary works.
|
| 52 |
+
|
| 53 |
+
This detailed description should provide a pure text model with sufficient context to answer any related questions about the image."""
|
| 54 |
+
DEFAULT_TOKENS = 674
|
| 55 |
+
|
| 56 |
|
| 57 |
def load_model():
|
| 58 |
device = "cpu"
|
|
|
|
| 60 |
|
| 61 |
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
|
| 62 |
MODEL_ID,
|
| 63 |
+
dtype=dtype,
|
| 64 |
device_map="cpu",
|
| 65 |
trust_remote_code=True,
|
| 66 |
low_cpu_mem_usage=True,
|
|
|
|
| 74 |
|
| 75 |
|
| 76 |
@torch.inference_mode()
|
| 77 |
+
def generate_caption(image: Image.Image, max_new_tokens: int = MAX_NEW_TOKENS):
|
| 78 |
if image is None:
|
| 79 |
return "", 0
|
| 80 |
|
|
|
|
| 107 |
return_tensors="pt",
|
| 108 |
).to(device)
|
| 109 |
|
| 110 |
+
# Ensure slider value is an integer within bounds
|
| 111 |
+
try:
|
| 112 |
+
max_tokens = int(max(32, min(4096, int(max_new_tokens))))
|
| 113 |
+
except Exception:
|
| 114 |
+
max_tokens = MAX_NEW_TOKENS
|
| 115 |
+
|
| 116 |
generated_ids = MODEL.generate(
|
| 117 |
**inputs,
|
| 118 |
+
max_new_tokens=max_tokens,
|
| 119 |
do_sample=False,
|
| 120 |
)
|
| 121 |
|
|
|
|
| 154 |
)
|
| 155 |
|
| 156 |
gr.Markdown(
|
| 157 |
+
"""
|
| 158 |
+
<div style="font-size: 1.2rem; font-weight: 800; color: #e67300;">
|
| 159 |
+
👉 Prefer faster inference? Try the GPU Space:
|
| 160 |
+
<a href="https://huggingface.co/spaces/yuhangzang/caprl" style="color: #e67300; text-decoration: underline; font-weight: 900;">
|
| 161 |
+
caprl (GPU Space)
|
| 162 |
+
</a>
|
| 163 |
+
</div>
|
| 164 |
+
"""
|
| 165 |
)
|
| 166 |
|
| 167 |
with gr.Row():
|
| 168 |
with gr.Column():
|
| 169 |
+
image_input = gr.Image(value=DEFAULT_IMAGE_PATH, type="pil", label="Input Image")
|
| 170 |
+
max_new_tokens_slider = gr.Slider(
|
| 171 |
+
minimum=32,
|
| 172 |
+
maximum=4096,
|
| 173 |
+
step=1,
|
| 174 |
+
value=MAX_NEW_TOKENS,
|
| 175 |
+
label="Max New Tokens (32–4096)",
|
| 176 |
+
)
|
| 177 |
generate_button = gr.Button("Generate Caption")
|
| 178 |
with gr.Column():
|
| 179 |
+
caption_output = gr.Textbox(value=DEFAULT_CAPTION, label="Caption", lines=6)
|
| 180 |
+
token_output = gr.Number(value=DEFAULT_TOKENS, label="Generated Tokens", precision=0)
|
| 181 |
|
| 182 |
generate_button.click(
|
| 183 |
fn=generate_caption,
|
| 184 |
+
inputs=[image_input, max_new_tokens_slider],
|
| 185 |
outputs=[caption_output, token_output],
|
| 186 |
show_progress=True,
|
| 187 |
)
|
| 188 |
|
| 189 |
image_input.upload(
|
| 190 |
fn=generate_caption,
|
| 191 |
+
inputs=[image_input, max_new_tokens_slider],
|
| 192 |
outputs=[caption_output, token_output],
|
| 193 |
show_progress=True,
|
| 194 |
)
|
| 195 |
|
| 196 |
gr.Examples(
|
| 197 |
examples=[
|
| 198 |
+
["./examples/example_chinese.png", MAX_NEW_TOKENS],
|
| 199 |
+
["./examples/example_receipt.jpg", MAX_NEW_TOKENS],
|
| 200 |
+
["./examples/example_table.png", MAX_NEW_TOKENS],
|
| 201 |
],
|
| 202 |
+
inputs=[image_input, max_new_tokens_slider],
|
| 203 |
outputs=[caption_output, token_output],
|
| 204 |
fn=generate_caption,
|
| 205 |
cache_examples=True,
|
| 206 |
+
label="📸 Example Images",
|
| 207 |
)
|
| 208 |
|
| 209 |
gr.Markdown("### Citation")
|