import gradio as gr
import torch
from PIL import Image
from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration

MODEL_ID = "internlm/CapRL-3B"
DEFAULT_PROMPT = "Describe the image in detail."
MAX_NEW_TOKENS = 4096

# Defaults for UI
DEFAULT_IMAGE_PATH = "./examples/1909.png"
| DEFAULT_CAPTION = """The image is a bar chart from the Pew Research Center that illustrates how older Republicans and Republican leaners view Donald Trump, specifically focusing on how many describe the phrase "fights for what I believe in" to describe Trump. The data is based on a survey conducted from February 4-15, 2020, among U.S. adults who identify as Republicans or Republican-leaning independents. | |
| ### Title: | |
| Older Republicans especially likely to see Trump as fighting for their beliefs | |
| ### Main Question: | |
| Among Republicans and Republican leaners, % who say the phrase 'fights for what I believe in' describes Trump ... | |
| ### Data Breakdown: | |
| 1. **All Rep/Lean Rep (Overall):** | |
| - Very well: 51% | |
| - Fairly well: 36% | |
| - NET: 87% | |
| 2. **Ages 18-29:** | |
| - Very well: 31% | |
| - Fairly well: 45% | |
| - NET: 76% | |
| 3. **30-49:** | |
| - Very well: 41% | |
| - Fairly well: 42% | |
| - NET: 82% | |
| 4. **50-64:** | |
| - Very well: 58% | |
| - Fairly well: 33% | |
| - NET: 92% | |
| 5. **65+:** | |
| - Very well: 68% | |
| - Fairly well: 26% | |
| - NET: 94% | |
| 6. **Postgrad:** | |
| - Very well: 42% | |
| - Fairly well: 38% | |
| - NET: 80% | |
| 7. **College grad:** | |
| - Very well: 45% | |
| - Fairly well: 40% | |
| - NET: 85% | |
| 8. **Some college:** | |
| - Very well: 51% | |
| - Fairly well: 36% | |
| - NET: 87% | |
| 9. **HS or less:** | |
| - Very well: 56% | |
| - Fairly well: 33% | |
| - NET: 89 | |
| 10. **Conserv (Conservative):** | |
| - Very well: 63% | |
| - Fairly well: 31% | |
| - NET: 94% | |
| 11. **Mod/Lib (Moderate/Liberal):** | |
| - Very well: 32% | |
| - Fairly well: 44% | |
| - NET: 75 | |
| 12. **Republican:** | |
| - Very well: 61% | |
| - Fairly well: 32% | |
| - NET: 93 | |
| 13. **Lean Republican:** | |
| - Very well: 36% | |
| - Fairly well: 41% | |
| - NET: 77 | |
| ### Notes: | |
| - The note at the bottom states that the data is based on Republicans and Republican-leaning independents. | |
| - The source is a survey of U.S. adults conducted from February 4-15, 2020. | |
| ### Key Observations: | |
| 1. Older Republicans (65+) are the most likely to see Trump as someone who "fights for what I believe in," with a net positive percentage of 94. | |
| 2. Younger age groups (18-29) have the lowest net positive percentage at 76. | |
| 3. Those with higher educational backgrounds (postgrad and college grad) have slightly lower net positive percentages compared to those with some college education (80 vs. 85). | |
| 4. Conservatives (63% very well) are the most likely to see Trump this way, followed by Republicans (61%). | |
| 5. Lean Republicans (36% very well) have the lowest percentage among the leaner categories. | |
| This detailed description should provide a pure text model with sufficient information to answer any related questions about the image.""" | |
| DEFAULT_TOKENS = 826 | |
def load_model():
    # CPU-only Space: load in float32, since half precision is poorly supported on CPU.
    model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
        MODEL_ID,
        dtype=torch.float32,
        device_map="cpu",
        trust_remote_code=True,
        low_cpu_mem_usage=True,
    )
    processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
    return model, processor

MODEL, PROCESSOR = load_model()
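# This Space deliberately pins CPU/float32. On a CUDA machine a faster load
# might look like the untested sketch below; "auto" device placement and
# bfloat16 are standard transformers options, not something this app uses:
#
#   model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
#       MODEL_ID, dtype=torch.bfloat16, device_map="auto", trust_remote_code=True
#   )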
def generate_caption(image: Image.Image, max_new_tokens: int = MAX_NEW_TOKENS):
    if image is None:
        return "", 0
    try:
        if not isinstance(image, Image.Image):
            return "Error: Invalid image format", 0

        # Downscale very large images in place to keep CPU inference tractable.
        max_size = 4096
        if image.width > max_size or image.height > max_size:
            image.thumbnail((max_size, max_size), Image.Resampling.LANCZOS)

        device = MODEL.device
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "image"},
                    {"type": "text", "text": DEFAULT_PROMPT},
                ],
            }
        ]
        prompt_text = PROCESSOR.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
        inputs = PROCESSOR(
            text=[prompt_text],
            images=[image],
            return_tensors="pt",
        ).to(device)

        # Ensure the slider value is an integer within bounds.
        try:
            max_tokens = max(32, min(MAX_NEW_TOKENS, int(max_new_tokens)))
        except (TypeError, ValueError):
            max_tokens = MAX_NEW_TOKENS

        generated_ids = MODEL.generate(
            **inputs,
            max_new_tokens=max_tokens,
            do_sample=False,
        )

        # Strip the prompt tokens so only the newly generated caption is decoded.
        generated_ids_trimmed = [
            out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
        ]
        output_text = PROCESSOR.batch_decode(
            generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
        )
        caption = output_text[0].strip()

        # Report how many new tokens were generated (total minus prompt length).
        input_ids = inputs.get("input_ids")
        input_length = input_ids.shape[-1] if input_ids is not None else 0
        total_length = generated_ids.shape[-1]
        num_generated_tokens = max(total_length - input_length, 0)
        return caption, int(num_generated_tokens)
    except RuntimeError as e:
        return f"Runtime error: {e}", 0
    except Exception as e:
        return f"Error generating caption: {e}", 0
with gr.Blocks(title="CapRL Image Captioning (CPU)") as demo:
    gr.Markdown("# 🎨 CapRL for Image Captioning (CPU)")
    gr.Markdown("### CapRL: Stimulating Dense Image Caption Capabilities via Reinforcement Learning")
    gr.Markdown("✨ Upload an image to generate a detailed caption with CapRL-3B (CPU-only)! ✨")
    gr.Markdown(
        """
        📄 <a href="https://arxiv.org/abs/2509.22647">Paper</a> | 🐙 <a href="https://github.com/InternLM/CapRL">Github</a> | 🤗 <a href="https://huggingface.co/internlm/CapRL-3B">CapRL-3B Model</a> | 🤗 <a href="https://huggingface.co/yuhangzang/CapRL-InternVL3.5-8B">CapRL-InternVL3.5-8B Model</a> |
        🤗 <a href="https://huggingface.co/datasets/internlm/CapRL-2M">CapRL-2M Dataset</a>
        🤗 <a href="https://huggingface.co/collections/long-xing1/caprl-68d64ac32ded31596c36e189">CapRL Collection</a> | 📰 <a href="https://huggingface.co/papers/2509.22647">Daily Paper</a> | 💾 <a href="https://huggingface.co/mradermacher/CapRL-3B-GGUF">CapRL-3B-GGUF</a> | 💾 <a href="https://huggingface.co/mradermacher/CapRL-3B-i1-GGUF">CapRL-3B-i1-GGUF</a>
        """
    )
    gr.Markdown(
        """
        <div style="font-size: 1.2rem; font-weight: 800; color: #e67300;">
            🚀 Prefer faster inference? Try the GPU Space:
            <a href="https://huggingface.co/spaces/yuhangzang/caprl" style="color: #e67300; text-decoration: underline; font-weight: 900;">
                caprl (GPU Space)
            </a>
        </div>
        """
    )
    with gr.Row():
        with gr.Column():
            image_input = gr.Image(value=DEFAULT_IMAGE_PATH, type="pil", label="Input Image")
            max_new_tokens_slider = gr.Slider(
                minimum=32,
                maximum=4096,
                step=1,
                value=MAX_NEW_TOKENS,
                label="Max New Tokens (32–4096)",
            )
            generate_button = gr.Button("Generate Caption")
        with gr.Column():
            caption_output = gr.Textbox(value=DEFAULT_CAPTION, label="Caption", lines=6)
            token_output = gr.Number(value=DEFAULT_TOKENS, label="Generated Tokens", precision=0)

    generate_button.click(
        fn=generate_caption,
        inputs=[image_input, max_new_tokens_slider],
        outputs=[caption_output, token_output],
        show_progress=True,
    )
    image_input.upload(
        fn=generate_caption,
        inputs=[image_input, max_new_tokens_slider],
        outputs=[caption_output, token_output],
        show_progress=True,
    )
    gr.Examples(
        examples=[
            ["./examples/1909.png", MAX_NEW_TOKENS],
            ["./examples/44687.jpeg", MAX_NEW_TOKENS],
            ["./examples/natural.png", MAX_NEW_TOKENS],
        ],
        inputs=[image_input, max_new_tokens_slider],
        outputs=[caption_output, token_output],
        fn=generate_caption,
        cache_examples=True,
        label="📸 Example Images",
    )
| gr.Markdown("### Citation") | |
| gr.Markdown("If you find this project useful, please kindly cite:") | |
| citation_text = """@article{xing2025caprl, | |
| title={{CapRL}: Stimulating Dense Image Caption Capabilities via Reinforcement Learning}, | |
| author={Xing, Long and Dong, Xiaoyi and Zang, Yuhang and Cao, Yuhang and Liang, Jianze and Huang, Qidong and Wang, Jiaqi and Wu, Feng and Lin, Dahua}, | |
| journal={arXiv preprint arXiv:2509.22647}, | |
| year={2025} | |
| }""" | |
| gr.Code(value=citation_text, language="markdown", label="BibTeX Citation") | |
| demo.launch() | |
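# On Spaces, one might enable request queuing so that long CPU generations do
# not time out. A minimal sketch (demo.queue is standard Gradio API; the
# max_size value here is an arbitrary assumption):
#
#   demo.queue(max_size=8).launch()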