import os
import torch
import gradio as gr
from PIL import Image
from typing import List, Dict, Any
from transformers import AutoModel, AutoTokenizer

"""
Gradio app to run MiniCPM-V-4_5 int4 on CPU for image+text chat.

- Requires: pip install transformers accelerate gradio pillow
- Model: openbmb/MiniCPM-V-4_5-int4 (quantized, CPU-friendly)
- This script is self-contained and uses a simple multi-turn chat interface.
"""

MODEL_ID = os.environ.get("MINICPM_MODEL_ID", "openbmb/MiniCPM-V-4_5-int4")

# Global model/tokenizer, loaded once
model = None
tokenizer = None


def load_model():
    global model, tokenizer
    if model is not None and tokenizer is not None:
        return
    # For CPU inference, keep it simple and avoid .cuda() / bfloat16.
    # trust_remote_code is required because MiniCPM implements a custom .chat().
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
    model = AutoModel.from_pretrained(
        MODEL_ID,
        trust_remote_code=True,
        attn_implementation="sdpa",   # SDPA is fine on CPU; avoid flash-attn on CPU
        torch_dtype=torch.float32,    # Safer default for CPU
        device_map="cpu",             # Ensure CPU execution
        quantization_config=None,
    )
    model.eval()


def build_messages(history: List[List[str]], image: Image.Image, user_input: str) -> List[Dict[str, Any]]:
    """
    Convert Gradio chat history + current inputs into the message format
    expected by MiniCPM's .chat().

    history: list of (user_text, assistant_text) pairs from gr.Chatbot
             (text-only transcript of previous turns).
    image: PIL.Image or None for the current turn.
    user_input: current user text.

    Returns a msgs list with roles and content arrays [image?, text].
    """
    msgs = []
    # Reconstruct multi-turn context by interleaving user/assistant turns.
    # We assume each past user message and assistant reply is text-only;
    # only the current turn may carry an image.
    for turn in history:
        # Each turn in history is a (user_text, assistant_text) pair from gr.Chatbot
        user_text, assistant_text = turn
        if user_text is not None:
            msgs.append({"role": "user", "content": [user_text]})
        if assistant_text is not None:
            msgs.append({"role": "assistant", "content": [assistant_text]})

    # Append current user turn (with optional image)
    content = []
    if image is not None:
        # Ensure RGB
        if image.mode != "RGB":
            image = image.convert("RGB")
        content.append(image)
    if user_input and user_input.strip():
        content.append(user_input.strip())
    else:
        # Ensure there is at least something in the content
        content.append("")
    msgs.append({"role": "user", "content": content})
    return msgs
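

# For reference, a minimal sketch of the msgs structure that build_messages()
# produces and that MiniCPM's custom .chat() consumes; the image path and texts
# below are placeholders, not files or prompts shipped with this script.
#
#   from PIL import Image
#   example_msgs = [
#       {"role": "user", "content": ["Earlier text-only question."]},
#       {"role": "assistant", "content": ["Earlier text-only answer."]},
#       {"role": "user", "content": [Image.open("example.jpg").convert("RGB"),
#                                    "What is in this picture?"]},
#   ]
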

def respond(user_text: str, image: Image.Image, chat_history: List[List[str]], enable_thinking: bool):
    """
    Inference handler for Gradio. Returns the updated chat history and clears
    the user textbox.
    """
    load_model()

    # Build MiniCPM messages
    msgs = build_messages(chat_history or [], image, user_text)

    # Run model.chat
    with torch.inference_mode():
        answer = model.chat(
            msgs=msgs,
            tokenizer=tokenizer,
            enable_thinking=enable_thinking
        )

    # Update the history shown in the Chatbot: append (user_text, answer).
    # If user_text is empty but an image was provided, show a placeholder text.
    shown_user_msg = user_text.strip() if (user_text and user_text.strip()) else "[Image]"
    chat_history = chat_history + [[shown_user_msg, answer]]
    return chat_history, ""


def clear_history():
    return [], None, ""


def demo_app():
    with gr.Blocks(title="MiniCPM-V-4_5-int4 (CPU) - Gradio", theme="soft") as demo:
        gr.Markdown("## MiniCPM-V-4_5-int4 (CPU) Demo\nUpload an image (optional) and ask a question.")
        with gr.Row():
            with gr.Column(scale=3):
                # The handlers above use (user, assistant) pairs, so keep the
                # Chatbot in tuple mode rather than type="messages".
                chatbot = gr.Chatbot(height=420, type="tuples", avatar_images=(None, None))
                with gr.Row():
                    img = gr.Image(type="pil", label="Image (optional)", height=240)
                    user_in = gr.Textbox(
                        label="Your message",
                        placeholder="Ask something about the image or chat without an image...",
                        lines=3
                    )
                with gr.Row():
                    enable_thinking = gr.Checkbox(value=False, label="Enable thinking mode")
                    send_btn = gr.Button("Send", variant="primary")
                    clear_btn = gr.Button("Clear")
            with gr.Column(scale=1):
                gr.Markdown("### Model")
                gr.Markdown(f"- ID: `{MODEL_ID}`\n- Device: CPU\n- Quant: int4")

        # Events
        send_btn.click(
            fn=respond,
            inputs=[user_in, img, chatbot, enable_thinking],
            outputs=[chatbot, user_in]
        )
        user_in.submit(
            fn=respond,
            inputs=[user_in, img, chatbot, enable_thinking],
            outputs=[chatbot, user_in]
        )
        clear_btn.click(
            fn=clear_history,
            inputs=[],
            outputs=[chatbot, img, user_in]
        )
    return demo


if __name__ == "__main__":
    # Make sure we don't accidentally spawn a CUDA context
    os.environ.setdefault("CUDA_VISIBLE_DEVICES", "")
    demo = demo_app()
    demo.launch(server_name="0.0.0.0", server_port=int(os.environ.get("PORT", 7860)))
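
# Example invocation (a sketch; assumes the dependencies listed in the module
# docstring are installed and there is enough free RAM for the int4 weights):
#   pip install transformers accelerate gradio pillow
#   python app.py          # "app.py" is a placeholder name for this script
# Then open http://localhost:7860, or the port given by the PORT env var.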