import gradio as gr
import torch
from functools import lru_cache
from transformers import AutoModelForCausalLM, AutoTokenizer, TextGenerationPipeline

MODEL_NAME = "openbmb/MiniCPM-V-4"


@lru_cache(maxsize=1)
def load_pipeline():
    """Load the tokenizer and model once, then cache the pipeline."""
    # trust_remote_code is required because this checkpoint ships custom model code.
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        torch_dtype=torch.float16,
        device_map="auto",
        trust_remote_code=True,
    )
    # With device_map="auto", accelerate has already placed the model, so the
    # pipeline must not be given an explicit `device` (doing so raises an error).
    return TextGenerationPipeline(model=model, tokenizer=tokenizer)


def respond(user_message, chat_history):
    # Merge the previous turns and the new message into a single prompt.
    # The Chinese role markers 用户 ("user") and 助理 ("assistant") are kept
    # as-is because they double as the model's prompt delimiters below.
    history_text = ""
    for user_turn, assistant_turn in chat_history:
        history_text += f"用户:{user_turn}\n助理:{assistant_turn}\n"
    prompt = history_text + f"用户:{user_message}\n助理:"

    pipeline = load_pipeline()
    output = pipeline(
        prompt,
        max_new_tokens=256,      # bound the reply length in tokens
        do_sample=True,
        top_k=50,
        top_p=0.95,
        num_return_sequences=1,
        return_full_text=False,  # return only the completion, not the prompt
    )[0]["generated_text"]

    # Cut the reply off if the model starts generating the next user turn.
    assistant_reply = output.split("用户:", 1)[0].strip()
    chat_history.append((user_message, assistant_reply))
    # Return an empty string to clear the textbox along with the updated history.
    return "", chat_history


with gr.Blocks() as demo:
    gr.Markdown("## MiniCPM-V-4 Chatbot Demo")
    chatbot = gr.Chatbot(label="Conversation")
    user_input = gr.Textbox(placeholder="Type a message...", lines=1)
    clear_btn = gr.Button("Clear")

    user_input.submit(
        fn=respond,
        inputs=[user_input, chatbot],
        outputs=[user_input, chatbot],
    )
    clear_btn.click(lambda: [], None, chatbot)

demo.launch()
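

# --- Optional smoke test (a hypothetical helper, not part of the original demo) ---
# Assumes the checkpoint fits in available GPU memory and that the first call
# may take a while to download weights. Call _smoke_test() manually (with
# demo.launch() above commented out) to verify generation without starting the UI.
def _smoke_test():
    history = []
    _, history = respond("你好，请介绍一下你自己。", history)
    print(history[-1][1])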