| import base64 | |
| import mimetypes | |
| import os | |
| from pathlib import Path | |
| from typing import Any, Dict, List | |
| import gradio as gr | |
| from openai import OpenAI | |
# Model name used when the caller does not choose one; the DEFAULT_MODEL
# environment variable overrides the built-in value.
DEFAULT_MODEL = os.environ.get("DEFAULT_MODEL", "LLaVA-OneVision-1.5-8B-Instruct")

# Module-wide client for the OpenAI-compatible endpoint. The endpoint and the
# credentials are read from BASE_URL / API_KEY and fall back to empty strings
# when those variables are unset.
_client = OpenAI(
    api_key=os.environ.get("API_KEY", ""),
    base_url=os.environ.get("BASE_URL", ""),
)
def _data_url(path: str) -> str:
    """Return the contents of the file at *path* as a base64 ``data:`` URL.

    The MIME type is guessed from the file name; when no type can be guessed,
    ``application/octet-stream`` is used instead.
    """
    guessed_type = mimetypes.guess_type(path)[0]
    if not guessed_type:
        guessed_type = "application/octet-stream"
    raw_bytes = Path(path).read_bytes()
    encoded = base64.b64encode(raw_bytes).decode("utf-8")
    return "data:" + guessed_type + ";base64," + encoded
def _image_content(path: str) -> Dict[str, Any]:
    """Build an ``image_url`` content part that embeds the file at *path* as a data URL."""
    part: Dict[str, Any] = {"type": "image_url"}
    part["image_url"] = {"url": _data_url(path)}
    return part
def _text_content(text: str) -> Dict[str, Any]:
    """Wrap *text* in a ``text`` content part."""
    return dict(type="text", text=text)
def _message(role: str, content: Any) -> Dict[str, Any]:
    """Return a chat message dict holding the given *role* and *content*."""
    return dict(role=role, content=content)
def _build_user_message(message: Dict[str, Any]) -> Dict[str, Any]:
    """Turn a multimodal textbox payload into a ``user`` chat message.

    Each entry under ``"files"`` becomes an image part; the stripped
    ``"text"`` value, when non-empty, is appended after the images as a text
    part. Missing or falsy ``"files"`` / ``"text"`` entries are treated as
    empty.
    """
    attachments = message.get("files") or []
    prompt = (message.get("text") or "").strip()

    parts: List[Dict[str, Any]] = []
    for attachment in attachments:
        parts.append(_image_content(attachment))
    if prompt:
        parts.append(_text_content(prompt))
    return _message("user", parts)
def _convert_history(history: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Convert chat history turns into chat-completion messages.

    Consecutive ``user`` turns are merged into a single user message: string
    content becomes a text part, and every truthy element of tuple content is
    treated as a file path and becomes an image part. User content of any
    other type, and turns with any other role, are ignored. Each
    ``assistant`` turn is forwarded with its content unchanged.

    Args:
        history: Sequence of ``{"role": ..., "content": ...}`` dicts; ``None``
            or an empty sequence yields an empty result.

    Returns:
        The converted message list, in the original order.
    """
    converted: List[Dict[str, Any]] = []
    pending: List[Dict[str, Any]] = []  # user parts collected since the last flush

    for turn in history or []:
        role, content = turn.get("role"), turn.get("content")
        if role == "user":
            if isinstance(content, str):
                pending.append(_text_content(content))
            elif isinstance(content, tuple):
                pending.extend(_image_content(path) for path in content if path)
        elif role == "assistant":
            # Only emit a user message when there is something to send; an
            # assistant turn that is not preceded by user content must not
            # produce a user message with an empty content list.
            if pending:
                converted.append(_message("user", pending))
                pending = []
            converted.append(_message("assistant", content))

    # Keep user content that was never followed by an assistant turn instead
    # of silently dropping it.
    if pending:
        converted.append(_message("user", pending))
    return converted
def stream_response(
    message: Dict[str, Any],
    history: List[Dict[str, Any]],
    model_name: str = DEFAULT_MODEL,
):
    """Stream the model's reply to *message*, given the prior *history*.

    The history is converted with ``_convert_history`` and the new message
    with ``_build_user_message``; the combined list is sent as a streaming
    chat-completion request through the module-level client.

    Args:
        message: Multimodal textbox payload (``"text"`` and ``"files"`` keys).
        history: Previous turns as ``{"role": ..., "content": ...}`` dicts.
        model_name: Model identifier passed to the completion request.

    Yields:
        The reply text accumulated so far, once per non-empty delta. If the
        request or the stream raises, a single
        ``"Failed to get response: ..."`` string is yielded instead.
    """
    messages = _convert_history(history)
    messages.append(_build_user_message(message))
    try:
        stream = _client.chat.completions.create(
            model=model_name,
            messages=messages,
            temperature=0.000001,
            top_p=1,
            extra_body={
                "repetition_penalty": 1.05,
                "frequency_penalty": 0,
                "presence_penalty": 0,
            },
            stream=True,
        )
        partial = ""
        for chunk in stream:
            # A streamed chunk may carry an empty ``choices`` list (the OpenAI
            # API documents this for the usage-only chunk). Indexing it would
            # raise IndexError and replace the reply with an error message.
            if not chunk.choices:
                continue
            delta = chunk.choices[0].delta.content
            if delta:
                partial += delta
                yield partial
    except Exception as e:
        # Top-level UI boundary: report the failure in the chat window.
        yield f"Failed to get response: {e}"
def build_demo() -> gr.Blocks:
    """Assemble the chat UI and return it with queuing enabled.

    The interface wires ``stream_response`` to a multimodal textbox and a
    chatbot, and exposes a model dropdown as an additional input.

    Returns:
        The ``gr.ChatInterface`` after ``queue(default_concurrency_limit=8)``
        has been applied.
    """
    chatbot = gr.Chatbot(type="messages", allow_tags=["think"])
    # NOTE(review): the placeholder advertises "one or more images" while
    # file_count="single" is requested -- confirm which one is intended.
    textbox = gr.MultimodalTextbox(
        show_label=False,
        placeholder="Enter text, or upload one or more images...",
        file_types=["image"],
        file_count="single",
        max_plain_text_length=32768,
    )

    model_names = [
        "LLaVA-OneVision-1.5-8B-Instruct",
        "LLaVA-OneVision-1.5-4B-Instruct",
    ]
    # DEFAULT_MODEL can be overridden through the environment. Make sure the
    # dropdown's initial value is always one of its choices, since the
    # dropdown is not configured to allow custom values.
    if DEFAULT_MODEL not in model_names:
        model_names.insert(0, DEFAULT_MODEL)
    model_selector = gr.Dropdown(
        label="Model",
        choices=[(name, name) for name in model_names],
        value=DEFAULT_MODEL,
    )

    return gr.ChatInterface(
        fn=stream_response,
        type="messages",
        multimodal=True,
        chatbot=chatbot,
        textbox=textbox,
        title="LLaVA-OneVision-1.5: Fully Open Framework for Democratized Multimodal Training",
        description=(
            "**LLaVA-OneVision1.5** introduces a novel family of fully open-source Large Multimodal Models (LMMs) that achieves state-of-the-art performance with substantially lower cost through training on native resolution images.\n"
            "🔗 **Links**: [GitHub](https://github.com/EvolvingLMMs-Lab/LLaVA-OneVision-1.5) | [HuggingFace](https://huggingface.co/lmms-lab)"
        ),
        additional_inputs=[model_selector],
        additional_inputs_accordion=gr.Accordion("Options", open=True),
    ).queue(default_concurrency_limit=8)
def main():
    """Build the chat demo and launch it with the default launch settings."""
    demo = build_demo()
    demo.launch()


# Start the app only when this file is executed as a script.
if __name__ == "__main__":
    main()