"""NasFit Vision AI: a Gradio demo that sends a food photo to
LLaVA-OneVision-1.5 behind an OpenAI-compatible endpoint and streams back an
estimate of its nutritional content."""

import base64
import mimetypes
import os
from pathlib import Path
from typing import Any, Dict, List

import gradio as gr
import requests
from openai import OpenAI


def _raw_health_check() -> None:
    """Optional smoke test against the raw HTTP endpoint (HF-style payload)."""
    headers = {"Authorization": f"Bearer {os.getenv('API_KEY', '')}"}
    payload = {"inputs": "Describe this image", "parameters": {}}
    res = requests.post(os.getenv("BASE_URL", ""), headers=headers, json=payload)
    print(res.json())


DEFAULT_MODEL = "LLaVA-OneVision-1.5-8B-Instruct"

# OpenAI-compatible client; BASE_URL and API_KEY are read from the environment.
_client = OpenAI(
    base_url=os.getenv("BASE_URL", ""),
    api_key=os.getenv("API_KEY", ""),
)
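
# Deployment sketch (an assumption; this file does not pin a server): any
# OpenAI-compatible endpoint works, e.g. vLLM:
#   export BASE_URL="http://localhost:8000/v1"
#   export API_KEY="EMPTY"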


def _data_url(path: str) -> str:
    """Encode a local image file as a base64 data URL."""
    mime, _ = mimetypes.guess_type(path)
    mime = mime or "application/octet-stream"
    data = base64.b64encode(Path(path).read_bytes()).decode("utf-8")
    return f"data:{mime};base64,{data}"


def _image_content(path: str) -> Dict[str, Any]:
    """Wrap a local image as an image_url content part."""
    return {"type": "image_url", "image_url": {"url": _data_url(path)}}


def _text_content(text: str) -> Dict[str, Any]:
    """Wrap plain text as a text content part."""
    return {"type": "text", "text": text}


def _message(role: str, content: Any) -> Dict[str, Any]:
    """Build a chat message dict."""
    return {"role": role, "content": content}
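
# Shape sketch of the content parts these helpers emit ("plate.jpg" is a
# hypothetical path):
#   _text_content("hi")         -> {"type": "text", "text": "hi"}
#   _image_content("plate.jpg") -> {"type": "image_url",
#                                   "image_url": {"url": "data:image/jpeg;base64,..."}}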


def _build_user_message(message: Dict[str, Any]) -> Dict[str, Any]:
    """Turn a MultimodalTextbox payload into a multi-part user message."""
    files = message.get("files") or []
    text = (message.get("text") or "").strip()

    # Fall back to a default prompt when the user only uploads a photo.
    if not text:
        text = (
            "Analyze the image of the food plate and describe the foods it contains. "
            "Give an estimate of the calories, protein, carbohydrates, and fat. "
            "Respond in a brief, structured format."
        )

    content: List[Dict[str, Any]] = [_image_content(p) for p in files]
    content.append(_text_content(text))
    return _message("user", content)


def _convert_history(history: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Convert Gradio "messages" history into OpenAI chat messages.

    Consecutive user items (text strings and file-path tuples) are merged
    into a single multi-part user message before each assistant reply.
    """
    msgs: List[Dict[str, Any]] = []
    user_content: List[Dict[str, Any]] = []

    for turn in history or []:
        role, content = turn.get("role"), turn.get("content")
        if role == "user":
            if isinstance(content, str):
                user_content.append(_text_content(content))
            elif isinstance(content, tuple):
                user_content.extend(_image_content(path) for path in content if path)
        elif role == "assistant":
            if user_content:
                msgs.append(_message("user", user_content.copy()))
                user_content.clear()
            msgs.append(_message("assistant", content))
    return msgs
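
# History item shapes this converter expects from Gradio (tuple entries carry
# file paths; exact shapes may vary by Gradio version):
#   {"role": "user", "content": "How many calories is this?"}
#   {"role": "user", "content": ("/tmp/gradio/plate.jpg",)}
#   {"role": "assistant", "content": "Roughly 550 kcal: ..."}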


def stream_response(message: Dict[str, Any], history: List[Dict[str, Any]], model_name: str = DEFAULT_MODEL):
    """Stream the model's reply for one chat turn, yielding the partial text."""
    messages = _convert_history(history)
    messages.append(_build_user_message(message))
    try:
        stream = _client.chat.completions.create(
            model=model_name,
            messages=messages,
            temperature=0.1,
            top_p=1,
            # Extra sampling knobs forwarded verbatim to the backend.
            extra_body={
                "repetition_penalty": 1.05,
                "frequency_penalty": 0,
                "presence_penalty": 0,
            },
            stream=True,
        )
        partial = ""
        for chunk in stream:
            delta = chunk.choices[0].delta.content
            if delta:
                partial += delta
                yield partial
    except Exception as e:
        yield f"⚠️ Error getting the response: {e}"
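
# Standalone usage sketch (assumes BASE_URL/API_KEY point at a live backend
# and that "plate.jpg" exists; both are placeholders):
#   for partial in stream_response({"text": "", "files": ["plate.jpg"]}, []):
#       print(partial)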


def build_demo() -> gr.Blocks:
    """Assemble the multimodal ChatInterface."""
    chatbot = gr.Chatbot(type="messages", allow_tags=["think"])
    textbox = gr.MultimodalTextbox(
        show_label=False,
        placeholder="Upload a photo of your food to analyze it...",
        file_types=["image"],
        file_count="single",
        max_plain_text_length=32768,
    )
    model_selector = gr.Dropdown(
        label="Model",
        choices=[
            ("LLaVA-OneVision-1.5-8B-Instruct", "LLaVA-OneVision-1.5-8B-Instruct"),
            ("LLaVA-OneVision-1.5-4B-Instruct", "LLaVA-OneVision-1.5-4B-Instruct"),
        ],
        value=DEFAULT_MODEL,
    )
    return gr.ChatInterface(
        fn=stream_response,
        type="messages",
        multimodal=True,
        chatbot=chatbot,
        textbox=textbox,
        title="🍽️ NasFit Vision AI",
        description=(
            "Upload a photo of your food and NasFit AI will estimate its nutritional content. "
            "Powered by **LLaVA-OneVision-1.5**, an open-source multimodal model with advanced visual analysis. "
            "Ideal for smart nutrition tracking."
        ),
        additional_inputs=[model_selector],
        additional_inputs_accordion=gr.Accordion("Advanced options", open=False),
    ).queue(default_concurrency_limit=8)


def main():
    build_demo().launch()


if __name__ == "__main__":
    main()
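
# Run sketch (env values are assumptions; point them at your own deployment,
# and adjust the filename if this module is not saved as app.py):
#   BASE_URL="http://localhost:8000/v1" API_KEY="EMPTY" python app.py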