from fastapi import FastAPI
from fastapi.responses import StreamingResponse
from pydantic import BaseModel
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
import torch
import threading

app = FastAPI()
# Load the SmolLM2-135M model and tokenizer from the Hugging Face Hub
model_id = "HuggingFaceTB/SmolLM2-135M"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id)
# Request body schema
class ChatRequest(BaseModel):
    message: str
@app.post("/chat")  # path "/chat" assumed; the listing did not show the route
async def chat_stream(request: ChatRequest):
    # The prompt instructs the model to answer in Spanish, clearly and briefly
    prompt = f"""Responde en español de forma clara y breve como un asistente IA.
Usuario: {request.message}
IA:"""
    # Tokenize the input
    inputs = tokenizer(prompt, return_tensors="pt")
    input_ids = inputs["input_ids"]
    attention_mask = inputs["attention_mask"]
    # Streamer that yields generated text piece by piece;
    # skip_prompt=True keeps the prompt from being echoed back to the client
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    # Run generation in a background thread so tokens can be streamed as they arrive
    generation_kwargs = dict(
        input_ids=input_ids,
        attention_mask=attention_mask,
        max_new_tokens=48,  # adjust this value for longer or shorter replies
        temperature=0.7,
        top_p=0.9,
        do_sample=True,
        streamer=streamer,
        pad_token_id=tokenizer.eos_token_id,
    )
    thread = threading.Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()

    # StreamingResponse expects a generator that yields text
    # (note: iterating the streamer is blocking, which is fine for a single-user demo)
    async def event_generator():
        for new_text in streamer:
            yield new_text

    return StreamingResponse(event_generator(), media_type="text/plain")
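
For reference, a minimal client sketch that consumes the stream, meant to be run as a separate script. It assumes the app is served locally on port 7860 (the usual Hugging Face Spaces port) and that the route above is mounted at /chat; it uses the requests library in streaming mode.

import requests

# Hypothetical local URL; adjust host/port to wherever the app is actually served
url = "http://localhost:7860/chat"

with requests.post(url, json={"message": "¿Qué es FastAPI?"}, stream=True) as resp:
    resp.raise_for_status()
    # Print each chunk of text as soon as the server flushes it
    for chunk in resp.iter_content(chunk_size=None, decode_unicode=True):
        print(chunk, end="", flush=True)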