Spaces:

MalcomNavarro
/

hf-gaia-agents-course-MN

Sleeping

hf-gaia-agents-course-MN / model_llama_local.py

Mahynlo

Switch to Zephyr 7B (no gating required)

c3c998f 21 days ago

10.2 kB

	"""
	Wrapper para ejecutar Llama 2 (u otros modelos de HF) LOCALMENTE en el Space.
	Usa transformers + pipeline para inferencia en CPU/GPU.
	Compatible con la clase Agent (método generate_simple).
	"""

	import os
	import time
	import torch
	from functools import lru_cache
	from typing import List, Dict, Optional
	from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

	try:
	from transformers import BitsAndBytesConfig
	BITSANDBYTES_AVAILABLE = True
	except ImportError:
	BITSANDBYTES_AVAILABLE = False
	print("⚠️ bitsandbytes no disponible, no se puede usar quantización 8-bit")


	class LocalHFModel:
	"""
	Modelo de HuggingFace cargado localmente en memoria.

	Ventajas:
	- ⚡ Más rápido (sin latencia de red)
	- 🔒 Sin rate limits
	- 💾 Control total sobre parámetros

	Desventajas:
	- 🧠 Usa RAM del Space (~7-14GB según modelo)
	- ⏳ Carga inicial lenta (30-60s)
	"""

	def __init__(
	self,
	model_id: str = "meta-llama/Llama-2-7b-chat-hf",
	max_new_tokens: int = 256,
	temperature: float = 0.0,
	device: str = "auto",
	load_in_8bit: bool = True,
	):
	"""
	Inicializa modelo local.

	Args:
	model_id: ID del modelo en HuggingFace Hub
	max_new_tokens: Tokens máximos a generar
	temperature: 0.0 = determinístico, >0 = creativo
	device: "auto", "cpu", "cuda"
	load_in_8bit: True = ~7GB RAM, False = ~14GB RAM
	"""
	self.model_id = model_id
	self.max_new_tokens = max_new_tokens
	self.temperature = temperature
	self.device = device
	self.load_in_8bit = load_in_8bit

	# HF Token (necesario para modelos gated como Llama)
	self.hf_token = os.getenv("HF_TOKEN")
	if not self.hf_token:
	raise ValueError(
	"❌ HF_TOKEN no configurado.\n"
	"Necesario para descargar modelos de HuggingFace.\n"
	"Configúralo en Settings → Repository secrets"
	)

	print(f"🦙 Cargando {model_id} localmente...")
	print(f" 📍 Device: {device}")
	print(f" 💾 8-bit quantization: {load_in_8bit}")
	print(f" 🎯 Max tokens: {max_new_tokens}")
	print(f" 🌡️ Temperature: {temperature}")

	start_time = time.time()

	# Configurar quantización
	quantization_config = None
	if load_in_8bit and BITSANDBYTES_AVAILABLE:
	try:
	quantization_config = BitsAndBytesConfig(
	load_in_8bit=True,
	llm_int8_threshold=6.0,
	)
	print(" ✅ Quantización 8-bit configurada")
	except Exception as e:
	print(f" ⚠️ Error en 8-bit, usando float16: {e}")
	load_in_8bit = False
	elif load_in_8bit and not BITSANDBYTES_AVAILABLE:
	print(" ⚠️ bitsandbytes no instalado, usando float16")
	load_in_8bit = False

	# Cargar tokenizer
	print(" 📦 Cargando tokenizer...")
	self.tokenizer = AutoTokenizer.from_pretrained(
	model_id,
	token=self.hf_token,
	trust_remote_code=True
	)

	# Configurar pad_token si no existe
	if self.tokenizer.pad_token is None:
	self.tokenizer.pad_token = self.tokenizer.eos_token

	# Cargar modelo
	print(" 🧠 Cargando modelo (30-60s)...")
	try:
	self.model = AutoModelForCausalLM.from_pretrained(
	model_id,
	token=self.hf_token,
	torch_dtype=torch.float16 if not load_in_8bit else torch.float32,
	device_map=device,
	quantization_config=quantization_config,
	low_cpu_mem_usage=True, # Importante para 16GB RAM
	trust_remote_code=True
	)
	except Exception as e:
	print(f" ❌ Error cargando modelo: {e}")
	print(" ℹ️ Verifica que HF_TOKEN tenga acceso al modelo")
	raise

	# Crear pipeline
	self.pipe = pipeline(
	"text-generation",
	model=self.model,
	tokenizer=self.tokenizer,
	max_new_tokens=max_new_tokens,
	temperature=temperature if temperature > 0 else 0.01, # 0.0 causa problemas
	do_sample=temperature > 0,
	top_p=0.95 if temperature > 0 else 1.0,
	repetition_penalty=1.15,
	return_full_text=False, # Solo nueva generación
	)

	load_time = time.time() - start_time
	print(f" ✅ Modelo cargado en {load_time:.1f}s")

	# Info de memoria
	if torch.cuda.is_available():
	mem_allocated = torch.cuda.memory_allocated() / 1024**3
	print(f" 📊 GPU Memory: {mem_allocated:.2f} GB")
	else:
	print(" 📊 Running on CPU")

	def _format_llama_prompt(self, messages: List[Dict[str, str]]) -> str:
	"""
	Formatea mensajes al formato correcto según el modelo.

	Soporta:
	- Llama 2: <s>[INST] <<SYS>>...
	- Zephyr: <\|system\|>...<\|user\|>...<\|assistant\|>
	"""
	system_msg = ""
	user_msg = ""

	for msg in messages:
	role = msg.get("role", "user")
	content = msg.get("content", "")

	if role == "system":
	system_msg = content
	elif role == "user":
	user_msg = content

	# Detectar formato según model_id
	if "zephyr" in self.model_id.lower():
	# Formato Zephyr
	if system_msg:
	prompt = f"<\|system\|>\n{system_msg}</s>\n<\|user\|>\n{user_msg}</s>\n<\|assistant\|>\n"
	else:
	prompt = f"<\|user\|>\n{user_msg}</s>\n<\|assistant\|>\n"
	else:
	# Formato Llama 2 (default)
	if system_msg:
	prompt = f"<s>[INST] <<SYS>>\n{system_msg}\n<</SYS>>\n\n{user_msg} [/INST]"
	else:
	prompt = f"<s>[INST] {user_msg} [/INST]"

	return prompt

	def __call__(
	self,
	messages: List[Dict[str, str]],
	max_new_tokens: Optional[int] = None,
	temperature: Optional[float] = None,
	**kwargs
	) -> str:
	"""
	Genera respuesta.

	Args:
	messages: [{"role": "user", "content": "..."}]
	max_new_tokens: Override de tokens
	temperature: Override de temperatura

	Returns:
	Texto generado
	"""
	try:
	# Formatear prompt
	prompt = self._format_llama_prompt(messages)

	# Override parámetros
	gen_kwargs = {}
	if max_new_tokens:
	gen_kwargs["max_new_tokens"] = max_new_tokens
	if temperature is not None:
	gen_kwargs["temperature"] = temperature if temperature > 0 else 0.01
	gen_kwargs["do_sample"] = temperature > 0

	# Generar
	start_time = time.time()
	result = self.pipe(prompt, **gen_kwargs)
	gen_time = time.time() - start_time

	# Extraer texto
	if isinstance(result, list) and len(result) > 0:
	generated_text = result[0].get("generated_text", "")
	else:
	generated_text = str(result)

	print(f" ⚡ Generado en {gen_time:.2f}s ({len(generated_text)} chars)")

	return generated_text.strip()

	except Exception as e:
	error_msg = f"ERROR: {str(e)}"
	print(f" ❌ {error_msg}")
	return error_msg

	def generate_simple(
	self,
	prompt: str,
	system: Optional[str] = None,
	**kwargs
	) -> str:
	"""
	Interfaz simplificada compatible con Agent.

	Args:
	prompt: Texto del usuario
	system: Prompt de sistema (opcional)

	Returns:
	Respuesta generada
	"""
	messages = []
	if system:
	messages.append({"role": "system", "content": system})
	messages.append({"role": "user", "content": prompt})

	return self(messages, **kwargs)


	@lru_cache(maxsize=1) # Solo 1 modelo en cache (usa mucha RAM)
	def get_local_model(
	model_id: str = "meta-llama/Llama-2-7b-chat-hf",
	load_in_8bit: bool = True,
	max_new_tokens: int = 256,
	temperature: float = 0.0,
	) -> LocalHFModel:
	"""
	Factory con cache para modelo local.

	IMPORTANTE: maxsize=1 porque cada modelo usa ~7-14GB RAM.
	"""
	return LocalHFModel(
	model_id=model_id,
	load_in_8bit=load_in_8bit,
	max_new_tokens=max_new_tokens,
	temperature=temperature
	)


	# Alias para compatibilidad con app.py
	def get_model(model_id: str = "meta-llama/Llama-2-7b-chat-hf", **kwargs) -> LocalHFModel:
	"""
	Factory principal para obtener modelo local.

	Args:
	model_id: Modelo de HuggingFace
	**kwargs: Parámetros adicionales (load_in_8bit, max_new_tokens, etc.)

	Returns:
	LocalHFModel listo para usar
	"""
	# Obtener parámetros con defaults
	load_in_8bit = kwargs.pop("load_in_8bit", True)
	max_new_tokens = kwargs.pop("max_new_tokens", 256)
	temperature = kwargs.pop("temperature", 0.0)

	return get_local_model(
	model_id=model_id,
	load_in_8bit=load_in_8bit,
	max_new_tokens=max_new_tokens,
	temperature=temperature
	)


	if __name__ == "__main__":
	# Test
	print("=== Test de Modelo Local ===")

	model = get_model(load_in_8bit=True)

	response = model.generate_simple(
	"What is 2+2?",
	system="You are a helpful math assistant."
	)

	print(f"\n📝 Respuesta: {response}")