Spaces:

neuralworm
/

llm_qualia

Sleeping

App Files Files Community

llm_qualia / bp_phi /llm_iface.py

neuralworm

add debug

7f0c9e6 25 days ago

raw

history blame

3.2 kB

	# bp_phi/llm_iface.py
	# Seeded loading and text generation for a Hugging Face causal language model.
	import os
	# Written to the environment before torch is imported, so the value is already
	# in place when torch loads.
	# NOTE(review): presumably needed for torch.use_deterministic_algorithms(True)
	# to work with cuBLAS on CUDA — confirm against the PyTorch reproducibility notes.
	os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"
	import torch, random, numpy as np
	from transformers import AutoModelForCausalLM, AutoTokenizer, set_seed
	from typing import List, Optional

	# Debug output is enabled only when the BP_PHI_DEBUG environment variable is
	# exactly "1"; it defaults to "0" (disabled) when unset.
	DEBUG = os.getenv("BP_PHI_DEBUG", "0") == "1"

	def dbg(*args):
	    """Print ``args`` with a ``[DEBUG:llm_iface]`` prefix when ``DEBUG`` is set.

	    Does nothing when the module-level ``DEBUG`` flag is false. Output is
	    flushed immediately.
	    """
	    if not DEBUG:
	        return
	    print("[DEBUG:llm_iface]", *args, flush=True)

	class LLM:
	    """Seeded wrapper around a Hugging Face causal language model.

	    Loads the tokenizer and model for ``model_id`` once and exposes
	    :meth:`generate_json` for producing one or more text completions from a
	    system/user prompt pair.

	    Attributes:
	        model_id: Identifier passed to ``from_pretrained``.
	        seed: Seed applied at construction and again before every generation.
	        tokenizer: Object returned by ``AutoTokenizer.from_pretrained``.
	        model: Object returned by ``AutoModelForCausalLM.from_pretrained``,
	            switched to eval mode.
	        is_instruction_tuned: ``True`` when the tokenizer offers
	            ``apply_chat_template`` and has a non-empty ``chat_template``.
	    """

	    def __init__(self, model_id: str, device: str = "auto", dtype: Optional[str] = None, seed: int = 42):
	        """Seed the random number generators, then load tokenizer and model.

	        Args:
	            model_id: Model identifier handed to ``from_pretrained``.
	            device: Forwarded as ``device_map`` when loading the model.
	            dtype: ``"float16"`` or ``"bfloat16"`` to request that torch
	                dtype; any other value (including ``None``) passes no dtype
	                and leaves the library default in place.
	            seed: Seed for ``random``, NumPy, torch and ``set_seed``.
	        """
	        self.model_id = model_id
	        self.seed = seed

	        # Set all seeds for reproducibility
	        random.seed(seed)
	        np.random.seed(seed)
	        torch.manual_seed(seed)
	        if torch.cuda.is_available():
	            torch.cuda.manual_seed_all(seed)
	        # Best effort: a failure here is only reported through dbg(), never raised.
	        try:
	            torch.use_deterministic_algorithms(True)
	        except Exception as e:
	            dbg(f"Could not set deterministic algorithms: {e}")
	        set_seed(seed)

	        token = os.environ.get("HF_TOKEN")
	        if not token and "gemma-3" in model_id:
	            print("[WARN] No HF_TOKEN set. If the model is gated (like google/gemma-3-1b-it), this will fail.")

	        self.tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True, token=token)

	        kwargs = {}
	        if dtype == "float16":
	            kwargs["torch_dtype"] = torch.float16
	        elif dtype == "bfloat16":
	            kwargs["torch_dtype"] = torch.bfloat16

	        self.model = AutoModelForCausalLM.from_pretrained(model_id, device_map=device, token=token, **kwargs)
	        self.model.eval()

	        # Store a real boolean rather than the template text itself, so the
	        # flag (and the debug line below) does not carry the whole template.
	        self.is_instruction_tuned = (
	            hasattr(self.tokenizer, "apply_chat_template")
	            and bool(getattr(self.tokenizer, "chat_template", None))
	        )

	        dbg(f"Loaded model: {model_id}, Chat-template: {self.is_instruction_tuned}")

	    def _build_prompt(self, system_prompt: str, user_prompt: str) -> str:
	        """Render the prompt text via the chat template, or a plain fallback."""
	        if self.is_instruction_tuned:
	            messages = [
	                {"role": "system", "content": system_prompt},
	                {"role": "user", "content": user_prompt},
	            ]
	            return self.tokenizer.apply_chat_template(
	                messages, tokenize=False, add_generation_prompt=True
	            )
	        return f"{system_prompt}\n\nUser:\n{user_prompt}\n\nAssistant:\n"

	    @staticmethod
	    def _sampling_kwargs(temperature: float, top_p: float) -> dict:
	        """Return the decoding-mode arguments for ``generate``.

	        Sampling parameters are included only when ``temperature > 0``;
	        otherwise sampling is switched off and they are omitted.
	        """
	        if temperature > 0:
	            return {"do_sample": True, "temperature": temperature, "top_p": top_p}
	        return {"do_sample": False}

	    def generate_json(self, system_prompt: str, user_prompt: str,
	                      max_new_tokens: int = 256, temperature: float = 0.7,
	                      top_p: float = 0.9, num_return_sequences: int = 1) -> List[str]:
	        """Generate completions for a system/user prompt pair.

	        The returned strings are the decoded model output only; this method
	        does not parse or validate them as JSON.

	        Args:
	            system_prompt: Text used as the system message (or prompt preamble
	                when the tokenizer has no chat template).
	            user_prompt: Text used as the user message.
	            max_new_tokens: Forwarded to ``model.generate``.
	            temperature: Sampling is used only when this is greater than 0.
	            top_p: Forwarded to ``model.generate`` when sampling.
	            num_return_sequences: Forwarded to ``model.generate``.

	        Returns:
	            One decoded completion per returned sequence, prompt excluded and
	            special tokens skipped.
	        """
	        set_seed(self.seed)  # Re-seed for each call for full determinism

	        prompt = self._build_prompt(system_prompt, user_prompt)

	        # Text rendered by the chat template already carries its own special
	        # tokens, so the tokenizer must not add them a second time. The plain
	        # fallback prompt keeps the tokenizer's default behaviour.
	        inputs = self.tokenizer(
	            prompt,
	            return_tensors="pt",
	            add_special_tokens=not self.is_instruction_tuned,
	        ).to(self.model.device)
	        input_token_length = inputs.input_ids.shape[1]

	        with torch.no_grad():
	            out = self.model.generate(
	                **inputs,
	                **self._sampling_kwargs(temperature, top_p),
	                max_new_tokens=max_new_tokens,
	                num_return_sequences=num_return_sequences,
	                pad_token_id=self.tokenizer.eos_token_id,
	            )

	        # Decode ONLY the newly generated tokens, not the prompt
	        new_tokens = out[:, input_token_length:]
	        completions = self.tokenizer.batch_decode(new_tokens, skip_special_tokens=True)

	        dbg("Cleaned model completions:", completions)
	        return completions