# bp_phi/llm_iface.py
import os

# Must be set before torch initializes CUDA so cuBLAS can run deterministically.
os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"

import torch, random, numpy as np
from transformers import AutoModelForCausalLM, AutoTokenizer, set_seed
from typing import List, Optional

DEBUG = os.getenv("BP_PHI_DEBUG", "0") == "1"

def dbg(*args):
    if DEBUG:
        print("[DEBUG:llm_iface]", *args, flush=True)
class LLM:
    """Thin wrapper around a Hugging Face causal LM with reproducible, seeded generation."""

    def __init__(self, model_id: str, device: str = "auto", dtype: Optional[str] = None, seed: int = 42):
        self.model_id = model_id
        self.seed = seed

        # Seed every RNG we rely on so runs are reproducible.
        random.seed(seed)
        np.random.seed(seed)
        torch.manual_seed(seed)
        if torch.cuda.is_available():
            torch.cuda.manual_seed_all(seed)
        try:
            torch.use_deterministic_algorithms(True, warn_only=True)
        except Exception as e:
            dbg(f"Could not set deterministic algorithms: {e}")
        set_seed(seed)

        token = os.environ.get("HF_TOKEN")
        if not token and ("gemma-3" in model_id or "llama" in model_id):
            print(f"[WARN] No HF_TOKEN set for gated model {model_id}. This may fail.")

        self.tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True, token=token)

        kwargs = {}
        if dtype == "float16":
            kwargs["torch_dtype"] = torch.float16
        elif dtype == "bfloat16":
            kwargs["torch_dtype"] = torch.bfloat16
        self.model = AutoModelForCausalLM.from_pretrained(model_id, device_map=device, token=token, **kwargs)
        self.model.eval()

        # Instruction-tuned checkpoints ship a chat template; otherwise we fall back to a plain prompt.
        self.is_instruction_tuned = bool(getattr(self.tokenizer, "chat_template", None))
        dbg(f"Loaded model: {model_id}, Chat-template: {self.is_instruction_tuned}")
    def generate_json(self, system_prompt: str, user_prompt: str,
                      max_new_tokens: int = 256, temperature: float = 0.7,
                      top_p: float = 0.9, num_return_sequences: int = 1) -> List[str]:
        # Re-seed before every call so repeated generations are comparable.
        set_seed(self.seed)

        if self.is_instruction_tuned:
            messages = [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt},
            ]
            prompt = self.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        else:
            prompt = f"System: {system_prompt}\n\nUser: {user_prompt}\n\nAssistant:\n"

        inputs = self.tokenizer(prompt, return_tensors="pt").to(self.model.device)
        input_token_length = inputs.input_ids.shape[1]

        with torch.no_grad():
            out = self.model.generate(
                **inputs,
                do_sample=(temperature > 0),
                temperature=temperature,
                top_p=top_p,
                max_new_tokens=max_new_tokens,
                num_return_sequences=num_return_sequences,
                pad_token_id=self.tokenizer.eos_token_id,
            )

        # Strip the prompt tokens so only the newly generated text is decoded.
        new_tokens = out[:, input_token_length:]
        completions = self.tokenizer.batch_decode(new_tokens, skip_special_tokens=True)
        dbg("Model completions:", completions)
        return completions
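

# Illustrative usage sketch, not part of the original module: it shows how the
# interface might be driven end to end. The model id below is an assumption for
# demonstration (any causal LM loadable by transformers would do), and the
# prompts are placeholders.
if __name__ == "__main__":
    import json

    llm = LLM("Qwen/Qwen2.5-0.5B-Instruct", seed=42)
    raw = llm.generate_json(
        system_prompt='Answer strictly as a JSON object of the form {"answer": "<string>"}.',
        user_prompt="Name one prime number below 10.",
        max_new_tokens=64,
        temperature=0.7,
        top_p=0.9,
    )
    # generate_json returns raw completion strings; JSON parsing is left to the caller.
    try:
        print(json.loads(raw[0]))
    except json.JSONDecodeError:
        print("Completion was not valid JSON:", raw[0])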