# bp_phi/llm_iface.py
import os
os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"
import torch, random, numpy as np
from transformers import AutoModelForCausalLM, AutoTokenizer, set_seed
from typing import List, Optional
DEBUG = os.getenv("BP_PHI_DEBUG", "0") == "1"

def dbg(*args):
    if DEBUG:
        print("[DEBUG:llm_iface]", *args, flush=True)

class LLM:
    """Thin wrapper around a Hugging Face causal LM with seeded, reproducible generation."""

    def __init__(self, model_id: str, device: str = "auto", dtype: Optional[str] = None, seed: int = 42):
        self.model_id = model_id
        self.seed = seed

        # Set all seeds for reproducibility
        random.seed(seed)
        np.random.seed(seed)
        torch.manual_seed(seed)
        if torch.cuda.is_available():
            torch.cuda.manual_seed_all(seed)
        try:
            torch.use_deterministic_algorithms(True, warn_only=True)
        except Exception as e:
            dbg(f"Could not set deterministic algorithms: {e}")
        set_seed(seed)

        # Gated models (e.g. Gemma 3, Llama) require an HF access token
        token = os.environ.get("HF_TOKEN")
        if not token and ("gemma-3" in model_id or "llama" in model_id):
            print(f"[WARN] No HF_TOKEN set for gated model {model_id}. This may fail.")

        self.tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True, token=token)
        kwargs = {}
        if dtype == "float16":
            kwargs["torch_dtype"] = torch.float16
        elif dtype == "bfloat16":
            kwargs["torch_dtype"] = torch.bfloat16
        self.model = AutoModelForCausalLM.from_pretrained(model_id, device_map=device, token=token, **kwargs)
        self.model.eval()

        # Treat models that ship a chat template as instruction-tuned
        self.is_instruction_tuned = bool(hasattr(self.tokenizer, "apply_chat_template") and self.tokenizer.chat_template)
        dbg(f"Loaded model: {model_id}, Chat-template: {self.is_instruction_tuned}")

    def generate_json(self, system_prompt: str, user_prompt: str,
                      max_new_tokens: int = 256, temperature: float = 0.7,
                      top_p: float = 0.9, num_return_sequences: int = 1) -> List[str]:
        # Re-seed before every call so repeated generations are reproducible
        set_seed(self.seed)
        if self.is_instruction_tuned:
            messages = [{"role": "system", "content": system_prompt}, {"role": "user", "content": user_prompt}]
            prompt = self.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        else:
            prompt = f"System: {system_prompt}\n\nUser: {user_prompt}\n\nAssistant:\n"
        inputs = self.tokenizer(prompt, return_tensors="pt").to(self.model.device)
        input_token_length = inputs.input_ids.shape[1]
        with torch.no_grad():
            out = self.model.generate(
                **inputs,
                do_sample=(temperature > 0),
                temperature=temperature,
                top_p=top_p,
                max_new_tokens=max_new_tokens,
                num_return_sequences=num_return_sequences,
                pad_token_id=self.tokenizer.eos_token_id
            )
        # Decode only the newly generated tokens, not the echoed prompt
        new_tokens = out[:, input_token_length:]
        completions = self.tokenizer.batch_decode(new_tokens, skip_special_tokens=True)
        dbg("Cleaned model completions:", completions)
        return completions
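
# --- Hypothetical usage sketch (not part of the original module) ---
# Assumes a small, ungated model such as "gpt2" is available locally; the prompt
# strings below are illustrative placeholders, not the prompts bp_phi actually uses.
if __name__ == "__main__":
    llm = LLM("gpt2", device="auto", seed=42)
    outs = llm.generate_json(
        system_prompt="Reply with a single JSON object.",
        user_prompt='Return {"ok": true} and nothing else.',
        max_new_tokens=32,
        temperature=0.7,
    )
    print(outs[0])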