# bp_phi/llm_iface.py
import os
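# Must be set before CUDA is initialized: cuBLAS needs a fixed workspace size
# for torch.use_deterministic_algorithms() to cover its matmul kernels.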
os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"
import random

import numpy as np
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, set_seed
from typing import List, Optional

DEBUG = os.getenv("BP_PHI_DEBUG", "0") == "1"

def dbg(*args):
    if DEBUG:
        print("[DEBUG:llm_iface]", *args, flush=True)

class LLM:
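    """Thin wrapper around a Hugging Face causal LM with seeded, best-effort
    deterministic generation."""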
    def __init__(self, model_id: str, device: str = "auto", dtype: Optional[str] = None, seed: int = 42):
        self.model_id = model_id
        self.seed = seed

        # Set all seeds for reproducibility
        random.seed(seed)
        np.random.seed(seed)
        torch.manual_seed(seed)
        if torch.cuda.is_available():
            torch.cuda.manual_seed_all(seed)
        try:
            torch.use_deterministic_algorithms(True, warn_only=True)
        except Exception as e:
            dbg(f"Could not set deterministic algorithms: {e}")
        set_seed(seed)

        token = os.environ.get("HF_TOKEN")
        if not token and ("gemma-3" in model_id or "llama" in model_id):
            print(f"[WARN] No HF_TOKEN set for gated model {model_id}. This may fail.")

        self.tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True, token=token)
        kwargs = {}
        if dtype == "float16":
            kwargs["torch_dtype"] = torch.float16
        elif dtype == "bfloat16":
            kwargs["torch_dtype"] = torch.bfloat16

        self.model = AutoModelForCausalLM.from_pretrained(model_id, device_map=device, token=token, **kwargs)
        self.model.eval()
        self.is_instruction_tuned = bool(hasattr(self.tokenizer, "apply_chat_template") and self.tokenizer.chat_template)

        dbg(f"Loaded model: {model_id}, Chat-template: {self.is_instruction_tuned}")

    def generate_json(self, system_prompt: str, user_prompt: str,
                      max_new_tokens: int = 256, temperature: float = 0.7,
                      top_p: float = 0.9, num_return_sequences: int = 1) -> List[str]:
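        """Generate num_return_sequences completions for a system/user prompt pair.

        The prompts are expected to elicit JSON, but the raw decoded strings are
        returned as-is; callers parse and validate the JSON themselves.
        """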
        set_seed(self.seed)

        if self.is_instruction_tuned:
            messages = [{"role": "system", "content": system_prompt}, {"role": "user", "content": user_prompt}]
            prompt = self.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        else:
            prompt = f"System: {system_prompt}\n\nUser: {user_prompt}\n\nAssistant:\n"

        inputs = self.tokenizer(prompt, return_tensors="pt").to(self.model.device)
        input_token_length = inputs.input_ids.shape[1]

        do_sample = temperature > 0
        gen_kwargs = {
            "max_new_tokens": max_new_tokens,
            "num_return_sequences": num_return_sequences,
            "do_sample": do_sample,
            "pad_token_id": self.tokenizer.eos_token_id,
        }
        if do_sample:
            # Pass sampling parameters only when sampling; transformers warns
            # when temperature/top_p accompany greedy decoding.
            gen_kwargs["temperature"] = temperature
            gen_kwargs["top_p"] = top_p

        with torch.no_grad():
            out = self.model.generate(**inputs, **gen_kwargs)

        # Keep only the newly generated tokens; drop the echoed prompt.
        new_tokens = out[:, input_token_length:]
        completions = self.tokenizer.batch_decode(new_tokens, skip_special_tokens=True)

        dbg("Cleaned model completions:", completions)
        return completions
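
# -----------------------------------------------------------------------------
# Usage sketch (illustrative, not part of the module proper). The model id is
# an assumption; any causal LM checkpoint you have access to works the same way.
# Set BP_PHI_DEBUG=1 to see the dbg() output.
# -----------------------------------------------------------------------------
if __name__ == "__main__":
    llm = LLM("Qwen/Qwen2.5-0.5B-Instruct", device="auto", seed=42)
    completions = llm.generate_json(
        system_prompt='Reply with a single JSON object of the form {"answer": "<text>"}.',
        user_prompt="Name one prime number below 10.",
        max_new_tokens=64,
        temperature=0.7,
    )
    print(completions[0])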