# bp_phi/llm_iface.py
import os
os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"
import torch, random, numpy as np
from transformers import AutoModelForCausalLM, AutoTokenizer, set_seed
from typing import List, Optional

DEBUG = os.getenv("BP_PHI_DEBUG", "0") == "1"

def dbg(*args):
    if DEBUG:
        print("[DEBUG:llm_iface]", *args, flush=True)

class LLM:
    def __init__(self, model_id: str, device: str = "auto", dtype: Optional[str] = None, seed: int = 42):
        self.model_id = model_id
        self.seed = seed

        # Set all seeds for reproducibility
        random.seed(seed)
        np.random.seed(seed)
        torch.manual_seed(seed)
        if torch.cuda.is_available():
            torch.cuda.manual_seed_all(seed)
        try:
            torch.use_deterministic_algorithms(True)
        except Exception as e:
            dbg(f"Could not set deterministic algorithms: {e}")
        set_seed(seed)

        token = os.environ.get("HF_TOKEN")
        if not token and "gemma-3" in model_id:
            print("[WARN] No HF_TOKEN set. If the model is gated (like google/gemma-3-1b-it), this will fail.")

        self.tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True, token=token)
        kwargs = {}
        if dtype == "float16": kwargs["torch_dtype"] = torch.float16
        elif dtype == "bfloat16": kwargs["torch_dtype"] = torch.bfloat16

        self.model = AutoModelForCausalLM.from_pretrained(model_id, device_map=device, token=token, **kwargs)
        self.model.eval()
        self.is_instruction_tuned = bool(getattr(self.tokenizer, "chat_template", None))

        dbg(f"Loaded model: {model_id}, Chat-template: {self.is_instruction_tuned}")

    def generate_json(self, system_prompt: str, user_prompt: str,
                      max_new_tokens: int = 256, temperature: float = 0.7,
                      top_p: float = 0.9, num_return_sequences: int = 1) -> List[str]:
        set_seed(self.seed) # Re-seed for each call for full determinism

        if self.is_instruction_tuned:
            messages = [{"role": "system", "content": system_prompt}, {"role": "user", "content": user_prompt}]
            prompt = self.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        else:
            prompt = f"{system_prompt}\n\nUser:\n{user_prompt}\n\nAssistant:\n"

        inputs = self.tokenizer(prompt, return_tensors="pt").to(self.model.device)
        input_token_length = inputs.input_ids.shape[1]

        # Only pass sampling parameters when sampling is enabled: temperature=0
        # means greedy decoding, and supplying temperature/top_p alongside
        # do_sample=False triggers warnings in recent transformers versions.
        do_sample = temperature > 0
        gen_kwargs = dict(
            max_new_tokens=max_new_tokens,
            num_return_sequences=num_return_sequences,
            do_sample=do_sample,
            pad_token_id=self.tokenizer.eos_token_id,
        )
        if do_sample:
            gen_kwargs["temperature"] = temperature
            gen_kwargs["top_p"] = top_p

        with torch.no_grad():
            out = self.model.generate(**inputs, **gen_kwargs)

        # ✅ Decode ONLY the newly generated tokens, not the prompt
        new_tokens = out[:, input_token_length:]
        completions = self.tokenizer.batch_decode(new_tokens, skip_special_tokens=True)

        dbg("Cleaned model completions:", completions)
        return completions
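
if __name__ == "__main__":
    # Minimal usage sketch for the LLM wrapper above. The model ID matches the
    # gated checkpoint mentioned in the warning message (HF_TOKEN may be
    # required); the prompts and token budget below are illustrative
    # assumptions, not part of the benchmark itself.
    llm = LLM("google/gemma-3-1b-it", device="auto", seed=42)
    completions = llm.generate_json(
        system_prompt="You are a helpful assistant. Reply with a single JSON object.",
        user_prompt='Return {"answer": <the sum of 2 and 3>}.',
        max_new_tokens=64,
        temperature=0.0,  # greedy decoding for a deterministic reply
    )
    print(completions[0])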