# bp_phi/llm_iface.py
import os

# Must be set before torch initializes CUDA: makes cuBLAS operations deterministic.
os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from typing import List, Optional
class LLM:
    """Thin wrapper around a Hugging Face causal LM used by the BP-Φ suite."""

    def __init__(self, model_id: str, device: str = "auto", dtype: Optional[str] = None):
        self.model_id = model_id
        self.tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True)

        # Optionally load the weights in a reduced-precision dtype.
        kwargs = {}
        if dtype == "float16":
            kwargs["torch_dtype"] = torch.float16
        elif dtype == "bfloat16":
            kwargs["torch_dtype"] = torch.bfloat16

        self.model = AutoModelForCausalLM.from_pretrained(model_id, device_map=device, **kwargs)
        self.model.eval()

        # Treat the model as instruction-tuned only if the tokenizer ships a chat template.
        self.is_instruction_tuned = bool(
            hasattr(self.tokenizer, "apply_chat_template")
            and getattr(self.tokenizer, "chat_template", None)
        )
        print(f"[BP-Φ] Loaded model: {model_id}")
        print(f"[BP-Φ] Chat-template detected: {self.is_instruction_tuned}")
    def generate_json(self, system_prompt: str, user_prompt: str,
                      max_new_tokens: int = 256, temperature: float = 0.7,
                      top_p: float = 0.9, num_return_sequences: int = 1) -> List[str]:
        """Sample one or more completions for a system/user prompt pair."""
        if self.is_instruction_tuned:
            # Use the tokenizer's own chat template when one is available.
            messages = [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt},
            ]
            prompt = self.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        else:
            # Fall back to a plain-text prompt format for base (non-chat) models.
            prompt = f"{system_prompt}\n\nUser:\n{user_prompt}\n\nAssistant:\n"
        inputs = self.tokenizer(prompt, return_tensors="pt").to(self.model.device)
        with torch.no_grad():
            # Nucleus sampling; pad with EOS so models without a pad token still work.
            out = self.model.generate(
                **inputs,
                do_sample=True,
                temperature=temperature,
                top_p=top_p,
                max_new_tokens=max_new_tokens,
                num_return_sequences=num_return_sequences,
                pad_token_id=self.tokenizer.eos_token_id,
            )
        # Decode only the newly generated tokens so the prompt is not echoed back.
        prompt_len = inputs["input_ids"].shape[1]
        texts = self.tokenizer.batch_decode(out[:, prompt_len:], skip_special_tokens=True)
        completions = []
        for t in texts:
            # Strip end-of-turn markers that survive skip_special_tokens on some models.
            for marker in ["<end_of_turn>", "<end_of_text>", "</s>"]:
                if marker in t:
                    t = t.split(marker)[0]
            # Safety net for the plain-text prompt format.
            if "Assistant:" in t:
                t = t.split("Assistant:")[-1]
            completions.append(t.strip())
        return completions
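

# --- Usage sketch (not part of the original module) ---
# A minimal, hedged example of how the LLM wrapper above might be exercised.
# The model id "gpt2" and the prompts are illustrative assumptions, not values
# taken from the BP-Φ code base; any small causal LM on the Hub would do.
if __name__ == "__main__":
    llm = LLM("gpt2")  # assumption: a small base model without a chat template
    completions = llm.generate_json(
        system_prompt="You are a helpful assistant that answers in JSON.",
        user_prompt='Return {"ok": true} and nothing else.',
        max_new_tokens=32,
        num_return_sequences=1,
    )
    for c in completions:
        print(c)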