import os

import requests
from llama_cpp import Llama

HF_TOKEN = os.getenv("HF_TOKEN")
MODEL_REPO = "afrideva/TinyMistral-248M-SFT-v4-GGUF"
MODEL_FILENAME = "tinymistral-248m-sft-v4.q8_0.gguf"
MODEL_PATH = f"./models/{MODEL_FILENAME}"

# Download the GGUF file directly from Hugging Face if it is not already cached locally.
if not os.path.exists(MODEL_PATH):
    print("📦 Downloading GGUF model manually from Hugging Face...")

    url = f"https://huggingface.co/{MODEL_REPO}/resolve/main/{MODEL_FILENAME}"
    # Only send the Authorization header when a token is set; it is needed only
    # for gated or private repos, and a literal "Bearer None" value may cause a 401.
    headers = {"Authorization": f"Bearer {HF_TOKEN}"} if HF_TOKEN else {}

    os.makedirs("./models", exist_ok=True)
    # Stream the response to disk in 8 KiB chunks so the file is never held in memory.
    with requests.get(url, headers=headers, stream=True) as r:
        r.raise_for_status()
        with open(MODEL_PATH, "wb") as f:
            for chunk in r.iter_content(chunk_size=8192):
                f.write(chunk)

    print(f"✅ Model downloaded to {MODEL_PATH}")

# Load the model with llama-cpp-python (CPU only, small context window).
llm = Llama(
    model_path=MODEL_PATH,
    n_ctx=256,         # context window size in tokens
    n_threads=2,       # CPU threads used for inference
    n_batch=32,        # prompt tokens evaluated per batch
    n_gpu_layers=0,    # keep all layers on the CPU
    chat_format=None,  # chat templates unused; create_completion() is called below
)


def generate_structure(prompt: str) -> str:
    """Run a plain text completion and return the stripped generated text."""
    output = llm.create_completion(
        prompt=prompt,
        temperature=0.7,
        max_tokens=512,
    )
    return output["choices"][0]["text"].strip()
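

# Minimal usage sketch (the prompt text is illustrative only): running the file
# directly sends one example prompt through generate_structure() and prints the result.
if __name__ == "__main__":
    example_prompt = "Write an outline for a short blog post about running small LLMs locally."
    print(generate_structure(example_prompt))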