import os
import requests
from llama_cpp import Llama

HF_TOKEN = os.getenv("HF_TOKEN")
MODEL_REPO = "afrideva/TinyMistral-248M-SFT-v4-GGUF"
MODEL_FILENAME = "tinymistral-248m-sft-v4.q8_0.gguf"
MODEL_PATH = f"./models/{MODEL_FILENAME}"

# Download the GGUF file directly from the Hugging Face Hub if it is not
# already cached locally.
if not os.path.exists(MODEL_PATH):
    print("📦 Downloading GGUF model manually from Hugging Face...")
    url = f"https://huggingface.co/{MODEL_REPO}/resolve/main/{MODEL_FILENAME}"
    # Only send an Authorization header when a token is actually set;
    # otherwise the literal string "Bearer None" would be sent.
    headers = {"Authorization": f"Bearer {HF_TOKEN}"} if HF_TOKEN else {}
    os.makedirs("./models", exist_ok=True)
    with requests.get(url, headers=headers, stream=True) as r:
        r.raise_for_status()
        with open(MODEL_PATH, "wb") as f:
            for chunk in r.iter_content(chunk_size=8192):
                f.write(chunk)
    print(f"✅ Model downloaded to {MODEL_PATH}")

# Load the model with llama-cpp (CPU-only, small context to fit modest hardware)
llm = Llama(
    model_path=MODEL_PATH,
    n_ctx=256,         # context window in tokens
    n_threads=2,       # CPU threads
    n_batch=32,        # prompt-processing batch size
    n_gpu_layers=0,    # CPU only, no layers offloaded to GPU
    chat_format=None,  # raw completions; not using the llama-2 chat template
)

def generate_structure(prompt: str) -> str:
    """Run a plain text completion and return the stripped output."""
    output = llm.create_completion(
        prompt=prompt,
        temperature=0.7,
        max_tokens=512,
    )
    return output["choices"][0]["text"].strip()
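
# Minimal smoke test when running this file directly (the prompt below is
# illustrative and not part of the original app).
if __name__ == "__main__":
    demo_prompt = "Write a short outline for a blog post about coffee:"
    print(generate_structure(demo_prompt))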