ThongCoding committed
Commit 3dc4bd8 · 1 Parent(s): 96e3318
Files changed (1): model.py (+7 -7)
model.py CHANGED
@@ -3,8 +3,8 @@ import requests
 from llama_cpp import Llama

 HF_TOKEN = os.getenv("HF_TOKEN")
-MODEL_REPO = "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF"
-MODEL_FILENAME = "tinyllama-1.1b-chat-v1.0.Q2_K.gguf"
+MODEL_REPO = "afrideva/TinyMistral-248M-SFT-v4-GGUF"
+MODEL_FILENAME = "TinyMistral-248M-SFT-v4.Q4_K_M.gguf"
 MODEL_PATH = f"./models/{MODEL_FILENAME}"

 # Manual download with fallback
@@ -25,12 +25,12 @@ if not os.path.exists(MODEL_PATH):

 # Load with llama-cpp
 llm = Llama(
-    model_path="./models/tinyllama-1.1b-chat-v1.0.Q2_K.gguf",
-    n_ctx=128,  # Limit context for smaller RAM/CPU
+    model_path=MODEL_PATH,
+    n_ctx=256,
+    n_threads=2,
     n_batch=32,
-    n_threads=2,  # Use 2 threads (you can try 1 if needed)
-    n_gpu_layers=0,  # CPU-only
-    chat_format="llama-2"
+    n_gpu_layers=0,
+    chat_format=None  # Not using llama-2 format
 )

 def generate_structure(prompt: str) -> str:
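For context, a minimal sketch of how the changed file plausibly fits together after this commit. The "manual download with fallback" body (lines 10-24) is elided from the diff, so the hf_hub_download/requests fallback below is an assumption, as are the resolve URL shape, the max_tokens value, and the body of generate_structure; only the constants and the Llama(...) arguments come from the diff itself.

import os
import requests
from huggingface_hub import hf_hub_download
from llama_cpp import Llama

HF_TOKEN = os.getenv("HF_TOKEN")
MODEL_REPO = "afrideva/TinyMistral-248M-SFT-v4-GGUF"
MODEL_FILENAME = "TinyMistral-248M-SFT-v4.Q4_K_M.gguf"
MODEL_PATH = f"./models/{MODEL_FILENAME}"

# Manual download with fallback (assumed shape; this hunk is not shown in the diff)
if not os.path.exists(MODEL_PATH):
    os.makedirs("./models", exist_ok=True)
    try:
        # Preferred path: let huggingface_hub handle auth and resumable downloads.
        hf_hub_download(repo_id=MODEL_REPO, filename=MODEL_FILENAME,
                        local_dir="./models", token=HF_TOKEN)
    except Exception:
        # Fallback: stream the file over plain HTTP (hypothetical URL shape).
        url = f"https://huggingface.co/{MODEL_REPO}/resolve/main/{MODEL_FILENAME}"
        headers = {"Authorization": f"Bearer {HF_TOKEN}"} if HF_TOKEN else {}
        with requests.get(url, headers=headers, stream=True, timeout=60) as r:
            r.raise_for_status()
            with open(MODEL_PATH, "wb") as f:
                for chunk in r.iter_content(chunk_size=1 << 20):
                    f.write(chunk)

# Load with llama-cpp (arguments taken verbatim from the new side of the diff)
llm = Llama(
    model_path=MODEL_PATH,
    n_ctx=256,
    n_threads=2,
    n_batch=32,
    n_gpu_layers=0,   # CPU-only
    chat_format=None  # raw completion mode, no chat template applied
)

def generate_structure(prompt: str) -> str:
    # With chat_format=None, the model is invoked as a plain completion:
    # the prompt is fed verbatim and the raw continuation is returned.
    out = llm(prompt, max_tokens=200)  # max_tokens is an assumed value
    return out["choices"][0]["text"]

The commit's direction is consistent throughout: swapping the 1.1B TinyLlama Q2_K for a 248M TinyMistral Q4_K_M shrinks the weights substantially, which is what lets n_ctx grow from 128 to 256 while keeping the 2-thread, CPU-only (n_gpu_layers=0) budget; dropping chat_format="llama-2" matches the new model, which is not a Llama-2-chat-formatted model.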