desert committed
Commit · 5ccb54c · 1 Parent(s): d67d04a
del
app.py CHANGED
@@ -6,9 +6,10 @@ max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
 dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
 load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.
 
-#
-device = "
+# Force the model to run on CPU only by setting the device to "cpu"
+device = "cpu"
 
+# Load model and tokenizer with the device set to "cpu"
 model, tokenizer = FastLanguageModel.from_pretrained(
     model_name="llama_lora_model_1",
     max_seq_length=max_seq_length,
@@ -16,7 +17,8 @@ model, tokenizer = FastLanguageModel.from_pretrained(
     load_in_4bit=load_in_4bit,
 )
 
-
+# Move the model to CPU (even if it was initially loaded with GPU support)
+model.to(device)
 
 # Respond function
 def respond(
@@ -48,9 +50,9 @@ def respond(
         return_tensors="pt",
     )
 
-    # Generate the response using your model
+    # Generate the response using your model on CPU
     outputs = model.generate(
-        input_ids=inputs["input_ids"].to(device), # Ensure input is on the
+        input_ids=inputs["input_ids"].to(device), # Ensure input is on the CPU
         max_new_tokens=max_tokens,
         temperature=temperature,
         top_p=top_p,
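For reference, below is a minimal sketch of the end-to-end code path this commit produces, assuming the unsloth FastLanguageModel API as used in app.py. The prompt string and sampling values are illustrative; do_sample=True is added because transformers ignores temperature and top_p without it, and load_in_4bit is flipped to False here because bitsandbytes 4-bit kernels generally require a CUDA GPU, which conflicts with a CPU-only run:

from unsloth import FastLanguageModel

max_seq_length = 2048
dtype = None          # None = auto-detect
load_in_4bit = False  # 4-bit bitsandbytes kernels generally need a CUDA GPU;
                      # disabled here since everything is pinned to the CPU

# Pin all computation to the CPU
device = "cpu"

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="llama_lora_model_1",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)
model.to(device)  # move weights to the CPU even if loaded with GPU support

# Illustrative generation call mirroring the respond() body in the diff
inputs = tokenizer("Hello, how are you?", return_tensors="pt")
outputs = model.generate(
    input_ids=inputs["input_ids"].to(device),  # keep inputs on the same device
    max_new_tokens=64,
    temperature=0.7,
    top_p=0.9,
    do_sample=True,  # not in the diff; required for temperature/top_p to apply
)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

One caveat: unsloth itself is built around CUDA and may refuse to load on a machine with no GPU at all, so a Space that truly has no accelerator may need to fall back to a plain transformers AutoModelForCausalLM load instead.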