desert committed
Commit · 5ccb54c · 1 Parent(s): d67d04a
del
app.py CHANGED
@@ -6,9 +6,10 @@ max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
 dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
 load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.
 
-#
-device = "
+# Force the model to run on CPU only by setting the device to "cpu"
+device = "cpu"
 
+# Load model and tokenizer with the device set to "cpu"
 model, tokenizer = FastLanguageModel.from_pretrained(
     model_name="llama_lora_model_1",
     max_seq_length=max_seq_length,
@@ -16,7 +17,8 @@ model, tokenizer = FastLanguageModel.from_pretrained(
     load_in_4bit=load_in_4bit,
 )
 
-
+# Move the model to CPU (even if it was initially loaded with GPU support)
+model.to(device)
 
 # Respond function
 def respond(
@@ -48,9 +50,9 @@ def respond(
         return_tensors="pt",
     )
 
-    # Generate the response using your model
+    # Generate the response using your model on CPU
     outputs = model.generate(
-        input_ids=inputs["input_ids"].to(device), # Ensure input is on the
+        input_ids=inputs["input_ids"].to(device), # Ensure input is on the CPU
         max_new_tokens=max_tokens,
         temperature=temperature,
         top_p=top_p,
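For reference, below is a minimal sketch of the end-to-end code path this commit produces, assuming the unsloth FastLanguageModel API as used in app.py. The prompt string and sampling values are illustrative; do_sample=True is added because transformers ignores temperature and top_p without it, and load_in_4bit is flipped to False here because bitsandbytes 4-bit kernels generally require a CUDA GPU, which conflicts with a CPU-only run:

from unsloth import FastLanguageModel

max_seq_length = 2048
dtype = None          # None = auto-detect
load_in_4bit = False  # 4-bit bitsandbytes kernels generally need a CUDA GPU;
                      # disabled here since everything is pinned to the CPU

# Pin all computation to the CPU
device = "cpu"

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="llama_lora_model_1",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)
model.to(device)  # move weights to the CPU even if loaded with GPU support

# Illustrative generation call mirroring the respond() body in the diff
inputs = tokenizer("Hello, how are you?", return_tensors="pt")
outputs = model.generate(
    input_ids=inputs["input_ids"].to(device),  # keep inputs on the same device
    max_new_tokens=64,
    temperature=0.7,
    top_p=0.9,
    do_sample=True,  # not in the diff; required for temperature/top_p to apply
)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

One caveat: unsloth itself is built around CUDA and may refuse to load on a machine with no GPU at all, so a Space that truly has no accelerator may need to fall back to a plain transformers AutoModelForCausalLM load instead.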