Update app.py
app.py CHANGED
@@ -1,8 +1,25 @@
 import os
 import re
 import gradio as gr
+import spaces
+
+# CRITICAL: Disable PyTorch compiler BEFORE importing torch
+os.environ["PYTORCH_NO_CUDA_MEMORY_CACHING"] = "1"
+os.environ["TORCH_COMPILE_DISABLE"] = "1"
+os.environ["TORCH_INDUCTOR_DISABLE"] = "1"
+os.environ["TORCHINDUCTOR_DISABLE_CUDAGRAPHS"] = "1"
+os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
+os.environ["TORCH_USE_CUDA_DSA"] = "0"
+
+# Now import torch and disable its compiler features
 import torch
-
+if hasattr(torch, "_dynamo"):
+    if hasattr(torch._dynamo, "config"):
+        torch._dynamo.config.suppress_errors = True
+    if hasattr(torch._dynamo, "disable"):
+        torch._dynamo.disable()
+        print("Disabled torch._dynamo")
+
 from transformers import AutoTokenizer, AutoModelForCausalLM
 
 # Global variables for model and tokenizer
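A caveat on the dynamo block above: on PyTorch 2.x, calling `torch._dynamo.disable()` with no argument returns a decorator object rather than flipping a global switch, so the bare call is likely a no-op and the heavy lifting is done by `TORCH_COMPILE_DISABLE=1`, which dynamo reads at import time. A minimal sketch of the documented per-function alternative, assuming PyTorch >= 2.1 (`generate_reply` below is an illustrative helper, not part of this Space):

import torch

# torch.compiler.disable is the public counterpart of torch._dynamo.disable;
# applied as a decorator it keeps dynamo/inductor from tracing the function.
@torch.compiler.disable
def generate_reply(model, inputs):
    # Hypothetical stand-in for the Space's model.generate() call path.
    return model.generate(**inputs, max_new_tokens=64)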
@@ -202,8 +219,9 @@ def load_models():
         token=hf_token,
         torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
         device_map=device_map,
-
-
+        use_cache=True,
+        use_flash_attention_2=False,
+        _attn_implementation="eager"  # Use eager mode to avoid compiler issues
     )
 
     # Cache the loaded model and tokenizer
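For reference, `use_flash_attention_2` is deprecated in recent transformers releases and `_attn_implementation` is a private kwarg; since transformers 4.36 the documented spelling is `attn_implementation`. A sketch of the equivalent call under that assumption (the model id and token lookup are placeholders, not this Space's actual values):

import os
import torch
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-2-7b-chat-hf",  # placeholder model id
    token=os.environ.get("HF_TOKEN"),  # placeholder token lookup
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
    device_map="auto",
    use_cache=True,  # keep the KV cache enabled during generation
    attn_implementation="eager",  # public form of _attn_implementation="eager"
)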
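The added `import spaces` suggests the Space targets ZeroGPU hardware, where every function that touches the GPU is wrapped with the `spaces.GPU` decorator. That wiring is outside this diff, but the usual pattern looks like the sketch below (the function, its globals, and the duration value are illustrative):

import spaces

@spaces.GPU(duration=60)  # request a GPU slot for up to ~60 s per call
def run_inference(prompt: str) -> str:
    # `tokenizer` and `model` are assumed to be the globals load_models() fills in.
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    output = model.generate(**inputs, max_new_tokens=128)
    return tokenizer.decode(output[0], skip_special_tokens=True)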