Remostart committed on
Commit 2984b8e · verified · 1 Parent(s): 0839006

Update app.py

Files changed (1)
  1. app.py +48 -37
app.py CHANGED
@@ -8,59 +8,70 @@ import logging
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)

-# Load model & tokenizer
+# Global variables for model and tokenizer (lazy loading)
+model = None
+tokenizer = None
 MODEL_NAME = "ubiodee/Test_Plutus"
+FALLBACK_TOKENIZER = "gpt2"

+# Load tokenizer at startup (lightweight, no model yet)
 try:
-    logger.info("Loading tokenizer with use_fast=False...")
+    logger.info("Loading tokenizer at startup with legacy versions...")
     tokenizer = AutoTokenizer.from_pretrained(
         MODEL_NAME,
-        use_fast=False,  # Use slow tokenizer to avoid fast tokenizer errors
-        use_safetensors=True,
-        trust_remote_code=True,  # Allow custom tokenizer code
+        use_fast=False,
+        trust_remote_code=True,
     )
-    logger.info("Tokenizer loaded successfully.")
+    logger.info("Primary tokenizer loaded successfully.")
 except Exception as e:
-    logger.error(f"Tokenizer loading failed: {str(e)}")
-    raise
-
-try:
-    logger.info("Loading model with 8-bit quantization...")
-    model = AutoModelForCausalLM.from_pretrained(
-        MODEL_NAME,
-        device_map="auto",  # Automatically map to GPU/CPU
-        load_in_8bit=True,  # Use 8-bit quantization to match model
-        torch_dtype=torch.bfloat16,  # Use bfloat16 for efficiency
-        use_safetensors=True,
-        low_cpu_mem_usage=True,  # Reduce CPU memory during loading
-        trust_remote_code=True,  # Allow custom model code
+    logger.warning(f"Primary tokenizer failed: {str(e)}. Using fallback.")
+    tokenizer = AutoTokenizer.from_pretrained(
+        FALLBACK_TOKENIZER,
+        use_fast=False,
+        trust_remote_code=True,
     )
-    model.eval()
-    logger.info("Model loaded successfully.")
-except Exception as e:
-    logger.error(f"Model loading failed: {str(e)}")
-    raise
+    logger.info("Fallback tokenizer loaded.")

-# Set pad token if not defined
+# Set pad token
 if tokenizer.pad_token_id is None:
     tokenizer.pad_token_id = tokenizer.eos_token_id
     logger.info("Set pad_token_id to eos_token_id.")

-# Move model to GPU if available
-if torch.cuda.is_available():
-    model.to("cuda")
-    logger.info("Model moved to GPU.")
-else:
-    logger.warning("No GPU available, using CPU.")
+def load_model():
+    """Load model inside GPU context to enable quantization."""
+    global model
+    if model is None:
+        try:
+            logger.info("Loading model with CPU fallback (full precision)...")
+            model = AutoModelForCausalLM.from_pretrained(
+                MODEL_NAME,
+                torch_dtype=torch.float16,  # Use fp16 for memory efficiency without bitsandbytes
+                low_cpu_mem_usage=True,
+                trust_remote_code=True,
+            )
+            model.eval()
+            if torch.cuda.is_available():
+                model.to("cuda")
+                logger.info("Model loaded and moved to GPU.")
+            else:
+                logger.warning("GPU not available; using CPU.")
+        except Exception as e:
+            logger.error(f"Model loading failed: {str(e)}")
+            raise
+    return model

-# Response function with GPU decorator
-@spaces.GPU
+# Response function: Load model on first call, then reuse
+@spaces.GPU(duration=300)  # Allow up to 5min for loading + inference
 def generate_response(prompt, progress=gr.Progress()):
-    progress(0.1, desc="Tokenizing input...")
+    global model
+    progress(0.1, desc="Loading model if needed...")
+    model = load_model()  # Ensures model is loaded in GPU context
+
+    progress(0.3, desc="Tokenizing input...")
     try:
         inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

-        progress(0.5, desc="Generating response...")
+        progress(0.6, desc="Generating response...")
         with torch.no_grad():
             outputs = model.generate(
                 **inputs,
@@ -73,7 +84,7 @@ def generate_response(prompt, progress=gr.Progress()):
             )
         response = tokenizer.decode(outputs[0], skip_special_tokens=True)

-        # Remove the prompt from the output
+        # Remove prompt from output
         if response.startswith(prompt):
             response = response[len(prompt):].strip()

@@ -93,4 +104,4 @@ demo = gr.Interface(
 )

 # Launch with queueing
-demo.queue(max_size=10).launch(enable_queue=True, max_threads=1)
+demo.queue(max_size=5).launch(enable_queue=True, max_threads=1)
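The hunks above only show the closing parenthesis of the gr.Interface(...) block, so for orientation here is a minimal sketch of how the lazy-loading generate_response is typically wired up and launched. The textbox labels and title below are illustrative assumptions rather than the actual values in app.py, and enable_queue is omitted because it is a Gradio 3-era launch() argument; on current Gradio, demo.queue(...) alone enables queueing.

import gradio as gr

# Hypothetical wiring -- the real labels/description in app.py are not shown in this diff.
demo = gr.Interface(
    fn=generate_response,                        # model is loaded lazily on the first call
    inputs=gr.Textbox(lines=4, label="Prompt"),
    outputs=gr.Textbox(label="Response"),
    title="ubiodee/Test_Plutus",
)

# Queue up to 5 pending requests and serve with a single worker thread,
# matching the new demo.queue(max_size=5) / max_threads=1 settings.
demo.queue(max_size=5).launch(max_threads=1)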