Remostart committed
Commit 2499bf4 · verified · 1 parent: 8b416f9

Update app.py

Files changed (1):
  app.py +40 -43
app.py CHANGED
@@ -1,79 +1,72 @@
 import gradio as gr
 import torch
+import torch.multiprocessing as mp
 from transformers import AutoModelForCausalLM, AutoTokenizer
 import spaces
 import logging
+import os
+
+# Set multiprocessing to 'spawn' for ZeroGPU compatibility
+mp.set_start_method('spawn', force=True)
 
 # Set up logging
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 
-# Global variables for model and tokenizer (lazy loading)
+# Global variables for model and tokenizer (load at startup)
 model = None
 tokenizer = None
 MODEL_NAME = "ubiodee/Test_Plutus"
 FALLBACK_TOKENIZER = "NousResearch/Meta-Llama-3-8B"
 
-# Load tokenizer at startup (lightweight, no model yet)
+# Load tokenizer at startup
 try:
     logger.info("Loading tokenizer at startup for %s...", MODEL_NAME)
     tokenizer = AutoTokenizer.from_pretrained(
         MODEL_NAME,
-        use_fast=True,  # Llama-3 uses fast tokenizer
+        use_fast=True,
         trust_remote_code=True,
     )
     logger.info("Primary tokenizer loaded successfully.")
 except Exception as e:
     logger.warning(f"Primary tokenizer failed: {str(e)}. Using fallback: {FALLBACK_TOKENIZER}")
-    try:
-        tokenizer = AutoTokenizer.from_pretrained(
-            FALLBACK_TOKENIZER,
-            use_fast=True,
-            trust_remote_code=True,
-        )
-        logger.info("Fallback tokenizer loaded successfully.")
-    except Exception as fallback_e:
-        logger.error(f"Fallback tokenizer failed: {str(fallback_e)}")
-        raise
+    tokenizer = AutoTokenizer.from_pretrained(
+        FALLBACK_TOKENIZER,
+        use_fast=True,
+        trust_remote_code=True,
+    )
+    logger.info("Fallback tokenizer loaded successfully.")
 
 # Set pad token
 if tokenizer.pad_token_id is None:
     tokenizer.pad_token_id = tokenizer.eos_token_id
     logger.info("Set pad_token_id to eos_token_id: %s", tokenizer.eos_token_id)
 
-def load_model():
-    """Load model inside GPU context."""
-    global model
-    if model is None:
-        try:
-            logger.info("Loading model %s with torch.float16...", MODEL_NAME)
-            model = AutoModelForCausalLM.from_pretrained(
-                MODEL_NAME,
-                torch_dtype=torch.float16,  # Use fp16 for ZeroGPU
-                low_cpu_mem_usage=True,
-                trust_remote_code=True,
-            )
-            model.eval()
-            if torch.cuda.is_available():
-                model.to("cuda")
-                logger.info("Model loaded and moved to GPU.")
-            else:
-                logger.warning("GPU not available; using CPU.")
-        except Exception as e:
-            logger.error(f"Model loading failed: {str(e)}")
-            raise
-    return model
+# Load model at startup (CPU/fp16, move to GPU in decorated function)
+try:
+    logger.info("Loading model %s with torch.float16 on CPU...", MODEL_NAME)
+    model = AutoModelForCausalLM.from_pretrained(
+        MODEL_NAME,
+        torch_dtype=torch.float16,
+        low_cpu_mem_usage=True,
+        trust_remote_code=True,
+        device_map="cpu",  # Load on CPU to avoid CUDA init issues
+    )
+    model.eval()
+    logger.info("Model loaded successfully on CPU.")
+except Exception as e:
+    logger.error(f"Model loading failed: {str(e)}")
+    raise
 
-# Response function: Load model on first call, then reuse
-@spaces.GPU(duration=300)  # Allow up to 5min for loading + inference
+# Response function: Transfer to GPU and infer (no CUDA init here)
+@spaces.GPU(duration=120)  # Reduced for quota efficiency
 def generate_response(prompt, progress=gr.Progress()):
     global model
-    progress(0.1, desc="Loading model if needed...")
-    model = load_model()  # Ensures model is loaded in GPU context
-
-    progress(0.3, desc="Tokenizing input...")
+    progress(0.1, desc="Moving model to GPU...")
     try:
-        inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True, max_length=512).to(model.device)
+        model = model.to("cuda")  # Move to GPU in decorated context
+        progress(0.3, desc="Tokenizing input...")
+        inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True, max_length=512).to("cuda")
 
         progress(0.6, desc="Generating response...")
         with torch.no_grad():
@@ -97,6 +90,10 @@ def generate_response(prompt, progress=gr.Progress()):
     except Exception as e:
         logger.error(f"Inference failed: {str(e)}")
         return f"Error during generation: {str(e)}"
+    finally:
+        # Clean up GPU memory
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
 
 # Gradio UI
 demo = gr.Interface(
@@ -107,5 +104,5 @@ demo = gr.Interface(
     description="Write Plutus smart contracts on Cardano blockchain."
 )
 
-# Launch with ZeroGPU-compatible settings
+# Launch without queue args (ZeroGPU handles it)
 demo.launch()
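
The substance of this commit is the ZeroGPU loading pattern: load all weights on CPU at import time, keep the main process free of any CUDA initialization, and touch the GPU only inside the @spaces.GPU-decorated function, where ZeroGPU attaches a device for the duration of the call. A minimal self-contained sketch of that pattern follows; the function name `answer`, the model name, and the generation settings are placeholders, not the Space's actual values:

import torch
import torch.multiprocessing as mp
import spaces
from transformers import AutoModelForCausalLM, AutoTokenizer

# 'spawn' avoids fork-related CUDA problems in ZeroGPU worker processes
mp.set_start_method('spawn', force=True)

MODEL_NAME = "example/causal-lm"  # placeholder; the Space uses ubiodee/Test_Plutus

# Startup: weights stay on CPU, so the main process never initializes CUDA
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True,
    device_map="cpu",
).eval()

@spaces.GPU(duration=120)  # a GPU is attached only while this call runs
def answer(prompt: str) -> str:
    global model
    try:
        model = model.to("cuda")  # first CUDA touch happens in the GPU worker
        inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
        with torch.no_grad():
            out = model.generate(**inputs, max_new_tokens=128)  # placeholder settings
        return tokenizer.decode(out[0], skip_special_tokens=True)
    finally:
        if torch.cuda.is_available():
            torch.cuda.empty_cache()  # release cached blocks before the GPU detaches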
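
Both versions elide the actual generation call: old lines 80 to 96 and new lines 73 to 89 are unchanged context hidden between the hunks. For orientation only, a body consistent with the visible tokenization and progress calls might read as follows; every parameter value here is a hypothetical illustration, not the committed code:

        # Hypothetical continuation inside generate_response (elided from the diff):
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=256,                   # assumed value
                do_sample=True,                       # assumed
                temperature=0.7,                      # assumed
                pad_token_id=tokenizer.pad_token_id,
            )
        progress(0.9, desc="Decoding output...")
        response = tokenizer.decode(outputs[0], skip_special_tokens=True)
        return response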
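
The gr.Interface constructor is likewise only partially visible: the four hidden lines between the last two hunks presumably hold its fn, inputs, outputs, and title arguments. A plausible wiring consistent with the visible description line, with every argument other than description being an assumption:

demo = gr.Interface(
    fn=generate_response,                        # the @spaces.GPU-decorated function
    inputs=gr.Textbox(lines=4, label="Prompt"),  # assumed widget; not shown in the diff
    outputs=gr.Textbox(label="Response"),        # assumed
    title="Plutus Copilot",                      # assumed; the real title is not shown
    description="Write Plutus smart contracts on Cardano blockchain.",
)
demo.launch()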