Spestly committed (verified)
Commit 2c56220 · Parent(s): 330c803

Update app.py

Files changed (1): app.py (+97 -30)
app.py CHANGED
@@ -1,11 +1,12 @@
 import gradio as gr
 import spaces
-from transformers import pipeline
+from transformers import pipeline, AutoTokenizer
 import torch
 from typing import List, Dict, Optional
 
 # Global variable to store pipelines
 model_cache = {}
+tokenizer_cache = {}
 
 # Available models (only Daedalus)
 AVAILABLE_MODELS = {
@@ -14,7 +15,7 @@ AVAILABLE_MODELS = {
 
 @spaces.GPU
 def initialize_model(model_name):
-    global model_cache
+    global model_cache, tokenizer_cache
 
     if model_name not in AVAILABLE_MODELS:
         raise ValueError(f"Model {model_name} not found in available models")
@@ -24,31 +25,80 @@ def initialize_model(model_name):
     # Check if model is already cached
     if model_id not in model_cache:
         try:
+            # Load tokenizer separately to handle chat template properly
+            tokenizer_cache[model_id] = AutoTokenizer.from_pretrained(
+                model_id,
+                trust_remote_code=True
+            )
+
             model_cache[model_id] = pipeline(
                 "text-generation",
                 model=model_id,
+                tokenizer=tokenizer_cache[model_id],
                 torch_dtype=torch.float16,
                 device_map="auto",
                 trust_remote_code=True
             )
         except Exception:
            # Fallback to CPU if GPU fails
+            tokenizer_cache[model_id] = AutoTokenizer.from_pretrained(
+                model_id,
+                trust_remote_code=True
+            )
+
             model_cache[model_id] = pipeline(
                 "text-generation",
                 model=model_id,
+                tokenizer=tokenizer_cache[model_id],
                 torch_dtype=torch.float32,
                 device_map="cpu",
                 trust_remote_code=True
             )
 
-    return model_cache[model_id]
+    return model_cache[model_id], tokenizer_cache[model_id]
+
+def format_conversation_with_template(messages: List[Dict], tokenizer) -> str:
+    """Manually apply the chat template to ensure proper formatting"""
+
+    # Get the chat template
+    if hasattr(tokenizer, 'chat_template') and tokenizer.chat_template:
+        try:
+            # Use the tokenizer's apply_chat_template method
+            formatted = tokenizer.apply_chat_template(
+                messages,
+                tokenize=False,
+                add_generation_prompt=True
+            )
+            return formatted
+        except Exception as e:
+            print(f"Chat template application failed: {e}")
+            # Fall back to manual formatting
+            pass
+
+    # Manual fallback formatting based on your template
+    bos_token = tokenizer.bos_token if tokenizer.bos_token else "<s>"
+    eos_token = tokenizer.eos_token if tokenizer.eos_token else "</s>"
+
+    # Start with system message
+    formatted = f"{bos_token}system\nYou are an AI Coding model called Daedalus, developed by Noema Research{eos_token}"
+
+    # Add each message
+    for msg in messages:
+        role = msg.get('role', 'user')
+        content = msg.get('content', '').strip()
+        formatted += f"{bos_token}{role}\n{content}{eos_token}"
+
+    # Add generation prompt
+    formatted += f"{bos_token}assistant\n"
+
+    return formatted
 
 @spaces.GPU
 def generate_response(message, history, model_name, max_length=512, temperature=0.7, top_p=0.9):
     """Generate response using the selected model"""
 
     try:
-        model_pipe = initialize_model(model_name)
+        model_pipe, tokenizer = initialize_model(model_name)
     except Exception as e:
         return f"Error loading model {model_name}: {str(e)}"
 
@@ -62,48 +112,65 @@ def generate_response(message, history, model_name, max_length=512, temperature=
     messages.append({"role": "user", "content": message})
 
     try:
+        # Method 1: Try using the pipeline with proper chat template
         try:
+            # Format the conversation using the chat template
+            formatted_prompt = format_conversation_with_template(messages, tokenizer)
+
             response = model_pipe(
-                messages,
-                max_length=max_length,
+                formatted_prompt,
+                max_new_tokens=max_length,
                 temperature=temperature,
                 top_p=top_p,
                 do_sample=True,
-                pad_token_id=model_pipe.tokenizer.eos_token_id,
+                pad_token_id=tokenizer.eos_token_id,
+                eos_token_id=tokenizer.eos_token_id,
                 return_full_text=False
             )
-        except:
-            conversation_text = ""
+
+            if isinstance(response, list) and len(response) > 0:
+                generated_text = response[0]['generated_text']
+            else:
+                generated_text = str(response)
+
+            # Clean up the response
+            assistant_response = str(generated_text).strip()
+
+            # Remove any residual formatting artifacts
+            if assistant_response.startswith("assistant\n"):
+                assistant_response = assistant_response[10:].strip()
+
+            return assistant_response
+
+        except Exception as template_error:
+            print(f"Chat template method failed: {template_error}")
+
+            # Method 2: Fallback to simple string formatting
+            conversation_text = "system\nYou are an AI Coding model called Daedalus, developed by Noema Research\n\n"
             for msg in messages:
                 if msg["role"] == "user":
-                    conversation_text += f"User: {msg['content']}\n"
+                    conversation_text += f"user\n{msg['content']}\n\n"
                 else:
-                    conversation_text += f"Assistant: {msg['content']}\n"
-            conversation_text += "Assistant:"
+                    conversation_text += f"assistant\n{msg['content']}\n\n"
+            conversation_text += "assistant\n"
 
             response = model_pipe(
                 conversation_text,
-                max_length=max_length,
+                max_new_tokens=max_length,
                 temperature=temperature,
                 top_p=top_p,
                 do_sample=True,
-                pad_token_id=model_pipe.tokenizer.eos_token_id,
+                pad_token_id=tokenizer.eos_token_id,
                 return_full_text=False
             )
-
-        if isinstance(response, list) and len(response) > 0:
-            generated_text = response[0]['generated_text']
-        else:
-            generated_text = str(response)
-
-        if isinstance(generated_text, list):
-            assistant_response = generated_text[-1]['content']
-        else:
+
+            if isinstance(response, list) and len(response) > 0:
+                generated_text = response[0]['generated_text']
+            else:
+                generated_text = str(response)
+
             assistant_response = str(generated_text).strip()
-        if "Assistant:" in assistant_response:
-            assistant_response = assistant_response.split("Assistant:")[-1].strip()
-
-        return assistant_response
+            return assistant_response
 
     except Exception as e:
         return f"Error generating response: {str(e)}"
@@ -141,8 +208,8 @@ def create_interface():
             maximum=8192,
             value=2048,
             step=50,
-            label="Max Length",
-            info="Maximum length of generated response"
+            label="Max New Tokens",
+            info="Maximum number of new tokens to generate"
         )
         temperature = gr.Slider(
             minimum=0.1,
@@ -207,4 +274,4 @@ def create_interface():
 # Launch the app
 if __name__ == "__main__":
     demo = create_interface()
-    demo.launch(share=True)
+    demo.launch(share=True)
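
The net effect of the commit is that the Space now loads the tokenizer explicitly, renders the conversation with the model's own chat template, and generates with max_new_tokens instead of max_length. Below is a minimal sketch of that flow outside Gradio, not the Space's exact code: the model id "NoemaResearch/Daedalus-1" is a placeholder standing in for whatever AVAILABLE_MODELS actually maps to (the mapping is not shown in this diff), and only standard transformers calls are used (AutoTokenizer.from_pretrained, apply_chat_template, the text-generation pipeline).

    # Sketch of the chat-template prompt flow introduced by this commit.
    from transformers import AutoTokenizer, pipeline

    model_id = "NoemaResearch/Daedalus-1"  # placeholder; substitute the real id from AVAILABLE_MODELS
    tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
    pipe = pipeline(
        "text-generation",
        model=model_id,
        tokenizer=tokenizer,
        device_map="auto",
        trust_remote_code=True,
    )

    messages = [{"role": "user", "content": "Write a function that reverses a string."}]

    # Render the conversation with the model's chat template and append the
    # assistant prompt, mirroring format_conversation_with_template above.
    prompt = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )

    # max_new_tokens bounds only the generated continuation, which is why the
    # commit also relabels the slider from "Max Length" to "Max New Tokens".
    outputs = pipe(
        prompt,
        max_new_tokens=512,
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
        pad_token_id=tokenizer.eos_token_id,
        return_full_text=False,
    )
    print(outputs[0]["generated_text"])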