hamxaameer committed on
Commit
b7bd99f
·
verified ·
1 Parent(s): a95d035

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +66 -13
app.py CHANGED
@@ -104,6 +104,44 @@ Then upload 'best_model_cpu.pkl' to this Space and rename it to 'best_model.pkl'
104
  return ("❌ No model object found inside the pickle. Please ensure the pickle contains a dict with keys "
105
  "'model', 'tokenizer', and 'config' (or the model object itself).")
106
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
107
  # Set model to evaluation mode and move to appropriate device
108
  try:
109
  loaded_model.eval()
@@ -207,23 +245,38 @@ def generate_code_from_pseudo(pseudo_code, max_length, temperature, top_k, top_p
207
  # Format input
208
  prompt = f"<PSEUDO> {pseudo_code.strip()} <SEP> <CODE>"
209
 
210
- # Tokenize
211
  device = next(loaded_model.parameters()).device
212
- inputs = loaded_tokenizer(prompt, return_tensors='pt').to(device)
 
 
 
 
 
 
 
 
 
 
 
 
213
 
214
  # Generate (ensure type safety for parameters)
215
  with torch.no_grad():
216
- outputs = loaded_model.generate(
217
- **inputs,
218
- max_length=int(max_length),
219
- temperature=float(temperature),
220
- top_k=int(top_k),
221
- top_p=float(top_p),
222
- do_sample=True,
223
- num_return_sequences=int(num_sequences),
224
- pad_token_id=loaded_tokenizer.pad_token_id,
225
- eos_token_id=loaded_tokenizer.eos_token_id,
226
- )
 
 
 
227
 
228
  generation_time = time.time() - start_time
229
 
 
104
  return ("❌ No model object found inside the pickle. Please ensure the pickle contains a dict with keys "
105
  "'model', 'tokenizer', and 'config' (or the model object itself).")
106
 
107
+ # Fix tokenizer compatibility issues
108
+ if loaded_tokenizer is not None:
109
+ try:
110
+ # Ensure tokenizer has required attributes for generation
111
+ if not hasattr(loaded_tokenizer, 'pad_token_id') or loaded_tokenizer.pad_token_id is None:
112
+ loaded_tokenizer.pad_token_id = loaded_tokenizer.eos_token_id
113
+
114
+ # Fix missing _unk_token attribute (common in older tokenizers)
115
+ if not hasattr(loaded_tokenizer, '_unk_token'):
116
+ if hasattr(loaded_tokenizer, 'unk_token'):
117
+ loaded_tokenizer._unk_token = loaded_tokenizer.unk_token
118
+ else:
119
+ loaded_tokenizer._unk_token = '<unk>'
120
+
121
+ # Ensure other critical attributes exist
122
+ if not hasattr(loaded_tokenizer, '_bos_token'):
123
+ loaded_tokenizer._bos_token = getattr(loaded_tokenizer, 'bos_token', '<s>')
124
+ if not hasattr(loaded_tokenizer, '_eos_token'):
125
+ loaded_tokenizer._eos_token = getattr(loaded_tokenizer, 'eos_token', '</s>')
126
+
127
+ # Test tokenizer basic functionality
128
+ test_encode = loaded_tokenizer("test", return_tensors='pt')
129
+ test_decode = loaded_tokenizer.decode(test_encode['input_ids'][0])
130
+
131
+ except Exception as tokenizer_error:
132
+ # Tokenizer is broken, try to recreate it
133
+ try:
134
+ from transformers import GPT2Tokenizer
135
+ print(f"⚠️ Loaded tokenizer has issues ({tokenizer_error}), recreating from GPT-2...")
136
+ loaded_tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
137
+
138
+ # Ensure pad token is set
139
+ if loaded_tokenizer.pad_token_id is None:
140
+ loaded_tokenizer.pad_token_id = loaded_tokenizer.eos_token_id
141
+
142
+ except Exception as recreate_error:
143
+ return f"❌ Tokenizer error: {tokenizer_error}\nRecreation failed: {recreate_error}\n\nPlease ensure the tokenizer is compatible with current transformers version."
144
+
145
  # Set model to evaluation mode and move to appropriate device
146
  try:
147
  loaded_model.eval()
 
245
  # Format input
246
  prompt = f"<PSEUDO> {pseudo_code.strip()} <SEP> <CODE>"
247
 
248
+ # Tokenize with error handling
249
  device = next(loaded_model.parameters()).device
250
+ try:
251
+ inputs = loaded_tokenizer(prompt, return_tensors='pt').to(device)
252
+ except Exception as tokenize_error:
253
+ # Try to fix tokenizer on the fly
254
+ try:
255
+ from transformers import GPT2Tokenizer
256
+ print("Fixing tokenizer compatibility...")
257
+ loaded_tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
258
+ if loaded_tokenizer.pad_token_id is None:
259
+ loaded_tokenizer.pad_token_id = loaded_tokenizer.eos_token_id
260
+ inputs = loaded_tokenizer(prompt, return_tensors='pt').to(device)
261
+ except Exception as fix_error:
262
+ return f"❌ Tokenization failed: {tokenize_error}\nFix attempt failed: {fix_error}", "", "", ""
263
 
264
  # Generate (ensure type safety for parameters)
265
  with torch.no_grad():
266
+ try:
267
+ outputs = loaded_model.generate(
268
+ **inputs,
269
+ max_length=int(max_length),
270
+ temperature=float(temperature),
271
+ top_k=int(top_k),
272
+ top_p=float(top_p),
273
+ do_sample=True,
274
+ num_return_sequences=int(num_sequences),
275
+ pad_token_id=loaded_tokenizer.pad_token_id,
276
+ eos_token_id=loaded_tokenizer.eos_token_id,
277
+ )
278
+ except Exception as generation_error:
279
+ return f"❌ Generation failed: {str(generation_error)}", "", "", ""
280
 
281
  generation_time = time.time() - start_time
282