Update app.py

app.py CHANGED

@@ -104,6 +104,44 @@ Then upload 'best_model_cpu.pkl' to this Space and rename it to 'best_model.pkl'
         return ("❌ No model object found inside the pickle. Please ensure the pickle contains a dict with keys "
                 "'model', 'tokenizer', and 'config' (or the model object itself).")

+    # Fix tokenizer compatibility issues
+    if loaded_tokenizer is not None:
+        try:
+            # Ensure tokenizer has required attributes for generation
+            if not hasattr(loaded_tokenizer, 'pad_token_id') or loaded_tokenizer.pad_token_id is None:
+                loaded_tokenizer.pad_token_id = loaded_tokenizer.eos_token_id
+
+            # Fix missing _unk_token attribute (common in older tokenizers)
+            if not hasattr(loaded_tokenizer, '_unk_token'):
+                if hasattr(loaded_tokenizer, 'unk_token'):
+                    loaded_tokenizer._unk_token = loaded_tokenizer.unk_token
+                else:
+                    loaded_tokenizer._unk_token = '<unk>'
+
+            # Ensure other critical attributes exist
+            if not hasattr(loaded_tokenizer, '_bos_token'):
+                loaded_tokenizer._bos_token = getattr(loaded_tokenizer, 'bos_token', '<s>')
+            if not hasattr(loaded_tokenizer, '_eos_token'):
+                loaded_tokenizer._eos_token = getattr(loaded_tokenizer, 'eos_token', '</s>')
+
+            # Test basic tokenizer functionality (encode/decode round trip)
+            test_encode = loaded_tokenizer("test", return_tensors='pt')
+            test_decode = loaded_tokenizer.decode(test_encode['input_ids'][0])
+
+        except Exception as tokenizer_error:
+            # Tokenizer is broken; try to recreate it
+            try:
+                from transformers import GPT2Tokenizer
+                print(f"⚠️ Loaded tokenizer has issues ({tokenizer_error}), recreating from GPT-2...")
+                loaded_tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
+
+                # Ensure pad token is set
+                if loaded_tokenizer.pad_token_id is None:
+                    loaded_tokenizer.pad_token_id = loaded_tokenizer.eos_token_id
+
+            except Exception as recreate_error:
+                return f"❌ Tokenizer error: {tokenizer_error}\nRecreation failed: {recreate_error}\n\nPlease ensure the tokenizer is compatible with the current transformers version."
+
     # Set model to evaluation mode and move to appropriate device
     try:
         loaded_model.eval()
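
Note on the compatibility shim above: `_unk_token`, `_bos_token`, and `_eos_token` are private attributes of transformers tokenizers, so whether an unpickled tokenizer carries them depends on the library version that wrote the pickle. Below is a minimal, self-contained sketch of the same backfill logic, factored into a helper; the `StubTokenizer` class is hypothetical and only mimics the attributes involved, so the sketch runs without a real model pickle.

class StubTokenizer:
    """Hypothetical stand-in for a tokenizer unpickled from an older transformers release."""
    eos_token_id = 50256      # GPT-2's <|endoftext|> id
    pad_token_id = None       # pad token often unset in older pickles
    unk_token = '<unk>'       # public attribute present, private _unk_token missing

def patch_tokenizer(tok):
    # Same backfill logic as the diff above, pulled out so it can be unit-tested.
    if getattr(tok, 'pad_token_id', None) is None:
        tok.pad_token_id = tok.eos_token_id
    if not hasattr(tok, '_unk_token'):
        tok._unk_token = getattr(tok, 'unk_token', '<unk>')
    if not hasattr(tok, '_bos_token'):
        tok._bos_token = getattr(tok, 'bos_token', '<s>')
    if not hasattr(tok, '_eos_token'):
        tok._eos_token = getattr(tok, 'eos_token', '</s>')
    return tok

tok = patch_tokenizer(StubTokenizer())
assert tok.pad_token_id == 50256
assert (tok._unk_token, tok._bos_token, tok._eos_token) == ('<unk>', '<s>', '</s>')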

@@ -207,23 +245,38 @@ def generate_code_from_pseudo(pseudo_code, max_length, temperature, top_k, top_p
     # Format input
     prompt = f"<PSEUDO> {pseudo_code.strip()} <SEP> <CODE>"

-    # Tokenize
+    # Tokenize with error handling
     device = next(loaded_model.parameters()).device
+    try:
+        inputs = loaded_tokenizer(prompt, return_tensors='pt').to(device)
+    except Exception as tokenize_error:
+        # Try to fix the tokenizer on the fly
+        try:
+            from transformers import GPT2Tokenizer
+            print("Fixing tokenizer compatibility...")
+            loaded_tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
+            if loaded_tokenizer.pad_token_id is None:
+                loaded_tokenizer.pad_token_id = loaded_tokenizer.eos_token_id
+            inputs = loaded_tokenizer(prompt, return_tensors='pt').to(device)
+        except Exception as fix_error:
+            return f"❌ Tokenization failed: {tokenize_error}\nFix attempt failed: {fix_error}", "", "", ""

     # Generate (ensure type safety for parameters)
     with torch.no_grad():
+        try:
+            outputs = loaded_model.generate(
+                **inputs,
+                max_length=int(max_length),
+                temperature=float(temperature),
+                top_k=int(top_k),
+                top_p=float(top_p),
+                do_sample=True,
+                num_return_sequences=int(num_sequences),
+                pad_token_id=loaded_tokenizer.pad_token_id,
+                eos_token_id=loaded_tokenizer.eos_token_id,
+            )
+        except Exception as generation_error:
+            return f"❌ Generation failed: {str(generation_error)}", "", "", ""

     generation_time = time.time() - start_time
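
To exercise this generation path without the Space's pickle, here is a hedged end-to-end sketch that substitutes the stock GPT-2 checkpoint for 'best_model.pkl' (an assumption: the diff's fallback tokenizer is GPT-2, but the pickled model's architecture is not shown here). The prompt format and generate() arguments mirror the hunk above; the stand-in model was never trained on the <PSEUDO>/<SEP>/<CODE> markers, so this checks the plumbing, not output quality.

import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Stand-in for the unpickled model/tokenizer pair (assumption: GPT-2 family).
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id
model = GPT2LMHeadModel.from_pretrained('gpt2').eval()

# Same prompt format and generation arguments as the diff.
prompt = "<PSEUDO> read n, then print the sum 1 + 2 + ... + n <SEP> <CODE>"
device = next(model.parameters()).device
inputs = tokenizer(prompt, return_tensors='pt').to(device)

with torch.no_grad():
    outputs = model.generate(
        **inputs,
        max_length=128,
        temperature=0.8,
        top_k=50,
        top_p=0.95,
        do_sample=True,
        num_return_sequences=1,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id,
    )

print(tokenizer.decode(outputs[0], skip_special_tokens=True))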