hamxaameer committed on
Commit
a52cd7c
·
verified ·
1 Parent(s): e519124

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +96 -15
app.py CHANGED
@@ -377,32 +377,104 @@ def generate_code_from_pseudo(pseudo_code, max_length, temperature, top_k, top_p
377
  generated_codes.append(f"# Generation {i+1} failed: No valid tokens")
378
  continue
379
 
380
- # Decode with skip_special_tokens=True for cleaner output
381
  try:
 
382
  generated = loaded_tokenizer.decode(valid_tokens, skip_special_tokens=False)
 
 
 
 
 
 
 
 
 
383
  except Exception as decode_error:
384
- # Fallback: try with skip_special_tokens=True
385
  try:
386
  generated = loaded_tokenizer.decode(valid_tokens, skip_special_tokens=True)
 
 
 
 
387
  except Exception as decode_error2:
388
- # Last resort: convert tokens to string manually
389
- generated = f"# Decode failed: {str(decode_error2)}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
390
 
391
  # Handle None result from decode
392
  if generated is None:
393
  generated = f"# Generation {i+1}: Decode returned None"
394
 
395
- # Extract code part
396
- if '<CODE>' in generated:
397
- code = generated.split('<CODE>')[-1].strip()
398
- # Remove special tokens
399
- code = code.replace('<PAD>', '').replace('<SEP>', '').replace('</s>', '').replace('<s>', '').strip()
400
- else:
401
- code = generated.strip()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
402
 
403
- # Ensure we have some content
404
- if not code or code.isspace():
405
- code = f"# Generated sequence {i+1} was empty"
 
 
 
 
 
 
 
 
 
 
 
 
406
 
407
  generated_codes.append(code)
408
 
@@ -413,7 +485,16 @@ def generate_code_from_pseudo(pseudo_code, max_length, temperature, top_k, top_p
413
 
414
  # Ensure we have at least one result
415
  if not generated_codes:
416
- generated_codes = ["# No valid generations produced"]
 
 
 
 
 
 
 
 
 
417
 
418
  # Use the first generated code as primary output
419
  primary_code = generated_codes[0] if generated_codes else "# No code generated"
 
377
  generated_codes.append(f"# Generation {i+1} failed: No valid tokens")
378
  continue
379
 
380
+ # Decode with comprehensive error handling
381
  try:
382
+ # First attempt: decode with skip_special_tokens=False
383
  generated = loaded_tokenizer.decode(valid_tokens, skip_special_tokens=False)
384
+
385
+ # Check if decode returned None or contains None
386
+ if generated is None:
387
+ raise ValueError("Tokenizer decode returned None")
388
+
389
+ # Check for None in the string (shouldn't happen but be safe)
390
+ if 'None' in str(generated) or '\x00' in str(generated):
391
+ raise ValueError("Decoded string contains invalid characters")
392
+
393
  except Exception as decode_error:
394
+ # Second attempt: decode with skip_special_tokens=True
395
  try:
396
  generated = loaded_tokenizer.decode(valid_tokens, skip_special_tokens=True)
397
+ if generated is None:
398
+ raise ValueError("Tokenizer decode (skip_special) returned None")
399
+ if 'None' in str(generated) or '\x00' in str(generated):
400
+ raise ValueError("Decoded string contains invalid characters")
401
  except Exception as decode_error2:
402
+ # Third attempt: manual token-to-string conversion
403
+ try:
404
+ # Convert tokens to string manually using vocab
405
+ if hasattr(loaded_tokenizer, 'get_vocab'):
406
+ vocab = loaded_tokenizer.get_vocab()
407
+ inv_vocab = {v: k for k, v in vocab.items()}
408
+
409
+ # Convert tokens to strings, skip unknown tokens
410
+ token_strings = []
411
+ for token_id in valid_tokens:
412
+ if token_id in inv_vocab:
413
+ token_str = inv_vocab[token_id]
414
+ # Skip special tokens that might cause issues
415
+ if token_str not in ['<pad>', '<unk>', '<mask>', '<s>', '</s>', '<PAD>', '<SEP>', '<CODE>', '<PSEUDO>']:
416
+ token_strings.append(token_str)
417
+
418
+ generated = ''.join(token_strings)
419
+
420
+ if not generated or generated.isspace():
421
+ raise ValueError("Manual conversion produced empty string")
422
+ else:
423
+ raise ValueError("Tokenizer has no get_vocab method")
424
+
425
+ except Exception as manual_error:
426
+ # Final fallback: create a safe representation
427
+ generated = f"# Decode failed: {str(decode_error)}\n# Manual conversion failed: {str(manual_error)}\n# Raw tokens: {valid_tokens[:10]}..."
428
+
429
+ # Final safety check: ensure we have a string
430
+ if not isinstance(generated, str):
431
+ generated = str(generated) if generated is not None else "# Decode returned non-string object"
432
 
433
  # Handle None result from decode
434
  if generated is None:
435
  generated = f"# Generation {i+1}: Decode returned None"
436
 
437
+ # Extract code part with safety checks
438
+ try:
439
+ if '<CODE>' in generated:
440
+ code_parts = generated.split('<CODE>')
441
+ if len(code_parts) > 1:
442
+ code = code_parts[-1].strip()
443
+ else:
444
+ code = generated.strip()
445
+ else:
446
+ code = generated.strip()
447
+
448
+ # Remove special tokens safely
449
+ special_tokens = ['<PAD>', '<SEP>', '</s>', '<s>', '<unk>', '<mask>', '<|endoftext|>']
450
+ for token in special_tokens:
451
+ code = code.replace(token, '')
452
+
453
+ # Clean up extra whitespace
454
+ code = ' '.join(code.split())
455
+
456
+ # Ensure we have some content
457
+ if not code or code.isspace():
458
+ code = f"# Generated sequence {i+1} was empty after cleaning"
459
+
460
+ except Exception as extract_error:
461
+ code = f"# Error extracting code from sequence {i+1}: {str(extract_error)}"
462
 
463
+ # Final validation: ensure code is meaningful
464
+ try:
465
+ # Check if code contains at least some alphanumeric characters
466
+ if not any(c.isalnum() for c in code):
467
+ code = f"# Generated sequence {i+1} contains no readable content"
468
+ elif len(code) < 5: # Too short to be meaningful
469
+ code = f"# Generated sequence {i+1} too short: {code}"
470
+ elif code.count('#') > len(code) * 0.8: # Mostly error messages
471
+ code = f"# Generated sequence {i+1} mostly errors: {code[:50]}..."
472
+ else:
473
+ # Looks good, keep as is
474
+ pass
475
+
476
+ except Exception as validation_error:
477
+ code = f"# Validation error for sequence {i+1}: {str(validation_error)}"
478
 
479
  generated_codes.append(code)
480
 
 
485
 
486
  # Ensure we have at least one result
487
  if not generated_codes:
488
+ generated_codes = ["# No valid generations produced - check model and tokenizer compatibility"]
489
+
490
+ # Log generation summary for debugging
491
+ valid_generations = [code for code in generated_codes if not code.startswith('#')]
492
+ error_generations = [code for code in generated_codes if code.startswith('#')]
493
+
494
+ if error_generations:
495
+ print(f"Generation completed: {len(valid_generations)} valid, {len(error_generations)} errors")
496
+ for error in error_generations[:3]: # Log first 3 errors
497
+ print(f" Error: {error[:100]}...")
498
 
499
  # Use the first generated code as primary output
500
  primary_code = generated_codes[0] if generated_codes else "# No code generated"