hamxaameer committed on
Commit
16b9485
·
verified ·
1 Parent(s): 0958ebc

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +13 -7
app.py CHANGED
@@ -386,13 +386,16 @@ def generate_code_from_pseudo(pseudo_code, max_length, temperature, top_k, top_p
386
  if generated is None:
387
  raise ValueError("Tokenizer decode returned None")
388
 
389
- # Clean up common GPT-2 artifacts
390
  generated = generated.replace('Ġ', ' ').replace('▁', ' ') # Handle different space tokens
391
  generated = ' '.join(generated.split()) # Normalize whitespace
392
 
 
 
 
393
  # Check for gibberish (too many special characters)
394
- special_ratio = sum(1 for c in generated if not c.isalnum() and c not in ' \n\t.,;()[]{}+-*/=<>!&|^~%#@?:\'\"') / max(len(generated), 1)
395
- if special_ratio > 0.5: # More than 50% special chars = likely gibberish
396
  raise ValueError("Decoded output appears to be gibberish")
397
 
398
  except Exception as decode_error:
@@ -407,8 +410,8 @@ def generate_code_from_pseudo(pseudo_code, max_length, temperature, top_k, top_p
407
  generated = ' '.join(generated.split())
408
 
409
  # Check for gibberish again
410
- special_ratio = sum(1 for c in generated if not c.isalnum() and c not in ' \n\t.,;()[]{}+-*/=<>!&|^~%#@?:\'\"') / max(len(generated), 1)
411
- if special_ratio > 0.5:
412
  raise ValueError("Decoded output still appears to be gibberish")
413
 
414
  except Exception as decode_error2:
@@ -509,8 +512,11 @@ def generate_code_from_pseudo(pseudo_code, max_length, temperature, top_k, top_p
509
 
510
  # Final validation: ensure code is meaningful
511
  try:
512
- # Check if code contains at least some alphanumeric characters
513
- if not any(c.isalnum() for c in code):
 
 
 
514
  code = f"# Generated sequence {i+1} contains no readable content"
515
  elif len(code) < 5: # Too short to be meaningful
516
  code = f"# Generated sequence {i+1} too short: {code}"
 
386
  if generated is None:
387
  raise ValueError("Tokenizer decode returned None")
388
 
389
+ # Clean up common GPT-2 artifacts - more aggressive cleaning
390
  generated = generated.replace('Ġ', ' ').replace('▁', ' ') # Handle different space tokens
391
  generated = ' '.join(generated.split()) # Normalize whitespace
392
 
393
+ # Additional cleaning for common BPE artifacts
394
+ generated = generated.replace('<0x0A>', '\n').replace('<0x20>', ' ')
395
+
396
  # Check for gibberish (too many special characters)
397
+ special_ratio = sum(1 for c in generated if not c.isalnum() and c not in ' \n\t.,;()[]{}+-*/=<>!&|^~%#@?:\'\"\\') / max(len(generated), 1)
398
+ if special_ratio > 0.7: # More than 70% special chars = likely gibberish
399
  raise ValueError("Decoded output appears to be gibberish")
400
 
401
  except Exception as decode_error:
 
410
  generated = ' '.join(generated.split())
411
 
412
  # Check for gibberish again
413
+ special_ratio = sum(1 for c in generated if not c.isalnum() and c not in ' \n\t.,;()[]{}+-*/=<>!&|^~%#@?:\'\"\\') / max(len(generated), 1)
414
+ if special_ratio > 0.7:
415
  raise ValueError("Decoded output still appears to be gibberish")
416
 
417
  except Exception as decode_error2:
 
512
 
513
  # Final validation: ensure code is meaningful
514
  try:
515
+ # Check if code contains at least some alphanumeric characters or code keywords
516
+ has_alnum = any(c.isalnum() for c in code)
517
+ has_code_indicators = any(keyword in code.lower() for keyword in ['def ', 'class ', 'import ', 'if ', 'for ', 'while ', 'return ', 'print(', 'bool', 'int', 'str', 'list'])
518
+
519
+ if not has_alnum and not has_code_indicators:
520
  code = f"# Generated sequence {i+1} contains no readable content"
521
  elif len(code) < 5: # Too short to be meaningful
522
  code = f"# Generated sequence {i+1} too short: {code}"