Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -386,13 +386,16 @@ def generate_code_from_pseudo(pseudo_code, max_length, temperature, top_k, top_p
|
|
| 386 |
if generated is None:
|
| 387 |
raise ValueError("Tokenizer decode returned None")
|
| 388 |
|
| 389 |
-
# Clean up common GPT-2 artifacts
|
| 390 |
generated = generated.replace('Ġ', ' ').replace('▁', ' ') # Handle different space tokens
|
| 391 |
generated = ' '.join(generated.split()) # Normalize whitespace
|
| 392 |
|
|
|
|
|
|
|
|
|
|
| 393 |
# Check for gibberish (too many special characters)
|
| 394 |
-
special_ratio = sum(1 for c in generated if not c.isalnum() and c not in ' \n\t.,;()[]{}+-*/=<>!&|^~%#@?:\'\"') / max(len(generated), 1)
|
| 395 |
-
if special_ratio > 0.
|
| 396 |
raise ValueError("Decoded output appears to be gibberish")
|
| 397 |
|
| 398 |
except Exception as decode_error:
|
|
@@ -407,8 +410,8 @@ def generate_code_from_pseudo(pseudo_code, max_length, temperature, top_k, top_p
|
|
| 407 |
generated = ' '.join(generated.split())
|
| 408 |
|
| 409 |
# Check for gibberish again
|
| 410 |
-
special_ratio = sum(1 for c in generated if not c.isalnum() and c not in ' \n\t.,;()[]{}+-*/=<>!&|^~%#@?:\'\"') / max(len(generated), 1)
|
| 411 |
-
if special_ratio > 0.
|
| 412 |
raise ValueError("Decoded output still appears to be gibberish")
|
| 413 |
|
| 414 |
except Exception as decode_error2:
|
|
@@ -509,8 +512,11 @@ def generate_code_from_pseudo(pseudo_code, max_length, temperature, top_k, top_p
|
|
| 509 |
|
| 510 |
# Final validation: ensure code is meaningful
|
| 511 |
try:
|
| 512 |
-
# Check if code contains at least some alphanumeric characters
|
| 513 |
-
|
|
|
|
|
|
|
|
|
|
| 514 |
code = f"# Generated sequence {i+1} contains no readable content"
|
| 515 |
elif len(code) < 5: # Too short to be meaningful
|
| 516 |
code = f"# Generated sequence {i+1} too short: {code}"
|
|
|
|
| 386 |
if generated is None:
|
| 387 |
raise ValueError("Tokenizer decode returned None")
|
| 388 |
|
| 389 |
+
# Clean up common GPT-2 artifacts - more aggressive cleaning
|
| 390 |
generated = generated.replace('Ġ', ' ').replace('▁', ' ') # Handle different space tokens
|
| 391 |
generated = ' '.join(generated.split()) # Normalize whitespace
|
| 392 |
|
| 393 |
+
# Additional cleaning for common BPE artifacts
|
| 394 |
+
generated = generated.replace('<0x0A>', '\n').replace('<0x20>', ' ')
|
| 395 |
+
|
| 396 |
# Check for gibberish (too many special characters)
|
| 397 |
+
special_ratio = sum(1 for c in generated if not c.isalnum() and c not in ' \n\t.,;()[]{}+-*/=<>!&|^~%#@?:\'\"\\') / max(len(generated), 1)
|
| 398 |
+
if special_ratio > 0.7: # More than 70% special chars = likely gibberish
|
| 399 |
raise ValueError("Decoded output appears to be gibberish")
|
| 400 |
|
| 401 |
except Exception as decode_error:
|
|
|
|
| 410 |
generated = ' '.join(generated.split())
|
| 411 |
|
| 412 |
# Check for gibberish again
|
| 413 |
+
special_ratio = sum(1 for c in generated if not c.isalnum() and c not in ' \n\t.,;()[]{}+-*/=<>!&|^~%#@?:\'\"\\') / max(len(generated), 1)
|
| 414 |
+
if special_ratio > 0.7:
|
| 415 |
raise ValueError("Decoded output still appears to be gibberish")
|
| 416 |
|
| 417 |
except Exception as decode_error2:
|
|
|
|
| 512 |
|
| 513 |
# Final validation: ensure code is meaningful
|
| 514 |
try:
|
| 515 |
+
# Check if code contains at least some alphanumeric characters or code keywords
|
| 516 |
+
has_alnum = any(c.isalnum() for c in code)
|
| 517 |
+
has_code_indicators = any(keyword in code.lower() for keyword in ['def ', 'class ', 'import ', 'if ', 'for ', 'while ', 'return ', 'print(', 'bool', 'int', 'str', 'list'])
|
| 518 |
+
|
| 519 |
+
if not has_alnum and not has_code_indicators:
|
| 520 |
code = f"# Generated sequence {i+1} contains no readable content"
|
| 521 |
elif len(code) < 5: # Too short to be meaningful
|
| 522 |
code = f"# Generated sequence {i+1} too short: {code}"
|