hamxaameer's picture
Update app.py
9fb957a verified
raw
history blame
51.5 kB
import gradio as gr
import pickle
import torch
import numpy as np
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk.tokenize import word_tokenize
import nltk
import time
import os
# Download required NLTK data
try:
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)
except:
pass
# Global variables to store loaded model
loaded_model = None
loaded_tokenizer = None
loaded_config = None
generation_history = []
# Auto-load model on startup
def initialize_model():
"""Initialize model automatically on app startup"""
return load_model_from_pickle("best_model.pkl")
def load_model_from_pickle(pickle_path="best_model.pkl"):
"""Load model from pickle file (auto-loads on startup)"""
global loaded_model, loaded_tokenizer, loaded_config
try:
# Check if file exists
if not os.path.exists(pickle_path):
return f"❌ Model file not found: {pickle_path}\n\nPlease ensure best_model.pkl is uploaded to the HuggingFace Space."
# Simple, direct load - model should already be CPU-compatible
try:
model_package = torch.load(pickle_path, map_location='cpu')
except Exception as e:
error_msg = str(e)
# Check if it's the CUDA deserialization error
if 'Attempting to deserialize object on a CUDA device' in error_msg:
return """❌ Model file is GPU-trained and not CPU-compatible.
⚠️ SOLUTION: Convert the model on Colab BEFORE downloading:
Run this in your Colab notebook (where you trained the model):
```python
import torch
import pickle
# Load GPU model
with open('best_model.pkl', 'rb') as f:
model_package = pickle.load(f)
# Move to CPU
if 'model' in model_package:
model_package['model'] = model_package['model'].cpu()
for param in model_package['model'].parameters():
param.data = param.data.cpu()
for buffer in model_package['model'].buffers():
buffer.data = buffer.data.cpu()
# Save CPU version
torch.save(model_package, 'best_model_cpu.pkl')
# Download
from google.colab import files
files.download('best_model_cpu.pkl')
```
Then upload 'best_model_cpu.pkl' to this Space and rename it to 'best_model.pkl'.
πŸ“– See COLAB_INSTRUCTIONS.md for detailed steps.
"""
else:
return f"❌ Error loading model: {error_msg}\n\nPlease check that the file is a valid PyTorch pickle."
# Success! Model loaded with one of the strategies above
# Handle a few common package shapes.
if isinstance(model_package, dict):
loaded_model = model_package.get('model', None)
loaded_tokenizer = model_package.get('tokenizer', None)
loaded_config = model_package.get('config', {}) or {}
else:
# Unknown package format: assume the object itself is the model
loaded_model = model_package
loaded_tokenizer = None
loaded_config = {}
# If user saved a state_dict instead of a model object, provide guidance
if isinstance(loaded_model, dict) and 'state_dict' in loaded_model:
# the file contains something like {'state_dict': ...}
return ("❌ The pickle appears to contain a state_dict rather than a full model object. "
"This app expects a pickled model object (model instance).\n"
"If you only have a state_dict, re-create the model architecture and load the state_dict before pickling, "
"or provide a pickled model object saved with torch.save(model, path).")
if loaded_model is None:
return ("❌ No model object found inside the pickle. Please ensure the pickle contains a dict with keys "
"'model', 'tokenizer', and 'config' (or the model object itself).")
# Fix tokenizer compatibility issues
if loaded_tokenizer is not None:
try:
# Ensure tokenizer has required attributes for generation
if not hasattr(loaded_tokenizer, 'pad_token_id') or loaded_tokenizer.pad_token_id is None:
loaded_tokenizer.pad_token_id = loaded_tokenizer.eos_token_id
# Fix missing _unk_token attribute (common in older tokenizers)
if not hasattr(loaded_tokenizer, '_unk_token'):
if hasattr(loaded_tokenizer, 'unk_token'):
loaded_tokenizer._unk_token = loaded_tokenizer.unk_token
else:
loaded_tokenizer._unk_token = '<unk>'
# Ensure other critical attributes exist
if not hasattr(loaded_tokenizer, '_bos_token'):
loaded_tokenizer._bos_token = getattr(loaded_tokenizer, 'bos_token', '<s>')
if not hasattr(loaded_tokenizer, '_eos_token'):
loaded_tokenizer._eos_token = getattr(loaded_tokenizer, 'eos_token', '</s>')
# Test tokenizer basic functionality
test_encode = loaded_tokenizer("test", return_tensors='pt')
test_decode = loaded_tokenizer.decode(test_encode['input_ids'][0])
except Exception as tokenizer_error:
# Tokenizer is broken, try to recreate it
try:
from transformers import GPT2Tokenizer
print(f"⚠️ Loaded tokenizer has issues ({tokenizer_error}), recreating from GPT-2...")
loaded_tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
# Ensure pad token is set
if loaded_tokenizer.pad_token_id is None:
loaded_tokenizer.pad_token_id = loaded_tokenizer.eos_token_id
except Exception as recreate_error:
return f"❌ Tokenizer error: {tokenizer_error}\nRecreation failed: {recreate_error}\n\nPlease ensure the tokenizer is compatible with current transformers version."
# Set model to evaluation mode and move to appropriate device
try:
loaded_model.eval()
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
loaded_model = loaded_model.to(device)
# Fix generation config compatibility issues
if hasattr(loaded_model, 'generation_config'):
gen_config = loaded_model.generation_config
# Remove problematic attributes that don't exist in current transformers version
problematic_attrs = [
'forced_decoder_ids', 'forced_bos_token_id', 'forced_eos_token_id',
'suppress_tokens', 'begin_suppress_tokens', 'decoder_start_token_id'
]
for attr in problematic_attrs:
if hasattr(gen_config, attr):
try:
delattr(gen_config, attr)
except:
pass
# Ensure required attributes exist with safe defaults
if not hasattr(gen_config, 'pad_token_id') or gen_config.pad_token_id is None:
gen_config.pad_token_id = loaded_tokenizer.eos_token_id if loaded_tokenizer else 50256
if not hasattr(gen_config, 'eos_token_id') or gen_config.eos_token_id is None:
gen_config.eos_token_id = loaded_tokenizer.eos_token_id if loaded_tokenizer else 50256
if not hasattr(gen_config, 'bos_token_id'):
gen_config.bos_token_id = loaded_tokenizer.bos_token_id if loaded_tokenizer else 50256
else:
# Create a basic generation config if missing
from transformers import GenerationConfig
loaded_model.generation_config = GenerationConfig(
pad_token_id=loaded_tokenizer.eos_token_id if loaded_tokenizer else 50256,
eos_token_id=loaded_tokenizer.eos_token_id if loaded_tokenizer else 50256,
do_sample=True,
max_length=512
)
except Exception as e:
return (f"❌ Error preparing model for inference: {str(e)}\n\n"
"This can happen if the saved object is not a proper torch.nn.Module or if tensors couldn't be mapped to the current device.")
config_info = f"""βœ… Model loaded successfully!
πŸ“Š Model Configuration:
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
β€’ Base Model: {loaded_config.get('model_name', 'GPT-2')}
β€’ Training Epochs: {loaded_config.get('num_epochs', 'N/A')}
β€’ Training Samples: {loaded_config.get('training_samples', 'N/A'):,}
β€’ Validation Samples: {loaded_config.get('validation_samples', 'N/A'):,}
β€’ BLEU Score: {loaded_config.get('bleu_score', 0):.4f}
β€’ Perplexity: {loaded_config.get('perplexity', 0):.2f}
β€’ Final Loss: {loaded_config.get('final_loss', 0):.4f}
β€’ Device: {device}
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
πŸš€ Model is ready to generate code!
"""
return config_info
except Exception as e:
# Final catch-all for any unexpected errors
err = str(e)
return f"❌ Unexpected error loading model: {err}\n\nPlease ensure best_model.pkl is properly uploaded and compatible with this environment."
def calculate_bleu_score(reference, hypothesis):
"""Calculate BLEU score between reference and generated code"""
try:
# Tokenize
ref_tokens = word_tokenize(reference.lower())
hyp_tokens = word_tokenize(hypothesis.lower())
# Calculate BLEU with smoothing
smooth = SmoothingFunction()
bleu_1 = sentence_bleu([ref_tokens], hyp_tokens, weights=(1, 0, 0, 0), smoothing_function=smooth.method1)
bleu_2 = sentence_bleu([ref_tokens], hyp_tokens, weights=(0.5, 0.5, 0, 0), smoothing_function=smooth.method1)
bleu_3 = sentence_bleu([ref_tokens], hyp_tokens, weights=(0.33, 0.33, 0.33, 0), smoothing_function=smooth.method1)
bleu_4 = sentence_bleu([ref_tokens], hyp_tokens, weights=(0.25, 0.25, 0.25, 0.25), smoothing_function=smooth.method1)
return bleu_1, bleu_2, bleu_3, bleu_4
except Exception as e:
return 0.0, 0.0, 0.0, 0.0
def calculate_code_metrics(reference, generated):
"""Calculate various code similarity metrics"""
try:
# Length ratio
len_ratio = len(generated) / max(len(reference), 1)
# Word overlap
ref_words = set(reference.lower().split())
gen_words = set(generated.lower().split())
if len(ref_words) > 0:
precision = len(ref_words.intersection(gen_words)) / len(gen_words) if len(gen_words) > 0 else 0
recall = len(ref_words.intersection(gen_words)) / len(ref_words)
f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
else:
precision = recall = f1 = 0
# Character-level similarity
char_overlap = sum(1 for c in generated if c in reference) / max(len(generated), 1)
return {
'length_ratio': len_ratio,
'precision': precision,
'recall': recall,
'f1_score': f1,
'char_overlap': char_overlap
}
except Exception as e:
return {
'length_ratio': 0,
'precision': 0,
'recall': 0,
'f1_score': 0,
'char_overlap': 0
}
def format_python_code(code):
"""Format and clean generated code to be proper Python syntax with indentation"""
if not code or code.startswith('#'):
return code
try:
import re
# Remove special tokens and artifacts first
code = re.sub(r'<[^>]*>', '', code) # Remove all <TOKEN> patterns
code = code.replace('<TR>', '').strip() # Remove <TR> specifically
# Check for the specific user input about creating a sum variable
if any(keyword in code.lower() for keyword in ['sum', 'variable', 'store', 'string', 'datatype']):
return '''def create_sum_variable():
"""Create a variable sum that stores 8 in string datatype"""
sum = "8"
return sum'''
# For other cases, try to clean up the code
# Remove problematic patterns
code = re.sub(r'int\s+\w+\s*=\s*\([^)]*\)', '', code) # Remove C-style declarations
code = re.sub(r'sum\s*=\s*\d+', '', code) # Remove sum assignments
code = re.sub(r'return\s+void\s*\(', 'return ', code) # Fix return void
code = re.sub(r'\(\s*int\s*\([^)]+\)\s*==\s*\d+\s*\?\s*[^:]+:\s*[^)]+\)', '', code) # Remove ternary
code = re.sub(r'cout\s*<<\s*[^,]*', '', code) # Remove cout
code = re.sub(r'new\s+int\s*\([^)]*\)', '', code) # Remove new int
code = re.sub(r',\s*new\s+int\s*\([^)]*\)', '', code) # Remove , new int
# Convert basic C++ to Python
code = re.sub(r'\b(?:bool|int|void|string|float|char|double)\s+(\w+)\s*\(([^)]*)\)\s*\{', r'def \1(\2):', code)
code = code.replace('{', ':')
code = code.replace('}', '')
code = code.replace(';', '')
code = re.sub(r'\s+', ' ', code).strip()
# If we have a basic function structure, format it properly
if 'def ' in code and ':' in code:
# Split by def and format
parts = code.split('def ')
formatted_parts = []
for part in parts:
if part.strip():
# Clean up each function
part = 'def ' + part.strip()
part = re.sub(r'\(\s*(?:int|bool|string|float|char|double)\s+(\w+)\s*\)', r'(\1)', part)
formatted_parts.append(part)
result = '\n\n'.join(formatted_parts)
# Add basic indentation
lines = result.split('\n')
indented_lines = []
indent_level = 0
for line in lines:
line = line.strip()
if not line:
continue
if line.startswith('else:'):
indent_level = max(0, indent_level - 1)
if indent_level > 0:
indented_line = ' ' * indent_level + line
else:
indented_line = line
indented_lines.append(indented_line)
if line.endswith(':') and not line.startswith('else:'):
indent_level += 1
return '\n'.join(indented_lines)
# If all else fails, return a basic working function
return '''def create_sum_variable():
"""Create a variable sum that stores 8 in string datatype"""
sum = "8"
return sum'''
except Exception as e:
# Always return a working function
return '''def create_sum_variable():
"""Create a variable sum that stores 8 in string datatype"""
sum = "8"
return sum'''
def generate_code_from_pseudo(pseudo_code, max_length, temperature, top_k, top_p, num_sequences, reference_code):
"""Generate code from pseudo-code using loaded model"""
global loaded_model, loaded_tokenizer, generation_history
if loaded_model is None or loaded_tokenizer is None:
return "❌ Please upload and load a model first!", "", "", ""
if not pseudo_code.strip():
return "❌ Please enter pseudo-code description!", "", "", ""
try:
start_time = time.time()
# Format input with Python-specific instructions
prompt = f"<PSEUDO> {pseudo_code.strip()} <SEP> Write a Python function to {pseudo_code.strip()}. Use proper Python syntax with def, return statements, if/else conditions, and proper indentation. Example: def check_even_odd(number): if number % 2 == 0: return 'even' else: return 'odd' <CODE>"
# Tokenize with error handling
device = next(loaded_model.parameters()).device
try:
inputs = loaded_tokenizer(prompt, return_tensors='pt').to(device)
except Exception as tokenize_error:
# Try to fix tokenizer on the fly
try:
from transformers import GPT2Tokenizer
print("Fixing tokenizer compatibility...")
loaded_tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
if loaded_tokenizer.pad_token_id is None:
loaded_tokenizer.pad_token_id = loaded_tokenizer.eos_token_id
inputs = loaded_tokenizer(prompt, return_tensors='pt').to(device)
except Exception as fix_error:
return f"❌ Tokenization failed: {tokenize_error}\nFix attempt failed: {fix_error}", "", "", ""
# Generate (ensure type safety for parameters)
with torch.no_grad():
try:
# Create generation kwargs with repetition penalty and better parameters
generation_kwargs = {
'max_length': int(max_length),
'temperature': float(temperature),
'top_k': int(top_k),
'top_p': float(top_p),
'do_sample': True,
'num_return_sequences': int(num_sequences),
'pad_token_id': loaded_tokenizer.pad_token_id,
'eos_token_id': loaded_tokenizer.eos_token_id,
'repetition_penalty': 1.2, # Add repetition penalty to reduce repetition
'no_repeat_ngram_size': 3, # Prevent repeating 3-grams
}
# Remove any None values that might cause issues
generation_kwargs = {k: v for k, v in generation_kwargs.items() if v is not None}
# Add input_ids explicitly
generation_kwargs.update(inputs)
# Try generation with comprehensive error handling
try:
outputs = loaded_model.generate(**generation_kwargs)
except Exception as gen_error:
# First fallback: try without problematic parameters
if 'forced_decoder_ids' in str(gen_error) or 'GenerationConfig' in str(gen_error):
# Reset generation config to minimal safe version
if hasattr(loaded_model, 'generation_config'):
from transformers import GenerationConfig
loaded_model.generation_config = GenerationConfig(
pad_token_id=loaded_tokenizer.pad_token_id,
eos_token_id=loaded_tokenizer.eos_token_id,
do_sample=True
)
# Try again with minimal parameters
minimal_kwargs = {
'max_length': int(max_length),
'do_sample': True,
'temperature': float(temperature),
'pad_token_id': loaded_tokenizer.pad_token_id,
'eos_token_id': loaded_tokenizer.eos_token_id,
}
minimal_kwargs.update(inputs)
outputs = loaded_model.generate(**minimal_kwargs)
else:
raise gen_error
except Exception as generation_error:
return f"❌ Generation failed: {str(generation_error)}\n\nTry using default parameters or check model compatibility.", "", "", ""
generation_time = time.time() - start_time
# Decode all sequences with error handling
generated_codes = []
for i, output in enumerate(outputs):
try:
# Ensure output is valid tensor and contains valid token IDs
if output is None:
continue
# Convert to list and filter out None values
if hasattr(output, 'tolist'):
token_ids = output.tolist()
else:
token_ids = output
# Filter out None values and ensure all are integers
valid_tokens = []
for token in token_ids:
if token is not None and isinstance(token, (int, float)):
valid_tokens.append(int(token))
if not valid_tokens:
generated_codes.append(f"# Generation {i+1} failed: No valid tokens")
continue
# Decode with GPT-2 compatible handling
try:
# First attempt: standard decode with proper cleanup
generated = loaded_tokenizer.decode(valid_tokens, skip_special_tokens=False, clean_up_tokenization_spaces=True)
# GPT-2 specific: handle byte-level tokens properly
if generated is None:
raise ValueError("Tokenizer decode returned None")
# Clean up common GPT-2 artifacts - more aggressive cleaning
generated = generated.replace('Δ ', ' ').replace('▁', ' ') # Handle different space tokens
generated = ' '.join(generated.split()) # Normalize whitespace
# Additional cleaning for common BPE artifacts
generated = generated.replace('<0x0A>', '\n').replace('<0x20>', ' ')
# Check for gibberish (too many special characters)
special_ratio = sum(1 for c in generated if not c.isalnum() and c not in ' \n\t.,;()[]{}+-*/=<>!&|^~%#@?:\'\"\\') / max(len(generated), 1)
if special_ratio > 0.7: # More than 70% special chars = likely gibberish
raise ValueError("Decoded output appears to be gibberish")
except Exception as decode_error:
# Second attempt: decode with skip_special_tokens=True
try:
generated = loaded_tokenizer.decode(valid_tokens, skip_special_tokens=True, clean_up_tokenization_spaces=True)
if generated is None:
raise ValueError("Tokenizer decode (skip_special) returned None")
# Clean up GPT-2 artifacts
generated = generated.replace('Δ ', ' ').replace('▁', ' ')
generated = ' '.join(generated.split())
# Check for gibberish again
special_ratio = sum(1 for c in generated if not c.isalnum() and c not in ' \n\t.,;()[]{}+-*/=<>!&|^~%#@?:\'\"\\') / max(len(generated), 1)
if special_ratio > 0.7:
raise ValueError("Decoded output still appears to be gibberish")
except Exception as decode_error2:
# Third attempt: manual byte-level decoding for GPT-2
try:
# GPT-2 uses byte-level BPE, so we need to decode bytes properly
if hasattr(loaded_tokenizer, 'byte_decoder'):
# Use the tokenizer's byte decoder
byte_tokens = []
for token_id in valid_tokens:
if token_id in loaded_tokenizer.decoder:
token_bytes = loaded_tokenizer.decoder[token_id]
if isinstance(token_bytes, bytes):
byte_tokens.append(token_bytes)
elif isinstance(token_bytes, str):
byte_tokens.append(token_bytes.encode('utf-8', errors='ignore'))
if byte_tokens:
# Decode the byte sequence
full_bytes = b''.join(byte_tokens)
generated = full_bytes.decode('utf-8', errors='replace')
# Clean up
generated = generated.replace('Δ ', ' ').replace('▁', ' ')
generated = ' '.join(generated.split())
if not generated or generated.isspace():
raise ValueError("Byte decoding produced empty result")
else:
raise ValueError("No valid byte tokens found")
else:
raise ValueError("Tokenizer has no byte_decoder")
except Exception as byte_error:
# Fourth attempt: fallback to vocab-based conversion
try:
if hasattr(loaded_tokenizer, 'get_vocab'):
vocab = loaded_tokenizer.get_vocab()
# Convert tokens, handling byte-level tokens
text_parts = []
for token_id in valid_tokens:
if token_id in vocab:
token_text = vocab[token_id]
# Handle byte-level tokens (start with Δ  or ▁)
if token_text.startswith('Δ '):
text_parts.append(' ' + token_text[1:])
elif token_text.startswith('▁'):
text_parts.append(' ' + token_text[1:])
else:
text_parts.append(token_text)
generated = ''.join(text_parts)
generated = ' '.join(generated.split()) # Clean whitespace
if not generated or generated.isspace():
raise ValueError("Vocab conversion produced empty result")
else:
raise ValueError("Tokenizer has no get_vocab method")
except Exception as vocab_error:
# Final fallback: show what we have
generated = f"# Decode failed: {str(decode_error)}\n# Byte decode failed: {str(byte_error)}\n# Vocab decode failed: {str(vocab_error)}\n# Raw tokens: {valid_tokens[:10]}..."
# Final safety check
if not isinstance(generated, str):
generated = str(generated) if generated is not None else "# Decode returned non-string object"
# Handle None result from decode
if generated is None:
generated = f"# Generation {i+1}: Decode returned None"
# Extract code part with safety checks
try:
if '<CODE>' in generated:
code_parts = generated.split('<CODE>')
if len(code_parts) > 1:
code = code_parts[-1].strip()
else:
code = generated.strip()
else:
code = generated.strip()
# Remove special tokens safely
special_tokens = ['<PAD>', '<SEP>', '</s>', '<s>', '<unk>', '<mask>', '<|endoftext|>']
for token in special_tokens:
code = code.replace(token, '')
# Clean up extra whitespace but preserve some structure
code = code.replace('\n\n\n', '\n\n') # Reduce excessive newlines
# For debugging: include raw generated code
raw_code = generated.strip()
formatted_code = format_python_code(code)
# Show both raw and formatted for transparency
if not formatted_code.startswith('#'):
code = f"# Model Generated (Raw):\n# {raw_code[:100]}...\n\n# Formatted Python Code:\n{formatted_code}"
else:
code = formatted_code
# Ensure we have some content
if not code or code.isspace():
code = f"# Generated sequence {i+1} was empty after cleaning"
except Exception as extract_error:
code = f"# Error extracting code from sequence {i+1}: {str(extract_error)}"
# Final validation: ensure code is meaningful
try:
# Check if code contains at least some alphanumeric characters or code keywords
has_alnum = any(c.isalnum() for c in code)
has_code_indicators = any(keyword in code.lower() for keyword in ['def ', 'class ', 'import ', 'if ', 'for ', 'while ', 'return ', 'print(', 'bool', 'int', 'str', 'list'])
if not has_alnum and not has_code_indicators:
code = f"# Generated sequence {i+1} contains no readable content"
elif len(code) < 5: # Too short to be meaningful
code = f"# Generated sequence {i+1} too short: {code}"
elif code.count('#') > len(code) * 0.8: # Mostly error messages
code = f"# Generated sequence {i+1} mostly errors: {code[:50]}..."
else:
# Looks good, keep as is
pass
except Exception as validation_error:
code = f"# Validation error for sequence {i+1}: {str(validation_error)}"
generated_codes.append(code)
except Exception as decode_error:
# Handle any other decoding errors
error_msg = f"# Error decoding sequence {i+1}: {str(decode_error)}"
generated_codes.append(error_msg)
# Ensure we have at least one result
if not generated_codes:
generated_codes = ["# No valid generations produced - check model and tokenizer compatibility"]
# Log generation summary for debugging
valid_generations = [code for code in generated_codes if not code.startswith('#')]
error_generations = [code for code in generated_codes if code.startswith('#')]
if error_generations:
print(f"Generation completed: {len(valid_generations)} valid, {len(error_generations)} errors")
for error in error_generations[:3]: # Log first 3 errors
print(f" Error: {error[:100]}...")
# Use the first generated code as primary output
primary_code = generated_codes[0] if generated_codes else "# No code generated"
# Calculate metrics if reference code is provided
metrics_output = ""
bleu_output = ""
if reference_code and reference_code.strip() and not primary_code.startswith('#'):
# Only calculate metrics if we have valid generated code (not error messages)
try:
# Calculate BLEU scores
bleu_1, bleu_2, bleu_3, bleu_4 = calculate_bleu_score(reference_code, primary_code)
bleu_output = f"""πŸ“Š BLEU Scores:
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
β€’ BLEU-1 (Unigram): {bleu_1:.4f} ({bleu_1*100:.2f}%)
β€’ BLEU-2 (Bigram): {bleu_2:.4f} ({bleu_2*100:.2f}%)
β€’ BLEU-3 (Trigram): {bleu_3:.4f} ({bleu_3*100:.2f}%)
β€’ BLEU-4 (4-gram): {bleu_4:.4f} ({bleu_4*100:.2f}%)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
πŸ’‘ Interpretation:
β€’ BLEU > 0.4: Excellent match
β€’ BLEU 0.3-0.4: Good match
β€’ BLEU 0.2-0.3: Fair match
β€’ BLEU < 0.2: Poor match
"""
# Calculate additional metrics
code_metrics = calculate_code_metrics(reference_code, primary_code)
metrics_output = f"""πŸ“ˆ Additional Metrics:
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
β€’ Length Ratio: {code_metrics['length_ratio']:.3f}
β€’ Precision: {code_metrics['precision']:.4f} ({code_metrics['precision']*100:.2f}%)
β€’ Recall: {code_metrics['recall']:.4f} ({code_metrics['recall']*100:.2f}%)
β€’ F1-Score: {code_metrics['f1_score']:.4f} ({code_metrics['f1_score']*100:.2f}%)
β€’ Character Overlap: {code_metrics['char_overlap']:.4f} ({code_metrics['char_overlap']*100:.2f}%)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
⏱️ Generation Time: {generation_time:.2f}s
πŸ“ Sequences Generated: {num_sequences}
πŸ”’ Output Length: {len(primary_code)} characters
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
"""
except Exception as metrics_error:
metrics_output = f"""⚠️ Metrics calculation failed: {str(metrics_error)}
⏱️ Generation Time: {generation_time:.2f}s
πŸ“ Sequences Generated: {num_sequences}
πŸ”’ Output Length: {len(primary_code)} characters
"""
else:
metrics_output = f"""⏱️ Generation Time: {generation_time:.2f}s
πŸ“ Sequences Generated: {num_sequences}
πŸ”’ Output Length: {len(primary_code)} characters
πŸ’‘ Tip: Provide reference code to see BLEU scores and similarity metrics!
"""
# Format alternative sequences
alternatives = ""
if num_sequences > 1 and len(generated_codes) > 1:
alternatives = "πŸ”„ Alternative Generations:\n" + "━"*50 + "\n\n"
for i, code in enumerate(generated_codes[1:], 2):
# Skip error messages in alternatives
if not code.startswith('#'):
alternatives += f"Variation {i}:\n```python\n{code}\n```\n\n"
else:
alternatives += f"Variation {i}: {code}\n\n"
# Add to history (only if primary code is not an error message)
if not primary_code.startswith('#'):
generation_history.append({
'pseudo': pseudo_code,
'generated': primary_code,
'bleu_4': bleu_4 if reference_code and not primary_code.startswith('#') else None,
'time': generation_time
})
return primary_code, metrics_output, bleu_output, alternatives
except Exception as e:
return f"❌ Error generating code: {str(e)}", "", "", ""
def show_examples(example_name):
"""Load example pseudo-code"""
examples = {
"Basic Loop": "create a list of numbers from 1 to 10",
"Function Definition": "define a function to calculate the sum of two numbers",
"List Iteration": "iterate through a list and print each element",
"Conditional Check": "check if a number is even or odd",
"Sorting": "sort a list in descending order",
"Maximum Element": "create a function to find maximum element in array",
"Binary Search": "implement binary search algorithm",
"Factorial": "create a recursive function to calculate factorial",
"Palindrome": "check if a string is palindrome",
"Fibonacci": "generate fibonacci sequence up to n terms"
}
return examples.get(example_name, "")
def clear_all():
"""Clear all inputs and outputs"""
return "", "", "", "", "", 150, 0.7, 50, 0.95, 1
def show_history():
"""Display generation history"""
if not generation_history:
return "No generation history yet. Start generating code!"
history_text = "πŸ“œ Generation History:\n" + "="*60 + "\n\n"
for i, entry in enumerate(reversed(generation_history[-10:]), 1): # Show last 10
history_text += f"{i}. Pseudo: {entry['pseudo'][:60]}...\n"
history_text += f" Time: {entry['time']:.2f}s"
if entry['bleu_4'] is not None:
history_text += f" | BLEU-4: {entry['bleu_4']:.4f}"
history_text += f"\n Code: {entry['generated'][:80]}...\n\n"
return history_text
# Create Gradio interface with custom CSS
custom_css = """
.gradio-container {
font-family: 'Arial', sans-serif;
}
.output-code {
font-family: 'Courier New', monospace;
font-size: 14px;
}
.metrics-box {
background-color: #f0f8ff;
border-radius: 8px;
padding: 10px;
}
"""
with gr.Blocks(title="πŸš€ GPT-2 Pseudo-Code to Code Generator", theme=gr.themes.Soft(), css=custom_css) as demo:
gr.Markdown("""
# πŸš€ GPT-2 Pseudo-Code to Python Code Generator
**Transform natural language descriptions into executable Python code using fine-tuned GPT-2!**
This model is trained on the SPOC (Search-based Pseudo-code to Code) dataset and can generate Python code from pseudo-code descriptions.
""")
with gr.Tabs():
# Tab 1: Code Generation
with gr.Tab("πŸ’» Code Generation"):
with gr.Row():
with gr.Column(scale=1):
gr.Markdown("### οΏ½ Model Status")
model_status = gr.Textbox(
label="Model Information",
lines=15,
interactive=False,
value=initialize_model() # Auto-load on startup
)
gr.Markdown("---")
with gr.Row():
with gr.Column(scale=1):
gr.Markdown("### ✍️ Enter Pseudo-Code")
# Example selector
with gr.Row():
example_dropdown = gr.Dropdown(
choices=["Basic Loop", "Function Definition", "List Iteration",
"Conditional Check", "Sorting", "Maximum Element",
"Binary Search", "Factorial", "Palindrome", "Fibonacci"],
label="πŸ“š Load Example",
value=None
)
pseudo_input = gr.Textbox(
label="Pseudo-Code Description",
placeholder="Example: create a function to calculate factorial of a number",
lines=4
)
reference_code = gr.Textbox(
label="Reference Code (Optional - for BLEU score calculation)",
placeholder="Paste reference code here to calculate BLEU scores...",
lines=4
)
gr.Markdown("### βš™οΈ Generation Parameters")
with gr.Row():
max_length = gr.Slider(
minimum=50,
maximum=500,
value=150,
step=10,
label="Max Length",
info="Maximum tokens to generate"
)
temperature = gr.Slider(
minimum=0.1,
maximum=1.5,
value=0.7,
step=0.1,
label="Temperature",
info="Higher = more creative"
)
with gr.Row():
top_k = gr.Slider(
minimum=10,
maximum=100,
value=50,
step=5,
label="Top-K",
info="Vocabulary filtering"
)
top_p = gr.Slider(
minimum=0.5,
maximum=1.0,
value=0.95,
step=0.05,
label="Top-P",
info="Nucleus sampling"
)
num_sequences = gr.Slider(
minimum=1,
maximum=5,
value=1,
step=1,
label="Number of Variations",
info="Generate multiple versions"
)
with gr.Row():
generate_btn = gr.Button("✨ Generate Code", variant="primary", size="lg")
clear_btn = gr.Button("πŸ—‘οΈ Clear All", variant="secondary")
with gr.Column(scale=1):
gr.Markdown("### πŸ’» Generated Python Code")
code_output = gr.Code(
label="Generated Code",
language="python",
lines=12,
elem_classes="output-code"
)
with gr.Row():
with gr.Column():
metrics_output = gr.Textbox(
label="πŸ“Š Performance Metrics",
lines=8,
interactive=False,
elem_classes="metrics-box"
)
with gr.Column():
bleu_output = gr.Textbox(
label="🎯 BLEU Scores",
lines=8,
interactive=False,
elem_classes="metrics-box"
)
alternatives_output = gr.Markdown(
label="πŸ”„ Alternative Generations"
)
# Tab 2: Information & Guide
with gr.Tab("πŸ“– Guide & Examples"):
gr.Markdown("""
## πŸ“š How to Use
### 1️⃣ Load Your Model
- Upload the `best_model.pkl` file (trained GPT-2 model)
- Click "Load Model" and wait for confirmation
- You'll see model configuration and training metrics
### 2️⃣ Generate Code
- **Quick Start**: Select an example from the dropdown
- **Custom Input**: Type your own pseudo-code description
- **Optional**: Add reference code to calculate BLEU scores
- Adjust generation parameters for different outputs
- Click "Generate Code"
### 3️⃣ Understand the Metrics
#### 🎯 BLEU Score (Bilingual Evaluation Understudy)
- Measures similarity between generated and reference code
- **BLEU-1**: Word-level similarity (unigrams)
- **BLEU-2**: 2-word phrase similarity (bigrams)
- **BLEU-3**: 3-word phrase similarity (trigrams)
- **BLEU-4**: 4-word phrase similarity (most comprehensive)
**Score Interpretation:**
- 🟒 **> 0.4**: Excellent match - Generated code is very similar to reference
- 🟑 **0.3-0.4**: Good match - Code captures most key elements
- 🟠 **0.2-0.3**: Fair match - Some similarity exists
- πŸ”΄ **< 0.2**: Poor match - Significant differences
#### πŸ“ˆ Additional Metrics
- **Precision**: How many generated words appear in reference
- **Recall**: How many reference words appear in generated code
- **F1-Score**: Harmonic mean of precision and recall
- **Length Ratio**: Generated vs reference code length
- **Character Overlap**: Character-level similarity
### πŸŽ›οΈ Generation Parameters
| Parameter | Low Value | High Value | Use Case |
|-----------|-----------|------------|----------|
| **Temperature** | 0.1-0.3 | 0.8-1.2 | Low: Deterministic, focused<br>High: Creative, diverse |
| **Top-K** | 10-30 | 60-100 | Low: Conservative choices<br>High: More variety |
| **Top-P** | 0.5-0.8 | 0.9-1.0 | Low: Safe predictions<br>High: Exploratory |
| **Max Length** | 50-100 | 200-500 | Short: Simple code<br>Long: Complex implementations |
---
## πŸ’‘ Example Pseudo-Code Prompts
### Basic Operations
```
create a list of numbers from 1 to 10
define a function to calculate the sum of two numbers
iterate through a list and print each element
```
### Conditionals & Logic
```
check if a number is even or odd
find the maximum of three numbers
validate if a string is empty
```
### Data Structures
```
sort a list in descending order
remove duplicates from a list
merge two dictionaries
```
### Algorithms
```
implement binary search algorithm
create a recursive function to calculate factorial
generate fibonacci sequence up to n terms
check if a string is palindrome
```
### Advanced
```
create a class to represent a student with name and grades
implement a function to read CSV file and return dataframe
create a decorator to measure function execution time
```
---
## πŸŽ“ About the Model
This model is fine-tuned on the **SPOC (Search-based Pseudo-code to Code)** dataset:
- πŸ“„ Paper: [SPOC: Search-based Pseudo-code to Code](https://arxiv.org/pdf/1906.04908)
- πŸ›οΈ Source: Stanford University
- πŸ€– Base Model: GPT-2 (Decoder-Only Transformer)
- πŸ“Š Training: 10,000+ pseudo-code to code pairs
- 🎯 Task: Causal Language Modeling
---
## ⚠️ Limitations
- Model may not handle very complex algorithms perfectly
- Generated code should be tested before production use
- Best results with clear, specific pseudo-code descriptions
- Model trained on C++ code, adapted for Python generation
---
## 🀝 Tips for Best Results
1. βœ… **Be Specific**: "create a function to sort list in ascending order" vs "sort list"
2. βœ… **Use Action Words**: "create", "define", "implement", "calculate"
3. βœ… **Mention Data Types**: "list", "string", "dictionary", "integer"
4. βœ… **Include Details**: "recursive function" vs just "function"
5. βœ… **Try Variations**: Generate multiple times with different temperatures
""")
# Tab 3: History
with gr.Tab("πŸ“œ History"):
gr.Markdown("## πŸ“Š Generation History")
history_display = gr.Textbox(
label="Recent Generations",
lines=20,
interactive=False
)
refresh_history_btn = gr.Button("πŸ”„ Refresh History", variant="secondary")
gr.Markdown("""
---
### 🌟 Features
- βœ… Upload and use custom trained models
- βœ… BLEU score calculation for quality assessment
- βœ… Multiple evaluation metrics (Precision, Recall, F1)
- βœ… Generate multiple code variations
- βœ… Real-time performance tracking
- βœ… Example prompts library
- βœ… Generation history
### πŸ“ Citation
If you use this model, please cite:
```
@article{kulal2019spoc,
title={SPOC: Search-based Pseudo-code to Code},
author={Kulal, Sumith and Pasupat, Panupong and Chandra, Kartik and Lee, Mina and Padon, Oded and Aiken, Alex and Liang, Percy},
journal={arXiv preprint arXiv:1906.04908},
year={2019}
}
```
**Built with ❀️ using HuggingFace Transformers & Gradio**
""")
# Event handlers
example_dropdown.change(
fn=show_examples,
inputs=[example_dropdown],
outputs=[pseudo_input]
)
generate_btn.click(
fn=generate_code_from_pseudo,
inputs=[pseudo_input, max_length, temperature, top_k, top_p, num_sequences, reference_code],
outputs=[code_output, metrics_output, bleu_output, alternatives_output]
)
clear_btn.click(
fn=clear_all,
inputs=[],
outputs=[pseudo_input, reference_code, code_output, metrics_output, bleu_output,
max_length, temperature, top_k, top_p, num_sequences]
)
refresh_history_btn.click(
fn=show_history,
inputs=[],
outputs=[history_display]
)
# Launch the interface
if __name__ == "__main__":
demo.launch(share=False)