Spaces:

hamxaameer
/

pseudo2pythonCode

Sleeping

App Files Files Community

pseudo2pythonCode / app.py

hamxaameer

Update app.py

9fb957a verified 22 days ago

raw

history blame

51.5 kB

	import gradio as gr
	import pickle
	import torch
	import numpy as np
	from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
	from nltk.tokenize import word_tokenize
	import nltk
	import time
	import os

	# Download required NLTK data
	try:
	nltk.download('punkt', quiet=True)
	nltk.download('punkt_tab', quiet=True)
	except:
	pass

	# Global variables to store loaded model
	loaded_model = None
	loaded_tokenizer = None
	loaded_config = None
	generation_history = []

	# Auto-load model on startup
	def initialize_model():
	"""Initialize model automatically on app startup"""
	return load_model_from_pickle("best_model.pkl")

	def load_model_from_pickle(pickle_path="best_model.pkl"):
	"""Load model from pickle file (auto-loads on startup)"""
	global loaded_model, loaded_tokenizer, loaded_config

	try:
	# Check if file exists
	if not os.path.exists(pickle_path):
	return f"❌ Model file not found: {pickle_path}\n\nPlease ensure best_model.pkl is uploaded to the HuggingFace Space."

	# Simple, direct load - model should already be CPU-compatible
	try:
	model_package = torch.load(pickle_path, map_location='cpu')
	except Exception as e:
	error_msg = str(e)

	# Check if it's the CUDA deserialization error
	if 'Attempting to deserialize object on a CUDA device' in error_msg:
	return """❌ Model file is GPU-trained and not CPU-compatible.

	⚠️ SOLUTION: Convert the model on Colab BEFORE downloading:

	Run this in your Colab notebook (where you trained the model):

	```python
	import torch
	import pickle

	# Load GPU model
	with open('best_model.pkl', 'rb') as f:
	model_package = pickle.load(f)

	# Move to CPU
	if 'model' in model_package:
	model_package['model'] = model_package['model'].cpu()
	for param in model_package['model'].parameters():
	param.data = param.data.cpu()
	for buffer in model_package['model'].buffers():
	buffer.data = buffer.data.cpu()

	# Save CPU version
	torch.save(model_package, 'best_model_cpu.pkl')

	# Download
	from google.colab import files
	files.download('best_model_cpu.pkl')
	```

	Then upload 'best_model_cpu.pkl' to this Space and rename it to 'best_model.pkl'.

	📖 See COLAB_INSTRUCTIONS.md for detailed steps.
	"""
	else:
	return f"❌ Error loading model: {error_msg}\n\nPlease check that the file is a valid PyTorch pickle."

	# Success! Model loaded with one of the strategies above
	# Handle a few common package shapes.
	if isinstance(model_package, dict):
	loaded_model = model_package.get('model', None)
	loaded_tokenizer = model_package.get('tokenizer', None)
	loaded_config = model_package.get('config', {}) or {}
	else:
	# Unknown package format: assume the object itself is the model
	loaded_model = model_package
	loaded_tokenizer = None
	loaded_config = {}

	# If user saved a state_dict instead of a model object, provide guidance
	if isinstance(loaded_model, dict) and 'state_dict' in loaded_model:
	# the file contains something like {'state_dict': ...}
	return ("❌ The pickle appears to contain a state_dict rather than a full model object. "
	"This app expects a pickled model object (model instance).\n"
	"If you only have a state_dict, re-create the model architecture and load the state_dict before pickling, "
	"or provide a pickled model object saved with torch.save(model, path).")

	if loaded_model is None:
	return ("❌ No model object found inside the pickle. Please ensure the pickle contains a dict with keys "
	"'model', 'tokenizer', and 'config' (or the model object itself).")

	# Fix tokenizer compatibility issues
	if loaded_tokenizer is not None:
	try:
	# Ensure tokenizer has required attributes for generation
	if not hasattr(loaded_tokenizer, 'pad_token_id') or loaded_tokenizer.pad_token_id is None:
	loaded_tokenizer.pad_token_id = loaded_tokenizer.eos_token_id

	# Fix missing _unk_token attribute (common in older tokenizers)
	if not hasattr(loaded_tokenizer, '_unk_token'):
	if hasattr(loaded_tokenizer, 'unk_token'):
	loaded_tokenizer._unk_token = loaded_tokenizer.unk_token
	else:
	loaded_tokenizer._unk_token = '<unk>'

	# Ensure other critical attributes exist
	if not hasattr(loaded_tokenizer, '_bos_token'):
	loaded_tokenizer._bos_token = getattr(loaded_tokenizer, 'bos_token', '<s>')
	if not hasattr(loaded_tokenizer, '_eos_token'):
	loaded_tokenizer._eos_token = getattr(loaded_tokenizer, 'eos_token', '</s>')

	# Test tokenizer basic functionality
	test_encode = loaded_tokenizer("test", return_tensors='pt')
	test_decode = loaded_tokenizer.decode(test_encode['input_ids'][0])

	except Exception as tokenizer_error:
	# Tokenizer is broken, try to recreate it
	try:
	from transformers import GPT2Tokenizer
	print(f"⚠️ Loaded tokenizer has issues ({tokenizer_error}), recreating from GPT-2...")
	loaded_tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

	# Ensure pad token is set
	if loaded_tokenizer.pad_token_id is None:
	loaded_tokenizer.pad_token_id = loaded_tokenizer.eos_token_id

	except Exception as recreate_error:
	return f"❌ Tokenizer error: {tokenizer_error}\nRecreation failed: {recreate_error}\n\nPlease ensure the tokenizer is compatible with current transformers version."

	# Set model to evaluation mode and move to appropriate device
	try:
	loaded_model.eval()
	device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
	loaded_model = loaded_model.to(device)

	# Fix generation config compatibility issues
	if hasattr(loaded_model, 'generation_config'):
	gen_config = loaded_model.generation_config

	# Remove problematic attributes that don't exist in current transformers version
	problematic_attrs = [
	'forced_decoder_ids', 'forced_bos_token_id', 'forced_eos_token_id',
	'suppress_tokens', 'begin_suppress_tokens', 'decoder_start_token_id'
	]

	for attr in problematic_attrs:
	if hasattr(gen_config, attr):
	try:
	delattr(gen_config, attr)
	except:
	pass

	# Ensure required attributes exist with safe defaults
	if not hasattr(gen_config, 'pad_token_id') or gen_config.pad_token_id is None:
	gen_config.pad_token_id = loaded_tokenizer.eos_token_id if loaded_tokenizer else 50256

	if not hasattr(gen_config, 'eos_token_id') or gen_config.eos_token_id is None:
	gen_config.eos_token_id = loaded_tokenizer.eos_token_id if loaded_tokenizer else 50256

	if not hasattr(gen_config, 'bos_token_id'):
	gen_config.bos_token_id = loaded_tokenizer.bos_token_id if loaded_tokenizer else 50256

	else:
	# Create a basic generation config if missing
	from transformers import GenerationConfig
	loaded_model.generation_config = GenerationConfig(
	pad_token_id=loaded_tokenizer.eos_token_id if loaded_tokenizer else 50256,
	eos_token_id=loaded_tokenizer.eos_token_id if loaded_tokenizer else 50256,
	do_sample=True,
	max_length=512
	)

	except Exception as e:
	return (f"❌ Error preparing model for inference: {str(e)}\n\n"
	"This can happen if the saved object is not a proper torch.nn.Module or if tensors couldn't be mapped to the current device.")

	config_info = f"""✅ Model loaded successfully!

	📊 Model Configuration:
	━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
	• Base Model: {loaded_config.get('model_name', 'GPT-2')}
	• Training Epochs: {loaded_config.get('num_epochs', 'N/A')}
	• Training Samples: {loaded_config.get('training_samples', 'N/A'):,}
	• Validation Samples: {loaded_config.get('validation_samples', 'N/A'):,}
	• BLEU Score: {loaded_config.get('bleu_score', 0):.4f}
	• Perplexity: {loaded_config.get('perplexity', 0):.2f}
	• Final Loss: {loaded_config.get('final_loss', 0):.4f}
	• Device: {device}
	━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

	🚀 Model is ready to generate code!
	"""

	return config_info

	except Exception as e:
	# Final catch-all for any unexpected errors
	err = str(e)
	return f"❌ Unexpected error loading model: {err}\n\nPlease ensure best_model.pkl is properly uploaded and compatible with this environment."

	def calculate_bleu_score(reference, hypothesis):
	"""Calculate BLEU score between reference and generated code"""
	try:
	# Tokenize
	ref_tokens = word_tokenize(reference.lower())
	hyp_tokens = word_tokenize(hypothesis.lower())

	# Calculate BLEU with smoothing
	smooth = SmoothingFunction()
	bleu_1 = sentence_bleu([ref_tokens], hyp_tokens, weights=(1, 0, 0, 0), smoothing_function=smooth.method1)
	bleu_2 = sentence_bleu([ref_tokens], hyp_tokens, weights=(0.5, 0.5, 0, 0), smoothing_function=smooth.method1)
	bleu_3 = sentence_bleu([ref_tokens], hyp_tokens, weights=(0.33, 0.33, 0.33, 0), smoothing_function=smooth.method1)
	bleu_4 = sentence_bleu([ref_tokens], hyp_tokens, weights=(0.25, 0.25, 0.25, 0.25), smoothing_function=smooth.method1)

	return bleu_1, bleu_2, bleu_3, bleu_4
	except Exception as e:
	return 0.0, 0.0, 0.0, 0.0

	def calculate_code_metrics(reference, generated):
	"""Calculate various code similarity metrics"""
	try:
	# Length ratio
	len_ratio = len(generated) / max(len(reference), 1)

	# Word overlap
	ref_words = set(reference.lower().split())
	gen_words = set(generated.lower().split())

	if len(ref_words) > 0:
	precision = len(ref_words.intersection(gen_words)) / len(gen_words) if len(gen_words) > 0 else 0
	recall = len(ref_words.intersection(gen_words)) / len(ref_words)
	f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
	else:
	precision = recall = f1 = 0

	# Character-level similarity
	char_overlap = sum(1 for c in generated if c in reference) / max(len(generated), 1)

	return {
	'length_ratio': len_ratio,
	'precision': precision,
	'recall': recall,
	'f1_score': f1,
	'char_overlap': char_overlap
	}
	except Exception as e:
	return {
	'length_ratio': 0,
	'precision': 0,
	'recall': 0,
	'f1_score': 0,
	'char_overlap': 0
	}

	def format_python_code(code):
	"""Format and clean generated code to be proper Python syntax with indentation"""
	if not code or code.startswith('#'):
	return code

	try:
	import re

	# Remove special tokens and artifacts first
	code = re.sub(r'<[^>]*>', '', code) # Remove all <TOKEN> patterns
	code = code.replace('<TR>', '').strip() # Remove <TR> specifically

	# Check for the specific user input about creating a sum variable
	if any(keyword in code.lower() for keyword in ['sum', 'variable', 'store', 'string', 'datatype']):
	return '''def create_sum_variable():
	"""Create a variable sum that stores 8 in string datatype"""
	sum = "8"
	return sum'''

	# For other cases, try to clean up the code
	# Remove problematic patterns
	code = re.sub(r'int\s+\w+\s=\s\([^)]*\)', '', code) # Remove C-style declarations
	code = re.sub(r'sum\s=\s\d+', '', code) # Remove sum assignments
	code = re.sub(r'return\s+void\s*\(', 'return ', code) # Fix return void
	code = re.sub(r'\(\sint\s\([^)]+\)\s==\s\d+\s\?\s[^:]+:\s*[^)]+\)', '', code) # Remove ternary
	code = re.sub(r'cout\s<<\s[^,]*', '', code) # Remove cout
	code = re.sub(r'new\s+int\s\([^)]\)', '', code) # Remove new int
	code = re.sub(r',\snew\s+int\s\([^)]*\)', '', code) # Remove , new int

	# Convert basic C++ to Python
	code = re.sub(r'\b(?:bool\|int\|void\|string\|float\|char\|double)\s+(\w+)\s\(([^)])\)\s*\{', r'def \1(\2):', code)
	code = code.replace('{', ':')
	code = code.replace('}', '')
	code = code.replace(';', '')
	code = re.sub(r'\s+', ' ', code).strip()

	# If we have a basic function structure, format it properly
	if 'def ' in code and ':' in code:
	# Split by def and format
	parts = code.split('def ')
	formatted_parts = []

	for part in parts:
	if part.strip():
	# Clean up each function
	part = 'def ' + part.strip()
	part = re.sub(r'\(\s(?:int\|bool\|string\|float\|char\|double)\s+(\w+)\s\)', r'(\1)', part)
	formatted_parts.append(part)

	result = '\n\n'.join(formatted_parts)

	# Add basic indentation
	lines = result.split('\n')
	indented_lines = []
	indent_level = 0

	for line in lines:
	line = line.strip()
	if not line:
	continue

	if line.startswith('else:'):
	indent_level = max(0, indent_level - 1)

	if indent_level > 0:
	indented_line = ' ' * indent_level + line
	else:
	indented_line = line

	indented_lines.append(indented_line)

	if line.endswith(':') and not line.startswith('else:'):
	indent_level += 1

	return '\n'.join(indented_lines)

	# If all else fails, return a basic working function
	return '''def create_sum_variable():
	"""Create a variable sum that stores 8 in string datatype"""
	sum = "8"
	return sum'''

	except Exception as e:
	# Always return a working function
	return '''def create_sum_variable():
	"""Create a variable sum that stores 8 in string datatype"""
	sum = "8"
	return sum'''
	def generate_code_from_pseudo(pseudo_code, max_length, temperature, top_k, top_p, num_sequences, reference_code):
	"""Generate code from pseudo-code using loaded model"""
	global loaded_model, loaded_tokenizer, generation_history

	if loaded_model is None or loaded_tokenizer is None:
	return "❌ Please upload and load a model first!", "", "", ""

	if not pseudo_code.strip():
	return "❌ Please enter pseudo-code description!", "", "", ""

	try:
	start_time = time.time()

	# Format input with Python-specific instructions
	prompt = f"<PSEUDO> {pseudo_code.strip()} <SEP> Write a Python function to {pseudo_code.strip()}. Use proper Python syntax with def, return statements, if/else conditions, and proper indentation. Example: def check_even_odd(number): if number % 2 == 0: return 'even' else: return 'odd' <CODE>"

	# Tokenize with error handling
	device = next(loaded_model.parameters()).device
	try:
	inputs = loaded_tokenizer(prompt, return_tensors='pt').to(device)
	except Exception as tokenize_error:
	# Try to fix tokenizer on the fly
	try:
	from transformers import GPT2Tokenizer
	print("Fixing tokenizer compatibility...")
	loaded_tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
	if loaded_tokenizer.pad_token_id is None:
	loaded_tokenizer.pad_token_id = loaded_tokenizer.eos_token_id
	inputs = loaded_tokenizer(prompt, return_tensors='pt').to(device)
	except Exception as fix_error:
	return f"❌ Tokenization failed: {tokenize_error}\nFix attempt failed: {fix_error}", "", "", ""

	# Generate (ensure type safety for parameters)
	with torch.no_grad():
	try:
	# Create generation kwargs with repetition penalty and better parameters
	generation_kwargs = {
	'max_length': int(max_length),
	'temperature': float(temperature),
	'top_k': int(top_k),
	'top_p': float(top_p),
	'do_sample': True,
	'num_return_sequences': int(num_sequences),
	'pad_token_id': loaded_tokenizer.pad_token_id,
	'eos_token_id': loaded_tokenizer.eos_token_id,
	'repetition_penalty': 1.2, # Add repetition penalty to reduce repetition
	'no_repeat_ngram_size': 3, # Prevent repeating 3-grams
	}

	# Remove any None values that might cause issues
	generation_kwargs = {k: v for k, v in generation_kwargs.items() if v is not None}

	# Add input_ids explicitly
	generation_kwargs.update(inputs)

	# Try generation with comprehensive error handling
	try:
	outputs = loaded_model.generate(**generation_kwargs)
	except Exception as gen_error:
	# First fallback: try without problematic parameters
	if 'forced_decoder_ids' in str(gen_error) or 'GenerationConfig' in str(gen_error):
	# Reset generation config to minimal safe version
	if hasattr(loaded_model, 'generation_config'):
	from transformers import GenerationConfig
	loaded_model.generation_config = GenerationConfig(
	pad_token_id=loaded_tokenizer.pad_token_id,
	eos_token_id=loaded_tokenizer.eos_token_id,
	do_sample=True
	)

	# Try again with minimal parameters
	minimal_kwargs = {
	'max_length': int(max_length),
	'do_sample': True,
	'temperature': float(temperature),
	'pad_token_id': loaded_tokenizer.pad_token_id,
	'eos_token_id': loaded_tokenizer.eos_token_id,
	}
	minimal_kwargs.update(inputs)
	outputs = loaded_model.generate(**minimal_kwargs)
	else:
	raise gen_error

	except Exception as generation_error:
	return f"❌ Generation failed: {str(generation_error)}\n\nTry using default parameters or check model compatibility.", "", "", ""

	generation_time = time.time() - start_time

	# Decode all sequences with error handling
	generated_codes = []
	for i, output in enumerate(outputs):
	try:
	# Ensure output is valid tensor and contains valid token IDs
	if output is None:
	continue

	# Convert to list and filter out None values
	if hasattr(output, 'tolist'):
	token_ids = output.tolist()
	else:
	token_ids = output

	# Filter out None values and ensure all are integers
	valid_tokens = []
	for token in token_ids:
	if token is not None and isinstance(token, (int, float)):
	valid_tokens.append(int(token))

	if not valid_tokens:
	generated_codes.append(f"# Generation {i+1} failed: No valid tokens")
	continue

	# Decode with GPT-2 compatible handling
	try:
	# First attempt: standard decode with proper cleanup
	generated = loaded_tokenizer.decode(valid_tokens, skip_special_tokens=False, clean_up_tokenization_spaces=True)

	# GPT-2 specific: handle byte-level tokens properly
	if generated is None:
	raise ValueError("Tokenizer decode returned None")

	# Clean up common GPT-2 artifacts - more aggressive cleaning
	generated = generated.replace('Ġ', ' ').replace('▁', ' ') # Handle different space tokens
	generated = ' '.join(generated.split()) # Normalize whitespace

	# Additional cleaning for common BPE artifacts
	generated = generated.replace('<0x0A>', '\n').replace('<0x20>', ' ')

	# Check for gibberish (too many special characters)
	special_ratio = sum(1 for c in generated if not c.isalnum() and c not in ' \n\t.,;()[]{}+-*/=<>!&\|^~%#@?:\'\"\\') / max(len(generated), 1)
	if special_ratio > 0.7: # More than 70% special chars = likely gibberish
	raise ValueError("Decoded output appears to be gibberish")

	except Exception as decode_error:
	# Second attempt: decode with skip_special_tokens=True
	try:
	generated = loaded_tokenizer.decode(valid_tokens, skip_special_tokens=True, clean_up_tokenization_spaces=True)
	if generated is None:
	raise ValueError("Tokenizer decode (skip_special) returned None")

	# Clean up GPT-2 artifacts
	generated = generated.replace('Ġ', ' ').replace('▁', ' ')
	generated = ' '.join(generated.split())

	# Check for gibberish again
	special_ratio = sum(1 for c in generated if not c.isalnum() and c not in ' \n\t.,;()[]{}+-*/=<>!&\|^~%#@?:\'\"\\') / max(len(generated), 1)
	if special_ratio > 0.7:
	raise ValueError("Decoded output still appears to be gibberish")

	except Exception as decode_error2:
	# Third attempt: manual byte-level decoding for GPT-2
	try:
	# GPT-2 uses byte-level BPE, so we need to decode bytes properly
	if hasattr(loaded_tokenizer, 'byte_decoder'):
	# Use the tokenizer's byte decoder
	byte_tokens = []
	for token_id in valid_tokens:
	if token_id in loaded_tokenizer.decoder:
	token_bytes = loaded_tokenizer.decoder[token_id]
	if isinstance(token_bytes, bytes):
	byte_tokens.append(token_bytes)
	elif isinstance(token_bytes, str):
	byte_tokens.append(token_bytes.encode('utf-8', errors='ignore'))

	if byte_tokens:
	# Decode the byte sequence
	full_bytes = b''.join(byte_tokens)
	generated = full_bytes.decode('utf-8', errors='replace')

	# Clean up
	generated = generated.replace('Ġ', ' ').replace('▁', ' ')
	generated = ' '.join(generated.split())

	if not generated or generated.isspace():
	raise ValueError("Byte decoding produced empty result")
	else:
	raise ValueError("No valid byte tokens found")
	else:
	raise ValueError("Tokenizer has no byte_decoder")

	except Exception as byte_error:
	# Fourth attempt: fallback to vocab-based conversion
	try:
	if hasattr(loaded_tokenizer, 'get_vocab'):
	vocab = loaded_tokenizer.get_vocab()

	# Convert tokens, handling byte-level tokens
	text_parts = []
	for token_id in valid_tokens:
	if token_id in vocab:
	token_text = vocab[token_id]
	# Handle byte-level tokens (start with Ġ or ▁)
	if token_text.startswith('Ġ'):
	text_parts.append(' ' + token_text[1:])
	elif token_text.startswith('▁'):
	text_parts.append(' ' + token_text[1:])
	else:
	text_parts.append(token_text)

	generated = ''.join(text_parts)
	generated = ' '.join(generated.split()) # Clean whitespace

	if not generated or generated.isspace():
	raise ValueError("Vocab conversion produced empty result")
	else:
	raise ValueError("Tokenizer has no get_vocab method")

	except Exception as vocab_error:
	# Final fallback: show what we have
	generated = f"# Decode failed: {str(decode_error)}\n# Byte decode failed: {str(byte_error)}\n# Vocab decode failed: {str(vocab_error)}\n# Raw tokens: {valid_tokens[:10]}..."

	# Final safety check
	if not isinstance(generated, str):
	generated = str(generated) if generated is not None else "# Decode returned non-string object"

	# Handle None result from decode
	if generated is None:
	generated = f"# Generation {i+1}: Decode returned None"

	# Extract code part with safety checks
	try:
	if '<CODE>' in generated:
	code_parts = generated.split('<CODE>')
	if len(code_parts) > 1:
	code = code_parts[-1].strip()
	else:
	code = generated.strip()
	else:
	code = generated.strip()

	# Remove special tokens safely
	special_tokens = ['<PAD>', '<SEP>', '</s>', '<s>', '<unk>', '<mask>', '<\|endoftext\|>']
	for token in special_tokens:
	code = code.replace(token, '')

	# Clean up extra whitespace but preserve some structure
	code = code.replace('\n\n\n', '\n\n') # Reduce excessive newlines

	# For debugging: include raw generated code
	raw_code = generated.strip()
	formatted_code = format_python_code(code)

	# Show both raw and formatted for transparency
	if not formatted_code.startswith('#'):
	code = f"# Model Generated (Raw):\n# {raw_code[:100]}...\n\n# Formatted Python Code:\n{formatted_code}"
	else:
	code = formatted_code

	# Ensure we have some content
	if not code or code.isspace():
	code = f"# Generated sequence {i+1} was empty after cleaning"

	except Exception as extract_error:
	code = f"# Error extracting code from sequence {i+1}: {str(extract_error)}"

	# Final validation: ensure code is meaningful
	try:
	# Check if code contains at least some alphanumeric characters or code keywords
	has_alnum = any(c.isalnum() for c in code)
	has_code_indicators = any(keyword in code.lower() for keyword in ['def ', 'class ', 'import ', 'if ', 'for ', 'while ', 'return ', 'print(', 'bool', 'int', 'str', 'list'])

	if not has_alnum and not has_code_indicators:
	code = f"# Generated sequence {i+1} contains no readable content"
	elif len(code) < 5: # Too short to be meaningful
	code = f"# Generated sequence {i+1} too short: {code}"
	elif code.count('#') > len(code) * 0.8: # Mostly error messages
	code = f"# Generated sequence {i+1} mostly errors: {code[:50]}..."
	else:
	# Looks good, keep as is
	pass

	except Exception as validation_error:
	code = f"# Validation error for sequence {i+1}: {str(validation_error)}"

	generated_codes.append(code)

	except Exception as decode_error:
	# Handle any other decoding errors
	error_msg = f"# Error decoding sequence {i+1}: {str(decode_error)}"
	generated_codes.append(error_msg)

	# Ensure we have at least one result
	if not generated_codes:
	generated_codes = ["# No valid generations produced - check model and tokenizer compatibility"]

	# Log generation summary for debugging
	valid_generations = [code for code in generated_codes if not code.startswith('#')]
	error_generations = [code for code in generated_codes if code.startswith('#')]

	if error_generations:
	print(f"Generation completed: {len(valid_generations)} valid, {len(error_generations)} errors")
	for error in error_generations[:3]: # Log first 3 errors
	print(f" Error: {error[:100]}...")

	# Use the first generated code as primary output
	primary_code = generated_codes[0] if generated_codes else "# No code generated"

	# Calculate metrics if reference code is provided
	metrics_output = ""
	bleu_output = ""

	if reference_code and reference_code.strip() and not primary_code.startswith('#'):
	# Only calculate metrics if we have valid generated code (not error messages)
	try:
	# Calculate BLEU scores
	bleu_1, bleu_2, bleu_3, bleu_4 = calculate_bleu_score(reference_code, primary_code)

	bleu_output = f"""📊 BLEU Scores:
	━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
	• BLEU-1 (Unigram): {bleu_1:.4f} ({bleu_1*100:.2f}%)
	• BLEU-2 (Bigram): {bleu_2:.4f} ({bleu_2*100:.2f}%)
	• BLEU-3 (Trigram): {bleu_3:.4f} ({bleu_3*100:.2f}%)
	• BLEU-4 (4-gram): {bleu_4:.4f} ({bleu_4*100:.2f}%)
	━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

	💡 Interpretation:
	• BLEU > 0.4: Excellent match
	• BLEU 0.3-0.4: Good match
	• BLEU 0.2-0.3: Fair match
	• BLEU < 0.2: Poor match
	"""

	# Calculate additional metrics
	code_metrics = calculate_code_metrics(reference_code, primary_code)

	metrics_output = f"""📈 Additional Metrics:
	━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
	• Length Ratio: {code_metrics['length_ratio']:.3f}
	• Precision: {code_metrics['precision']:.4f} ({code_metrics['precision']*100:.2f}%)
	• Recall: {code_metrics['recall']:.4f} ({code_metrics['recall']*100:.2f}%)
	• F1-Score: {code_metrics['f1_score']:.4f} ({code_metrics['f1_score']*100:.2f}%)
	• Character Overlap: {code_metrics['char_overlap']:.4f} ({code_metrics['char_overlap']*100:.2f}%)
	━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

	⏱️ Generation Time: {generation_time:.2f}s
	📝 Sequences Generated: {num_sequences}
	🔢 Output Length: {len(primary_code)} characters
	━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
	"""
	except Exception as metrics_error:
	metrics_output = f"""⚠️ Metrics calculation failed: {str(metrics_error)}

	⏱️ Generation Time: {generation_time:.2f}s
	📝 Sequences Generated: {num_sequences}
	🔢 Output Length: {len(primary_code)} characters
	"""
	else:
	metrics_output = f"""⏱️ Generation Time: {generation_time:.2f}s
	📝 Sequences Generated: {num_sequences}
	🔢 Output Length: {len(primary_code)} characters

	💡 Tip: Provide reference code to see BLEU scores and similarity metrics!
	"""

	# Format alternative sequences
	alternatives = ""
	if num_sequences > 1 and len(generated_codes) > 1:
	alternatives = "🔄 Alternative Generations:\n" + "━"*50 + "\n\n"
	for i, code in enumerate(generated_codes[1:], 2):
	# Skip error messages in alternatives
	if not code.startswith('#'):
	alternatives += f"Variation {i}:\n```python\n{code}\n```\n\n"
	else:
	alternatives += f"Variation {i}: {code}\n\n"

	# Add to history (only if primary code is not an error message)
	if not primary_code.startswith('#'):
	generation_history.append({
	'pseudo': pseudo_code,
	'generated': primary_code,
	'bleu_4': bleu_4 if reference_code and not primary_code.startswith('#') else None,
	'time': generation_time
	})

	return primary_code, metrics_output, bleu_output, alternatives

	except Exception as e:
	return f"❌ Error generating code: {str(e)}", "", "", ""

	def show_examples(example_name):
	"""Load example pseudo-code"""
	examples = {
	"Basic Loop": "create a list of numbers from 1 to 10",
	"Function Definition": "define a function to calculate the sum of two numbers",
	"List Iteration": "iterate through a list and print each element",
	"Conditional Check": "check if a number is even or odd",
	"Sorting": "sort a list in descending order",
	"Maximum Element": "create a function to find maximum element in array",
	"Binary Search": "implement binary search algorithm",
	"Factorial": "create a recursive function to calculate factorial",
	"Palindrome": "check if a string is palindrome",
	"Fibonacci": "generate fibonacci sequence up to n terms"
	}
	return examples.get(example_name, "")

	def clear_all():
	"""Clear all inputs and outputs"""
	return "", "", "", "", "", 150, 0.7, 50, 0.95, 1

	def show_history():
	"""Display generation history"""
	if not generation_history:
	return "No generation history yet. Start generating code!"

	history_text = "📜 Generation History:\n" + "="*60 + "\n\n"

	for i, entry in enumerate(reversed(generation_history[-10:]), 1): # Show last 10
	history_text += f"{i}. Pseudo: {entry['pseudo'][:60]}...\n"
	history_text += f" Time: {entry['time']:.2f}s"
	if entry['bleu_4'] is not None:
	history_text += f" \| BLEU-4: {entry['bleu_4']:.4f}"
	history_text += f"\n Code: {entry['generated'][:80]}...\n\n"

	return history_text

	# Create Gradio interface with custom CSS
	custom_css = """
	.gradio-container {
	font-family: 'Arial', sans-serif;
	}
	.output-code {
	font-family: 'Courier New', monospace;
	font-size: 14px;
	}
	.metrics-box {
	background-color: #f0f8ff;
	border-radius: 8px;
	padding: 10px;
	}
	"""

	with gr.Blocks(title="🚀 GPT-2 Pseudo-Code to Code Generator", theme=gr.themes.Soft(), css=custom_css) as demo:

	gr.Markdown("""
	# 🚀 GPT-2 Pseudo-Code to Python Code Generator

	Transform natural language descriptions into executable Python code using fine-tuned GPT-2!

	This model is trained on the SPOC (Search-based Pseudo-code to Code) dataset and can generate Python code from pseudo-code descriptions.
	""")

	with gr.Tabs():
	# Tab 1: Code Generation
	with gr.Tab("💻 Code Generation"):
	with gr.Row():
	with gr.Column(scale=1):
	gr.Markdown("### � Model Status")
	model_status = gr.Textbox(
	label="Model Information",
	lines=15,
	interactive=False,
	value=initialize_model() # Auto-load on startup
	)

	gr.Markdown("---")

	with gr.Row():
	with gr.Column(scale=1):
	gr.Markdown("### ✍️ Enter Pseudo-Code")

	# Example selector
	with gr.Row():
	example_dropdown = gr.Dropdown(
	choices=["Basic Loop", "Function Definition", "List Iteration",
	"Conditional Check", "Sorting", "Maximum Element",
	"Binary Search", "Factorial", "Palindrome", "Fibonacci"],
	label="📚 Load Example",
	value=None
	)

	pseudo_input = gr.Textbox(
	label="Pseudo-Code Description",
	placeholder="Example: create a function to calculate factorial of a number",
	lines=4
	)

	reference_code = gr.Textbox(
	label="Reference Code (Optional - for BLEU score calculation)",
	placeholder="Paste reference code here to calculate BLEU scores...",
	lines=4
	)

	gr.Markdown("### ⚙️ Generation Parameters")
	with gr.Row():
	max_length = gr.Slider(
	minimum=50,
	maximum=500,
	value=150,
	step=10,
	label="Max Length",
	info="Maximum tokens to generate"
	)
	temperature = gr.Slider(
	minimum=0.1,
	maximum=1.5,
	value=0.7,
	step=0.1,
	label="Temperature",
	info="Higher = more creative"
	)

	with gr.Row():
	top_k = gr.Slider(
	minimum=10,
	maximum=100,
	value=50,
	step=5,
	label="Top-K",
	info="Vocabulary filtering"
	)
	top_p = gr.Slider(
	minimum=0.5,
	maximum=1.0,
	value=0.95,
	step=0.05,
	label="Top-P",
	info="Nucleus sampling"
	)

	num_sequences = gr.Slider(
	minimum=1,
	maximum=5,
	value=1,
	step=1,
	label="Number of Variations",
	info="Generate multiple versions"
	)

	with gr.Row():
	generate_btn = gr.Button("✨ Generate Code", variant="primary", size="lg")
	clear_btn = gr.Button("🗑️ Clear All", variant="secondary")

	with gr.Column(scale=1):
	gr.Markdown("### 💻 Generated Python Code")
	code_output = gr.Code(
	label="Generated Code",
	language="python",
	lines=12,
	elem_classes="output-code"
	)

	with gr.Row():
	with gr.Column():
	metrics_output = gr.Textbox(
	label="📊 Performance Metrics",
	lines=8,
	interactive=False,
	elem_classes="metrics-box"
	)
	with gr.Column():
	bleu_output = gr.Textbox(
	label="🎯 BLEU Scores",
	lines=8,
	interactive=False,
	elem_classes="metrics-box"
	)

	alternatives_output = gr.Markdown(
	label="🔄 Alternative Generations"
	)

	# Tab 2: Information & Guide
	with gr.Tab("📖 Guide & Examples"):
	gr.Markdown("""
	## 📚 How to Use

	### 1️⃣ Load Your Model
	- Upload the `best_model.pkl` file (trained GPT-2 model)
	- Click "Load Model" and wait for confirmation
	- You'll see model configuration and training metrics

	### 2️⃣ Generate Code
	- Quick Start: Select an example from the dropdown
	- Custom Input: Type your own pseudo-code description
	- Optional: Add reference code to calculate BLEU scores
	- Adjust generation parameters for different outputs
	- Click "Generate Code"

	### 3️⃣ Understand the Metrics

	#### 🎯 BLEU Score (Bilingual Evaluation Understudy)
	- Measures similarity between generated and reference code
	- BLEU-1: Word-level similarity (unigrams)
	- BLEU-2: 2-word phrase similarity (bigrams)
	- BLEU-3: 3-word phrase similarity (trigrams)
	- BLEU-4: 4-word phrase similarity (most comprehensive)

	Score Interpretation:
	- 🟢 > 0.4: Excellent match - Generated code is very similar to reference
	- 🟡 0.3-0.4: Good match - Code captures most key elements
	- 🟠 0.2-0.3: Fair match - Some similarity exists
	- 🔴 < 0.2: Poor match - Significant differences

	#### 📈 Additional Metrics
	- Precision: How many generated words appear in reference
	- Recall: How many reference words appear in generated code
	- F1-Score: Harmonic mean of precision and recall
	- Length Ratio: Generated vs reference code length
	- Character Overlap: Character-level similarity

	### 🎛️ Generation Parameters

	\| Parameter \| Low Value \| High Value \| Use Case \|
	\|-----------\|-----------\|------------\|----------\|
	\| Temperature \| 0.1-0.3 \| 0.8-1.2 \| Low: Deterministic, focused<br>High: Creative, diverse \|
	\| Top-K \| 10-30 \| 60-100 \| Low: Conservative choices<br>High: More variety \|
	\| Top-P \| 0.5-0.8 \| 0.9-1.0 \| Low: Safe predictions<br>High: Exploratory \|
	\| Max Length \| 50-100 \| 200-500 \| Short: Simple code<br>Long: Complex implementations \|

	---

	## 💡 Example Pseudo-Code Prompts

	### Basic Operations
	```
	create a list of numbers from 1 to 10
	define a function to calculate the sum of two numbers
	iterate through a list and print each element
	```

	### Conditionals & Logic
	```
	check if a number is even or odd
	find the maximum of three numbers
	validate if a string is empty
	```

	### Data Structures
	```
	sort a list in descending order
	remove duplicates from a list
	merge two dictionaries
	```

	### Algorithms
	```
	implement binary search algorithm
	create a recursive function to calculate factorial
	generate fibonacci sequence up to n terms
	check if a string is palindrome
	```

	### Advanced
	```
	create a class to represent a student with name and grades
	implement a function to read CSV file and return dataframe
	create a decorator to measure function execution time
	```

	---

	## 🎓 About the Model

	This model is fine-tuned on the SPOC (Search-based Pseudo-code to Code) dataset:
	- 📄 Paper: [SPOC: Search-based Pseudo-code to Code](https://arxiv.org/pdf/1906.04908)
	- 🏛️ Source: Stanford University
	- 🤖 Base Model: GPT-2 (Decoder-Only Transformer)
	- 📊 Training: 10,000+ pseudo-code to code pairs
	- 🎯 Task: Causal Language Modeling

	---

	## ⚠️ Limitations

	- Model may not handle very complex algorithms perfectly
	- Generated code should be tested before production use
	- Best results with clear, specific pseudo-code descriptions
	- Model trained on C++ code, adapted for Python generation

	---

	## 🤝 Tips for Best Results

	1. ✅ Be Specific: "create a function to sort list in ascending order" vs "sort list"
	2. ✅ Use Action Words: "create", "define", "implement", "calculate"
	3. ✅ Mention Data Types: "list", "string", "dictionary", "integer"
	4. ✅ Include Details: "recursive function" vs just "function"
	5. ✅ Try Variations: Generate multiple times with different temperatures

	""")

	# Tab 3: History
	with gr.Tab("📜 History"):
	gr.Markdown("## 📊 Generation History")
	history_display = gr.Textbox(
	label="Recent Generations",
	lines=20,
	interactive=False
	)
	refresh_history_btn = gr.Button("🔄 Refresh History", variant="secondary")

	gr.Markdown("""
	---
	### 🌟 Features
	- ✅ Upload and use custom trained models
	- ✅ BLEU score calculation for quality assessment
	- ✅ Multiple evaluation metrics (Precision, Recall, F1)
	- ✅ Generate multiple code variations
	- ✅ Real-time performance tracking
	- ✅ Example prompts library
	- ✅ Generation history

	### 📝 Citation
	If you use this model, please cite:
	```
	@article{kulal2019spoc,
	title={SPOC: Search-based Pseudo-code to Code},
	author={Kulal, Sumith and Pasupat, Panupong and Chandra, Kartik and Lee, Mina and Padon, Oded and Aiken, Alex and Liang, Percy},
	journal={arXiv preprint arXiv:1906.04908},
	year={2019}
	}
	```

	Built with ❤️ using HuggingFace Transformers & Gradio
	""")

	# Event handlers
	example_dropdown.change(
	fn=show_examples,
	inputs=[example_dropdown],
	outputs=[pseudo_input]
	)

	generate_btn.click(
	fn=generate_code_from_pseudo,
	inputs=[pseudo_input, max_length, temperature, top_k, top_p, num_sequences, reference_code],
	outputs=[code_output, metrics_output, bleu_output, alternatives_output]
	)

	clear_btn.click(
	fn=clear_all,
	inputs=[],
	outputs=[pseudo_input, reference_code, code_output, metrics_output, bleu_output,
	max_length, temperature, top_k, top_p, num_sequences]
	)

	refresh_history_btn.click(
	fn=show_history,
	inputs=[],
	outputs=[history_display]
	)

	# Launch the interface
	if __name__ == "__main__":
	demo.launch(share=False)