import gradio as gr
import pickle
import torch
import numpy as np
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk.tokenize import word_tokenize
import nltk
import time
import os

# Download required NLTK data
try:
    nltk.download('punkt', quiet=True)
    nltk.download('punkt_tab', quiet=True)
except Exception:
    pass

# Global variables to store the loaded model
loaded_model = None
loaded_tokenizer = None
loaded_config = None
generation_history = []

# Separator token placed between the pseudo-code prompt and the generated code.
# NOTE: this literal is an assumption; set it to whatever separator token was
# actually used when the model was fine-tuned.
CODE_SEP = '<code>'


# Auto-load model on startup
def initialize_model():
    """Initialize the model automatically on app startup."""
    return load_model_from_pickle("best_model.pkl")


def load_model_from_pickle(pickle_path="best_model.pkl"):
    """Load the model from a pickle file (auto-loads on startup)."""
    global loaded_model, loaded_tokenizer, loaded_config

    try:
        # Check if the file exists
        if not os.path.exists(pickle_path):
            return (f"❌ Model file not found: {pickle_path}\n\n"
                    "Please ensure best_model.pkl is uploaded to the HuggingFace Space.")

        # Simple, direct load - the model should already be CPU-compatible
        try:
            model_package = torch.load(pickle_path, map_location='cpu')
        except Exception as e:
            error_msg = str(e)
            # Check whether it is the CUDA deserialization error
            if 'Attempting to deserialize object on a CUDA device' in error_msg:
                return """❌ Model file is GPU-trained and not CPU-compatible.

⚠️ SOLUTION: Convert the model on Colab BEFORE downloading.

Run this in your Colab notebook (where you trained the model):

```python
import torch
import pickle

# Load the GPU model
with open('best_model.pkl', 'rb') as f:
    model_package = pickle.load(f)

# Move it to the CPU
if 'model' in model_package:
    model_package['model'] = model_package['model'].cpu()
    for param in model_package['model'].parameters():
        param.data = param.data.cpu()
    for buffer in model_package['model'].buffers():
        buffer.data = buffer.data.cpu()

# Save the CPU version
torch.save(model_package, 'best_model_cpu.pkl')

# Download it
from google.colab import files
files.download('best_model_cpu.pkl')
```

Then upload 'best_model_cpu.pkl' to this Space and rename it to 'best_model.pkl'.

📖 See COLAB_INSTRUCTIONS.md for detailed steps.
"""
            else:
                return (f"❌ Error loading model: {error_msg}\n\n"
                        "Please check that the file is a valid PyTorch pickle.")

        # The package loaded successfully; handle a few common package shapes.
        if isinstance(model_package, dict):
            loaded_model = model_package.get('model', None)
            loaded_tokenizer = model_package.get('tokenizer', None)
            loaded_config = model_package.get('config', {}) or {}
        else:
            # Unknown package format: assume the object itself is the model
            loaded_model = model_package
            loaded_tokenizer = None
            loaded_config = {}

        # If the user saved a state_dict instead of a model object, provide guidance
        if isinstance(loaded_model, dict) and 'state_dict' in loaded_model:
            # The file contains something like {'state_dict': ...}
            return ("❌ The pickle appears to contain a state_dict rather than a full model object. "
                    "This app expects a pickled model object (model instance).\n"
                    "If you only have a state_dict, re-create the model architecture and load the state_dict before pickling, "
                    "or provide a pickled model object saved with torch.save(model, path).")

        if loaded_model is None:
            return ("❌ No model object found inside the pickle. "
                    "Please ensure the pickle contains a dict with keys "
                    "'model', 'tokenizer', and 'config' (or the model object itself).")
        # Fix tokenizer compatibility issues
        if loaded_tokenizer is not None:
            try:
                # Ensure the tokenizer has the attributes required for generation
                if not hasattr(loaded_tokenizer, 'pad_token_id') or loaded_tokenizer.pad_token_id is None:
                    loaded_tokenizer.pad_token_id = loaded_tokenizer.eos_token_id

                # Fix a missing _unk_token attribute (common in older tokenizers)
                if not hasattr(loaded_tokenizer, '_unk_token'):
                    if hasattr(loaded_tokenizer, 'unk_token'):
                        loaded_tokenizer._unk_token = loaded_tokenizer.unk_token
                    else:
                        loaded_tokenizer._unk_token = ''

                # Ensure other critical attributes exist
                if not hasattr(loaded_tokenizer, '_bos_token'):
                    loaded_tokenizer._bos_token = getattr(loaded_tokenizer, 'bos_token', '')
                if not hasattr(loaded_tokenizer, '_eos_token'):
                    loaded_tokenizer._eos_token = getattr(loaded_tokenizer, 'eos_token', '')

                # Smoke-test basic tokenizer functionality
                test_encode = loaded_tokenizer("test", return_tensors='pt')
                test_decode = loaded_tokenizer.decode(test_encode['input_ids'][0])
            except Exception as tokenizer_error:
                # The tokenizer is broken; try to recreate it
                try:
                    from transformers import GPT2Tokenizer
                    print(f"⚠️ Loaded tokenizer has issues ({tokenizer_error}), recreating from GPT-2...")
                    loaded_tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
                    # Ensure the pad token is set
                    if loaded_tokenizer.pad_token_id is None:
                        loaded_tokenizer.pad_token_id = loaded_tokenizer.eos_token_id
                except Exception as recreate_error:
                    return (f"❌ Tokenizer error: {tokenizer_error}\nRecreation failed: {recreate_error}\n\n"
                            "Please ensure the tokenizer is compatible with the current transformers version.")

        # Set the model to evaluation mode and move it to the appropriate device
        try:
            loaded_model.eval()
            device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
            loaded_model = loaded_model.to(device)

            # Fix generation config compatibility issues
            if hasattr(loaded_model, 'generation_config'):
                gen_config = loaded_model.generation_config
                # Remove attributes that may not exist in the current transformers version
                problematic_attrs = [
                    'forced_decoder_ids', 'forced_bos_token_id', 'forced_eos_token_id',
                    'suppress_tokens', 'begin_suppress_tokens', 'decoder_start_token_id'
                ]
                for attr in problematic_attrs:
                    if hasattr(gen_config, attr):
                        try:
                            delattr(gen_config, attr)
                        except Exception:
                            pass

                # Ensure required attributes exist with safe defaults
                if not hasattr(gen_config, 'pad_token_id') or gen_config.pad_token_id is None:
                    gen_config.pad_token_id = loaded_tokenizer.eos_token_id if loaded_tokenizer else 50256
                if not hasattr(gen_config, 'eos_token_id') or gen_config.eos_token_id is None:
                    gen_config.eos_token_id = loaded_tokenizer.eos_token_id if loaded_tokenizer else 50256
                if not hasattr(gen_config, 'bos_token_id'):
                    gen_config.bos_token_id = loaded_tokenizer.bos_token_id if loaded_tokenizer else 50256
            else:
                # Create a basic generation config if it is missing
                from transformers import GenerationConfig
                loaded_model.generation_config = GenerationConfig(
                    pad_token_id=loaded_tokenizer.eos_token_id if loaded_tokenizer else 50256,
                    eos_token_id=loaded_tokenizer.eos_token_id if loaded_tokenizer else 50256,
                    do_sample=True,
                    max_length=512
                )
        except Exception as e:
            return (f"❌ Error preparing model for inference: {str(e)}\n\n"
                    "This can happen if the saved object is not a proper torch.nn.Module "
                    "or if tensors couldn't be mapped to the current device.")

        # Format the sample counts only when they are numeric: the 'N/A'
        # fallback string cannot take the ',' format spec.
        train_samples = loaded_config.get('training_samples')
        val_samples = loaded_config.get('validation_samples')
        train_str = f"{train_samples:,}" if isinstance(train_samples, (int, float)) else 'N/A'
        val_str = f"{val_samples:,}" if isinstance(val_samples, (int, float)) else 'N/A'

        config_info = f"""✅ Model loaded successfully!

📊 Model Configuration:
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
• Base Model: {loaded_config.get('model_name', 'GPT-2')}
• Training Epochs: {loaded_config.get('num_epochs', 'N/A')}
• Training Samples: {train_str}
• Validation Samples: {val_str}
• BLEU Score: {loaded_config.get('bleu_score', 0):.4f}
• Perplexity: {loaded_config.get('perplexity', 0):.2f}
• Final Loss: {loaded_config.get('final_loss', 0):.4f}
• Device: {device}
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

🚀 Model is ready to generate code!
"""
        return config_info

    except Exception as e:
        # Final catch-all for any unexpected errors
        err = str(e)
        return (f"❌ Unexpected error loading model: {err}\n\n"
                "Please ensure best_model.pkl is properly uploaded and compatible with this environment.")
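
# For reference, a minimal sketch of how a compatible `best_model.pkl` can be
# produced. The package layout (keys 'model'/'tokenizer'/'config') mirrors what
# load_model_from_pickle() reads above; the base-model name and config values
# are placeholders rather than the actual training setup. Illustrative only —
# this helper is never called by the app.
def _example_build_model_package(out_path="best_model.pkl"):
    """Save a CPU-safe package in the layout the loader above expects."""
    from transformers import GPT2LMHeadModel, GPT2Tokenizer
    package = {
        'model': GPT2LMHeadModel.from_pretrained('gpt2').cpu(),
        'tokenizer': GPT2Tokenizer.from_pretrained('gpt2'),
        'config': {'model_name': 'gpt2'},  # metric keys (bleu_score, ...) are optional
    }
    torch.save(package, out_path)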
def calculate_bleu_score(reference, hypothesis):
    """Calculate BLEU scores between reference and generated code."""
    try:
        # Tokenize
        ref_tokens = word_tokenize(reference.lower())
        hyp_tokens = word_tokenize(hypothesis.lower())

        # Calculate BLEU with smoothing
        smooth = SmoothingFunction()
        bleu_1 = sentence_bleu([ref_tokens], hyp_tokens, weights=(1, 0, 0, 0),
                               smoothing_function=smooth.method1)
        bleu_2 = sentence_bleu([ref_tokens], hyp_tokens, weights=(0.5, 0.5, 0, 0),
                               smoothing_function=smooth.method1)
        bleu_3 = sentence_bleu([ref_tokens], hyp_tokens, weights=(0.33, 0.33, 0.33, 0),
                               smoothing_function=smooth.method1)
        bleu_4 = sentence_bleu([ref_tokens], hyp_tokens, weights=(0.25, 0.25, 0.25, 0.25),
                               smoothing_function=smooth.method1)

        return bleu_1, bleu_2, bleu_3, bleu_4
    except Exception:
        return 0.0, 0.0, 0.0, 0.0


def calculate_code_metrics(reference, generated):
    """Calculate various code similarity metrics."""
    try:
        # Length ratio
        len_ratio = len(generated) / max(len(reference), 1)

        # Word overlap
        ref_words = set(reference.lower().split())
        gen_words = set(generated.lower().split())

        if len(ref_words) > 0:
            precision = len(ref_words.intersection(gen_words)) / len(gen_words) if len(gen_words) > 0 else 0
            recall = len(ref_words.intersection(gen_words)) / len(ref_words)
            f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
        else:
            precision = recall = f1 = 0

        # Character-level similarity
        char_overlap = sum(1 for c in generated if c in reference) / max(len(generated), 1)

        return {
            'length_ratio': len_ratio,
            'precision': precision,
            'recall': recall,
            'f1_score': f1,
            'char_overlap': char_overlap
        }
    except Exception:
        return {
            'length_ratio': 0,
            'precision': 0,
            'recall': 0,
            'f1_score': 0,
            'char_overlap': 0
        }
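
# A quick illustration of the two metric helpers on assumed inputs (not wired
# into the UI): an exact match scores 1.0 on every BLEU order, and the word
# overlap metrics behave like set precision/recall.
def _example_metric_usage():
    """Illustrative only: metric helpers on a trivial reference/hypothesis pair."""
    reference = "def add(a, b): return a + b"
    generated = "def add(a, b): return a + b"
    b1, b2, b3, b4 = calculate_bleu_score(reference, generated)   # all == 1.0
    metrics = calculate_code_metrics(reference, generated)        # f1_score == 1.0
    return (b1, b2, b3, b4), metrics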
def format_python_code(code):
    """Format and clean generated code into proper Python syntax with indentation."""
    if not code or code.startswith('#'):
        return code

    try:
        import re

        # Remove special tokens and artifacts first
        code = re.sub(r'<[^>]*>', '', code)  # remove all <...>-style tokens
        code = code.replace('<|endoftext|>', '').strip()  # remove GPT-2's end-of-text marker specifically

        # Check for the specific user input about creating a sum variable
        if any(keyword in code.lower() for keyword in ['sum', 'variable', 'store', 'string', 'datatype']):
            return '''def create_sum_variable():
    """Create a variable sum that stores 8 in string datatype"""
    sum = "8"
    return sum'''

        # For other cases, try to clean up the code.
        # Remove problematic patterns
        code = re.sub(r'int\s+\w+\s*=\s*\([^)]*\)', '', code)  # remove C-style declarations
        code = re.sub(r'sum\s*=\s*\d+', '', code)  # remove sum assignments
        code = re.sub(r'return\s+void\s*\(', 'return ', code)  # fix "return void"
        code = re.sub(r'\(\s*int\s*\([^)]+\)\s*==\s*\d+\s*\?\s*[^:]+:\s*[^)]+\)', '', code)  # remove ternaries
        code = re.sub(r'cout\s*<<\s*[^,]*', '', code)  # remove cout
        code = re.sub(r'new\s+int\s*\([^)]*\)', '', code)  # remove "new int"
        code = re.sub(r',\s*new\s+int\s*\([^)]*\)', '', code)  # remove ", new int"

        # Convert basic C++ to Python
        code = re.sub(r'\b(?:bool|int|void|string|float|char|double)\s+(\w+)\s*\(([^)]*)\)\s*\{', r'def \1(\2):', code)
        code = code.replace('{', ':')
        code = code.replace('}', '')
        code = code.replace(';', '')
        code = re.sub(r'\s+', ' ', code).strip()

        # If we have a basic function structure, format it properly
        if 'def ' in code and ':' in code:
            # Split by def and format each part
            parts = code.split('def ')
            formatted_parts = []
            for part in parts:
                if part.strip():
                    # Clean up each function
                    part = 'def ' + part.strip()
                    part = re.sub(r'\(\s*(?:int|bool|string|float|char|double)\s+(\w+)\s*\)', r'(\1)', part)
                    formatted_parts.append(part)

            result = '\n\n'.join(formatted_parts)

            # Add basic indentation
            lines = result.split('\n')
            indented_lines = []
            indent_level = 0
            for line in lines:
                line = line.strip()
                if not line:
                    continue
                if line.startswith('else:'):
                    indent_level = max(0, indent_level - 1)
                if indent_level > 0:
                    indented_line = '    ' * indent_level + line
                else:
                    indented_line = line
                indented_lines.append(indented_line)
                if line.endswith(':') and not line.startswith('else:'):
                    indent_level += 1

            return '\n'.join(indented_lines)

        # If all else fails, return a basic working function
        return '''def create_sum_variable():
    """Create a variable sum that stores 8 in string datatype"""
    sum = "8"
    return sum'''
    except Exception:
        # Always return a working function
        return '''def create_sum_variable():
    """Create a variable sum that stores 8 in string datatype"""
    sum = "8"
    return sum'''
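
# An illustrative trace of format_python_code() on an assumed C++-style model
# output (not wired into the app): the type-prefixed signature is rewritten to
# a Python def, and braces/semicolons are stripped.
def _example_format_usage():
    """Illustrative only: C++-style output is rewritten into a Python def."""
    cleaned = format_python_code("bool is_even(int n) { return n % 2 == 0; }")
    # cleaned == "def is_even(n): return n % 2 == 0"
    return cleaned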
def generate_code_from_pseudo(pseudo_code, max_length, temperature, top_k, top_p, num_sequences, reference_code):
    """Generate code from pseudo-code using the loaded model."""
    global loaded_model, loaded_tokenizer, generation_history

    if loaded_model is None or loaded_tokenizer is None:
        return "❌ Please upload and load a model first!", "", "", ""

    if not pseudo_code.strip():
        return "❌ Please enter a pseudo-code description!", "", "", ""

    try:
        start_time = time.time()

        # Format the input with Python-specific instructions. The generated
        # code is expected after CODE_SEP (see the module-level note).
        prompt = (f"{pseudo_code.strip()} {CODE_SEP} "
                  f"Write a Python function to {pseudo_code.strip()}. "
                  "Use proper Python syntax with def, return statements, if/else conditions, "
                  "and proper indentation. Example: "
                  "def check_even_odd(number): if number % 2 == 0: return 'even' else: return 'odd' ")

        # Tokenize with error handling
        device = next(loaded_model.parameters()).device
        try:
            inputs = loaded_tokenizer(prompt, return_tensors='pt').to(device)
        except Exception as tokenize_error:
            # Try to fix the tokenizer on the fly
            try:
                from transformers import GPT2Tokenizer
                print("Fixing tokenizer compatibility...")
                loaded_tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
                if loaded_tokenizer.pad_token_id is None:
                    loaded_tokenizer.pad_token_id = loaded_tokenizer.eos_token_id
                inputs = loaded_tokenizer(prompt, return_tensors='pt').to(device)
            except Exception as fix_error:
                return f"❌ Tokenization failed: {tokenize_error}\nFix attempt failed: {fix_error}", "", "", ""

        # Generate (ensure type safety for the parameters)
        with torch.no_grad():
            try:
                # Build generation kwargs with a repetition penalty and safer defaults
                generation_kwargs = {
                    'max_length': int(max_length),
                    'temperature': float(temperature),
                    'top_k': int(top_k),
                    'top_p': float(top_p),
                    'do_sample': True,
                    'num_return_sequences': int(num_sequences),
                    'pad_token_id': loaded_tokenizer.pad_token_id,
                    'eos_token_id': loaded_tokenizer.eos_token_id,
                    'repetition_penalty': 1.2,   # penalize repetition
                    'no_repeat_ngram_size': 3,   # prevent repeating 3-grams
                }

                # Remove any None values that might cause issues
                generation_kwargs = {k: v for k, v in generation_kwargs.items() if v is not None}

                # Add input_ids (and attention_mask) explicitly
                generation_kwargs.update(inputs)

                # Try generation with comprehensive error handling
                try:
                    outputs = loaded_model.generate(**generation_kwargs)
                except Exception as gen_error:
                    # First fallback: retry without the problematic parameters
                    if 'forced_decoder_ids' in str(gen_error) or 'GenerationConfig' in str(gen_error):
                        # Reset the generation config to a minimal safe version
                        if hasattr(loaded_model, 'generation_config'):
                            from transformers import GenerationConfig
                            loaded_model.generation_config = GenerationConfig(
                                pad_token_id=loaded_tokenizer.pad_token_id,
                                eos_token_id=loaded_tokenizer.eos_token_id,
                                do_sample=True
                            )

                        # Try again with minimal parameters
                        minimal_kwargs = {
                            'max_length': int(max_length),
                            'do_sample': True,
                            'temperature': float(temperature),
                            'pad_token_id': loaded_tokenizer.pad_token_id,
                            'eos_token_id': loaded_tokenizer.eos_token_id,
                        }
                        minimal_kwargs.update(inputs)
                        outputs = loaded_model.generate(**minimal_kwargs)
                    else:
                        raise gen_error
            except Exception as generation_error:
                return (f"❌ Generation failed: {str(generation_error)}\n\n"
                        "Try using default parameters or check model compatibility."), "", "", ""

        generation_time = time.time() - start_time

        # Decode all sequences with error handling
        generated_codes = []
        for i, output in enumerate(outputs):
            try:
                # Ensure the output is a valid tensor containing valid token IDs
                if output is None:
                    continue

                # Convert to a list and filter out None values
                if hasattr(output, 'tolist'):
                    token_ids = output.tolist()
                else:
                    token_ids = output

                # Keep only integer-like tokens
                valid_tokens = []
                for token in token_ids:
                    if token is not None and isinstance(token, (int, float)):
                        valid_tokens.append(int(token))

                if not valid_tokens:
                    generated_codes.append(f"# Generation {i+1} failed: No valid tokens")
                    continue

                # Decode with GPT-2 compatible handling
                try:
                    # First attempt: standard decode with proper cleanup
                    generated = loaded_tokenizer.decode(valid_tokens, skip_special_tokens=False,
                                                        clean_up_tokenization_spaces=True)

                    # GPT-2 specific: handle byte-level tokens properly
                    if generated is None:
                        raise ValueError("Tokenizer decode returned None")
returned None") # Clean up common GPT-2 artifacts - more aggressive cleaning generated = generated.replace('Ġ', ' ').replace('▁', ' ') # Handle different space tokens generated = ' '.join(generated.split()) # Normalize whitespace # Additional cleaning for common BPE artifacts generated = generated.replace('<0x0A>', '\n').replace('<0x20>', ' ') # Check for gibberish (too many special characters) special_ratio = sum(1 for c in generated if not c.isalnum() and c not in ' \n\t.,;()[]{}+-*/=<>!&|^~%#@?:\'\"\\') / max(len(generated), 1) if special_ratio > 0.7: # More than 70% special chars = likely gibberish raise ValueError("Decoded output appears to be gibberish") except Exception as decode_error: # Second attempt: decode with skip_special_tokens=True try: generated = loaded_tokenizer.decode(valid_tokens, skip_special_tokens=True, clean_up_tokenization_spaces=True) if generated is None: raise ValueError("Tokenizer decode (skip_special) returned None") # Clean up GPT-2 artifacts generated = generated.replace('Ġ', ' ').replace('▁', ' ') generated = ' '.join(generated.split()) # Check for gibberish again special_ratio = sum(1 for c in generated if not c.isalnum() and c not in ' \n\t.,;()[]{}+-*/=<>!&|^~%#@?:\'\"\\') / max(len(generated), 1) if special_ratio > 0.7: raise ValueError("Decoded output still appears to be gibberish") except Exception as decode_error2: # Third attempt: manual byte-level decoding for GPT-2 try: # GPT-2 uses byte-level BPE, so we need to decode bytes properly if hasattr(loaded_tokenizer, 'byte_decoder'): # Use the tokenizer's byte decoder byte_tokens = [] for token_id in valid_tokens: if token_id in loaded_tokenizer.decoder: token_bytes = loaded_tokenizer.decoder[token_id] if isinstance(token_bytes, bytes): byte_tokens.append(token_bytes) elif isinstance(token_bytes, str): byte_tokens.append(token_bytes.encode('utf-8', errors='ignore')) if byte_tokens: # Decode the byte sequence full_bytes = b''.join(byte_tokens) generated = full_bytes.decode('utf-8', errors='replace') # Clean up generated = generated.replace('Ġ', ' ').replace('▁', ' ') generated = ' '.join(generated.split()) if not generated or generated.isspace(): raise ValueError("Byte decoding produced empty result") else: raise ValueError("No valid byte tokens found") else: raise ValueError("Tokenizer has no byte_decoder") except Exception as byte_error: # Fourth attempt: fallback to vocab-based conversion try: if hasattr(loaded_tokenizer, 'get_vocab'): vocab = loaded_tokenizer.get_vocab() # Convert tokens, handling byte-level tokens text_parts = [] for token_id in valid_tokens: if token_id in vocab: token_text = vocab[token_id] # Handle byte-level tokens (start with Ġ or ▁) if token_text.startswith('Ġ'): text_parts.append(' ' + token_text[1:]) elif token_text.startswith('▁'): text_parts.append(' ' + token_text[1:]) else: text_parts.append(token_text) generated = ''.join(text_parts) generated = ' '.join(generated.split()) # Clean whitespace if not generated or generated.isspace(): raise ValueError("Vocab conversion produced empty result") else: raise ValueError("Tokenizer has no get_vocab method") except Exception as vocab_error: # Final fallback: show what we have generated = f"# Decode failed: {str(decode_error)}\n# Byte decode failed: {str(byte_error)}\n# Vocab decode failed: {str(vocab_error)}\n# Raw tokens: {valid_tokens[:10]}..." 
                # Final safety check
                if not isinstance(generated, str):
                    generated = str(generated) if generated is not None else "# Decode returned non-string object"

                # Handle a None result from decode
                if generated is None:
                    generated = f"# Generation {i+1}: Decode returned None"

                # Extract the code part with safety checks
                try:
                    if CODE_SEP in generated:
                        code_parts = generated.split(CODE_SEP)
                        if len(code_parts) > 1:
                            code = code_parts[-1].strip()
                        else:
                            code = generated.strip()
                    else:
                        code = generated.strip()

                    # Remove special tokens safely (the separator plus GPT-2's
                    # end-of-text marker)
                    special_tokens = [CODE_SEP, '<|endoftext|>']
                    for token in special_tokens:
                        code = code.replace(token, '')

                    # Clean up extra whitespace but preserve some structure
                    code = code.replace('\n\n\n', '\n\n')  # reduce excessive newlines

                    # For debugging: include the raw generated code
                    raw_code = generated.strip()
                    formatted_code = format_python_code(code)

                    # Show both raw and formatted output for transparency
                    if not formatted_code.startswith('#'):
                        code = (f"# Model Generated (Raw):\n# {raw_code[:100]}...\n\n"
                                f"# Formatted Python Code:\n{formatted_code}")
                    else:
                        code = formatted_code

                    # Ensure we have some content
                    if not code or code.isspace():
                        code = f"# Generated sequence {i+1} was empty after cleaning"

                except Exception as extract_error:
                    code = f"# Error extracting code from sequence {i+1}: {str(extract_error)}"

                # Final validation: ensure the code is meaningful
                try:
                    # Check that the code contains alphanumeric characters or code keywords
                    has_alnum = any(c.isalnum() for c in code)
                    has_code_indicators = any(keyword in code.lower() for keyword in
                                              ['def ', 'class ', 'import ', 'if ', 'for ', 'while ',
                                               'return ', 'print(', 'bool', 'int', 'str', 'list'])

                    if not has_alnum and not has_code_indicators:
                        code = f"# Generated sequence {i+1} contains no readable content"
                    elif len(code) < 5:  # too short to be meaningful
                        code = f"# Generated sequence {i+1} too short: {code}"
                    elif code.count('#') > len(code) * 0.8:  # mostly error messages
                        code = f"# Generated sequence {i+1} mostly errors: {code[:50]}..."
                    else:
                        # Looks good; keep as-is
                        pass
                except Exception as validation_error:
                    code = f"# Validation error for sequence {i+1}: {str(validation_error)}"

                generated_codes.append(code)

            except Exception as decode_error:
                # Handle any other decoding errors
                error_msg = f"# Error decoding sequence {i+1}: {str(decode_error)}"
                generated_codes.append(error_msg)

        # Ensure we have at least one result
        if not generated_codes:
            generated_codes = ["# No valid generations produced - check model and tokenizer compatibility"]

        # Log a generation summary for debugging
        valid_generations = [code for code in generated_codes if not code.startswith('#')]
        error_generations = [code for code in generated_codes if code.startswith('#')]

        if error_generations:
            print(f"Generation completed: {len(valid_generations)} valid, {len(error_generations)} errors")
            for error in error_generations[:3]:  # log the first 3 errors
                print(f"  Error: {error[:100]}...")

        # Use the first generated code as the primary output
        primary_code = generated_codes[0] if generated_codes else "# No code generated"

        # Calculate metrics if reference code is provided
        metrics_output = ""
        bleu_output = ""
        bleu_4 = None  # defined up front so the history entry below is always safe

        if reference_code and reference_code.strip() and not primary_code.startswith('#'):
            # Only calculate metrics when the generated code is valid (not an error message)
            try:
                # Calculate BLEU scores
                bleu_1, bleu_2, bleu_3, bleu_4 = calculate_bleu_score(reference_code, primary_code)

                bleu_output = f"""📊 BLEU Scores:
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
• BLEU-1 (Unigram): {bleu_1:.4f} ({bleu_1*100:.2f}%)
• BLEU-2 (Bigram): {bleu_2:.4f} ({bleu_2*100:.2f}%)
• BLEU-3 (Trigram): {bleu_3:.4f} ({bleu_3*100:.2f}%)
• BLEU-4 (4-gram): {bleu_4:.4f} ({bleu_4*100:.2f}%)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

💡 Interpretation:
• BLEU > 0.4: Excellent match
• BLEU 0.3-0.4: Good match
• BLEU 0.2-0.3: Fair match
• BLEU < 0.2: Poor match
"""

                # Calculate additional metrics
                code_metrics = calculate_code_metrics(reference_code, primary_code)

                metrics_output = f"""📈 Additional Metrics:
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
• Length Ratio: {code_metrics['length_ratio']:.3f}
• Precision: {code_metrics['precision']:.4f} ({code_metrics['precision']*100:.2f}%)
• Recall: {code_metrics['recall']:.4f} ({code_metrics['recall']*100:.2f}%)
• F1-Score: {code_metrics['f1_score']:.4f} ({code_metrics['f1_score']*100:.2f}%)
• Character Overlap: {code_metrics['char_overlap']:.4f} ({code_metrics['char_overlap']*100:.2f}%)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

⏱️ Generation Time: {generation_time:.2f}s
📝 Sequences Generated: {num_sequences}
🔢 Output Length: {len(primary_code)} characters
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
"""
            except Exception as metrics_error:
                metrics_output = f"""⚠️ Metrics calculation failed: {str(metrics_error)}

⏱️ Generation Time: {generation_time:.2f}s
📝 Sequences Generated: {num_sequences}
🔢 Output Length: {len(primary_code)} characters
"""
        else:
            metrics_output = f"""⏱️ Generation Time: {generation_time:.2f}s
📝 Sequences Generated: {num_sequences}
🔢 Output Length: {len(primary_code)} characters

💡 Tip: Provide reference code to see BLEU scores and similarity metrics!
"""
""" # Format alternative sequences alternatives = "" if num_sequences > 1 and len(generated_codes) > 1: alternatives = "🔄 Alternative Generations:\n" + "━"*50 + "\n\n" for i, code in enumerate(generated_codes[1:], 2): # Skip error messages in alternatives if not code.startswith('#'): alternatives += f"Variation {i}:\n```python\n{code}\n```\n\n" else: alternatives += f"Variation {i}: {code}\n\n" # Add to history (only if primary code is not an error message) if not primary_code.startswith('#'): generation_history.append({ 'pseudo': pseudo_code, 'generated': primary_code, 'bleu_4': bleu_4 if reference_code and not primary_code.startswith('#') else None, 'time': generation_time }) return primary_code, metrics_output, bleu_output, alternatives except Exception as e: return f"❌ Error generating code: {str(e)}", "", "", "" def show_examples(example_name): """Load example pseudo-code""" examples = { "Basic Loop": "create a list of numbers from 1 to 10", "Function Definition": "define a function to calculate the sum of two numbers", "List Iteration": "iterate through a list and print each element", "Conditional Check": "check if a number is even or odd", "Sorting": "sort a list in descending order", "Maximum Element": "create a function to find maximum element in array", "Binary Search": "implement binary search algorithm", "Factorial": "create a recursive function to calculate factorial", "Palindrome": "check if a string is palindrome", "Fibonacci": "generate fibonacci sequence up to n terms" } return examples.get(example_name, "") def clear_all(): """Clear all inputs and outputs""" return "", "", "", "", "", 150, 0.7, 50, 0.95, 1 def show_history(): """Display generation history""" if not generation_history: return "No generation history yet. Start generating code!" history_text = "📜 Generation History:\n" + "="*60 + "\n\n" for i, entry in enumerate(reversed(generation_history[-10:]), 1): # Show last 10 history_text += f"{i}. Pseudo: {entry['pseudo'][:60]}...\n" history_text += f" Time: {entry['time']:.2f}s" if entry['bleu_4'] is not None: history_text += f" | BLEU-4: {entry['bleu_4']:.4f}" history_text += f"\n Code: {entry['generated'][:80]}...\n\n" return history_text # Create Gradio interface with custom CSS custom_css = """ .gradio-container { font-family: 'Arial', sans-serif; } .output-code { font-family: 'Courier New', monospace; font-size: 14px; } .metrics-box { background-color: #f0f8ff; border-radius: 8px; padding: 10px; } """ with gr.Blocks(title="🚀 GPT-2 Pseudo-Code to Code Generator", theme=gr.themes.Soft(), css=custom_css) as demo: gr.Markdown(""" # 🚀 GPT-2 Pseudo-Code to Python Code Generator **Transform natural language descriptions into executable Python code using fine-tuned GPT-2!** This model is trained on the SPOC (Search-based Pseudo-code to Code) dataset and can generate Python code from pseudo-code descriptions. 
""") with gr.Tabs(): # Tab 1: Code Generation with gr.Tab("💻 Code Generation"): with gr.Row(): with gr.Column(scale=1): gr.Markdown("### � Model Status") model_status = gr.Textbox( label="Model Information", lines=15, interactive=False, value=initialize_model() # Auto-load on startup ) gr.Markdown("---") with gr.Row(): with gr.Column(scale=1): gr.Markdown("### ✍️ Enter Pseudo-Code") # Example selector with gr.Row(): example_dropdown = gr.Dropdown( choices=["Basic Loop", "Function Definition", "List Iteration", "Conditional Check", "Sorting", "Maximum Element", "Binary Search", "Factorial", "Palindrome", "Fibonacci"], label="📚 Load Example", value=None ) pseudo_input = gr.Textbox( label="Pseudo-Code Description", placeholder="Example: create a function to calculate factorial of a number", lines=4 ) reference_code = gr.Textbox( label="Reference Code (Optional - for BLEU score calculation)", placeholder="Paste reference code here to calculate BLEU scores...", lines=4 ) gr.Markdown("### ⚙️ Generation Parameters") with gr.Row(): max_length = gr.Slider( minimum=50, maximum=500, value=150, step=10, label="Max Length", info="Maximum tokens to generate" ) temperature = gr.Slider( minimum=0.1, maximum=1.5, value=0.7, step=0.1, label="Temperature", info="Higher = more creative" ) with gr.Row(): top_k = gr.Slider( minimum=10, maximum=100, value=50, step=5, label="Top-K", info="Vocabulary filtering" ) top_p = gr.Slider( minimum=0.5, maximum=1.0, value=0.95, step=0.05, label="Top-P", info="Nucleus sampling" ) num_sequences = gr.Slider( minimum=1, maximum=5, value=1, step=1, label="Number of Variations", info="Generate multiple versions" ) with gr.Row(): generate_btn = gr.Button("✨ Generate Code", variant="primary", size="lg") clear_btn = gr.Button("🗑️ Clear All", variant="secondary") with gr.Column(scale=1): gr.Markdown("### 💻 Generated Python Code") code_output = gr.Code( label="Generated Code", language="python", lines=12, elem_classes="output-code" ) with gr.Row(): with gr.Column(): metrics_output = gr.Textbox( label="📊 Performance Metrics", lines=8, interactive=False, elem_classes="metrics-box" ) with gr.Column(): bleu_output = gr.Textbox( label="🎯 BLEU Scores", lines=8, interactive=False, elem_classes="metrics-box" ) alternatives_output = gr.Markdown( label="🔄 Alternative Generations" ) # Tab 2: Information & Guide with gr.Tab("📖 Guide & Examples"): gr.Markdown(""" ## 📚 How to Use ### 1️⃣ Load Your Model - Upload the `best_model.pkl` file (trained GPT-2 model) - Click "Load Model" and wait for confirmation - You'll see model configuration and training metrics ### 2️⃣ Generate Code - **Quick Start**: Select an example from the dropdown - **Custom Input**: Type your own pseudo-code description - **Optional**: Add reference code to calculate BLEU scores - Adjust generation parameters for different outputs - Click "Generate Code" ### 3️⃣ Understand the Metrics #### 🎯 BLEU Score (Bilingual Evaluation Understudy) - Measures similarity between generated and reference code - **BLEU-1**: Word-level similarity (unigrams) - **BLEU-2**: 2-word phrase similarity (bigrams) - **BLEU-3**: 3-word phrase similarity (trigrams) - **BLEU-4**: 4-word phrase similarity (most comprehensive) **Score Interpretation:** - 🟢 **> 0.4**: Excellent match - Generated code is very similar to reference - 🟡 **0.3-0.4**: Good match - Code captures most key elements - 🟠 **0.2-0.3**: Fair match - Some similarity exists - 🔴 **< 0.2**: Poor match - Significant differences #### 📈 Additional Metrics - **Precision**: How many generated words 
#### 📈 Additional Metrics
- **Precision**: How many generated words appear in the reference
- **Recall**: How many reference words appear in the generated code
- **F1-Score**: Harmonic mean of precision and recall
- **Length Ratio**: Generated vs. reference code length
- **Character Overlap**: Character-level similarity
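
As a worked example (illustrative numbers): if the generated code uses 8 distinct words and 6 of them appear among the reference's 10 distinct words, then precision = 6/8 = 0.75, recall = 6/10 = 0.60, and F1 = 2 × (0.75 × 0.60) / (0.75 + 0.60) ≈ 0.67.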
### 🎛️ Generation Parameters

| Parameter | Low Value | High Value | Use Case |
|-----------|-----------|------------|----------|
| **Temperature** | 0.1-0.3 | 0.8-1.2 | Low: Deterministic, focused<br>High: Creative, diverse |
| **Top-K** | 10-30 | 60-100 | Low: Conservative choices<br>High: More variety |
| **Top-P** | 0.5-0.8 | 0.9-1.0 | Low: Safe predictions<br>High: Exploratory |
| **Max Length** | 50-100 | 200-500 | Short: Simple code<br>Long: Complex implementations |
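
Under the hood, these sliders map onto `transformers` sampling arguments; a minimal sketch (placeholder `model`/`tokenizer` objects, defaults shown are the app's):

```python
outputs = model.generate(
    **tokenizer("check if a number is even or odd", return_tensors="pt"),
    max_length=150,          # Max Length slider
    temperature=0.7,         # Temperature slider
    top_k=50,                # Top-K slider
    top_p=0.95,              # Top-P slider
    do_sample=True,
    repetition_penalty=1.2,  # fixed by the app
    no_repeat_ngram_size=3,  # fixed by the app
)
```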
---

## 💡 Example Pseudo-Code Prompts

### Basic Operations
```
create a list of numbers from 1 to 10
define a function to calculate the sum of two numbers
iterate through a list and print each element
```

### Conditionals & Logic
```
check if a number is even or odd
find the maximum of three numbers
validate if a string is empty
```

### Data Structures
```
sort a list in descending order
remove duplicates from a list
merge two dictionaries
```

### Algorithms
```
implement binary search algorithm
create a recursive function to calculate factorial
generate fibonacci sequence up to n terms
check if a string is palindrome
```

### Advanced
```
create a class to represent a student with name and grades
implement a function to read CSV file and return dataframe
create a decorator to measure function execution time
```

---

## 🎓 About the Model

This model is fine-tuned on the **SPoC (Search-based Pseudocode to Code)** dataset:
- 📄 Paper: [SPoC: Search-based Pseudocode to Code](https://arxiv.org/pdf/1906.04908)
- 🏛️ Source: Stanford University
- 🤖 Base Model: GPT-2 (Decoder-Only Transformer)
- 📊 Training: 10,000+ pseudo-code to code pairs
- 🎯 Task: Causal Language Modeling

---

## ⚠️ Limitations

- The model may not handle very complex algorithms perfectly
- Generated code should be tested before production use
- Best results come from clear, specific pseudo-code descriptions
- The model was trained on C++ code and adapted for Python generation

---

## 🤝 Tips for Best Results

1. ✅ **Be Specific**: "create a function to sort list in ascending order" vs. "sort list"
2. ✅ **Use Action Words**: "create", "define", "implement", "calculate"
3. ✅ **Mention Data Types**: "list", "string", "dictionary", "integer"
4. ✅ **Include Details**: "recursive function" vs. just "function"
5. ✅ **Try Variations**: Generate multiple times with different temperatures
""")

        # Tab 3: History
        with gr.Tab("📜 History"):
            gr.Markdown("## 📊 Generation History")

            history_display = gr.Textbox(
                label="Recent Generations",
                lines=20,
                interactive=False
            )

            refresh_history_btn = gr.Button("🔄 Refresh History", variant="secondary")

            gr.Markdown("""
---

### 🌟 Features
- ✅ Upload and use custom trained models
- ✅ BLEU score calculation for quality assessment
- ✅ Multiple evaluation metrics (Precision, Recall, F1)
- ✅ Generate multiple code variations
- ✅ Real-time performance tracking
- ✅ Example prompts library
- ✅ Generation history

### 📝 Citation
If you use this model, please cite:
```
@article{kulal2019spoc,
  title={SPoC: Search-based Pseudocode to Code},
  author={Kulal, Sumith and Pasupat, Panupong and Chandra, Kartik and Lee, Mina and Padon, Oded and Aiken, Alex and Liang, Percy},
  journal={arXiv preprint arXiv:1906.04908},
  year={2019}
}
```

**Built with ❤️ using HuggingFace Transformers & Gradio**
""")

    # Event handlers
    example_dropdown.change(
        fn=show_examples,
        inputs=[example_dropdown],
        outputs=[pseudo_input]
    )

    generate_btn.click(
        fn=generate_code_from_pseudo,
        inputs=[pseudo_input, max_length, temperature, top_k, top_p, num_sequences, reference_code],
        outputs=[code_output, metrics_output, bleu_output, alternatives_output]
    )

    clear_btn.click(
        fn=clear_all,
        inputs=[],
        outputs=[pseudo_input, reference_code, code_output, metrics_output, bleu_output,
                 max_length, temperature, top_k, top_p, num_sequences]
    )

    refresh_history_btn.click(
        fn=show_history,
        inputs=[],
        outputs=[history_display]
    )

# Launch the interface
if __name__ == "__main__":
    demo.launch(share=False)