import gradio as gr
import pickle
import torch
import numpy as np
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk.tokenize import word_tokenize
import nltk
import time
import os

# Download required NLTK data
try:
    nltk.download('punkt', quiet=True)
    nltk.download('punkt_tab', quiet=True)
except Exception:
    pass

# Global variables to store the loaded model
loaded_model = None
loaded_tokenizer = None
loaded_config = None
generation_history = []


# Auto-load model on startup
def initialize_model():
    """Initialize the model automatically on app startup."""
    return load_model_from_pickle("best_model.pkl")


def load_model_from_pickle(pickle_path="best_model.pkl"):
    """Load the model from a pickle file (auto-loads on startup)."""
    global loaded_model, loaded_tokenizer, loaded_config

    try:
        # Check that the file exists
        if not os.path.exists(pickle_path):
            return f"❌ Model file not found: {pickle_path}\n\nPlease ensure best_model.pkl is uploaded to the HuggingFace Space."

        # Simple, direct load - the model should already be CPU-compatible
        try:
            model_package = torch.load(pickle_path, map_location='cpu')
        except Exception as e:
            error_msg = str(e)
            # Check whether it is the CUDA deserialization error
            if 'Attempting to deserialize object on a CUDA device' in error_msg:
                return """❌ Model file is GPU-trained and not CPU-compatible.

⚠️ SOLUTION: Convert the model on Colab BEFORE downloading.

Run this in your Colab notebook (where you trained the model):

```python
import torch
import pickle

# Load the GPU model
with open('best_model.pkl', 'rb') as f:
    model_package = pickle.load(f)

# Move it to CPU
if 'model' in model_package:
    model_package['model'] = model_package['model'].cpu()
    for param in model_package['model'].parameters():
        param.data = param.data.cpu()
    for buffer in model_package['model'].buffers():
        buffer.data = buffer.data.cpu()

# Save the CPU version
torch.save(model_package, 'best_model_cpu.pkl')

# Download it
from google.colab import files
files.download('best_model_cpu.pkl')
```

Then upload 'best_model_cpu.pkl' to this Space and rename it to 'best_model.pkl'.

📄 See COLAB_INSTRUCTIONS.md for detailed steps.
"""
            else:
                return f"❌ Error loading model: {error_msg}\n\nPlease check that the file is a valid PyTorch pickle."

        # Model loaded successfully; handle a few common package shapes.
        if isinstance(model_package, dict):
            loaded_model = model_package.get('model', None)
            loaded_tokenizer = model_package.get('tokenizer', None)
            loaded_config = model_package.get('config', {}) or {}
        else:
            # Unknown package format: assume the object itself is the model
            loaded_model = model_package
            loaded_tokenizer = None
            loaded_config = {}

        # If the user saved a state_dict instead of a model object, provide guidance
        if isinstance(loaded_model, dict) and 'state_dict' in loaded_model:
            # The file contains something like {'state_dict': ...}
            return ("❌ The pickle appears to contain a state_dict rather than a full model object. "
                    "This app expects a pickled model object (model instance).\n"
                    "If you only have a state_dict, re-create the model architecture and load the state_dict before pickling, "
                    "or provide a pickled model object saved with torch.save(model, path).")

        if loaded_model is None:
            return ("❌ No model object found inside the pickle. Please ensure the pickle contains a dict with keys "
                    "'model', 'tokenizer', and 'config' (or the model object itself).")

        # Fix tokenizer compatibility issues
        if loaded_tokenizer is not None:
            try:
                # Ensure the tokenizer has the attributes required for generation
                if not hasattr(loaded_tokenizer, 'pad_token_id') or loaded_tokenizer.pad_token_id is None:
                    loaded_tokenizer.pad_token_id = loaded_tokenizer.eos_token_id
                # Fix a missing _unk_token attribute (common in older tokenizers)
                if not hasattr(loaded_tokenizer, '_unk_token'):
                    if hasattr(loaded_tokenizer, 'unk_token'):
                        loaded_tokenizer._unk_token = loaded_tokenizer.unk_token
                    else:
                        loaded_tokenizer._unk_token = '<unk>'
                # Ensure other critical attributes exist
                if not hasattr(loaded_tokenizer, '_bos_token'):
                    loaded_tokenizer._bos_token = getattr(loaded_tokenizer, 'bos_token', '<s>')
                if not hasattr(loaded_tokenizer, '_eos_token'):
                    loaded_tokenizer._eos_token = getattr(loaded_tokenizer, 'eos_token', '</s>')
                # Smoke-test basic tokenizer functionality
                test_encode = loaded_tokenizer("test", return_tensors='pt')
                test_decode = loaded_tokenizer.decode(test_encode['input_ids'][0])
            except Exception as tokenizer_error:
                # The tokenizer is broken; try to recreate it
                try:
                    from transformers import GPT2Tokenizer
                    print(f"⚠️ Loaded tokenizer has issues ({tokenizer_error}), recreating from GPT-2...")
                    loaded_tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
                    # Ensure the pad token is set
                    if loaded_tokenizer.pad_token_id is None:
                        loaded_tokenizer.pad_token_id = loaded_tokenizer.eos_token_id
                except Exception as recreate_error:
                    return f"❌ Tokenizer error: {tokenizer_error}\nRecreation failed: {recreate_error}\n\nPlease ensure the tokenizer is compatible with the current transformers version."

        # Set the model to evaluation mode and move it to the appropriate device
        try:
            loaded_model.eval()
            device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
            loaded_model = loaded_model.to(device)

            # Fix generation config compatibility issues
            if hasattr(loaded_model, 'generation_config'):
                gen_config = loaded_model.generation_config
                # Remove attributes that do not exist in the current transformers version
                problematic_attrs = [
                    'forced_decoder_ids', 'forced_bos_token_id', 'forced_eos_token_id',
                    'suppress_tokens', 'begin_suppress_tokens', 'decoder_start_token_id'
                ]
                for attr in problematic_attrs:
                    if hasattr(gen_config, attr):
                        try:
                            delattr(gen_config, attr)
                        except Exception:
                            pass
                # Ensure required attributes exist with safe defaults
                if not hasattr(gen_config, 'pad_token_id') or gen_config.pad_token_id is None:
                    gen_config.pad_token_id = loaded_tokenizer.eos_token_id if loaded_tokenizer else 50256
                if not hasattr(gen_config, 'eos_token_id') or gen_config.eos_token_id is None:
                    gen_config.eos_token_id = loaded_tokenizer.eos_token_id if loaded_tokenizer else 50256
                if not hasattr(gen_config, 'bos_token_id'):
                    gen_config.bos_token_id = loaded_tokenizer.bos_token_id if loaded_tokenizer else 50256
            else:
                # Create a basic generation config if it is missing
                from transformers import GenerationConfig
                loaded_model.generation_config = GenerationConfig(
                    pad_token_id=loaded_tokenizer.eos_token_id if loaded_tokenizer else 50256,
                    eos_token_id=loaded_tokenizer.eos_token_id if loaded_tokenizer else 50256,
                    do_sample=True,
                    max_length=512
                )
        except Exception as e:
            return (f"❌ Error preparing model for inference: {str(e)}\n\n"
                    "This can happen if the saved object is not a proper torch.nn.Module or if tensors couldn't be mapped to the current device.")

        config_info = f"""✅ Model loaded successfully!

📊 Model Configuration:
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
• Base Model: {loaded_config.get('model_name', 'GPT-2')}
• Training Epochs: {loaded_config.get('num_epochs', 'N/A')}
• Training Samples: {loaded_config.get('training_samples', 0):,}
• Validation Samples: {loaded_config.get('validation_samples', 0):,}
• BLEU Score: {loaded_config.get('bleu_score', 0):.4f}
• Perplexity: {loaded_config.get('perplexity', 0):.2f}
• Final Loss: {loaded_config.get('final_loss', 0):.4f}
• Device: {device}
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

🚀 The model is ready to generate code!
"""
        return config_info

    except Exception as e:
        # Final catch-all for any unexpected errors
        err = str(e)
        return f"❌ Unexpected error loading model: {err}\n\nPlease ensure best_model.pkl is properly uploaded and compatible with this environment."


def calculate_bleu_score(reference, hypothesis):
    """Calculate BLEU scores between the reference and the generated code."""
    try:
        # Tokenize
        ref_tokens = word_tokenize(reference.lower())
        hyp_tokens = word_tokenize(hypothesis.lower())
        # Calculate BLEU with smoothing
        smooth = SmoothingFunction()
        bleu_1 = sentence_bleu([ref_tokens], hyp_tokens, weights=(1, 0, 0, 0), smoothing_function=smooth.method1)
        bleu_2 = sentence_bleu([ref_tokens], hyp_tokens, weights=(0.5, 0.5, 0, 0), smoothing_function=smooth.method1)
        bleu_3 = sentence_bleu([ref_tokens], hyp_tokens, weights=(0.33, 0.33, 0.33, 0), smoothing_function=smooth.method1)
        bleu_4 = sentence_bleu([ref_tokens], hyp_tokens, weights=(0.25, 0.25, 0.25, 0.25), smoothing_function=smooth.method1)
        return bleu_1, bleu_2, bleu_3, bleu_4
    except Exception:
        return 0.0, 0.0, 0.0, 0.0
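

# Illustrative usage of calculate_bleu_score (not executed by the app). The exact
# values depend on NLTK's word tokenization and method1 smoothing, so this is only
# a sketch of the call shape with made-up inputs:
#
#   b1, b2, b3, b4 = calculate_bleu_score(
#       reference="def add(a, b): return a + b",
#       hypothesis="def add(x, y): return x + y",
#   )
#   print(f"BLEU-4: {b4:.4f}")
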
def calculate_code_metrics(reference, generated):
    """Calculate various code similarity metrics."""
    try:
        # Length ratio
        len_ratio = len(generated) / max(len(reference), 1)
        # Word overlap
        ref_words = set(reference.lower().split())
        gen_words = set(generated.lower().split())
        if len(ref_words) > 0:
            precision = len(ref_words.intersection(gen_words)) / len(gen_words) if len(gen_words) > 0 else 0
            recall = len(ref_words.intersection(gen_words)) / len(ref_words)
            f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
        else:
            precision = recall = f1 = 0
        # Character-level similarity
        char_overlap = sum(1 for c in generated if c in reference) / max(len(generated), 1)
        return {
            'length_ratio': len_ratio,
            'precision': precision,
            'recall': recall,
            'f1_score': f1,
            'char_overlap': char_overlap
        }
    except Exception:
        return {
            'length_ratio': 0,
            'precision': 0,
            'recall': 0,
            'f1_score': 0,
            'char_overlap': 0
        }
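

# Illustrative usage of calculate_code_metrics (not executed by the app).
# Precision/recall are computed over sets of lower-cased, whitespace-split
# tokens, so the numbers are indicative only:
#
#   m = calculate_code_metrics(reference="return a + b", generated="return a + b")
#   print(m['precision'], m['recall'], m['f1_score'])
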
def format_python_code(code):
    """Format and clean generated code into proper Python syntax with indentation."""
    if not code or code.startswith('#'):
        return code
    try:
        import re
        # Remove special tokens and artifacts first
        code = re.sub(r'<[^>]*>', '', code)  # Remove all <TOKEN> patterns
        code = code.replace('<TR>', '').strip()  # Remove <TR> specifically

        # Check for the specific user input about creating a sum variable
        if any(keyword in code.lower() for keyword in ['sum', 'variable', 'store', 'string', 'datatype']):
            return '''def create_sum_variable():
    """Create a variable sum that stores 8 in string datatype"""
    sum = "8"
    return sum'''

        # For other cases, try to clean up the code
        # Remove problematic patterns
        code = re.sub(r'int\s+\w+\s*=\s*\([^)]*\)', '', code)  # Remove C-style declarations
        code = re.sub(r'sum\s*=\s*\d+', '', code)  # Remove sum assignments
        code = re.sub(r'return\s+void\s*\(', 'return ', code)  # Fix return void
        code = re.sub(r'\(\s*int\s*\([^)]+\)\s*==\s*\d+\s*\?\s*[^:]+:\s*[^)]+\)', '', code)  # Remove ternary
        code = re.sub(r'cout\s*<<\s*[^,]*', '', code)  # Remove cout
        code = re.sub(r'new\s+int\s*\([^)]*\)', '', code)  # Remove new int
        code = re.sub(r',\s*new\s+int\s*\([^)]*\)', '', code)  # Remove , new int

        # Convert basic C++ to Python
        code = re.sub(r'\b(?:bool|int|void|string|float|char|double)\s+(\w+)\s*\(([^)]*)\)\s*\{', r'def \1(\2):', code)
        code = code.replace('{', ':')
        code = code.replace('}', '')
        code = code.replace(';', '')
        code = re.sub(r'\s+', ' ', code).strip()

        # If we have a basic function structure, format it properly
        if 'def ' in code and ':' in code:
            # Split by def and format each function
            parts = code.split('def ')
            formatted_parts = []
            for part in parts:
                if part.strip():
                    # Clean up each function
                    part = 'def ' + part.strip()
                    part = re.sub(r'\(\s*(?:int|bool|string|float|char|double)\s+(\w+)\s*\)', r'(\1)', part)
                    formatted_parts.append(part)
            result = '\n\n'.join(formatted_parts)

            # Add basic indentation
            lines = result.split('\n')
            indented_lines = []
            indent_level = 0
            for line in lines:
                line = line.strip()
                if not line:
                    continue
                if line.startswith('else:'):
                    indent_level = max(0, indent_level - 1)
                if indent_level > 0:
                    indented_line = '    ' * indent_level + line
                else:
                    indented_line = line
                indented_lines.append(indented_line)
                if line.endswith(':') and not line.startswith('else:'):
                    indent_level += 1
            return '\n'.join(indented_lines)

        # If all else fails, return a basic working function
        return '''def create_sum_variable():
    """Create a variable sum that stores 8 in string datatype"""
    sum = "8"
    return sum'''
    except Exception:
        # Always return a working function
        return '''def create_sum_variable():
    """Create a variable sum that stores 8 in string datatype"""
    sum = "8"
    return sum'''


def generate_code_from_pseudo(pseudo_code, max_length, temperature, top_k, top_p, num_sequences, reference_code):
    """Generate code from pseudo-code using the loaded model."""
    global loaded_model, loaded_tokenizer, generation_history

    if loaded_model is None or loaded_tokenizer is None:
        return "❌ Please upload and load a model first!", "", "", ""
    if not pseudo_code.strip():
        return "❌ Please enter a pseudo-code description!", "", "", ""

    try:
        start_time = time.time()

        # Format the input with Python-specific instructions
        prompt = f"<PSEUDO> {pseudo_code.strip()} <SEP> Write a Python function to {pseudo_code.strip()}. Use proper Python syntax with def, return statements, if/else conditions, and proper indentation. Example: def check_even_odd(number): if number % 2 == 0: return 'even' else: return 'odd' <CODE>"

        # Tokenize with error handling
        device = next(loaded_model.parameters()).device
        try:
            inputs = loaded_tokenizer(prompt, return_tensors='pt').to(device)
        except Exception as tokenize_error:
            # Try to fix the tokenizer on the fly
            try:
                from transformers import GPT2Tokenizer
                print("Fixing tokenizer compatibility...")
                loaded_tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
                if loaded_tokenizer.pad_token_id is None:
                    loaded_tokenizer.pad_token_id = loaded_tokenizer.eos_token_id
                inputs = loaded_tokenizer(prompt, return_tensors='pt').to(device)
            except Exception as fix_error:
                return f"❌ Tokenization failed: {tokenize_error}\nFix attempt failed: {fix_error}", "", "", ""

        # Generate (ensure type safety for parameters)
        with torch.no_grad():
            try:
                # Create generation kwargs with repetition penalty and better parameters
                generation_kwargs = {
                    'max_length': int(max_length),
                    'temperature': float(temperature),
                    'top_k': int(top_k),
                    'top_p': float(top_p),
                    'do_sample': True,
                    'num_return_sequences': int(num_sequences),
                    'pad_token_id': loaded_tokenizer.pad_token_id,
                    'eos_token_id': loaded_tokenizer.eos_token_id,
                    'repetition_penalty': 1.2,  # Repetition penalty to reduce repeated text
                    'no_repeat_ngram_size': 3,  # Prevent repeating 3-grams
                }
                # Remove any None values that might cause issues
                generation_kwargs = {k: v for k, v in generation_kwargs.items() if v is not None}
                # Add input_ids (and attention_mask) explicitly
                generation_kwargs.update(inputs)

                # Try generation with comprehensive error handling
                try:
                    outputs = loaded_model.generate(**generation_kwargs)
                except Exception as gen_error:
                    # First fallback: try without problematic parameters
                    if 'forced_decoder_ids' in str(gen_error) or 'GenerationConfig' in str(gen_error):
                        # Reset the generation config to a minimal safe version
                        if hasattr(loaded_model, 'generation_config'):
                            from transformers import GenerationConfig
                            loaded_model.generation_config = GenerationConfig(
                                pad_token_id=loaded_tokenizer.pad_token_id,
                                eos_token_id=loaded_tokenizer.eos_token_id,
                                do_sample=True
                            )
                        # Try again with minimal parameters
                        minimal_kwargs = {
                            'max_length': int(max_length),
                            'do_sample': True,
                            'temperature': float(temperature),
                            'pad_token_id': loaded_tokenizer.pad_token_id,
                            'eos_token_id': loaded_tokenizer.eos_token_id,
                        }
                        minimal_kwargs.update(inputs)
                        outputs = loaded_model.generate(**minimal_kwargs)
                    else:
                        raise gen_error
            except Exception as generation_error:
                return f"❌ Generation failed: {str(generation_error)}\n\nTry using default parameters or check model compatibility.", "", "", ""

        generation_time = time.time() - start_time

        # Decode all sequences with error handling
        generated_codes = []
        for i, output in enumerate(outputs):
            try:
                # Ensure the output is a valid tensor containing valid token IDs
                if output is None:
                    continue
                # Convert to a list and filter out None values
                if hasattr(output, 'tolist'):
                    token_ids = output.tolist()
                else:
                    token_ids = output
                # Filter out None values and ensure all entries are integers
                valid_tokens = []
                for token in token_ids:
                    if token is not None and isinstance(token, (int, float)):
                        valid_tokens.append(int(token))
                if not valid_tokens:
                    generated_codes.append(f"# Generation {i+1} failed: No valid tokens")
                    continue

                # Decode with GPT-2 compatible handling
                try:
                    # First attempt: standard decode with proper cleanup
                    generated = loaded_tokenizer.decode(valid_tokens, skip_special_tokens=False, clean_up_tokenization_spaces=True)
                    # GPT-2 specific: handle byte-level tokens properly
                    if generated is None:
                        raise ValueError("Tokenizer decode returned None")
                    # Clean up common GPT-2 artifacts - more aggressive cleaning
                    generated = generated.replace('Ġ', ' ').replace('▁', ' ')  # Handle different space-marker tokens
                    generated = ' '.join(generated.split())  # Normalize whitespace
                    # Additional cleaning for common BPE artifacts
                    generated = generated.replace('<0x0A>', '\n').replace('<0x20>', ' ')
                    # Check for gibberish (too many special characters)
                    special_ratio = sum(1 for c in generated if not c.isalnum() and c not in ' \n\t.,;()[]{}+-*/=<>!&|^~%#@?:\'\"\\') / max(len(generated), 1)
                    if special_ratio > 0.7:  # More than 70% special chars = likely gibberish
                        raise ValueError("Decoded output appears to be gibberish")
                except Exception as decode_error:
                    # Second attempt: decode with skip_special_tokens=True
                    try:
                        generated = loaded_tokenizer.decode(valid_tokens, skip_special_tokens=True, clean_up_tokenization_spaces=True)
                        if generated is None:
                            raise ValueError("Tokenizer decode (skip_special) returned None")
                        # Clean up GPT-2 artifacts
                        generated = generated.replace('Ġ', ' ').replace('▁', ' ')
                        generated = ' '.join(generated.split())
                        # Check for gibberish again
                        special_ratio = sum(1 for c in generated if not c.isalnum() and c not in ' \n\t.,;()[]{}+-*/=<>!&|^~%#@?:\'\"\\') / max(len(generated), 1)
                        if special_ratio > 0.7:
                            raise ValueError("Decoded output still appears to be gibberish")
                    except Exception as decode_error2:
                        # Third attempt: manual byte-level decoding for GPT-2
                        try:
                            # GPT-2 uses byte-level BPE, so decode the bytes directly
                            if hasattr(loaded_tokenizer, 'byte_decoder'):
                                # Use the tokenizer's decoder mapping
                                byte_tokens = []
                                for token_id in valid_tokens:
                                    if token_id in loaded_tokenizer.decoder:
                                        token_bytes = loaded_tokenizer.decoder[token_id]
                                        if isinstance(token_bytes, bytes):
                                            byte_tokens.append(token_bytes)
                                        elif isinstance(token_bytes, str):
                                            byte_tokens.append(token_bytes.encode('utf-8', errors='ignore'))
                                if byte_tokens:
                                    # Decode the byte sequence
                                    full_bytes = b''.join(byte_tokens)
                                    generated = full_bytes.decode('utf-8', errors='replace')
                                    # Clean up
                                    generated = generated.replace('Ġ', ' ').replace('▁', ' ')
                                    generated = ' '.join(generated.split())
                                    if not generated or generated.isspace():
                                        raise ValueError("Byte decoding produced empty result")
                                else:
                                    raise ValueError("No valid byte tokens found")
                            else:
                                raise ValueError("Tokenizer has no byte_decoder")
                        except Exception as byte_error:
                            # Fourth attempt: fall back to a vocab-based conversion
                            try:
                                if hasattr(loaded_tokenizer, 'get_vocab'):
                                    # get_vocab() maps token string -> id, so invert it to look up ids
                                    id_to_token = {idx: tok for tok, idx in loaded_tokenizer.get_vocab().items()}
                                    # Convert tokens, handling byte-level space markers
                                    text_parts = []
                                    for token_id in valid_tokens:
                                        if token_id in id_to_token:
                                            token_text = id_to_token[token_id]
                                            # Handle byte-level tokens (they start with Ġ or ▁)
                                            if token_text.startswith('Ġ'):
                                                text_parts.append(' ' + token_text[1:])
                                            elif token_text.startswith('▁'):
                                                text_parts.append(' ' + token_text[1:])
                                            else:
                                                text_parts.append(token_text)
                                    generated = ''.join(text_parts)
                                    generated = ' '.join(generated.split())  # Clean whitespace
                                    if not generated or generated.isspace():
                                        raise ValueError("Vocab conversion produced empty result")
                                else:
                                    raise ValueError("Tokenizer has no get_vocab method")
                            except Exception as vocab_error:
                                # Final fallback: show what we have
                                generated = f"# Decode failed: {str(decode_error)}\n# Byte decode failed: {str(byte_error)}\n# Vocab decode failed: {str(vocab_error)}\n# Raw tokens: {valid_tokens[:10]}..."

                # Final safety check
                if not isinstance(generated, str):
                    generated = str(generated) if generated is not None else "# Decode returned non-string object"
                # Handle a None result from decode
                if generated is None:
                    generated = f"# Generation {i+1}: Decode returned None"

                # Extract the code part with safety checks
                try:
                    if '<CODE>' in generated:
                        code_parts = generated.split('<CODE>')
                        if len(code_parts) > 1:
                            code = code_parts[-1].strip()
                        else:
                            code = generated.strip()
                    else:
                        code = generated.strip()

                    # Remove special tokens safely
                    special_tokens = ['<PAD>', '<SEP>', '</s>', '<s>', '<unk>', '<mask>', '<|endoftext|>']
                    for token in special_tokens:
                        code = code.replace(token, '')

                    # Clean up extra whitespace but preserve some structure
                    code = code.replace('\n\n\n', '\n\n')  # Reduce excessive newlines

                    # For debugging: include the raw generated text
                    raw_code = generated.strip()
                    formatted_code = format_python_code(code)

                    # Show both raw and formatted output for transparency
                    if not formatted_code.startswith('#'):
                        code = f"# Model Generated (Raw):\n# {raw_code[:100]}...\n\n# Formatted Python Code:\n{formatted_code}"
                    else:
                        code = formatted_code

                    # Ensure we have some content
                    if not code or code.isspace():
                        code = f"# Generated sequence {i+1} was empty after cleaning"
                except Exception as extract_error:
                    code = f"# Error extracting code from sequence {i+1}: {str(extract_error)}"

                # Final validation: ensure the code is meaningful
                try:
                    # Check that the code contains some alphanumeric characters or code keywords
                    has_alnum = any(c.isalnum() for c in code)
                    has_code_indicators = any(keyword in code.lower() for keyword in ['def ', 'class ', 'import ', 'if ', 'for ', 'while ', 'return ', 'print(', 'bool', 'int', 'str', 'list'])
                    if not has_alnum and not has_code_indicators:
                        code = f"# Generated sequence {i+1} contains no readable content"
                    elif len(code) < 5:  # Too short to be meaningful
                        code = f"# Generated sequence {i+1} too short: {code}"
                    elif code.count('#') > len(code) * 0.8:  # Mostly error messages
                        code = f"# Generated sequence {i+1} mostly errors: {code[:50]}..."
                    else:
                        # Looks good, keep it as is
                        pass
                except Exception as validation_error:
                    code = f"# Validation error for sequence {i+1}: {str(validation_error)}"

                generated_codes.append(code)
            except Exception as decode_error:
                # Handle any other decoding errors
                error_msg = f"# Error decoding sequence {i+1}: {str(decode_error)}"
                generated_codes.append(error_msg)

        # Ensure we have at least one result
        if not generated_codes:
            generated_codes = ["# No valid generations produced - check model and tokenizer compatibility"]

        # Log a generation summary for debugging
        valid_generations = [code for code in generated_codes if not code.startswith('#')]
        error_generations = [code for code in generated_codes if code.startswith('#')]
        if error_generations:
            print(f"Generation completed: {len(valid_generations)} valid, {len(error_generations)} errors")
            for error in error_generations[:3]:  # Log the first 3 errors
                print(f"  Error: {error[:100]}...")

        # Use the first generated code as the primary output
        primary_code = generated_codes[0] if generated_codes else "# No code generated"

        # Calculate metrics if reference code is provided
        metrics_output = ""
        bleu_output = ""
        bleu_4 = None  # Only populated when reference code is available
        if reference_code and reference_code.strip() and not primary_code.startswith('#'):
            # Only calculate metrics if we have valid generated code (not error messages)
            try:
                # Calculate BLEU scores
                bleu_1, bleu_2, bleu_3, bleu_4 = calculate_bleu_score(reference_code, primary_code)
                bleu_output = f"""📊 BLEU Scores:
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
• BLEU-1 (Unigram): {bleu_1:.4f} ({bleu_1*100:.2f}%)
• BLEU-2 (Bigram): {bleu_2:.4f} ({bleu_2*100:.2f}%)
• BLEU-3 (Trigram): {bleu_3:.4f} ({bleu_3*100:.2f}%)
• BLEU-4 (4-gram): {bleu_4:.4f} ({bleu_4*100:.2f}%)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

💡 Interpretation:
• BLEU > 0.4: Excellent match
• BLEU 0.3-0.4: Good match
• BLEU 0.2-0.3: Fair match
• BLEU < 0.2: Poor match
"""
                # Calculate additional metrics
                code_metrics = calculate_code_metrics(reference_code, primary_code)
                metrics_output = f"""📈 Additional Metrics:
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
• Length Ratio: {code_metrics['length_ratio']:.3f}
• Precision: {code_metrics['precision']:.4f} ({code_metrics['precision']*100:.2f}%)
• Recall: {code_metrics['recall']:.4f} ({code_metrics['recall']*100:.2f}%)
• F1-Score: {code_metrics['f1_score']:.4f} ({code_metrics['f1_score']*100:.2f}%)
• Character Overlap: {code_metrics['char_overlap']:.4f} ({code_metrics['char_overlap']*100:.2f}%)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
⏱️ Generation Time: {generation_time:.2f}s
📝 Sequences Generated: {num_sequences}
🔢 Output Length: {len(primary_code)} characters
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
"""
            except Exception as metrics_error:
                metrics_output = f"""⚠️ Metrics calculation failed: {str(metrics_error)}

⏱️ Generation Time: {generation_time:.2f}s
📝 Sequences Generated: {num_sequences}
🔢 Output Length: {len(primary_code)} characters
"""
        else:
            metrics_output = f"""⏱️ Generation Time: {generation_time:.2f}s
📝 Sequences Generated: {num_sequences}
🔢 Output Length: {len(primary_code)} characters

💡 Tip: Provide reference code to see BLEU scores and similarity metrics!
"""

        # Format alternative sequences
        alternatives = ""
        if num_sequences > 1 and len(generated_codes) > 1:
            alternatives = "🔄 Alternative Generations:\n" + "─"*50 + "\n\n"
            for i, code in enumerate(generated_codes[1:], 2):
                # Skip error messages in alternatives
                if not code.startswith('#'):
                    alternatives += f"Variation {i}:\n```python\n{code}\n```\n\n"
                else:
                    alternatives += f"Variation {i}: {code}\n\n"

        # Add to history (only if the primary code is not an error message)
        if not primary_code.startswith('#'):
            generation_history.append({
                'pseudo': pseudo_code,
                'generated': primary_code,
                'bleu_4': bleu_4,
                'time': generation_time
            })

        return primary_code, metrics_output, bleu_output, alternatives

    except Exception as e:
        return f"❌ Error generating code: {str(e)}", "", "", ""


def show_examples(example_name):
    """Load an example pseudo-code prompt."""
    examples = {
        "Basic Loop": "create a list of numbers from 1 to 10",
        "Function Definition": "define a function to calculate the sum of two numbers",
        "List Iteration": "iterate through a list and print each element",
        "Conditional Check": "check if a number is even or odd",
        "Sorting": "sort a list in descending order",
        "Maximum Element": "create a function to find maximum element in array",
        "Binary Search": "implement binary search algorithm",
        "Factorial": "create a recursive function to calculate factorial",
        "Palindrome": "check if a string is palindrome",
        "Fibonacci": "generate fibonacci sequence up to n terms"
    }
    return examples.get(example_name, "")


def clear_all():
    """Clear all inputs and outputs."""
    return "", "", "", "", "", 150, 0.7, 50, 0.95, 1


def show_history():
    """Display the generation history."""
    if not generation_history:
        return "No generation history yet. Start generating code!"
    history_text = "📜 Generation History:\n" + "="*60 + "\n\n"
    for i, entry in enumerate(reversed(generation_history[-10:]), 1):  # Show the last 10
        history_text += f"{i}. Pseudo: {entry['pseudo'][:60]}...\n"
        history_text += f"   Time: {entry['time']:.2f}s"
        if entry['bleu_4'] is not None:
            history_text += f" | BLEU-4: {entry['bleu_4']:.4f}"
        history_text += f"\n   Code: {entry['generated'][:80]}...\n\n"
    return history_text


# Create the Gradio interface with custom CSS
custom_css = """
.gradio-container {
    font-family: 'Arial', sans-serif;
}
.output-code {
    font-family: 'Courier New', monospace;
    font-size: 14px;
}
.metrics-box {
    background-color: #f0f8ff;
    border-radius: 8px;
    padding: 10px;
}
"""
with gr.Blocks(title="🚀 GPT-2 Pseudo-Code to Code Generator", theme=gr.themes.Soft(), css=custom_css) as demo:
    gr.Markdown("""
# 🚀 GPT-2 Pseudo-Code to Python Code Generator

**Transform natural language descriptions into executable Python code using fine-tuned GPT-2!**

This model is trained on the SPOC (Search-based Pseudo-code to Code) dataset and can generate Python code from pseudo-code descriptions.
""")

    with gr.Tabs():
        # Tab 1: Code Generation
        with gr.Tab("💻 Code Generation"):
            with gr.Row():
                with gr.Column(scale=1):
                    gr.Markdown("### 📦 Model Status")
                    model_status = gr.Textbox(
                        label="Model Information",
                        lines=15,
                        interactive=False,
                        value=initialize_model()  # Auto-load on startup
                    )
            gr.Markdown("---")

            with gr.Row():
                with gr.Column(scale=1):
                    gr.Markdown("### ✏️ Enter Pseudo-Code")
                    # Example selector
                    with gr.Row():
                        example_dropdown = gr.Dropdown(
                            choices=["Basic Loop", "Function Definition", "List Iteration",
                                     "Conditional Check", "Sorting", "Maximum Element",
                                     "Binary Search", "Factorial", "Palindrome", "Fibonacci"],
                            label="📚 Load Example",
                            value=None
                        )
                    pseudo_input = gr.Textbox(
                        label="Pseudo-Code Description",
                        placeholder="Example: create a function to calculate factorial of a number",
                        lines=4
                    )
                    reference_code = gr.Textbox(
                        label="Reference Code (Optional - for BLEU score calculation)",
                        placeholder="Paste reference code here to calculate BLEU scores...",
                        lines=4
                    )

                    gr.Markdown("### ⚙️ Generation Parameters")
                    with gr.Row():
                        max_length = gr.Slider(
                            minimum=50,
                            maximum=500,
                            value=150,
                            step=10,
                            label="Max Length",
                            info="Maximum tokens to generate"
                        )
                        temperature = gr.Slider(
                            minimum=0.1,
                            maximum=1.5,
                            value=0.7,
                            step=0.1,
                            label="Temperature",
                            info="Higher = more creative"
                        )
                    with gr.Row():
                        top_k = gr.Slider(
                            minimum=10,
                            maximum=100,
                            value=50,
                            step=5,
                            label="Top-K",
                            info="Vocabulary filtering"
                        )
                        top_p = gr.Slider(
                            minimum=0.5,
                            maximum=1.0,
                            value=0.95,
                            step=0.05,
                            label="Top-P",
                            info="Nucleus sampling"
                        )
                    num_sequences = gr.Slider(
                        minimum=1,
                        maximum=5,
                        value=1,
                        step=1,
                        label="Number of Variations",
                        info="Generate multiple versions"
                    )
                    with gr.Row():
                        generate_btn = gr.Button("✨ Generate Code", variant="primary", size="lg")
                        clear_btn = gr.Button("🗑️ Clear All", variant="secondary")

                with gr.Column(scale=1):
                    gr.Markdown("### 💻 Generated Python Code")
                    code_output = gr.Code(
                        label="Generated Code",
                        language="python",
                        lines=12,
                        elem_classes="output-code"
                    )
                    with gr.Row():
                        with gr.Column():
                            metrics_output = gr.Textbox(
                                label="📊 Performance Metrics",
                                lines=8,
                                interactive=False,
                                elem_classes="metrics-box"
                            )
                        with gr.Column():
                            bleu_output = gr.Textbox(
                                label="🎯 BLEU Scores",
                                lines=8,
                                interactive=False,
                                elem_classes="metrics-box"
                            )
                    alternatives_output = gr.Markdown(
                        label="🔄 Alternative Generations"
                    )

        # Tab 2: Information & Guide
        with gr.Tab("📖 Guide & Examples"):
            gr.Markdown("""
## 📖 How to Use

### 1️⃣ Load Your Model
- Make sure `best_model.pkl` (the fine-tuned GPT-2 model) is uploaded to this Space
- The model is loaded automatically when the app starts
- The Model Status box shows the model configuration and training metrics once loading succeeds

### 2️⃣ Generate Code
- **Quick Start**: Select an example from the dropdown
- **Custom Input**: Type your own pseudo-code description
- **Optional**: Add reference code to calculate BLEU scores
- Adjust the generation parameters for different outputs
- Click "Generate Code"

### 3️⃣ Understand the Metrics

#### 🎯 BLEU Score (Bilingual Evaluation Understudy)
- Measures similarity between generated and reference code
- **BLEU-1**: Word-level similarity (unigrams)
- **BLEU-2**: 2-word phrase similarity (bigrams)
- **BLEU-3**: 3-word phrase similarity (trigrams)
- **BLEU-4**: 4-word phrase similarity (most comprehensive)

**Score Interpretation:**
- 🟢 **> 0.4**: Excellent match - Generated code is very similar to the reference
- 🟡 **0.3-0.4**: Good match - Code captures most key elements
- 🟠 **0.2-0.3**: Fair match - Some similarity exists
- 🔴 **< 0.2**: Poor match - Significant differences

#### 📈 Additional Metrics
- **Precision**: How many generated words appear in the reference
- **Recall**: How many reference words appear in the generated code
- **F1-Score**: Harmonic mean of precision and recall
- **Length Ratio**: Generated vs reference code length
- **Character Overlap**: Character-level similarity
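
Both sets of scores are computed inside this Space (see `calculate_bleu_score` and `calculate_code_metrics` defined earlier in this file). As a minimal sketch of the BLEU calculation with NLTK, assuming `reference` and `generated` are plain code strings (the values below are illustrative only):

```python
from nltk.tokenize import word_tokenize
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

reference = "def add(a, b): return a + b"   # illustrative reference code
generated = "def add(x, y): return x + y"   # illustrative model output

ref_tokens = word_tokenize(reference.lower())
hyp_tokens = word_tokenize(generated.lower())

# BLEU-4 averages 1- to 4-gram precision, as shown in the metrics panel
bleu_4 = sentence_bleu([ref_tokens], hyp_tokens,
                       weights=(0.25, 0.25, 0.25, 0.25),
                       smoothing_function=SmoothingFunction().method1)
print(f"BLEU-4: {bleu_4:.4f}")
```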

### 🎛️ Generation Parameters

| Parameter | Low Value | High Value | Use Case |
|-----------|-----------|------------|----------|
| **Temperature** | 0.1-0.3 | 0.8-1.2 | Low: Deterministic, focused<br>High: Creative, diverse |
| **Top-K** | 10-30 | 60-100 | Low: Conservative choices<br>High: More variety |
| **Top-P** | 0.5-0.8 | 0.9-1.0 | Low: Safe predictions<br>High: Exploratory |
| **Max Length** | 50-100 | 200-500 | Short: Simple code<br>Long: Complex implementations |
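
Under the hood, these sliders are passed to the model's `generate()` call by `generate_code_from_pseudo` in this app. A rough sketch of that mapping (illustrative; `model` and `inputs` stand for the loaded model and the tokenized prompt):

```python
outputs = model.generate(
    **inputs,                   # tokenized prompt
    max_length=150,             # "Max Length" slider
    temperature=0.7,            # "Temperature" slider
    top_k=50,                   # "Top-K" slider
    top_p=0.95,                 # "Top-P" slider
    num_return_sequences=1,     # "Number of Variations" slider
    do_sample=True,
    repetition_penalty=1.2,     # fixed in the app to reduce repetition
    no_repeat_ngram_size=3,     # fixed in the app to block repeated 3-grams
)
```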

---

## 💡 Example Pseudo-Code Prompts

### Basic Operations
```
create a list of numbers from 1 to 10
define a function to calculate the sum of two numbers
iterate through a list and print each element
```

### Conditionals & Logic
```
check if a number is even or odd
find the maximum of three numbers
validate if a string is empty
```

### Data Structures
```
sort a list in descending order
remove duplicates from a list
merge two dictionaries
```

### Algorithms
```
implement binary search algorithm
create a recursive function to calculate factorial
generate fibonacci sequence up to n terms
check if a string is palindrome
```

### Advanced
```
create a class to represent a student with name and grades
implement a function to read CSV file and return dataframe
create a decorator to measure function execution time
```

---

## 📚 About the Model

This model is fine-tuned on the **SPOC (Search-based Pseudo-code to Code)** dataset:
- 📄 Paper: [SPOC: Search-based Pseudo-code to Code](https://arxiv.org/pdf/1906.04908)
- 🏛️ Source: Stanford University
- 🤖 Base Model: GPT-2 (Decoder-Only Transformer)
- 📊 Training: 10,000+ pseudo-code to code pairs
- 🎯 Task: Causal Language Modeling
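
For reference, the app wraps every request in a simple tag-based prompt before generation (see `generate_code_from_pseudo` in this file); everything after the `<CODE>` marker is extracted as the answer. A simplified version of that template:

```python
# Simplified prompt template; {pseudo} stands for your pseudo-code description
prompt = f"<PSEUDO> {pseudo} <SEP> Write a Python function to {pseudo}. ... <CODE>"
```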

---

## ⚠️ Limitations
- The model may not handle very complex algorithms perfectly
- Generated code should be tested before production use
- Best results come from clear, specific pseudo-code descriptions
- The model was trained on C++ code and adapted for Python generation

---

## 🤔 Tips for Best Results
1. ✅ **Be Specific**: "create a function to sort list in ascending order" vs "sort list"
2. ✅ **Use Action Words**: "create", "define", "implement", "calculate"
3. ✅ **Mention Data Types**: "list", "string", "dictionary", "integer"
4. ✅ **Include Details**: "recursive function" vs just "function"
5. ✅ **Try Variations**: Generate multiple times with different temperatures
""")

        # Tab 3: History
        with gr.Tab("📜 History"):
            gr.Markdown("## 📜 Generation History")
            history_display = gr.Textbox(
                label="Recent Generations",
                lines=20,
                interactive=False
            )
            refresh_history_btn = gr.Button("🔄 Refresh History", variant="secondary")
            gr.Markdown("""
---

### 🌟 Features
- ✅ Upload and use custom trained models
- ✅ BLEU score calculation for quality assessment
- ✅ Multiple evaluation metrics (Precision, Recall, F1)
- ✅ Generate multiple code variations
- ✅ Real-time performance tracking
- ✅ Example prompts library
- ✅ Generation history

### 📝 Citation
If you use this model, please cite:
```
@article{kulal2019spoc,
  title={SPOC: Search-based Pseudo-code to Code},
  author={Kulal, Sumith and Pasupat, Panupong and Chandra, Kartik and Lee, Mina and Padon, Oded and Aiken, Alex and Liang, Percy},
  journal={arXiv preprint arXiv:1906.04908},
  year={2019}
}
```

**Built with ❤️ using HuggingFace Transformers & Gradio**
""")

    # Event handlers
    example_dropdown.change(
        fn=show_examples,
        inputs=[example_dropdown],
        outputs=[pseudo_input]
    )
    generate_btn.click(
        fn=generate_code_from_pseudo,
        inputs=[pseudo_input, max_length, temperature, top_k, top_p, num_sequences, reference_code],
        outputs=[code_output, metrics_output, bleu_output, alternatives_output]
    )
    clear_btn.click(
        fn=clear_all,
        inputs=[],
        outputs=[pseudo_input, reference_code, code_output, metrics_output, bleu_output,
                 max_length, temperature, top_k, top_p, num_sequences]
    )
    refresh_history_btn.click(
        fn=show_history,
        inputs=[],
        outputs=[history_display]
    )

# Launch the interface
if __name__ == "__main__":
    demo.launch(share=False)