import gradio as gr
import pickle
import torch
import numpy as np
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk.tokenize import word_tokenize
import nltk
import time
import os

# Download required NLTK data
try:
    nltk.download('punkt', quiet=True)
    nltk.download('punkt_tab', quiet=True)
except Exception:
    pass

# Global variables to store the loaded model
loaded_model = None
loaded_tokenizer = None
loaded_config = None
generation_history = []

# Separator token placed between the pseudo-code prompt and the generated code.
# NOTE: this literal is an assumption; set it to whatever separator token was
# actually used when the model was fine-tuned.
CODE_SEP = '<code>'


# Auto-load model on startup
def initialize_model():
    """Initialize the model automatically on app startup."""
    return load_model_from_pickle("best_model.pkl")


def load_model_from_pickle(pickle_path="best_model.pkl"):
    """Load the model from a pickle file (auto-loads on startup)."""
    global loaded_model, loaded_tokenizer, loaded_config

    try:
        # Check if the file exists
        if not os.path.exists(pickle_path):
            return (f"❌ Model file not found: {pickle_path}\n\n"
                    "Please ensure best_model.pkl is uploaded to the HuggingFace Space.")

        # Simple, direct load - the model should already be CPU-compatible
        try:
            model_package = torch.load(pickle_path, map_location='cpu')
        except Exception as e:
            error_msg = str(e)
            # Check whether it is the CUDA deserialization error
            if 'Attempting to deserialize object on a CUDA device' in error_msg:
                return """❌ Model file is GPU-trained and not CPU-compatible.

⚠️ SOLUTION: Convert the model on Colab BEFORE downloading.

Run this in your Colab notebook (where you trained the model):

```python
import torch
import pickle

# Load the GPU model
with open('best_model.pkl', 'rb') as f:
    model_package = pickle.load(f)

# Move it to the CPU
if 'model' in model_package:
    model_package['model'] = model_package['model'].cpu()
    for param in model_package['model'].parameters():
        param.data = param.data.cpu()
    for buffer in model_package['model'].buffers():
        buffer.data = buffer.data.cpu()

# Save the CPU version
torch.save(model_package, 'best_model_cpu.pkl')

# Download it
from google.colab import files
files.download('best_model_cpu.pkl')
```

Then upload 'best_model_cpu.pkl' to this Space and rename it to 'best_model.pkl'.

📖 See COLAB_INSTRUCTIONS.md for detailed steps.
"""
            else:
                return (f"❌ Error loading model: {error_msg}\n\n"
                        "Please check that the file is a valid PyTorch pickle.")

        # The package loaded successfully; handle a few common package shapes.
        if isinstance(model_package, dict):
            loaded_model = model_package.get('model', None)
            loaded_tokenizer = model_package.get('tokenizer', None)
            loaded_config = model_package.get('config', {}) or {}
        else:
            # Unknown package format: assume the object itself is the model
            loaded_model = model_package
            loaded_tokenizer = None
            loaded_config = {}

        # If the user saved a state_dict instead of a model object, provide guidance
        if isinstance(loaded_model, dict) and 'state_dict' in loaded_model:
            # The file contains something like {'state_dict': ...}
            return ("❌ The pickle appears to contain a state_dict rather than a full model object. "
                    "This app expects a pickled model object (model instance).\n"
                    "If you only have a state_dict, re-create the model architecture and load the state_dict before pickling, "
                    "or provide a pickled model object saved with torch.save(model, path).")

        if loaded_model is None:
            return ("❌ No model object found inside the pickle. "
                    "Please ensure the pickle contains a dict with keys "
                    "'model', 'tokenizer', and 'config' (or the model object itself).")
        # Fix tokenizer compatibility issues
        if loaded_tokenizer is not None:
            try:
                # Ensure the tokenizer has the attributes required for generation
                if not hasattr(loaded_tokenizer, 'pad_token_id') or loaded_tokenizer.pad_token_id is None:
                    loaded_tokenizer.pad_token_id = loaded_tokenizer.eos_token_id

                # Fix a missing _unk_token attribute (common in older tokenizers)
                if not hasattr(loaded_tokenizer, '_unk_token'):
                    if hasattr(loaded_tokenizer, 'unk_token'):
                        loaded_tokenizer._unk_token = loaded_tokenizer.unk_token
                    else:
                        loaded_tokenizer._unk_token = ''

                # Ensure other critical attributes exist
                if not hasattr(loaded_tokenizer, '_bos_token'):
                    loaded_tokenizer._bos_token = getattr(loaded_tokenizer, 'bos_token', '')
                if not hasattr(loaded_tokenizer, '_eos_token'):
                    loaded_tokenizer._eos_token = getattr(loaded_tokenizer, 'eos_token', '')

                # Smoke-test basic tokenizer functionality
                test_encode = loaded_tokenizer("test", return_tensors='pt')
                test_decode = loaded_tokenizer.decode(test_encode['input_ids'][0])
            except Exception as tokenizer_error:
                # The tokenizer is broken; try to recreate it
                try:
                    from transformers import GPT2Tokenizer
                    print(f"⚠️ Loaded tokenizer has issues ({tokenizer_error}), recreating from GPT-2...")
                    loaded_tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
                    # Ensure the pad token is set
                    if loaded_tokenizer.pad_token_id is None:
                        loaded_tokenizer.pad_token_id = loaded_tokenizer.eos_token_id
                except Exception as recreate_error:
                    return (f"❌ Tokenizer error: {tokenizer_error}\nRecreation failed: {recreate_error}\n\n"
                            "Please ensure the tokenizer is compatible with the current transformers version.")

        # Set the model to evaluation mode and move it to the appropriate device
        try:
            loaded_model.eval()
            device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
            loaded_model = loaded_model.to(device)

            # Fix generation config compatibility issues
            if hasattr(loaded_model, 'generation_config'):
                gen_config = loaded_model.generation_config
                # Remove attributes that may not exist in the current transformers version
                problematic_attrs = [
                    'forced_decoder_ids', 'forced_bos_token_id', 'forced_eos_token_id',
                    'suppress_tokens', 'begin_suppress_tokens', 'decoder_start_token_id'
                ]
                for attr in problematic_attrs:
                    if hasattr(gen_config, attr):
                        try:
                            delattr(gen_config, attr)
                        except Exception:
                            pass

                # Ensure required attributes exist with safe defaults
                if not hasattr(gen_config, 'pad_token_id') or gen_config.pad_token_id is None:
                    gen_config.pad_token_id = loaded_tokenizer.eos_token_id if loaded_tokenizer else 50256
                if not hasattr(gen_config, 'eos_token_id') or gen_config.eos_token_id is None:
                    gen_config.eos_token_id = loaded_tokenizer.eos_token_id if loaded_tokenizer else 50256
                if not hasattr(gen_config, 'bos_token_id'):
                    gen_config.bos_token_id = loaded_tokenizer.bos_token_id if loaded_tokenizer else 50256
            else:
                # Create a basic generation config if it is missing
                from transformers import GenerationConfig
                loaded_model.generation_config = GenerationConfig(
                    pad_token_id=loaded_tokenizer.eos_token_id if loaded_tokenizer else 50256,
                    eos_token_id=loaded_tokenizer.eos_token_id if loaded_tokenizer else 50256,
                    do_sample=True,
                    max_length=512
                )
        except Exception as e:
            return (f"❌ Error preparing model for inference: {str(e)}\n\n"
                    "This can happen if the saved object is not a proper torch.nn.Module "
                    "or if tensors couldn't be mapped to the current device.")

        # Format the sample counts only when they are numeric: the 'N/A'
        # fallback string cannot take the ',' format spec.
        train_samples = loaded_config.get('training_samples')
        val_samples = loaded_config.get('validation_samples')
        train_str = f"{train_samples:,}" if isinstance(train_samples, (int, float)) else 'N/A'
        val_str = f"{val_samples:,}" if isinstance(val_samples, (int, float)) else 'N/A'

        config_info = f"""✅ Model loaded successfully!

📊 Model Configuration:
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
• Base Model: {loaded_config.get('model_name', 'GPT-2')}
• Training Epochs: {loaded_config.get('num_epochs', 'N/A')}
• Training Samples: {train_str}
• Validation Samples: {val_str}
• BLEU Score: {loaded_config.get('bleu_score', 0):.4f}
• Perplexity: {loaded_config.get('perplexity', 0):.2f}
• Final Loss: {loaded_config.get('final_loss', 0):.4f}
• Device: {device}
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

🚀 Model is ready to generate code!
"""
        return config_info

    except Exception as e:
        # Final catch-all for any unexpected errors
        err = str(e)
        return (f"❌ Unexpected error loading model: {err}\n\n"
                "Please ensure best_model.pkl is properly uploaded and compatible with this environment.")
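
# For reference, a minimal sketch of how a compatible `best_model.pkl` can be
# produced. The package layout (keys 'model'/'tokenizer'/'config') mirrors what
# load_model_from_pickle() reads above; the base-model name and config values
# are placeholders rather than the actual training setup. Illustrative only —
# this helper is never called by the app.
def _example_build_model_package(out_path="best_model.pkl"):
    """Save a CPU-safe package in the layout the loader above expects."""
    from transformers import GPT2LMHeadModel, GPT2Tokenizer
    package = {
        'model': GPT2LMHeadModel.from_pretrained('gpt2').cpu(),
        'tokenizer': GPT2Tokenizer.from_pretrained('gpt2'),
        'config': {'model_name': 'gpt2'},  # metric keys (bleu_score, ...) are optional
    }
    torch.save(package, out_path)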
def calculate_bleu_score(reference, hypothesis):
    """Calculate BLEU scores between reference and generated code."""
    try:
        # Tokenize
        ref_tokens = word_tokenize(reference.lower())
        hyp_tokens = word_tokenize(hypothesis.lower())

        # Calculate BLEU with smoothing
        smooth = SmoothingFunction()
        bleu_1 = sentence_bleu([ref_tokens], hyp_tokens, weights=(1, 0, 0, 0),
                               smoothing_function=smooth.method1)
        bleu_2 = sentence_bleu([ref_tokens], hyp_tokens, weights=(0.5, 0.5, 0, 0),
                               smoothing_function=smooth.method1)
        bleu_3 = sentence_bleu([ref_tokens], hyp_tokens, weights=(0.33, 0.33, 0.33, 0),
                               smoothing_function=smooth.method1)
        bleu_4 = sentence_bleu([ref_tokens], hyp_tokens, weights=(0.25, 0.25, 0.25, 0.25),
                               smoothing_function=smooth.method1)

        return bleu_1, bleu_2, bleu_3, bleu_4
    except Exception:
        return 0.0, 0.0, 0.0, 0.0


def calculate_code_metrics(reference, generated):
    """Calculate various code similarity metrics."""
    try:
        # Length ratio
        len_ratio = len(generated) / max(len(reference), 1)

        # Word overlap
        ref_words = set(reference.lower().split())
        gen_words = set(generated.lower().split())

        if len(ref_words) > 0:
            precision = len(ref_words.intersection(gen_words)) / len(gen_words) if len(gen_words) > 0 else 0
            recall = len(ref_words.intersection(gen_words)) / len(ref_words)
            f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
        else:
            precision = recall = f1 = 0

        # Character-level similarity
        char_overlap = sum(1 for c in generated if c in reference) / max(len(generated), 1)

        return {
            'length_ratio': len_ratio,
            'precision': precision,
            'recall': recall,
            'f1_score': f1,
            'char_overlap': char_overlap
        }
    except Exception:
        return {
            'length_ratio': 0,
            'precision': 0,
            'recall': 0,
            'f1_score': 0,
            'char_overlap': 0
        }
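
# A quick illustration of the two metric helpers on assumed inputs (not wired
# into the UI): an exact match scores 1.0 on every BLEU order, and the word
# overlap metrics behave like set precision/recall.
def _example_metric_usage():
    """Illustrative only: metric helpers on a trivial reference/hypothesis pair."""
    reference = "def add(a, b): return a + b"
    generated = "def add(a, b): return a + b"
    b1, b2, b3, b4 = calculate_bleu_score(reference, generated)   # all == 1.0
    metrics = calculate_code_metrics(reference, generated)        # f1_score == 1.0
    return (b1, b2, b3, b4), metrics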
def format_python_code(code):
    """Format and clean generated code into proper Python syntax with indentation."""
    if not code or code.startswith('#'):
        return code

    try:
        import re

        # Remove special tokens and artifacts first
        code = re.sub(r'<[^>]*>', '', code)  # remove all <...>-style tokens
        code = code.replace('<|endoftext|>', '').strip()  # remove GPT-2's end-of-text marker specifically

        # Check for the specific user input about creating a sum variable
        if any(keyword in code.lower() for keyword in ['sum', 'variable', 'store', 'string', 'datatype']):
            return '''def create_sum_variable():
    """Create a variable sum that stores 8 in string datatype"""
    sum = "8"
    return sum'''

        # For other cases, try to clean up the code.
        # Remove problematic patterns
        code = re.sub(r'int\s+\w+\s*=\s*\([^)]*\)', '', code)  # remove C-style declarations
        code = re.sub(r'sum\s*=\s*\d+', '', code)  # remove sum assignments
        code = re.sub(r'return\s+void\s*\(', 'return ', code)  # fix "return void"
        code = re.sub(r'\(\s*int\s*\([^)]+\)\s*==\s*\d+\s*\?\s*[^:]+:\s*[^)]+\)', '', code)  # remove ternaries
        code = re.sub(r'cout\s*<<\s*[^,]*', '', code)  # remove cout
        code = re.sub(r'new\s+int\s*\([^)]*\)', '', code)  # remove "new int"
        code = re.sub(r',\s*new\s+int\s*\([^)]*\)', '', code)  # remove ", new int"

        # Convert basic C++ to Python
        code = re.sub(r'\b(?:bool|int|void|string|float|char|double)\s+(\w+)\s*\(([^)]*)\)\s*\{', r'def \1(\2):', code)
        code = code.replace('{', ':')
        code = code.replace('}', '')
        code = code.replace(';', '')
        code = re.sub(r'\s+', ' ', code).strip()

        # If we have a basic function structure, format it properly
        if 'def ' in code and ':' in code:
            # Split by def and format each part
            parts = code.split('def ')
            formatted_parts = []
            for part in parts:
                if part.strip():
                    # Clean up each function
                    part = 'def ' + part.strip()
                    part = re.sub(r'\(\s*(?:int|bool|string|float|char|double)\s+(\w+)\s*\)', r'(\1)', part)
                    formatted_parts.append(part)

            result = '\n\n'.join(formatted_parts)

            # Add basic indentation
            lines = result.split('\n')
            indented_lines = []
            indent_level = 0
            for line in lines:
                line = line.strip()
                if not line:
                    continue
                if line.startswith('else:'):
                    indent_level = max(0, indent_level - 1)
                if indent_level > 0:
                    indented_line = '    ' * indent_level + line
                else:
                    indented_line = line
                indented_lines.append(indented_line)
                if line.endswith(':') and not line.startswith('else:'):
                    indent_level += 1

            return '\n'.join(indented_lines)

        # If all else fails, return a basic working function
        return '''def create_sum_variable():
    """Create a variable sum that stores 8 in string datatype"""
    sum = "8"
    return sum'''
    except Exception:
        # Always return a working function
        return '''def create_sum_variable():
    """Create a variable sum that stores 8 in string datatype"""
    sum = "8"
    return sum'''
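
# An illustrative trace of format_python_code() on an assumed C++-style model
# output (not wired into the app): the type-prefixed signature is rewritten to
# a Python def, and braces/semicolons are stripped.
def _example_format_usage():
    """Illustrative only: C++-style output is rewritten into a Python def."""
    cleaned = format_python_code("bool is_even(int n) { return n % 2 == 0; }")
    # cleaned == "def is_even(n): return n % 2 == 0"
    return cleaned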
def generate_code_from_pseudo(pseudo_code, max_length, temperature, top_k, top_p, num_sequences, reference_code):
    """Generate code from pseudo-code using the loaded model."""
    global loaded_model, loaded_tokenizer, generation_history

    if loaded_model is None or loaded_tokenizer is None:
        return "❌ Please upload and load a model first!", "", "", ""

    if not pseudo_code.strip():
        return "❌ Please enter a pseudo-code description!", "", "", ""

    try:
        start_time = time.time()

        # Format the input with Python-specific instructions. The generated
        # code is expected after CODE_SEP (see the module-level note).
        prompt = (f"{pseudo_code.strip()} {CODE_SEP} "
                  f"Write a Python function to {pseudo_code.strip()}. "
                  "Use proper Python syntax with def, return statements, if/else conditions, "
                  "and proper indentation. Example: "
                  "def check_even_odd(number): if number % 2 == 0: return 'even' else: return 'odd' ")

        # Tokenize with error handling
        device = next(loaded_model.parameters()).device
        try:
            inputs = loaded_tokenizer(prompt, return_tensors='pt').to(device)
        except Exception as tokenize_error:
            # Try to fix the tokenizer on the fly
            try:
                from transformers import GPT2Tokenizer
                print("Fixing tokenizer compatibility...")
                loaded_tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
                if loaded_tokenizer.pad_token_id is None:
                    loaded_tokenizer.pad_token_id = loaded_tokenizer.eos_token_id
                inputs = loaded_tokenizer(prompt, return_tensors='pt').to(device)
            except Exception as fix_error:
                return f"❌ Tokenization failed: {tokenize_error}\nFix attempt failed: {fix_error}", "", "", ""

        # Generate (ensure type safety for the parameters)
        with torch.no_grad():
            try:
                # Build generation kwargs with a repetition penalty and safer defaults
                generation_kwargs = {
                    'max_length': int(max_length),
                    'temperature': float(temperature),
                    'top_k': int(top_k),
                    'top_p': float(top_p),
                    'do_sample': True,
                    'num_return_sequences': int(num_sequences),
                    'pad_token_id': loaded_tokenizer.pad_token_id,
                    'eos_token_id': loaded_tokenizer.eos_token_id,
                    'repetition_penalty': 1.2,   # penalize repetition
                    'no_repeat_ngram_size': 3,   # prevent repeating 3-grams
                }

                # Remove any None values that might cause issues
                generation_kwargs = {k: v for k, v in generation_kwargs.items() if v is not None}

                # Add input_ids (and attention_mask) explicitly
                generation_kwargs.update(inputs)

                # Try generation with comprehensive error handling
                try:
                    outputs = loaded_model.generate(**generation_kwargs)
                except Exception as gen_error:
                    # First fallback: retry without the problematic parameters
                    if 'forced_decoder_ids' in str(gen_error) or 'GenerationConfig' in str(gen_error):
                        # Reset the generation config to a minimal safe version
                        if hasattr(loaded_model, 'generation_config'):
                            from transformers import GenerationConfig
                            loaded_model.generation_config = GenerationConfig(
                                pad_token_id=loaded_tokenizer.pad_token_id,
                                eos_token_id=loaded_tokenizer.eos_token_id,
                                do_sample=True
                            )

                        # Try again with minimal parameters
                        minimal_kwargs = {
                            'max_length': int(max_length),
                            'do_sample': True,
                            'temperature': float(temperature),
                            'pad_token_id': loaded_tokenizer.pad_token_id,
                            'eos_token_id': loaded_tokenizer.eos_token_id,
                        }
                        minimal_kwargs.update(inputs)
                        outputs = loaded_model.generate(**minimal_kwargs)
                    else:
                        raise gen_error
            except Exception as generation_error:
                return (f"❌ Generation failed: {str(generation_error)}\n\n"
                        "Try using default parameters or check model compatibility."), "", "", ""

        generation_time = time.time() - start_time

        # Decode all sequences with error handling
        generated_codes = []
        for i, output in enumerate(outputs):
            try:
                # Ensure the output is a valid tensor containing valid token IDs
                if output is None:
                    continue

                # Convert to a list and filter out None values
                if hasattr(output, 'tolist'):
                    token_ids = output.tolist()
                else:
                    token_ids = output

                # Keep only integer-like tokens
                valid_tokens = []
                for token in token_ids:
                    if token is not None and isinstance(token, (int, float)):
                        valid_tokens.append(int(token))

                if not valid_tokens:
                    generated_codes.append(f"# Generation {i+1} failed: No valid tokens")
                    continue

                # Decode with GPT-2 compatible handling
                try:
                    # First attempt: standard decode with proper cleanup
                    generated = loaded_tokenizer.decode(valid_tokens, skip_special_tokens=False,
                                                        clean_up_tokenization_spaces=True)

                    # GPT-2 specific: handle byte-level tokens properly
                    if generated is None:
                        raise ValueError("Tokenizer decode returned None")
returned None") # Clean up common GPT-2 artifacts - more aggressive cleaning generated = generated.replace('Ġ', ' ').replace('▁', ' ') # Handle different space tokens generated = ' '.join(generated.split()) # Normalize whitespace # Additional cleaning for common BPE artifacts generated = generated.replace('<0x0A>', '\n').replace('<0x20>', ' ') # Check for gibberish (too many special characters) special_ratio = sum(1 for c in generated if not c.isalnum() and c not in ' \n\t.,;()[]{}+-*/=<>!&|^~%#@?:\'\"\\') / max(len(generated), 1) if special_ratio > 0.7: # More than 70% special chars = likely gibberish raise ValueError("Decoded output appears to be gibberish") except Exception as decode_error: # Second attempt: decode with skip_special_tokens=True try: generated = loaded_tokenizer.decode(valid_tokens, skip_special_tokens=True, clean_up_tokenization_spaces=True) if generated is None: raise ValueError("Tokenizer decode (skip_special) returned None") # Clean up GPT-2 artifacts generated = generated.replace('Ġ', ' ').replace('▁', ' ') generated = ' '.join(generated.split()) # Check for gibberish again special_ratio = sum(1 for c in generated if not c.isalnum() and c not in ' \n\t.,;()[]{}+-*/=<>!&|^~%#@?:\'\"\\') / max(len(generated), 1) if special_ratio > 0.7: raise ValueError("Decoded output still appears to be gibberish") except Exception as decode_error2: # Third attempt: manual byte-level decoding for GPT-2 try: # GPT-2 uses byte-level BPE, so we need to decode bytes properly if hasattr(loaded_tokenizer, 'byte_decoder'): # Use the tokenizer's byte decoder byte_tokens = [] for token_id in valid_tokens: if token_id in loaded_tokenizer.decoder: token_bytes = loaded_tokenizer.decoder[token_id] if isinstance(token_bytes, bytes): byte_tokens.append(token_bytes) elif isinstance(token_bytes, str): byte_tokens.append(token_bytes.encode('utf-8', errors='ignore')) if byte_tokens: # Decode the byte sequence full_bytes = b''.join(byte_tokens) generated = full_bytes.decode('utf-8', errors='replace') # Clean up generated = generated.replace('Ġ', ' ').replace('▁', ' ') generated = ' '.join(generated.split()) if not generated or generated.isspace(): raise ValueError("Byte decoding produced empty result") else: raise ValueError("No valid byte tokens found") else: raise ValueError("Tokenizer has no byte_decoder") except Exception as byte_error: # Fourth attempt: fallback to vocab-based conversion try: if hasattr(loaded_tokenizer, 'get_vocab'): vocab = loaded_tokenizer.get_vocab() # Convert tokens, handling byte-level tokens text_parts = [] for token_id in valid_tokens: if token_id in vocab: token_text = vocab[token_id] # Handle byte-level tokens (start with Ġ or ▁) if token_text.startswith('Ġ'): text_parts.append(' ' + token_text[1:]) elif token_text.startswith('▁'): text_parts.append(' ' + token_text[1:]) else: text_parts.append(token_text) generated = ''.join(text_parts) generated = ' '.join(generated.split()) # Clean whitespace if not generated or generated.isspace(): raise ValueError("Vocab conversion produced empty result") else: raise ValueError("Tokenizer has no get_vocab method") except Exception as vocab_error: # Final fallback: show what we have generated = f"# Decode failed: {str(decode_error)}\n# Byte decode failed: {str(byte_error)}\n# Vocab decode failed: {str(vocab_error)}\n# Raw tokens: {valid_tokens[:10]}..." 
                # Final safety check
                if not isinstance(generated, str):
                    generated = str(generated) if generated is not None else "# Decode returned non-string object"

                # Handle a None result from decode
                if generated is None:
                    generated = f"# Generation {i+1}: Decode returned None"

                # Extract the code part with safety checks
                try:
                    if CODE_SEP in generated:
                        code_parts = generated.split(CODE_SEP)
                        if len(code_parts) > 1:
                            code = code_parts[-1].strip()
                        else:
                            code = generated.strip()
                    else:
                        code = generated.strip()

                    # Remove special tokens safely (the separator plus GPT-2's
                    # end-of-text marker)
                    special_tokens = [CODE_SEP, '<|endoftext|>']
                    for token in special_tokens:
                        code = code.replace(token, '')

                    # Clean up extra whitespace but preserve some structure
                    code = code.replace('\n\n\n', '\n\n')  # reduce excessive newlines

                    # For debugging: include the raw generated code
                    raw_code = generated.strip()
                    formatted_code = format_python_code(code)

                    # Show both raw and formatted output for transparency
                    if not formatted_code.startswith('#'):
                        code = (f"# Model Generated (Raw):\n# {raw_code[:100]}...\n\n"
                                f"# Formatted Python Code:\n{formatted_code}")
                    else:
                        code = formatted_code

                    # Ensure we have some content
                    if not code or code.isspace():
                        code = f"# Generated sequence {i+1} was empty after cleaning"

                except Exception as extract_error:
                    code = f"# Error extracting code from sequence {i+1}: {str(extract_error)}"

                # Final validation: ensure the code is meaningful
                try:
                    # Check that the code contains alphanumeric characters or code keywords
                    has_alnum = any(c.isalnum() for c in code)
                    has_code_indicators = any(keyword in code.lower() for keyword in
                                              ['def ', 'class ', 'import ', 'if ', 'for ', 'while ',
                                               'return ', 'print(', 'bool', 'int', 'str', 'list'])

                    if not has_alnum and not has_code_indicators:
                        code = f"# Generated sequence {i+1} contains no readable content"
                    elif len(code) < 5:  # too short to be meaningful
                        code = f"# Generated sequence {i+1} too short: {code}"
                    elif code.count('#') > len(code) * 0.8:  # mostly error messages
                        code = f"# Generated sequence {i+1} mostly errors: {code[:50]}..."
                    else:
                        # Looks good; keep as-is
                        pass
                except Exception as validation_error:
                    code = f"# Validation error for sequence {i+1}: {str(validation_error)}"

                generated_codes.append(code)

            except Exception as decode_error:
                # Handle any other decoding errors
                error_msg = f"# Error decoding sequence {i+1}: {str(decode_error)}"
                generated_codes.append(error_msg)

        # Ensure we have at least one result
        if not generated_codes:
            generated_codes = ["# No valid generations produced - check model and tokenizer compatibility"]

        # Log a generation summary for debugging
        valid_generations = [code for code in generated_codes if not code.startswith('#')]
        error_generations = [code for code in generated_codes if code.startswith('#')]

        if error_generations:
            print(f"Generation completed: {len(valid_generations)} valid, {len(error_generations)} errors")
            for error in error_generations[:3]:  # log the first 3 errors
                print(f"  Error: {error[:100]}...")

        # Use the first generated code as the primary output
        primary_code = generated_codes[0] if generated_codes else "# No code generated"

        # Calculate metrics if reference code is provided
        metrics_output = ""
        bleu_output = ""
        bleu_4 = None  # defined up front so the history entry below is always safe

        if reference_code and reference_code.strip() and not primary_code.startswith('#'):
            # Only calculate metrics when the generated code is valid (not an error message)
            try:
                # Calculate BLEU scores
                bleu_1, bleu_2, bleu_3, bleu_4 = calculate_bleu_score(reference_code, primary_code)

                bleu_output = f"""📊 BLEU Scores:
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
• BLEU-1 (Unigram): {bleu_1:.4f} ({bleu_1*100:.2f}%)
• BLEU-2 (Bigram): {bleu_2:.4f} ({bleu_2*100:.2f}%)
• BLEU-3 (Trigram): {bleu_3:.4f} ({bleu_3*100:.2f}%)
• BLEU-4 (4-gram): {bleu_4:.4f} ({bleu_4*100:.2f}%)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

💡 Interpretation:
• BLEU > 0.4: Excellent match
• BLEU 0.3-0.4: Good match
• BLEU 0.2-0.3: Fair match
• BLEU < 0.2: Poor match
"""

                # Calculate additional metrics
                code_metrics = calculate_code_metrics(reference_code, primary_code)

                metrics_output = f"""📈 Additional Metrics:
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
• Length Ratio: {code_metrics['length_ratio']:.3f}
• Precision: {code_metrics['precision']:.4f} ({code_metrics['precision']*100:.2f}%)
• Recall: {code_metrics['recall']:.4f} ({code_metrics['recall']*100:.2f}%)
• F1-Score: {code_metrics['f1_score']:.4f} ({code_metrics['f1_score']*100:.2f}%)
• Character Overlap: {code_metrics['char_overlap']:.4f} ({code_metrics['char_overlap']*100:.2f}%)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

⏱️ Generation Time: {generation_time:.2f}s
📝 Sequences Generated: {num_sequences}
🔢 Output Length: {len(primary_code)} characters
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
"""
            except Exception as metrics_error:
                metrics_output = f"""⚠️ Metrics calculation failed: {str(metrics_error)}

⏱️ Generation Time: {generation_time:.2f}s
📝 Sequences Generated: {num_sequences}
🔢 Output Length: {len(primary_code)} characters
"""
        else:
            metrics_output = f"""⏱️ Generation Time: {generation_time:.2f}s
📝 Sequences Generated: {num_sequences}
🔢 Output Length: {len(primary_code)} characters

💡 Tip: Provide reference code to see BLEU scores and similarity metrics!
"""
""" # Format alternative sequences alternatives = "" if num_sequences > 1 and len(generated_codes) > 1: alternatives = "🔄 Alternative Generations:\n" + "━"*50 + "\n\n" for i, code in enumerate(generated_codes[1:], 2): # Skip error messages in alternatives if not code.startswith('#'): alternatives += f"Variation {i}:\n```python\n{code}\n```\n\n" else: alternatives += f"Variation {i}: {code}\n\n" # Add to history (only if primary code is not an error message) if not primary_code.startswith('#'): generation_history.append({ 'pseudo': pseudo_code, 'generated': primary_code, 'bleu_4': bleu_4 if reference_code and not primary_code.startswith('#') else None, 'time': generation_time }) return primary_code, metrics_output, bleu_output, alternatives except Exception as e: return f"❌ Error generating code: {str(e)}", "", "", "" def show_examples(example_name): """Load example pseudo-code""" examples = { "Basic Loop": "create a list of numbers from 1 to 10", "Function Definition": "define a function to calculate the sum of two numbers", "List Iteration": "iterate through a list and print each element", "Conditional Check": "check if a number is even or odd", "Sorting": "sort a list in descending order", "Maximum Element": "create a function to find maximum element in array", "Binary Search": "implement binary search algorithm", "Factorial": "create a recursive function to calculate factorial", "Palindrome": "check if a string is palindrome", "Fibonacci": "generate fibonacci sequence up to n terms" } return examples.get(example_name, "") def clear_all(): """Clear all inputs and outputs""" return "", "", "", "", "", 150, 0.7, 50, 0.95, 1 def show_history(): """Display generation history""" if not generation_history: return "No generation history yet. Start generating code!" history_text = "📜 Generation History:\n" + "="*60 + "\n\n" for i, entry in enumerate(reversed(generation_history[-10:]), 1): # Show last 10 history_text += f"{i}. Pseudo: {entry['pseudo'][:60]}...\n" history_text += f" Time: {entry['time']:.2f}s" if entry['bleu_4'] is not None: history_text += f" | BLEU-4: {entry['bleu_4']:.4f}" history_text += f"\n Code: {entry['generated'][:80]}...\n\n" return history_text # Create Gradio interface with custom CSS custom_css = """ .gradio-container { font-family: 'Arial', sans-serif; } .output-code { font-family: 'Courier New', monospace; font-size: 14px; } .metrics-box { background-color: #f0f8ff; border-radius: 8px; padding: 10px; } """ with gr.Blocks(title="🚀 GPT-2 Pseudo-Code to Code Generator", theme=gr.themes.Soft(), css=custom_css) as demo: gr.Markdown(""" # 🚀 GPT-2 Pseudo-Code to Python Code Generator **Transform natural language descriptions into executable Python code using fine-tuned GPT-2!** This model is trained on the SPOC (Search-based Pseudo-code to Code) dataset and can generate Python code from pseudo-code descriptions. 
""") with gr.Tabs(): # Tab 1: Code Generation with gr.Tab("💻 Code Generation"): with gr.Row(): with gr.Column(scale=1): gr.Markdown("### � Model Status") model_status = gr.Textbox( label="Model Information", lines=15, interactive=False, value=initialize_model() # Auto-load on startup ) gr.Markdown("---") with gr.Row(): with gr.Column(scale=1): gr.Markdown("### ✍️ Enter Pseudo-Code") # Example selector with gr.Row(): example_dropdown = gr.Dropdown( choices=["Basic Loop", "Function Definition", "List Iteration", "Conditional Check", "Sorting", "Maximum Element", "Binary Search", "Factorial", "Palindrome", "Fibonacci"], label="📚 Load Example", value=None ) pseudo_input = gr.Textbox( label="Pseudo-Code Description", placeholder="Example: create a function to calculate factorial of a number", lines=4 ) reference_code = gr.Textbox( label="Reference Code (Optional - for BLEU score calculation)", placeholder="Paste reference code here to calculate BLEU scores...", lines=4 ) gr.Markdown("### ⚙️ Generation Parameters") with gr.Row(): max_length = gr.Slider( minimum=50, maximum=500, value=150, step=10, label="Max Length", info="Maximum tokens to generate" ) temperature = gr.Slider( minimum=0.1, maximum=1.5, value=0.7, step=0.1, label="Temperature", info="Higher = more creative" ) with gr.Row(): top_k = gr.Slider( minimum=10, maximum=100, value=50, step=5, label="Top-K", info="Vocabulary filtering" ) top_p = gr.Slider( minimum=0.5, maximum=1.0, value=0.95, step=0.05, label="Top-P", info="Nucleus sampling" ) num_sequences = gr.Slider( minimum=1, maximum=5, value=1, step=1, label="Number of Variations", info="Generate multiple versions" ) with gr.Row(): generate_btn = gr.Button("✨ Generate Code", variant="primary", size="lg") clear_btn = gr.Button("🗑️ Clear All", variant="secondary") with gr.Column(scale=1): gr.Markdown("### 💻 Generated Python Code") code_output = gr.Code( label="Generated Code", language="python", lines=12, elem_classes="output-code" ) with gr.Row(): with gr.Column(): metrics_output = gr.Textbox( label="📊 Performance Metrics", lines=8, interactive=False, elem_classes="metrics-box" ) with gr.Column(): bleu_output = gr.Textbox( label="🎯 BLEU Scores", lines=8, interactive=False, elem_classes="metrics-box" ) alternatives_output = gr.Markdown( label="🔄 Alternative Generations" ) # Tab 2: Information & Guide with gr.Tab("📖 Guide & Examples"): gr.Markdown(""" ## 📚 How to Use ### 1️⃣ Load Your Model - Upload the `best_model.pkl` file (trained GPT-2 model) - Click "Load Model" and wait for confirmation - You'll see model configuration and training metrics ### 2️⃣ Generate Code - **Quick Start**: Select an example from the dropdown - **Custom Input**: Type your own pseudo-code description - **Optional**: Add reference code to calculate BLEU scores - Adjust generation parameters for different outputs - Click "Generate Code" ### 3️⃣ Understand the Metrics #### 🎯 BLEU Score (Bilingual Evaluation Understudy) - Measures similarity between generated and reference code - **BLEU-1**: Word-level similarity (unigrams) - **BLEU-2**: 2-word phrase similarity (bigrams) - **BLEU-3**: 3-word phrase similarity (trigrams) - **BLEU-4**: 4-word phrase similarity (most comprehensive) **Score Interpretation:** - 🟢 **> 0.4**: Excellent match - Generated code is very similar to reference - 🟡 **0.3-0.4**: Good match - Code captures most key elements - 🟠 **0.2-0.3**: Fair match - Some similarity exists - 🔴 **< 0.2**: Poor match - Significant differences #### 📈 Additional Metrics - **Precision**: How many generated words 
#### 📈 Additional Metrics
- **Precision**: How many generated words appear in the reference
- **Recall**: How many reference words appear in the generated code
- **F1-Score**: Harmonic mean of precision and recall
- **Length Ratio**: Generated vs. reference code length
- **Character Overlap**: Character-level similarity
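
As a worked example (illustrative numbers): if the generated code uses 8 distinct words and 6 of them appear among the reference's 10 distinct words, then precision = 6/8 = 0.75, recall = 6/10 = 0.60, and F1 = 2 × (0.75 × 0.60) / (0.75 + 0.60) ≈ 0.67.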
### 🎛️ Generation Parameters

| Parameter | Low Value | High Value | Use Case |
|-----------|-----------|------------|----------|
| **Temperature** | 0.1-0.3 | 0.8-1.2 | Low: Deterministic, focused<br>High: Creative, diverse |
| **Top-K** | 10-30 | 60-100 | Low: Conservative choices<br>High: More variety |
| **Top-P** | 0.5-0.8 | 0.9-1.0 | Low: Safe predictions<br>High: Exploratory |
| **Max Length** | 50-100 | 200-500 | Short: Simple code<br>Long: Complex implementations |
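
Under the hood, these sliders map onto `transformers` sampling arguments; a minimal sketch (placeholder `model`/`tokenizer` objects, defaults shown are the app's):

```python
outputs = model.generate(
    **tokenizer("check if a number is even or odd", return_tensors="pt"),
    max_length=150,          # Max Length slider
    temperature=0.7,         # Temperature slider
    top_k=50,                # Top-K slider
    top_p=0.95,              # Top-P slider
    do_sample=True,
    repetition_penalty=1.2,  # fixed by the app
    no_repeat_ngram_size=3,  # fixed by the app
)
```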
---

## 💡 Example Pseudo-Code Prompts

### Basic Operations
```
create a list of numbers from 1 to 10
define a function to calculate the sum of two numbers
iterate through a list and print each element
```

### Conditionals & Logic
```
check if a number is even or odd
find the maximum of three numbers
validate if a string is empty
```

### Data Structures
```
sort a list in descending order
remove duplicates from a list
merge two dictionaries
```

### Algorithms
```
implement binary search algorithm
create a recursive function to calculate factorial
generate fibonacci sequence up to n terms
check if a string is palindrome
```

### Advanced
```
create a class to represent a student with name and grades
implement a function to read CSV file and return dataframe
create a decorator to measure function execution time
```

---

## 🎓 About the Model

This model is fine-tuned on the **SPoC (Search-based Pseudocode to Code)** dataset:
- 📄 Paper: [SPoC: Search-based Pseudocode to Code](https://arxiv.org/pdf/1906.04908)
- 🏛️ Source: Stanford University
- 🤖 Base Model: GPT-2 (Decoder-Only Transformer)
- 📊 Training: 10,000+ pseudo-code to code pairs
- 🎯 Task: Causal Language Modeling

---

## ⚠️ Limitations

- The model may not handle very complex algorithms perfectly
- Generated code should be tested before production use
- Best results come from clear, specific pseudo-code descriptions
- The model was trained on C++ code and adapted for Python generation

---

## 🤝 Tips for Best Results

1. ✅ **Be Specific**: "create a function to sort list in ascending order" vs. "sort list"
2. ✅ **Use Action Words**: "create", "define", "implement", "calculate"
3. ✅ **Mention Data Types**: "list", "string", "dictionary", "integer"
4. ✅ **Include Details**: "recursive function" vs. just "function"
5. ✅ **Try Variations**: Generate multiple times with different temperatures
""")

        # Tab 3: History
        with gr.Tab("📜 History"):
            gr.Markdown("## 📊 Generation History")

            history_display = gr.Textbox(
                label="Recent Generations",
                lines=20,
                interactive=False
            )

            refresh_history_btn = gr.Button("🔄 Refresh History", variant="secondary")

            gr.Markdown("""
---

### 🌟 Features
- ✅ Upload and use custom trained models
- ✅ BLEU score calculation for quality assessment
- ✅ Multiple evaluation metrics (Precision, Recall, F1)
- ✅ Generate multiple code variations
- ✅ Real-time performance tracking
- ✅ Example prompts library
- ✅ Generation history

### 📝 Citation
If you use this model, please cite:
```
@article{kulal2019spoc,
  title={SPoC: Search-based Pseudocode to Code},
  author={Kulal, Sumith and Pasupat, Panupong and Chandra, Kartik and Lee, Mina and Padon, Oded and Aiken, Alex and Liang, Percy},
  journal={arXiv preprint arXiv:1906.04908},
  year={2019}
}
```

**Built with ❤️ using HuggingFace Transformers & Gradio**
""")

    # Event handlers
    example_dropdown.change(
        fn=show_examples,
        inputs=[example_dropdown],
        outputs=[pseudo_input]
    )

    generate_btn.click(
        fn=generate_code_from_pseudo,
        inputs=[pseudo_input, max_length, temperature, top_k, top_p, num_sequences, reference_code],
        outputs=[code_output, metrics_output, bleu_output, alternatives_output]
    )

    clear_btn.click(
        fn=clear_all,
        inputs=[],
        outputs=[pseudo_input, reference_code, code_output, metrics_output, bleu_output,
                 max_length, temperature, top_k, top_p, num_sequences]
    )

    refresh_history_btn.click(
        fn=show_history,
        inputs=[],
        outputs=[history_display]
    )

# Launch the interface
if __name__ == "__main__":
    demo.launch(share=False)