Spaces:
Sleeping
Sleeping
| import streamlit as st | |
| import pandas as pd | |
| import plotly.express as px | |
| import plotly.graph_objects as go | |
| from plotly.subplots import make_subplots | |
| import os | |
| import time | |
| from PIL import Image | |
| # Only import APIs if available | |
| try: | |
| from google import genai | |
| GENAI_AVAILABLE = True | |
| except ImportError: | |
| GENAI_AVAILABLE = False | |
| try: | |
| from openai import OpenAI | |
| OPENAI_AVAILABLE = True | |
| except ImportError: | |
| OPENAI_AVAILABLE = False | |
# Absolute directory containing this script. It is made absolute because
# os.path.dirname(__file__) is the empty string when the script is started via
# a bare relative name, which would turn f"{BASE_DIR}/logo.png" into
# "/logo.png" (filesystem root).
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
# Directory holding the translation and metric-score data files.
DATA_DIR = os.path.join(BASE_DIR, "data")
# Page configuration: browser-tab title and icon, wide layout, and a sidebar
# that starts collapsed.
st.set_page_config(
    page_title="Translation Comparison Tool",
    page_icon="🌐",
    layout="wide",
    initial_sidebar_state="collapsed"
)
# Custom CSS for Material Design with Tailwind-inspired styling.
# The stylesheet is injected as a raw <style> element through st.markdown
# (unsafe_allow_html=True). It covers, in order: page headers, the full-width
# top-level tabs, metric cards, the translate panels, language tabs, the
# fixed-height text areas, buttons, score cards, word-diff chips, and rules
# that hide Streamlit's own header, deploy button, footer and main menu.
st.markdown("""
<style>
@import url('https://fonts.googleapis.com/css2?family=Inter:wght@300;400;500;600;700&display=swap');
.main-header {
font-family: 'Inter', sans-serif;
font-size: 1.8rem;
font-weight: 600;
color: #1f2937;
margin-bottom: 0.5rem;
letter-spacing: -0.025em;
text-align: center;
}
.sub-header {
font-family: 'Inter', sans-serif;
font-size: 1.1rem;
font-weight: 400;
color: #6b7280;
margin-bottom: 2rem;
line-height: 1.6;
text-align: center;
}
.logo-container {
display: flex;
justify-content: center;
margin-bottom: 2rem;
}
/* Bold and full-width tabs */
.stTabs [data-baseweb="tab-list"] {
gap: 0px;
width: 100%;
}
.stTabs [data-baseweb="tab"] {
font-family: 'Inter', sans-serif !important;
font-size: 1.1rem !important;
font-weight: 600 !important;
padding: 12px 24px !important;
width: 50% !important;
justify-content: center !important;
border-radius: 0 !important;
background-color: #f8f9fa !important;
color: #374151 !important;
border: 1px solid #e5e7eb !important;
margin: 0 !important;
}
.stTabs [data-baseweb="tab"]:hover {
background-color: #f1f3f4 !important;
color: #1f2937 !important;
}
.stTabs [aria-selected="true"] {
background-color: #3b82f6 !important;
color: white !important;
font-weight: 700 !important;
border-color: #3b82f6 !important;
}
.stTabs [data-baseweb="tab-highlight"] {
display: none !important;
}
.stTabs [data-baseweb="tab-border"] {
display: none !important;
}
.tab-header {
font-family: 'Inter', sans-serif;
font-size: 1.5rem;
font-weight: 600;
color: #374151;
margin-bottom: 1rem;
}
.metric-card {
background: #f9fafb;
border: 1px solid #e5e7eb;
border-radius: 0.75rem;
padding: 1.5rem;
margin: 0.5rem 0;
box-shadow: 0 1px 3px 0 rgba(0, 0, 0, 0.1);
}
.metric-title {
font-family: 'Inter', sans-serif;
font-size: 0.875rem;
font-weight: 500;
color: #6b7280;
text-transform: uppercase;
letter-spacing: 0.05em;
margin-bottom: 0.25rem;
}
.metric-value {
font-family: 'Inter', sans-serif;
font-size: 2rem;
font-weight: 700;
color: #1f2937;
line-height: 1;
}
.support-info {
color: #5f6368;
font-size: 12px;
margin-top: 20px;
text-align: center;
font-family: 'Inter', sans-serif;
}
.translate-container {
border: 1px solid #e0e0e0;
border-radius: 8px;
margin: 20px 0;
overflow: hidden;
box-shadow: 0 2px 5px rgba(0,0,0,0.1);
}
.translate-header {
background: #f8f9fa;
border-bottom: 1px solid #e0e0e0;
padding: 12px 16px;
font-family: 'Inter', sans-serif;
font-weight: 500;
font-size: 14px;
color: #5f6368;
display: flex;
align-items: center;
box-sizing: border-box;
}
.language-tabs-container {
border: 1px solid #e0e0e0;
border-radius: 8px;
margin: 20px 0;
overflow: hidden;
box-shadow: 0 2px 5px rgba(0,0,0,0.1);
}
.language-tabs-header {
background: #f8f9fa;
border-bottom: 1px solid #e0e0e0;
height: 45px;
display: flex;
align-items: stretch;
box-sizing: border-box;
padding: 0;
}
.language-tab {
flex: 1;
background: #f8f9fa;
border: none;
border-right: 1px solid #e0e0e0;
padding: 0;
font-family: 'Inter', sans-serif;
font-size: 14px;
font-weight: 500;
cursor: pointer;
transition: all 0.2s ease;
color: #6b7280;
text-align: center;
height: 45px;
display: flex;
align-items: center;
justify-content: center;
box-sizing: border-box;
text-decoration: none;
outline: none;
}
.language-tab:last-child {
border-right: none;
}
.language-tab.active {
background: white;
color: #3b82f6;
border-bottom: 2px solid #3b82f6;
font-weight: 600;
}
.language-tab:hover:not(.active) {
background: #f1f3f4;
color: #374151;
}
.stTextArea textarea {
resize: none !important;
min-height: 350px !important;
max-height: 350px !important;
height: 350px !important;
}
.stTextArea textarea[disabled] {
color: #000000 !important;
opacity: 1 !important;
-webkit-text-fill-color: #000000 !important;
}
/* Make buttons rounded and complete */
.stButton > button {
font-family: 'Inter', sans-serif !important;
font-size: 0.75rem !important;
font-weight: 500 !important;
border-radius: 6px !important; /* Changed from 0 to 6px for rounded corners */
height: 35px !important;
border: 1px solid #d1d5db !important;
margin: 0 2px !important; /* Added small margin between buttons */
padding: 0 12px !important; /* Increased padding for better look */
cursor: pointer !important;
transition: all 0.2s ease !important;
}
.stButton > button[data-testid="baseButton-secondary"] {
background-color: #f3f4f6 !important;
color: #374151 !important;
border-color: #d1d5db !important;
box-shadow: 0 1px 2px 0 rgba(0, 0, 0, 0.05) !important;
}
.stButton > button[data-testid="baseButton-secondary"]:hover {
background-color: #e5e7eb !important;
color: #1f2937 !important;
border-color: #9ca3af !important;
box-shadow: 0 2px 4px 0 rgba(0, 0, 0, 0.1) !important;
transform: translateY(-1px) !important;
}
.stButton > button[data-testid="baseButton-primary"] {
background-color: #3b82f6 !important;
color: #ffffff !important;
font-weight: 600 !important;
border-color: #3b82f6 !important;
box-shadow: 0 2px 4px 0 rgba(59, 130, 246, 0.3) !important;
}
.stButton > button[data-testid="baseButton-primary"]:hover {
background-color: #2563eb !important;
color: #ffffff !important;
border-color: #2563eb !important;
transform: translateY(-1px) !important;
}
/* Remove the border-right rule since we're using margins now */
/* Hide the default Streamlit button styling for tab buttons */
.language-tab-button {
background: none !important;
border: none !important;
padding: 0 !important;
margin: 0 !important;
height: 100% !important;
width: 100% !important;
color: inherit !important;
font-weight: inherit !important;
}
.language-tab-button:hover {
background: none !important;
border: none !important;
}
.language-tab-button:focus {
background: none !important;
border: none !important;
box-shadow: none !important;
}
.score-card {
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
border-radius: 12px;
padding: 20px;
text-align: center;
color: white;
margin: 10px 0;
}
.score-value {
font-size: 2.5rem;
font-weight: 700;
margin: 10px 0;
}
.score-label {
font-size: 0.9rem;
opacity: 0.9;
text-transform: uppercase;
letter-spacing: 1px;
}
.comparison-container {
background: #f8fafc;
border: 1px solid #e2e8f0;
border-radius: 12px;
padding: 24px;
margin: 20px 0;
}
.word-diff {
display: inline-block;
padding: 4px 8px;
margin: 2px;
border-radius: 6px;
font-weight: 500;
}
.word-added {
background: #dcfce7;
color: #166534;
border: 1px solid #bbf7d0;
}
.word-removed {
background: #fef2f2;
color: #dc2626;
border: 1px solid #fecaca;
}
.word-common {
background: #f1f5f9;
color: #475569;
border: 1px solid #e2e8f0;
}
.block-container {
padding-top: 1rem;
padding-bottom: 0rem;
}
.main > div {
padding-top: 1rem;
}
/* Hide Streamlit header and footer */
header[data-testid="stHeader"] {
height: 0px;
display: none;
}
.stDeployButton {
display: none;
}
footer {
display: none;
}
#MainMenu {
display: none;
}
</style>
""", unsafe_allow_html=True)
# Target languages offered for the general-purpose LLM providers.
_LLM_LANGUAGES = ('Afrikaans', 'Northern Sotho', 'isiZulu')

# Per-language NLLB model identifiers; there is no Afrikaans model available.
_NLLB_MODELS = {
    'Northern Sotho': 'dsfsi/dcs-eng-nso-nllb-1.3B',
    'isiZulu': 'dsfsi/dcs-eng-zul-nllb-1.3B',
}

# Model configurations: for each provider, the languages it is offered for and
# the models that can be selected. Gemini and GPT list model names plus a
# default; NLLB maps each supported language to its dedicated model.
MODEL_CONFIG = {
    'Gemini': dict(
        languages=list(_LLM_LANGUAGES),
        models=['gemini-2.0-flash-exp', 'gemini-1.5-flash', 'gemini-1.5-pro'],
        default_model='gemini-2.0-flash-exp',
    ),
    'GPT': dict(
        languages=list(_LLM_LANGUAGES),
        models=['gpt-4', 'gpt-4-turbo', 'gpt-3.5-turbo'],
        default_model='gpt-4',
    ),
    'NLLB': dict(
        languages=list(_NLLB_MODELS),
        models=_NLLB_MODELS,
    ),
}

# Language code mappings: display name -> short code.
LANGUAGE_CODES = dict(zip(_LLM_LANGUAGES, ('afr', 'nso', 'isizulu')))
# Load logo
def load_logo():
    """Return the app logo as a PIL image, or ``None`` when it is unavailable.

    Looks for ``logo.png`` inside ``BASE_DIR``. A missing file is not treated
    as an error and silently yields ``None``. Any failure while opening an
    existing file is reported with ``st.warning`` and also yields ``None``.
    """
    # os.path.join instead of f"{BASE_DIR}/logo.png": with an empty BASE_DIR
    # the interpolated form would resolve to "/logo.png" in the filesystem root.
    logo_path = os.path.join(BASE_DIR, "logo.png")
    if not os.path.exists(logo_path):
        return None
    try:
        return Image.open(logo_path)
    except Exception as e:
        # Best effort: the page still renders without a logo.
        st.warning(f"Could not load logo: {str(e)}")
        return None
# Built-in sample data (constructed on every call; nothing is cached here)
def load_translation_data():
    """Build the built-in sample translation table.

    Returns:
        A six-row DataFrame with an ``english`` column followed by, for the
        codes ``afr``, ``nso`` and ``isizulu``, the base and ``_rev`` columns
        and the ``_mt_nllb`` / ``_mt_gpt`` / ``_mt_gemini`` columns. If
        construction fails, the error is shown with ``st.error`` and a one-row
        placeholder frame with ``english`` and ``error`` columns is returned.
    """
    try:
        # Base translations; several machine-translation columns repeat them.
        afr = ['Hallo wêreld', 'Hoe gaan dit?', 'Goeie môre', 'Dankie', 'Welkom', 'Totsiens']
        nso = ['Dumela lefase', 'O kae?', 'Thobela', 'Ke a leboga', 'O amogetšwe', 'Šala gabotse']
        zul = ['Sawubona mhlaba', 'Unjani?', 'Sawubona', 'Ngiyabonga', 'Wamukelekile', 'Sala kahle']
        # Revised variants; the nso/isizulu Gemini columns repeat these.
        nso_rev = ['Dumela lefase', 'O phela bjang?', 'Thobela', 'Ke a leboga kudu', 'O amogetšwe', 'Šala gabotse']
        zul_rev = ['Sawubona mhlaba', 'Unjani wena?', 'Sawubona', 'Ngiyabonga kakhulu', 'Wamukelekile', 'Sala kahle']

        # Insertion order of this dict fixes the column order of the frame.
        columns = {
            'english': ['Hello world', 'How are you?', 'Good morning', 'Thank you', 'Welcome', 'Goodbye'],
            'afr': afr,
            'afr_rev': ['Hallo wêreld', 'Hoe gaan dit met jou?', 'Goeie môre', 'Baie dankie', 'Welkom', 'Totsiens'],
            'nso': nso,
            'nso_rev': nso_rev,
            'isizulu': zul,
            'isizulu_rev': zul_rev,
            'nso_mt_nllb': nso,
            'isizulu_mt_nllb': zul,
            'afr_mt_gpt': afr,
            'nso_mt_gpt': nso,
            'isizulu_mt_gpt': zul,
            'afr_mt_gemini': ['Hallo wêreld', 'Hoe is dit?', 'Goeie môre', 'Baie dankie', 'Welkom', 'Totsiens'],
            'nso_mt_gemini': nso_rev,
            'isizulu_mt_gemini': zul_rev,
        }
        return pd.DataFrame(columns)
    except Exception as e:
        st.error(f"Error loading data: {str(e)}")
        return pd.DataFrame({'english': ['Sample text'], 'error': ['Data loading failed']})
def translate_with_gemini(text, target_language, model_name="gemini-2.0-flash-exp", client=None):
    """Translate English text with a Gemini model.

    Args:
        text: English source text.
        target_language: Display name of the target language, e.g.
            ``'Afrikaans'``. Names missing from the internal map are placed in
            the prompt unchanged.
        model_name: Model identifier passed to ``generate_content``.
        client: Object exposing ``models.generate_content``. When falsy, a
            configuration error string is returned.

    Returns:
        The stripped translation, an empty string for blank input, or a
        message starting with "❌" explaining why no translation was produced.
        API failures are returned as such a message rather than raised.
    """
    if not GENAI_AVAILABLE:
        return "❌ Gemini library not installed"
    if not client:
        return "❌ Gemini API not configured. Please check your GEMINI_API_KEY."
    # Nothing to translate: avoid sending an empty prompt to the API.
    if not text or not text.strip():
        return ""

    lang_map = {
        'Afrikaans': 'Afrikaans',
        'Northern Sotho': 'Northern Sotho (Sepedi)',
        'isiZulu': 'isiZulu'
    }
    prompt = f"Translate the following English text to {lang_map.get(target_language, target_language)}: '{text}'. Provide only the translation without any explanations."

    try:
        response = client.models.generate_content(
            model=model_name, contents=prompt
        )
        translated = response.text
    except Exception as e:
        return f"❌ Error: {str(e)}"

    # response.text can be None/empty when the model returns no text parts;
    # report that explicitly instead of failing on None.strip().
    if not translated:
        return "❌ Error: Gemini returned no text"
    return translated.strip()
def translate_with_openai(text, target_language, model_name="gpt-4o", client=None):
    """Translate text with an OpenAI chat model via the Chat Completions API.

    Args:
        text: Source text to translate.
        target_language: Display name of the target language, e.g.
            ``'isiZulu'``. Names missing from the internal map are placed in
            the prompt unchanged.
        model_name: Model identifier passed to ``chat.completions.create``.
        client: Object exposing ``chat.completions.create``. When falsy, a
            configuration error string is returned.

    Returns:
        The stripped translation, an empty string for blank input, or a
        message starting with "❌" explaining why no translation was produced.
        API failures are returned as such a message rather than raised.
    """
    if not OPENAI_AVAILABLE:
        return "❌ OpenAI library not installed"
    if not client:
        return "❌ OpenAI API not configured. Please check your OPENAI_API_KEY."
    # Nothing to translate: avoid a pointless API call.
    if not text or not text.strip():
        return ""

    lang_map = {
        'Afrikaans': 'Afrikaans',
        'Northern Sotho': 'Northern Sotho (Sepedi)',
        'isiZulu': 'isiZulu'
    }

    try:
        response = client.chat.completions.create(
            model=model_name,
            messages=[
                {"role": "system", "content": "You are a professional translator. Provide only the translation without any explanations."},
                {"role": "user", "content": f"Translate the following text to {lang_map.get(target_language, target_language)}: {text}"}
            ],
            max_tokens=1000,
            temperature=0.3  # Lower temperature for more consistent translations
        )
        choices = response.choices
        translated = choices[0].message.content if choices else None
    except Exception as e:
        return f"❌ Error: {str(e)}"

    # The choice list may be empty and message.content may be None; report
    # that explicitly instead of failing on indexing or None.strip().
    if not translated:
        return "❌ Error: OpenAI returned no text"
    return translated.strip()
def initialize_apis():
    """Create the Gemini and OpenAI clients, for both local runs and HF Spaces.

    API keys are read from the environment first and from Streamlit secrets
    second. Problems are shown in the UI (``st.warning`` for a missing key,
    ``st.error`` for a failure while creating a client) instead of being
    raised.

    Returns:
        A ``(genai_client, openai_client)`` tuple. Each element is ``None``
        when the corresponding library is not installed, no key was found, or
        client creation failed.
    """
    def get_secret(name):
        """Fetch secret from env first (Docker Spaces), then Streamlit secrets."""
        value = os.environ.get(name)
        if value:
            return value
        try:
            # Touching st.secrets raises when no secrets file exists; that
            # only means "no secret configured", so map it to None and let the
            # caller show the "key not found" warning instead of an API error.
            return st.secrets.get(name)
        except Exception:
            return None

    genai_client = None
    openai_client = None

    # Gemini API
    if GENAI_AVAILABLE:
        try:
            api_key = get_secret("GEMINI_API_KEY")
            if api_key:
                genai_client = genai.Client(api_key=api_key)
            else:
                st.warning("⚠️ Gemini API key not found")
        except Exception as e:
            st.error(f"❌ Gemini API error: {str(e)}")

    # OpenAI API
    if OPENAI_AVAILABLE:
        try:
            api_key = get_secret("OPENAI_API_KEY")
            if api_key:
                try:
                    # Try new OpenAI API client
                    openai_client = OpenAI(api_key=api_key)
                except TypeError:
                    # Fall back to the module-level interface when the client
                    # constructor rejects its arguments.
                    import openai
                    openai.api_key = api_key
                    openai_client = openai
            else:
                st.warning("⚠️ OpenAI API key not found")
        except Exception as e:
            st.error(f"❌ OpenAI API error: {str(e)}")

    return genai_client, openai_client
def translate_with_nllb(text, target_language, api_url=None, timeout=30):
    """Translate text using the unified NLLB HTTP API.

    Args:
        text: Source text to translate.
        target_language: Display name of the target language. 'Northern Sotho'
            and 'isiZulu' are mapped to the API codes ``nso`` and ``zul``; any
            other name is sent lower-cased.
        api_url: Base URL of the API. When omitted, the ``NLLB_API_URL``
            environment variable is used, then the built-in default tunnel URL.
        timeout: Request timeout in seconds.

    Returns:
        The translation stored in the JSON response under the API language
        code, or a message starting with "❌" when the key is missing, the
        server answers with a non-200 status, or the request fails.
    """
    try:
        import requests
        # The default is an ngrok tunnel address, hence the overrides above so
        # a new address does not require a code change.
        base_url = (
            api_url
            or os.environ.get("NLLB_API_URL")
            or "https://4c2faecc052a.ngrok-free.app"
        ).rstrip("/")
        # Map Streamlit language names to API format
        lang_mapping = {
            'Northern Sotho': 'nso',
            'isiZulu': 'zul'
        }
        api_lang = lang_mapping.get(target_language, target_language.lower())
        response = requests.post(
            f"{base_url}/translate_simple",
            params={
                "text": text,
                "target_language": api_lang
            },
            timeout=timeout
        )
        if response.status_code != 200:
            return f"❌ API Error: {response.status_code}"
        result = response.json()
        return result.get(api_lang, '❌ Translation not found')
    except Exception as e:
        return f"❌ Error: {str(e)}"
def create_language_tabs(available_languages, current_language, key_suffix=""):
    """Build the HTML markup for a row of language tabs.

    Args:
        available_languages: Iterable of language names, one tab per entry.
        current_language: Name of the tab that receives the ``active`` class.
        key_suffix: Second argument written into each tab's ``onclick`` call.

    Returns:
        A string with the tab container markup followed by a ``<script>``
        element defining ``selectLanguage``. That function has an empty body,
        so clicking a tab has no effect by itself.

    Note:
        ``lang`` and ``key_suffix`` are interpolated without escaping; pass
        trusted values only.
    """
    tabs = []
    for lang in available_languages:
        active_class = "active" if lang == current_language else ""
        tabs.append(f'''
<div class="language-tab {active_class}" onclick="selectLanguage('{lang}', '{key_suffix}')">
{lang}
</div>
''')
    tabs_html = (
        '<div class="language-tabs-container"><div class="language-tabs-header">'
        + "".join(tabs)
        + '</div></div>'
    )
    # Add JavaScript for tab functionality
    script = '''
<script>
function selectLanguage(lang, suffix) {
// This would normally update the session state, but since we can't do that from JavaScript,
// we'll use the button approach below instead
}
</script>
'''
    return tabs_html + script
| def main(): | |
| """Main application function""" | |
| # Load and display logo and title side by side | |
| logo = load_logo() | |
| # Initialize session state FIRST to avoid refreshes | |
| if 'target_language' not in st.session_state: | |
| st.session_state.target_language = 'Afrikaans' | |
| if 'translation_result' not in st.session_state: | |
| st.session_state.translation_result = "" | |
| if 'current_page' not in st.session_state: | |
| st.session_state.current_page = 1 | |
| if 'initialized' not in st.session_state: | |
| st.session_state.initialized = True | |
| col1, col2, col3 = st.columns([1, 2, 1]) | |
| with col2: | |
| if logo: | |
| # Convert logo to base64 for HTML embedding | |
| import base64 | |
| from io import BytesIO | |
| buffered = BytesIO() | |
| logo.save(buffered, format="PNG") | |
| img_str = base64.b64encode(buffered.getvalue()).decode() | |
| st.markdown(f''' | |
| <div style="display: flex; align-items: center; justify-content: center; gap: 0px; margin-bottom: 1rem;"> | |
| <img src="data:image/png;base64,{img_str}" width="180"> | |
| <h1 class="main-header" style="margin: 20px;">UP Translate</h1> | |
| </div> | |
| ''', unsafe_allow_html=True) | |
| else: | |
| st.markdown('<h1 class="main-header" style="margin-bottom: 1rem;">UP Translate</h1>', unsafe_allow_html=True) | |
| # Initialize APIs | |
| genai_client, openai_client = initialize_apis() | |
| # Initialize session state | |
| if 'target_language' not in st.session_state: | |
| st.session_state.target_language = 'Afrikaans' | |
| if 'translation_result' not in st.session_state: | |
| st.session_state.translation_result = "" | |
| # Create tabs | |
| tab1, tab2 = st.tabs(["🤖 Live Translations", "📊 Existing Translations"]) | |
| with tab1: | |
| # st.markdown('<h2 class="tab-header">Live Translation</h2>', unsafe_allow_html=True) | |
| # Create simplified model options | |
| model_options = [] | |
| model_mapping = {} | |
| # Add Gemini models | |
| for model in MODEL_CONFIG['Gemini']['models']: | |
| display_name = f"Gemini - {model}" | |
| model_options.append(display_name) | |
| model_mapping[display_name] = ('Gemini', None, model) | |
| # Add GPT models | |
| for model in MODEL_CONFIG['GPT']['models']: | |
| display_name = f"GPT - {model}" | |
| model_options.append(display_name) | |
| model_mapping[display_name] = ('GPT', None, model) | |
| # Add single NLLB option | |
| model_options.append("NLLB - Specialized Models") | |
| model_mapping["NLLB - Specialized Models"] = ('NLLB', None, None) | |
| # Model selection with inline label | |
| label_col, dropdown_col = st.columns([2, 10]) | |
| with label_col: | |
| st.markdown('<div style="margin-top: 8px; font-weight: 500;">Select Model:</div>', unsafe_allow_html=True) | |
| with dropdown_col: | |
| selected_model_option = st.selectbox( | |
| "Select Model:", | |
| model_options, | |
| index=0, | |
| key="model_selection_dropdown", | |
| label_visibility="collapsed" | |
| ) | |
| selected_provider, _, selected_model = model_mapping[selected_model_option] | |
| # Translation interface | |
| col_left, col_center, col_right = st.columns([5, 1, 5]) | |
| # Left side - English Input | |
| with col_left: | |
| st.markdown('<div class="translate-container">', unsafe_allow_html=True) | |
| st.markdown('<div class="translate-header">English</div>', unsafe_allow_html=True) | |
| st.markdown('</div>', unsafe_allow_html=True) | |
| input_text = st.text_area( | |
| "Input", | |
| placeholder="Input text here", | |
| height=350, | |
| key="input_text_live", | |
| label_visibility="collapsed" | |
| ) | |
| # Center - Translate Button | |
| with col_center: | |
| # Add spacing to align button with text areas | |
| st.markdown('<div style="height: 150px;"></div>', unsafe_allow_html=True) | |
| translate_clicked = st.button( | |
| "Translate", | |
| key="translate_btn_live", | |
| help="Translate text", | |
| type="primary", | |
| use_container_width=True | |
| ) | |
| # Right side - Translation Output | |
| with col_right: | |
| # Determine available languages based on selected provider | |
| if selected_provider == 'NLLB': | |
| available_languages = MODEL_CONFIG['NLLB']['languages'] | |
| else: | |
| available_languages = ['Afrikaans', 'Northern Sotho', 'isiZulu'] | |
| # Set default language to first available if current selection not available | |
| if st.session_state.target_language not in available_languages: | |
| st.session_state.target_language = available_languages[0] | |
| # Create container with custom styling | |
| st.markdown('<div class="translate-container">', unsafe_allow_html=True) | |
| # Language selection buttons | |
| lang_cols = st.columns(len(available_languages)) | |
| for i, lang in enumerate(available_languages): | |
| with lang_cols[i]: | |
| button_type = "primary" if lang == st.session_state.target_language else "secondary" | |
| if st.button( | |
| lang, | |
| key=f"lang_btn_{lang}_live", | |
| type=button_type, | |
| use_container_width=True | |
| ): | |
| if st.session_state.target_language != lang: # Only update if different | |
| st.session_state.target_language = lang | |
| st.session_state.translation_result = "" # Clear previous result | |
| st.rerun() | |
| # Translation logic | |
| if translate_clicked and input_text: | |
| with st.spinner("Translating..."): | |
| target_lang = st.session_state.target_language | |
| if selected_provider == 'Gemini': | |
| result = translate_with_gemini(input_text, target_lang, selected_model, genai_client) | |
| elif selected_provider == 'GPT': | |
| result = translate_with_openai(input_text, target_lang, selected_model, openai_client) | |
| elif selected_provider == 'NLLB': | |
| result = translate_with_nllb(input_text, target_lang) | |
| st.session_state.translation_result = result | |
| # Translation output area with proper labeling | |
| st.text_area( | |
| f"Translation ({st.session_state.target_language})", # Dynamic label | |
| value=st.session_state.translation_result, | |
| placeholder="Translation will appear here", | |
| height=350, | |
| key="translation_output_live_fixed", # Changed key to avoid conflicts | |
| disabled=True, | |
| label_visibility="collapsed" | |
| ) | |
| # Support information | |
| st.markdown(""" | |
| <div class="support-info"> | |
| <strong>Available Models:</strong><br> | |
| 🔮 <strong>Gemini:</strong> All languages (gemini-2.0-flash-exp, gemini-1.5-flash, gemini-1.5-pro)<br> | |
| 🧠 <strong>GPT:</strong> All languages (gpt-4, gpt-4-turbo, gpt-3.5-turbo)<br> | |
| 🤗 <strong>NLLB:</strong> Northern Sotho, isiZulu only (specialized models) | |
| </div> | |
| """, unsafe_allow_html=True) | |
| with tab2: | |
| # Load data from base directory automatically | |
| def load_analysis_data(): | |
| """Load all analysis data from base directory""" | |
| df_translations = None | |
| df_bleu = None | |
| df_chrf = None | |
| df_comet = None | |
| try: | |
| # Try to load translations data | |
| if os.path.exists(f"{DATA_DIR}/translations.tsv"): | |
| df_translations = pd.read_csv(f"{DATA_DIR}/translations.tsv", sep="\t") | |
| # Convert new CSV format to expected format for analysis | |
| # New format: id,english,afr_human,afr_revised,nso_human,nso_revised,zul_human,zul_revised,afr_gemini,afr_gpt,nso_gemini,nso_gpt,nso_nllb,zul_gemini,zul_gpt,zul_nllb | |
| # Expected format: english, afr_human, afr_revised, nso_human, nso_revised, isizulu_human, isizulu_revised, etc. | |
| # Rename zul columns to isizulu for backward compatibility with analysis code | |
| column_mapping = { | |
| 'zul_human': 'isizulu_human', | |
| 'zul_revised': 'isizulu_revised', | |
| 'zul_gemini': 'isizulu_mt_gemini', | |
| 'zul_gpt': 'isizulu_mt_gpt', | |
| 'zul_nllb': 'isizulu_mt_nllb', | |
| 'afr_gemini': 'afr_mt_gemini', | |
| 'afr_gpt': 'afr_mt_gpt', | |
| 'nso_gemini': 'nso_mt_gemini', | |
| 'nso_gpt': 'nso_mt_gpt', | |
| 'nso_nllb': 'nso_mt_nllb' | |
| } | |
| df_translations = df_translations.rename(columns=column_mapping) | |
| elif os.path.exists(f"{DATA_DIR}/translation_data.csv"): | |
| df_translations = pd.read_csv(f"{DATA_DIR}/translation_data.csv") | |
| else: | |
| print("No translation data found, using sample data") | |
| df_translations = load_translation_data() # Fallback to sample data | |
| # Try to load BLEU scores | |
| if os.path.exists(f"{DATA_DIR}/bleu_scores.csv"): | |
| df_bleu = pd.read_csv(f"{DATA_DIR}/bleu_scores.csv") | |
| # Convert zul references to isizulu for compatibility | |
| df_bleu['comparison_pair'] = df_bleu['comparison_pair'].str.replace('zul_', 'isizulu_') | |
| df_bleu['language'] = df_bleu['language'].replace('isiZulu', 'isiZulu') # Already correct | |
| else: | |
| # Sample BLEU data (using isizulu for compatibility with existing analysis code) | |
| df_bleu = pd.DataFrame({ | |
| 'comparison_pair': ['afr_human_vs_afr_gemini', 'afr_human_vs_afr_gpt', 'afr_human_vs_afr_revised', 'nso_human_vs_nso_gemini', 'nso_human_vs_nso_gpt', 'nso_human_vs_nso_revised', 'nso_human_vs_nso_nllb', 'isizulu_human_vs_isizulu_gemini', 'isizulu_human_vs_isizulu_gpt', 'isizulu_human_vs_isizulu_revised', 'isizulu_human_vs_isizulu_nllb'], | |
| 'bleu_score': [0.78, 0.72, 0.89, 0.65, 0.68, 0.85, 0.71, 0.71, 0.69, 0.87, 0.73], | |
| 'language': ['Afrikaans', 'Afrikaans', 'Afrikaans', 'Northern Sotho', 'Northern Sotho', 'Northern Sotho', 'Northern Sotho', 'isiZulu', 'isiZulu', 'isiZulu', 'isiZulu'] | |
| }) | |
| # Try to load COMET scores | |
| if os.path.exists(f"{DATA_DIR}/comet_scores.csv"): | |
| df_comet = pd.read_csv(f"{DATA_DIR}/comet_scores.csv") | |
| else: | |
| # Sample COMET data | |
| df_comet = pd.DataFrame({ | |
| 'comparison_pair': ['afr_human_vs_afr_gemini', 'afr_human_vs_afr_gpt', 'afr_human_vs_afr_revised', 'nso_human_vs_nso_gemini', 'nso_human_vs_nso_gpt', 'nso_human_vs_nso_revised', 'isizulu_human_vs_isizulu_gemini', 'isizulu_human_vs_isizulu_gpt', 'isizulu_human_vs_isizulu_revised'], | |
| 'comet_score': [0.82, 0.79, 0.92, 0.71, 0.74, 0.88, 0.76, 0.73, 0.90], | |
| 'language': ['Afrikaans', 'Afrikaans', 'Afrikaans', 'Northern Sotho', 'Northern Sotho', 'Northern Sotho', 'isiZulu', 'isiZulu', 'isiZulu'] | |
| }) | |
| # Try to load CHRF scores | |
| if os.path.exists(f"{DATA_DIR}/chrf_scores.csv"): | |
| df_chrf = pd.read_csv(f"{DATA_DIR}/chrf_scores.csv") | |
| else: | |
| # Sample CHRF data | |
| df_chrf = pd.DataFrame({ | |
| 'comparison_pair': ['afr_human_vs_afr_gemini', 'afr_human_vs_afr_gpt', 'afr_human_vs_afr_revised', 'nso_human_vs_nso_gemini', 'nso_human_vs_nso_gpt', 'nso_human_vs_nso_revised', 'isizulu_human_vs_isizulu_gemini', 'isizulu_human_vs_isizulu_gpt', 'isizulu_human_vs_isizulu_revised'], | |
| 'chrf_score': [0.75, 0.70, 0.88, 0.60, 0.65, 0.80, 0.68, 0.66, 0.85], | |
| 'language': ['Afrikaans', 'Afrikaans', 'Afrikaans', 'Northern Sotho', 'Northern Sotho', 'Northern Sotho', 'isiZulu', 'isiZulu', 'isiZulu'] | |
| }) | |
| return df_translations, df_bleu, df_comet, df_chrf | |
| except Exception as e: | |
| st.error(f"Error loading data: {str(e)}") | |
| return None, None, None, None | |
| # Load all data | |
| df_translations, df_bleu, df_comet, df_chrf = load_analysis_data() | |
| if df_translations is not None: | |
| # Language selection in columns | |
| lang_col1, lang_col2 = st.columns([2, 10]) | |
| with lang_col1: | |
| st.markdown('<div style="margin-top: 8px; font-weight: 500;">Select Language:</div>', unsafe_allow_html=True) | |
| with lang_col2: | |
| languages = ['Afrikaans', 'Northern Sotho', 'isiZulu'] | |
| selected_lang = st.selectbox( | |
| "Select Language for Analysis:", | |
| languages, | |
| key="global_lang_select", | |
| label_visibility="collapsed" | |
| ) | |
| # Get language code | |
| lang_codes = {'Afrikaans': 'afr', 'Northern Sotho': 'nso', 'isiZulu': 'isizulu'} | |
| code = lang_codes[selected_lang] | |
| # Create analysis tabs | |
| analysis_tab1, analysis_tab2, analysis_tab3, analysis_tab4 = st.tabs(["Sample Translations", "📊 Quality Metrics", "🔄 Revision Analysis", "🔍 Word Comparison"]) | |
| with analysis_tab1: | |
| # Translation Samples Tab | |
| st.markdown(""" | |
| <div style="margin: 20px 0;"> | |
| <h4 style="font-family: 'Inter', sans-serif; font-size: 1.2rem; font-weight: 600; color: #374151; margin: 0 0 16px 0;"> | |
| 📝 Translation Samples for {selected_lang} | |
| </h4> | |
| </div> | |
| """.format(selected_lang=selected_lang), unsafe_allow_html=True) | |
| # Use the global language selection | |
| samples_code = code | |
| # Show sample translations for the selected language | |
| display_cols = ['english'] + [col for col in df_translations.columns if col.startswith(samples_code)] | |
| if display_cols and len(display_cols) > 1: # Need at least english + 1 translation column | |
# Control panel: page-size selector on the left, page navigation on the right.
label_slot, size_slot, page_label_slot, nav_slot = st.columns([1, 7, 1, 2])
with label_slot:
    st.markdown('<div style="margin-top: 8px; font-weight: 500;">Samples per page:</div>', unsafe_allow_html=True)
with size_slot:
    page_size = st.selectbox(
        "Samples per page:",
        [10, 25, 50, 100],
        index=0,
        key="page_size_select",
        label_visibility="collapsed"
    )

# The first run in a session starts on page 1.
if 'current_page' not in st.session_state:
    st.session_state.current_page = 1

# Keep only rows where at least one translation column has a value.
translation_cols = [col for col in display_cols if col != 'english']
available_data = df_translations[display_cols].dropna(subset=translation_cols, how='all')
total_samples = len(available_data)

# Ceiling division, with a minimum of one page so "1 / 1" is shown for empty data.
full_pages, leftover = divmod(total_samples, page_size)
total_pages = max(1, full_pages + (1 if leftover else 0))

# A stored page number beyond the last page falls back to page 1.
if st.session_state.current_page > total_pages:
    st.session_state.current_page = 1

# Half-open slice [start_idx, end_idx) of the rows shown on the current page.
start_idx = (st.session_state.current_page - 1) * page_size
end_idx = min(start_idx + page_size, total_samples)
current_page_data = available_data.iloc[start_idx:end_idx]


def _go_to_page(page_number):
    """Store the requested page in session state and rerun the script."""
    st.session_state.current_page = page_number
    st.rerun()


with page_label_slot:
    st.markdown('<div style="margin-top: 8px; font-weight: 500;">Page:</div>', unsafe_allow_html=True)
with nav_slot:
    # Page navigation: first / previous / "n / total" / next / last.
    on_first_page = st.session_state.current_page == 1
    on_last_page = st.session_state.current_page == total_pages
    first_slot, prev_slot, status_slot, next_slot, last_slot = st.columns([1, 1, 2, 1, 1])
    with first_slot:
        if st.button("⏮️", key="first_page", help="First page", disabled=on_first_page):
            _go_to_page(1)
    with prev_slot:
        if st.button("◀️", key="prev_page", help="Previous page", disabled=on_first_page):
            _go_to_page(st.session_state.current_page - 1)
    with status_slot:
        st.markdown(f'<div style="text-align: center; margin-top: 8px; font-weight: 500;">{st.session_state.current_page} / {total_pages}</div>', unsafe_allow_html=True)
    with next_slot:
        if st.button("▶️", key="next_page", help="Next page", disabled=on_last_page):
            _go_to_page(st.session_state.current_page + 1)
    with last_slot:
        if st.button("⏭️", key="last_page", help="Last page", disabled=on_last_page):
            _go_to_page(total_pages)
# Statistics cards: one metric card per column, rendered left to right.
available_systems = len([col for col in display_cols if col != 'english'])
summary_cards = [
    ("Showing", len(current_page_data)),
    ("Translation Systems", available_systems),
    ("Total Available", total_samples),
    ("Current Page", f"{st.session_state.current_page}/{total_pages}"),
]
for card_column, (card_title, card_value) in zip(st.columns(4), summary_cards):
    with card_column:
        st.markdown(f"""
<div class="metric-card">
<div class="metric-title">{card_title}</div>
<div class="metric-value">{card_value}</div>
</div>
""", unsafe_allow_html=True)
# Display the samples table for the current page.
st.markdown("### Translation Examples")
if len(current_page_data) == 0:
    st.warning("⚠️ No translation samples found for the current page.")
else:
    display_df = current_page_data.copy()

    # Substring markers checked in priority order; the first match names the
    # system. '_gemini', '_gpt' and '_nllb' also match their '_mt_' variants.
    system_markers = (
        ('_human', 'Human'),
        ('_revised', 'Revised'),
        ('_gemini', 'Gemini'),
        ('_gpt', 'GPT'),
        ('_nllb', 'NLLB'),
    )
    column_rename = {'english': 'English (Source)'}
    for col in display_df.columns:
        if not col.startswith(samples_code):
            continue
        system_label = next((label for marker, label in system_markers if marker in col), None)
        if system_label is None:
            # Generic fallback: derive a title-cased label from the column name.
            system_label = col.replace(f'{samples_code}_', '').replace('_', ' ').title()
        column_rename[col] = f'{selected_lang} ({system_label})'
    display_df = display_df.rename(columns=column_rename)

    # Number rows by their 1-based position in the full filtered dataset.
    display_df.index = range(start_idx + 1, end_idx + 1)
    display_df.index.name = 'Sample #'

    text_columns = {
        col: st.column_config.TextColumn(col, width="medium")
        for col in display_df.columns
    }
    table_height = min(600, 50 + len(display_df) * 35)  # grows with row count, capped at 600
    st.dataframe(
        display_df,
        use_container_width=True,
        height=table_height,
        column_config=text_columns
    )

    # Page info summary
    st.markdown(f"""
<div style="margin-top: 16px; padding: 12px; background: #f8fafc; border-radius: 6px; text-align: center; color: #6b7280; font-size: 0.9rem;">
📄 Showing samples {start_idx + 1} to {end_idx} of {total_samples} total samples • Page {st.session_state.current_page} of {total_pages}
</div>
""", unsafe_allow_html=True)

    # Quick jump is only offered when there are more than five pages.
    if total_pages > 5:
        st.markdown("### Quick Navigation")
        _, jump_slot, _ = st.columns([1, 2, 1])
        with jump_slot:
            target_page = st.number_input(
                f"Jump to page (1-{total_pages}):",
                min_value=1,
                max_value=total_pages,
                value=st.session_state.current_page,
                key="page_jump"
            )
            # Rerun only when the button is pressed and the target differs.
            if st.button("🔗 Go to Page", use_container_width=True) and target_page != st.session_state.current_page:
                st.session_state.current_page = target_page
                st.rerun()
| else: | |
st.warning(f"⚠️ No translation data available for {selected_lang}. Expected columns starting with '{samples_code}_'")
# Debug information.
# This path is only taken when no column starts with `samples_code`
# (display_cols held nothing but 'english'), so listing columns for the
# selected language would always be empty. Show instead which language
# columns the dataset does contain.
known_prefixes = ('afr_', 'nso_', 'isizulu_')
all_lang_columns = [col for col in df_translations.columns if col.startswith(known_prefixes)]
if all_lang_columns:
    # Cap the list at ten names and mark truncation with an ellipsis.
    st.info(f"💡 Available language columns: {', '.join(all_lang_columns[:10])}{'...' if len(all_lang_columns) > 10 else ''}")
| with analysis_tab2: | |
| st.markdown(""" | |
| <div style="margin: 20px 0;"> | |
| <h4 style="font-family: 'Inter', sans-serif; font-size: 1.2rem; font-weight: 600; color: #374151; margin: 0 0 16px 0;"> | |
| 📈 Quality Metrics for {selected_lang} | |
| </h4> | |
| </div> | |
| """.format(selected_lang=selected_lang), unsafe_allow_html=True) | |
| # Get language code | |
| lang_codes = {'Afrikaans': 'afr', 'Northern Sotho': 'nso', 'isiZulu': 'isizulu'} | |
| code = lang_codes[selected_lang] | |
| # Score visualizations | |
| if df_bleu is not None and df_chrf is not None and df_comet is not None: | |
def _rows_for_language(scores):
    """Return the rows of `scores` for the selected language.

    Frames without a 'language' column are returned unchanged.
    """
    if 'language' in scores.columns:
        return scores[scores['language'] == selected_lang]
    return scores


# Filter scores for the selected language.
lang_bleu = _rows_for_language(df_bleu)
lang_chrf = _rows_for_language(df_chrf)
lang_comet = _rows_for_language(df_comet)

# Domain-level data needs a 'domain' column in all three frames and at least
# one BLEU row that is not the 'Overall' aggregate.
has_domain_data = (
    all('domain' in frame.columns for frame in (lang_bleu, lang_chrf, lang_comet))
    and len(lang_bleu[lang_bleu['domain'] != 'Overall']) > 0
)

if has_domain_data:
    # Domain filter: 'Overall' first, then the remaining domains alphabetically.
    available_domains = sorted(lang_bleu['domain'].unique())
    domain_options = ['Overall'] + [d for d in available_domains if d != 'Overall']
    selected_domain = st.selectbox(
        "📍 Select Domain for Analysis:",
        domain_options,
        key=f"domain_selector_{selected_lang}"
    )
    # 'Overall' is stored as an ordinary value of the domain column, so a
    # single equality filter covers both the aggregate and per-domain views.
    display_bleu = lang_bleu[lang_bleu['domain'] == selected_domain]
    display_chrf = lang_chrf[lang_chrf['domain'] == selected_domain]
    display_comet = lang_comet[lang_comet['domain'] == selected_domain]
    chart_title_suffix = f" - {selected_domain}"
else:
    # No domain column: chart everything available for the language.
    display_bleu, display_chrf, display_comet = lang_bleu, lang_chrf, lang_comet
    chart_title_suffix = ""
| # Create score charts | |
# Score bar charts are drawn only when every metric has rows to show.
if len(display_bleu) > 0 and len(display_chrf) > 0 and len(display_comet) > 0:
    # (scores frame, score column, metric label, colour scale), left to right.
    bar_chart_specs = [
        (display_chrf, 'chrf_score', 'chrF', 'oranges'),
        (display_bleu, 'bleu_score', 'BLEU', 'blues'),
        (display_comet, 'comet_score', 'COMET', 'greens'),
    ]
    chart_col1, chart_col2, chart_col3 = st.columns(3)
    bar_figures = []
    for chart_column, (scores, score_field, metric_label, colour_scale) in zip(
        (chart_col1, chart_col2, chart_col3), bar_chart_specs
    ):
        with chart_column:
            figure = px.bar(
                scores,
                x='comparison_pair',
                y=score_field,
                title=f'{metric_label} Scores - {selected_lang}{chart_title_suffix}',
                color=score_field,
                color_continuous_scale=colour_scale
            )
            figure.update_layout(
                xaxis_title="Translation Pairs",
                yaxis_title=f"{metric_label} Score",
                xaxis_tickangle=-45,
                height=400,
                font=dict(family="Inter", size=12)
            )
            st.plotly_chart(figure, use_container_width=True)
        bar_figures.append(figure)
    # Keep the per-metric figure names available to any later code.
    fig_chrf, fig_bleu, fig_comet = bar_figures
# Spider (radar) charts of per-domain scores, shown only when domain-level data exists.
if has_domain_data:
    st.markdown(f"""
<h4 style="font-family: 'Inter', sans-serif; font-weight: 600; color: #374151; margin: 20px 0 16px 0;">
🕸️ Domain Performance Spider Charts - {selected_lang}
</h4>
""", unsafe_allow_html=True)


    def _domain_pivot(scores, score_field):
        """Pivot non-'Overall' rows to comparison_pair x domain, filling gaps with 0."""
        domain_rows = scores[scores['domain'] != 'Overall']
        return domain_rows.pivot(
            index='comparison_pair',
            columns='domain',
            values=score_field
        ).fillna(0)


    pivot_bleu = _domain_pivot(lang_bleu, 'bleu_score')
    pivot_chrf = _domain_pivot(lang_chrf, 'chrf_score')
    pivot_comet = _domain_pivot(lang_comet, 'comet_score')

    # Give all three pivots the same, sorted set of domain columns.
    domains = sorted(set(pivot_bleu.columns) | set(pivot_chrf.columns) | set(pivot_comet.columns))
    pivot_bleu = pivot_bleu.reindex(columns=domains, fill_value=0)
    pivot_chrf = pivot_chrf.reindex(columns=domains, fill_value=0)
    pivot_comet = pivot_comet.reindex(columns=domains, fill_value=0)

    # One RGB triple per trace colour: translucent fill, opaque outline.
    palette_rgb = [
        '255, 99, 132',   # Red
        '54, 162, 235',   # Blue
        '99, 255, 132',   # Green
        '75, 192, 192',   # Teal
        '255, 205, 86',   # Yellow
        '153, 102, 255',  # Purple
        '255, 159, 64',   # Orange
        '199, 199, 199',  # Grey
        '83, 102, 255',   # Indigo
        '255, 99, 255',   # Magenta
    ]
    distinct_colors = [f'rgba({rgb}, 0.4)' for rgb in palette_rgb]
    border_colors = [f'rgba({rgb}, 1.0)' for rgb in palette_rgb]

    # (metric label, pivot, show legend). Only the middle chart carries the legend.
    spider_specs = [
        ('chrF', pivot_chrf, False),
        ('BLEU', pivot_bleu, True),
        ('COMET', pivot_comet, False),
    ]
    bottom_legend = dict(
        orientation="h",
        yanchor="bottom",
        y=-0.3,
        xanchor="center",
        x=0.5
    )

    # Layout for three side-by-side spider charts.
    spider_col1, spider_col2, spider_col3 = st.columns(3)
    spider_figures = []
    for spider_column, (metric_label, pivot, with_legend) in zip(
        (spider_col1, spider_col2, spider_col3), spider_specs
    ):
        with spider_column:
            spider = go.Figure()
            for i, (model_name, row) in enumerate(pivot.iterrows()):
                color_idx = i % len(distinct_colors)  # cycle the palette when there are many pairs
                radii = row.tolist()
                spider.add_trace(go.Scatterpolar(
                    r=radii + [radii[0]],  # repeat the first point to close the loop
                    theta=domains + [domains[0]],
                    fill='toself',
                    name=model_name.split('_')[-1].upper(),
                    fillcolor=distinct_colors[color_idx],
                    line=dict(color=border_colors[color_idx], width=2),
                    opacity=0.7,
                    showlegend=with_legend
                ))
            layout_options = dict(
                polar=dict(radialaxis=dict(visible=True, range=[0, 1])),
                showlegend=with_legend,
                title=dict(text=f"Domain Performance ({metric_label}) - {selected_lang}"),
                height=450
            )
            if with_legend:
                layout_options['legend'] = bottom_legend
            spider.update_layout(**layout_options)
            st.plotly_chart(spider, use_container_width=True)
        spider_figures.append(spider)
    # Keep the per-metric figure names available to any later code.
    fig_chrf_spider, fig_bleu_spider, fig_comet_spider = spider_figures
| # # Overall Performance Summary | |
| # st.markdown(""" | |
| # <h4 style="font-family: 'Inter', sans-serif; font-weight: 600; color: #374151; margin: 30px 0 16px 0;"> | |
| # 📋 Overall Performance Summary | |
| # </h4> | |
| # """, unsafe_allow_html=True) | |
| # # Create overall summary table | |
| # if len(display_bleu) > 0 and len(display_chrf) > 0 and len(display_comet) > 0: | |
| # # Merge all three metrics | |
| # merged_scores = pd.merge(display_bleu, display_chrf, on='comparison_pair', suffixes=('_bleu', '_chrf')) | |
| # merged_scores = pd.merge(merged_scores, display_comet, on='comparison_pair') | |
| # merged_scores['model'] = merged_scores['comparison_pair'].apply(lambda x: x.split('_')[-1].upper()) | |
| # summary_data = [] | |
| # for _, row in merged_scores.iterrows(): | |
| # summary_data.append({ | |
| # 'Model': row['model'], | |
| # 'BLEU Score': f"{row['bleu_score']:.3f}", | |
| # 'chrF Score': f"{row['chrf_score']:.3f}", | |
| # 'COMET Score': f"{row['comet_score']:.3f}", | |
| # 'Average': f"{(row['bleu_score'] + row['chrf_score'] + row['comet_score']) / 3:.3f}" | |
| # }) | |
| # summary_df = pd.DataFrame(summary_data) | |
| # # Only sort if dataframe has data and 'Average' column exists | |
| # if len(summary_df) > 0 and 'Average' in summary_df.columns: | |
| # summary_df = summary_df.sort_values('Average', ascending=False) | |
| # # Style the dataframe | |
| # st.dataframe( | |
| # summary_df, | |
| # use_container_width=True, | |
| # hide_index=True, | |
| # column_config={ | |
| # "Model": st.column_config.TextColumn("Model", width="medium"), | |
| # "BLEU Score": st.column_config.NumberColumn("BLEU Score", format="%.3f"), | |
| # "chrF Score": st.column_config.NumberColumn("chrF Score", format="%.3f"), | |
| # "COMET Score": st.column_config.NumberColumn("COMET Score", format="%.3f"), | |
| # "Average": st.column_config.NumberColumn("Average", format="%.3f") | |
| # } | |
| # ) | |
| with analysis_tab3: | |
| # Revision Analysis Tab | |
| st.markdown(""" | |
| <div style="margin: 20px 0;"> | |
| <h4 style="font-family: 'Inter', sans-serif; font-size: 1.2rem; font-weight: 600; color: #374151; margin: 0 0 16px 0;"> | |
| ✏️ Human Translation Revision Analysis for {selected_lang} | |
| </h4> | |
| </div> | |
| """.format(selected_lang=selected_lang), unsafe_allow_html=True) | |
| # Use the global language selection | |
| rev_code = code | |
| # Check for revision columns | |
| human_col = f"{rev_code}_human" | |
| revised_col = f"{rev_code}_revised" | |
| if human_col in df_translations.columns and revised_col in df_translations.columns: | |
| # Get all rows with human translations for this language | |
| df_lang_data = df_translations[[human_col, revised_col]].copy() | |
| # Remove rows where human translation is missing (can't analyze revisions without original) | |
| df_lang_data = df_lang_data[df_lang_data[human_col].notna()].copy() | |
| total_human_translations = len(df_lang_data) | |
| if total_human_translations == 0: | |
| st.warning(f"⚠️ No human translations found for {selected_lang}") | |
| else: | |
# Calculate revision statistics over the rows that have a human translation.
# Count the revisions actually present BEFORE filling gaps, and only on the
# analysed rows, so this figure can never exceed "Human Translations".
revisions_available = int(df_lang_data[revised_col].notna().sum())
# For missing revised translations, assume no revision was made (same as original).
df_lang_data[revised_col] = df_lang_data[revised_col].fillna(df_lang_data[human_col])
# Count actual changes using the same stripped-text comparison that
# categorize_revision uses for "No Change", so the "Changes Made" card agrees
# with the revision-type charts (whitespace-only differences are not changes).
original_text = df_lang_data[human_col].astype(str).str.strip()
revised_text = df_lang_data[revised_col].astype(str).str.strip()
revisions_made = int((original_text != revised_text).sum())
# total_human_translations is non-zero on this path (the zero case is handled above).
revision_rate = (revisions_made / total_human_translations) * 100
| # Calculate revision types | |
def categorize_revision(original, revised):
    """Classify how a revised translation differs from its original.

    Returns "Missing Data" when either text is missing, "No Change" when the
    texts are equal after stripping surrounding whitespace, and otherwise
    "Expansion", "Reduction" or "Modification" depending on whether the
    revision has more, fewer or the same number of whitespace-separated words.
    """
    if pd.isna(original) or pd.isna(revised):
        return "Missing Data"

    original_text, revised_text = str(original), str(revised)
    if original_text.strip() == revised_text.strip():
        return "No Change"

    # Positive when the revision is longer, negative when it is shorter.
    word_delta = len(revised_text.lower().split()) - len(original_text.lower().split())
    if word_delta > 0:
        return "Expansion"
    if word_delta < 0:
        return "Reduction"
    return "Modification"
# Label every row with its revision category, pairing original and revised text positionally.
df_lang_data['revision_type'] = [
    categorize_revision(original_text, revised_text)
    for original_text, revised_text in zip(df_lang_data[human_col], df_lang_data[revised_col])
]
# Revision statistics cards: one metric card per column, rendered left to right.
revision_cards = [
    ("Human Translations", total_human_translations),
    ("Revisions Available", revisions_available),
    ("Changes Made", revisions_made),
    ("Revision Rate", f"{revision_rate:.1f}%"),
]
for card_column, (card_title, card_value) in zip(st.columns(4), revision_cards):
    with card_column:
        st.markdown(f"""
<div class="metric-card">
<div class="metric-title">{card_title}</div>
<div class="metric-value">{card_value}</div>
</div>
""", unsafe_allow_html=True)
| # Revision type analysis | |
| st.markdown(""" | |
| <h4 style="font-family: 'Inter', sans-serif; font-weight: 600; color: #374151; margin: 30px 0 16px 0;"> | |
| 📈 Revision Pattern Analysis | |
| </h4> | |
| """, unsafe_allow_html=True) | |
| revision_counts = df_lang_data['revision_type'].value_counts() | |
| if len(revision_counts) > 0: | |
# Revision type charts: pie of shares on the left, horizontal bar of counts on the right.
type_names = revision_counts.index
type_totals = revision_counts.values
chart_font = dict(family="Inter", size=12)

rev_chart_col1, rev_chart_col2 = st.columns(2)
with rev_chart_col1:
    # Pie chart of revision types
    fig_pie = px.pie(
        values=type_totals,
        names=type_names,
        title="Revision Types Distribution",
        color_discrete_sequence=px.colors.qualitative.Set3
    )
    fig_pie.update_layout(height=400, font=chart_font)
    st.plotly_chart(fig_pie, use_container_width=True)
with rev_chart_col2:
    # Bar chart of revision types, coloured by count
    fig_bar = px.bar(
        x=type_totals,
        y=type_names,
        orientation='h',
        title="Revision Frequency",
        color=type_totals,
        color_continuous_scale='viridis'
    )
    fig_bar.update_layout(
        height=400,
        xaxis_title="Count",
        yaxis_title="Revision Type",
        font=chart_font
    )
    st.plotly_chart(fig_bar, use_container_width=True)
| # Word-level revision analysis | |
| st.markdown(""" | |
| <h4 style="font-family: 'Inter', sans-serif; font-weight: 600; color: #374151; margin: 30px 0 16px 0;"> | |
| 🔤 Word-Level Changes Analysis | |
| </h4> | |
| """, unsafe_allow_html=True) | |
# Tally, over rows whose revision type is not 'No Change', how often each
# lower-cased word appears only in the revision (added) or only in the
# original (removed). Each word counts at most once per row.
from collections import Counter

added_counts = Counter()
removed_counts = Counter()
changed_revisions = df_lang_data[df_lang_data['revision_type'] != 'No Change']
for _, revision in changed_revisions.iterrows():
    if pd.isna(revision[human_col]) or pd.isna(revision[revised_col]):
        continue
    original_vocab = set(str(revision[human_col]).lower().split())
    revised_vocab = set(str(revision[revised_col]).lower().split())
    added_counts.update(revised_vocab - original_vocab)
    removed_counts.update(original_vocab - revised_vocab)
# Side-by-side panels: most added words (left) and most removed words (right).
# Each entry: (counter, heading, chart title, colour scale, note shown when empty).
word_panels = [
    (added_counts, "**🟢 Most Added Words**", "Most Frequently Added Words", 'Greens', "*No words added in revisions*"),
    (removed_counts, "**🔴 Most Removed Words**", "Most Frequently Removed Words", 'Reds', "*No words removed in revisions*"),
]
for panel_column, (word_counts, heading, chart_title, colour_scale, empty_note) in zip(st.columns(2), word_panels):
    with panel_column:
        st.markdown(heading)
        if not word_counts:
            st.markdown(empty_note)
            continue
        # Horizontal bar chart of the 15 most frequent words.
        top_words = word_counts.most_common(15)
        word_labels = [word for word, _ in top_words]
        word_frequencies = [frequency for _, frequency in top_words]
        word_figure = px.bar(
            x=word_frequencies,
            y=word_labels,
            orientation='h',
            title=chart_title,
            color=word_frequencies,
            color_continuous_scale=colour_scale
        )
        word_figure.update_layout(
            height=400,
            xaxis_title="Frequency",
            yaxis_title="Words",
            font=dict(family="Inter", size=10)
        )
        st.plotly_chart(word_figure, use_container_width=True)
| # Revision examples | |
| st.markdown(""" | |
| <h4 style="font-family: 'Inter', sans-serif; font-weight: 600; color: #374151; margin: 30px 0 16px 0;"> | |
| 📝 Revision Examples | |
| </h4> | |
| """, unsafe_allow_html=True) | |
# Show examples of different types of revisions.
# Local import, in line with the other in-script imports in this file.
from html import escape as _escape_html


def _render_revision_card(example):
    """Render one original/revised pair as a styled HTML card.

    The translation texts come from the dataset and are inserted into markup
    rendered with unsafe_allow_html, so they are HTML-escaped first. Without
    this, characters such as '<' or '&' in a translation would corrupt the
    card or be interpreted as markup.
    """
    original_html = _escape_html(str(example[human_col]))
    revised_html = _escape_html(str(example[revised_col]))
    st.markdown(f"""
<div style="background: #f8fafc; border-left: 4px solid #3b82f6; padding: 16px; margin: 10px 0; border-radius: 0 8px 8px 0;">
<div style="font-weight: 600; color: #1e40af; margin-bottom: 8px;">Original:</div>
<div style="margin-bottom: 12px; font-family: monospace; background: #fff; padding: 8px; border-radius: 4px;">{original_html}</div>
<div style="font-weight: 600; color: #059669; margin-bottom: 8px;">Revised:</div>
<div style="margin-bottom: 8px; font-family: monospace; background: #fff; padding: 8px; border-radius: 4px;">{revised_html}</div>
<div style="font-size: 0.875rem; color: #6b7280;">Type: <strong>{example['revision_type']}</strong></div>
</div>
""", unsafe_allow_html=True)


# Only the first ten changed rows are considered for examples.
revision_examples = changed_revisions.head(10)
if len(revision_examples) > 0:
    available_types = revision_examples['revision_type'].unique()
    if len(available_types) > 1:
        # One tab per revision type, labelled with its example count.
        type_tabs = st.tabs([
            f"{rtype} ({len(revision_examples[revision_examples['revision_type'] == rtype])})"
            for rtype in available_types
        ])
        for type_tab, rtype in zip(type_tabs, available_types):
            with type_tab:
                # At most five examples per type.
                type_examples = revision_examples[revision_examples['revision_type'] == rtype].head(5)
                for _, example in type_examples.iterrows():
                    _render_revision_card(example)
    else:
        # Single type: show the examples directly, without tabs.
        for _, example in revision_examples.iterrows():
            _render_revision_card(example)
else:
    st.info(f"No revisions found for {selected_lang}.")
| else: | |
| st.info(f"No revision data available for analysis.") | |
| else: | |
| st.warning(f"⚠️ Revision columns not found for {selected_lang}. Expected columns: `{human_col}` and `{revised_col}`") | |
| with analysis_tab4: | |
| # Translation comparison section | |
| st.markdown(""" | |
| <div style="margin: 20px 0;"> | |
| <h4 style="font-family: 'Inter', sans-serif; font-size: 1.2rem; font-weight: 600; color: #374151; margin: 0 0 16px 0;"> | |
| 🔍 Translation Comparison & Word Analysis for {selected_lang} | |
| </h4> | |
| </div> | |
| """.format(selected_lang=selected_lang), unsafe_allow_html=True) | |
# Use the global language selection.
comp_code = code
# Translation columns for the selected language: names starting with the
# language code, never the English source column.
available_cols = [
    col for col in df_translations.columns
    if col.startswith(comp_code) and col != 'english'
]
| if len(available_cols) >= 2: | |
# Three equal columns: first translation, second translation, analyze button.
first_slot, second_slot, action_slot = st.columns([1, 1, 1])
with first_slot:
    col1_selection = st.selectbox(
        "First Translation:",
        available_cols,
        key="col1_select"
    )
with second_slot:
    # The second choice excludes whatever was picked first.
    remaining_cols = [col for col in available_cols if col != col1_selection]
    col2_selection = st.selectbox(
        "Second Translation:",
        remaining_cols,
        key="col2_select"
    )
with action_slot:
    # Spacer so the button lines up with the select boxes.
    st.markdown('<div style="margin-top: 25px;"></div>', unsafe_allow_html=True)
    analyze_clicked = st.button(
        "🔍 Analyze",
        type="primary",
        use_container_width=True,
        key="analyze_word_diff_btn"
    )
| if analyze_clicked: | |
| # Perform word analysis with ALL available data | |
def get_word_differences(text1, text2):
    """Compare the lower-cased word sets of two texts.

    A missing text (None/NaN) is treated as having no words. Returns a tuple
    of three sets: words only in text1, words only in text2, and words
    present in both.
    """
    def _vocabulary(text):
        # Missing values contribute an empty vocabulary.
        return set() if pd.isna(text) else set(str(text).lower().split())

    words1, words2 = _vocabulary(text1), _vocabulary(text2)
    return words1 - words2, words2 - words1, words1 & words2
# Walk every row once, collecting two kinds of word lists:
#   * all_words_N    - every word occurrence in column N (frequency counting)
#   * unique/common  - per-row set differences, only for rows with both texts
from collections import Counter

unique_words_1, unique_words_2, common_words = [], [], []
all_words_1, all_words_2 = [], []

for _, record in df_translations.iterrows():
    first_value = record[col1_selection]
    second_value = record[col2_selection]
    has_first = pd.notna(first_value)
    has_second = pd.notna(second_value)

    # Nothing to collect when both cells are empty.
    if not (has_first or has_second):
        continue

    if has_first:
        all_words_1.extend(str(first_value).lower().split())
    if has_second:
        all_words_2.extend(str(second_value).lower().split())

    # Set comparison only makes sense when both texts exist.
    if has_first and has_second:
        only_first, only_second, shared = get_word_differences(first_value, second_value)
        unique_words_1.extend(only_first)
        unique_words_2.extend(only_second)
        common_words.extend(shared)

# Frequency tables.
all_freq_1 = Counter(all_words_1)        # every word from column 1
all_freq_2 = Counter(all_words_2)        # every word from column 2
unique_freq_1 = Counter(unique_words_1)  # words found only in column 1 for a row
unique_freq_2 = Counter(unique_words_2)  # words found only in column 2 for a row
common_freq = Counter(common_words)      # words shared by both columns for a row
# Display statistics: four summary tiles inside the comparison container.
st.markdown('<div class="comparison-container">', unsafe_allow_html=True)
# Each entry: (accent colour, heading, count, caption).
result_tiles = [
    ("#dc2626", f"Unique to {col1_selection.replace('_', ' ').title()}", len(unique_freq_1), "unique words"),
    ("#166534", f"Unique to {col2_selection.replace('_', ' ').title()}", len(unique_freq_2), "unique words"),
    ("#475569", "Common Words", len(common_freq), "shared words"),
    ("#7c3aed", "Total Vocabulary", len(set(all_words_1 + all_words_2)), "total unique words"),
]
for tile_column, (accent, heading, count, caption) in zip(st.columns(4), result_tiles):
    with tile_column:
        st.markdown(f"""
<div style="text-align: center; padding: 15px;">
<h5 style="color: {accent}; margin-bottom: 10px;">{heading}</h5>
<div style="font-size: 1.3rem; font-weight: bold; color: {accent};">{count}</div>
<div style="color: #6b7280; font-size: 0.8rem;">{caption}</div>
</div>
""", unsafe_allow_html=True)
st.markdown('</div>', unsafe_allow_html=True)
| # Word Clouds Section | |
| st.markdown(""" | |
| <h4 style="font-family: 'Inter', sans-serif; font-weight: 600; color: #374151; margin: 30px 0 16px 0;"> | |
| ☁️ Word Clouds Visualization | |
| </h4> | |
| """, unsafe_allow_html=True) | |
| # Generate word clouds using matplotlib and wordcloud | |
| try: | |
| # Show loading spinner while generating word clouds | |
| with st.spinner("🎨 Generating word clouds... This may take a moment."): | |
| import matplotlib.pyplot as plt | |
| from wordcloud import WordCloud | |
| import io | |
| import base64 | |
| # Function to create word cloud image (optimized) | |
def create_wordcloud_image(word_freq, title, color_scheme='viridis'):
    """Render a word cloud for ``word_freq`` and return it as a base64 PNG string.

    Relies on ``WordCloud``, ``plt``, ``io`` and ``base64`` imported in the
    enclosing scope, and on the module-level ``st`` (Streamlit).

    Args:
        word_freq: Mapping of word -> frequency, passed to
            ``WordCloud.generate_from_frequencies``.
        title: Title drawn above the cloud; also used in the warning message
            if rendering fails.
        color_scheme: Matplotlib colormap name forwarded to ``WordCloud``.

    Returns:
        The PNG image encoded as a base64 ``str`` (suitable for a
        ``data:image/png;base64,...`` URI), or ``None`` when ``word_freq`` is
        empty/``None`` or rendering fails (a Streamlit warning is shown in the
        failure case).
    """
    # Nothing to draw: callers treat None as "show the placeholder card".
    if not word_freq:
        return None

    fig = None
    try:
        # The full frequency mapping is supplied; max_words caps how many of
        # the words are actually drawn.
        wordcloud = WordCloud(
            width=300,
            height=200,
            background_color='white',
            colormap=color_scheme,
            max_words=25,
            relative_scaling=0.6,
            random_state=42,
            min_font_size=8,
            max_font_size=60,
            prefer_horizontal=0.9,
            collocations=False,
        ).generate_from_frequencies(word_freq)

        fig, ax = plt.subplots(figsize=(5, 3))
        ax.imshow(wordcloud, interpolation='bilinear')
        ax.axis('off')
        ax.set_title(title, fontsize=10, fontweight='bold', pad=10)

        # Save through the figure object rather than plt.savefig(): pyplot's
        # "current figure" is global state and may not be this figure.
        buffer = io.BytesIO()
        fig.savefig(
            buffer,
            format='png',
            bbox_inches='tight',
            dpi=100,
            facecolor='white',
        )
        return base64.b64encode(buffer.getvalue()).decode()
    except Exception as e:
        # Best-effort rendering: surface the problem in the UI and let the
        # caller fall back to its placeholder.
        st.warning(f"Error creating word cloud for {title}: {str(e)}")
        return None
    finally:
        # Always release the figure, including when rendering raised after
        # the figure had been created.
        if fig is not None:
            plt.close(fig)
| # Create all word clouds in one row | |
| cloud_col1, cloud_col2, cloud_col3 = st.columns(3) | |
| with cloud_col1: | |
| if unique_freq_1 and len(unique_freq_1) > 0: | |
| # Use ALL unique words but display top 25 in cloud | |
| img1 = create_wordcloud_image( | |
| dict(unique_freq_1), # Use ALL unique words for frequency | |
| f"Unique: {col1_selection.replace('_', ' ').title()}", | |
| 'Reds' | |
| ) | |
| if img1: | |
| st.markdown(f''' | |
| <div style="text-align: center; margin: 10px 0;"> | |
| <img src="data:image/png;base64,{img1}" style="max-width: 100%; height: auto; border-radius: 6px; box-shadow: 0 1px 4px rgba(0,0,0,0.1);"> | |
| </div> | |
| <div style="text-align: center; font-size: 0.8rem; color: #6b7280;"> | |
| Showing top 25 of {len(unique_freq_1)} unique words | |
| </div> | |
| ''', unsafe_allow_html=True) | |
| else: | |
| st.markdown(""" | |
| <div style="text-align: center; padding: 40px; background: #fef2f2; border-radius: 6px; color: #dc2626;"> | |
| <div style="font-size: 2rem;">📝</div> | |
| <div style="font-size: 0.9rem; margin-top: 8px;">No unique words</div> | |
| </div> | |
| """, unsafe_allow_html=True) | |
| else: | |
| st.markdown(""" | |
| <div style="text-align: center; padding: 40px; background: #f9fafb; border-radius: 6px; color: #6b7280;"> | |
| <div style="font-size: 2rem;">📝</div> | |
| <div style="font-size: 0.9rem; margin-top: 8px;">No unique words found</div> | |
| </div> | |
| """, unsafe_allow_html=True) | |
| with cloud_col2: | |
| if unique_freq_2 and len(unique_freq_2) > 0: | |
| # Use ALL unique words but display top 25 in cloud | |
| img2 = create_wordcloud_image( | |
| dict(unique_freq_2), # Use ALL unique words for frequency | |
| f"Unique: {col2_selection.replace('_', ' ').title()}", | |
| 'Greens' | |
| ) | |
| if img2: | |
| st.markdown(f''' | |
| <div style="text-align: center; margin: 10px 0;"> | |
| <img src="data:image/png;base64,{img2}" style="max-width: 100%; height: auto; border-radius: 6px; box-shadow: 0 1px 4px rgba(0,0,0,0.1);"> | |
| </div> | |
| <div style="text-align: center; font-size: 0.8rem; color: #6b7280;"> | |
| Showing top 25 of {len(unique_freq_2)} unique words | |
| </div> | |
| ''', unsafe_allow_html=True) | |
| else: | |
| st.markdown(""" | |
| <div style="text-align: center; padding: 40px; background: #f0fdf4; border-radius: 6px; color: #166534;"> | |
| <div style="font-size: 2rem;">📝</div> | |
| <div style="font-size: 0.9rem; margin-top: 8px;">No unique words</div> | |
| </div> | |
| """, unsafe_allow_html=True) | |
| else: | |
| st.markdown(""" | |
| <div style="text-align: center; padding: 40px; background: #f9fafb; border-radius: 6px; color: #6b7280;"> | |
| <div style="font-size: 2rem;">📝</div> | |
| <div style="font-size: 0.9rem; margin-top: 8px;">No unique words found</div> | |
| </div> | |
| """, unsafe_allow_html=True) | |
| with cloud_col3: | |
| if common_freq and len(common_freq) > 0: | |
| # Use ALL common words but display top 25 in cloud | |
| img3 = create_wordcloud_image( | |
| dict(common_freq), # Use ALL common words for frequency | |
| "Common Words", | |
| 'Blues' | |
| ) | |
| if img3: | |
| st.markdown(f''' | |
| <div style="text-align: center; margin: 10px 0;"> | |
| <img src="data:image/png;base64,{img3}" style="max-width: 100%; height: auto; border-radius: 6px; box-shadow: 0 1px 4px rgba(0,0,0,0.1);"> | |
| </div> | |
| <div style="text-align: center; font-size: 0.8rem; color: #6b7280;"> | |
| Showing top 25 of {len(common_freq)} common words | |
| </div> | |
| ''', unsafe_allow_html=True) | |
| else: | |
| st.markdown(""" | |
| <div style="text-align: center; padding: 40px; background: #eff6ff; border-radius: 6px; color: #1d4ed8;"> | |
| <div style="font-size: 2rem;">📝</div> | |
| <div style="font-size: 0.9rem; margin-top: 8px;">No common words</div> | |
| </div> | |
| """, unsafe_allow_html=True) | |
| else: | |
| st.markdown(""" | |
| <div style="text-align: center; padding: 40px; background: #f9fafb; border-radius: 6px; color: #6b7280;"> | |
| <div style="font-size: 2rem;">🤝</div> | |
| <div style="font-size: 0.9rem; margin-top: 8px;">No common words found</div> | |
| </div> | |
| """, unsafe_allow_html=True) | |
| except ImportError: | |
| st.warning("📦 WordCloud library not available. Install with: `pip install wordcloud`") | |
| # Fallback to top words lists | |
| st.markdown("**📋 Top Unique Words (Fallback)**") | |
| fallback_col1, fallback_col2, fallback_col3 = st.columns(3) | |
| with fallback_col1: | |
| st.markdown(f"**🔴 Unique to {col1_selection.replace('_', ' ').title()}**") | |
| if unique_freq_1: | |
| for word, count in unique_freq_1.most_common(10): | |
| st.markdown(f"• {word} ({count})") | |
| else: | |
| st.markdown("*No unique words*") | |
| with fallback_col2: | |
| st.markdown(f"**🟢 Unique to {col2_selection.replace('_', ' ').title()}**") | |
| if unique_freq_2: | |
| for word, count in unique_freq_2.most_common(10): | |
| st.markdown(f"• {word} ({count})") | |
| else: | |
| st.markdown("*No unique words*") | |
| with fallback_col3: | |
| st.markdown("**🔵 Common Words**") | |
| if common_freq: | |
| for word, count in common_freq.most_common(10): | |
| st.markdown(f"• {word} ({count})") | |
| else: | |
| st.markdown("*No common words*") | |
| # Word frequency bar charts as additional analysis | |
| st.markdown(""" | |
| <h4 style="font-family: 'Inter', sans-serif; font-weight: 600; color: #374151; margin: 30px 0 16px 0;"> | |
| 📊 Top Words Frequency Comparison | |
| </h4> | |
| """, unsafe_allow_html=True) | |
| freq_col1, freq_col2 = st.columns(2) | |
| with freq_col1: | |
| if unique_freq_1: | |
| top_words_1 = dict(unique_freq_1.most_common(10)) | |
| fig_freq1 = px.bar( | |
| x=list(top_words_1.values()), | |
| y=list(top_words_1.keys()), | |
| orientation='h', | |
| title=f"Top Unique Words: {col1_selection.replace('_', ' ').title()}", | |
| color=list(top_words_1.values()), | |
| color_continuous_scale='Reds' | |
| ) | |
| fig_freq1.update_layout( | |
| height=400, | |
| xaxis_title="Frequency", | |
| yaxis_title="Words", | |
| font=dict(family="Inter", size=10) | |
| ) | |
| st.plotly_chart(fig_freq1, use_container_width=True) | |
| with freq_col2: | |
| if unique_freq_2: | |
| top_words_2 = dict(unique_freq_2.most_common(10)) | |
| fig_freq2 = px.bar( | |
| x=list(top_words_2.values()), | |
| y=list(top_words_2.keys()), | |
| orientation='h', | |
| title=f"Top Unique Words: {col2_selection.replace('_', ' ').title()}", | |
| color=list(top_words_2.values()), | |
| color_continuous_scale='Greens' | |
| ) | |
| fig_freq2.update_layout( | |
| height=400, | |
| xaxis_title="Frequency", | |
| yaxis_title="Words", | |
| font=dict(family="Inter", size=10) | |
| ) | |
| st.plotly_chart(fig_freq2, use_container_width=True) | |
| else: | |
| st.warning("⚠️ Need at least 2 translation columns for comparison analysis.") | |
| else: | |
| st.markdown(""" | |
| <div style="background: #fef2f2; border: 1px solid #fecaca; border-radius: 8px; padding: 24px; margin: 16px 0; text-align: center;"> | |
| <h3 style="font-family: 'Inter', sans-serif; color: #dc2626; margin: 0 0 12px 0;">❌ No Data Available</h3> | |
| <p style="font-family: 'Inter', sans-serif; color: #7f1d1d; margin: 0;"> | |
| Please ensure translation data files are available in the data directory. | |
| </p> | |
| </div> | |
| """, unsafe_allow_html=True) | |
| # Footer | |
| st.markdown("---") | |
| st.markdown(""" | |
| <div style="text-align: center; color: #6b7280; font-family: 'Inter', sans-serif; font-size: 0.875rem;"> | |
| Built for DSFSI using Streamlit • Translation APIs: Gemini, GPT, NLLB (hosted locally) • Data Science for Social Impact | |
| </div> | |
| """, unsafe_allow_html=True) | |
# Entry guard: call main() only when this file is run as the main script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()