import streamlit as st import pandas as pd import plotly.express as px import plotly.graph_objects as go from plotly.subplots import make_subplots import os import time from PIL import Image # Only import APIs if available try: from google import genai GENAI_AVAILABLE = True except ImportError: GENAI_AVAILABLE = False try: from openai import OpenAI OPENAI_AVAILABLE = True except ImportError: OPENAI_AVAILABLE = False BASE_DIR = os.path.dirname(__file__) DATA_DIR = os.path.join(BASE_DIR, "data") # Page configuration st.set_page_config( page_title="Translation Comparison Tool", page_icon="🌐", layout="wide", initial_sidebar_state="collapsed" ) # Custom CSS for Material Design with Tailwind-inspired styling st.markdown(""" """, unsafe_allow_html=True) # Model configurations MODEL_CONFIG = { 'Gemini': { 'languages': ['Afrikaans', 'Northern Sotho', 'isiZulu'], 'models': ['gemini-2.0-flash-exp', 'gemini-1.5-flash', 'gemini-1.5-pro'], 'default_model': 'gemini-2.0-flash-exp' }, 'GPT': { 'languages': ['Afrikaans', 'Northern Sotho', 'isiZulu'], 'models': ['gpt-4', 'gpt-4-turbo', 'gpt-3.5-turbo'], 'default_model': 'gpt-4' }, 'NLLB': { 'languages': ['Northern Sotho', 'isiZulu'], # No Afrikaans model available 'models': { 'Northern Sotho': 'dsfsi/dcs-eng-nso-nllb-1.3B', 'isiZulu': 'dsfsi/dcs-eng-zul-nllb-1.3B' } } } # Language code mappings LANGUAGE_CODES = { 'Afrikaans': 'afr', 'Northern Sotho': 'nso', 'isiZulu': 'isizulu' } # Load logo def load_logo(): """Load logo with error handling""" try: if os.path.exists(f"{BASE_DIR}/logo.png"): return Image.open(f"{BASE_DIR}/logo.png") except Exception as e: st.warning(f"Could not load logo: {str(e)}") return None # Load and cache data @st.cache_data def load_translation_data(): """Load sample translation data""" try: sample_data = { 'english': ['Hello world', 'How are you?', 'Good morning', 'Thank you', 'Welcome', 'Goodbye'], 'afr': ['Hallo wêreld', 'Hoe gaan dit?', 'Goeie môre', 'Dankie', 'Welkom', 'Totsiens'], 'afr_rev': ['Hallo wêreld', 'Hoe gaan dit met jou?', 'Goeie môre', 'Baie dankie', 'Welkom', 'Totsiens'], 'nso': ['Dumela lefase', 'O kae?', 'Thobela', 'Ke a leboga', 'O amogetšwe', 'Šala gabotse'], 'nso_rev': ['Dumela lefase', 'O phela bjang?', 'Thobela', 'Ke a leboga kudu', 'O amogetšwe', 'Šala gabotse'], 'isizulu': ['Sawubona mhlaba', 'Unjani?', 'Sawubona', 'Ngiyabonga', 'Wamukelekile', 'Sala kahle'], 'isizulu_rev': ['Sawubona mhlaba', 'Unjani wena?', 'Sawubona', 'Ngiyabonga kakhulu', 'Wamukelekile', 'Sala kahle'], 'nso_mt_nllb': ['Dumela lefase', 'O kae?', 'Thobela', 'Ke a leboga', 'O amogetšwe', 'Šala gabotse'], 'isizulu_mt_nllb': ['Sawubona mhlaba', 'Unjani?', 'Sawubona', 'Ngiyabonga', 'Wamukelekile', 'Sala kahle'], 'afr_mt_gpt': ['Hallo wêreld', 'Hoe gaan dit?', 'Goeie môre', 'Dankie', 'Welkom', 'Totsiens'], 'nso_mt_gpt': ['Dumela lefase', 'O kae?', 'Thobela', 'Ke a leboga', 'O amogetšwe', 'Šala gabotse'], 'isizulu_mt_gpt': ['Sawubona mhlaba', 'Unjani?', 'Sawubona', 'Ngiyabonga', 'Wamukelekile', 'Sala kahle'], 'afr_mt_gemini': ['Hallo wêreld', 'Hoe is dit?', 'Goeie môre', 'Baie dankie', 'Welkom', 'Totsiens'], 'nso_mt_gemini': ['Dumela lefase', 'O phela bjang?', 'Thobela', 'Ke a leboga kudu', 'O amogetšwe', 'Šala gabotse'], 'isizulu_mt_gemini': ['Sawubona mhlaba', 'Unjani wena?', 'Sawubona', 'Ngiyabonga kakhulu', 'Wamukelekile', 'Sala kahle'] } return pd.DataFrame(sample_data) except Exception as e: st.error(f"Error loading data: {str(e)}") return pd.DataFrame({'english': ['Sample text'], 'error': ['Data loading failed']}) def 
translate_with_gemini(text, target_language, model_name="gemini-2.0-flash-exp", client=None): """Translate text using Gemini API""" try: if not GENAI_AVAILABLE: return "❌ Gemini library not installed" if not client: return "❌ Gemini API not configured. Please check your GEMINI_API_KEY." lang_map = { 'Afrikaans': 'Afrikaans', 'Northern Sotho': 'Northern Sotho (Sepedi)', 'isiZulu': 'isiZulu' } prompt = f"Translate the following English text to {lang_map.get(target_language, target_language)}: '{text}'. Provide only the translation without any explanations." response = client.models.generate_content( model=model_name, contents=prompt ) return response.text.strip() except Exception as e: return f"❌ Error: {str(e)}" def translate_with_openai(text, target_language, model_name="gpt-4o", client=None): """Translate text using OpenAI API with Chat Completions""" try: if not OPENAI_AVAILABLE: return "❌ OpenAI library not installed" if not client: return "❌ OpenAI API not configured. Please check your OPENAI_API_KEY." lang_map = { 'Afrikaans': 'Afrikaans', 'Northern Sotho': 'Northern Sotho (Sepedi)', 'isiZulu': 'isiZulu' } # Use Chat Completions API (supported indefinitely) response = client.chat.completions.create( model=model_name, messages=[ {"role": "system", "content": "You are a professional translator. Provide only the translation without any explanations."}, {"role": "user", "content": f"Translate the following text to {lang_map.get(target_language, target_language)}: {text}"} ], max_tokens=1000, temperature=0.3 # Lower temperature for more consistent translations ) return response.choices[0].message.content.strip() except Exception as e: return f"❌ Error: {str(e)}" @st.cache_resource def initialize_apis(): """Initialize API clients with proper error handling, supporting both local and HF Spaces.""" genai_client = None openai_client = None def get_secret(name): """Fetch secret from env first (Docker Spaces), then Streamlit secrets.""" return ( os.environ.get(name) or (st.secrets.get(name) if hasattr(st, "secrets") and name in st.secrets else None) ) try: # Gemini API if GENAI_AVAILABLE: try: api_key = get_secret("GEMINI_API_KEY") if api_key: genai_client = genai.Client(api_key=api_key) else: st.warning("⚠️ Gemini API key not found") except Exception as e: st.error(f"❌ Gemini API error: {str(e)}") # OpenAI API if OPENAI_AVAILABLE: try: api_key = get_secret("OPENAI_API_KEY") if api_key: try: # Try new OpenAI API client openai_client = OpenAI(api_key=api_key) except TypeError: import openai openai.api_key = api_key openai_client = openai else: st.warning("⚠️ OpenAI API key not found") except Exception as e: st.error(f"❌ OpenAI API error: {str(e)}") except Exception as e: st.error(f"❌ API initialization error: {str(e)}") return genai_client, openai_client def translate_with_nllb(text, target_language): """Translate text using unified NLLB API""" try: import requests # Single ngrok URL for unified API API_URL = "https://4c2faecc052a.ngrok-free.app" # Map Streamlit language names to API format lang_mapping = { 'Northern Sotho': 'nso', 'isiZulu': 'zul' } api_lang = lang_mapping.get(target_language, target_language.lower()) response = requests.post( f"{API_URL}/translate_simple", params={ "text": text, "target_language": api_lang }, timeout=30 ) if response.status_code == 200: result = response.json() return result.get(api_lang, '❌ Translation not found') else: return f"❌ API Error: {response.status_code}" except Exception as e: return f"❌ Error: {str(e)}" def create_language_tabs(available_languages, 
current_language, key_suffix=""): """Create language tabs with proper styling""" tabs_html = '
' for lang in available_languages: active_class = "active" if lang == current_language else "" tabs_html += f'''
{lang}
''' tabs_html += '
'
    # Add JavaScript for tab functionality
    script = f''' '''
    return tabs_html + script


def main():
    """Main application function"""
    # Load and display logo and title side by side
    logo = load_logo()

    # Initialize session state FIRST to avoid refreshes
    if 'target_language' not in st.session_state:
        st.session_state.target_language = 'Afrikaans'
    if 'translation_result' not in st.session_state:
        st.session_state.translation_result = ""
    if 'current_page' not in st.session_state:
        st.session_state.current_page = 1
    if 'initialized' not in st.session_state:
        st.session_state.initialized = True

    col1, col2, col3 = st.columns([1, 2, 1])
    with col2:
        if logo:
            # Convert logo to base64 for HTML embedding
            import base64
            from io import BytesIO
            buffered = BytesIO()
            logo.save(buffered, format="PNG")
            img_str = base64.b64encode(buffered.getvalue()).decode()
            st.markdown(f'''

UP Translate

''', unsafe_allow_html=True) else: st.markdown('

UP Translate

', unsafe_allow_html=True)

    # Initialize APIs
    genai_client, openai_client = initialize_apis()

    # Create tabs
    tab1, tab2 = st.tabs(["🤖 Live Translations", "📊 Existing Translations"])

    with tab1:
        # st.markdown('

Live Translation

', unsafe_allow_html=True)

        # Create simplified model options
        model_options = []
        model_mapping = {}

        # Add Gemini models
        for model in MODEL_CONFIG['Gemini']['models']:
            display_name = f"Gemini - {model}"
            model_options.append(display_name)
            model_mapping[display_name] = ('Gemini', None, model)

        # Add GPT models
        for model in MODEL_CONFIG['GPT']['models']:
            display_name = f"GPT - {model}"
            model_options.append(display_name)
            model_mapping[display_name] = ('GPT', None, model)

        # Add single NLLB option
        model_options.append("NLLB - Specialized Models")
        model_mapping["NLLB - Specialized Models"] = ('NLLB', None, None)

        # Model selection with inline label
        label_col, dropdown_col = st.columns([2, 10])
        with label_col:
            st.markdown('
Select Model:
', unsafe_allow_html=True)
        with dropdown_col:
            selected_model_option = st.selectbox(
                "Select Model:",
                model_options,
                index=0,
                key="model_selection_dropdown",
                label_visibility="collapsed"
            )

        selected_provider, _, selected_model = model_mapping[selected_model_option]

        # Translation interface
        col_left, col_center, col_right = st.columns([5, 1, 5])

        # Left side - English Input
        with col_left:
            st.markdown('
', unsafe_allow_html=True) st.markdown('
English
', unsafe_allow_html=True) st.markdown('
', unsafe_allow_html=True)
            input_text = st.text_area(
                "Input",
                placeholder="Input text here",
                height=350,
                key="input_text_live",
                label_visibility="collapsed"
            )

        # Center - Translate Button
        with col_center:
            # Add spacing to align button with text areas
            st.markdown('
', unsafe_allow_html=True)
            translate_clicked = st.button(
                "Translate",
                key="translate_btn_live",
                help="Translate text",
                type="primary",
                use_container_width=True
            )

        # Right side - Translation Output
        with col_right:
            # Determine available languages based on selected provider
            if selected_provider == 'NLLB':
                available_languages = MODEL_CONFIG['NLLB']['languages']
            else:
                available_languages = ['Afrikaans', 'Northern Sotho', 'isiZulu']

            # Set default language to first available if current selection not available
            if st.session_state.target_language not in available_languages:
                st.session_state.target_language = available_languages[0]

            # Create container with custom styling
            st.markdown('
', unsafe_allow_html=True)

            # Language selection buttons
            lang_cols = st.columns(len(available_languages))
            for i, lang in enumerate(available_languages):
                with lang_cols[i]:
                    button_type = "primary" if lang == st.session_state.target_language else "secondary"
                    if st.button(
                        lang,
                        key=f"lang_btn_{lang}_live",
                        type=button_type,
                        use_container_width=True
                    ):
                        if st.session_state.target_language != lang:  # Only update if different
                            st.session_state.target_language = lang
                            st.session_state.translation_result = ""  # Clear previous result
                            st.rerun()

            # Translation logic
            if translate_clicked and input_text:
                with st.spinner("Translating..."):
                    target_lang = st.session_state.target_language
                    if selected_provider == 'Gemini':
                        result = translate_with_gemini(input_text, target_lang, selected_model, genai_client)
                    elif selected_provider == 'GPT':
                        result = translate_with_openai(input_text, target_lang, selected_model, openai_client)
                    elif selected_provider == 'NLLB':
                        result = translate_with_nllb(input_text, target_lang)
                    st.session_state.translation_result = result

            # Translation output area with proper labeling
            st.text_area(
                f"Translation ({st.session_state.target_language})",  # Dynamic label
                value=st.session_state.translation_result,
                placeholder="Translation will appear here",
                height=350,
                key="translation_output_live_fixed",  # Changed key to avoid conflicts
                disabled=True,
                label_visibility="collapsed"
            )

        # Support information
        st.markdown("""
Available Models:
🔮 Gemini: All languages (gemini-2.0-flash-exp, gemini-1.5-flash, gemini-1.5-pro)
🧠 GPT: All languages (gpt-4, gpt-4-turbo, gpt-3.5-turbo)
🤗 NLLB: Northern Sotho, isiZulu only (specialized models)
""", unsafe_allow_html=True) with tab2: # Load data from base directory automatically @st.cache_data def load_analysis_data(): """Load all analysis data from base directory""" df_translations = None df_bleu = None df_chrf = None df_comet = None try: # Try to load translations data if os.path.exists(f"{DATA_DIR}/translations.tsv"): df_translations = pd.read_csv(f"{DATA_DIR}/translations.tsv", sep="\t") # Convert new CSV format to expected format for analysis # New format: id,english,afr_human,afr_revised,nso_human,nso_revised,zul_human,zul_revised,afr_gemini,afr_gpt,nso_gemini,nso_gpt,nso_nllb,zul_gemini,zul_gpt,zul_nllb # Expected format: english, afr_human, afr_revised, nso_human, nso_revised, isizulu_human, isizulu_revised, etc. # Rename zul columns to isizulu for backward compatibility with analysis code column_mapping = { 'zul_human': 'isizulu_human', 'zul_revised': 'isizulu_revised', 'zul_gemini': 'isizulu_mt_gemini', 'zul_gpt': 'isizulu_mt_gpt', 'zul_nllb': 'isizulu_mt_nllb', 'afr_gemini': 'afr_mt_gemini', 'afr_gpt': 'afr_mt_gpt', 'nso_gemini': 'nso_mt_gemini', 'nso_gpt': 'nso_mt_gpt', 'nso_nllb': 'nso_mt_nllb' } df_translations = df_translations.rename(columns=column_mapping) elif os.path.exists(f"{DATA_DIR}/translation_data.csv"): df_translations = pd.read_csv(f"{DATA_DIR}/translation_data.csv") else: print("No translation data found, using sample data") df_translations = load_translation_data() # Fallback to sample data # Try to load BLEU scores if os.path.exists(f"{DATA_DIR}/bleu_scores.csv"): df_bleu = pd.read_csv(f"{DATA_DIR}/bleu_scores.csv") # Convert zul references to isizulu for compatibility df_bleu['comparison_pair'] = df_bleu['comparison_pair'].str.replace('zul_', 'isizulu_') df_bleu['language'] = df_bleu['language'].replace('isiZulu', 'isiZulu') # Already correct else: # Sample BLEU data (using isizulu for compatibility with existing analysis code) df_bleu = pd.DataFrame({ 'comparison_pair': ['afr_human_vs_afr_gemini', 'afr_human_vs_afr_gpt', 'afr_human_vs_afr_revised', 'nso_human_vs_nso_gemini', 'nso_human_vs_nso_gpt', 'nso_human_vs_nso_revised', 'nso_human_vs_nso_nllb', 'isizulu_human_vs_isizulu_gemini', 'isizulu_human_vs_isizulu_gpt', 'isizulu_human_vs_isizulu_revised', 'isizulu_human_vs_isizulu_nllb'], 'bleu_score': [0.78, 0.72, 0.89, 0.65, 0.68, 0.85, 0.71, 0.71, 0.69, 0.87, 0.73], 'language': ['Afrikaans', 'Afrikaans', 'Afrikaans', 'Northern Sotho', 'Northern Sotho', 'Northern Sotho', 'Northern Sotho', 'isiZulu', 'isiZulu', 'isiZulu', 'isiZulu'] }) # Try to load COMET scores if os.path.exists(f"{DATA_DIR}/comet_scores.csv"): df_comet = pd.read_csv(f"{DATA_DIR}/comet_scores.csv") else: # Sample COMET data df_comet = pd.DataFrame({ 'comparison_pair': ['afr_human_vs_afr_gemini', 'afr_human_vs_afr_gpt', 'afr_human_vs_afr_revised', 'nso_human_vs_nso_gemini', 'nso_human_vs_nso_gpt', 'nso_human_vs_nso_revised', 'isizulu_human_vs_isizulu_gemini', 'isizulu_human_vs_isizulu_gpt', 'isizulu_human_vs_isizulu_revised'], 'comet_score': [0.82, 0.79, 0.92, 0.71, 0.74, 0.88, 0.76, 0.73, 0.90], 'language': ['Afrikaans', 'Afrikaans', 'Afrikaans', 'Northern Sotho', 'Northern Sotho', 'Northern Sotho', 'isiZulu', 'isiZulu', 'isiZulu'] }) # Try to load CHRF scores if os.path.exists(f"{DATA_DIR}/chrf_scores.csv"): df_chrf = pd.read_csv(f"{DATA_DIR}/chrf_scores.csv") else: # Sample CHRF data df_chrf = pd.DataFrame({ 'comparison_pair': ['afr_human_vs_afr_gemini', 'afr_human_vs_afr_gpt', 'afr_human_vs_afr_revised', 'nso_human_vs_nso_gemini', 'nso_human_vs_nso_gpt', 
'nso_human_vs_nso_revised', 'isizulu_human_vs_isizulu_gemini', 'isizulu_human_vs_isizulu_gpt', 'isizulu_human_vs_isizulu_revised'], 'chrf_score': [0.75, 0.70, 0.88, 0.60, 0.65, 0.80, 0.68, 0.66, 0.85], 'language': ['Afrikaans', 'Afrikaans', 'Afrikaans', 'Northern Sotho', 'Northern Sotho', 'Northern Sotho', 'isiZulu', 'isiZulu', 'isiZulu'] }) return df_translations, df_bleu, df_comet, df_chrf except Exception as e: st.error(f"Error loading data: {str(e)}") return None, None, None, None # Load all data df_translations, df_bleu, df_comet, df_chrf = load_analysis_data() if df_translations is not None: # Language selection in columns lang_col1, lang_col2 = st.columns([2, 10]) with lang_col1: st.markdown('
Select Language:
', unsafe_allow_html=True) with lang_col2: languages = ['Afrikaans', 'Northern Sotho', 'isiZulu'] selected_lang = st.selectbox( "Select Language for Analysis:", languages, key="global_lang_select", label_visibility="collapsed" ) # Get language code lang_codes = {'Afrikaans': 'afr', 'Northern Sotho': 'nso', 'isiZulu': 'isizulu'} code = lang_codes[selected_lang] # Create analysis tabs analysis_tab1, analysis_tab2, analysis_tab3, analysis_tab4 = st.tabs(["Sample Translations", "📊 Quality Metrics", "🔄 Revision Analysis", "🔍 Word Comparison"]) with analysis_tab1: # Translation Samples Tab st.markdown("""

📝 Translation Samples for {selected_lang}

""".format(selected_lang=selected_lang), unsafe_allow_html=True) # Use the global language selection samples_code = code # Show sample translations for the selected language display_cols = ['english'] + [col for col in df_translations.columns if col.startswith(samples_code)] if display_cols and len(display_cols) > 1: # Need at least english + 1 translation column # Control panel control_col1, control_col2, control_col3, control_col4 = st.columns([1, 7, 1, 2]) with control_col1: st.markdown('
Samples per page:
', unsafe_allow_html=True)
                    with control_col2:
                        page_size = st.selectbox(
                            "Samples per page:",
                            [10, 25, 50, 100],
                            index=0,
                            key="page_size_select",
                            label_visibility="collapsed"
                        )

                    # Initialize session state for pagination
                    if 'current_page' not in st.session_state:
                        st.session_state.current_page = 1

                    # Filter data and calculate pagination
                    available_data = df_translations[display_cols].dropna(
                        subset=[col for col in display_cols if col != 'english'], how='all'
                    )
                    total_samples = len(available_data)
                    total_pages = max(1, (total_samples + page_size - 1) // page_size)  # Ceiling division

                    # Ensure current page is valid
                    if st.session_state.current_page > total_pages:
                        st.session_state.current_page = 1

                    # Calculate start and end indices
                    start_idx = (st.session_state.current_page - 1) * page_size
                    end_idx = min(start_idx + page_size, total_samples)

                    # Get current page data
                    current_page_data = available_data.iloc[start_idx:end_idx]

                    with control_col3:
                        st.markdown('
Page:
', unsafe_allow_html=True) with control_col4: # Page navigation nav_col1, nav_col2, nav_col3, nav_col4, nav_col5 = st.columns([1, 1, 2, 1, 1]) with nav_col1: if st.button("⏮️", key="first_page", help="First page", disabled=(st.session_state.current_page == 1)): st.session_state.current_page = 1 st.rerun() with nav_col2: if st.button("◀️", key="prev_page", help="Previous page", disabled=(st.session_state.current_page == 1)): st.session_state.current_page -= 1 st.rerun() with nav_col3: st.markdown(f'
{st.session_state.current_page} / {total_pages}
', unsafe_allow_html=True) with nav_col4: if st.button("▶️", key="next_page", help="Next page", disabled=(st.session_state.current_page == total_pages)): st.session_state.current_page += 1 st.rerun() with nav_col5: if st.button("⏭️", key="last_page", help="Last page", disabled=(st.session_state.current_page == total_pages)): st.session_state.current_page = total_pages st.rerun() # Statistics cards stats_col1, stats_col2, stats_col3, stats_col4 = st.columns(4) with stats_col1: st.markdown(f"""
Showing
{len(current_page_data)}
""", unsafe_allow_html=True) with stats_col2: available_systems = len([col for col in display_cols if col != 'english']) st.markdown(f"""
Translation Systems
{available_systems}
""", unsafe_allow_html=True) with stats_col3: st.markdown(f"""
Total Available
{total_samples}
""", unsafe_allow_html=True) with stats_col4: st.markdown(f"""
Current Page
{st.session_state.current_page}/{total_pages}
""", unsafe_allow_html=True) # Display the samples table st.markdown("### Translation Examples") if len(current_page_data) > 0: # Create a styled dataframe with better column names display_df = current_page_data.copy() # Rename columns for better display column_rename = { 'english': 'English (Source)', } # Add human-readable names for translation columns for col in display_df.columns: if col.startswith(samples_code): if '_human' in col: column_rename[col] = f'{selected_lang} (Human)' elif '_revised' in col: column_rename[col] = f'{selected_lang} (Revised)' elif '_mt_gemini' in col or '_gemini' in col: column_rename[col] = f'{selected_lang} (Gemini)' elif '_mt_gpt' in col or '_gpt' in col: column_rename[col] = f'{selected_lang} (GPT)' elif '_mt_nllb' in col or '_nllb' in col: column_rename[col] = f'{selected_lang} (NLLB)' else: # Generic fallback clean_name = col.replace(f'{samples_code}_', '').replace('_', ' ').title() column_rename[col] = f'{selected_lang} ({clean_name})' display_df = display_df.rename(columns=column_rename) # Add row numbers based on actual position in full dataset display_df.index = range(start_idx + 1, end_idx + 1) display_df.index.name = 'Sample #' st.dataframe( display_df, use_container_width=True, height=min(600, 50 + len(display_df) * 35), # Dynamic height based on content column_config={ col: st.column_config.TextColumn(col, width="medium") for col in display_df.columns } ) # Page info summary st.markdown(f"""
📄 Showing samples {start_idx + 1} to {end_idx} of {total_samples} total samples • Page {st.session_state.current_page} of {total_pages}
""", unsafe_allow_html=True) # Quick jump to page if total_pages > 5: # Only show quick jump for datasets with many pages st.markdown("### Quick Navigation") jump_col1, jump_col2, jump_col3 = st.columns([1, 2, 1]) with jump_col2: target_page = st.number_input( f"Jump to page (1-{total_pages}):", min_value=1, max_value=total_pages, value=st.session_state.current_page, key="page_jump" ) if st.button("🔗 Go to Page", use_container_width=True): if target_page != st.session_state.current_page: st.session_state.current_page = target_page st.rerun() else: st.warning("⚠️ No translation samples found for the current page.") else: st.warning(f"⚠️ No translation data available for {selected_lang}. Expected columns starting with '{samples_code}_'") # Debug information available_columns = [col for col in df_translations.columns if col.startswith(samples_code)] if available_columns: st.info(f"🔍 Found columns: {', '.join(available_columns)}") else: all_lang_columns = [col for col in df_translations.columns if any(col.startswith(prefix) for prefix in ['afr_', 'nso_', 'isizulu_'])] if all_lang_columns: st.info(f"💡 Available language columns: {', '.join(all_lang_columns[:10])}{'...' if len(all_lang_columns) > 10 else ''}") with analysis_tab2: st.markdown("""

📈 Quality Metrics for {selected_lang}

""".format(selected_lang=selected_lang), unsafe_allow_html=True) # Get language code lang_codes = {'Afrikaans': 'afr', 'Northern Sotho': 'nso', 'isiZulu': 'isizulu'} code = lang_codes[selected_lang] # Score visualizations if df_bleu is not None and df_chrf is not None and df_comet is not None: # Filter scores for selected language lang_bleu = df_bleu[df_bleu['language'] == selected_lang] if 'language' in df_bleu.columns else df_bleu lang_chrf = df_chrf[df_chrf['language'] == selected_lang] if 'language' in df_chrf.columns else df_chrf lang_comet = df_comet[df_comet['language'] == selected_lang] if 'language' in df_comet.columns else df_comet # Check if we have domain-level data has_domain_data = ('domain' in lang_bleu.columns and 'domain' in lang_chrf.columns and 'domain' in lang_comet.columns and len(lang_bleu[lang_bleu['domain'] != 'Overall']) > 0) if has_domain_data: # Add domain filter available_domains = sorted(lang_bleu['domain'].unique()) domain_options = ['Overall'] + [d for d in available_domains if d != 'Overall'] selected_domain = st.selectbox( "📍 Select Domain for Analysis:", domain_options, key=f"domain_selector_{selected_lang}" ) # Filter data based on selected domain if selected_domain == 'Overall': display_bleu = lang_bleu[lang_bleu['domain'] == 'Overall'] display_chrf = lang_chrf[lang_chrf['domain'] == 'Overall'] display_comet = lang_comet[lang_comet['domain'] == 'Overall'] chart_title_suffix = " - Overall" else: display_bleu = lang_bleu[lang_bleu['domain'] == selected_domain] display_chrf = lang_chrf[lang_chrf['domain'] == selected_domain] display_comet = lang_comet[lang_comet['domain'] == selected_domain] chart_title_suffix = f" - {selected_domain}" else: # Use all data if no domain column display_bleu = lang_bleu display_chrf = lang_chrf display_comet = lang_comet chart_title_suffix = "" # Create score charts if len(display_bleu) > 0 and len(display_chrf) > 0 and len(display_comet) > 0: chart_col1, chart_col2, chart_col3 = st.columns(3) with chart_col1: # chrF Score Chart fig_chrf = px.bar( display_chrf, x='comparison_pair', y='chrf_score', title=f'chrF Scores - {selected_lang}{chart_title_suffix}', color='chrf_score', color_continuous_scale='oranges' ) fig_chrf.update_layout( xaxis_title="Translation Pairs", yaxis_title="chrF Score", xaxis_tickangle=-45, height=400, font=dict(family="Inter", size=12) ) st.plotly_chart(fig_chrf, use_container_width=True) with chart_col2: # BLEU Score Chart fig_bleu = px.bar( display_bleu, x='comparison_pair', y='bleu_score', title=f'BLEU Scores - {selected_lang}{chart_title_suffix}', color='bleu_score', color_continuous_scale='blues' ) fig_bleu.update_layout( xaxis_title="Translation Pairs", yaxis_title="BLEU Score", xaxis_tickangle=-45, height=400, font=dict(family="Inter", size=12) ) st.plotly_chart(fig_bleu, use_container_width=True) with chart_col3: # COMET Score Chart fig_comet = px.bar( display_comet, x='comparison_pair', y='comet_score', title=f'COMET Scores - {selected_lang}{chart_title_suffix}', color='comet_score', color_continuous_scale='greens' ) fig_comet.update_layout( xaxis_title="Translation Pairs", yaxis_title="COMET Score", xaxis_tickangle=-45, height=400, font=dict(family="Inter", size=12) ) st.plotly_chart(fig_comet, use_container_width=True) # PRIMARY SPIDER CHART - Domain Performance when available, Model Performance otherwise if has_domain_data: st.markdown(f"""

🕸️ Domain Performance Spider Charts - {selected_lang}

""", unsafe_allow_html=True) # Filter out "Overall" so only domain-level values are shown domain_bleu = lang_bleu[lang_bleu['domain'] != 'Overall'] domain_chrf = lang_chrf[lang_chrf['domain'] != 'Overall'] domain_comet = lang_comet[lang_comet['domain'] != 'Overall'] # Pivot all metrics pivot_bleu = domain_bleu.pivot( index='comparison_pair', columns='domain', values='bleu_score' ).fillna(0) pivot_chrf = domain_chrf.pivot( index='comparison_pair', columns='domain', values='chrf_score' ).fillna(0) pivot_comet = domain_comet.pivot( index='comparison_pair', columns='domain', values='comet_score' ).fillna(0) # Ensure domains are in the same order for all metrics domains = sorted(set(pivot_bleu.columns) | set(pivot_chrf.columns) | set(pivot_comet.columns)) pivot_bleu = pivot_bleu.reindex(columns=domains, fill_value=0) pivot_chrf = pivot_chrf.reindex(columns=domains, fill_value=0) pivot_comet = pivot_comet.reindex(columns=domains, fill_value=0) # Define distinct colors with reduced opacity distinct_colors = [ 'rgba(255, 99, 132, 0.4)', # Red 'rgba(54, 162, 235, 0.4)', # Blue 'rgba(99, 255, 132, 0.4)', # Green 'rgba(75, 192, 192, 0.4)', # Teal 'rgba(255, 205, 86, 0.4)', # Yellow 'rgba(153, 102, 255, 0.4)', # Purple 'rgba(255, 159, 64, 0.4)', # Orange 'rgba(199, 199, 199, 0.4)', # Grey 'rgba(83, 102, 255, 0.4)', # Indigo 'rgba(255, 99, 255, 0.4)', # Magenta ] # Border colors (same colors but full opacity for borders) border_colors = [ 'rgba(255, 99, 132, 1.0)', # Red 'rgba(54, 162, 235, 1.0)', # Blue 'rgba(99, 255, 132, 1.0)', # Green 'rgba(75, 192, 192, 1.0)', # Teal 'rgba(255, 205, 86, 1.0)', # Yellow 'rgba(153, 102, 255, 1.0)', # Purple 'rgba(255, 159, 64, 1.0)', # Orange 'rgba(199, 199, 199, 1.0)', # Grey 'rgba(83, 102, 255, 1.0)', # Indigo 'rgba(255, 99, 255, 1.0)', # Magenta ] # Layout for three side-by-side spider charts spider_col1, spider_col2, spider_col3 = st.columns(3) # ---------------- CHRF SPIDER ---------------- with spider_col1: fig_chrf_spider = go.Figure() for i, (model_name, row) in enumerate(pivot_chrf.iterrows()): color_idx = i % len(distinct_colors) fig_chrf_spider.add_trace(go.Scatterpolar( r=row.tolist() + [row.tolist()[0]], # close loop theta=domains + [domains[0]], fill='toself', name=model_name.split('_')[-1].upper(), fillcolor=distinct_colors[color_idx], line=dict(color=border_colors[color_idx], width=2), opacity=0.7, showlegend=False # Hide legend on first chart )) fig_chrf_spider.update_layout( polar=dict(radialaxis=dict(visible=True, range=[0, 1])), showlegend=False, title=dict(text=f"Domain Performance (chrF) - {selected_lang}"), height=450 ) st.plotly_chart(fig_chrf_spider, use_container_width=True) # ---------------- BLEU SPIDER ---------------- with spider_col2: fig_bleu_spider = go.Figure() for i, (model_name, row) in enumerate(pivot_bleu.iterrows()): color_idx = i % len(distinct_colors) fig_bleu_spider.add_trace(go.Scatterpolar( r=row.tolist() + [row.tolist()[0]], # close loop theta=domains + [domains[0]], fill='toself', name=model_name.split('_')[-1].upper(), fillcolor=distinct_colors[color_idx], line=dict(color=border_colors[color_idx], width=2), opacity=0.7, showlegend=True # Show legend on middle chart )) fig_bleu_spider.update_layout( polar=dict(radialaxis=dict(visible=True, range=[0, 1])), showlegend=True, title=dict(text=f"Domain Performance (BLEU) - {selected_lang}"), height=450, legend=dict( orientation="h", yanchor="bottom", y=-0.3, xanchor="center", x=0.5 ) ) st.plotly_chart(fig_bleu_spider, use_container_width=True) # ---------------- COMET SPIDER 
---------------- with spider_col3: fig_comet_spider = go.Figure() for i, (model_name, row) in enumerate(pivot_comet.iterrows()): color_idx = i % len(distinct_colors) fig_comet_spider.add_trace(go.Scatterpolar( r=row.tolist() + [row.tolist()[0]], # close loop theta=domains + [domains[0]], fill='toself', name=model_name.split('_')[-1].upper(), fillcolor=distinct_colors[color_idx], line=dict(color=border_colors[color_idx], width=2), opacity=0.7, showlegend=False # Hide legend on last chart )) fig_comet_spider.update_layout( polar=dict(radialaxis=dict(visible=True, range=[0, 1])), showlegend=False, title=dict(text=f"Domain Performance (COMET) - {selected_lang}"), height=450 ) st.plotly_chart(fig_comet_spider, use_container_width=True) # # Overall Performance Summary # st.markdown(""" #

# 📋 Overall Performance Summary #

# """, unsafe_allow_html=True) # # Create overall summary table # if len(display_bleu) > 0 and len(display_chrf) > 0 and len(display_comet) > 0: # # Merge all three metrics # merged_scores = pd.merge(display_bleu, display_chrf, on='comparison_pair', suffixes=('_bleu', '_chrf')) # merged_scores = pd.merge(merged_scores, display_comet, on='comparison_pair') # merged_scores['model'] = merged_scores['comparison_pair'].apply(lambda x: x.split('_')[-1].upper()) # summary_data = [] # for _, row in merged_scores.iterrows(): # summary_data.append({ # 'Model': row['model'], # 'BLEU Score': f"{row['bleu_score']:.3f}", # 'chrF Score': f"{row['chrf_score']:.3f}", # 'COMET Score': f"{row['comet_score']:.3f}", # 'Average': f"{(row['bleu_score'] + row['chrf_score'] + row['comet_score']) / 3:.3f}" # }) # summary_df = pd.DataFrame(summary_data) # # Only sort if dataframe has data and 'Average' column exists # if len(summary_df) > 0 and 'Average' in summary_df.columns: # summary_df = summary_df.sort_values('Average', ascending=False) # # Style the dataframe # st.dataframe( # summary_df, # use_container_width=True, # hide_index=True, # column_config={ # "Model": st.column_config.TextColumn("Model", width="medium"), # "BLEU Score": st.column_config.NumberColumn("BLEU Score", format="%.3f"), # "chrF Score": st.column_config.NumberColumn("chrF Score", format="%.3f"), # "COMET Score": st.column_config.NumberColumn("COMET Score", format="%.3f"), # "Average": st.column_config.NumberColumn("Average", format="%.3f") # } # ) with analysis_tab3: # Revision Analysis Tab st.markdown("""

✏️ Human Translation Revision Analysis for {selected_lang}

""".format(selected_lang=selected_lang), unsafe_allow_html=True) # Use the global language selection rev_code = code # Check for revision columns human_col = f"{rev_code}_human" revised_col = f"{rev_code}_revised" if human_col in df_translations.columns and revised_col in df_translations.columns: # Get all rows with human translations for this language df_lang_data = df_translations[[human_col, revised_col]].copy() # Remove rows where human translation is missing (can't analyze revisions without original) df_lang_data = df_lang_data[df_lang_data[human_col].notna()].copy() total_human_translations = len(df_lang_data) if total_human_translations == 0: st.warning(f"⚠️ No human translations found for {selected_lang}") else: # Calculate revision statistics # For missing revised translations, we assume no revision was made (same as original) df_lang_data[revised_col] = df_lang_data[revised_col].fillna(df_lang_data[human_col]) # Count actual changes revisions_made = sum(df_lang_data[human_col] != df_lang_data[revised_col]) revision_rate = (revisions_made / total_human_translations) * 100 # Count how many had revision data available revisions_available = sum(df_translations[revised_col].notna()) # Calculate revision types def categorize_revision(original, revised): if pd.isna(original) or pd.isna(revised): return "Missing Data" if str(original).strip() == str(revised).strip(): return "No Change" orig_words = str(original).lower().split() rev_words = str(revised).lower().split() if len(rev_words) > len(orig_words): return "Expansion" elif len(rev_words) < len(orig_words): return "Reduction" else: return "Modification" df_lang_data['revision_type'] = df_lang_data.apply( lambda row: categorize_revision(row[human_col], row[revised_col]), axis=1 ) # Revision statistics cards rev_col1, rev_col2, rev_col3, rev_col4 = st.columns(4) with rev_col1: st.markdown(f"""
Human Translations
{total_human_translations}
""", unsafe_allow_html=True) with rev_col2: st.markdown(f"""
Revisions Available
{revisions_available}
""", unsafe_allow_html=True) with rev_col3: st.markdown(f"""
Changes Made
{revisions_made}
""", unsafe_allow_html=True) with rev_col4: st.markdown(f"""
Revision Rate
{revision_rate:.1f}%
""", unsafe_allow_html=True) # Revision type analysis st.markdown("""

📈 Revision Pattern Analysis

""", unsafe_allow_html=True) revision_counts = df_lang_data['revision_type'].value_counts() if len(revision_counts) > 0: # Create revision type charts rev_chart_col1, rev_chart_col2 = st.columns(2) with rev_chart_col1: # Pie chart of revision types fig_pie = px.pie( values=revision_counts.values, names=revision_counts.index, title=f"Revision Types Distribution", color_discrete_sequence=px.colors.qualitative.Set3 ) fig_pie.update_layout(height=400, font=dict(family="Inter", size=12)) st.plotly_chart(fig_pie, use_container_width=True) with rev_chart_col2: # Bar chart of revision types fig_bar = px.bar( x=revision_counts.values, y=revision_counts.index, orientation='h', title=f"Revision Frequency", color=revision_counts.values, color_continuous_scale='viridis' ) fig_bar.update_layout( height=400, xaxis_title="Count", yaxis_title="Revision Type", font=dict(family="Inter", size=12) ) st.plotly_chart(fig_bar, use_container_width=True) # Word-level revision analysis st.markdown("""

🔤 Word-Level Changes Analysis

""", unsafe_allow_html=True) # Calculate word changes only for actual revisions words_added = [] words_removed = [] changed_revisions = df_lang_data[df_lang_data['revision_type'] != 'No Change'] for _, row in changed_revisions.iterrows(): if pd.notna(row[human_col]) and pd.notna(row[revised_col]): orig_words = set(str(row[human_col]).lower().split()) rev_words = set(str(row[revised_col]).lower().split()) added = rev_words - orig_words removed = orig_words - rev_words words_added.extend(list(added)) words_removed.extend(list(removed)) from collections import Counter added_counts = Counter(words_added) removed_counts = Counter(words_removed) word_analysis_col1, word_analysis_col2 = st.columns(2) with word_analysis_col1: st.markdown("**🟢 Most Added Words**") if added_counts: top_added = dict(added_counts.most_common(15)) # Create horizontal bar chart for added words fig_added = px.bar( x=list(top_added.values()), y=list(top_added.keys()), orientation='h', title="Most Frequently Added Words", color=list(top_added.values()), color_continuous_scale='Greens' ) fig_added.update_layout( height=400, xaxis_title="Frequency", yaxis_title="Words", font=dict(family="Inter", size=10) ) st.plotly_chart(fig_added, use_container_width=True) else: st.markdown("*No words added in revisions*") with word_analysis_col2: st.markdown("**🔴 Most Removed Words**") if removed_counts: top_removed = dict(removed_counts.most_common(15)) # Create horizontal bar chart for removed words fig_removed = px.bar( x=list(top_removed.values()), y=list(top_removed.keys()), orientation='h', title="Most Frequently Removed Words", color=list(top_removed.values()), color_continuous_scale='Reds' ) fig_removed.update_layout( height=400, xaxis_title="Frequency", yaxis_title="Words", font=dict(family="Inter", size=10) ) st.plotly_chart(fig_removed, use_container_width=True) else: st.markdown("*No words removed in revisions*") # Revision examples st.markdown("""

📝 Revision Examples

""", unsafe_allow_html=True) # Show examples of different types of revisions revision_examples = changed_revisions.head(10) if len(revision_examples) > 0: # Create tabs for different revision types available_types = revision_examples['revision_type'].unique() if len(available_types) > 1: type_tabs = st.tabs([f"{rtype} ({len(revision_examples[revision_examples['revision_type'] == rtype])})" for rtype in available_types]) for i, rtype in enumerate(available_types): with type_tabs[i]: type_examples = revision_examples[revision_examples['revision_type'] == rtype].head(5) for idx, row in type_examples.iterrows(): st.markdown(f"""
Original:
{row[human_col]}
Revised:
{row[revised_col]}
Type: {row['revision_type']}
""", unsafe_allow_html=True) else: # Single type, show directly for idx, row in revision_examples.iterrows(): st.markdown(f"""
Original:
{row[human_col]}
Revised:
{row[revised_col]}
Type: {row['revision_type']}
""", unsafe_allow_html=True) else: st.info(f"No revisions found for {selected_lang}.") else: st.info(f"No revision data available for analysis.") else: st.warning(f"⚠️ Revision columns not found for {selected_lang}. Expected columns: `{human_col}` and `{revised_col}`") with analysis_tab4: # Translation comparison section st.markdown("""

🔍 Translation Comparison & Word Analysis for {selected_lang}

""".format(selected_lang=selected_lang), unsafe_allow_html=True) # Use the global language selection comp_code = code # Get available translation columns for selected language available_cols = [] for col in df_translations.columns: if col.startswith(comp_code) and col != 'english': available_cols.append(col) if len(available_cols) >= 2: comp_col1, comp_col2, comp_col3 = st.columns([1, 1, 1]) with comp_col1: col1_selection = st.selectbox( "First Translation:", available_cols, key="col1_select" ) with comp_col2: col2_selection = st.selectbox( "Second Translation:", [col for col in available_cols if col != col1_selection], key="col2_select" ) with comp_col3: # Add spacing to align button with selectboxes st.markdown('
', unsafe_allow_html=True) analyze_clicked = st.button( "🔍 Analyze", type="primary", use_container_width=True, key="analyze_word_diff_btn" ) if analyze_clicked: # Perform word analysis with ALL available data def get_word_differences(text1, text2): # Handle missing data by using available text if pd.isna(text1) and pd.isna(text2): return set(), set(), set() # If one is missing, treat it as empty for comparison words1 = set(str(text1).lower().split()) if pd.notna(text1) else set() words2 = set(str(text2).lower().split()) if pd.notna(text2) else set() only_in_1 = words1 - words2 only_in_2 = words2 - words1 common = words1 & words2 return only_in_1, only_in_2, common # Analyze ALL rows with available data unique_words_1 = [] unique_words_2 = [] common_words = [] all_words_1 = [] # For frequency counting all_words_2 = [] # For frequency counting # Process all rows, including those with missing revisions for _, row in df_translations.iterrows(): # Get text from columns, using original if revision is missing text1 = row[col1_selection] if pd.notna(row[col1_selection]) else None text2 = row[col2_selection] if pd.notna(row[col2_selection]) else None # Skip if both are missing if text1 is None and text2 is None: continue # Collect ALL words from each column for frequency analysis if text1 is not None: words_from_1 = str(text1).lower().split() all_words_1.extend(words_from_1) if text2 is not None: words_from_2 = str(text2).lower().split() all_words_2.extend(words_from_2) # Only do comparison if both texts exist if text1 is not None and text2 is not None: only_1, only_2, common = get_word_differences(text1, text2) unique_words_1.extend(list(only_1)) unique_words_2.extend(list(only_2)) common_words.extend(list(common)) from collections import Counter # Count frequencies from ALL words all_freq_1 = Counter(all_words_1) # All words from column 1 all_freq_2 = Counter(all_words_2) # All words from column 2 unique_freq_1 = Counter(unique_words_1) # Only unique words unique_freq_2 = Counter(unique_words_2) # Only unique words common_freq = Counter(common_words) # Only common words # Display statistics st.markdown('
', unsafe_allow_html=True) col_result1, col_result2, col_result3, col_result4 = st.columns(4) with col_result1: st.markdown(f"""
Unique to {col1_selection.replace('_', ' ').title()}
{len(unique_freq_1)}
unique words
""", unsafe_allow_html=True) with col_result2: st.markdown(f"""
Unique to {col2_selection.replace('_', ' ').title()}
{len(unique_freq_2)}
unique words
""", unsafe_allow_html=True) with col_result3: st.markdown(f"""
Common Words
{len(common_freq)}
shared words
""", unsafe_allow_html=True) with col_result4: st.markdown(f"""
Total Vocabulary
{len(set(all_words_1 + all_words_2))}
total unique words
""", unsafe_allow_html=True) st.markdown('
', unsafe_allow_html=True) # Word Clouds Section st.markdown("""

☁️ Word Clouds Visualization

""", unsafe_allow_html=True) # Generate word clouds using matplotlib and wordcloud try: # Show loading spinner while generating word clouds with st.spinner("🎨 Generating word clouds... This may take a moment."): import matplotlib.pyplot as plt from wordcloud import WordCloud import io import base64 # Function to create word cloud image (optimized) def create_wordcloud_image(word_freq, title, color_scheme='viridis'): if not word_freq or len(word_freq) == 0: return None try: # Create word cloud with all frequency data, but limit max_words to 25 wordcloud = WordCloud( width=300, # Reduced size height=200, # Reduced size background_color='white', colormap=color_scheme, max_words=25, # Display top 25 words relative_scaling=0.6, random_state=42, min_font_size=8, max_font_size=60, prefer_horizontal=0.9, collocations=False # Avoid word combinations ).generate_from_frequencies(word_freq) # Use ALL frequency data # Create matplotlib figure with smaller size fig, ax = plt.subplots(figsize=(5, 3)) # Smaller figure ax.imshow(wordcloud, interpolation='bilinear') ax.axis('off') ax.set_title(title, fontsize=10, fontweight='bold', pad=10) # Convert to base64 for HTML display buffer = io.BytesIO() plt.savefig(buffer, format='png', bbox_inches='tight', dpi=100, facecolor='white') # Lower DPI buffer.seek(0) image_base64 = base64.b64encode(buffer.getvalue()).decode() plt.close(fig) # Important: close figure to free memory return image_base64 except Exception as e: st.warning(f"Error creating word cloud for {title}: {str(e)}") return None # Create all word clouds in one row cloud_col1, cloud_col2, cloud_col3 = st.columns(3) with cloud_col1: if unique_freq_1 and len(unique_freq_1) > 0: # Use ALL unique words but display top 25 in cloud img1 = create_wordcloud_image( dict(unique_freq_1), # Use ALL unique words for frequency f"Unique: {col1_selection.replace('_', ' ').title()}", 'Reds' ) if img1: st.markdown(f'''
<img src="data:image/png;base64,{img1}" alt="Word cloud of unique words" style="width: 100%;"/>
Showing top 25 of {len(unique_freq_1)} unique words
''', unsafe_allow_html=True) else: st.markdown("""
📝
No unique words
""", unsafe_allow_html=True) else: st.markdown("""
📝
No unique words found
""", unsafe_allow_html=True) with cloud_col2: if unique_freq_2 and len(unique_freq_2) > 0: # Use ALL unique words but display top 25 in cloud img2 = create_wordcloud_image( dict(unique_freq_2), # Use ALL unique words for frequency f"Unique: {col2_selection.replace('_', ' ').title()}", 'Greens' ) if img2: st.markdown(f'''
<img src="data:image/png;base64,{img2}" alt="Word cloud of unique words" style="width: 100%;"/>
Showing top 25 of {len(unique_freq_2)} unique words
''', unsafe_allow_html=True) else: st.markdown("""
📝
No unique words
""", unsafe_allow_html=True) else: st.markdown("""
📝
No unique words found
""", unsafe_allow_html=True) with cloud_col3: if common_freq and len(common_freq) > 0: # Use ALL common words but display top 25 in cloud img3 = create_wordcloud_image( dict(common_freq), # Use ALL common words for frequency "Common Words", 'Blues' ) if img3: st.markdown(f'''
<img src="data:image/png;base64,{img3}" alt="Word cloud of common words" style="width: 100%;"/>
Showing top 25 of {len(common_freq)} common words
''', unsafe_allow_html=True) else: st.markdown("""
📝
No common words
""", unsafe_allow_html=True) else: st.markdown("""
🤝
No common words found
""", unsafe_allow_html=True) except ImportError: st.warning("📦 WordCloud library not available. Install with: `pip install wordcloud`") # Fallback to top words lists st.markdown("**📋 Top Unique Words (Fallback)**") fallback_col1, fallback_col2, fallback_col3 = st.columns(3) with fallback_col1: st.markdown(f"**🔴 Unique to {col1_selection.replace('_', ' ').title()}**") if unique_freq_1: for word, count in unique_freq_1.most_common(10): st.markdown(f"• {word} ({count})") else: st.markdown("*No unique words*") with fallback_col2: st.markdown(f"**🟢 Unique to {col2_selection.replace('_', ' ').title()}**") if unique_freq_2: for word, count in unique_freq_2.most_common(10): st.markdown(f"• {word} ({count})") else: st.markdown("*No unique words*") with fallback_col3: st.markdown("**🔵 Common Words**") if common_freq: for word, count in common_freq.most_common(10): st.markdown(f"• {word} ({count})") else: st.markdown("*No common words*") # Word frequency bar charts as additional analysis st.markdown("""

📊 Top Words Frequency Comparison

""", unsafe_allow_html=True) freq_col1, freq_col2 = st.columns(2) with freq_col1: if unique_freq_1: top_words_1 = dict(unique_freq_1.most_common(10)) fig_freq1 = px.bar( x=list(top_words_1.values()), y=list(top_words_1.keys()), orientation='h', title=f"Top Unique Words: {col1_selection.replace('_', ' ').title()}", color=list(top_words_1.values()), color_continuous_scale='Reds' ) fig_freq1.update_layout( height=400, xaxis_title="Frequency", yaxis_title="Words", font=dict(family="Inter", size=10) ) st.plotly_chart(fig_freq1, use_container_width=True) with freq_col2: if unique_freq_2: top_words_2 = dict(unique_freq_2.most_common(10)) fig_freq2 = px.bar( x=list(top_words_2.values()), y=list(top_words_2.keys()), orientation='h', title=f"Top Unique Words: {col2_selection.replace('_', ' ').title()}", color=list(top_words_2.values()), color_continuous_scale='Greens' ) fig_freq2.update_layout( height=400, xaxis_title="Frequency", yaxis_title="Words", font=dict(family="Inter", size=10) ) st.plotly_chart(fig_freq2, use_container_width=True) else: st.warning("⚠️ Need at least 2 translation columns for comparison analysis.") else: st.markdown("""

❌ No Data Available

Please ensure translation data files are available in the data directory.

""", unsafe_allow_html=True) # Footer st.markdown("---") st.markdown("""
Built for DSFSI using Streamlit • Translation APIs: Gemini, GPT, NLLB (hosted locally) • Data Science for Social Impact
""", unsafe_allow_html=True) if __name__ == "__main__": main()