import streamlit as st
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import os
import time
from PIL import Image
# Only import APIs if available
try:
from google import genai
GENAI_AVAILABLE = True
except ImportError:
GENAI_AVAILABLE = False
try:
from openai import OpenAI
OPENAI_AVAILABLE = True
except ImportError:
OPENAI_AVAILABLE = False
BASE_DIR = os.path.dirname(__file__)
DATA_DIR = os.path.join(BASE_DIR, "data")
# Page configuration
st.set_page_config(
page_title="Translation Comparison Tool",
page_icon="🌐",
layout="wide",
initial_sidebar_state="collapsed"
)
# Custom CSS for Material Design with Tailwind-inspired styling
st.markdown("""
""", unsafe_allow_html=True)
# Model configurations
MODEL_CONFIG = {
'Gemini': {
'languages': ['Afrikaans', 'Northern Sotho', 'isiZulu'],
'models': ['gemini-2.0-flash-exp', 'gemini-1.5-flash', 'gemini-1.5-pro'],
'default_model': 'gemini-2.0-flash-exp'
},
'GPT': {
'languages': ['Afrikaans', 'Northern Sotho', 'isiZulu'],
'models': ['gpt-4', 'gpt-4-turbo', 'gpt-3.5-turbo'],
'default_model': 'gpt-4'
},
'NLLB': {
'languages': ['Northern Sotho', 'isiZulu'], # No Afrikaans model available
'models': {
'Northern Sotho': 'dsfsi/dcs-eng-nso-nllb-1.3B',
'isiZulu': 'dsfsi/dcs-eng-zul-nllb-1.3B'
}
}
}
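
# Note the asymmetry above: for Gemini/GPT, 'models' is a list of
# interchangeable model IDs, while for NLLB it maps each target language
# to its dedicated fine-tuned checkpoint.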
# Language code mappings
LANGUAGE_CODES = {
'Afrikaans': 'afr',
'Northern Sotho': 'nso',
'isiZulu': 'isizulu'
}
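
# These codes double as column prefixes in the data files,
# e.g. 'afr_human', 'nso_mt_gpt', 'isizulu_revised'.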
# Load logo
def load_logo():
"""Load logo with error handling"""
try:
if os.path.exists(f"{BASE_DIR}/logo.png"):
return Image.open(f"{BASE_DIR}/logo.png")
except Exception as e:
st.warning(f"Could not load logo: {str(e)}")
return None
# Load and cache data
@st.cache_data
def load_translation_data():
"""Load sample translation data"""
try:
sample_data = {
'english': ['Hello world', 'How are you?', 'Good morning', 'Thank you', 'Welcome', 'Goodbye'],
'afr': ['Hallo wêreld', 'Hoe gaan dit?', 'Goeie môre', 'Dankie', 'Welkom', 'Totsiens'],
'afr_rev': ['Hallo wêreld', 'Hoe gaan dit met jou?', 'Goeie môre', 'Baie dankie', 'Welkom', 'Totsiens'],
'nso': ['Dumela lefase', 'O kae?', 'Thobela', 'Ke a leboga', 'O amogetšwe', 'Šala gabotse'],
'nso_rev': ['Dumela lefase', 'O phela bjang?', 'Thobela', 'Ke a leboga kudu', 'O amogetšwe', 'Šala gabotse'],
'isizulu': ['Sawubona mhlaba', 'Unjani?', 'Sawubona', 'Ngiyabonga', 'Wamukelekile', 'Sala kahle'],
'isizulu_rev': ['Sawubona mhlaba', 'Unjani wena?', 'Sawubona', 'Ngiyabonga kakhulu', 'Wamukelekile', 'Sala kahle'],
'nso_mt_nllb': ['Dumela lefase', 'O kae?', 'Thobela', 'Ke a leboga', 'O amogetšwe', 'Šala gabotse'],
'isizulu_mt_nllb': ['Sawubona mhlaba', 'Unjani?', 'Sawubona', 'Ngiyabonga', 'Wamukelekile', 'Sala kahle'],
'afr_mt_gpt': ['Hallo wêreld', 'Hoe gaan dit?', 'Goeie môre', 'Dankie', 'Welkom', 'Totsiens'],
'nso_mt_gpt': ['Dumela lefase', 'O kae?', 'Thobela', 'Ke a leboga', 'O amogetšwe', 'Šala gabotse'],
'isizulu_mt_gpt': ['Sawubona mhlaba', 'Unjani?', 'Sawubona', 'Ngiyabonga', 'Wamukelekile', 'Sala kahle'],
'afr_mt_gemini': ['Hallo wêreld', 'Hoe is dit?', 'Goeie môre', 'Baie dankie', 'Welkom', 'Totsiens'],
'nso_mt_gemini': ['Dumela lefase', 'O phela bjang?', 'Thobela', 'Ke a leboga kudu', 'O amogetšwe', 'Šala gabotse'],
'isizulu_mt_gemini': ['Sawubona mhlaba', 'Unjani wena?', 'Sawubona', 'Ngiyabonga kakhulu', 'Wamukelekile', 'Sala kahle']
}
return pd.DataFrame(sample_data)
except Exception as e:
st.error(f"Error loading data: {str(e)}")
return pd.DataFrame({'english': ['Sample text'], 'error': ['Data loading failed']})
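
# Column naming convention in the sample frame: '<code>' is the human
# translation, '<code>_rev' its revised version, and '<code>_mt_<system>'
# the machine translation from that system.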
def translate_with_gemini(text, target_language, model_name="gemini-2.0-flash-exp", client=None):
"""Translate text using Gemini API"""
try:
if not GENAI_AVAILABLE:
return "❌ Gemini library not installed"
if not client:
return "❌ Gemini API not configured. Please check your GEMINI_API_KEY."
lang_map = {
'Afrikaans': 'Afrikaans',
'Northern Sotho': 'Northern Sotho (Sepedi)',
'isiZulu': 'isiZulu'
}
prompt = f"Translate the following English text to {lang_map.get(target_language, target_language)}: '{text}'. Provide only the translation without any explanations."
response = client.models.generate_content(
model=model_name, contents=prompt
)
return response.text.strip()
except Exception as e:
return f"❌ Error: {str(e)}"
def translate_with_openai(text, target_language, model_name="gpt-4", client=None):
"""Translate text using OpenAI API with Chat Completions"""
try:
if not OPENAI_AVAILABLE:
return "❌ OpenAI library not installed"
if not client:
return "❌ OpenAI API not configured. Please check your OPENAI_API_KEY."
lang_map = {
'Afrikaans': 'Afrikaans',
'Northern Sotho': 'Northern Sotho (Sepedi)',
'isiZulu': 'isiZulu'
}
# Use the Chat Completions API (stable, widely supported)
response = client.chat.completions.create(
model=model_name,
messages=[
{"role": "system", "content": "You are a professional translator. Provide only the translation without any explanations."},
{"role": "user", "content": f"Translate the following text to {lang_map.get(target_language, target_language)}: {text}"}
],
max_tokens=1000,
temperature=0.3 # Lower temperature for more consistent translations
)
return response.choices[0].message.content.strip()
except Exception as e:
return f"❌ Error: {str(e)}"
@st.cache_resource
def initialize_apis():
"""Initialize API clients with proper error handling, supporting both local and HF Spaces."""
genai_client = None
openai_client = None
def get_secret(name):
"""Fetch secret from env first (Docker Spaces), then Streamlit secrets."""
return (
os.environ.get(name)
or (st.secrets.get(name) if hasattr(st, "secrets") and name in st.secrets else None)
)
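
# Expected secret names (placeholder values), either as environment variables
# or in .streamlit/secrets.toml:
#   GEMINI_API_KEY = "your-gemini-key"
#   OPENAI_API_KEY = "your-openai-key"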
try:
# Gemini API
if GENAI_AVAILABLE:
try:
api_key = get_secret("GEMINI_API_KEY")
if api_key:
genai_client = genai.Client(api_key=api_key)
else:
st.warning("⚠️ Gemini API key not found")
except Exception as e:
st.error(f"❌ Gemini API error: {str(e)}")
# OpenAI API
if OPENAI_AVAILABLE:
try:
api_key = get_secret("OPENAI_API_KEY")
if api_key:
try:
# Try new OpenAI API client
openai_client = OpenAI(api_key=api_key)
except TypeError:
    # Fallback for older SDKs. Note: pre-1.0 SDKs expose
    # openai.ChatCompletion.create rather than client.chat.completions.create,
    # so translate_with_openai would need adjusting for this path.
    import openai
    openai.api_key = api_key
    openai_client = openai
else:
st.warning("⚠️ OpenAI API key not found")
except Exception as e:
st.error(f"❌ OpenAI API error: {str(e)}")
except Exception as e:
st.error(f"❌ API initialization error: {str(e)}")
return genai_client, openai_client
def translate_with_nllb(text, target_language):
"""Translate text using unified NLLB API"""
try:
import requests
# Single ngrok URL for unified API
API_URL = "https://4c2faecc052a.ngrok-free.app"
# Map Streamlit language names to API format
lang_mapping = {
'Northern Sotho': 'nso',
'isiZulu': 'zul'
}
api_lang = lang_mapping.get(target_language, target_language.lower())
response = requests.post(
f"{API_URL}/translate_simple",
params={
"text": text,
"target_language": api_lang
},
timeout=30
)
if response.status_code == 200:
result = response.json()
return result.get(api_lang, '❌ Translation not found')
else:
return f"❌ API Error: {response.status_code}"
except Exception as e:
return f"❌ Error: {str(e)}"
def create_language_tabs(available_languages, current_language, key_suffix=""):
    """Create language tabs with proper styling"""
    tabs_html = ''  # tab markup omitted from this listing
    st.markdown(tabs_html, unsafe_allow_html=True)

# Input panel (header markup omitted from this listing)
st.markdown('', unsafe_allow_html=True)
st.markdown('', unsafe_allow_html=True)
input_text = st.text_area(
"Input",
placeholder="Input text here",
height=350,
key="input_text_live",
label_visibility="collapsed"
)
# Center - Translate Button
with col_center:
# Add spacing to align button with text areas
st.markdown('', unsafe_allow_html=True)
# Language selection buttons
lang_cols = st.columns(len(available_languages))
for i, lang in enumerate(available_languages):
with lang_cols[i]:
button_type = "primary" if lang == st.session_state.target_language else "secondary"
if st.button(
lang,
key=f"lang_btn_{lang}_live",
type=button_type,
use_container_width=True
):
if st.session_state.target_language != lang: # Only update if different
st.session_state.target_language = lang
st.session_state.translation_result = "" # Clear previous result
st.rerun()
# Translation logic
if translate_clicked and input_text:
with st.spinner("Translating..."):
target_lang = st.session_state.target_language
if selected_provider == 'Gemini':
result = translate_with_gemini(input_text, target_lang, selected_model, genai_client)
elif selected_provider == 'GPT':
result = translate_with_openai(input_text, target_lang, selected_model, openai_client)
elif selected_provider == 'NLLB':
result = translate_with_nllb(input_text, target_lang)
else:
    result = "❌ Unknown translation provider"
st.session_state.translation_result = result
# Translation output area with proper labeling
st.text_area(
f"Translation ({st.session_state.target_language})", # Dynamic label
value=st.session_state.translation_result,
placeholder="Translation will appear here",
height=350,
key="translation_output_live_fixed", # Changed key to avoid conflicts
disabled=True,
label_visibility="collapsed"
)
# Support information
st.markdown("""
Available Models:
🔮 Gemini: All languages (gemini-2.0-flash-exp, gemini-1.5-flash, gemini-1.5-pro)
🧠 GPT: All languages (gpt-4, gpt-4-turbo, gpt-3.5-turbo)
🤗 NLLB: Northern Sotho, isiZulu only (specialized models)
""", unsafe_allow_html=True)
with tab2:
# Load data from base directory automatically
@st.cache_data
def load_analysis_data():
"""Load all analysis data from base directory"""
df_translations = None
df_bleu = None
df_chrf = None
df_comet = None
try:
# Try to load translations data
if os.path.exists(f"{DATA_DIR}/translations.tsv"):
df_translations = pd.read_csv(f"{DATA_DIR}/translations.tsv", sep="\t")
# Convert the raw TSV column layout to the format expected by the analysis code
# Raw columns: id, english, afr_human, afr_revised, nso_human, nso_revised, zul_human, zul_revised, afr_gemini, afr_gpt, nso_gemini, nso_gpt, nso_nllb, zul_gemini, zul_gpt, zul_nllb
# Expected format: english, afr_human, afr_revised, nso_human, nso_revised, isizulu_human, isizulu_revised, etc.
# Rename zul columns to isizulu for backward compatibility with analysis code
column_mapping = {
'zul_human': 'isizulu_human',
'zul_revised': 'isizulu_revised',
'zul_gemini': 'isizulu_mt_gemini',
'zul_gpt': 'isizulu_mt_gpt',
'zul_nllb': 'isizulu_mt_nllb',
'afr_gemini': 'afr_mt_gemini',
'afr_gpt': 'afr_mt_gpt',
'nso_gemini': 'nso_mt_gemini',
'nso_gpt': 'nso_mt_gpt',
'nso_nllb': 'nso_mt_nllb'
}
df_translations = df_translations.rename(columns=column_mapping)
elif os.path.exists(f"{DATA_DIR}/translation_data.csv"):
df_translations = pd.read_csv(f"{DATA_DIR}/translation_data.csv")
else:
print("No translation data found, using sample data")
df_translations = load_translation_data() # Fallback to sample data
# Try to load BLEU scores
if os.path.exists(f"{DATA_DIR}/bleu_scores.csv"):
df_bleu = pd.read_csv(f"{DATA_DIR}/bleu_scores.csv")
# Convert zul references to isizulu for compatibility
df_bleu['comparison_pair'] = df_bleu['comparison_pair'].str.replace('zul_', 'isizulu_')
# 'language' values already use the 'isiZulu' spelling; no renaming needed
else:
# Sample BLEU data (using isizulu for compatibility with existing analysis code)
df_bleu = pd.DataFrame({
'comparison_pair': ['afr_human_vs_afr_gemini', 'afr_human_vs_afr_gpt', 'afr_human_vs_afr_revised', 'nso_human_vs_nso_gemini', 'nso_human_vs_nso_gpt', 'nso_human_vs_nso_revised', 'nso_human_vs_nso_nllb', 'isizulu_human_vs_isizulu_gemini', 'isizulu_human_vs_isizulu_gpt', 'isizulu_human_vs_isizulu_revised', 'isizulu_human_vs_isizulu_nllb'],
'bleu_score': [0.78, 0.72, 0.89, 0.65, 0.68, 0.85, 0.71, 0.71, 0.69, 0.87, 0.73],
'language': ['Afrikaans', 'Afrikaans', 'Afrikaans', 'Northern Sotho', 'Northern Sotho', 'Northern Sotho', 'Northern Sotho', 'isiZulu', 'isiZulu', 'isiZulu', 'isiZulu']
})
# Try to load COMET scores
if os.path.exists(f"{DATA_DIR}/comet_scores.csv"):
df_comet = pd.read_csv(f"{DATA_DIR}/comet_scores.csv")
else:
# Sample COMET data
df_comet = pd.DataFrame({
'comparison_pair': ['afr_human_vs_afr_gemini', 'afr_human_vs_afr_gpt', 'afr_human_vs_afr_revised', 'nso_human_vs_nso_gemini', 'nso_human_vs_nso_gpt', 'nso_human_vs_nso_revised', 'isizulu_human_vs_isizulu_gemini', 'isizulu_human_vs_isizulu_gpt', 'isizulu_human_vs_isizulu_revised'],
'comet_score': [0.82, 0.79, 0.92, 0.71, 0.74, 0.88, 0.76, 0.73, 0.90],
'language': ['Afrikaans', 'Afrikaans', 'Afrikaans', 'Northern Sotho', 'Northern Sotho', 'Northern Sotho', 'isiZulu', 'isiZulu', 'isiZulu']
})
# Try to load CHRF scores
if os.path.exists(f"{DATA_DIR}/chrf_scores.csv"):
df_chrf = pd.read_csv(f"{DATA_DIR}/chrf_scores.csv")
else:
# Sample CHRF data
df_chrf = pd.DataFrame({
'comparison_pair': ['afr_human_vs_afr_gemini', 'afr_human_vs_afr_gpt', 'afr_human_vs_afr_revised', 'nso_human_vs_nso_gemini', 'nso_human_vs_nso_gpt', 'nso_human_vs_nso_revised', 'isizulu_human_vs_isizulu_gemini', 'isizulu_human_vs_isizulu_gpt', 'isizulu_human_vs_isizulu_revised'],
'chrf_score': [0.75, 0.70, 0.88, 0.60, 0.65, 0.80, 0.68, 0.66, 0.85],
'language': ['Afrikaans', 'Afrikaans', 'Afrikaans', 'Northern Sotho', 'Northern Sotho', 'Northern Sotho', 'isiZulu', 'isiZulu', 'isiZulu']
})
return df_translations, df_bleu, df_comet, df_chrf
except Exception as e:
st.error(f"Error loading data: {str(e)}")
return None, None, None, None
# Load all data
df_translations, df_bleu, df_comet, df_chrf = load_analysis_data()
if df_translations is not None:
# Language selection in columns
lang_col1, lang_col2 = st.columns([2, 10])
with lang_col1:
st.markdown('Select Language:', unsafe_allow_html=True)
with lang_col2:
languages = ['Afrikaans', 'Northern Sotho', 'isiZulu']
selected_lang = st.selectbox(
"Select Language for Analysis:",
languages,
key="global_lang_select",
label_visibility="collapsed"
)
# Get language code
lang_codes = {'Afrikaans': 'afr', 'Northern Sotho': 'nso', 'isiZulu': 'isizulu'}
code = lang_codes[selected_lang]
# Create analysis tabs
analysis_tab1, analysis_tab2, analysis_tab3, analysis_tab4 = st.tabs(["Sample Translations", "📊 Quality Metrics", "🔄 Revision Analysis", "🔍 Word Comparison"])
with analysis_tab1:
# Translation Samples Tab
st.markdown("""
📝 Translation Samples for {selected_lang}
""".format(selected_lang=selected_lang), unsafe_allow_html=True)
# Use the global language selection
samples_code = code
# Show sample translations for the selected language
display_cols = ['english'] + [col for col in df_translations.columns if col.startswith(samples_code)]
if display_cols and len(display_cols) > 1: # Need at least english + 1 translation column
# Control panel
control_col1, control_col2, control_col3, control_col4 = st.columns([1, 7, 1, 2])
with control_col1:
st.markdown('Samples per page:', unsafe_allow_html=True)
with control_col2:
page_size = st.selectbox(
"Samples per page:",
[10, 25, 50, 100],
index=0,
key="page_size_select",
label_visibility="collapsed"
)
# Initialize session state for pagination
if 'current_page' not in st.session_state:
st.session_state.current_page = 1
# Filter data and calculate pagination
available_data = df_translations[display_cols].dropna(subset=[col for col in display_cols if col != 'english'], how='all')
total_samples = len(available_data)
total_pages = max(1, (total_samples + page_size - 1) // page_size) # Ceiling division
# Ensure current page is valid
if st.session_state.current_page > total_pages:
st.session_state.current_page = 1
# Calculate start and end indices
start_idx = (st.session_state.current_page - 1) * page_size
end_idx = min(start_idx + page_size, total_samples)
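# Worked example: page 2 with page_size 25 gives start_idx 25 and
# end_idx 50, i.e. rows 25..49, displayed as samples 26-50.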
# Get current page data
current_page_data = available_data.iloc[start_idx:end_idx]
with control_col3:
st.markdown('Page:', unsafe_allow_html=True)
with control_col4:
# Page navigation
nav_col1, nav_col2, nav_col3, nav_col4, nav_col5 = st.columns([1, 1, 2, 1, 1])
with nav_col1:
if st.button("⏮️", key="first_page", help="First page", disabled=(st.session_state.current_page == 1)):
st.session_state.current_page = 1
st.rerun()
with nav_col2:
if st.button("◀️", key="prev_page", help="Previous page", disabled=(st.session_state.current_page == 1)):
st.session_state.current_page -= 1
st.rerun()
with nav_col3:
st.markdown(f'{st.session_state.current_page} / {total_pages}', unsafe_allow_html=True)
with nav_col4:
if st.button("▶️", key="next_page", help="Next page", disabled=(st.session_state.current_page == total_pages)):
st.session_state.current_page += 1
st.rerun()
with nav_col5:
if st.button("⏭️", key="last_page", help="Last page", disabled=(st.session_state.current_page == total_pages)):
st.session_state.current_page = total_pages
st.rerun()
# Statistics cards
stats_col1, stats_col2, stats_col3, stats_col4 = st.columns(4)
with stats_col1:
st.markdown(f"""
Showing
{len(current_page_data)}
""", unsafe_allow_html=True)
with stats_col2:
available_systems = len([col for col in display_cols if col != 'english'])
st.markdown(f"""
Translation Systems
{available_systems}
""", unsafe_allow_html=True)
with stats_col3:
st.markdown(f"""
Total Available
{total_samples}
""", unsafe_allow_html=True)
with stats_col4:
st.markdown(f"""
Current Page
{st.session_state.current_page}/{total_pages}
""", unsafe_allow_html=True)
# Display the samples table
st.markdown("### Translation Examples")
if len(current_page_data) > 0:
# Create a styled dataframe with better column names
display_df = current_page_data.copy()
# Rename columns for better display
column_rename = {
'english': 'English (Source)',
}
# Add human-readable names for translation columns
for col in display_df.columns:
if col.startswith(samples_code):
if '_human' in col:
column_rename[col] = f'{selected_lang} (Human)'
elif '_revised' in col:
column_rename[col] = f'{selected_lang} (Revised)'
elif '_mt_gemini' in col or '_gemini' in col:
column_rename[col] = f'{selected_lang} (Gemini)'
elif '_mt_gpt' in col or '_gpt' in col:
column_rename[col] = f'{selected_lang} (GPT)'
elif '_mt_nllb' in col or '_nllb' in col:
column_rename[col] = f'{selected_lang} (NLLB)'
else:
# Generic fallback
clean_name = col.replace(f'{samples_code}_', '').replace('_', ' ').title()
column_rename[col] = f'{selected_lang} ({clean_name})'
display_df = display_df.rename(columns=column_rename)
# Add row numbers based on actual position in full dataset
display_df.index = range(start_idx + 1, end_idx + 1)
display_df.index.name = 'Sample #'
st.dataframe(
display_df,
use_container_width=True,
height=min(600, 50 + len(display_df) * 35), # Dynamic height based on content
column_config={
col: st.column_config.TextColumn(col, width="medium")
for col in display_df.columns
}
)
# Page info summary
st.markdown(f"""
📄 Showing samples {start_idx + 1} to {end_idx} of {total_samples} total samples • Page {st.session_state.current_page} of {total_pages}
""", unsafe_allow_html=True)
# Quick jump to page
if total_pages > 5: # Only show quick jump for datasets with many pages
st.markdown("### Quick Navigation")
jump_col1, jump_col2, jump_col3 = st.columns([1, 2, 1])
with jump_col2:
target_page = st.number_input(
f"Jump to page (1-{total_pages}):",
min_value=1,
max_value=total_pages,
value=st.session_state.current_page,
key="page_jump"
)
if st.button("🔗 Go to Page", use_container_width=True):
if target_page != st.session_state.current_page:
st.session_state.current_page = target_page
st.rerun()
else:
st.warning("⚠️ No translation samples found for the current page.")
else:
st.warning(f"⚠️ No translation data available for {selected_lang}. Expected columns starting with '{samples_code}_'")
# Debug information
available_columns = [col for col in df_translations.columns if col.startswith(samples_code)]
if available_columns:
st.info(f"🔍 Found columns: {', '.join(available_columns)}")
else:
all_lang_columns = [col for col in df_translations.columns if any(col.startswith(prefix) for prefix in ['afr_', 'nso_', 'isizulu_'])]
if all_lang_columns:
st.info(f"💡 Available language columns: {', '.join(all_lang_columns[:10])}{'...' if len(all_lang_columns) > 10 else ''}")
with analysis_tab2:
st.markdown("""
📈 Quality Metrics for {selected_lang}
""".format(selected_lang=selected_lang), unsafe_allow_html=True)
# Get language code
lang_codes = {'Afrikaans': 'afr', 'Northern Sotho': 'nso', 'isiZulu': 'isizulu'}
code = lang_codes[selected_lang]
# Score visualizations
if df_bleu is not None and df_chrf is not None and df_comet is not None:
# Filter scores for selected language
lang_bleu = df_bleu[df_bleu['language'] == selected_lang] if 'language' in df_bleu.columns else df_bleu
lang_chrf = df_chrf[df_chrf['language'] == selected_lang] if 'language' in df_chrf.columns else df_chrf
lang_comet = df_comet[df_comet['language'] == selected_lang] if 'language' in df_comet.columns else df_comet
# Check if we have domain-level data
has_domain_data = ('domain' in lang_bleu.columns and 'domain' in lang_chrf.columns and
'domain' in lang_comet.columns and
len(lang_bleu[lang_bleu['domain'] != 'Overall']) > 0)
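# Domain-level charts are offered only when all three score files carry a
# 'domain' column and contain at least one row besides the aggregate
# 'Overall' entry.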
if has_domain_data:
# Add domain filter
available_domains = sorted(lang_bleu['domain'].unique())
domain_options = ['Overall'] + [d for d in available_domains if d != 'Overall']
selected_domain = st.selectbox(
"📍 Select Domain for Analysis:",
domain_options,
key=f"domain_selector_{selected_lang}"
)
# Filter data based on selected domain
if selected_domain == 'Overall':
display_bleu = lang_bleu[lang_bleu['domain'] == 'Overall']
display_chrf = lang_chrf[lang_chrf['domain'] == 'Overall']
display_comet = lang_comet[lang_comet['domain'] == 'Overall']
chart_title_suffix = " - Overall"
else:
display_bleu = lang_bleu[lang_bleu['domain'] == selected_domain]
display_chrf = lang_chrf[lang_chrf['domain'] == selected_domain]
display_comet = lang_comet[lang_comet['domain'] == selected_domain]
chart_title_suffix = f" - {selected_domain}"
else:
# Use all data if no domain column
display_bleu = lang_bleu
display_chrf = lang_chrf
display_comet = lang_comet
chart_title_suffix = ""
# Create score charts
if len(display_bleu) > 0 and len(display_chrf) > 0 and len(display_comet) > 0:
chart_col1, chart_col2, chart_col3 = st.columns(3)
with chart_col1:
# chrF Score Chart
fig_chrf = px.bar(
display_chrf,
x='comparison_pair',
y='chrf_score',
title=f'chrF Scores - {selected_lang}{chart_title_suffix}',
color='chrf_score',
color_continuous_scale='oranges'
)
fig_chrf.update_layout(
xaxis_title="Translation Pairs",
yaxis_title="chrF Score",
xaxis_tickangle=-45,
height=400,
font=dict(family="Inter", size=12)
)
st.plotly_chart(fig_chrf, use_container_width=True)
with chart_col2:
# BLEU Score Chart
fig_bleu = px.bar(
display_bleu,
x='comparison_pair',
y='bleu_score',
title=f'BLEU Scores - {selected_lang}{chart_title_suffix}',
color='bleu_score',
color_continuous_scale='blues'
)
fig_bleu.update_layout(
xaxis_title="Translation Pairs",
yaxis_title="BLEU Score",
xaxis_tickangle=-45,
height=400,
font=dict(family="Inter", size=12)
)
st.plotly_chart(fig_bleu, use_container_width=True)
with chart_col3:
# COMET Score Chart
fig_comet = px.bar(
display_comet,
x='comparison_pair',
y='comet_score',
title=f'COMET Scores - {selected_lang}{chart_title_suffix}',
color='comet_score',
color_continuous_scale='greens'
)
fig_comet.update_layout(
xaxis_title="Translation Pairs",
yaxis_title="COMET Score",
xaxis_tickangle=-45,
height=400,
font=dict(family="Inter", size=12)
)
st.plotly_chart(fig_comet, use_container_width=True)
# PRIMARY SPIDER CHART - Domain Performance when available, Model Performance otherwise
if has_domain_data:
st.markdown(f"""
🕸️ Domain Performance Spider Charts - {selected_lang}
""", unsafe_allow_html=True)
# Filter out "Overall" so only domain-level values are shown
domain_bleu = lang_bleu[lang_bleu['domain'] != 'Overall']
domain_chrf = lang_chrf[lang_chrf['domain'] != 'Overall']
domain_comet = lang_comet[lang_comet['domain'] != 'Overall']
# Pivot all metrics
pivot_bleu = domain_bleu.pivot(
index='comparison_pair',
columns='domain',
values='bleu_score'
).fillna(0)
pivot_chrf = domain_chrf.pivot(
index='comparison_pair',
columns='domain',
values='chrf_score'
).fillna(0)
pivot_comet = domain_comet.pivot(
index='comparison_pair',
columns='domain',
values='comet_score'
).fillna(0)
# Ensure domains are in the same order for all metrics
domains = sorted(set(pivot_bleu.columns) | set(pivot_chrf.columns) | set(pivot_comet.columns))
pivot_bleu = pivot_bleu.reindex(columns=domains, fill_value=0)
pivot_chrf = pivot_chrf.reindex(columns=domains, fill_value=0)
pivot_comet = pivot_comet.reindex(columns=domains, fill_value=0)
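# Each pivot is now a (comparison_pair x domain) score matrix; reindexing on
# the shared, sorted domain list keeps the three spider charts' axes aligned.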
# Define distinct colors with reduced opacity
distinct_colors = [
'rgba(255, 99, 132, 0.4)', # Red
'rgba(54, 162, 235, 0.4)', # Blue
'rgba(99, 255, 132, 0.4)', # Green
'rgba(75, 192, 192, 0.4)', # Teal
'rgba(255, 205, 86, 0.4)', # Yellow
'rgba(153, 102, 255, 0.4)', # Purple
'rgba(255, 159, 64, 0.4)', # Orange
'rgba(199, 199, 199, 0.4)', # Grey
'rgba(83, 102, 255, 0.4)', # Indigo
'rgba(255, 99, 255, 0.4)', # Magenta
]
# Border colors (same colors but full opacity for borders)
border_colors = [
'rgba(255, 99, 132, 1.0)', # Red
'rgba(54, 162, 235, 1.0)', # Blue
'rgba(99, 255, 132, 1.0)', # Green
'rgba(75, 192, 192, 1.0)', # Teal
'rgba(255, 205, 86, 1.0)', # Yellow
'rgba(153, 102, 255, 1.0)', # Purple
'rgba(255, 159, 64, 1.0)', # Orange
'rgba(199, 199, 199, 1.0)', # Grey
'rgba(83, 102, 255, 1.0)', # Indigo
'rgba(255, 99, 255, 1.0)', # Magenta
]
# Layout for three side-by-side spider charts
spider_col1, spider_col2, spider_col3 = st.columns(3)
# ---------------- CHRF SPIDER ----------------
with spider_col1:
fig_chrf_spider = go.Figure()
for i, (model_name, row) in enumerate(pivot_chrf.iterrows()):
color_idx = i % len(distinct_colors)
fig_chrf_spider.add_trace(go.Scatterpolar(
r=row.tolist() + [row.tolist()[0]], # close loop
theta=domains + [domains[0]],
fill='toself',
name=model_name.split('_')[-1].upper(),
fillcolor=distinct_colors[color_idx],
line=dict(color=border_colors[color_idx], width=2),
opacity=0.7,
showlegend=False # Hide legend on first chart
))
fig_chrf_spider.update_layout(
polar=dict(radialaxis=dict(visible=True, range=[0, 1])),
showlegend=False,
title=dict(text=f"Domain Performance (chrF) - {selected_lang}"),
height=450
)
st.plotly_chart(fig_chrf_spider, use_container_width=True)
# ---------------- BLEU SPIDER ----------------
with spider_col2:
fig_bleu_spider = go.Figure()
for i, (model_name, row) in enumerate(pivot_bleu.iterrows()):
color_idx = i % len(distinct_colors)
fig_bleu_spider.add_trace(go.Scatterpolar(
r=row.tolist() + [row.tolist()[0]], # close loop
theta=domains + [domains[0]],
fill='toself',
name=model_name.split('_')[-1].upper(),
fillcolor=distinct_colors[color_idx],
line=dict(color=border_colors[color_idx], width=2),
opacity=0.7,
showlegend=True # Show legend on middle chart
))
fig_bleu_spider.update_layout(
polar=dict(radialaxis=dict(visible=True, range=[0, 1])),
showlegend=True,
title=dict(text=f"Domain Performance (BLEU) - {selected_lang}"),
height=450,
legend=dict(
orientation="h",
yanchor="bottom",
y=-0.3,
xanchor="center",
x=0.5
)
)
st.plotly_chart(fig_bleu_spider, use_container_width=True)
# ---------------- COMET SPIDER ----------------
with spider_col3:
fig_comet_spider = go.Figure()
for i, (model_name, row) in enumerate(pivot_comet.iterrows()):
color_idx = i % len(distinct_colors)
fig_comet_spider.add_trace(go.Scatterpolar(
r=row.tolist() + [row.tolist()[0]], # close loop
theta=domains + [domains[0]],
fill='toself',
name=model_name.split('_')[-1].upper(),
fillcolor=distinct_colors[color_idx],
line=dict(color=border_colors[color_idx], width=2),
opacity=0.7,
showlegend=False # Hide legend on last chart
))
fig_comet_spider.update_layout(
polar=dict(radialaxis=dict(visible=True, range=[0, 1])),
showlegend=False,
title=dict(text=f"Domain Performance (COMET) - {selected_lang}"),
height=450
)
st.plotly_chart(fig_comet_spider, use_container_width=True)
# # Overall Performance Summary
# st.markdown("""
#
# 📋 Overall Performance Summary
#
# """, unsafe_allow_html=True)
# # Create overall summary table
# if len(display_bleu) > 0 and len(display_chrf) > 0 and len(display_comet) > 0:
# # Merge all three metrics
# merged_scores = pd.merge(display_bleu, display_chrf, on='comparison_pair', suffixes=('_bleu', '_chrf'))
# merged_scores = pd.merge(merged_scores, display_comet, on='comparison_pair')
# merged_scores['model'] = merged_scores['comparison_pair'].apply(lambda x: x.split('_')[-1].upper())
# summary_data = []
# for _, row in merged_scores.iterrows():
# summary_data.append({
# 'Model': row['model'],
# 'BLEU Score': f"{row['bleu_score']:.3f}",
# 'chrF Score': f"{row['chrf_score']:.3f}",
# 'COMET Score': f"{row['comet_score']:.3f}",
# 'Average': f"{(row['bleu_score'] + row['chrf_score'] + row['comet_score']) / 3:.3f}"
# })
# summary_df = pd.DataFrame(summary_data)
# # Only sort if dataframe has data and 'Average' column exists
# if len(summary_df) > 0 and 'Average' in summary_df.columns:
# summary_df = summary_df.sort_values('Average', ascending=False)
# # Style the dataframe
# st.dataframe(
# summary_df,
# use_container_width=True,
# hide_index=True,
# column_config={
# "Model": st.column_config.TextColumn("Model", width="medium"),
# "BLEU Score": st.column_config.NumberColumn("BLEU Score", format="%.3f"),
# "chrF Score": st.column_config.NumberColumn("chrF Score", format="%.3f"),
# "COMET Score": st.column_config.NumberColumn("COMET Score", format="%.3f"),
# "Average": st.column_config.NumberColumn("Average", format="%.3f")
# }
# )
with analysis_tab3:
# Revision Analysis Tab
st.markdown("""
✏️ Human Translation Revision Analysis for {selected_lang}
""".format(selected_lang=selected_lang), unsafe_allow_html=True)
# Use the global language selection
rev_code = code
# Check for revision columns
human_col = f"{rev_code}_human"
revised_col = f"{rev_code}_revised"
if human_col in df_translations.columns and revised_col in df_translations.columns:
# Get all rows with human translations for this language
df_lang_data = df_translations[[human_col, revised_col]].copy()
# Remove rows where human translation is missing (can't analyze revisions without original)
df_lang_data = df_lang_data[df_lang_data[human_col].notna()].copy()
total_human_translations = len(df_lang_data)
if total_human_translations == 0:
st.warning(f"⚠️ No human translations found for {selected_lang}")
else:
# Calculate revision statistics
# For missing revised translations, we assume no revision was made (same as original)
df_lang_data[revised_col] = df_lang_data[revised_col].fillna(df_lang_data[human_col])
# Count actual changes
revisions_made = sum(df_lang_data[human_col] != df_lang_data[revised_col])
revision_rate = (revisions_made / total_human_translations) * 100
# Count how many had revision data available
revisions_available = sum(df_translations[revised_col].notna())
# Calculate revision types
def categorize_revision(original, revised):
if pd.isna(original) or pd.isna(revised):
return "Missing Data"
if str(original).strip() == str(revised).strip():
return "No Change"
orig_words = str(original).lower().split()
rev_words = str(revised).lower().split()
if len(rev_words) > len(orig_words):
return "Expansion"
elif len(rev_words) < len(orig_words):
return "Reduction"
else:
return "Modification"
df_lang_data['revision_type'] = df_lang_data.apply(
lambda row: categorize_revision(row[human_col], row[revised_col]), axis=1
)
# Revision statistics cards
rev_col1, rev_col2, rev_col3, rev_col4 = st.columns(4)
with rev_col1:
st.markdown(f"""
Human Translations
{total_human_translations}
""", unsafe_allow_html=True)
with rev_col2:
st.markdown(f"""
Revisions Available
{revisions_available}
""", unsafe_allow_html=True)
with rev_col3:
st.markdown(f"""
Changes Made
{revisions_made}
""", unsafe_allow_html=True)
with rev_col4:
st.markdown(f"""
Revision Rate
{revision_rate:.1f}%
""", unsafe_allow_html=True)
# Revision type analysis
st.markdown("""
📈 Revision Pattern Analysis
""", unsafe_allow_html=True)
revision_counts = df_lang_data['revision_type'].value_counts()
if len(revision_counts) > 0:
# Create revision type charts
rev_chart_col1, rev_chart_col2 = st.columns(2)
with rev_chart_col1:
# Pie chart of revision types
fig_pie = px.pie(
values=revision_counts.values,
names=revision_counts.index,
title=f"Revision Types Distribution",
color_discrete_sequence=px.colors.qualitative.Set3
)
fig_pie.update_layout(height=400, font=dict(family="Inter", size=12))
st.plotly_chart(fig_pie, use_container_width=True)
with rev_chart_col2:
# Bar chart of revision types
fig_bar = px.bar(
x=revision_counts.values,
y=revision_counts.index,
orientation='h',
title=f"Revision Frequency",
color=revision_counts.values,
color_continuous_scale='viridis'
)
fig_bar.update_layout(
height=400,
xaxis_title="Count",
yaxis_title="Revision Type",
font=dict(family="Inter", size=12)
)
st.plotly_chart(fig_bar, use_container_width=True)
# Word-level revision analysis
st.markdown("""
🔤 Word-Level Changes Analysis
""", unsafe_allow_html=True)
# Calculate word changes only for actual revisions
words_added = []
words_removed = []
changed_revisions = df_lang_data[df_lang_data['revision_type'] != 'No Change']
for _, row in changed_revisions.iterrows():
if pd.notna(row[human_col]) and pd.notna(row[revised_col]):
orig_words = set(str(row[human_col]).lower().split())
rev_words = set(str(row[revised_col]).lower().split())
added = rev_words - orig_words
removed = orig_words - rev_words
words_added.extend(list(added))
words_removed.extend(list(removed))
from collections import Counter
added_counts = Counter(words_added)
removed_counts = Counter(words_removed)
word_analysis_col1, word_analysis_col2 = st.columns(2)
with word_analysis_col1:
st.markdown("**🟢 Most Added Words**")
if added_counts:
top_added = dict(added_counts.most_common(15))
# Create horizontal bar chart for added words
fig_added = px.bar(
x=list(top_added.values()),
y=list(top_added.keys()),
orientation='h',
title="Most Frequently Added Words",
color=list(top_added.values()),
color_continuous_scale='Greens'
)
fig_added.update_layout(
height=400,
xaxis_title="Frequency",
yaxis_title="Words",
font=dict(family="Inter", size=10)
)
st.plotly_chart(fig_added, use_container_width=True)
else:
st.markdown("*No words added in revisions*")
with word_analysis_col2:
st.markdown("**🔴 Most Removed Words**")
if removed_counts:
top_removed = dict(removed_counts.most_common(15))
# Create horizontal bar chart for removed words
fig_removed = px.bar(
x=list(top_removed.values()),
y=list(top_removed.keys()),
orientation='h',
title="Most Frequently Removed Words",
color=list(top_removed.values()),
color_continuous_scale='Reds'
)
fig_removed.update_layout(
height=400,
xaxis_title="Frequency",
yaxis_title="Words",
font=dict(family="Inter", size=10)
)
st.plotly_chart(fig_removed, use_container_width=True)
else:
st.markdown("*No words removed in revisions*")
# Revision examples
st.markdown("""
📝 Revision Examples
""", unsafe_allow_html=True)
# Show examples of different types of revisions
revision_examples = changed_revisions.head(10)
if len(revision_examples) > 0:
# Create tabs for different revision types
available_types = revision_examples['revision_type'].unique()
if len(available_types) > 1:
type_tabs = st.tabs([f"{rtype} ({len(revision_examples[revision_examples['revision_type'] == rtype])})"
for rtype in available_types])
for i, rtype in enumerate(available_types):
with type_tabs[i]:
type_examples = revision_examples[revision_examples['revision_type'] == rtype].head(5)
for idx, row in type_examples.iterrows():
st.markdown(f"""
Original:
{row[human_col]}
Revised:
{row[revised_col]}
Type: {row['revision_type']}
""", unsafe_allow_html=True)
else:
# Single type, show directly
for idx, row in revision_examples.iterrows():
st.markdown(f"""
Original:
{row[human_col]}
Revised:
{row[revised_col]}
Type: {row['revision_type']}
""", unsafe_allow_html=True)
else:
st.info(f"No revisions found for {selected_lang}.")
else:
st.info(f"No revision data available for analysis.")
else:
st.warning(f"⚠️ Revision columns not found for {selected_lang}. Expected columns: `{human_col}` and `{revised_col}`")
with analysis_tab4:
# Translation comparison section
st.markdown("""
🔍 Translation Comparison & Word Analysis for {selected_lang}
""".format(selected_lang=selected_lang), unsafe_allow_html=True)
# Use the global language selection
comp_code = code
# Get available translation columns for selected language
available_cols = []
for col in df_translations.columns:
if col.startswith(comp_code) and col != 'english':
available_cols.append(col)
if len(available_cols) >= 2:
comp_col1, comp_col2, comp_col3 = st.columns([1, 1, 1])
with comp_col1:
col1_selection = st.selectbox(
"First Translation:",
available_cols,
key="col1_select"
)
with comp_col2:
col2_selection = st.selectbox(
"Second Translation:",
[col for col in available_cols if col != col1_selection],
key="col2_select"
)
with comp_col3:
# Add spacing to align button with selectboxes
st.markdown('', unsafe_allow_html=True)
analyze_clicked = st.button(
"🔍 Analyze",
type="primary",
use_container_width=True,
key="analyze_word_diff_btn"
)
if analyze_clicked:
# Perform word analysis with ALL available data
def get_word_differences(text1, text2):
# Handle missing data by using available text
if pd.isna(text1) and pd.isna(text2):
return set(), set(), set()
# If one is missing, treat it as empty for comparison
words1 = set(str(text1).lower().split()) if pd.notna(text1) else set()
words2 = set(str(text2).lower().split()) if pd.notna(text2) else set()
only_in_1 = words1 - words2
only_in_2 = words2 - words1
common = words1 & words2
return only_in_1, only_in_2, common
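# Example from the sample data:
#   get_word_differences("Ke a leboga", "Ke a leboga kudu")
#   returns (set(), {"kudu"}, {"ke", "a", "leboga"})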
# Analyze ALL rows with available data
unique_words_1 = []
unique_words_2 = []
common_words = []
all_words_1 = [] # For frequency counting
all_words_2 = [] # For frequency counting
# Process all rows, including those with missing revisions
for _, row in df_translations.iterrows():
# Get text from columns, using original if revision is missing
text1 = row[col1_selection] if pd.notna(row[col1_selection]) else None
text2 = row[col2_selection] if pd.notna(row[col2_selection]) else None
# Skip if both are missing
if text1 is None and text2 is None:
continue
# Collect ALL words from each column for frequency analysis
if text1 is not None:
words_from_1 = str(text1).lower().split()
all_words_1.extend(words_from_1)
if text2 is not None:
words_from_2 = str(text2).lower().split()
all_words_2.extend(words_from_2)
# Only do comparison if both texts exist
if text1 is not None and text2 is not None:
only_1, only_2, common = get_word_differences(text1, text2)
unique_words_1.extend(list(only_1))
unique_words_2.extend(list(only_2))
common_words.extend(list(common))
from collections import Counter
# Count frequencies from ALL words
all_freq_1 = Counter(all_words_1) # All words from column 1
all_freq_2 = Counter(all_words_2) # All words from column 2
unique_freq_1 = Counter(unique_words_1) # Only unique words
unique_freq_2 = Counter(unique_words_2) # Only unique words
common_freq = Counter(common_words) # Only common words
# Display statistics
st.markdown('', unsafe_allow_html=True)
col_result1, col_result2, col_result3, col_result4 = st.columns(4)
with col_result1:
st.markdown(f"""
Unique to {col1_selection.replace('_', ' ').title()}
{len(unique_freq_1)}
unique words
""", unsafe_allow_html=True)
with col_result2:
st.markdown(f"""
Unique to {col2_selection.replace('_', ' ').title()}
{len(unique_freq_2)}
unique words
""", unsafe_allow_html=True)
with col_result3:
st.markdown(f"""
Common Words
{len(common_freq)}
shared words
""", unsafe_allow_html=True)
with col_result4:
st.markdown(f"""
Total Vocabulary
{len(set(all_words_1 + all_words_2))}
total unique words
""", unsafe_allow_html=True)
st.markdown('', unsafe_allow_html=True)
# Word Clouds Section
st.markdown("""
☁️ Word Clouds Visualization
""", unsafe_allow_html=True)
# Generate word clouds using matplotlib and wordcloud
try:
# Show loading spinner while generating word clouds
with st.spinner("🎨 Generating word clouds... This may take a moment."):
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import io
import base64
# Function to create word cloud image (optimized)
def create_wordcloud_image(word_freq, title, color_scheme='viridis'):
if not word_freq or len(word_freq) == 0:
return None
try:
# Create word cloud with all frequency data, but limit max_words to 25
wordcloud = WordCloud(
width=300, # Reduced size
height=200, # Reduced size
background_color='white',
colormap=color_scheme,
max_words=25, # Display top 25 words
relative_scaling=0.6,
random_state=42,
min_font_size=8,
max_font_size=60,
prefer_horizontal=0.9,
collocations=False # Avoid word combinations
).generate_from_frequencies(word_freq) # Use ALL frequency data
# Create matplotlib figure with smaller size
fig, ax = plt.subplots(figsize=(5, 3)) # Smaller figure
ax.imshow(wordcloud, interpolation='bilinear')
ax.axis('off')
ax.set_title(title, fontsize=10, fontweight='bold', pad=10)
# Convert to base64 for HTML display
buffer = io.BytesIO()
plt.savefig(buffer, format='png', bbox_inches='tight', dpi=100, facecolor='white') # Lower DPI
buffer.seek(0)
image_base64 = base64.b64encode(buffer.getvalue()).decode()
plt.close(fig) # Important: close figure to free memory
return image_base64
except Exception as e:
st.warning(f"Error creating word cloud for {title}: {str(e)}")
return None
# Create all word clouds in one row
cloud_col1, cloud_col2, cloud_col3 = st.columns(3)
with cloud_col1:
if unique_freq_1 and len(unique_freq_1) > 0:
# Use ALL unique words but display top 25 in cloud
img1 = create_wordcloud_image(
dict(unique_freq_1), # Use ALL unique words for frequency
f"Unique: {col1_selection.replace('_', ' ').title()}",
'Reds'
)
if img1:
st.markdown(
    f'<img src="data:image/png;base64,{img1}" style="width:100%;" alt="Word cloud"/>'
    f'<div style="text-align:center;">Showing top 25 of {len(unique_freq_1)} unique words</div>',
    unsafe_allow_html=True
)
else:
    st.markdown("*Word cloud could not be generated*")
else:
    st.markdown("*No unique words to visualize*")
with cloud_col2:
if unique_freq_2 and len(unique_freq_2) > 0:
# Use ALL unique words but display top 25 in cloud
img2 = create_wordcloud_image(
dict(unique_freq_2), # Use ALL unique words for frequency
f"Unique: {col2_selection.replace('_', ' ').title()}",
'Greens'
)
if img2:
st.markdown(
    f'<img src="data:image/png;base64,{img2}" style="width:100%;" alt="Word cloud"/>'
    f'<div style="text-align:center;">Showing top 25 of {len(unique_freq_2)} unique words</div>',
    unsafe_allow_html=True
)
else:
    st.markdown("*Word cloud could not be generated*")
else:
    st.markdown("*No unique words to visualize*")
with cloud_col3:
if common_freq and len(common_freq) > 0:
# Use ALL common words but display top 25 in cloud
img3 = create_wordcloud_image(
dict(common_freq), # Use ALL common words for frequency
"Common Words",
'Blues'
)
if img3:
st.markdown(
    f'<img src="data:image/png;base64,{img3}" style="width:100%;" alt="Word cloud"/>'
    f'<div style="text-align:center;">Showing top 25 of {len(common_freq)} common words</div>',
    unsafe_allow_html=True
)
else:
    st.markdown("*Word cloud could not be generated*")
else:
    st.markdown("*No common words to visualize*")
except ImportError:
st.warning("📦 WordCloud library not available. Install with: `pip install wordcloud`")
# Fallback to top words lists
st.markdown("**📋 Top Unique Words (Fallback)**")
fallback_col1, fallback_col2, fallback_col3 = st.columns(3)
with fallback_col1:
st.markdown(f"**🔴 Unique to {col1_selection.replace('_', ' ').title()}**")
if unique_freq_1:
for word, count in unique_freq_1.most_common(10):
st.markdown(f"• {word} ({count})")
else:
st.markdown("*No unique words*")
with fallback_col2:
st.markdown(f"**🟢 Unique to {col2_selection.replace('_', ' ').title()}**")
if unique_freq_2:
for word, count in unique_freq_2.most_common(10):
st.markdown(f"• {word} ({count})")
else:
st.markdown("*No unique words*")
with fallback_col3:
st.markdown("**🔵 Common Words**")
if common_freq:
for word, count in common_freq.most_common(10):
st.markdown(f"• {word} ({count})")
else:
st.markdown("*No common words*")
# Word frequency bar charts as additional analysis
st.markdown("""
📊 Top Words Frequency Comparison
""", unsafe_allow_html=True)
freq_col1, freq_col2 = st.columns(2)
with freq_col1:
if unique_freq_1:
top_words_1 = dict(unique_freq_1.most_common(10))
fig_freq1 = px.bar(
x=list(top_words_1.values()),
y=list(top_words_1.keys()),
orientation='h',
title=f"Top Unique Words: {col1_selection.replace('_', ' ').title()}",
color=list(top_words_1.values()),
color_continuous_scale='Reds'
)
fig_freq1.update_layout(
height=400,
xaxis_title="Frequency",
yaxis_title="Words",
font=dict(family="Inter", size=10)
)
st.plotly_chart(fig_freq1, use_container_width=True)
with freq_col2:
if unique_freq_2:
top_words_2 = dict(unique_freq_2.most_common(10))
fig_freq2 = px.bar(
x=list(top_words_2.values()),
y=list(top_words_2.keys()),
orientation='h',
title=f"Top Unique Words: {col2_selection.replace('_', ' ').title()}",
color=list(top_words_2.values()),
color_continuous_scale='Greens'
)
fig_freq2.update_layout(
height=400,
xaxis_title="Frequency",
yaxis_title="Words",
font=dict(family="Inter", size=10)
)
st.plotly_chart(fig_freq2, use_container_width=True)
else:
st.warning("⚠️ Need at least 2 translation columns for comparison analysis.")
else:
st.markdown("""
❌ No Data Available
Please ensure translation data files are available in the data directory.
""", unsafe_allow_html=True)
# Footer
st.markdown("---")
st.markdown("""
Built for DSFSI using Streamlit • Translation APIs: Gemini, GPT, NLLB (hosted locally) • Data Science for Social Impact
""", unsafe_allow_html=True)
if __name__ == "__main__":
main()