# UPTranslate — src/streamlit_app.py
import streamlit as st
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import os
import time
from PIL import Image
# Only import APIs if available.
# Each provider SDK is optional; a module-level flag records whether the
# import succeeded so the rest of the app can degrade gracefully instead
# of crashing at startup.
try:
    from google import genai
    GENAI_AVAILABLE = True
except ImportError:
    GENAI_AVAILABLE = False

try:
    from openai import OpenAI
    OPENAI_AVAILABLE = True
except ImportError:
    OPENAI_AVAILABLE = False
# Resolve paths relative to this file so the app works regardless of the
# current working directory (local run vs. HF Spaces container).
BASE_DIR = os.path.dirname(__file__)
DATA_DIR = os.path.join(BASE_DIR, "data")

# Page configuration
st.set_page_config(
    page_title="Translation Comparison Tool",
    page_icon="🌐",
    layout="wide",
    initial_sidebar_state="collapsed"
)
# Custom CSS for Material Design with Tailwind-inspired styling.
# Injected once at startup; targets Streamlit's internal data-testid /
# data-baseweb hooks (tabs, buttons, text areas), so it may need updating
# when the Streamlit version changes. The string below is rendered
# verbatim by the browser — do not edit it casually.
st.markdown("""
<style>
@import url('https://fonts.googleapis.com/css2?family=Inter:wght@300;400;500;600;700&display=swap');
.main-header {
font-family: 'Inter', sans-serif;
font-size: 1.8rem;
font-weight: 600;
color: #1f2937;
margin-bottom: 0.5rem;
letter-spacing: -0.025em;
text-align: center;
}
.sub-header {
font-family: 'Inter', sans-serif;
font-size: 1.1rem;
font-weight: 400;
color: #6b7280;
margin-bottom: 2rem;
line-height: 1.6;
text-align: center;
}
.logo-container {
display: flex;
justify-content: center;
margin-bottom: 2rem;
}
/* Bold and full-width tabs */
.stTabs [data-baseweb="tab-list"] {
gap: 0px;
width: 100%;
}
.stTabs [data-baseweb="tab"] {
font-family: 'Inter', sans-serif !important;
font-size: 1.1rem !important;
font-weight: 600 !important;
padding: 12px 24px !important;
width: 50% !important;
justify-content: center !important;
border-radius: 0 !important;
background-color: #f8f9fa !important;
color: #374151 !important;
border: 1px solid #e5e7eb !important;
margin: 0 !important;
}
.stTabs [data-baseweb="tab"]:hover {
background-color: #f1f3f4 !important;
color: #1f2937 !important;
}
.stTabs [aria-selected="true"] {
background-color: #3b82f6 !important;
color: white !important;
font-weight: 700 !important;
border-color: #3b82f6 !important;
}
.stTabs [data-baseweb="tab-highlight"] {
display: none !important;
}
.stTabs [data-baseweb="tab-border"] {
display: none !important;
}
.tab-header {
font-family: 'Inter', sans-serif;
font-size: 1.5rem;
font-weight: 600;
color: #374151;
margin-bottom: 1rem;
}
.metric-card {
background: #f9fafb;
border: 1px solid #e5e7eb;
border-radius: 0.75rem;
padding: 1.5rem;
margin: 0.5rem 0;
box-shadow: 0 1px 3px 0 rgba(0, 0, 0, 0.1);
}
.metric-title {
font-family: 'Inter', sans-serif;
font-size: 0.875rem;
font-weight: 500;
color: #6b7280;
text-transform: uppercase;
letter-spacing: 0.05em;
margin-bottom: 0.25rem;
}
.metric-value {
font-family: 'Inter', sans-serif;
font-size: 2rem;
font-weight: 700;
color: #1f2937;
line-height: 1;
}
.support-info {
color: #5f6368;
font-size: 12px;
margin-top: 20px;
text-align: center;
font-family: 'Inter', sans-serif;
}
.translate-container {
border: 1px solid #e0e0e0;
border-radius: 8px;
margin: 20px 0;
overflow: hidden;
box-shadow: 0 2px 5px rgba(0,0,0,0.1);
}
.translate-header {
background: #f8f9fa;
border-bottom: 1px solid #e0e0e0;
padding: 12px 16px;
font-family: 'Inter', sans-serif;
font-weight: 500;
font-size: 14px;
color: #5f6368;
display: flex;
align-items: center;
box-sizing: border-box;
}
.language-tabs-container {
border: 1px solid #e0e0e0;
border-radius: 8px;
margin: 20px 0;
overflow: hidden;
box-shadow: 0 2px 5px rgba(0,0,0,0.1);
}
.language-tabs-header {
background: #f8f9fa;
border-bottom: 1px solid #e0e0e0;
height: 45px;
display: flex;
align-items: stretch;
box-sizing: border-box;
padding: 0;
}
.language-tab {
flex: 1;
background: #f8f9fa;
border: none;
border-right: 1px solid #e0e0e0;
padding: 0;
font-family: 'Inter', sans-serif;
font-size: 14px;
font-weight: 500;
cursor: pointer;
transition: all 0.2s ease;
color: #6b7280;
text-align: center;
height: 45px;
display: flex;
align-items: center;
justify-content: center;
box-sizing: border-box;
text-decoration: none;
outline: none;
}
.language-tab:last-child {
border-right: none;
}
.language-tab.active {
background: white;
color: #3b82f6;
border-bottom: 2px solid #3b82f6;
font-weight: 600;
}
.language-tab:hover:not(.active) {
background: #f1f3f4;
color: #374151;
}
.stTextArea textarea {
resize: none !important;
min-height: 350px !important;
max-height: 350px !important;
height: 350px !important;
}
.stTextArea textarea[disabled] {
color: #000000 !important;
opacity: 1 !important;
-webkit-text-fill-color: #000000 !important;
}
/* Make buttons rounded and complete */
.stButton > button {
font-family: 'Inter', sans-serif !important;
font-size: 0.75rem !important;
font-weight: 500 !important;
border-radius: 6px !important; /* Changed from 0 to 6px for rounded corners */
height: 35px !important;
border: 1px solid #d1d5db !important;
margin: 0 2px !important; /* Added small margin between buttons */
padding: 0 12px !important; /* Increased padding for better look */
cursor: pointer !important;
transition: all 0.2s ease !important;
}
.stButton > button[data-testid="baseButton-secondary"] {
background-color: #f3f4f6 !important;
color: #374151 !important;
border-color: #d1d5db !important;
box-shadow: 0 1px 2px 0 rgba(0, 0, 0, 0.05) !important;
}
.stButton > button[data-testid="baseButton-secondary"]:hover {
background-color: #e5e7eb !important;
color: #1f2937 !important;
border-color: #9ca3af !important;
box-shadow: 0 2px 4px 0 rgba(0, 0, 0, 0.1) !important;
transform: translateY(-1px) !important;
}
.stButton > button[data-testid="baseButton-primary"] {
background-color: #3b82f6 !important;
color: #ffffff !important;
font-weight: 600 !important;
border-color: #3b82f6 !important;
box-shadow: 0 2px 4px 0 rgba(59, 130, 246, 0.3) !important;
}
.stButton > button[data-testid="baseButton-primary"]:hover {
background-color: #2563eb !important;
color: #ffffff !important;
border-color: #2563eb !important;
transform: translateY(-1px) !important;
}
/* Remove the border-right rule since we're using margins now */
/* Hide the default Streamlit button styling for tab buttons */
.language-tab-button {
background: none !important;
border: none !important;
padding: 0 !important;
margin: 0 !important;
height: 100% !important;
width: 100% !important;
color: inherit !important;
font-weight: inherit !important;
}
.language-tab-button:hover {
background: none !important;
border: none !important;
}
.language-tab-button:focus {
background: none !important;
border: none !important;
box-shadow: none !important;
}
.score-card {
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
border-radius: 12px;
padding: 20px;
text-align: center;
color: white;
margin: 10px 0;
}
.score-value {
font-size: 2.5rem;
font-weight: 700;
margin: 10px 0;
}
.score-label {
font-size: 0.9rem;
opacity: 0.9;
text-transform: uppercase;
letter-spacing: 1px;
}
.comparison-container {
background: #f8fafc;
border: 1px solid #e2e8f0;
border-radius: 12px;
padding: 24px;
margin: 20px 0;
}
.word-diff {
display: inline-block;
padding: 4px 8px;
margin: 2px;
border-radius: 6px;
font-weight: 500;
}
.word-added {
background: #dcfce7;
color: #166534;
border: 1px solid #bbf7d0;
}
.word-removed {
background: #fef2f2;
color: #dc2626;
border: 1px solid #fecaca;
}
.word-common {
background: #f1f5f9;
color: #475569;
border: 1px solid #e2e8f0;
}
.block-container {
padding-top: 1rem;
padding-bottom: 0rem;
}
.main > div {
padding-top: 1rem;
}
/* Hide Streamlit header and footer */
header[data-testid="stHeader"] {
height: 0px;
display: none;
}
.stDeployButton {
display: none;
}
footer {
display: none;
}
#MainMenu {
display: none;
}
</style>
""", unsafe_allow_html=True)
# Model configurations.
# NOTE: 'models' has a different shape per provider — Gemini/GPT expose a
# list of user-selectable model names (with a 'default_model'), while NLLB
# maps each supported language to its dedicated fine-tuned checkpoint.
MODEL_CONFIG = {
    'Gemini': {
        'languages': ['Afrikaans', 'Northern Sotho', 'isiZulu'],
        'models': ['gemini-2.0-flash-exp', 'gemini-1.5-flash', 'gemini-1.5-pro'],
        'default_model': 'gemini-2.0-flash-exp'
    },
    'GPT': {
        'languages': ['Afrikaans', 'Northern Sotho', 'isiZulu'],
        'models': ['gpt-4', 'gpt-4-turbo', 'gpt-3.5-turbo'],
        'default_model': 'gpt-4'
    },
    'NLLB': {
        'languages': ['Northern Sotho', 'isiZulu'],  # No Afrikaans model available
        'models': {
            'Northern Sotho': 'dsfsi/dcs-eng-nso-nllb-1.3B',
            'isiZulu': 'dsfsi/dcs-eng-zul-nllb-1.3B'
        }
    }
}
# Language code mappings.
# NOTE(review): 'isizulu' is not an ISO 639 code ('zul' would be); it is
# kept as-is because the sample/analysis dataframe columns are prefixed
# 'isizulu_' — verify both sides before "correcting" it.
LANGUAGE_CODES = {
    'Afrikaans': 'afr',
    'Northern Sotho': 'nso',
    'isiZulu': 'isizulu'
}
# Load logo
def load_logo():
    """Load the app logo from BASE_DIR.

    Returns:
        PIL.Image.Image if ``logo.png`` exists and can be opened,
        otherwise None (a Streamlit warning is shown on read errors).
    """
    # Build the path once with os.path.join instead of repeating an
    # f-string concatenation for the existence check and the open.
    logo_path = os.path.join(BASE_DIR, "logo.png")
    try:
        if os.path.exists(logo_path):
            return Image.open(logo_path)
    except Exception as e:
        # Best-effort: a broken logo must not take down the app.
        st.warning(f"Could not load logo: {str(e)}")
    return None
# Load and cache data
@st.cache_data
def load_translation_data():
    """Build the built-in sample translation DataFrame.

    Columns: 'english' plus, per language prefix (afr/nso/isizulu),
    the human, revised ('_rev') and machine-translated ('_mt_*') variants.
    Falls back to a tiny error frame if construction fails.
    """
    try:
        # Column order matters for display; keep 'english' first.
        return pd.DataFrame({
            'english': ['Hello world', 'How are you?', 'Good morning', 'Thank you', 'Welcome', 'Goodbye'],
            # Human and revised references per language
            'afr': ['Hallo wêreld', 'Hoe gaan dit?', 'Goeie môre', 'Dankie', 'Welkom', 'Totsiens'],
            'afr_rev': ['Hallo wêreld', 'Hoe gaan dit met jou?', 'Goeie môre', 'Baie dankie', 'Welkom', 'Totsiens'],
            'nso': ['Dumela lefase', 'O kae?', 'Thobela', 'Ke a leboga', 'O amogetšwe', 'Šala gabotse'],
            'nso_rev': ['Dumela lefase', 'O phela bjang?', 'Thobela', 'Ke a leboga kudu', 'O amogetšwe', 'Šala gabotse'],
            'isizulu': ['Sawubona mhlaba', 'Unjani?', 'Sawubona', 'Ngiyabonga', 'Wamukelekile', 'Sala kahle'],
            'isizulu_rev': ['Sawubona mhlaba', 'Unjani wena?', 'Sawubona', 'Ngiyabonga kakhulu', 'Wamukelekile', 'Sala kahle'],
            # Machine-translation outputs per system
            'nso_mt_nllb': ['Dumela lefase', 'O kae?', 'Thobela', 'Ke a leboga', 'O amogetšwe', 'Šala gabotse'],
            'isizulu_mt_nllb': ['Sawubona mhlaba', 'Unjani?', 'Sawubona', 'Ngiyabonga', 'Wamukelekile', 'Sala kahle'],
            'afr_mt_gpt': ['Hallo wêreld', 'Hoe gaan dit?', 'Goeie môre', 'Dankie', 'Welkom', 'Totsiens'],
            'nso_mt_gpt': ['Dumela lefase', 'O kae?', 'Thobela', 'Ke a leboga', 'O amogetšwe', 'Šala gabotse'],
            'isizulu_mt_gpt': ['Sawubona mhlaba', 'Unjani?', 'Sawubona', 'Ngiyabonga', 'Wamukelekile', 'Sala kahle'],
            'afr_mt_gemini': ['Hallo wêreld', 'Hoe is dit?', 'Goeie môre', 'Baie dankie', 'Welkom', 'Totsiens'],
            'nso_mt_gemini': ['Dumela lefase', 'O phela bjang?', 'Thobela', 'Ke a leboga kudu', 'O amogetšwe', 'Šala gabotse'],
            'isizulu_mt_gemini': ['Sawubona mhlaba', 'Unjani wena?', 'Sawubona', 'Ngiyabonga kakhulu', 'Wamukelekile', 'Sala kahle'],
        })
    except Exception as exc:
        st.error(f"Error loading data: {str(exc)}")
        return pd.DataFrame({'english': ['Sample text'], 'error': ['Data loading failed']})
def translate_with_gemini(text, target_language, model_name="gemini-2.0-flash-exp", client=None):
    """Translate English *text* into *target_language* via the Gemini API.

    Returns the translated string, or a human-readable "❌ ..." message
    when the SDK is missing, the client is unconfigured, or the call fails.
    """
    # Guard clauses: fail fast with user-facing messages.
    if not GENAI_AVAILABLE:
        return "❌ Gemini library not installed"
    if not client:
        return "❌ Gemini API not configured. Please check your GEMINI_API_KEY."
    language_labels = {
        'Afrikaans': 'Afrikaans',
        'Northern Sotho': 'Northern Sotho (Sepedi)',
        'isiZulu': 'isiZulu'
    }
    target_label = language_labels.get(target_language, target_language)
    prompt = (
        f"Translate the following English text to {target_label}: '{text}'. "
        "Provide only the translation without any explanations."
    )
    try:
        response = client.models.generate_content(model=model_name, contents=prompt)
        return response.text.strip()
    except Exception as exc:
        return f"❌ Error: {str(exc)}"
def translate_with_openai(text, target_language, model_name="gpt-4o", client=None):
    """Translate *text* into *target_language* with OpenAI Chat Completions.

    Returns the translated string, or a "❌ ..." message when the SDK is
    missing, the client is unconfigured, or the request fails.
    """
    # Guard clauses: fail fast with user-facing messages.
    if not OPENAI_AVAILABLE:
        return "❌ OpenAI library not installed"
    if not client:
        return "❌ OpenAI API not configured. Please check your OPENAI_API_KEY."
    target_label = {
        'Afrikaans': 'Afrikaans',
        'Northern Sotho': 'Northern Sotho (Sepedi)',
        'isiZulu': 'isiZulu'
    }.get(target_language, target_language)
    try:
        # Chat Completions API (long-term supported endpoint).
        completion = client.chat.completions.create(
            model=model_name,
            messages=[
                {"role": "system",
                 "content": "You are a professional translator. Provide only the translation without any explanations."},
                {"role": "user",
                 "content": f"Translate the following text to {target_label}: {text}"},
            ],
            max_tokens=1000,
            temperature=0.3,  # low temperature for more consistent translations
        )
        return completion.choices[0].message.content.strip()
    except Exception as exc:
        return f"❌ Error: {str(exc)}"
@st.cache_resource
def initialize_apis():
    """Initialize API clients with proper error handling, supporting both local and HF Spaces.

    Returns:
        (gemini_client, openai_client) — either element may be None.
        Problems are surfaced as Streamlit warnings/errors, never raised.
    """

    def _read_secret(key):
        """Fetch secret: environment first (Docker Spaces), then st.secrets."""
        value = os.environ.get(key)
        if value:
            return value
        if hasattr(st, "secrets") and key in st.secrets:
            return st.secrets.get(key)
        return None

    def _make_gemini():
        # Returns a genai.Client or None (warning/error already displayed).
        try:
            key = _read_secret("GEMINI_API_KEY")
            if not key:
                st.warning("⚠️ Gemini API key not found")
                return None
            return genai.Client(api_key=key)
        except Exception as e:
            st.error(f"❌ Gemini API error: {str(e)}")
            return None

    def _make_openai():
        # Returns an OpenAI client, the legacy module, or None.
        try:
            key = _read_secret("OPENAI_API_KEY")
            if not key:
                st.warning("⚠️ OpenAI API key not found")
                return None
            try:
                # Preferred: new-style client object.
                return OpenAI(api_key=key)
            except TypeError:
                # Fallback for old SDKs: configure and use the module itself.
                import openai
                openai.api_key = key
                return openai
        except Exception as e:
            st.error(f"❌ OpenAI API error: {str(e)}")
            return None

    gemini_client = None
    openai_client = None
    try:
        if GENAI_AVAILABLE:
            gemini_client = _make_gemini()
        if OPENAI_AVAILABLE:
            openai_client = _make_openai()
    except Exception as e:
        st.error(f"❌ API initialization error: {str(e)}")
    return gemini_client, openai_client
def translate_with_nllb(text, target_language):
    """Translate *text* via the unified remote NLLB service.

    NOTE(review): the endpoint is an ephemeral ngrok tunnel and will go
    stale — consider moving it to configuration/secrets.

    Returns the translation, or a "❌ ..." message on any failure.
    """
    try:
        import requests

        # Single ngrok URL for the unified API.
        API_URL = "https://4c2faecc052a.ngrok-free.app"
        # Map UI language names to the API's short codes.
        api_lang = {
            'Northern Sotho': 'nso',
            'isiZulu': 'zul'
        }.get(target_language, target_language.lower())
        resp = requests.post(
            f"{API_URL}/translate_simple",
            params={"text": text, "target_language": api_lang},
            timeout=30
        )
        if resp.status_code != 200:
            return f"❌ API Error: {resp.status_code}"
        # Response body keys the translation by the same language code.
        return resp.json().get(api_lang, '❌ Translation not found')
    except Exception as exc:
        return f"❌ Error: {str(exc)}"
def create_language_tabs(available_languages, current_language, key_suffix=""):
    """Render the language-selector tab strip as raw HTML.

    Args:
        available_languages: languages to show, one tab each.
        current_language: the tab to mark with the "active" CSS class.
        key_suffix: forwarded to the (inert) JS click handler.

    Returns:
        HTML for the tab strip plus a stub <script>. The JS handler is a
        deliberate no-op: Streamlit session state cannot be mutated from
        browser-side JS, so real selection is handled by st.button widgets.
    """
    parts = ['<div class="language-tabs-container"><div class="language-tabs-header">']
    for lang in available_languages:
        active_class = "active" if lang == current_language else ""
        parts.append(
            f'<div class="language-tab {active_class}" '
            f"onclick=\"selectLanguage('{lang}', '{key_suffix}')\">{lang}</div>"
        )
    parts.append('</div></div>')
    # Plain string, not an f-string: nothing is interpolated here, and the
    # doubled {{ }} braces the f-string previously required are gone.
    script = '''
<script>
function selectLanguage(lang, suffix) {
    // No-op: Streamlit session state cannot be updated from JavaScript;
    // actual language selection uses the st.button fallback.
}
</script>
'''
    return "".join(parts) + script
def main():
"""Main application function"""
# Load and display logo and title side by side
logo = load_logo()
# Initialize session state FIRST to avoid refreshes
if 'target_language' not in st.session_state:
st.session_state.target_language = 'Afrikaans'
if 'translation_result' not in st.session_state:
st.session_state.translation_result = ""
if 'current_page' not in st.session_state:
st.session_state.current_page = 1
if 'initialized' not in st.session_state:
st.session_state.initialized = True
col1, col2, col3 = st.columns([1, 2, 1])
with col2:
if logo:
# Convert logo to base64 for HTML embedding
import base64
from io import BytesIO
buffered = BytesIO()
logo.save(buffered, format="PNG")
img_str = base64.b64encode(buffered.getvalue()).decode()
st.markdown(f'''
<div style="display: flex; align-items: center; justify-content: center; gap: 0px; margin-bottom: 1rem;">
<img src="data:image/png;base64,{img_str}" width="180">
<h1 class="main-header" style="margin: 20px;">UP Translate</h1>
</div>
''', unsafe_allow_html=True)
else:
st.markdown('<h1 class="main-header" style="margin-bottom: 1rem;">UP Translate</h1>', unsafe_allow_html=True)
# Initialize APIs
genai_client, openai_client = initialize_apis()
# Initialize session state
if 'target_language' not in st.session_state:
st.session_state.target_language = 'Afrikaans'
if 'translation_result' not in st.session_state:
st.session_state.translation_result = ""
# Create tabs
tab1, tab2 = st.tabs(["🤖 Live Translations", "📊 Existing Translations"])
with tab1:
# st.markdown('<h2 class="tab-header">Live Translation</h2>', unsafe_allow_html=True)
# Create simplified model options
model_options = []
model_mapping = {}
# Add Gemini models
for model in MODEL_CONFIG['Gemini']['models']:
display_name = f"Gemini - {model}"
model_options.append(display_name)
model_mapping[display_name] = ('Gemini', None, model)
# Add GPT models
for model in MODEL_CONFIG['GPT']['models']:
display_name = f"GPT - {model}"
model_options.append(display_name)
model_mapping[display_name] = ('GPT', None, model)
# Add single NLLB option
model_options.append("NLLB - Specialized Models")
model_mapping["NLLB - Specialized Models"] = ('NLLB', None, None)
# Model selection with inline label
label_col, dropdown_col = st.columns([2, 10])
with label_col:
st.markdown('<div style="margin-top: 8px; font-weight: 500;">Select Model:</div>', unsafe_allow_html=True)
with dropdown_col:
selected_model_option = st.selectbox(
"Select Model:",
model_options,
index=0,
key="model_selection_dropdown",
label_visibility="collapsed"
)
selected_provider, _, selected_model = model_mapping[selected_model_option]
# Translation interface
col_left, col_center, col_right = st.columns([5, 1, 5])
# Left side - English Input
with col_left:
st.markdown('<div class="translate-container">', unsafe_allow_html=True)
st.markdown('<div class="translate-header">English</div>', unsafe_allow_html=True)
st.markdown('</div>', unsafe_allow_html=True)
input_text = st.text_area(
"Input",
placeholder="Input text here",
height=350,
key="input_text_live",
label_visibility="collapsed"
)
# Center - Translate Button
with col_center:
# Add spacing to align button with text areas
st.markdown('<div style="height: 150px;"></div>', unsafe_allow_html=True)
translate_clicked = st.button(
"Translate",
key="translate_btn_live",
help="Translate text",
type="primary",
use_container_width=True
)
# Right side - Translation Output
with col_right:
# Determine available languages based on selected provider
if selected_provider == 'NLLB':
available_languages = MODEL_CONFIG['NLLB']['languages']
else:
available_languages = ['Afrikaans', 'Northern Sotho', 'isiZulu']
# Set default language to first available if current selection not available
if st.session_state.target_language not in available_languages:
st.session_state.target_language = available_languages[0]
# Create container with custom styling
st.markdown('<div class="translate-container">', unsafe_allow_html=True)
# Language selection buttons
lang_cols = st.columns(len(available_languages))
for i, lang in enumerate(available_languages):
with lang_cols[i]:
button_type = "primary" if lang == st.session_state.target_language else "secondary"
if st.button(
lang,
key=f"lang_btn_{lang}_live",
type=button_type,
use_container_width=True
):
if st.session_state.target_language != lang: # Only update if different
st.session_state.target_language = lang
st.session_state.translation_result = "" # Clear previous result
st.rerun()
# Translation logic
if translate_clicked and input_text:
with st.spinner("Translating..."):
target_lang = st.session_state.target_language
if selected_provider == 'Gemini':
result = translate_with_gemini(input_text, target_lang, selected_model, genai_client)
elif selected_provider == 'GPT':
result = translate_with_openai(input_text, target_lang, selected_model, openai_client)
elif selected_provider == 'NLLB':
result = translate_with_nllb(input_text, target_lang)
st.session_state.translation_result = result
# Translation output area with proper labeling
st.text_area(
f"Translation ({st.session_state.target_language})", # Dynamic label
value=st.session_state.translation_result,
placeholder="Translation will appear here",
height=350,
key="translation_output_live_fixed", # Changed key to avoid conflicts
disabled=True,
label_visibility="collapsed"
)
# Support information
st.markdown("""
<div class="support-info">
<strong>Available Models:</strong><br>
🔮 <strong>Gemini:</strong> All languages (gemini-2.0-flash-exp, gemini-1.5-flash, gemini-1.5-pro)<br>
🧠 <strong>GPT:</strong> All languages (gpt-4, gpt-4-turbo, gpt-3.5-turbo)<br>
🤗 <strong>NLLB:</strong> Northern Sotho, isiZulu only (specialized models)
</div>
""", unsafe_allow_html=True)
with tab2:
# Load data from base directory automatically
@st.cache_data
def load_analysis_data():
"""Load all analysis data from base directory"""
df_translations = None
df_bleu = None
df_chrf = None
df_comet = None
try:
# Try to load translations data
if os.path.exists(f"{DATA_DIR}/translations.tsv"):
df_translations = pd.read_csv(f"{DATA_DIR}/translations.tsv", sep="\t")
# Convert new CSV format to expected format for analysis
# New format: id,english,afr_human,afr_revised,nso_human,nso_revised,zul_human,zul_revised,afr_gemini,afr_gpt,nso_gemini,nso_gpt,nso_nllb,zul_gemini,zul_gpt,zul_nllb
# Expected format: english, afr_human, afr_revised, nso_human, nso_revised, isizulu_human, isizulu_revised, etc.
# Rename zul columns to isizulu for backward compatibility with analysis code
column_mapping = {
'zul_human': 'isizulu_human',
'zul_revised': 'isizulu_revised',
'zul_gemini': 'isizulu_mt_gemini',
'zul_gpt': 'isizulu_mt_gpt',
'zul_nllb': 'isizulu_mt_nllb',
'afr_gemini': 'afr_mt_gemini',
'afr_gpt': 'afr_mt_gpt',
'nso_gemini': 'nso_mt_gemini',
'nso_gpt': 'nso_mt_gpt',
'nso_nllb': 'nso_mt_nllb'
}
df_translations = df_translations.rename(columns=column_mapping)
elif os.path.exists(f"{DATA_DIR}/translation_data.csv"):
df_translations = pd.read_csv(f"{DATA_DIR}/translation_data.csv")
else:
print("No translation data found, using sample data")
df_translations = load_translation_data() # Fallback to sample data
# Try to load BLEU scores
if os.path.exists(f"{DATA_DIR}/bleu_scores.csv"):
df_bleu = pd.read_csv(f"{DATA_DIR}/bleu_scores.csv")
# Convert zul references to isizulu for compatibility
df_bleu['comparison_pair'] = df_bleu['comparison_pair'].str.replace('zul_', 'isizulu_')
df_bleu['language'] = df_bleu['language'].replace('isiZulu', 'isiZulu') # Already correct
else:
# Sample BLEU data (using isizulu for compatibility with existing analysis code)
df_bleu = pd.DataFrame({
'comparison_pair': ['afr_human_vs_afr_gemini', 'afr_human_vs_afr_gpt', 'afr_human_vs_afr_revised', 'nso_human_vs_nso_gemini', 'nso_human_vs_nso_gpt', 'nso_human_vs_nso_revised', 'nso_human_vs_nso_nllb', 'isizulu_human_vs_isizulu_gemini', 'isizulu_human_vs_isizulu_gpt', 'isizulu_human_vs_isizulu_revised', 'isizulu_human_vs_isizulu_nllb'],
'bleu_score': [0.78, 0.72, 0.89, 0.65, 0.68, 0.85, 0.71, 0.71, 0.69, 0.87, 0.73],
'language': ['Afrikaans', 'Afrikaans', 'Afrikaans', 'Northern Sotho', 'Northern Sotho', 'Northern Sotho', 'Northern Sotho', 'isiZulu', 'isiZulu', 'isiZulu', 'isiZulu']
})
# Try to load COMET scores
if os.path.exists(f"{DATA_DIR}/comet_scores.csv"):
df_comet = pd.read_csv(f"{DATA_DIR}/comet_scores.csv")
else:
# Sample COMET data
df_comet = pd.DataFrame({
'comparison_pair': ['afr_human_vs_afr_gemini', 'afr_human_vs_afr_gpt', 'afr_human_vs_afr_revised', 'nso_human_vs_nso_gemini', 'nso_human_vs_nso_gpt', 'nso_human_vs_nso_revised', 'isizulu_human_vs_isizulu_gemini', 'isizulu_human_vs_isizulu_gpt', 'isizulu_human_vs_isizulu_revised'],
'comet_score': [0.82, 0.79, 0.92, 0.71, 0.74, 0.88, 0.76, 0.73, 0.90],
'language': ['Afrikaans', 'Afrikaans', 'Afrikaans', 'Northern Sotho', 'Northern Sotho', 'Northern Sotho', 'isiZulu', 'isiZulu', 'isiZulu']
})
# Try to load CHRF scores
if os.path.exists(f"{DATA_DIR}/chrf_scores.csv"):
df_chrf = pd.read_csv(f"{DATA_DIR}/chrf_scores.csv")
else:
# Sample CHRF data
df_chrf = pd.DataFrame({
'comparison_pair': ['afr_human_vs_afr_gemini', 'afr_human_vs_afr_gpt', 'afr_human_vs_afr_revised', 'nso_human_vs_nso_gemini', 'nso_human_vs_nso_gpt', 'nso_human_vs_nso_revised', 'isizulu_human_vs_isizulu_gemini', 'isizulu_human_vs_isizulu_gpt', 'isizulu_human_vs_isizulu_revised'],
'chrf_score': [0.75, 0.70, 0.88, 0.60, 0.65, 0.80, 0.68, 0.66, 0.85],
'language': ['Afrikaans', 'Afrikaans', 'Afrikaans', 'Northern Sotho', 'Northern Sotho', 'Northern Sotho', 'isiZulu', 'isiZulu', 'isiZulu']
})
return df_translations, df_bleu, df_comet, df_chrf
except Exception as e:
st.error(f"Error loading data: {str(e)}")
return None, None, None, None
# Load all data
df_translations, df_bleu, df_comet, df_chrf = load_analysis_data()
if df_translations is not None:
# Language selection in columns
lang_col1, lang_col2 = st.columns([2, 10])
with lang_col1:
st.markdown('<div style="margin-top: 8px; font-weight: 500;">Select Language:</div>', unsafe_allow_html=True)
with lang_col2:
languages = ['Afrikaans', 'Northern Sotho', 'isiZulu']
selected_lang = st.selectbox(
"Select Language for Analysis:",
languages,
key="global_lang_select",
label_visibility="collapsed"
)
# Get language code
lang_codes = {'Afrikaans': 'afr', 'Northern Sotho': 'nso', 'isiZulu': 'isizulu'}
code = lang_codes[selected_lang]
# Create analysis tabs
analysis_tab1, analysis_tab2, analysis_tab3, analysis_tab4 = st.tabs(["Sample Translations", "📊 Quality Metrics", "🔄 Revision Analysis", "🔍 Word Comparison"])
with analysis_tab1:
# Translation Samples Tab
st.markdown("""
<div style="margin: 20px 0;">
<h4 style="font-family: 'Inter', sans-serif; font-size: 1.2rem; font-weight: 600; color: #374151; margin: 0 0 16px 0;">
📝 Translation Samples for {selected_lang}
</h4>
</div>
""".format(selected_lang=selected_lang), unsafe_allow_html=True)
# Use the global language selection
samples_code = code
# Show sample translations for the selected language
display_cols = ['english'] + [col for col in df_translations.columns if col.startswith(samples_code)]
if display_cols and len(display_cols) > 1: # Need at least english + 1 translation column
# Control panel
control_col1, control_col2, control_col3, control_col4 = st.columns([1, 7, 1, 2])
with control_col1:
st.markdown('<div style="margin-top: 8px; font-weight: 500;">Samples per page:</div>', unsafe_allow_html=True)
with control_col2:
page_size = st.selectbox(
"Samples per page:",
[10, 25, 50, 100],
index=0,
key="page_size_select",
label_visibility="collapsed"
)
# Initialize session state for pagination
if 'current_page' not in st.session_state:
st.session_state.current_page = 1
# Filter data and calculate pagination
available_data = df_translations[display_cols].dropna(subset=[col for col in display_cols if col != 'english'], how='all')
total_samples = len(available_data)
total_pages = max(1, (total_samples + page_size - 1) // page_size) # Ceiling division
# Ensure current page is valid
if st.session_state.current_page > total_pages:
st.session_state.current_page = 1
# Calculate start and end indices
start_idx = (st.session_state.current_page - 1) * page_size
end_idx = min(start_idx + page_size, total_samples)
# Get current page data
current_page_data = available_data.iloc[start_idx:end_idx]
with control_col3:
st.markdown('<div style="margin-top: 8px; font-weight: 500;">Page:</div>', unsafe_allow_html=True)
with control_col4:
# Page navigation
nav_col1, nav_col2, nav_col3, nav_col4, nav_col5 = st.columns([1, 1, 2, 1, 1])
with nav_col1:
if st.button("⏮️", key="first_page", help="First page", disabled=(st.session_state.current_page == 1)):
st.session_state.current_page = 1
st.rerun()
with nav_col2:
if st.button("◀️", key="prev_page", help="Previous page", disabled=(st.session_state.current_page == 1)):
st.session_state.current_page -= 1
st.rerun()
with nav_col3:
st.markdown(f'<div style="text-align: center; margin-top: 8px; font-weight: 500;">{st.session_state.current_page} / {total_pages}</div>', unsafe_allow_html=True)
with nav_col4:
if st.button("▶️", key="next_page", help="Next page", disabled=(st.session_state.current_page == total_pages)):
st.session_state.current_page += 1
st.rerun()
with nav_col5:
if st.button("⏭️", key="last_page", help="Last page", disabled=(st.session_state.current_page == total_pages)):
st.session_state.current_page = total_pages
st.rerun()
# Statistics cards
stats_col1, stats_col2, stats_col3, stats_col4 = st.columns(4)
with stats_col1:
st.markdown(f"""
<div class="metric-card">
<div class="metric-title">Showing</div>
<div class="metric-value">{len(current_page_data)}</div>
</div>
""", unsafe_allow_html=True)
with stats_col2:
available_systems = len([col for col in display_cols if col != 'english'])
st.markdown(f"""
<div class="metric-card">
<div class="metric-title">Translation Systems</div>
<div class="metric-value">{available_systems}</div>
</div>
""", unsafe_allow_html=True)
with stats_col3:
st.markdown(f"""
<div class="metric-card">
<div class="metric-title">Total Available</div>
<div class="metric-value">{total_samples}</div>
</div>
""", unsafe_allow_html=True)
with stats_col4:
st.markdown(f"""
<div class="metric-card">
<div class="metric-title">Current Page</div>
<div class="metric-value">{st.session_state.current_page}/{total_pages}</div>
</div>
""", unsafe_allow_html=True)
# Display the samples table
st.markdown("### Translation Examples")
if len(current_page_data) > 0:
# Create a styled dataframe with better column names
display_df = current_page_data.copy()
# Rename columns for better display
column_rename = {
'english': 'English (Source)',
}
# Add human-readable names for translation columns
for col in display_df.columns:
if col.startswith(samples_code):
if '_human' in col:
column_rename[col] = f'{selected_lang} (Human)'
elif '_revised' in col:
column_rename[col] = f'{selected_lang} (Revised)'
elif '_mt_gemini' in col or '_gemini' in col:
column_rename[col] = f'{selected_lang} (Gemini)'
elif '_mt_gpt' in col or '_gpt' in col:
column_rename[col] = f'{selected_lang} (GPT)'
elif '_mt_nllb' in col or '_nllb' in col:
column_rename[col] = f'{selected_lang} (NLLB)'
else:
# Generic fallback
clean_name = col.replace(f'{samples_code}_', '').replace('_', ' ').title()
column_rename[col] = f'{selected_lang} ({clean_name})'
display_df = display_df.rename(columns=column_rename)
# Add row numbers based on actual position in full dataset
display_df.index = range(start_idx + 1, end_idx + 1)
display_df.index.name = 'Sample #'
st.dataframe(
display_df,
use_container_width=True,
height=min(600, 50 + len(display_df) * 35), # Dynamic height based on content
column_config={
col: st.column_config.TextColumn(col, width="medium")
for col in display_df.columns
}
)
# Page info summary
st.markdown(f"""
<div style="margin-top: 16px; padding: 12px; background: #f8fafc; border-radius: 6px; text-align: center; color: #6b7280; font-size: 0.9rem;">
📄 Showing samples {start_idx + 1} to {end_idx} of {total_samples} total samples • Page {st.session_state.current_page} of {total_pages}
</div>
""", unsafe_allow_html=True)
# Quick jump to page
if total_pages > 5: # Only show quick jump for datasets with many pages
st.markdown("### Quick Navigation")
jump_col1, jump_col2, jump_col3 = st.columns([1, 2, 1])
with jump_col2:
target_page = st.number_input(
f"Jump to page (1-{total_pages}):",
min_value=1,
max_value=total_pages,
value=st.session_state.current_page,
key="page_jump"
)
if st.button("🔗 Go to Page", use_container_width=True):
if target_page != st.session_state.current_page:
st.session_state.current_page = target_page
st.rerun()
else:
st.warning("⚠️ No translation samples found for the current page.")
else:
st.warning(f"⚠️ No translation data available for {selected_lang}. Expected columns starting with '{samples_code}_'")
# Debug information
available_columns = [col for col in df_translations.columns if col.startswith(samples_code)]
if available_columns:
st.info(f"🔍 Found columns: {', '.join(available_columns)}")
else:
all_lang_columns = [col for col in df_translations.columns if any(col.startswith(prefix) for prefix in ['afr_', 'nso_', 'isizulu_'])]
if all_lang_columns:
st.info(f"💡 Available language columns: {', '.join(all_lang_columns[:10])}{'...' if len(all_lang_columns) > 10 else ''}")
with analysis_tab2:
st.markdown("""
<div style="margin: 20px 0;">
<h4 style="font-family: 'Inter', sans-serif; font-size: 1.2rem; font-weight: 600; color: #374151; margin: 0 0 16px 0;">
📈 Quality Metrics for {selected_lang}
</h4>
</div>
""".format(selected_lang=selected_lang), unsafe_allow_html=True)
# Get language code
lang_codes = {'Afrikaans': 'afr', 'Northern Sotho': 'nso', 'isiZulu': 'isizulu'}
code = lang_codes[selected_lang]
# Score visualizations
if df_bleu is not None and df_chrf is not None and df_comet is not None:
# Filter scores for selected language
lang_bleu = df_bleu[df_bleu['language'] == selected_lang] if 'language' in df_bleu.columns else df_bleu
lang_chrf = df_chrf[df_chrf['language'] == selected_lang] if 'language' in df_chrf.columns else df_chrf
lang_comet = df_comet[df_comet['language'] == selected_lang] if 'language' in df_comet.columns else df_comet
# Check if we have domain-level data
has_domain_data = ('domain' in lang_bleu.columns and 'domain' in lang_chrf.columns and
'domain' in lang_comet.columns and
len(lang_bleu[lang_bleu['domain'] != 'Overall']) > 0)
if has_domain_data:
# Add domain filter
available_domains = sorted(lang_bleu['domain'].unique())
domain_options = ['Overall'] + [d for d in available_domains if d != 'Overall']
selected_domain = st.selectbox(
"📍 Select Domain for Analysis:",
domain_options,
key=f"domain_selector_{selected_lang}"
)
# Filter data based on selected domain
if selected_domain == 'Overall':
display_bleu = lang_bleu[lang_bleu['domain'] == 'Overall']
display_chrf = lang_chrf[lang_chrf['domain'] == 'Overall']
display_comet = lang_comet[lang_comet['domain'] == 'Overall']
chart_title_suffix = " - Overall"
else:
display_bleu = lang_bleu[lang_bleu['domain'] == selected_domain]
display_chrf = lang_chrf[lang_chrf['domain'] == selected_domain]
display_comet = lang_comet[lang_comet['domain'] == selected_domain]
chart_title_suffix = f" - {selected_domain}"
else:
# Use all data if no domain column
display_bleu = lang_bleu
display_chrf = lang_chrf
display_comet = lang_comet
chart_title_suffix = ""
# Create score charts
if len(display_bleu) > 0 and len(display_chrf) > 0 and len(display_comet) > 0:
chart_col1, chart_col2, chart_col3 = st.columns(3)
with chart_col1:
# chrF Score Chart
fig_chrf = px.bar(
display_chrf,
x='comparison_pair',
y='chrf_score',
title=f'chrF Scores - {selected_lang}{chart_title_suffix}',
color='chrf_score',
color_continuous_scale='oranges'
)
fig_chrf.update_layout(
xaxis_title="Translation Pairs",
yaxis_title="chrF Score",
xaxis_tickangle=-45,
height=400,
font=dict(family="Inter", size=12)
)
st.plotly_chart(fig_chrf, use_container_width=True)
with chart_col2:
# BLEU Score Chart
fig_bleu = px.bar(
display_bleu,
x='comparison_pair',
y='bleu_score',
title=f'BLEU Scores - {selected_lang}{chart_title_suffix}',
color='bleu_score',
color_continuous_scale='blues'
)
fig_bleu.update_layout(
xaxis_title="Translation Pairs",
yaxis_title="BLEU Score",
xaxis_tickangle=-45,
height=400,
font=dict(family="Inter", size=12)
)
st.plotly_chart(fig_bleu, use_container_width=True)
with chart_col3:
# COMET Score Chart
fig_comet = px.bar(
display_comet,
x='comparison_pair',
y='comet_score',
title=f'COMET Scores - {selected_lang}{chart_title_suffix}',
color='comet_score',
color_continuous_scale='greens'
)
fig_comet.update_layout(
xaxis_title="Translation Pairs",
yaxis_title="COMET Score",
xaxis_tickangle=-45,
height=400,
font=dict(family="Inter", size=12)
)
st.plotly_chart(fig_comet, use_container_width=True)
# PRIMARY SPIDER CHART - Domain Performance when available, Model Performance otherwise
if has_domain_data:
st.markdown(f"""
<h4 style="font-family: 'Inter', sans-serif; font-weight: 600; color: #374151; margin: 20px 0 16px 0;">
🕸️ Domain Performance Spider Charts - {selected_lang}
</h4>
""", unsafe_allow_html=True)
# Filter out "Overall" so only domain-level values are shown
domain_bleu = lang_bleu[lang_bleu['domain'] != 'Overall']
domain_chrf = lang_chrf[lang_chrf['domain'] != 'Overall']
domain_comet = lang_comet[lang_comet['domain'] != 'Overall']
# Pivot all metrics
pivot_bleu = domain_bleu.pivot(
index='comparison_pair',
columns='domain',
values='bleu_score'
).fillna(0)
pivot_chrf = domain_chrf.pivot(
index='comparison_pair',
columns='domain',
values='chrf_score'
).fillna(0)
pivot_comet = domain_comet.pivot(
index='comparison_pair',
columns='domain',
values='comet_score'
).fillna(0)
# Ensure domains are in the same order for all metrics
domains = sorted(set(pivot_bleu.columns) | set(pivot_chrf.columns) | set(pivot_comet.columns))
pivot_bleu = pivot_bleu.reindex(columns=domains, fill_value=0)
pivot_chrf = pivot_chrf.reindex(columns=domains, fill_value=0)
pivot_comet = pivot_comet.reindex(columns=domains, fill_value=0)
# Define distinct colors with reduced opacity
distinct_colors = [
'rgba(255, 99, 132, 0.4)', # Red
'rgba(54, 162, 235, 0.4)', # Blue
'rgba(99, 255, 132, 0.4)', # Green
'rgba(75, 192, 192, 0.4)', # Teal
'rgba(255, 205, 86, 0.4)', # Yellow
'rgba(153, 102, 255, 0.4)', # Purple
'rgba(255, 159, 64, 0.4)', # Orange
'rgba(199, 199, 199, 0.4)', # Grey
'rgba(83, 102, 255, 0.4)', # Indigo
'rgba(255, 99, 255, 0.4)', # Magenta
]
# Border colors (same colors but full opacity for borders)
border_colors = [
'rgba(255, 99, 132, 1.0)', # Red
'rgba(54, 162, 235, 1.0)', # Blue
'rgba(99, 255, 132, 1.0)', # Green
'rgba(75, 192, 192, 1.0)', # Teal
'rgba(255, 205, 86, 1.0)', # Yellow
'rgba(153, 102, 255, 1.0)', # Purple
'rgba(255, 159, 64, 1.0)', # Orange
'rgba(199, 199, 199, 1.0)', # Grey
'rgba(83, 102, 255, 1.0)', # Indigo
'rgba(255, 99, 255, 1.0)', # Magenta
]
# Layout for three side-by-side spider charts
spider_col1, spider_col2, spider_col3 = st.columns(3)
# ---------------- CHRF SPIDER ----------------
with spider_col1:
fig_chrf_spider = go.Figure()
for i, (model_name, row) in enumerate(pivot_chrf.iterrows()):
color_idx = i % len(distinct_colors)
fig_chrf_spider.add_trace(go.Scatterpolar(
r=row.tolist() + [row.tolist()[0]], # close loop
theta=domains + [domains[0]],
fill='toself',
name=model_name.split('_')[-1].upper(),
fillcolor=distinct_colors[color_idx],
line=dict(color=border_colors[color_idx], width=2),
opacity=0.7,
showlegend=False # Hide legend on first chart
))
fig_chrf_spider.update_layout(
polar=dict(radialaxis=dict(visible=True, range=[0, 1])),
showlegend=False,
title=dict(text=f"Domain Performance (chrF) - {selected_lang}"),
height=450
)
st.plotly_chart(fig_chrf_spider, use_container_width=True)
# ---------------- BLEU SPIDER ----------------
with spider_col2:
fig_bleu_spider = go.Figure()
for i, (model_name, row) in enumerate(pivot_bleu.iterrows()):
color_idx = i % len(distinct_colors)
fig_bleu_spider.add_trace(go.Scatterpolar(
r=row.tolist() + [row.tolist()[0]], # close loop
theta=domains + [domains[0]],
fill='toself',
name=model_name.split('_')[-1].upper(),
fillcolor=distinct_colors[color_idx],
line=dict(color=border_colors[color_idx], width=2),
opacity=0.7,
showlegend=True # Show legend on middle chart
))
fig_bleu_spider.update_layout(
polar=dict(radialaxis=dict(visible=True, range=[0, 1])),
showlegend=True,
title=dict(text=f"Domain Performance (BLEU) - {selected_lang}"),
height=450,
legend=dict(
orientation="h",
yanchor="bottom",
y=-0.3,
xanchor="center",
x=0.5
)
)
st.plotly_chart(fig_bleu_spider, use_container_width=True)
# ---------------- COMET SPIDER ----------------
with spider_col3:
fig_comet_spider = go.Figure()
for i, (model_name, row) in enumerate(pivot_comet.iterrows()):
color_idx = i % len(distinct_colors)
fig_comet_spider.add_trace(go.Scatterpolar(
r=row.tolist() + [row.tolist()[0]], # close loop
theta=domains + [domains[0]],
fill='toself',
name=model_name.split('_')[-1].upper(),
fillcolor=distinct_colors[color_idx],
line=dict(color=border_colors[color_idx], width=2),
opacity=0.7,
showlegend=False # Hide legend on last chart
))
fig_comet_spider.update_layout(
polar=dict(radialaxis=dict(visible=True, range=[0, 1])),
showlegend=False,
title=dict(text=f"Domain Performance (COMET) - {selected_lang}"),
height=450
)
st.plotly_chart(fig_comet_spider, use_container_width=True)
# # Overall Performance Summary
# st.markdown("""
# <h4 style="font-family: 'Inter', sans-serif; font-weight: 600; color: #374151; margin: 30px 0 16px 0;">
# 📋 Overall Performance Summary
# </h4>
# """, unsafe_allow_html=True)
# # Create overall summary table
# if len(display_bleu) > 0 and len(display_chrf) > 0 and len(display_comet) > 0:
# # Merge all three metrics
# merged_scores = pd.merge(display_bleu, display_chrf, on='comparison_pair', suffixes=('_bleu', '_chrf'))
# merged_scores = pd.merge(merged_scores, display_comet, on='comparison_pair')
# merged_scores['model'] = merged_scores['comparison_pair'].apply(lambda x: x.split('_')[-1].upper())
# summary_data = []
# for _, row in merged_scores.iterrows():
# summary_data.append({
# 'Model': row['model'],
# 'BLEU Score': f"{row['bleu_score']:.3f}",
# 'chrF Score': f"{row['chrf_score']:.3f}",
# 'COMET Score': f"{row['comet_score']:.3f}",
# 'Average': f"{(row['bleu_score'] + row['chrf_score'] + row['comet_score']) / 3:.3f}"
# })
# summary_df = pd.DataFrame(summary_data)
# # Only sort if dataframe has data and 'Average' column exists
# if len(summary_df) > 0 and 'Average' in summary_df.columns:
# summary_df = summary_df.sort_values('Average', ascending=False)
# # Style the dataframe
# st.dataframe(
# summary_df,
# use_container_width=True,
# hide_index=True,
# column_config={
# "Model": st.column_config.TextColumn("Model", width="medium"),
# "BLEU Score": st.column_config.NumberColumn("BLEU Score", format="%.3f"),
# "chrF Score": st.column_config.NumberColumn("chrF Score", format="%.3f"),
# "COMET Score": st.column_config.NumberColumn("COMET Score", format="%.3f"),
# "Average": st.column_config.NumberColumn("Average", format="%.3f")
# }
# )
with analysis_tab3:
# Revision Analysis Tab
st.markdown("""
<div style="margin: 20px 0;">
<h4 style="font-family: 'Inter', sans-serif; font-size: 1.2rem; font-weight: 600; color: #374151; margin: 0 0 16px 0;">
✏️ Human Translation Revision Analysis for {selected_lang}
</h4>
</div>
""".format(selected_lang=selected_lang), unsafe_allow_html=True)
# Use the global language selection
rev_code = code
# Check for revision columns
human_col = f"{rev_code}_human"
revised_col = f"{rev_code}_revised"
if human_col in df_translations.columns and revised_col in df_translations.columns:
# Get all rows with human translations for this language
df_lang_data = df_translations[[human_col, revised_col]].copy()
# Remove rows where human translation is missing (can't analyze revisions without original)
df_lang_data = df_lang_data[df_lang_data[human_col].notna()].copy()
total_human_translations = len(df_lang_data)
if total_human_translations == 0:
st.warning(f"⚠️ No human translations found for {selected_lang}")
else:
# Calculate revision statistics
# For missing revised translations, we assume no revision was made (same as original)
df_lang_data[revised_col] = df_lang_data[revised_col].fillna(df_lang_data[human_col])
# Count actual changes
revisions_made = sum(df_lang_data[human_col] != df_lang_data[revised_col])
revision_rate = (revisions_made / total_human_translations) * 100
# Count how many had revision data available
revisions_available = sum(df_translations[revised_col].notna())
# Calculate revision types
def categorize_revision(original, revised):
    """Classify the edit between a human translation and its revision.

    Returns one of: "Missing Data" (either side is NaN/None),
    "No Change" (whitespace-insensitive equality), "Expansion"
    (revision has more words), "Reduction" (fewer words), or
    "Modification" (same word count, different text).
    """
    # Either side missing -> nothing meaningful to compare.
    if pd.isna(original) or pd.isna(revised):
        return "Missing Data"
    original_text, revised_text = str(original), str(revised)
    # Leading/trailing whitespace differences do not count as a revision.
    if original_text.strip() == revised_text.strip():
        return "No Change"
    # Rough size comparison on case-folded, whitespace-split tokens.
    n_orig = len(original_text.lower().split())
    n_rev = len(revised_text.lower().split())
    if n_rev > n_orig:
        return "Expansion"
    if n_rev < n_orig:
        return "Reduction"
    return "Modification"
df_lang_data['revision_type'] = df_lang_data.apply(
lambda row: categorize_revision(row[human_col], row[revised_col]), axis=1
)
# Revision statistics cards
rev_col1, rev_col2, rev_col3, rev_col4 = st.columns(4)
with rev_col1:
st.markdown(f"""
<div class="metric-card">
<div class="metric-title">Human Translations</div>
<div class="metric-value">{total_human_translations}</div>
</div>
""", unsafe_allow_html=True)
with rev_col2:
st.markdown(f"""
<div class="metric-card">
<div class="metric-title">Revisions Available</div>
<div class="metric-value">{revisions_available}</div>
</div>
""", unsafe_allow_html=True)
with rev_col3:
st.markdown(f"""
<div class="metric-card">
<div class="metric-title">Changes Made</div>
<div class="metric-value">{revisions_made}</div>
</div>
""", unsafe_allow_html=True)
with rev_col4:
st.markdown(f"""
<div class="metric-card">
<div class="metric-title">Revision Rate</div>
<div class="metric-value">{revision_rate:.1f}%</div>
</div>
""", unsafe_allow_html=True)
# Revision type analysis
st.markdown("""
<h4 style="font-family: 'Inter', sans-serif; font-weight: 600; color: #374151; margin: 30px 0 16px 0;">
📈 Revision Pattern Analysis
</h4>
""", unsafe_allow_html=True)
revision_counts = df_lang_data['revision_type'].value_counts()
if len(revision_counts) > 0:
# Create revision type charts
rev_chart_col1, rev_chart_col2 = st.columns(2)
with rev_chart_col1:
# Pie chart of revision types
fig_pie = px.pie(
values=revision_counts.values,
names=revision_counts.index,
title=f"Revision Types Distribution",
color_discrete_sequence=px.colors.qualitative.Set3
)
fig_pie.update_layout(height=400, font=dict(family="Inter", size=12))
st.plotly_chart(fig_pie, use_container_width=True)
with rev_chart_col2:
# Bar chart of revision types
fig_bar = px.bar(
x=revision_counts.values,
y=revision_counts.index,
orientation='h',
title=f"Revision Frequency",
color=revision_counts.values,
color_continuous_scale='viridis'
)
fig_bar.update_layout(
height=400,
xaxis_title="Count",
yaxis_title="Revision Type",
font=dict(family="Inter", size=12)
)
st.plotly_chart(fig_bar, use_container_width=True)
# Word-level revision analysis
st.markdown("""
<h4 style="font-family: 'Inter', sans-serif; font-weight: 600; color: #374151; margin: 30px 0 16px 0;">
🔤 Word-Level Changes Analysis
</h4>
""", unsafe_allow_html=True)
# Calculate word changes only for actual revisions
words_added = []
words_removed = []
changed_revisions = df_lang_data[df_lang_data['revision_type'] != 'No Change']
for _, row in changed_revisions.iterrows():
if pd.notna(row[human_col]) and pd.notna(row[revised_col]):
orig_words = set(str(row[human_col]).lower().split())
rev_words = set(str(row[revised_col]).lower().split())
added = rev_words - orig_words
removed = orig_words - rev_words
words_added.extend(list(added))
words_removed.extend(list(removed))
from collections import Counter
added_counts = Counter(words_added)
removed_counts = Counter(words_removed)
word_analysis_col1, word_analysis_col2 = st.columns(2)
with word_analysis_col1:
st.markdown("**🟢 Most Added Words**")
if added_counts:
top_added = dict(added_counts.most_common(15))
# Create horizontal bar chart for added words
fig_added = px.bar(
x=list(top_added.values()),
y=list(top_added.keys()),
orientation='h',
title="Most Frequently Added Words",
color=list(top_added.values()),
color_continuous_scale='Greens'
)
fig_added.update_layout(
height=400,
xaxis_title="Frequency",
yaxis_title="Words",
font=dict(family="Inter", size=10)
)
st.plotly_chart(fig_added, use_container_width=True)
else:
st.markdown("*No words added in revisions*")
with word_analysis_col2:
st.markdown("**🔴 Most Removed Words**")
if removed_counts:
top_removed = dict(removed_counts.most_common(15))
# Create horizontal bar chart for removed words
fig_removed = px.bar(
x=list(top_removed.values()),
y=list(top_removed.keys()),
orientation='h',
title="Most Frequently Removed Words",
color=list(top_removed.values()),
color_continuous_scale='Reds'
)
fig_removed.update_layout(
height=400,
xaxis_title="Frequency",
yaxis_title="Words",
font=dict(family="Inter", size=10)
)
st.plotly_chart(fig_removed, use_container_width=True)
else:
st.markdown("*No words removed in revisions*")
# Revision examples
st.markdown("""
<h4 style="font-family: 'Inter', sans-serif; font-weight: 600; color: #374151; margin: 30px 0 16px 0;">
📝 Revision Examples
</h4>
""", unsafe_allow_html=True)
# Show examples of different types of revisions
revision_examples = changed_revisions.head(10)
if len(revision_examples) > 0:
# Create tabs for different revision types
available_types = revision_examples['revision_type'].unique()
if len(available_types) > 1:
type_tabs = st.tabs([f"{rtype} ({len(revision_examples[revision_examples['revision_type'] == rtype])})"
for rtype in available_types])
for i, rtype in enumerate(available_types):
with type_tabs[i]:
type_examples = revision_examples[revision_examples['revision_type'] == rtype].head(5)
for idx, row in type_examples.iterrows():
st.markdown(f"""
<div style="background: #f8fafc; border-left: 4px solid #3b82f6; padding: 16px; margin: 10px 0; border-radius: 0 8px 8px 0;">
<div style="font-weight: 600; color: #1e40af; margin-bottom: 8px;">Original:</div>
<div style="margin-bottom: 12px; font-family: monospace; background: #fff; padding: 8px; border-radius: 4px;">{row[human_col]}</div>
<div style="font-weight: 600; color: #059669; margin-bottom: 8px;">Revised:</div>
<div style="margin-bottom: 8px; font-family: monospace; background: #fff; padding: 8px; border-radius: 4px;">{row[revised_col]}</div>
<div style="font-size: 0.875rem; color: #6b7280;">Type: <strong>{row['revision_type']}</strong></div>
</div>
""", unsafe_allow_html=True)
else:
# Single type, show directly
for idx, row in revision_examples.iterrows():
st.markdown(f"""
<div style="background: #f8fafc; border-left: 4px solid #3b82f6; padding: 16px; margin: 10px 0; border-radius: 0 8px 8px 0;">
<div style="font-weight: 600; color: #1e40af; margin-bottom: 8px;">Original:</div>
<div style="margin-bottom: 12px; font-family: monospace; background: #fff; padding: 8px; border-radius: 4px;">{row[human_col]}</div>
<div style="font-weight: 600; color: #059669; margin-bottom: 8px;">Revised:</div>
<div style="margin-bottom: 8px; font-family: monospace; background: #fff; padding: 8px; border-radius: 4px;">{row[revised_col]}</div>
<div style="font-size: 0.875rem; color: #6b7280;">Type: <strong>{row['revision_type']}</strong></div>
</div>
""", unsafe_allow_html=True)
else:
st.info(f"No revisions found for {selected_lang}.")
else:
st.info(f"No revision data available for analysis.")
else:
st.warning(f"⚠️ Revision columns not found for {selected_lang}. Expected columns: `{human_col}` and `{revised_col}`")
with analysis_tab4:
# Translation comparison section
st.markdown("""
<div style="margin: 20px 0;">
<h4 style="font-family: 'Inter', sans-serif; font-size: 1.2rem; font-weight: 600; color: #374151; margin: 0 0 16px 0;">
🔍 Translation Comparison & Word Analysis for {selected_lang}
</h4>
</div>
""".format(selected_lang=selected_lang), unsafe_allow_html=True)
# Use the global language selection
comp_code = code
# Get available translation columns for selected language
available_cols = []
for col in df_translations.columns:
if col.startswith(comp_code) and col != 'english':
available_cols.append(col)
if len(available_cols) >= 2:
comp_col1, comp_col2, comp_col3 = st.columns([1, 1, 1])
with comp_col1:
col1_selection = st.selectbox(
"First Translation:",
available_cols,
key="col1_select"
)
with comp_col2:
col2_selection = st.selectbox(
"Second Translation:",
[col for col in available_cols if col != col1_selection],
key="col2_select"
)
with comp_col3:
# Add spacing to align button with selectboxes
st.markdown('<div style="margin-top: 25px;"></div>', unsafe_allow_html=True)
analyze_clicked = st.button(
"🔍 Analyze",
type="primary",
use_container_width=True,
key="analyze_word_diff_btn"
)
if analyze_clicked:
# Perform word analysis with ALL available data
def get_word_differences(text1, text2):
    """Return (unique-to-text1, unique-to-text2, shared) word sets.

    Missing values (NaN/None) are treated as empty texts, so a
    comparison against a missing side simply reports every word of
    the present side as unique. Words are lower-cased and
    whitespace-tokenised.
    """
    def _tokens(value):
        # Missing value -> empty vocabulary.
        return set() if pd.isna(value) else set(str(value).lower().split())

    words1, words2 = _tokens(text1), _tokens(text2)
    return words1 - words2, words2 - words1, words1 & words2
# Analyze ALL rows with available data
unique_words_1 = []
unique_words_2 = []
common_words = []
all_words_1 = [] # For frequency counting
all_words_2 = [] # For frequency counting
# Process all rows, including those with missing revisions
for _, row in df_translations.iterrows():
# Get text from columns, using original if revision is missing
text1 = row[col1_selection] if pd.notna(row[col1_selection]) else None
text2 = row[col2_selection] if pd.notna(row[col2_selection]) else None
# Skip if both are missing
if text1 is None and text2 is None:
continue
# Collect ALL words from each column for frequency analysis
if text1 is not None:
words_from_1 = str(text1).lower().split()
all_words_1.extend(words_from_1)
if text2 is not None:
words_from_2 = str(text2).lower().split()
all_words_2.extend(words_from_2)
# Only do comparison if both texts exist
if text1 is not None and text2 is not None:
only_1, only_2, common = get_word_differences(text1, text2)
unique_words_1.extend(list(only_1))
unique_words_2.extend(list(only_2))
common_words.extend(list(common))
from collections import Counter
# Count frequencies from ALL words
all_freq_1 = Counter(all_words_1) # All words from column 1
all_freq_2 = Counter(all_words_2) # All words from column 2
unique_freq_1 = Counter(unique_words_1) # Only unique words
unique_freq_2 = Counter(unique_words_2) # Only unique words
common_freq = Counter(common_words) # Only common words
# Display statistics
st.markdown('<div class="comparison-container">', unsafe_allow_html=True)
col_result1, col_result2, col_result3, col_result4 = st.columns(4)
with col_result1:
st.markdown(f"""
<div style="text-align: center; padding: 15px;">
<h5 style="color: #dc2626; margin-bottom: 10px;">Unique to {col1_selection.replace('_', ' ').title()}</h5>
<div style="font-size: 1.3rem; font-weight: bold; color: #dc2626;">{len(unique_freq_1)}</div>
<div style="color: #6b7280; font-size: 0.8rem;">unique words</div>
</div>
""", unsafe_allow_html=True)
with col_result2:
st.markdown(f"""
<div style="text-align: center; padding: 15px;">
<h5 style="color: #166534; margin-bottom: 10px;">Unique to {col2_selection.replace('_', ' ').title()}</h5>
<div style="font-size: 1.3rem; font-weight: bold; color: #166534;">{len(unique_freq_2)}</div>
<div style="color: #6b7280; font-size: 0.8rem;">unique words</div>
</div>
""", unsafe_allow_html=True)
with col_result3:
st.markdown(f"""
<div style="text-align: center; padding: 15px;">
<h5 style="color: #475569; margin-bottom: 10px;">Common Words</h5>
<div style="font-size: 1.3rem; font-weight: bold; color: #475569;">{len(common_freq)}</div>
<div style="color: #6b7280; font-size: 0.8rem;">shared words</div>
</div>
""", unsafe_allow_html=True)
with col_result4:
st.markdown(f"""
<div style="text-align: center; padding: 15px;">
<h5 style="color: #7c3aed; margin-bottom: 10px;">Total Vocabulary</h5>
<div style="font-size: 1.3rem; font-weight: bold; color: #7c3aed;">{len(set(all_words_1 + all_words_2))}</div>
<div style="color: #6b7280; font-size: 0.8rem;">total unique words</div>
</div>
""", unsafe_allow_html=True)
st.markdown('</div>', unsafe_allow_html=True)
# Word Clouds Section
st.markdown("""
<h4 style="font-family: 'Inter', sans-serif; font-weight: 600; color: #374151; margin: 30px 0 16px 0;">
☁️ Word Clouds Visualization
</h4>
""", unsafe_allow_html=True)
# Generate word clouds using matplotlib and wordcloud
try:
# Show loading spinner while generating word clouds
with st.spinner("🎨 Generating word clouds... This may take a moment."):
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import io
import base64
# Function to create word cloud image (optimized)
def create_wordcloud_image(word_freq, title, color_scheme='viridis'):
    """Render *word_freq* as a word cloud and return a base64 PNG string.

    Returns None when the frequency mapping is empty or rendering
    fails (a Streamlit warning is shown on failure).
    """
    if not word_freq or len(word_freq) == 0:
        return None
    try:
        # Small, capped cloud keeps rendering fast; fixed seed makes
        # the layout deterministic across reruns.
        cloud = WordCloud(
            width=300,
            height=200,
            background_color='white',
            colormap=color_scheme,
            max_words=25,
            relative_scaling=0.6,
            random_state=42,
            min_font_size=8,
            max_font_size=60,
            prefer_horizontal=0.9,
            collocations=False
        ).generate_from_frequencies(word_freq)
        figure, axis = plt.subplots(figsize=(5, 3))
        axis.imshow(cloud, interpolation='bilinear')
        axis.axis('off')
        axis.set_title(title, fontsize=10, fontweight='bold', pad=10)
        # Serialise to PNG in memory and base64-encode for inline HTML use.
        png_buffer = io.BytesIO()
        plt.savefig(png_buffer, format='png', bbox_inches='tight', dpi=100, facecolor='white')
        png_buffer.seek(0)
        encoded = base64.b64encode(png_buffer.getvalue()).decode()
        plt.close(figure)  # free the figure to avoid leaking memory
        return encoded
    except Exception as e:
        st.warning(f"Error creating word cloud for {title}: {str(e)}")
        return None
# Create all word clouds in one row
cloud_col1, cloud_col2, cloud_col3 = st.columns(3)
with cloud_col1:
if unique_freq_1 and len(unique_freq_1) > 0:
# Use ALL unique words but display top 25 in cloud
img1 = create_wordcloud_image(
dict(unique_freq_1), # Use ALL unique words for frequency
f"Unique: {col1_selection.replace('_', ' ').title()}",
'Reds'
)
if img1:
st.markdown(f'''
<div style="text-align: center; margin: 10px 0;">
<img src="data:image/png;base64,{img1}" style="max-width: 100%; height: auto; border-radius: 6px; box-shadow: 0 1px 4px rgba(0,0,0,0.1);">
</div>
<div style="text-align: center; font-size: 0.8rem; color: #6b7280;">
Showing top 25 of {len(unique_freq_1)} unique words
</div>
''', unsafe_allow_html=True)
else:
st.markdown("""
<div style="text-align: center; padding: 40px; background: #fef2f2; border-radius: 6px; color: #dc2626;">
<div style="font-size: 2rem;">📝</div>
<div style="font-size: 0.9rem; margin-top: 8px;">No unique words</div>
</div>
""", unsafe_allow_html=True)
else:
st.markdown("""
<div style="text-align: center; padding: 40px; background: #f9fafb; border-radius: 6px; color: #6b7280;">
<div style="font-size: 2rem;">📝</div>
<div style="font-size: 0.9rem; margin-top: 8px;">No unique words found</div>
</div>
""", unsafe_allow_html=True)
with cloud_col2:
if unique_freq_2 and len(unique_freq_2) > 0:
# Use ALL unique words but display top 25 in cloud
img2 = create_wordcloud_image(
dict(unique_freq_2), # Use ALL unique words for frequency
f"Unique: {col2_selection.replace('_', ' ').title()}",
'Greens'
)
if img2:
st.markdown(f'''
<div style="text-align: center; margin: 10px 0;">
<img src="data:image/png;base64,{img2}" style="max-width: 100%; height: auto; border-radius: 6px; box-shadow: 0 1px 4px rgba(0,0,0,0.1);">
</div>
<div style="text-align: center; font-size: 0.8rem; color: #6b7280;">
Showing top 25 of {len(unique_freq_2)} unique words
</div>
''', unsafe_allow_html=True)
else:
st.markdown("""
<div style="text-align: center; padding: 40px; background: #f0fdf4; border-radius: 6px; color: #166534;">
<div style="font-size: 2rem;">📝</div>
<div style="font-size: 0.9rem; margin-top: 8px;">No unique words</div>
</div>
""", unsafe_allow_html=True)
else:
st.markdown("""
<div style="text-align: center; padding: 40px; background: #f9fafb; border-radius: 6px; color: #6b7280;">
<div style="font-size: 2rem;">📝</div>
<div style="font-size: 0.9rem; margin-top: 8px;">No unique words found</div>
</div>
""", unsafe_allow_html=True)
with cloud_col3:
if common_freq and len(common_freq) > 0:
# Use ALL common words but display top 25 in cloud
img3 = create_wordcloud_image(
dict(common_freq), # Use ALL common words for frequency
"Common Words",
'Blues'
)
if img3:
st.markdown(f'''
<div style="text-align: center; margin: 10px 0;">
<img src="data:image/png;base64,{img3}" style="max-width: 100%; height: auto; border-radius: 6px; box-shadow: 0 1px 4px rgba(0,0,0,0.1);">
</div>
<div style="text-align: center; font-size: 0.8rem; color: #6b7280;">
Showing top 25 of {len(common_freq)} common words
</div>
''', unsafe_allow_html=True)
else:
st.markdown("""
<div style="text-align: center; padding: 40px; background: #eff6ff; border-radius: 6px; color: #1d4ed8;">
<div style="font-size: 2rem;">📝</div>
<div style="font-size: 0.9rem; margin-top: 8px;">No common words</div>
</div>
""", unsafe_allow_html=True)
else:
st.markdown("""
<div style="text-align: center; padding: 40px; background: #f9fafb; border-radius: 6px; color: #6b7280;">
<div style="font-size: 2rem;">🤝</div>
<div style="font-size: 0.9rem; margin-top: 8px;">No common words found</div>
</div>
""", unsafe_allow_html=True)
except ImportError:
st.warning("📦 WordCloud library not available. Install with: `pip install wordcloud`")
# Fallback to top words lists
st.markdown("**📋 Top Unique Words (Fallback)**")
fallback_col1, fallback_col2, fallback_col3 = st.columns(3)
with fallback_col1:
st.markdown(f"**🔴 Unique to {col1_selection.replace('_', ' ').title()}**")
if unique_freq_1:
for word, count in unique_freq_1.most_common(10):
st.markdown(f"• {word} ({count})")
else:
st.markdown("*No unique words*")
with fallback_col2:
st.markdown(f"**🟢 Unique to {col2_selection.replace('_', ' ').title()}**")
if unique_freq_2:
for word, count in unique_freq_2.most_common(10):
st.markdown(f"• {word} ({count})")
else:
st.markdown("*No unique words*")
with fallback_col3:
st.markdown("**🔵 Common Words**")
if common_freq:
for word, count in common_freq.most_common(10):
st.markdown(f"• {word} ({count})")
else:
st.markdown("*No common words*")
# Word frequency bar charts as additional analysis
st.markdown("""
<h4 style="font-family: 'Inter', sans-serif; font-weight: 600; color: #374151; margin: 30px 0 16px 0;">
📊 Top Words Frequency Comparison
</h4>
""", unsafe_allow_html=True)
freq_col1, freq_col2 = st.columns(2)
with freq_col1:
if unique_freq_1:
top_words_1 = dict(unique_freq_1.most_common(10))
fig_freq1 = px.bar(
x=list(top_words_1.values()),
y=list(top_words_1.keys()),
orientation='h',
title=f"Top Unique Words: {col1_selection.replace('_', ' ').title()}",
color=list(top_words_1.values()),
color_continuous_scale='Reds'
)
fig_freq1.update_layout(
height=400,
xaxis_title="Frequency",
yaxis_title="Words",
font=dict(family="Inter", size=10)
)
st.plotly_chart(fig_freq1, use_container_width=True)
with freq_col2:
if unique_freq_2:
top_words_2 = dict(unique_freq_2.most_common(10))
fig_freq2 = px.bar(
x=list(top_words_2.values()),
y=list(top_words_2.keys()),
orientation='h',
title=f"Top Unique Words: {col2_selection.replace('_', ' ').title()}",
color=list(top_words_2.values()),
color_continuous_scale='Greens'
)
fig_freq2.update_layout(
height=400,
xaxis_title="Frequency",
yaxis_title="Words",
font=dict(family="Inter", size=10)
)
st.plotly_chart(fig_freq2, use_container_width=True)
else:
st.warning("⚠️ Need at least 2 translation columns for comparison analysis.")
else:
st.markdown("""
<div style="background: #fef2f2; border: 1px solid #fecaca; border-radius: 8px; padding: 24px; margin: 16px 0; text-align: center;">
<h3 style="font-family: 'Inter', sans-serif; color: #dc2626; margin: 0 0 12px 0;">❌ No Data Available</h3>
<p style="font-family: 'Inter', sans-serif; color: #7f1d1d; margin: 0;">
Please ensure translation data files are available in the data directory.
</p>
</div>
""", unsafe_allow_html=True)
# Footer
st.markdown("---")
st.markdown("""
<div style="text-align: center; color: #6b7280; font-family: 'Inter', sans-serif; font-size: 0.875rem;">
Built for DSFSI using Streamlit • Translation APIs: Gemini, GPT, NLLB (hosted locally) • Data Science for Social Impact
</div>
""", unsafe_allow_html=True)
# Script entry point: launch the Streamlit dashboard when run directly
# (e.g. `streamlit run streamlit_app.py`); no side effects on import.
if __name__ == "__main__":
    main()