Spaces:

faisalsns
/

language-detection-compare-models

Sleeping

App Files Files Community

faisalsns committed on Jul 21

Commit

023e017

1 Parent(s): f5cf0c0

Initial commit

Browse files

Files changed (4) hide show

README.md +17 -0
app.py +319 -0
models/lid.176.bin +3 -0
requirements.txt +5 -0

README.md CHANGED Viewed

@@ -12,3 +12,20 @@ short_description: compare language detection models
 ---
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+# Language Detection Comparison App
+This app compares language detection results from three sources:
+- **Facebook fastText** (offline, accurate)
+- **Google Cloud Translation API** (online, requires API key)
+- **Hugging Face language detection model** (configurable)
+## Setup
+1. Install dependencies:
+```bash
+pip install -r requirements.txt
+```

app.py ADDED Viewed

	@@ -0,0 +1,319 @@

+import os
+import json
+import gradio as gr
+import fasttext
+from google.cloud import translate_v2 as translate
+from transformers import pipeline
+from dotenv import load_dotenv
+import subprocess
+# --- Setup FastText model ---
+# The lid.176 model ships with the repository under models/, so the path is
+# resolved relative to this file rather than the current working directory.
+BASE_DIR = os.path.dirname(os.path.abspath(__file__))
+MODEL_PATH = os.path.join(BASE_DIR, "models", "lid.176.bin")
+# Load the model exactly once, inside the guard, so a missing or corrupt file
+# surfaces as the RuntimeError below instead of an unguarded ValueError.
+try:
+    fasttext_model = fasttext.load_model(MODEL_PATH)
+except ValueError as err:
+    raise RuntimeError("FastText model file could not be loaded.") from err
+# --- Setup Google Translate Client ---
+# The Google client library locates its service-account key through the
+# GOOGLE_APPLICATION_CREDENTIALS environment variable (plural). The singular
+# spelling is still honoured as a fallback so existing deployments that set
+# the old name keep working.
+google_creds_path = (
+    os.getenv("GOOGLE_APPLICATION_CREDENTIALS")
+    or os.getenv("GOOGLE_APPLICATION_CREDENTIAL")
+)
+if google_creds_path and os.path.isfile(google_creds_path):
+    # Export under the standard name so translate.Client() can find the key
+    # even when only the legacy variable was set.
+    os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = google_creds_path
+    translate_client = translate.Client()
+else:
+    # No usable credentials file: the Google column reports "Not Configured".
+    translate_client = None
+# --- Setup Hugging Face pipeline ---
+# Default language-identification model; it is loaded once at import time.
+HF_MODEL_NAME = "papluca/xlm-roberta-base-language-detection"
+hf_lang_detector = pipeline(task="text-classification", model=HF_MODEL_NAME)
+# --- Maps ISO 639-1 language codes to two-letter country codes (render_result turns them into flag emojis) ---
+# Each list holds at most 5 countries per language (edit as needed)
+LANGUAGE_TO_COUNTRIES = {
+    "en": ["US", "GB", "CA", "AU", "IN"],
+    "fr": ["FR", "BE", "CA", "CH", "LU"],
+    "es": ["ES", "MX", "CO", "AR", "PE"],
+    "de": ["DE", "AT", "CH", "LU", "BE"],
+    "ar": ["EG", "SA", "IQ", "DZ", "MA"],
+    "hi": ["IN", "FJ", "MU", "NP", "SG"],
+    "zh": ["CN", "SG", "MY", "TW", "HK"],
+    "ru": ["RU", "BY", "KZ", "UA", "KG"],
+    "pt": ["PT", "BR", "AO", "MZ", "GW"],
+    "ja": ["JP"],
+    "ko": ["KR"],
+}
+def flag_emoji(country_code):
+    """Return the flag emoji for a two-letter country code.
+
+    Each letter is shifted onto the Unicode regional-indicator range
+    (U+1F1E6 corresponds to 'A'); two such symbols in a row render as a flag.
+    The code is upper-cased first so "us" and "US" produce the same emoji.
+    An empty string yields an empty string.
+    """
+    return "".join(chr(0x1F1E6 + ord(c) - ord('A')) for c in country_code.upper())
+def render_result(model_name, lang_code, score):
+    """Build the HTML snippet for one model's detection result.
+
+    Shows the model name, the detected language code and its score, followed
+    by up to five flag emojis looked up in LANGUAGE_TO_COUNTRIES. A globe is
+    shown when the language code has no entry.
+    """
+    countries = LANGUAGE_TO_COUNTRIES.get(lang_code, [])
+    if not countries:
+        flag_str, etc = "🌐", ""
+    else:
+        flag_str = " ".join(map(flag_emoji, countries[:5]))
+        # Hint that more countries exist than the five that are displayed.
+        etc = "" if len(countries) <= 5 else "<br>...etc"
+    return f"<b>{model_name}:</b> <code>{lang_code}</code> ({score})<br>{flag_str}{etc}"
+# def detect_languages(text, hf_model_path=None):
+#     # FastText
+#     try:
+#         ft_label, ft_score = fasttext_model.predict(text, k=1)
+#         ft_lang = ft_label[0].replace("__label__", "")
+#         ft_score = round(ft_score[0], 3)
+#     except Exception:
+#         ft_lang, ft_score = "Error", 0
+#     # Google Translate
+#     if translate_client:
+#         try:
+#             result = translate_client.detect_language(text)
+#             google_lang = result.get("language", "N/A")
+#             google_conf = round(result.get("confidence", 0), 3)
+#         except Exception:
+#             google_lang, google_conf = "Error", 0
+#     else:
+#         google_lang, google_conf = "NotConfigured", 0
+#     # Hugging Face
+#     try:
+#         model = (
+#             pipeline("text-classification", model=hf_model_path)
+#             if hf_model_path and hf_model_path.strip()
+#             else hf_lang_detector
+#         )
+#         hf_results = model(text)
+#         hf_lang = hf_results[0]["label"].lower()
+#         hf_score = round(hf_results[0]["score"], 3)
+#     except Exception:
+#         hf_lang, hf_score = "Error", 0
+#     return (
+#         render_result("FastText", ft_lang, ft_score),
+#         render_result("Google", google_lang, google_conf),
+#         render_result("HuggingFace", hf_lang, hf_score)
+#     )
+from langcodes import Language
+# Maps language code to countries where it's spoken (lists may exceed 5; format_with_flags shows only the first 5)
+LANG_COUNTRY_MAP = {
+    'af': ['ZA', 'NA'],
+    'am': ['ET'],
+    'ar': ['SA', 'EG', 'IQ', 'MA', 'DZ', 'SD', 'SY', 'YE', 'JO', 'LB', 'TN', 'AE', 'OM', 'KW', 'BH', 'QA', 'LY'],
+    'az': ['AZ'],
+    'be': ['BY'],
+    'bg': ['BG'],
+    'bn': ['BD', 'IN'],
+    'bs': ['BA'],
+    'ca': ['ES', 'AD'],
+    'ceb': ['PH'],
+    'cs': ['CZ'],
+    'cy': ['GB'],
+    'da': ['DK'],
+    'de': ['DE', 'AT', 'CH', 'LU', 'BE', 'LI'],
+    'el': ['GR', 'CY'],
+    'en': ['US', 'GB', 'CA', 'AU', 'NZ', 'IE', 'ZA', 'IN', 'PH', 'NG', 'KE', 'UG'],
+    'eo': ['PL', 'FR', 'DE', 'US'],
+    'es': ['ES', 'MX', 'CO', 'AR', 'PE', 'VE', 'CL', 'EC', 'GT', 'CU', 'BO', 'DO', 'HN', 'PY', 'SV', 'NI', 'CR', 'PA', 'UY'],
+    'et': ['EE'],
+    'eu': ['ES', 'FR'],
+    'fa': ['IR', 'AF', 'TJ'],
+    'fi': ['FI'],
+    'fil': ['PH'],
+    'fj': ['FJ'],
+    'fr': ['FR', 'BE', 'CA', 'CH', 'LU', 'CI', 'SN', 'ML', 'CM', 'HT', 'MG', 'NE', 'TG', 'GA', 'CD', 'BF', 'TD'],
+    'fy': ['NL'],
+    'ga': ['IE'],
+    'gd': ['GB'],
+    'gl': ['ES'],
+    'gu': ['IN'],
+    'ha': ['NG', 'NE', 'GH'],
+    'haw': ['US'],
+    'he': ['IL'],
+    'hi': ['IN', 'FJ', 'MU', 'NP', 'SG'],
+    'hmn': ['US'],
+    'hr': ['HR', 'BA'],
+    'ht': ['HT'],
+    'hu': ['HU'],
+    'hy': ['AM'],
+    'id': ['ID'],
+    'ig': ['NG'],
+    'is': ['IS'],
+    'it': ['IT', 'CH', 'SM'],
+    'ja': ['JP'],
+    'jv': ['ID'],
+    'ka': ['GE'],
+    'kk': ['KZ'],
+    'km': ['KH'],
+    'kn': ['IN'],
+    'ko': ['KR', 'KP'],
+    'ku': ['IQ', 'TR', 'SY', 'IR'],
+    'ky': ['KG'],
+    'la': ['VA'],
+    'lb': ['LU'],
+    'lo': ['LA'],
+    'lt': ['LT'],
+    'lv': ['LV'],
+    'mg': ['MG'],
+    'mi': ['NZ'],
+    'mk': ['MK'],
+    'ml': ['IN'],
+    'mn': ['MN'],
+    'mr': ['IN'],
+    'ms': ['MY', 'BN', 'SG'],
+    'mt': ['MT'],
+    'my': ['MM'],
+    'ne': ['NP'],
+    'nl': ['NL', 'BE', 'SR', 'AW', 'CW'],
+    'no': ['NO'],
+    'ny': ['MW', 'ZM', 'ZW'],
+    'pa': ['IN', 'PK'],
+    'pl': ['PL'],
+    'ps': ['AF'],
+    'pt': ['PT', 'BR', 'AO', 'MZ', 'GW', 'ST', 'CV'],
+    'ro': ['RO', 'MD'],
+    'ru': ['RU', 'BY', 'KZ', 'KG', 'UA'],
+    'rw': ['RW'],
+    'sd': ['PK'],
+    'si': ['LK'],
+    'sk': ['SK'],
+    'sl': ['SI'],
+    'sm': ['WS'],
+    'sn': ['ZW'],
+    'so': ['SO'],
+    'sq': ['AL', 'XK', 'MK'],
+    'sr': ['RS', 'BA', 'ME'],
+    'st': ['LS'],
+    'su': ['ID'],
+    'sv': ['SE', 'FI'],
+    'sw': ['KE', 'TZ', 'UG'],
+    'ta': ['IN', 'LK', 'SG', 'MY'],
+    'te': ['IN'],
+    'tg': ['TJ'],
+    'th': ['TH'],
+    'ti': ['ET', 'ER'],
+    'tk': ['TM'],
+    'tl': ['PH'],
+    'tr': ['TR', 'CY'],
+    'tt': ['RU'],
+    'ug': ['CN'],
+    'uk': ['UA'],
+    'ur': ['PK', 'IN'],
+    'uz': ['UZ'],
+    'vi': ['VN'],
+    'xh': ['ZA'],
+    'yi': ['US', 'IL'],
+    'yo': ['NG'],
+    'zh': ['CN', 'SG', 'MY', 'TW'],
+    'zu': ['ZA'],
+}
+def country_flag_img(country_code):
+    """Return an <img> tag (plus a line break) showing a country's flag.
+
+    The image is served by flagcdn.com, whose paths use the lower-case
+    country code. The tooltip is the upper-case country code itself: the
+    language-keyed LANG_COUNTRY_MAP is not consulted, because looking a
+    country code up there could only return an unrelated list of countries.
+    """
+    code = country_code.upper()
+    return f"<img src='https://flagcdn.com/w40/{code.lower()}.png' title='{code}' height='20' style='margin-right:4px'/><br/>"
+def format_with_flags(lang_code):
+    """Return flag-image HTML for the countries mapped to ``lang_code``.
+
+    At most five flags are rendered; when LANG_COUNTRY_MAP lists more, a
+    trailing "etc..." marker is appended. An unknown language code gives an
+    empty string.
+    """
+    all_countries = LANG_COUNTRY_MAP.get(lang_code, [])
+    pieces = [country_flag_img(code) for code in all_countries[:5]]
+    if len(all_countries) > 5:
+        pieces.append("<span style='margin-left:4px;'>etc...</span>")
+    return ''.join(pieces)
+def detect_languages(text, hf_model_path=None):
+    """Run all three language detectors on ``text`` and return HTML results.
+
+    Args:
+        text: The text to classify; may span several lines.
+        hf_model_path: Optional Hugging Face model id or path. When empty or
+            equal to HF_MODEL_NAME, the already-loaded default detector is
+            reused instead of building a new pipeline.
+
+    Returns:
+        A 3-tuple of HTML strings for FastText, the Google API and the
+        Hugging Face model, in that order. Each holds the detected language
+        code, its score and the flags from ``format_with_flags``. A detector
+        that fails reports "Error"/"error" with score 0 instead of raising.
+    """
+    # Local import: labels come from arbitrary models and are rendered as HTML.
+    from html import escape
+
+    if not text or not text.strip():
+        prompt = "<i>Please enter some text.</i>"
+        return (prompt, prompt, prompt)
+
+    # fastText's predict() handles one line at a time and raises on embedded
+    # newlines, so multi-line input is flattened first.
+    single_line = " ".join(text.splitlines())
+    try:
+        ft_labels, ft_probs = fasttext_model.predict(single_line, k=1)
+        ft_lang = ft_labels[0].replace("__label__", "")
+        ft_score = round(float(ft_probs[0]), 3)
+    except Exception:
+        ft_lang, ft_score = "Error", 0
+
+    if translate_client:
+        try:
+            result = translate_client.detect_language(text)
+            google_lang = result.get("language", "N/A")
+            google_conf = round(result.get("confidence", 0), 3)
+        except Exception:
+            google_lang, google_conf = "Error", 0
+    else:
+        google_lang, google_conf = "Not Configured", 0
+
+    # The UI pre-fills the textbox with the default model name, so building a
+    # pipeline for every non-empty value would reload that model on each click.
+    requested_model = (hf_model_path or "").strip()
+    try:
+        if requested_model and requested_model != HF_MODEL_NAME:
+            detector = pipeline("text-classification", model=requested_model)
+        else:
+            detector = hf_lang_detector
+        hf_results = detector(text)
+        hf_label = hf_results[0]["label"].lower()
+        hf_score = round(hf_results[0]["score"], 3)
+    except Exception:
+        hf_label, hf_score = "error", 0
+
+    return (
+        f"FastText: {escape(ft_lang)} ({ft_score})<br>{format_with_flags(ft_lang)}",
+        f"Google API: {escape(google_lang)} ({google_conf})<br>{format_with_flags(google_lang)}",
+        f"HuggingFace: {escape(hf_label)} ({hf_score})<br>{format_with_flags(hf_label)}"
+    )
+# --- Gradio UI: a text input, an optional model override and three result panes ---
+with gr.Blocks() as demo:
+    gr.Markdown("## 🌍 Language Detection Comparison")
+    with gr.Row():
+        input_text = gr.TextArea(
+            label="Enter text",
+            lines=4,
+            placeholder="Type text to detect language...",
+        )
+    with gr.Row():
+        hf_model_path = gr.Textbox(
+            label="HuggingFace Model Path (optional)",
+            value="papluca/xlm-roberta-base-language-detection",
+            placeholder="e.g. papluca/xlm-roberta-base-language-detection",
+        )
+    detect_btn = gr.Button("Detect Language")
+    with gr.Row():
+        fasttext_out = gr.HTML(label="FastText")
+        google_out = gr.HTML(label="Google")
+        hf_out = gr.HTML(label="Hugging Face")
+    # A click passes both inputs to detect_languages and fills the panes in order.
+    detect_btn.click(
+        fn=detect_languages,
+        inputs=[input_text, hf_model_path],
+        outputs=[fasttext_out, google_out, hf_out],
+    )
+# Start the web server only when executed as a script, not on import.
+if __name__ == "__main__":
+    demo.launch()

models/lid.176.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:7e69ec5451bc261cc7844e49e4792a85d7f09c06789ec800fc4a44aec362764e
+size 131266198

requirements.txt ADDED Viewed

	@@ -0,0 +1,5 @@

+gradio
+fasttext
+google-cloud-translate
+transformers
+torch