my-gradio-app / i18n /language_detector.py
Nguyen Trong Lap
Recreate history without binary blobs
eeb0f9c
"""
Language Detector
Automatically detect user's language from their message
"""
import re
import unicodedata
from enum import Enum
from typing import Optional, Tuple
class Language(str, Enum):
    """Languages the detector can report.

    Members are also ``str`` instances, so each one compares equal to its
    two-letter code (for example ``Language.ENGLISH == "en"``).
    """

    # Two-letter code for Vietnamese.
    VIETNAMESE = "vi"
    # Two-letter code for English.
    ENGLISH = "en"
class LanguageDetector:
    """Heuristically classify user input as Vietnamese or English.

    Two signals are used, in order:

    1. Any letter from ``VIETNAMESE_CHARS`` marks the text as Vietnamese.
    2. Otherwise the words of the text are looked up in ``VIETNAMESE_WORDS``
       and ``ENGLISH_WORDS`` and the language with more hits wins.

    Vietnamese is the fallback whenever the signals are missing or tied.
    """

    # Lowercase, precomposed (NFC) letters carrying Vietnamese diacritics
    VIETNAMESE_CHARS = set('àáảãạăằắẳẵặâầấẩẫậèéẻẽẹêềếểễệìíỉĩịòóỏõọôồốổỗộơờớởỡợùúủũụưừứửữựỳýỷỹỵđ')

    # Common Vietnamese words
    VIETNAMESE_WORDS = {
        'tôi', 'bạn', 'của', 'và', 'có', 'là', 'được', 'không', 'này', 'cho',
        'với', 'đã', 'sẽ', 'để', 'trong', 'một', 'những', 'các', 'như', 'khi',
        'muốn', 'cần', 'nên', 'thì', 'hay', 'hoặc', 'nhưng', 'mà', 'vì', 'nếu',
        'giúp', 'giảm', 'tăng', 'ăn', 'uống', 'tập', 'làm', 'biết', 'hỏi', 'nói',
        'cảm', 'thấy', 'đau', 'khỏe', 'bệnh', 'thuốc', 'bác', 'sĩ', 'viện'
    }

    # Common English words
    ENGLISH_WORDS = {
        'i', 'you', 'the', 'a', 'an', 'is', 'are', 'was', 'were', 'have', 'has',
        'do', 'does', 'can', 'will', 'would', 'should', 'could', 'my', 'your',
        'want', 'need', 'help', 'how', 'what', 'when', 'where', 'why', 'who',
        'eat', 'drink', 'exercise', 'weight', 'health', 'doctor', 'pain', 'feel'
    }

    # Word tokenizer, compiled once instead of on every call
    _WORD_PATTERN = re.compile(r'\b\w+\b')

    @staticmethod
    def _normalize(text: str) -> str:
        """Return ``text`` lowercased and composed to Unicode NFC form."""
        # Decomposed input (base letter + combining marks) would never match
        # the precomposed entries of VIETNAMESE_CHARS, and ``\w`` does not
        # match combining marks, so words would be cut apart at every accent.
        return unicodedata.normalize("NFC", text.lower())

    @staticmethod
    def detect(text: str) -> Language:
        """
        Detect language from text

        Args:
            text: Input text

        Returns:
            Detected language (vi or en). Vietnamese is returned for empty or
            very short input and whenever neither language clearly wins.
        """
        # Delegating keeps both public methods in agreement on every input.
        language, _ = LanguageDetector.detect_with_confidence(text)
        return language

    @staticmethod
    def detect_with_confidence(text: str) -> Tuple[Language, float]:
        """
        Detect language with confidence score

        Args:
            text: Input text

        Returns:
            (language, confidence_score). The score is 0.5 when the result is
            only the Vietnamese fallback (empty/short input, no known words,
            or a tie between the two word lists).
        """
        fallback = (Language.VIETNAMESE, 0.5)
        if not text:
            return fallback

        normalized = LanguageDetector._normalize(text)
        if len(normalized.strip()) < 2:
            return fallback

        # Signal 1: letters with Vietnamese diacritics
        diacritic_count = sum(
            1 for char in normalized if char in LanguageDetector.VIETNAMESE_CHARS
        )
        letter_count = sum(1 for char in normalized if char.isalpha())
        if diacritic_count > 0 and letter_count > 0:
            # The ratio is doubled and capped at 1.0.
            confidence = min(diacritic_count / letter_count * 2, 1.0)
            return Language.VIETNAMESE, confidence

        # Signal 2: hits in the per-language word lists
        words = LanguageDetector._WORD_PATTERN.findall(normalized)
        vietnamese_hits = sum(
            1 for word in words if word in LanguageDetector.VIETNAMESE_WORDS
        )
        english_hits = sum(
            1 for word in words if word in LanguageDetector.ENGLISH_WORDS
        )
        total_hits = vietnamese_hits + english_hits

        # No evidence at all, or an exact tie: use the Vietnamese default.
        # A tie used to be reported as English here, contradicting detect().
        if vietnamese_hits == english_hits:
            return fallback

        if vietnamese_hits > english_hits:
            return Language.VIETNAMESE, vietnamese_hits / total_hits
        return Language.ENGLISH, english_hits / total_hits
def detect_language(text: str) -> Language:
    """Module-level shortcut for ``LanguageDetector.detect``.

    Args:
        text: Input text

    Returns:
        Whatever ``LanguageDetector.detect`` returns for ``text``.
    """
    detected = LanguageDetector.detect(text)
    return detected