"""
Language Detector

Automatically detect the user's language from their message.
"""

import re
import unicodedata
from enum import Enum
from typing import Optional, Tuple

# Compiled once at import time; extracts runs of Unicode word characters.
_WORD_PATTERN = re.compile(r'\b\w+\b')


class Language(str, Enum):
    """Supported languages"""
    VIETNAMESE = "vi"
    ENGLISH = "en"


class LanguageDetector:
    """Detect language from user input.

    Detection is heuristic: any Vietnamese-specific (diacritic) letter marks
    the text as Vietnamese; otherwise the text is classified by counting
    known Vietnamese and English words. Vietnamese is the default whenever
    the evidence is missing or tied.
    """

    # Vietnamese-specific characters (lowercase, precomposed). The literal is
    # passed through NFC so the set holds precomposed code points even if
    # this source file is ever saved in a decomposed form.
    VIETNAMESE_CHARS = set(unicodedata.normalize(
        'NFC',
        'àáảãạăằắẳẵặâầấẩẫậèéẻẽẹêềếểễệìíỉĩịòóỏõọôồốổỗộơờớởỡợùúủũụưừứửữựỳýỷỹỵđ',
    ))

    # Common Vietnamese words
    VIETNAMESE_WORDS = {
        'tôi', 'bạn', 'của', 'và', 'có', 'là', 'được', 'không', 'này', 'cho',
        'với', 'đã', 'sẽ', 'để', 'trong', 'một', 'những', 'các', 'như', 'khi',
        'muốn', 'cần', 'nên', 'thì', 'hay', 'hoặc', 'nhưng', 'mà', 'vì', 'nếu',
        'giúp', 'giảm', 'tăng', 'ăn', 'uống', 'tập', 'làm', 'biết', 'hỏi',
        'nói', 'cảm', 'thấy', 'đau', 'khỏe', 'bệnh', 'thuốc', 'bác', 'sĩ',
        'viện',
    }

    # Common English words
    ENGLISH_WORDS = {
        'i', 'you', 'the', 'a', 'an', 'is', 'are', 'was', 'were', 'have',
        'has', 'do', 'does', 'can', 'will', 'would', 'should', 'could', 'my',
        'your', 'want', 'need', 'help', 'how', 'what', 'when', 'where', 'why',
        'who', 'eat', 'drink', 'exercise', 'weight', 'health', 'doctor',
        'pain', 'feel',
    }

    @staticmethod
    def _prepare(text: str) -> Optional[str]:
        """Return *text* NFC-normalized and lowercased, or None if too short.

        NFC normalization matters because decomposed input (base letter plus
        combining marks) contains none of the precomposed characters in
        VIETNAMESE_CHARS, and the combining marks are not word characters,
        so a word such as "tôi" would otherwise be split into "to" and "i".

        Args:
            text: Raw user input; may be empty or None.

        Returns:
            The prepared text, or None when the input is empty or has fewer
            than two characters after stripping whitespace.
        """
        if not text:
            return None
        normalized = unicodedata.normalize('NFC', text)
        if len(normalized.strip()) < 2:
            return None
        return normalized.lower()

    @staticmethod
    def detect(text: str) -> Language:
        """
        Detect language from text

        Delegates to detect_with_confidence so both entry points always
        agree on the detected language.

        Args:
            text: Input text

        Returns:
            Detected language (vi or en)
        """
        return LanguageDetector.detect_with_confidence(text)[0]

    @staticmethod
    def detect_with_confidence(text: str) -> Tuple[Language, float]:
        """
        Detect language with confidence score

        Args:
            text: Input text

        Returns:
            (language, confidence_score). The score is 0.5 when the input is
            too short or contains no known words. When Vietnamese-specific
            characters are present it is twice their share of all letters,
            capped at 1.0. Otherwise it is the winning language's share of
            the matched words. A tie resolves to Vietnamese, the default.
        """
        prepared = LanguageDetector._prepare(text)
        if prepared is None:
            return Language.VIETNAMESE, 0.5

        # Any Vietnamese-specific character is decisive on its own.
        vietnamese_char_count = sum(
            1 for char in prepared if char in LanguageDetector.VIETNAMESE_CHARS
        )
        total_chars = sum(1 for char in prepared if char.isalpha())
        if vietnamese_char_count > 0 and total_chars > 0:
            confidence = min(vietnamese_char_count / total_chars * 2, 1.0)
            return Language.VIETNAMESE, confidence

        # No diacritics: fall back to counting known words.
        words = _WORD_PATTERN.findall(prepared)
        vietnamese_word_count = sum(
            1 for word in words if word in LanguageDetector.VIETNAMESE_WORDS
        )
        english_word_count = sum(
            1 for word in words if word in LanguageDetector.ENGLISH_WORDS
        )

        total_matched = vietnamese_word_count + english_word_count
        if total_matched == 0:
            return Language.VIETNAMESE, 0.5

        # English must strictly win; a tie keeps the Vietnamese default.
        if english_word_count > vietnamese_word_count:
            return Language.ENGLISH, english_word_count / total_matched
        return Language.VIETNAMESE, vietnamese_word_count / total_matched


def detect_language(text: str) -> Language:
    """Convenience function to detect language"""
    return LanguageDetector.detect(text)