Spaces:
Runtime error
Runtime error
| """ | |
| Language Detector | |
| Automatically detect user's language from their message | |
| """ | |
import re
import unicodedata
from enum import Enum
from typing import Optional, Tuple
class Language(str, Enum):
    """Language codes the detector can report.

    Members subclass ``str``, so each one compares equal to its
    two-letter code (for example ``Language.ENGLISH == "en"``).
    """

    # Two-letter code used for Vietnamese.
    VIETNAMESE = "vi"
    # Two-letter code used for English.
    ENGLISH = "en"
class LanguageDetector:
    """Heuristic Vietnamese/English detector for user input.

    Detection works in two stages. If the text contains any letter from
    ``VIETNAMESE_CHARS`` it is classified as Vietnamese immediately.
    Otherwise its words are looked up in ``VIETNAMESE_WORDS`` and
    ``ENGLISH_WORDS`` and the language with more hits wins. Vietnamese is
    the fallback whenever the input is too short, nothing matches, or the
    two word counts are tied.

    All methods are static: they can be called on the class or on an
    instance and keep no state.
    """

    # Vietnamese-specific characters (lowercase, precomposed form)
    VIETNAMESE_CHARS = set('àáảãạăằắẳẵặâầấẩẫậèéẻẽẹêềếểễệìíỉĩịòóỏõọôồốổỗộơờớởỡợùúủũụưừứửữựỳýỷỹỵđ')

    # Common Vietnamese words
    VIETNAMESE_WORDS = {
        'tôi', 'bạn', 'của', 'và', 'có', 'là', 'được', 'không', 'này', 'cho',
        'với', 'đã', 'sẽ', 'để', 'trong', 'một', 'những', 'các', 'như', 'khi',
        'muốn', 'cần', 'nên', 'thì', 'hay', 'hoặc', 'nhưng', 'mà', 'vì', 'nếu',
        'giúp', 'giảm', 'tăng', 'ăn', 'uống', 'tập', 'làm', 'biết', 'hỏi', 'nói',
        'cảm', 'thấy', 'đau', 'khỏe', 'bệnh', 'thuốc', 'bác', 'sĩ', 'viện'
    }

    # Common English words
    ENGLISH_WORDS = {
        'i', 'you', 'the', 'a', 'an', 'is', 'are', 'was', 'were', 'have', 'has',
        'do', 'does', 'can', 'will', 'would', 'should', 'could', 'my', 'your',
        'want', 'need', 'help', 'how', 'what', 'when', 'where', 'why', 'who',
        'eat', 'drink', 'exercise', 'weight', 'health', 'doctor', 'pain', 'feel'
    }

    @staticmethod
    def _normalize(text: str) -> str:
        """Return *text* composed to Unicode NFC and lowercased.

        Composing first matters: in decomposed input the diacritics are
        separate combining marks, which neither match the precomposed
        letters in ``VIETNAMESE_CHARS`` nor count as ``\\w`` in the word
        regex (so words would be split at every accent).
        """
        return unicodedata.normalize("NFC", text).lower()

    @staticmethod
    def _count_known_words(normalized_text: str) -> Tuple[int, int]:
        """Count word-list hits in already-normalized text.

        Returns:
            (vietnamese_word_count, english_word_count)
        """
        words = re.findall(r'\b\w+\b', normalized_text)
        vietnamese_word_count = sum(
            1 for word in words if word in LanguageDetector.VIETNAMESE_WORDS
        )
        english_word_count = sum(
            1 for word in words if word in LanguageDetector.ENGLISH_WORDS
        )
        return vietnamese_word_count, english_word_count

    @staticmethod
    def detect(text: str) -> Language:
        """
        Detect language from text

        Args:
            text: Input text

        Returns:
            Detected language (vi or en). Vietnamese is returned for empty
            or very short input, when no known word is found, and when the
            Vietnamese and English word counts are equal.
        """
        # Fewer than two non-blank characters is not enough to decide.
        if not text or len(text.strip()) < 2:
            return Language.VIETNAMESE  # Default

        normalized = LanguageDetector._normalize(text)

        # A single Vietnamese-specific letter settles it.
        if any(char in LanguageDetector.VIETNAMESE_CHARS for char in normalized):
            return Language.VIETNAMESE

        vietnamese_word_count, english_word_count = (
            LanguageDetector._count_known_words(normalized)
        )

        # English must strictly win; ties and no matches default to Vietnamese.
        if english_word_count > vietnamese_word_count:
            return Language.ENGLISH
        return Language.VIETNAMESE

    @staticmethod
    def detect_with_confidence(text: str) -> Tuple[Language, float]:
        """
        Detect language with confidence score

        Args:
            text: Input text

        Returns:
            (language, confidence_score). The language follows the same
            rules as ``detect``. The score is 0.5 whenever the default is
            used (too-short input, no known words, or a tie); otherwise it
            is either twice the share of Vietnamese-specific letters among
            all letters (capped at 1.0) or the winning language's share of
            the matched words.
        """
        if not text or len(text.strip()) < 2:
            return Language.VIETNAMESE, 0.5

        normalized = LanguageDetector._normalize(text)

        # Stage 1: Vietnamese-specific letters.
        vietnamese_char_count = sum(
            1 for char in normalized if char in LanguageDetector.VIETNAMESE_CHARS
        )
        total_chars = sum(1 for char in normalized if char.isalpha())
        if vietnamese_char_count > 0 and total_chars > 0:
            confidence = min(vietnamese_char_count / total_chars * 2, 1.0)
            return Language.VIETNAMESE, confidence

        # Stage 2: known-word counts.
        vietnamese_word_count, english_word_count = (
            LanguageDetector._count_known_words(normalized)
        )
        total_matched = vietnamese_word_count + english_word_count
        if total_matched == 0:
            return Language.VIETNAMESE, 0.5

        if english_word_count > vietnamese_word_count:
            return Language.ENGLISH, english_word_count / total_matched

        # Vietnamese wins outright, or the counts are tied (share == 0.5),
        # in which case the default language is reported, matching detect().
        return Language.VIETNAMESE, vietnamese_word_count / total_matched
def detect_language(text: str) -> Language:
    """Return the language detected for *text*.

    Module-level shortcut that forwards the call to
    ``LanguageDetector.detect`` and hands back its result unchanged.
    """
    detected = LanguageDetector.detect(text)
    return detected