Spaces:

lapnt3
/

my-gradio-app

Runtime error

my-gradio-app / i18n /language_detector.py

Nguyen Trong Lap

Recreate history without binary blobs

eeb0f9c 10 days ago

4.55 kB

	"""
	Language Detector
	Automatically detect user's language from their message
	"""

	import re
	import unicodedata
	from enum import Enum
	from typing import Optional, Tuple


	class Language(str, Enum):
	    """Supported languages.

	    Members also subclass ``str``, so each one compares equal to its
	    two-letter code (for example ``Language.VIETNAMESE == "vi"``).
	    """

	    VIETNAMESE = "vi"
	    ENGLISH = "en"


	class LanguageDetector:
	    """Detect language from user input.

	    Detection is a two-stage heuristic: any Vietnamese-specific letter
	    settles the answer immediately; otherwise the words are matched against
	    two small vocabularies and the larger count wins. Vietnamese is the
	    fallback whenever the evidence is missing or tied.
	    """

	    # Vietnamese-specific characters (lower-case; input is lower-cased
	    # before it is compared against this set)
	    VIETNAMESE_CHARS = set('àáảãạăằắẳẵặâầấẩẫậèéẻẽẹêềếểễệìíỉĩịòóỏõọôồốổỗộơờớởỡợùúủũụưừứửữựỳýỷỹỵđ')

	    # Common Vietnamese words
	    VIETNAMESE_WORDS = {
	        'tôi', 'bạn', 'của', 'và', 'có', 'là', 'được', 'không', 'này', 'cho',
	        'với', 'đã', 'sẽ', 'để', 'trong', 'một', 'những', 'các', 'như', 'khi',
	        'muốn', 'cần', 'nên', 'thì', 'hay', 'hoặc', 'nhưng', 'mà', 'vì', 'nếu',
	        'giúp', 'giảm', 'tăng', 'ăn', 'uống', 'tập', 'làm', 'biết', 'hỏi', 'nói',
	        'cảm', 'thấy', 'đau', 'khỏe', 'bệnh', 'thuốc', 'bác', 'sĩ', 'viện'
	    }

	    # Common English words
	    ENGLISH_WORDS = {
	        'i', 'you', 'the', 'a', 'an', 'is', 'are', 'was', 'were', 'have', 'has',
	        'do', 'does', 'can', 'will', 'would', 'should', 'could', 'my', 'your',
	        'want', 'need', 'help', 'how', 'what', 'when', 'where', 'why', 'who',
	        'eat', 'drink', 'exercise', 'weight', 'health', 'doctor', 'pain', 'feel'
	    }

	    @staticmethod
	    def _normalize(text: str) -> str:
	        """Return ``text`` NFC-composed and lower-cased for matching."""
	        # Decomposed input (base letter + combining marks) would otherwise
	        # miss the single-code-point entries in VIETNAMESE_CHARS, and the
	        # marks would split words apart in the word scan (a decomposed
	        # "tôi" becomes "to" + "i", and "i" counts as English).
	        return unicodedata.normalize('NFC', text).lower()

	    @staticmethod
	    def _count_known_words(normalized: str) -> Tuple[int, int]:
	        """Return (Vietnamese, English) vocabulary hits in ``normalized``."""
	        words = re.findall(r'\b\w+\b', normalized)
	        vietnamese_word_count = sum(
	            1 for word in words if word in LanguageDetector.VIETNAMESE_WORDS
	        )
	        english_word_count = sum(
	            1 for word in words if word in LanguageDetector.ENGLISH_WORDS
	        )
	        return vietnamese_word_count, english_word_count

	    @staticmethod
	    def detect(text: str) -> Language:
	        """
	        Detect language from text

	        Args:
	            text: Input text

	        Returns:
	            Detected language (vi or en). Vietnamese is returned when the
	            text is empty, shorter than two characters after stripping,
	            or gives no clear signal either way.
	        """
	        # Delegate so both entry points always agree on the language.
	        language, _ = LanguageDetector.detect_with_confidence(text)
	        return language

	    @staticmethod
	    def detect_with_confidence(text: str) -> Tuple[Language, float]:
	        """
	        Detect language with confidence score

	        Args:
	            text: Input text

	        Returns:
	            (language, confidence_score). The score is:
	            - 0.5 when falling back to Vietnamese (empty or too-short
	              text, no vocabulary hits, or a tied word count);
	            - twice the share of Vietnamese-specific letters among all
	              alphabetic characters, capped at 1.0, when any such letter
	              is present;
	            - otherwise the winning language's share of all vocabulary
	              hits.
	        """
	        if not text or len(text.strip()) < 2:
	            return Language.VIETNAMESE, 0.5

	        text_lower = LanguageDetector._normalize(text)

	        # Stage 1: a single Vietnamese-specific letter decides the language.
	        vietnamese_char_count = sum(
	            1 for char in text_lower if char in LanguageDetector.VIETNAMESE_CHARS
	        )
	        total_chars = sum(1 for char in text_lower if char.isalpha())

	        if vietnamese_char_count > 0 and total_chars > 0:
	            confidence = min(vietnamese_char_count / total_chars * 2, 1.0)
	            return Language.VIETNAMESE, confidence

	        # Stage 2: compare vocabulary hits.
	        vietnamese_word_count, english_word_count = (
	            LanguageDetector._count_known_words(text_lower)
	        )
	        total_matched = vietnamese_word_count + english_word_count

	        # Also covers text that contains no words at all.
	        if total_matched == 0:
	            return Language.VIETNAMESE, 0.5

	        if english_word_count > vietnamese_word_count:
	            return Language.ENGLISH, english_word_count / total_matched

	        # More Vietnamese hits, or a tie: a tie resolves to the Vietnamese
	        # default with confidence 0.5.
	        return Language.VIETNAMESE, vietnamese_word_count / total_matched


	def detect_language(text: str) -> Language:
	    """Convenience function to detect language.

	    Module-level shortcut that forwards to ``LanguageDetector.detect``.

	    Args:
	        text: Input text

	    Returns:
	        The value returned by ``LanguageDetector.detect(text)``
	    """
	    return LanguageDetector.detect(text)