Update app.py
Browse files
app.py
CHANGED
|
@@ -31,6 +31,8 @@ INDEX_NAME_MULTILINGUAL = "budget-proposals-embeddinggemma" # 768 dimensions fo
|
|
| 31 |
# English: all-MiniLM-L6-v2 (better domain understanding)
|
| 32 |
# Sinhala/Tamil: EmbeddingGemma-300m (better multilingual support)
|
| 33 |
import os
|
|
|
|
|
|
|
| 34 |
from huggingface_hub import login
|
| 35 |
|
| 36 |
# Login to Hugging Face if token is available (for EmbeddingGemma)
|
|
@@ -38,6 +40,12 @@ hf_token = os.getenv('HF_TOKEN')
|
|
| 38 |
if hf_token:
|
| 39 |
login(token=hf_token)
|
| 40 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 41 |
# Load both models
|
| 42 |
embed_model_en = SentenceTransformer("all-MiniLM-L6-v2")
|
| 43 |
embed_model_multilingual = SentenceTransformer("google/embeddinggemma-300m")
|
|
@@ -49,6 +57,53 @@ def get_embedding_model(language):
|
|
| 49 |
else: # si, ta, or any other language
|
| 50 |
return embed_model_multilingual
|
| 51 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 52 |
# Load dynamic metadata
|
| 53 |
def load_dynamic_metadata():
|
| 54 |
"""Load metadata from dynamic_metadata.json"""
|
|
@@ -94,6 +149,10 @@ def semantic_search(query: str, top_k=1, category_filter=None, language='en'):
|
|
| 94 |
global DYNAMIC_METADATA
|
| 95 |
DYNAMIC_METADATA = load_dynamic_metadata()
|
| 96 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 97 |
pc_index = get_pinecone_index(language)
|
| 98 |
if not pc_index:
|
| 99 |
return []
|
|
|
|
| 31 |
# English: all-MiniLM-L6-v2 (better domain understanding)
|
| 32 |
# Sinhala/Tamil: EmbeddingGemma-300m (better multilingual support)
|
| 33 |
import os
|
| 34 |
+
import re
|
| 35 |
+
import google.generativeai as genai
|
| 36 |
from huggingface_hub import login
|
| 37 |
|
| 38 |
# Login to Hugging Face if token is available (for EmbeddingGemma)
|
|
|
|
| 40 |
if hf_token:
|
| 41 |
login(token=hf_token)
|
| 42 |
|
| 43 |
+
# Configure Gemini for transliteration
gemini_api_key = os.getenv('GEMINI_API_KEY')
# Bind the name unconditionally so that code referencing gemini_model when no
# API key is configured sees None instead of raising NameError.
gemini_model = None
if gemini_api_key:
    genai.configure(api_key=gemini_api_key)
    gemini_model = genai.GenerativeModel('gemini-2.5-flash')
|
| 48 |
+
|
| 49 |
# Load both models
|
| 50 |
embed_model_en = SentenceTransformer("all-MiniLM-L6-v2")
|
| 51 |
embed_model_multilingual = SentenceTransformer("google/embeddinggemma-300m")
|
|
|
|
| 57 |
else: # si, ta, or any other language
|
| 58 |
return embed_model_multilingual
|
| 59 |
|
| 60 |
+
def contains_sinhala_roman(text):
    """Check if text contains Roman Sinhala patterns.

    This is a permissive pre-filter, not a language detector: the first
    pattern accepts any run of two or more ASCII letters that has a vowel
    after its first letter, so ordinary English words match as well.

    Args:
        text: Text to inspect. ``None``, non-string and empty values are
            treated as "no match" instead of raising.

    Returns:
        True if any pattern matches the lower-cased text, otherwise False.
    """
    if not isinstance(text, str) or not text:
        return False

    # Common Roman Sinhala patterns
    # NOTE(review): every word in the second pattern also satisfies the
    # first one, so it looks redundant; kept to preserve matching behaviour.
    sinhala_roman_patterns = (
        r'\b[a-z]+[aeiou][a-z]*\b',  # Basic Sinhala roman patterns
        r'\b(ma|ta|ka|ga|cha|ja|da|tha|pa|ba|ya|ra|la|wa|sa|ha|na|mata|kata|gata)\b',  # Common words
    )

    # Lower-case once instead of once per pattern.
    lowered = text.lower()
    return any(re.search(pattern, lowered) for pattern in sinhala_roman_patterns)
|
| 72 |
+
|
| 73 |
+
def transliterate_sinhala_roman_to_sinhala(text):
    """Use Gemini to convert Roman Sinhala to Sinhala script.

    Best-effort: the input is returned unchanged when it is empty, when no
    Gemini API key is configured, when the Roman-Sinhala heuristic does not
    match, when the Gemini call fails, or when the reply is empty.

    Args:
        text: The query text, possibly Sinhala written in Roman letters.

    Returns:
        The stripped text of the Gemini reply, or ``text`` itself on any of
        the fallback paths above.
    """
    # Check emptiness first so None/"" never reaches the regex heuristic.
    if not text or not gemini_api_key or not contains_sinhala_roman(text):
        return text

    # NOTE(review): the caller's text is interpolated verbatim into the
    # prompt, so it can influence the instructions the model sees.
    prompt = f"""Convert this Roman Sinhala text to Sinhala script. Only convert if it's actually Sinhala words in Roman script. If it's English or other language, return as is.

Text: "{text}"

Sinhala script:"""

    try:
        response = gemini_model.generate_content(prompt)
        result = response.text.strip()
    except Exception as e:
        # Deliberately broad: transliteration is optional, so any failure
        # here falls back to searching with the original text.
        logger.warning("Transliteration failed: %s", e)
        return text

    # An empty reply is treated as "no transliteration available".
    return result or text
|
| 97 |
+
|
| 98 |
+
def preprocess_query(query, language):
    """Preprocess query with transliteration if needed.

    Only queries flagged as Sinhala (``language == 'si'``) that match the
    Roman-Sinhala heuristic are sent for transliteration; everything else is
    returned untouched.

    Args:
        query: The raw search query. ``None`` or empty is returned as-is.
        language: Language code of the request; only ``'si'`` triggers
            transliteration.

    Returns:
        The transliterated query, or the original ``query`` when no
        transliteration applies.
    """
    # Guard clause: the emptiness check keeps None/"" away from the regex
    # heuristic, which expects a string.
    if language != 'si' or not query or not contains_sinhala_roman(query):
        return query

    logger.info("Transliterating Roman Sinhala: %s", query)
    transliterated = transliterate_sinhala_roman_to_sinhala(query)
    logger.info("Transliterated to: %s", transliterated)
    return transliterated
|
| 106 |
+
|
| 107 |
# Load dynamic metadata
|
| 108 |
def load_dynamic_metadata():
|
| 109 |
"""Load metadata from dynamic_metadata.json"""
|
|
|
|
| 149 |
global DYNAMIC_METADATA
|
| 150 |
DYNAMIC_METADATA = load_dynamic_metadata()
|
| 151 |
|
| 152 |
+
# Preprocess query with transliteration if needed
|
| 153 |
+
original_query = query
|
| 154 |
+
query = preprocess_query(query, language)
|
| 155 |
+
|
| 156 |
pc_index = get_pinecone_index(language)
|
| 157 |
if not pc_index:
|
| 158 |
return []
|