danulr05 committed on
Commit
780cfe8
·
verified ·
1 Parent(s): 0810251

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +59 -0
app.py CHANGED
@@ -31,6 +31,8 @@ INDEX_NAME_MULTILINGUAL = "budget-proposals-embeddinggemma" # 768 dimensions fo
31
  # English: all-MiniLM-L6-v2 (better domain understanding)
32
  # Sinhala/Tamil: EmbeddingGemma-300m (better multilingual support)
33
  import os
 
 
34
  from huggingface_hub import login
35
 
36
  # Login to Hugging Face if token is available (for EmbeddingGemma)
@@ -38,6 +40,12 @@ hf_token = os.getenv('HF_TOKEN')
38
  if hf_token:
39
  login(token=hf_token)
40
 
 
 
 
 
 
 
41
  # Load both models
42
  embed_model_en = SentenceTransformer("all-MiniLM-L6-v2")
43
  embed_model_multilingual = SentenceTransformer("google/embeddinggemma-300m")
@@ -49,6 +57,53 @@ def get_embedding_model(language):
49
  else: # si, ta, or any other language
50
  return embed_model_multilingual
51
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
52
  # Load dynamic metadata
53
  def load_dynamic_metadata():
54
  """Load metadata from dynamic_metadata.json"""
@@ -94,6 +149,10 @@ def semantic_search(query: str, top_k=1, category_filter=None, language='en'):
94
  global DYNAMIC_METADATA
95
  DYNAMIC_METADATA = load_dynamic_metadata()
96
 
 
 
 
 
97
  pc_index = get_pinecone_index(language)
98
  if not pc_index:
99
  return []
 
31
  # English: all-MiniLM-L6-v2 (better domain understanding)
32
  # Sinhala/Tamil: EmbeddingGemma-300m (better multilingual support)
33
  import os
34
+ import re
35
+ import google.generativeai as genai
36
  from huggingface_hub import login
37
 
38
  # Login to Hugging Face if token is available (for EmbeddingGemma)
 
40
  if hf_token:
41
  login(token=hf_token)
42
 
43
# Configure Gemini for transliteration.
# gemini_model is bound to None up front so the name exists even when no
# GEMINI_API_KEY is set; it is replaced by a real model only when a key is present.
gemini_api_key = os.getenv('GEMINI_API_KEY')
gemini_model = None
if gemini_api_key:
    genai.configure(api_key=gemini_api_key)
    gemini_model = genai.GenerativeModel('gemini-2.5-flash')
48
+
49
  # Load both models
50
  embed_model_en = SentenceTransformer("all-MiniLM-L6-v2")
51
  embed_model_multilingual = SentenceTransformer("google/embeddinggemma-300m")
 
57
  else: # si, ta, or any other language
58
  return embed_model_multilingual
59
 
60
# Compiled once at import time; both patterns are only ever applied to
# lowercased text.
_SINHALA_ROMAN_PATTERNS = (
    # Basic Sinhala roman patterns: a lowercase ASCII word that has a vowel
    # somewhere after its first letter.
    re.compile(r'\b[a-z]+[aeiou][a-z]*\b'),
    # Common words
    re.compile(r'\b(ma|ta|ka|ga|cha|ja|da|tha|pa|ba|ya|ra|la|wa|sa|ha|na|mata|kata|gata)\b'),
)


def contains_sinhala_roman(text):
    """Check if text contains Roman Sinhala patterns.

    The text is lowercased and tested against each pattern in
    ``_SINHALA_ROMAN_PATTERNS``; the first hit wins.

    Args:
        text: The string to inspect. Non-string or empty input is treated as
            containing no Roman Sinhala.

    Returns:
        True if any pattern matches, otherwise False.

    NOTE(review): the first pattern also matches most ordinary English words
    (e.g. "budget"), and every word in the second pattern already satisfies
    the first, so this is effectively a "has Latin-script words" check.
    Tighten it if callers need to tell English apart from Roman Sinhala.
    """
    if not isinstance(text, str) or not text:
        return False

    lowered = text.lower()
    return any(pattern.search(lowered) for pattern in _SINHALA_ROMAN_PATTERNS)
72
+
73
def transliterate_sinhala_roman_to_sinhala(text):
    """Ask Gemini to rewrite Roman-script Sinhala in Sinhala script.

    Returns ``text`` unchanged when no Gemini API key is configured, when
    ``contains_sinhala_roman(text)`` is false, when the stripped model reply
    is empty, or when the request raises (the failure is logged as a warning).
    Otherwise returns the stripped model reply.
    """
    # Guard clauses: nothing to do without a key or without Roman-looking text.
    if not gemini_api_key:
        return text
    if not contains_sinhala_roman(text):
        return text

    try:
        prompt = (
            "Convert this Roman Sinhala text to Sinhala script. "
            "Only convert if it's actually Sinhala words in Roman script. "
            "If it's English or other language, return as is.\n"
            "\n"
            f'Text: "{text}"\n'
            "\n"
            "Sinhala script:"
        )
        reply = gemini_model.generate_content(prompt)
        converted = reply.text.strip()
    except Exception as e:
        # Best-effort: any failure falls back to the caller's original text.
        logger.warning(f"Transliteration failed: {e}")
        return text
    else:
        # An empty reply carries no usable conversion; keep the input instead.
        return converted if converted else text
97
+
98
def preprocess_query(query, language):
    """Preprocess query with transliteration if needed.

    Only queries with ``language == 'si'`` for which
    ``contains_sinhala_roman(query)`` is true are passed to
    ``transliterate_sinhala_roman_to_sinhala``; every other query is returned
    unchanged.

    Args:
        query: The raw search query.
        language: Language code of the request (e.g. 'en', 'si', 'ta').

    Returns:
        The transliterated query, or the original query when no
        transliteration was attempted or it produced no change.
    """
    if language != 'si' or not contains_sinhala_roman(query):
        return query

    logger.info("Transliterating Roman Sinhala: %s", query)
    transliterated = transliterate_sinhala_roman_to_sinhala(query)

    if transliterated == query:
        # The transliteration helper returns its input untouched when it has
        # no API key, the call fails, or the reply is empty, so do not report
        # a conversion that did not happen.
        logger.info("Transliteration left the query unchanged")
    else:
        logger.info("Transliterated to: %s", transliterated)
    return transliterated
106
+
107
  # Load dynamic metadata
108
  def load_dynamic_metadata():
109
  """Load metadata from dynamic_metadata.json"""
 
149
  global DYNAMIC_METADATA
150
  DYNAMIC_METADATA = load_dynamic_metadata()
151
 
152
+ # Preprocess query with transliteration if needed
153
+ original_query = query
154
+ query = preprocess_query(query, language)
155
+
156
  pc_index = get_pinecone_index(language)
157
  if not pc_index:
158
  return []