Update app.py
Browse files
app.py
CHANGED
|
@@ -157,59 +157,19 @@ Converted Tamil script:"""
|
|
| 157 |
logger.warning(f"Tamil transliteration failed: {e}")
|
| 158 |
return text
|
| 159 |
|
| 160 |
-
def detect_language_from_query(query):
    """Detect the language of a search query.

    Checks are applied in priority order and the first match wins:

    1. Sinhala script (code points U+0D80-U+0DFF)  -> ``'si'``
    2. Tamil script (code points U+0B80-U+0BFF)    -> ``'ta'``
    3. Romanized Sinhala ("Singlish")              -> ``'singlish'``
    4. Romanized Tamil                             -> ``'romanized_tamil'``
    5. Anything else                               -> ``'en'``

    Args:
        query: The raw query text. May be empty or ``None``.

    Returns:
        One of ``'si'``, ``'ta'``, ``'singlish'``, ``'romanized_tamil'``
        or ``'en'``.
    """
    # An empty or missing query carries no signal; default to English
    # instead of letting re.search raise TypeError on None.
    if not query:
        return 'en'

    # Native-script characters are checked before the romanized detectors,
    # so a query containing any Sinhala/Tamil code point is classified by
    # script regardless of the Latin text around it.
    if re.search(r'[\u0D80-\u0DFF]', query):
        return 'si'
    if re.search(r'[\u0B80-\u0BFF]', query):
        return 'ta'

    # Romanized detection is delegated to helpers defined elsewhere in
    # the file; Sinhala is tried before Tamil.
    if contains_sinhala_roman(query):
        return 'singlish'
    if contains_tamil_roman(query):
        return 'romanized_tamil'

    # Default to English
    return 'en'
|
| 180 |
-
|
| 181 |
def preprocess_query(query, language):
|
| 182 |
"""Preprocess query with transliteration if needed"""
|
| 183 |
-
|
| 184 |
-
|
| 185 |
-
detected_language = detect_language_from_query(query)
|
| 186 |
-
logger.info(f"Auto-detected language: {detected_language} for query: {query}")
|
| 187 |
-
else:
|
| 188 |
-
detected_language = language
|
| 189 |
-
|
| 190 |
-
# Handle Singlish (Romanized Sinhala) - always transliterate to Sinhala script
|
| 191 |
-
if detected_language == 'singlish' or (language == 'si' and contains_sinhala_roman(query)):
|
| 192 |
-
logger.info(f"Transliterating Singlish to Sinhala: {query}")
|
| 193 |
transliterated = transliterate_sinhala_roman_to_sinhala(query)
|
| 194 |
logger.info(f"Transliterated to: {transliterated}")
|
| 195 |
-
return transliterated
|
| 196 |
-
|
| 197 |
-
|
| 198 |
-
elif detected_language == 'romanized_tamil' or (language == 'ta' and contains_tamil_roman(query)):
|
| 199 |
-
logger.info(f"Transliterating Romanized Tamil to Tamil: {query}")
|
| 200 |
transliterated = transliterate_tamil_roman_to_tamil(query)
|
| 201 |
logger.info(f"Transliterated to: {transliterated}")
|
| 202 |
-
return transliterated
|
| 203 |
-
|
| 204 |
-
# For proper Sinhala/Tamil Unicode, use as-is
|
| 205 |
-
elif detected_language in ['si', 'ta']:
|
| 206 |
-
logger.info(f"Using original {detected_language} text: {query}")
|
| 207 |
-
return query, detected_language
|
| 208 |
-
|
| 209 |
-
# For English, use as-is
|
| 210 |
-
else:
|
| 211 |
-
logger.info(f"Using original English text: {query}")
|
| 212 |
-
return query, 'en'
|
| 213 |
|
| 214 |
# Load dynamic metadata
|
| 215 |
def load_dynamic_metadata():
|
|
@@ -258,17 +218,14 @@ def semantic_search(query: str, top_k=1, category_filter=None, language='en'):
|
|
| 258 |
|
| 259 |
# Preprocess query with transliteration if needed
|
| 260 |
original_query = query
|
| 261 |
-
query
|
| 262 |
|
| 263 |
-
|
| 264 |
-
search_language = detected_language
|
| 265 |
-
|
| 266 |
-
pc_index = get_pinecone_index(search_language)
|
| 267 |
if not pc_index:
|
| 268 |
return []
|
| 269 |
|
| 270 |
# Use language-specific embedding model
|
| 271 |
-
model = get_embedding_model(
|
| 272 |
query_emb = model.encode(query).tolist()
|
| 273 |
|
| 274 |
# Build filter if category is specified
|
|
@@ -340,11 +297,11 @@ def semantic_search(query: str, top_k=1, category_filter=None, language='en'):
|
|
| 340 |
"costLKR": metadata.get("costLKR", "No Costing Available")
|
| 341 |
})
|
| 342 |
|
| 343 |
-
# Get language-specific data
|
| 344 |
-
title = get_language_specific_data(proposal_data, "title",
|
| 345 |
-
summary = get_language_specific_data(proposal_data, "summary",
|
| 346 |
-
costLKR = get_language_specific_data(proposal_data, "costLKR",
|
| 347 |
-
category = get_language_specific_data(proposal_data, "category",
|
| 348 |
thumb_url = metadata.get("thumbUrl", "")
|
| 349 |
|
| 350 |
# Only include documents that have meaningful content in the requested language
|
|
@@ -370,22 +327,10 @@ def semantic_search(query: str, top_k=1, category_filter=None, language='en'):
|
|
| 370 |
doc_count += 1
|
| 371 |
break
|
| 372 |
|
| 373 |
-
return
|
| 374 |
-
"results": results,
|
| 375 |
-
"original_query": original_query,
|
| 376 |
-
"processed_query": query,
|
| 377 |
-
"detected_language": search_language,
|
| 378 |
-
"transliterated": original_query != query
|
| 379 |
-
}
|
| 380 |
except Exception as e:
|
| 381 |
logger.error(f"Search error: {e}")
|
| 382 |
-
return
|
| 383 |
-
"results": [],
|
| 384 |
-
"original_query": query,
|
| 385 |
-
"processed_query": query,
|
| 386 |
-
"detected_language": language,
|
| 387 |
-
"transliterated": False
|
| 388 |
-
}
|
| 389 |
|
| 390 |
def get_all_proposals(category_filter=None, language='en'):
|
| 391 |
"""Get all budget proposals with multi-language support"""
|
|
@@ -486,28 +431,16 @@ def search_proposals():
|
|
| 486 |
if not query:
|
| 487 |
# If no query, return all proposals
|
| 488 |
results = get_all_proposals(category_filter, language)
|
| 489 |
-
return jsonify({
|
| 490 |
-
"query": query,
|
| 491 |
-
"results": results,
|
| 492 |
-
"total_results": len(results),
|
| 493 |
-
"category_filter": category_filter,
|
| 494 |
-
"language": language,
|
| 495 |
-
"detected_language": language,
|
| 496 |
-
"transliterated": False
|
| 497 |
-
})
|
| 498 |
else:
|
| 499 |
-
|
| 500 |
-
|
| 501 |
-
|
| 502 |
-
|
| 503 |
-
|
| 504 |
-
|
| 505 |
-
|
| 506 |
-
|
| 507 |
-
|
| 508 |
-
"detected_language": search_result["detected_language"],
|
| 509 |
-
"transliterated": search_result["transliterated"]
|
| 510 |
-
})
|
| 511 |
|
| 512 |
except Exception as e:
|
| 513 |
logger.error(f"API error: {e}")
|
|
@@ -525,28 +458,16 @@ def search_proposals_get():
|
|
| 525 |
if not query:
|
| 526 |
# If no query, return all proposals
|
| 527 |
results = get_all_proposals(category_filter, language)
|
| 528 |
-
return jsonify({
|
| 529 |
-
"query": query,
|
| 530 |
-
"results": results,
|
| 531 |
-
"total_results": len(results),
|
| 532 |
-
"category_filter": category_filter,
|
| 533 |
-
"language": language,
|
| 534 |
-
"detected_language": language,
|
| 535 |
-
"transliterated": False
|
| 536 |
-
})
|
| 537 |
else:
|
| 538 |
-
|
| 539 |
-
|
| 540 |
-
|
| 541 |
-
|
| 542 |
-
|
| 543 |
-
|
| 544 |
-
|
| 545 |
-
|
| 546 |
-
|
| 547 |
-
"detected_language": search_result["detected_language"],
|
| 548 |
-
"transliterated": search_result["transliterated"]
|
| 549 |
-
})
|
| 550 |
|
| 551 |
except Exception as e:
|
| 552 |
logger.error(f"API error: {e}")
|
|
|
|
| 157 |
logger.warning(f"Tamil transliteration failed: {e}")
|
| 158 |
return text
|
| 159 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 160 |
def preprocess_query(query, language):
    """Preprocess a query, transliterating romanized text if needed.

    When ``language`` is ``'si'`` and the query looks like romanized
    Sinhala, or ``language`` is ``'ta'`` and the query looks like
    romanized Tamil, the query is passed through the matching
    transliteration helper. In every other case the query is returned
    unchanged.

    Args:
        query: The raw query text. Empty or ``None`` is returned as-is.
        language: Requested language code (e.g. ``'en'``, ``'si'``, ``'ta'``).

    Returns:
        The transliterated query, or the original ``query`` when no
        transliteration applies.
    """
    # Nothing to transliterate; also avoids handing None/"" to the
    # romanization detectors.
    if not query:
        return query

    if language == 'si' and contains_sinhala_roman(query):
        logger.info("Transliterating Roman Sinhala: %s", query)
        transliterated = transliterate_sinhala_roman_to_sinhala(query)
    elif language == 'ta' and contains_tamil_roman(query):
        logger.info("Transliterating Roman Tamil: %s", query)
        transliterated = transliterate_tamil_roman_to_tamil(query)
    else:
        # Other languages, or text that is not romanized: use as-is.
        return query

    logger.info("Transliterated to: %s", transliterated)
    return transliterated
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 173 |
|
| 174 |
# Load dynamic metadata
|
| 175 |
def load_dynamic_metadata():
|
|
|
|
| 218 |
|
| 219 |
# Preprocess query with transliteration if needed
|
| 220 |
original_query = query
|
| 221 |
+
query = preprocess_query(query, language)
|
| 222 |
|
| 223 |
+
pc_index = get_pinecone_index(language)
|
|
|
|
|
|
|
|
|
|
| 224 |
if not pc_index:
|
| 225 |
return []
|
| 226 |
|
| 227 |
# Use language-specific embedding model
|
| 228 |
+
model = get_embedding_model(language)
|
| 229 |
query_emb = model.encode(query).tolist()
|
| 230 |
|
| 231 |
# Build filter if category is specified
|
|
|
|
| 297 |
"costLKR": metadata.get("costLKR", "No Costing Available")
|
| 298 |
})
|
| 299 |
|
| 300 |
+
# Get language-specific data
|
| 301 |
+
title = get_language_specific_data(proposal_data, "title", language)
|
| 302 |
+
summary = get_language_specific_data(proposal_data, "summary", language)
|
| 303 |
+
costLKR = get_language_specific_data(proposal_data, "costLKR", language)
|
| 304 |
+
category = get_language_specific_data(proposal_data, "category", language)
|
| 305 |
thumb_url = metadata.get("thumbUrl", "")
|
| 306 |
|
| 307 |
# Only include documents that have meaningful content in the requested language
|
|
|
|
| 327 |
doc_count += 1
|
| 328 |
break
|
| 329 |
|
| 330 |
+
return results
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 331 |
except Exception as e:
|
| 332 |
logger.error(f"Search error: {e}")
|
| 333 |
+
return []
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 334 |
|
| 335 |
def get_all_proposals(category_filter=None, language='en'):
|
| 336 |
"""Get all budget proposals with multi-language support"""
|
|
|
|
| 431 |
if not query:
|
| 432 |
# If no query, return all proposals
|
| 433 |
results = get_all_proposals(category_filter, language)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 434 |
else:
|
| 435 |
+
results = semantic_search(query, top_k, category_filter, language)
|
| 436 |
+
|
| 437 |
+
return jsonify({
|
| 438 |
+
"query": query,
|
| 439 |
+
"results": results,
|
| 440 |
+
"total_results": len(results),
|
| 441 |
+
"category_filter": category_filter,
|
| 442 |
+
"language": language
|
| 443 |
+
})
|
|
|
|
|
|
|
|
|
|
| 444 |
|
| 445 |
except Exception as e:
|
| 446 |
logger.error(f"API error: {e}")
|
|
|
|
| 458 |
if not query:
|
| 459 |
# If no query, return all proposals
|
| 460 |
results = get_all_proposals(category_filter, language)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 461 |
else:
|
| 462 |
+
results = semantic_search(query, top_k, category_filter, language)
|
| 463 |
+
|
| 464 |
+
return jsonify({
|
| 465 |
+
"query": query,
|
| 466 |
+
"results": results,
|
| 467 |
+
"total_results": len(results),
|
| 468 |
+
"category_filter": category_filter,
|
| 469 |
+
"language": language
|
| 470 |
+
})
|
|
|
|
|
|
|
|
|
|
| 471 |
|
| 472 |
except Exception as e:
|
| 473 |
logger.error(f"API error: {e}")
|