danulr05 committed on
Commit
5dd4551
·
verified ·
1 Parent(s): 8a5b496

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +35 -114
app.py CHANGED
@@ -157,59 +157,19 @@ Converted Tamil script:"""
157
  logger.warning(f"Tamil transliteration failed: {e}")
158
  return text
159
 
160
- def detect_language_from_query(query):
161
- """Detect language from query text"""
162
- # Check for Sinhala Unicode
163
- if re.search(r'[\u0D80-\u0DFF]', query):
164
- return 'si'
165
-
166
- # Check for Tamil Unicode
167
- if re.search(r'[\u0B80-\u0BFF]', query):
168
- return 'ta'
169
-
170
- # Check for Singlish (Romanized Sinhala)
171
- if contains_sinhala_roman(query):
172
- return 'singlish'
173
-
174
- # Check for Romanized Tamil
175
- if contains_tamil_roman(query):
176
- return 'romanized_tamil'
177
-
178
- # Default to English
179
- return 'en'
180
-
181
  def preprocess_query(query, language):
182
  """Preprocess query with transliteration if needed"""
183
- # Auto-detect language if not explicitly provided or if it's 'en' (default)
184
- if language == 'en' or not language:
185
- detected_language = detect_language_from_query(query)
186
- logger.info(f"Auto-detected language: {detected_language} for query: {query}")
187
- else:
188
- detected_language = language
189
-
190
- # Handle Singlish (Romanized Sinhala) - always transliterate to Sinhala script
191
- if detected_language == 'singlish' or (language == 'si' and contains_sinhala_roman(query)):
192
- logger.info(f"Transliterating Singlish to Sinhala: {query}")
193
  transliterated = transliterate_sinhala_roman_to_sinhala(query)
194
  logger.info(f"Transliterated to: {transliterated}")
195
- return transliterated, 'si' # Return both transliterated text and target language
196
-
197
- # Handle Romanized Tamil - always transliterate to Tamil script
198
- elif detected_language == 'romanized_tamil' or (language == 'ta' and contains_tamil_roman(query)):
199
- logger.info(f"Transliterating Romanized Tamil to Tamil: {query}")
200
  transliterated = transliterate_tamil_roman_to_tamil(query)
201
  logger.info(f"Transliterated to: {transliterated}")
202
- return transliterated, 'ta' # Return both transliterated text and target language
203
-
204
- # For proper Sinhala/Tamil Unicode, use as-is
205
- elif detected_language in ['si', 'ta']:
206
- logger.info(f"Using original {detected_language} text: {query}")
207
- return query, detected_language
208
-
209
- # For English, use as-is
210
- else:
211
- logger.info(f"Using original English text: {query}")
212
- return query, 'en'
213
 
214
  # Load dynamic metadata
215
  def load_dynamic_metadata():
@@ -258,17 +218,14 @@ def semantic_search(query: str, top_k=1, category_filter=None, language='en'):
258
 
259
  # Preprocess query with transliteration if needed
260
  original_query = query
261
- query, detected_language = preprocess_query(query, language)
262
 
263
- # Use the detected language for embedding model and index selection
264
- search_language = detected_language
265
-
266
- pc_index = get_pinecone_index(search_language)
267
  if not pc_index:
268
  return []
269
 
270
  # Use language-specific embedding model
271
- model = get_embedding_model(search_language)
272
  query_emb = model.encode(query).tolist()
273
 
274
  # Build filter if category is specified
@@ -340,11 +297,11 @@ def semantic_search(query: str, top_k=1, category_filter=None, language='en'):
340
  "costLKR": metadata.get("costLKR", "No Costing Available")
341
  })
342
 
343
- # Get language-specific data using the detected language
344
- title = get_language_specific_data(proposal_data, "title", search_language)
345
- summary = get_language_specific_data(proposal_data, "summary", search_language)
346
- costLKR = get_language_specific_data(proposal_data, "costLKR", search_language)
347
- category = get_language_specific_data(proposal_data, "category", search_language)
348
  thumb_url = metadata.get("thumbUrl", "")
349
 
350
  # Only include documents that have meaningful content in the requested language
@@ -370,22 +327,10 @@ def semantic_search(query: str, top_k=1, category_filter=None, language='en'):
370
  doc_count += 1
371
  break
372
 
373
- return {
374
- "results": results,
375
- "original_query": original_query,
376
- "processed_query": query,
377
- "detected_language": search_language,
378
- "transliterated": original_query != query
379
- }
380
  except Exception as e:
381
  logger.error(f"Search error: {e}")
382
- return {
383
- "results": [],
384
- "original_query": query,
385
- "processed_query": query,
386
- "detected_language": language,
387
- "transliterated": False
388
- }
389
 
390
  def get_all_proposals(category_filter=None, language='en'):
391
  """Get all budget proposals with multi-language support"""
@@ -486,28 +431,16 @@ def search_proposals():
486
  if not query:
487
  # If no query, return all proposals
488
  results = get_all_proposals(category_filter, language)
489
- return jsonify({
490
- "query": query,
491
- "results": results,
492
- "total_results": len(results),
493
- "category_filter": category_filter,
494
- "language": language,
495
- "detected_language": language,
496
- "transliterated": False
497
- })
498
  else:
499
- search_result = semantic_search(query, top_k, category_filter, language)
500
- return jsonify({
501
- "query": query,
502
- "original_query": search_result["original_query"],
503
- "processed_query": search_result["processed_query"],
504
- "results": search_result["results"],
505
- "total_results": len(search_result["results"]),
506
- "category_filter": category_filter,
507
- "language": language,
508
- "detected_language": search_result["detected_language"],
509
- "transliterated": search_result["transliterated"]
510
- })
511
 
512
  except Exception as e:
513
  logger.error(f"API error: {e}")
@@ -525,28 +458,16 @@ def search_proposals_get():
525
  if not query:
526
  # If no query, return all proposals
527
  results = get_all_proposals(category_filter, language)
528
- return jsonify({
529
- "query": query,
530
- "results": results,
531
- "total_results": len(results),
532
- "category_filter": category_filter,
533
- "language": language,
534
- "detected_language": language,
535
- "transliterated": False
536
- })
537
  else:
538
- search_result = semantic_search(query, top_k, category_filter, language)
539
- return jsonify({
540
- "query": query,
541
- "original_query": search_result["original_query"],
542
- "processed_query": search_result["processed_query"],
543
- "results": search_result["results"],
544
- "total_results": len(search_result["results"]),
545
- "category_filter": category_filter,
546
- "language": language,
547
- "detected_language": search_result["detected_language"],
548
- "transliterated": search_result["transliterated"]
549
- })
550
 
551
  except Exception as e:
552
  logger.error(f"API error: {e}")
 
157
  logger.warning(f"Tamil transliteration failed: {e}")
158
  return text
159
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
def preprocess_query(query, language):
    """Return the query text to search with, transliterating it when needed.

    Args:
        query: The raw query string supplied by the caller.
        language: The requested language code. Only 'si' and 'ta' can
            trigger transliteration; any other value returns the query
            unchanged.

    Returns:
        The transliterated string when the language is 'si' or 'ta' and the
        matching romanized-text check accepts the query; otherwise the
        original query.
    """
    # Sinhala: convert only when the romanized-text check accepts the query.
    # NOTE(review): the detection/transliteration helpers are defined
    # elsewhere in the file; their exact behavior is not visible here.
    if language == 'si':
        if not contains_sinhala_roman(query):
            return query
        logger.info(f"Transliterating Roman Sinhala: {query}")
        converted = transliterate_sinhala_roman_to_sinhala(query)
        logger.info(f"Transliterated to: {converted}")
        return converted

    # Tamil: same shape as the Sinhala branch, with the Tamil helpers.
    if language == 'ta':
        if not contains_tamil_roman(query):
            return query
        logger.info(f"Transliterating Roman Tamil: {query}")
        converted = transliterate_tamil_roman_to_tamil(query)
        logger.info(f"Transliterated to: {converted}")
        return converted

    # Every other language code passes the query through untouched.
    return query
 
 
 
 
 
 
 
 
 
173
 
174
  # Load dynamic metadata
175
  def load_dynamic_metadata():
 
218
 
219
  # Preprocess query with transliteration if needed
220
  original_query = query
221
+ query = preprocess_query(query, language)
222
 
223
+ pc_index = get_pinecone_index(language)
 
 
 
224
  if not pc_index:
225
  return []
226
 
227
  # Use language-specific embedding model
228
+ model = get_embedding_model(language)
229
  query_emb = model.encode(query).tolist()
230
 
231
  # Build filter if category is specified
 
297
  "costLKR": metadata.get("costLKR", "No Costing Available")
298
  })
299
 
300
+ # Get language-specific data
301
+ title = get_language_specific_data(proposal_data, "title", language)
302
+ summary = get_language_specific_data(proposal_data, "summary", language)
303
+ costLKR = get_language_specific_data(proposal_data, "costLKR", language)
304
+ category = get_language_specific_data(proposal_data, "category", language)
305
  thumb_url = metadata.get("thumbUrl", "")
306
 
307
  # Only include documents that have meaningful content in the requested language
 
327
  doc_count += 1
328
  break
329
 
330
+ return results
 
 
 
 
 
 
331
  except Exception as e:
332
  logger.error(f"Search error: {e}")
333
+ return []
 
 
 
 
 
 
334
 
335
  def get_all_proposals(category_filter=None, language='en'):
336
  """Get all budget proposals with multi-language support"""
 
431
  if not query:
432
  # If no query, return all proposals
433
  results = get_all_proposals(category_filter, language)
 
 
 
 
 
 
 
 
 
434
  else:
435
+ results = semantic_search(query, top_k, category_filter, language)
436
+
437
+ return jsonify({
438
+ "query": query,
439
+ "results": results,
440
+ "total_results": len(results),
441
+ "category_filter": category_filter,
442
+ "language": language
443
+ })
 
 
 
444
 
445
  except Exception as e:
446
  logger.error(f"API error: {e}")
 
458
  if not query:
459
  # If no query, return all proposals
460
  results = get_all_proposals(category_filter, language)
 
 
 
 
 
 
 
 
 
461
  else:
462
+ results = semantic_search(query, top_k, category_filter, language)
463
+
464
+ return jsonify({
465
+ "query": query,
466
+ "results": results,
467
+ "total_results": len(results),
468
+ "category_filter": category_filter,
469
+ "language": language
470
+ })
 
 
 
471
 
472
  except Exception as e:
473
  logger.error(f"API error: {e}")