danulr05 committed on
Commit
14e48c5
·
verified ·
1 Parent(s): 13cda6e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +83 -56
app.py CHANGED
@@ -5,6 +5,7 @@ from pinecone import Pinecone
5
  import os
6
  import logging
7
  import json
 
8
 
9
  app = Flask(__name__)
10
  CORS(app) # Enable CORS for all routes
@@ -14,13 +15,9 @@ logging.basicConfig(level=logging.INFO)
14
  logger = logging.getLogger(__name__)
15
 
16
  # Initialize Pinecone
17
- PINECONE_API_KEY = os.getenv('PINECONE_API_KEY')
18
- if not PINECONE_API_KEY:
19
- raise ValueError("PINECONE_API_KEY environment variable is required")
20
-
21
  pc = Pinecone(api_key=PINECONE_API_KEY)
22
  # Configuration
23
- INDEX_NAME = "budget-proposals" # Use the new optimized index
24
 
25
  # Load embedding model
26
  embed_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
@@ -36,9 +33,22 @@ def load_dynamic_metadata():
36
  logger.error(f"Error loading dynamic metadata: {e}")
37
  return {}
38
 
39
- # Load dynamic metadata
40
  DYNAMIC_METADATA = load_dynamic_metadata()
41
 
 
 
 
 
 
 
 
 
 
 
 
 
 
42
  def get_pinecone_index():
43
  """Get the budget proposals Pinecone index"""
44
  try:
@@ -47,9 +57,13 @@ def get_pinecone_index():
47
  logger.error(f"Error accessing Pinecone index: {e}")
48
  return None
49
 
50
- def semantic_search(query: str, top_k=1, category_filter=None):
51
- """Perform semantic search on budget proposals - return relevant documents based on query specificity"""
52
  try:
 
 
 
 
53
  pc_index = get_pinecone_index()
54
  if not pc_index:
55
  return []
@@ -87,28 +101,24 @@ def semantic_search(query: str, top_k=1, category_filter=None):
87
  # Sort documents by their best scores
88
  sorted_docs = sorted(best_scores.items(), key=lambda x: x[1], reverse=True)
89
 
90
-
91
  # Determine how many documents to return based on query specificity
92
  max_score = sorted_docs[0][1] # Best score
93
 
94
- # Minimum threshold - if best score is too low, return no results
95
- MIN_SCORE_THRESHOLD = 0.05 # Adjust this value as needed
96
-
97
  if max_score > 0.6:
98
- # Specific query - show 1-2 documents
99
- threshold = max_score * 0.8
100
  max_docs = 2
101
  elif max_score > 0.3:
102
- # Medium query - show 2-3 documents
103
- threshold = max_score * 0.7
104
  max_docs = 3
105
- elif max_score >= MIN_SCORE_THRESHOLD: # 0.3+ range
106
  # Broad query - show 3-5 documents
107
- threshold = max_score * 0.5
108
  max_docs = 5
109
- else:
110
- # Score too low - return no results
111
- return []
112
 
113
  results = []
114
  doc_count = 0
@@ -129,10 +139,11 @@ def semantic_search(query: str, top_k=1, category_filter=None):
129
  "costLKR": metadata.get("costLKR", "No Costing Available")
130
  })
131
 
132
- title = proposal_data["title"]
133
- summary = proposal_data["summary"]
134
- costLKR = proposal_data["costLKR"]
135
- category = proposal_data["category"]
 
136
  thumb_url = metadata.get("thumbUrl", "")
137
 
138
  result = {
@@ -158,9 +169,13 @@ def semantic_search(query: str, top_k=1, category_filter=None):
158
  logger.error(f"Search error: {e}")
159
  return []
160
 
161
- def get_all_proposals(category_filter=None):
162
- """Get all budget proposals (for initial load or when no search query)"""
163
  try:
 
 
 
 
164
  pc_index = get_pinecone_index()
165
  if not pc_index:
166
  logger.warning("Pinecone index not available, returning empty list")
@@ -204,26 +219,32 @@ def get_all_proposals(category_filter=None):
204
  "costLKR": metadata.get("costLKR", "No Costing Available")
205
  })
206
 
207
- title = proposal_data["title"]
208
- summary = proposal_data["summary"]
209
- costLKR = proposal_data["costLKR"]
210
- category = proposal_data["category"]
 
211
  thumb_url = metadata.get("thumbUrl", "")
212
 
213
- result = {
214
- "title": title,
215
- "summary": summary,
216
- "costLKR": costLKR,
217
- "category": category,
218
- "pdfUrl": f"assets/pdfs/{file_path}" if file_path else "",
219
- "thumbUrl": f"assets/thumbs/{thumb_url}" if thumb_url else "",
220
- "score": 1.0, # Default score for all proposals
221
- "relevance_percentage": 100,
222
- "file_path": file_path,
223
- "id": match["id"]
224
- }
225
-
226
- results.append(result)
 
 
 
 
 
227
 
228
  return results
229
 
@@ -233,24 +254,26 @@ def get_all_proposals(category_filter=None):
233
 
234
  @app.route('/api/search', methods=['POST'])
235
  def search_proposals():
236
- """API endpoint for searching budget proposals"""
237
  try:
238
  data = request.get_json()
239
  query = data.get('query', '').strip()
240
  top_k = data.get('top_k', 10)
241
  category_filter = data.get('category_filter')
 
242
 
243
  if not query:
244
  # If no query, return all proposals
245
- results = get_all_proposals(category_filter)
246
  else:
247
- results = semantic_search(query, top_k, category_filter)
248
 
249
  return jsonify({
250
  "query": query,
251
  "results": results,
252
  "total_results": len(results),
253
- "category_filter": category_filter
 
254
  })
255
 
256
  except Exception as e:
@@ -259,23 +282,25 @@ def search_proposals():
259
 
260
  @app.route('/api/search', methods=['GET'])
261
  def search_proposals_get():
262
- """API endpoint for searching proposals (GET method)"""
263
  try:
264
  query = request.args.get('query', '').strip()
265
  top_k = int(request.args.get('top_k', 10))
266
  category_filter = request.args.get('category_filter')
 
267
 
268
  if not query:
269
  # If no query, return all proposals
270
- results = get_all_proposals(category_filter)
271
  else:
272
- results = semantic_search(query, top_k, category_filter)
273
 
274
  return jsonify({
275
  "query": query,
276
  "results": results,
277
  "total_results": len(results),
278
- "category_filter": category_filter
 
279
  })
280
 
281
  except Exception as e:
@@ -284,15 +309,17 @@ def search_proposals_get():
284
 
285
  @app.route('/api/proposals', methods=['GET'])
286
  def get_proposals():
287
- """Get all budget proposals"""
288
  try:
289
  category_filter = request.args.get('category_filter')
290
- results = get_all_proposals(category_filter)
 
291
 
292
  return jsonify({
293
  "results": results,
294
  "total_results": len(results),
295
- "category_filter": category_filter
 
296
  })
297
 
298
  except Exception as e:
 
5
  import os
6
  import logging
7
  import json
8
+ from env_config import PINECONE_API_KEY
9
 
10
  app = Flask(__name__)
11
  CORS(app) # Enable CORS for all routes
 
15
  logger = logging.getLogger(__name__)
16
 
17
  # Initialize Pinecone
 
 
 
 
18
  pc = Pinecone(api_key=PINECONE_API_KEY)
19
  # Configuration
20
+ INDEX_NAME = "budget-proposals-optimized" # Use the new optimized index
21
 
22
  # Load embedding model
23
  embed_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
 
33
  logger.error(f"Error loading dynamic metadata: {e}")
34
  return {}
35
 
36
+ # Load dynamic metadata (will be reloaded on each request)
37
  DYNAMIC_METADATA = load_dynamic_metadata()
38
 
39
+ def get_language_specific_data(proposal_data, field, language='en'):
40
+ """Get language-specific data from proposal metadata"""
41
+ # If it's the old format (single language), return as-is
42
+ if isinstance(proposal_data.get(field), str):
43
+ return proposal_data.get(field, '')
44
+
45
+ # If it's the new multi-language format, return language-specific data
46
+ if isinstance(proposal_data.get(field), dict):
47
+ return proposal_data.get(field, {}).get(language,
48
+ proposal_data.get(field, {}).get('en', ''))
49
+
50
+ return ''
51
+
52
  def get_pinecone_index():
53
  """Get the budget proposals Pinecone index"""
54
  try:
 
57
  logger.error(f"Error accessing Pinecone index: {e}")
58
  return None
59
 
60
+ def semantic_search(query: str, top_k=1, category_filter=None, language='en'):
61
+ """Perform semantic search on budget proposals with multi-language support"""
62
  try:
63
+ # Reload metadata to get latest updates
64
+ global DYNAMIC_METADATA
65
+ DYNAMIC_METADATA = load_dynamic_metadata()
66
+
67
  pc_index = get_pinecone_index()
68
  if not pc_index:
69
  return []
 
101
  # Sort documents by their best scores
102
  sorted_docs = sorted(best_scores.items(), key=lambda x: x[1], reverse=True)
103
 
 
104
  # Determine how many documents to return based on query specificity
105
  max_score = sorted_docs[0][1] # Best score
106
 
107
+ # If the best score is very high (>0.6), it's a specific query - show fewer results
108
+ # If the best score is moderate (0.3-0.6), it's a medium query - show some results
109
+ # If the best score is low (<0.3), it's a broad query - show more results
110
  if max_score > 0.6:
111
+ # Specific query - show 1-2 documents
112
+ threshold = max_score * 0.8 # Show documents within 80% of best score
113
  max_docs = 2
114
  elif max_score > 0.3:
115
+ # Medium query - show 2-3 documents
116
+ threshold = max_score * 0.7 # Show documents within 70% of best score
117
  max_docs = 3
118
+ else:
119
  # Broad query - show 3-5 documents
120
+ threshold = max_score * 0.5 # Show documents within 50% of best score
121
  max_docs = 5
 
 
 
122
 
123
  results = []
124
  doc_count = 0
 
139
  "costLKR": metadata.get("costLKR", "No Costing Available")
140
  })
141
 
142
+ # Get language-specific data
143
+ title = get_language_specific_data(proposal_data, "title", language)
144
+ summary = get_language_specific_data(proposal_data, "summary", language)
145
+ costLKR = get_language_specific_data(proposal_data, "costLKR", language)
146
+ category = get_language_specific_data(proposal_data, "category", language)
147
  thumb_url = metadata.get("thumbUrl", "")
148
 
149
  result = {
 
169
  logger.error(f"Search error: {e}")
170
  return []
171
 
172
+ def get_all_proposals(category_filter=None, language='en'):
173
+ """Get all budget proposals with multi-language support"""
174
  try:
175
+ # Reload metadata to get latest updates
176
+ global DYNAMIC_METADATA
177
+ DYNAMIC_METADATA = load_dynamic_metadata()
178
+
179
  pc_index = get_pinecone_index()
180
  if not pc_index:
181
  logger.warning("Pinecone index not available, returning empty list")
 
219
  "costLKR": metadata.get("costLKR", "No Costing Available")
220
  })
221
 
222
+ # Get language-specific data
223
+ title = get_language_specific_data(proposal_data, "title", language)
224
+ summary = get_language_specific_data(proposal_data, "summary", language)
225
+ costLKR = get_language_specific_data(proposal_data, "costLKR", language)
226
+ category = get_language_specific_data(proposal_data, "category", language)
227
  thumb_url = metadata.get("thumbUrl", "")
228
 
229
+ # Only include documents that have meaningful content in the requested language
230
+ # Skip documents where title and summary are empty or "Unknown"/"No summary available"
231
+ if (title and title.strip() and title not in ["Unknown", "Unknown Title"] and
232
+ summary and summary.strip() and summary not in ["No summary available", ""]):
233
+
234
+ result = {
235
+ "title": title,
236
+ "summary": summary,
237
+ "costLKR": costLKR,
238
+ "category": category,
239
+ "pdfUrl": f"assets/pdfs/{file_path}" if file_path else "",
240
+ "thumbUrl": f"assets/thumbs/{thumb_url}" if thumb_url else "",
241
+ "score": 1.0, # Default score for all proposals
242
+ "relevance_percentage": 100,
243
+ "file_path": file_path,
244
+ "id": match["id"]
245
+ }
246
+
247
+ results.append(result)
248
 
249
  return results
250
 
 
254
 
255
  @app.route('/api/search', methods=['POST'])
256
  def search_proposals():
257
+ """API endpoint for searching budget proposals with multi-language support"""
258
  try:
259
  data = request.get_json()
260
  query = data.get('query', '').strip()
261
  top_k = data.get('top_k', 10)
262
  category_filter = data.get('category_filter')
263
+ language = data.get('language', 'en') # Default to English
264
 
265
  if not query:
266
  # If no query, return all proposals
267
+ results = get_all_proposals(category_filter, language)
268
  else:
269
+ results = semantic_search(query, top_k, category_filter, language)
270
 
271
  return jsonify({
272
  "query": query,
273
  "results": results,
274
  "total_results": len(results),
275
+ "category_filter": category_filter,
276
+ "language": language
277
  })
278
 
279
  except Exception as e:
 
282
 
283
  @app.route('/api/search', methods=['GET'])
284
  def search_proposals_get():
285
+ """API endpoint for searching proposals (GET method) with multi-language support"""
286
  try:
287
  query = request.args.get('query', '').strip()
288
  top_k = int(request.args.get('top_k', 10))
289
  category_filter = request.args.get('category_filter')
290
+ language = request.args.get('language', 'en') # Default to English
291
 
292
  if not query:
293
  # If no query, return all proposals
294
+ results = get_all_proposals(category_filter, language)
295
  else:
296
+ results = semantic_search(query, top_k, category_filter, language)
297
 
298
  return jsonify({
299
  "query": query,
300
  "results": results,
301
  "total_results": len(results),
302
+ "category_filter": category_filter,
303
+ "language": language
304
  })
305
 
306
  except Exception as e:
 
309
 
310
  @app.route('/api/proposals', methods=['GET'])
311
  def get_proposals():
312
+ """Get all budget proposals with multi-language support"""
313
  try:
314
  category_filter = request.args.get('category_filter')
315
+ language = request.args.get('language', 'en') # Default to English
316
+ results = get_all_proposals(category_filter, language)
317
 
318
  return jsonify({
319
  "results": results,
320
  "total_results": len(results),
321
+ "category_filter": category_filter,
322
+ "language": language
323
  })
324
 
325
  except Exception as e: