Commit bea2de8
Parent(s): 8b1b13a
fix: new embeddings and reranker
Files changed:
- app/api/v2_endpoints.py (+77 -8)
- app/core/config.py (+5 -2)
- app/services/llm_service.py (+2 -2)
- app/services/reranker_service.py (+2 -1)
- app/services/retrieval.py (+4 -4)
app/api/v2_endpoints.py CHANGED

@@ -29,13 +29,53 @@ from app.services import query_expansion_service
 from app.core import state
 
 logger = logging.getLogger(__name__)
-logger.setLevel(logging.DEBUG)
 router = APIRouter()
 
 # --- Constants ---
 CONTEXT_CHUNK_COUNT = 100
 # --- MODIFIED: TOTAL_RETRIEVAL_COUNT is now the number of candidates for the re-ranker ---
-RERANK_CANDIDATE_COUNT =
+RERANK_CANDIDATE_COUNT = 100
+
+def dynamic_top_k_selection(
+    reranked_docs: List[Dict[str, Any]],
+    k_min: int = 3,
+    k_max: int = 15,
+    fall_off_threshold: float = 1.0  # Start with a threshold of 1.0 logit score drop
+) -> List[Dict[str, Any]]:
+    """
+    Selects a dynamic number of documents based on score fall-off.
+    """
+    if not reranked_docs:
+        return []
+
+    if len(reranked_docs) <= k_min:
+        return reranked_docs
+
+    scores = np.array([doc.get('rerank_score', -float('inf')) for doc in reranked_docs])
+    score_diffs = np.diff(scores) * -1  # Make differences positive as scores are descending
+
+    elbow_index = -1
+    # Start searching for a large fall-off after the k_min-th document
+    for i in range(k_min - 1, len(score_diffs)):
+        if score_diffs[i] > fall_off_threshold:
+            # The drop is after this document, so we take up to and including this one.
+            elbow_index = i + 1
+            break
+
+    if elbow_index != -1:
+        # We found a significant drop
+        final_k = elbow_index
+    else:
+        # No significant drop found, take the max allowed
+        final_k = k_max
+
+    # Ensure final_k is within the [k_min, k_max] bounds and also within list size
+    final_k = min(max(final_k, k_min), k_max, len(reranked_docs))
+
+    logger.info(f"Dynamic K selection: Found elbow at index {elbow_index}. "
+                f"Selected final K of {final_k} from {len(reranked_docs)} candidates.")
+
+    return reranked_docs[:final_k]
 
 # --- Startup Event (Loads data into state) ---
 @router.on_event("startup")
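A minimal standalone sketch of how the new fall-off selection behaves, assuming the re-ranker returns raw logit scores sorted in descending order (the sample values below are invented for illustration):

import numpy as np  # dynamic_top_k_selection relies on np.diff over the scores

# Hypothetical re-ranker output, already sorted descending.
docs = [{"id": str(i), "rerank_score": s}
        for i, s in enumerate([7.9, 7.4, 7.1, 6.9, 3.2, 3.0, 2.8])]

# Positive gaps between neighbours: [0.5, 0.3, 0.2, 3.7, 0.2, 0.2].
# The first gap above fall_off_threshold=1.0 sits after the 4th document,
# so elbow_index = 4 and the top 4 documents are kept (within [k_min, k_max]).
selected = dynamic_top_k_selection(docs, k_min=3, k_max=15, fall_off_threshold=1.0)
print([d["id"] for d in selected])  # ['0', '1', '2', '3']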
@@ -172,10 +212,23 @@ async def handle_v2_query(
     retrieved_scores_float = [float(score) for chunk_id, score in search_results]
 
     candidate_chunks = []
+    missing_chunk_count = 0
     for chunk_id, initial_score in search_results:
         chunk_text = state.chunk_content_map.get(str(chunk_id))
         if chunk_text:
             candidate_chunks.append({"id": str(chunk_id), "text": chunk_text})
+        else:
+            missing_chunk_count += 1
+            logger.warning(
+                f"Data consistency warning: Retrieved chunk_id '{chunk_id}' "
+                f"not found in the in-memory chunk_content_map."
+            )
+
+    # --- LOGGING POINT 2: After building the candidate list ---
+    logger.debug(
+        f"Successfully built {len(candidate_chunks)} candidate chunks for re-ranking. "
+        f"{missing_chunk_count} chunks were dropped due to missing text content."
+    )
 
     # --- MODIFIED: Offload the blocking re-ranker function to a threadpool ---
     reranked_chunks = await run_in_threadpool(
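For reference, the list handed to the re-ranker is plain id/text dicts, with ids coerced to strings so they match the chunk_content_map keys; a minimal illustration of the expected shape (values invented):

candidate_chunks = [
    {"id": "1042", "text": "First candidate chunk body..."},
    {"id": "2211", "text": "Second candidate chunk body..."},
]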
@@ -186,12 +239,18 @@ async def handle_v2_query(
     )
 
     if reranked_chunks:
-        score_threshold = settings.RERANKER_SCORE_THRESHOLD
-        filtered_chunks = [c for c in reranked_chunks if c['rerank_score'] > score_threshold]
+        filtered_chunks = dynamic_top_k_selection(
+            reranked_docs=reranked_chunks,
+            k_min=settings.RERANKER_K_MIN,  # e.g., 3
+            k_max=settings.RERANKER_K_MAX,  # e.g., 100
+            fall_off_threshold=settings.RERANKER_FALLOFF_THRESHOLD  # e.g., 1.0
+        )
+        # score_threshold = settings.RERANKER_SCORE_THRESHOLD
+        # filtered_chunks = [c for c in reranked_chunks if c['rerank_score'] > score_threshold]
 
-        if not filtered_chunks:
-            logger.warning(f"No chunks met the score threshold of {score_threshold}. Using only the top-ranked chunk.")
-            filtered_chunks = reranked_chunks[:1]
+        # if not filtered_chunks:
+        #     logger.warning(f"No chunks met the score threshold of {score_threshold}. Using only the top-ranked chunk.")
+        #     filtered_chunks = reranked_chunks[:1]
 
         # --- MODIFIED: Offload the blocking sequence organization to a threadpool ---
         organized_chunks = await run_in_threadpool(
@@ -213,7 +272,17 @@ async def handle_v2_query(
         else:
             llm_answer = "I found relevant documents, but could not construct an answer."
 
-
+        top_result_preview = None
+        if reranked_chunks:
+            top_chunk = reranked_chunks[0]
+            top_metadata = state.chunk_metadata_map.get(top_chunk['id'], {})
+            top_result_preview = schemas.TopResultPreview(
+                id=top_chunk['id'],
+                score=float(top_chunk['rerank_score']),
+                content_preview=top_chunk['text'][:150],
+                original_file=top_metadata.get('original_file')
+            )
+
 
     else:
         llm_answer = "Could not re-rank the search results."
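The response now carries a preview of the top re-ranked hit. Only the call site appears in this diff; a minimal Pydantic model consistent with the fields used there would look like the sketch below (the real definition lives in the app's schemas module and may differ):

from typing import Optional
from pydantic import BaseModel

class TopResultPreview(BaseModel):
    # Sketch inferred from the call site above, not the actual definition.
    id: str
    score: float
    content_preview: str
    original_file: Optional[str] = None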
app/core/config.py CHANGED

@@ -26,8 +26,11 @@ class Settings(BaseSettings):
     S3_RERANKER_URL: Optional[str] = None
 
     RERANKER_MODEL_PATH: str = "data/best_expert_judge_cross_encoder.pt"  # Or the exact name of your saved .pt file
-    RERANKER_MODEL_NAME: str = "
-    RERANKER_SCORE_THRESHOLD: float = 0.
+    RERANKER_MODEL_NAME: str = "mixedbread-ai/mxbai-rerank-base-v2"  # The base model used in your training script
+    RERANKER_SCORE_THRESHOLD: float = 0.0
+    RERANKER_K_MIN: int = 5
+    RERANKER_FALLOFF_THRESHOLD: int = 1
+    RERANKER_K_MAX: int = 100
 
     SEQUENCE_EXPANSION_THRESHOLD: float =0.68
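Because Settings extends pydantic's BaseSettings, the new re-ranker knobs can presumably be tuned per deployment without code changes, either through matching environment variables or constructor keyword overrides. A sketch, assuming the default settings sources and no custom env prefix:

from app.core.config import Settings

# Hypothetical experiment: widen the elbow search and demand a sharper drop.
settings = Settings(RERANKER_K_MIN=3, RERANKER_K_MAX=50, RERANKER_FALLOFF_THRESHOLD=2)
print(settings.RERANKER_MODEL_NAME)  # mixedbread-ai/mxbai-rerank-base-v2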
app/services/llm_service.py CHANGED

@@ -116,7 +116,7 @@ async def generate_answer(query: str, context_used: str) -> str:
         answer = "⚠️ LLM returned an empty response."
     else:
         logger.info("LLM query-based response generated.")
-        logger.
-        logger.
+        logger.info(f"LLM Request preview: {messages}")
+        logger.info(f"LLM Response preview: {answer}...")
 
     return answer
app/services/reranker_service.py CHANGED

@@ -97,7 +97,8 @@ def rerank_chunks(query: str, chunks: List[Dict], metadata_map: Dict) -> List[Dict]:
     encoded = {k: v.to(state.device) for k, v in encoded.items()}
 
     logits = model(input_ids=encoded['input_ids'], attention_mask=encoded['attention_mask'])
-    scores = torch.sigmoid(logits).squeeze().cpu().numpy()
+    # scores = torch.sigmoid(logits).squeeze().cpu().numpy()
+    scores = logits.squeeze().cpu().numpy()
 
     # Add the new, more accurate score to each chunk dictionary
     if len(chunks) == 1:
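Dropping the sigmoid matters for the new elbow logic: dynamic_top_k_selection thresholds on absolute score gaps, and a sigmoid squashes well-separated logits into near-identical probabilities close to 1, hiding the fall-off. A quick illustration (logit values invented):

import numpy as np

logits = np.array([8.0, 6.5, 2.0])   # raw cross-encoder outputs
probs = 1 / (1 + np.exp(-logits))    # [0.9997, 0.9985, 0.8808]

print(np.diff(logits) * -1)  # [1.5 4.5]        -> clear drop above a 1.0 threshold
print(np.diff(probs) * -1)   # [0.0012 0.1177]  -> gaps collapse near 1.0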
app/services/retrieval.py CHANGED

@@ -129,8 +129,8 @@ def find_top_gnn_chunks(query_text: str, top_n: int = 100) -> List[Tuple[str, float]]:
     logger.info(f"Similarity search completed in {duration:.4f} seconds.")
 
     # --- MODIFIED: Detailed logging to include original_file ---
-    top_results_to_log = results[:
-    logger.
+    top_results_to_log = results[:200]
+    logger.info("--- Top 30 Retrieved Chunks (from retrieval service) ---")
     for i, (chunk_id, score) in enumerate(top_results_to_log):
         # Look up metadata for the current chunk_id
         chunk_id_str = str(chunk_id)  # Ensure the key is a string for lookup

@@ -138,8 +138,8 @@ def find_top_gnn_chunks(query_text: str, top_n: int = 100) -> List[Tuple[str, float]]:
         original_file = metadata.get('original_file', 'File not found')
 
         # Updated log message to include the original file
-        logger.
-        logger.
+        logger.info(f" {i+1}. Chunk ID: {chunk_id_str} | File: {original_file} | Score: {score:.4f}")
+    logger.info("----------------------------------------------------------------")
     # --- END OF MODIFICATION ---
 
     return results[:top_n]
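Each retrieved candidate is now logged with its source file alongside the similarity score; a representative line in the new format (chunk id, file name, and score invented) looks like:

 1. Chunk ID: 1042 | File: report_2021.pdf | Score: 0.8731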