Spaces:

helal94hb1
/

backend_chatbot

Sleeping

App Files Community

helal94hb1 committed on Aug 31

Commit

8b1b13a

1 Parent(s): 67e765b

fix: abbreviation AA2

Browse files

Files changed (2) hide show

app/api/v2_endpoints.py +1 -1
app/services/query_expansion_service.py +0 -197

app/api/v2_endpoints.py CHANGED Viewed

@@ -150,7 +150,7 @@ async def handle_v2_query(
     top_result_preview = None
     original_file = None
     try:
-        # --- STEP 1: PRE-PROCESSING (Direct "TPH" Replacement) ---
         original_query = request.query
         # --- EDIT: Call the new, direct replacement function ---

     top_result_preview = None
     original_file = None
     try:
+        # --- STEP 1: PRE-PROCESSING (Direct ABBREVIATION Replacement) ---
         original_query = request.query
         # --- EDIT: Call the new, direct replacement function ---

app/services/query_expansion_service.py CHANGED Viewed

@@ -16,153 +16,6 @@ from app.core.config import settings
 logger = logging.getLogger(__name__)
-def load_t5_paraphraser():
-    """
-    Loads the T5 paraphrasing model and tokenizer into the central state.
-    This should be called once on application startup.
-    """
-    if state.t5_paraphraser_loaded:
-        logger.info("T5 paraphraser model already loaded in state.")
-        return True
-    # --- MODIFIED: Switched to a reliable, public T5 paraphrasing model ---
-    model_name = getattr(settings, "T5_PARAPHRASER_MODEL_NAME", "humarin/chatgpt_paraphraser_on_T5_base")
-    logger.info(f"Loading T5 paraphraser model: {model_name}...")
-    try:
-        state.t5_tokenizer = AutoTokenizer.from_pretrained(model_name)
-        state.t5_model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
-        state.t5_model.to(state.device)
-        state.t5_model.eval()
-        state.t5_paraphraser_loaded = True
-        logger.info("T5 paraphraser model loaded successfully.")
-        return True
-    except Exception as e:
-        logger.exception(f"Failed to load T5 paraphraser model: {e}")
-        return False
-### NEW: Function to load abbreviations at startup ###
-def load_abbreviations():
-    """
-    Loads the abbreviation mapping from a CSV file into the central state.
-    """
-    if state.abbreviations_loaded:
-        logger.info("Abbreviation map already loaded in state.")
-        return True
-    file_path = settings.ABBREVIATION_FILE_PATH
-    logger.info(f"Loading abbreviation map from: {file_path}")
-    if not os.path.exists(file_path):
-        logger.error(f"Abbreviation file not found at path: {file_path}")
-        return False
-    abbreviation_map = {}
-    try:
-        with open(file_path, mode='r', encoding='utf-8') as infile:
-            reader = csv.reader(infile)
-            # Skip header row
-            next(reader, None)
-            for row in reader:
-                if len(row) >= 2:
-                    abbreviation = row[0].strip()
-                    original_text = row[1].strip()
-                    if abbreviation and original_text:
-                        # Store in lowercase for case-insensitive matching
-                        abbreviation_map[abbreviation.lower()] = original_text
-        state.abbreviation_map = abbreviation_map
-        state.abbreviations_loaded = True
-        logger.info(f"Successfully loaded {len(abbreviation_map)} abbreviations.")
-        return True
-    except Exception as e:
-        logger.exception(f"Failed to load or parse abbreviation file: {e}")
-        return False
-### NEW: Helper function to perform abbreviation expansion ###
-def _expand_with_abbreviations(query: str, abbrevation_map: Dict[str, str]) -> List[str]:
-    """
-    Generates new query variations by replacing known abbreviations.
-    """
-    expanded_queries = []
-    # Use word boundaries to match whole words only, case-insensitively
-    words = re.split(r'(\s+)', query)
-    for i, word in enumerate(words):
-        # Check the lowercased, punctuation-stripped word
-        clean_word = re.sub(r'[^\w]', '', word).lower()
-        if clean_word in abbrevation_map:
-            # Create a new query with the replacement
-            new_words = words[:]
-            new_words[i] = abbrevation_map[clean_word]
-            expanded_queries.append("".join(new_words))
-    if expanded_queries:
-        logger.info(f"Generated {len(expanded_queries)} variations from abbreviations.")
-    return expanded_queries
-### MODIFIED: Main function now combines both expansion strategies ###
-async def generate_query_variations(query: str, num_variations: int = 2) -> List[str]:
-    """
-    Uses both a local T5 model and an abbreviation map to generate paraphrases
-    and expansions of the user's query.
-    """
-    all_variations = []
-    # --- 1. T5 Paraphrasing ---
-    if state.t5_paraphraser_loaded and state.t5_model and state.t5_tokenizer:
-        try:
-            input_text = query
-            encoding = state.t5_tokenizer.encode_plus(input_text, padding="longest", return_tensors="pt")
-            input_ids, attention_mask = encoding.input_ids.to(state.device), encoding.attention_mask.to(state.device)
-            outputs = state.t5_model.generate(
-                input_ids=input_ids,
-                attention_mask=attention_mask,
-                max_length=256,
-                num_beams=10,
-                num_return_sequences=num_variations,
-                no_repeat_ngram_size=2,
-                early_stopping=True
-            )
-            t5_variations = [
-                state.t5_tokenizer.decode(seq, skip_special_tokens=True, clean_up_tokenization_spaces=True)
-                for seq in outputs
-            ]
-            all_variations.extend(t5_variations)
-            logger.info(f"Generated {len(t5_variations)} variations from T5 model.")
-        except Exception as e:
-            logger.exception("An error occurred during T5 query variation generation.")
-    else:
-        logger.warning("T5 paraphraser not loaded. Skipping AI paraphrasing.")
-    # --- 2. Abbreviation Expansion ---
-    if state.abbreviations_loaded and state.abbreviation_map:
-        try:
-            abbreviation_variations = _expand_with_abbreviations(query, state.abbreviation_map)
-            all_variations.extend(abbreviation_variations)
-        except Exception as e:
-            logger.exception("An error occurred during abbreviation expansion.")
-    else:
-        logger.warning("Abbreviation map not loaded. Skipping abbreviation expansion.")
-    # Return a unique list of variations
-    return list(set(all_variations))
-# --- NEW: Simple, direct function for TPH replacement ---
-def expand_tph_in_query(query_text: str) -> str:
-    """
-    Performs a case-insensitive, whole-word replacement of "TPH" with "Payment Hub".
-    """
-    # \b ensures we match "TPH" as a whole word, not as part of another word like "GRAPH".
-    # re.IGNORECASE makes the match case-insensitive (e.g., "tph", "Tph").
-    pattern = r'\bTPH\b'
-    replacement = "Payment Hub"
-    return re.sub(pattern, replacement, query_text, flags=re.IGNORECASE)
 def replace_abbreviations(query_text: str) -> str:
     """
@@ -199,53 +52,3 @@ def replace_abbreviations(query_text: str) -> str:
     # 6. Perform the substitution and return the result.
     return pattern.sub(get_replacement, query_text)
-# async def generate_query_variations(query: str, num_variations: int = 2) -> List[str]:
-#     """
-#     Uses a local T5 model to generate paraphrases of the user's query.
-#     Args:
-#         query (str): The original user query.
-#         num_variations (int): The number of variations to generate.
-#     Returns:
-#         List[str]: A list of paraphrased queries. Returns an empty list on failure.
-#     """
-#     if not state.t5_paraphraser_loaded or not state.t5_model or not state.t5_tokenizer:
-#         logger.error("Cannot generate query variations: T5 paraphraser is not initialized.")
-#         return []
-#     try:
-#         # --- MODIFIED: Removed the "paraphrase: " prefix as this model does not require it ---
-#         input_text = query
-#         # Tokenize the input
-#         encoding = state.t5_tokenizer.encode_plus(
-#             input_text,
-#             padding="longest",
-#             return_tensors="pt"
-#         )
-#         input_ids, attention_mask = encoding.input_ids.to(state.device), encoding.attention_mask.to(state.device)
-#         # Generate variations
-#         outputs = state.t5_model.generate(
-#             input_ids=input_ids,
-#             attention_mask=attention_mask,
-#             max_length=256,
-#             num_beams=10,
-#             num_return_sequences=num_variations,
-#             no_repeat_ngram_size=2,
-#             early_stopping=True
-#         )
-#         # Decode the generated token IDs back to strings
-#         variations = [
-#             state.t5_tokenizer.decode(generated_sequence, skip_special_tokens=True, clean_up_tokenization_spaces=True)
-#             for generated_sequence in outputs
-#         ]
-#         logger.info(f"Generated {len(variations)} variations for query.")
-#         return variations
-#     except Exception as e:
-#         logger.exception(f"An unexpected error occurred during T5 query variation generation: {e}")
-#         return []

 logger = logging.getLogger(__name__)
 def replace_abbreviations(query_text: str) -> str:
     """
     # 6. Perform the substitution and return the result.
     return pattern.sub(get_replacement, query_text)