Spaces:

minhvtt
/

ChatbotRAG

Running

File size: 7,319 Bytes

70413d7

"""
Intent Classifier for Hybrid RAG + FSM Chatbot
Detects user intent to route between scenario flows and RAG queries
"""
from typing import Dict, Optional, List
import re


class IntentClassifier:
    """
    Classify user intent using keyword matching.

    Every message is routed to one of:
    - Scenario flows (scripted conversations)
    - RAG queries (knowledge retrieval)

    Attributes:
        scenario_patterns: Maps a scenario id to the trigger phrases that
            start it. Loaded from JSON files at construction time and
            extendable through ``add_scenario_pattern``.
        general_patterns: Phrases that describe general (RAG) questions.
            ``classify`` does not read this list; it is only stored and
            extended through ``add_general_pattern``.
    """

    # Words that count as a direct answer to a scenario question.
    _CHOICE_KEYWORDS = (
        # Event recommendation choices
        'giá', 'price', 'vé', 'ticket',
        'lineup', 'line-up', 'nghệ sĩ', 'artist',
        'địa điểm', 'location', 'chỗ',
        'thời gian', 'time', 'lịch',
        # General answers
        'có', 'yes', 'ok', 'được', 'không', 'no',
        'chill', 'sôi động', 'hài', 'workshop',
        '1', '2', '3', '4', '5',  # Ratings or choices
    )

    # Words that mark a message as a question.
    _QUESTION_WORDS = (
        "gì", "sao", "thế nào", "bao nhiêu", "mấy giờ",
        "ai", "how", "what", "why",
    )

    # Whole-word matchers, compiled once. The lookarounds require that the
    # keyword is not glued to another word character, so short keywords such
    # as "ai", "no", "ok" or "how" no longer fire inside "gmail", "know",
    # "book" or "show". They are applied to already-lowercased text.
    _CHOICE_RE = re.compile(
        r"(?<!\w)(?:" + "|".join(map(re.escape, _CHOICE_KEYWORDS)) + r")(?!\w)"
    )
    _QUESTION_RE = re.compile(
        r"(?<!\w)(?:" + "|".join(map(re.escape, _QUESTION_WORDS)) + r")(?!\w)"
    )

    def __init__(self, scenarios_dir: str = "scenarios"):
        """
        Initialize with auto-loading triggers from scenario JSON files.

        Args:
            scenarios_dir: Directory containing scenario JSON files
        """
        # Auto-load scenario patterns from JSON files
        self.scenario_patterns = self._load_scenario_patterns(scenarios_dir)

        # General question patterns (RAG)
        self.general_patterns = [
            # Location
            "ở đâu", "địa điểm", "location", "where",
            "chỗ nào", "tổ chức tại",

            # Time
            "mấy giờ", "khi nào", "when", "time",
            "bao giờ", "thời gian", "ngày nào",

            # Info
            "thông tin", "info", "information",
            "chi tiết", "details", "về",

            # Parking
            "đậu xe", "parking", "gửi xe",

            # Contact
            "liên hệ", "contact", "số điện thoại",

            # Events/content
            "sự kiện", "event", "đâu", "show nào",
            "line-up", "lineup", "performer"
        ]

    def _load_scenario_patterns(self, scenarios_dir: str) -> Dict[str, List[str]]:
        """
        Auto-load triggers from all scenario JSON files.

        Loading is best-effort: a missing directory, an unreadable file,
        invalid JSON or a file with an unexpected shape is reported on stdout
        and skipped instead of raising.

        Args:
            scenarios_dir: Directory scanned (non-recursively) for ``*.json``.

        Returns:
            {"scenario_id": ["trigger1", "trigger2", ...]}
            Only string ids and non-blank string triggers are kept.
        """
        import json
        import os

        patterns: Dict[str, List[str]] = {}

        # isdir (not exists): a plain file at this path would make listdir raise.
        if not os.path.isdir(scenarios_dir):
            print(f"⚠ Scenarios directory not found: {scenarios_dir}")
            return patterns

        # Sorted so that trigger priority in classify() does not depend on
        # the arbitrary order in which the OS lists directory entries.
        for filename in sorted(os.listdir(scenarios_dir)):
            if not filename.endswith('.json'):
                continue

            filepath = os.path.join(scenarios_dir, filename)
            try:
                with open(filepath, 'r', encoding='utf-8') as f:
                    scenario = json.load(f)
            except (OSError, ValueError) as e:
                # ValueError covers both invalid JSON and invalid UTF-8.
                print(f"⚠ Error loading {filename}: {e}")
                continue

            if not isinstance(scenario, dict):
                print(f"⚠ Error loading {filename}: expected a JSON object")
                continue

            scenario_id = scenario.get('scenario_id')
            raw_triggers = scenario.get('triggers', [])
            if isinstance(raw_triggers, str):
                # A lone string would otherwise be iterated per character.
                raw_triggers = [raw_triggers]
            elif not isinstance(raw_triggers, list):
                raw_triggers = []

            # Blank triggers would match every message; non-strings cannot
            # be lowercased when matching.
            triggers = [
                trigger for trigger in raw_triggers
                if isinstance(trigger, str) and trigger.strip()
            ]

            if isinstance(scenario_id, str) and scenario_id and triggers:
                patterns[scenario_id] = triggers
                print(f"✓ Loaded triggers for: {scenario_id} ({len(triggers)} patterns)")

        return patterns

    def classify(
        self,
        message: str,
        conversation_state: Optional[Dict] = None
    ) -> str:
        """
        Classify user intent with improved mid-scenario detection.

        While a scenario is active, a message is treated as an off-topic
        question only when it looks like a question (contains "?" or a
        question word) and contains none of the known answer keywords.
        Keywords are matched as whole words, case-insensitively.

        Outside a scenario, the first scenario whose trigger phrase occurs
        in the message (case-insensitive substring) wins.

        Args:
            message: Raw user message. ``None`` is treated as empty.
            conversation_state: Optional state dict; only its
                ``'active_scenario'`` entry is read.

        Returns:
            - "scenario:{scenario_id}" - Trigger new scenario
            - "scenario:continue" - Continue active scenario
            - "rag:general" - General RAG query (no active scenario)
            - "rag:with_resume" - RAG query mid-scenario (then resume)
        """
        text = message or ""
        message_lower = text.lower().strip()

        # Check if user is in active scenario
        active_scenario = conversation_state.get('active_scenario') if conversation_state else None

        if active_scenario:
            is_valid_answer = self._CHOICE_RE.search(message_lower) is not None
            looks_like_question = (
                "?" in text
                or self._QUESTION_RE.search(message_lower) is not None
            )

            if looks_like_question and not is_valid_answer:
                print(f"🔀 Off-topic question detected: '{message}' → rag:with_resume")
                return "rag:with_resume"

            # Normal scenario continuation
            return "scenario:continue"

        # Not in scenario - check for scenario triggers
        for scenario_id, patterns in self.scenario_patterns.items():
            for pattern in patterns:
                # Patterns added at runtime are not validated, so guard here:
                # a blank pattern is a substring of everything.
                if not isinstance(pattern, str) or not pattern.strip():
                    continue
                if pattern.lower() in message_lower:
                    return f"scenario:{scenario_id}"

        # No scenario match - general RAG query
        return "rag:general"

    def _matches_any_pattern(self, message: str, patterns: List[str]) -> bool:
        """
        Check if message matches any pattern in list.

        A pattern matches when it is a case-sensitive substring of the
        message, or when it occurs as a whole word ignoring case.
        """
        return any(
            pattern in message
            or re.search(rf'\b{re.escape(pattern)}\b', message, re.IGNORECASE) is not None
            for pattern in patterns
        )

    def get_scenario_type(self, intent: str) -> Optional[str]:
        """
        Extract scenario type from intent string.

        Args:
            intent: "scenario:price_inquiry" or "scenario:continue"

        Returns:
            "price_inquiry", or None when the intent is not a scenario
            intent, is "scenario:continue", or carries no scenario id.
        """
        prefix = "scenario:"
        if not intent or not intent.startswith(prefix):
            return None

        scenario_type = intent[len(prefix):]
        if not scenario_type or scenario_type == "continue":
            return None

        return scenario_type

    def add_scenario_pattern(self, scenario_id: str, patterns: List[str]):
        """
        Dynamically add new scenario patterns.

        Args:
            scenario_id: Scenario to extend, created if unknown.
            patterns: Trigger phrases to add. The list is copied, so the
                caller's list is never mutated by later additions.
        """
        if scenario_id in self.scenario_patterns:
            self.scenario_patterns[scenario_id].extend(patterns)
        else:
            self.scenario_patterns[scenario_id] = list(patterns)

    def add_general_pattern(self, patterns: List[str]):
        """
        Dynamically add new general question patterns.

        Args:
            patterns: Phrases appended to ``general_patterns``.
        """
        self.general_patterns.extend(patterns)