Spaces:

markobinario
/

flaskbot

Sleeping

File size: 8,907 Bytes

d1e5d7d

from sentence_transformers import SentenceTransformer
import numpy as np
from typing import List, Dict, Tuple
import re

class AIChatbot:
    """FAQ chatbot that matches a user question against a static FAQ list.

    A candidate FAQ is scored by blending the cosine similarity of sentence
    embeddings with the keyword overlap between the two questions, and a
    WH-word ("what", "when", ...) check rejects answers whose question asks
    for a different kind of information than the user did.
    """

    # WH-words recognised by the intent heuristic, in priority order: when
    # the text does not start with one of them, the first entry of this
    # tuple that appears anywhere in the text wins.
    _WH_WORDS = ('who', 'where', 'when', 'what', 'how', 'why', 'which')

    def __init__(self):
        # Load the pre-trained model (can use a smaller model for more speed)
        self.model = SentenceTransformer('all-MiniLM-L6-v2')
        # Warm up the model to avoid first-request slowness
        _ = self.model.encode(["Hello, world!"])
        # Normalized question embeddings, one row per entry of ``self.faqs``
        self.faq_embeddings = None
        # List of {"id", "question", "answer"} dicts
        self.faqs = None
        self.load_faqs()

    def _embed(self, texts: List[str]) -> np.ndarray:
        """Encode ``texts`` with the model and return normalized embeddings.

        Normalization makes a plain dot product between two embeddings equal
        to their cosine similarity, which the matching code relies on.
        """
        return np.array(self.model.encode(texts, normalize_embeddings=True))

    def load_faqs(self):
        """Load static FAQs and compute their normalized embeddings"""
        # Static FAQ data
        self.faqs = [
            {"id": 1, "question": "What are the admission requirements?", "answer": "To apply for admission, you need to submit your high school diploma, transcript of records, 2x2 ID photo, and completed application form. You also need to take the entrance examination."},
            {"id": 2, "question": "When is the application deadline?", "answer": "The application deadline is usually in March for the first semester and October for the second semester. Please check our website for the exact dates."},
            {"id": 3, "question": "What courses are available?", "answer": "We offer various courses including BS Computer Science, BS Information Technology, BS Business Administration, BS Education, BS Nursing, BS Architecture, and more. Check our course catalog for the complete list."},
            {"id": 4, "question": "How much is the tuition fee?", "answer": "Tuition fees vary by program. For undergraduate programs, it ranges from ₱15,000 to ₱25,000 per semester. Please contact the registrar's office for specific program fees."},
            {"id": 5, "question": "Do you offer scholarships?", "answer": "Yes, we offer various scholarships including academic scholarships, athletic scholarships, and need-based financial aid. Applications are available at the student affairs office."},
            {"id": 6, "question": "What is the minimum GWA requirement?", "answer": "The minimum GWA requirement is 80% for most programs. Some programs may have higher requirements. Please check the specific requirements for your chosen program."},
            {"id": 7, "question": "How can I contact the admissions office?", "answer": "You can contact the admissions office at (02) 123-4567 or email admissions@psau.edu.ph. Office hours are Monday to Friday, 8:00 AM to 5:00 PM."},
            {"id": 8, "question": "Is there a dormitory available?", "answer": "Yes, we have dormitory facilities for both male and female students. Dormitory fees are separate from tuition. Please contact the housing office for availability and rates."},
            {"id": 9, "question": "What documents do I need for enrollment?", "answer": "For enrollment, you need your admission letter, original and photocopy of birth certificate, original and photocopy of high school diploma, 2x2 ID photos, and medical certificate."},
            {"id": 10, "question": "Can I transfer from another school?", "answer": "Yes, we accept transferees. You need to submit your transcript of records, honorable dismissal, and other required documents. Some credits may be credited depending on the program."}
        ]

        if self.faqs:
            # Compute and normalize embeddings for all questions
            self.faq_embeddings = self._embed([faq['question'] for faq in self.faqs])

    def save_unanswered_question(self, question):
        """Log unanswered questions to console (can be extended to save to file)"""
        print(f"Unanswered question logged: {question}")
        # In a real implementation, you could save this to a file or send to an admin

    def _tokenize(self, text: str):
        """Return the lowercase alphanumeric tokens of ``text`` longer than 2 chars.

        Short tokens ("is", "a", "to", ...) are dropped so that they do not
        inflate the keyword-overlap score. Empty or ``None`` text yields ``[]``.
        """
        if not text:
            return []
        return [t for t in re.findall(r"[a-z0-9]+", text.lower()) if len(t) > 2]

    def _overlap_ratio(self, q_tokens, faq_tokens):
        """Return the fraction of distinct question tokens found in the FAQ tokens.

        The ratio is relative to the user's question only (not symmetric);
        it is 0.0 when either token list is empty.
        """
        if not q_tokens or not faq_tokens:
            return 0.0
        q_set = set(q_tokens)
        return len(q_set & set(faq_tokens)) / len(q_set)

    def _wh_class(self, text: str) -> str:
        """Classify ``text`` by its WH-word, or return '' when it has none.

        A WH-word that is the first word of the text wins; otherwise the
        first entry of ``_WH_WORDS`` present anywhere in the text is returned.
        Matching is done on word tokens, so punctuation glued to the word
        ("why?", "when,", "what's") does not hide it.
        """
        if not text:
            return ''
        words = re.findall(r"[a-z0-9]+", text.lower())
        if not words:
            return ''
        # simple heuristic classification by leading wh-word
        if words[0] in self._WH_WORDS:
            return words[0]
        # also check presence if not leading
        present = set(words)
        for key in self._WH_WORDS:
            if key in present:
                return key
        return ''

    def find_best_match(self, question: str, threshold: float = 0.7) -> Tuple[str, float]:
        """Return the best FAQ answer for ``question`` and its combined score.

        Args:
            question: The user's question.
            threshold: Minimum combined score for the keyword-backed
                acceptance path; the purely semantic path never accepts
                below 0.7 regardless of this value.

        Returns:
            ``(answer, combined_score)`` when a FAQ is accepted, otherwise
            ``(fallback_message, combined_score)`` for the best rejected
            candidate. When no FAQs are loaded, a notice and ``0.0``.
        """
        print(f"find_best_match called with: {question}")  # Debug print
        if not self.faqs or self.faq_embeddings is None:
            return "I'm sorry, I couldn't find any FAQs in the database.", 0.0

        # Embeddings are normalized, so the dot product is cosine similarity
        question_embedding = self._embed([question])[0]
        similarities = np.dot(self.faq_embeddings, question_embedding)

        # Compute keyword overlap with each FAQ question
        q_tokens = self._tokenize(question)
        overlap_scores = np.array([
            self._overlap_ratio(q_tokens, self._tokenize(faq['question']))
            for faq in self.faqs
        ])

        # Combined score to reduce false positives
        combined = 0.7 * similarities + 0.3 * overlap_scores

        # Apply WH-word intent consistency penalty
        q_wh = self._wh_class(question)
        if q_wh:
            for i, faq in enumerate(self.faqs):
                f_wh = self._wh_class(faq['question'])
                if f_wh and f_wh != q_wh:
                    combined[i] *= 0.6  # penalize mismatched intent significantly

        best_idx = int(np.argmax(combined))
        best_semantic = float(similarities[best_idx])
        best_overlap = float(overlap_scores[best_idx])
        best_combined = float(combined[best_idx])
        best_wh = self._wh_class(self.faqs[best_idx]['question'])

        # Acceptance criteria: require good semantic OR strong combined with overlap
        accept = (
            best_semantic >= max(0.7, threshold)
            or (best_combined >= threshold and best_overlap >= 0.3)
        )
        # Enforce WH intent match when present
        if accept and q_wh and best_wh and q_wh != best_wh:
            accept = False

        if accept:
            return self.faqs[best_idx]['answer'], best_combined

        # Log as unanswered so admins can curate. Logging is deliberately
        # best-effort: a failure here must never break the user's request.
        try:
            self.save_unanswered_question(question)
        except Exception:
            pass
        fallback = (
            "Sorry, I don’t have the knowledge to answer that yet.\n"
            "I’ll notify an admin about your question and we’ll add the answer soon.\n"
            "Please come back in a while."
        )
        return (fallback, best_combined)

    def get_suggested_questions(self, question: str, num_suggestions: int = 3) -> List[str]:
        """Return up to ``num_suggestions`` FAQ questions similar to ``question``.

        Suggestions are ordered from most to least similar and only include
        FAQs whose cosine similarity exceeds 0.3. A non-positive
        ``num_suggestions`` yields an empty list.
        """
        if not self.faqs or self.faq_embeddings is None:
            return []
        # Guard explicitly: slicing with ``[-0:]`` would select *every* FAQ.
        if num_suggestions <= 0:
            return []

        # Compute and normalize embedding for the input question
        question_embedding = self._embed([question])[0]

        # Calculate cosine similarity
        similarities = np.dot(self.faq_embeddings, question_embedding)

        # Get top N similar questions, best first
        top_indices = np.argsort(similarities)[::-1][:num_suggestions]
        return [self.faqs[idx]['question'] for idx in top_indices if similarities[idx] > 0.3]

    def add_faq(self, question: str, answer: str) -> bool:
        """Add a new FAQ to the static list (for demonstration purposes)

        The FAQ list and the embedding matrix are only updated after the
        embeddings were computed successfully, so a failure leaves both
        untouched and still aligned row-for-row.

        Returns:
            True when the FAQ was added, False when an error occurred.
        """
        try:
            current = self.faqs if self.faqs is not None else []
            new_id = max(faq['id'] for faq in current) + 1 if current else 1
            new_faq = {"id": new_id, "question": question, "answer": answer}

            # Recompute embeddings *before* touching any state
            questions = [faq['question'] for faq in current] + [question]
            embeddings = self._embed(questions)

            # Commit both structures together
            current.append(new_faq)
            self.faqs = current
            self.faq_embeddings = embeddings

            print(f"FAQ added: {question}")
            return True
        except Exception as e:
            print(f"Error adding FAQ: {e}")
            return False