File size: 8,907 Bytes
d1e5d7d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
from sentence_transformers import SentenceTransformer
import numpy as np
from typing import List, Dict, Tuple
import re

class AIChatbot:
    """FAQ chatbot that maps a user question to one of a fixed set of answers.

    A question is scored against every stored FAQ question using a blend of
    sentence-embedding similarity and keyword overlap, followed by a WH-word
    ("who", "what", "how", ...) intent-consistency check.
    """

    # WH-words recognised by the intent heuristic. When several appear in the
    # middle of a sentence, the first one in THIS tuple wins (not the first
    # one in the sentence).
    _WH_WORDS = ('who', 'where', 'when', 'what', 'how', 'why', 'which')

    # Reply returned when no FAQ is a confident match.
    _FALLBACK_ANSWER = (
        "Sorry, I don’t have the knowledge to answer that yet.\n"
        "I’ll notify an admin about your question and we’ll add the answer soon.\n"
        "Please come back in a while."
    )

    def __init__(self, model_name: str = 'all-MiniLM-L6-v2'):
        """Load the embedding model, warm it up and embed the static FAQs.

        Args:
            model_name: Name of the SentenceTransformer model to load.
                Defaults to the model the chatbot has always used.
        """
        # Load the pre-trained model (can use a smaller model for more speed)
        self.model = SentenceTransformer(model_name)
        # Warm up the model to avoid first-request slowness
        _ = self.model.encode(["Hello, world!"])
        self.faq_embeddings = None  # one normalized embedding row per FAQ
        self.faqs = None  # list of {"id", "question", "answer"} dicts
        self.load_faqs()

    def load_faqs(self):
        """Load static FAQs and compute their normalized embeddings"""
        # Static FAQ data
        self.faqs = [
            {"id": 1, "question": "What are the admission requirements?", "answer": "To apply for admission, you need to submit your high school diploma, transcript of records, 2x2 ID photo, and completed application form. You also need to take the entrance examination."},
            {"id": 2, "question": "When is the application deadline?", "answer": "The application deadline is usually in March for the first semester and October for the second semester. Please check our website for the exact dates."},
            {"id": 3, "question": "What courses are available?", "answer": "We offer various courses including BS Computer Science, BS Information Technology, BS Business Administration, BS Education, BS Nursing, BS Architecture, and more. Check our course catalog for the complete list."},
            {"id": 4, "question": "How much is the tuition fee?", "answer": "Tuition fees vary by program. For undergraduate programs, it ranges from ₱15,000 to ₱25,000 per semester. Please contact the registrar's office for specific program fees."},
            {"id": 5, "question": "Do you offer scholarships?", "answer": "Yes, we offer various scholarships including academic scholarships, athletic scholarships, and need-based financial aid. Applications are available at the student affairs office."},
            {"id": 6, "question": "What is the minimum GWA requirement?", "answer": "The minimum GWA requirement is 80% for most programs. Some programs may have higher requirements. Please check the specific requirements for your chosen program."},
            {"id": 7, "question": "How can I contact the admissions office?", "answer": "You can contact the admissions office at (02) 123-4567 or email admissions@psau.edu.ph. Office hours are Monday to Friday, 8:00 AM to 5:00 PM."},
            {"id": 8, "question": "Is there a dormitory available?", "answer": "Yes, we have dormitory facilities for both male and female students. Dormitory fees are separate from tuition. Please contact the housing office for availability and rates."},
            {"id": 9, "question": "What documents do I need for enrollment?", "answer": "For enrollment, you need your admission letter, original and photocopy of birth certificate, original and photocopy of high school diploma, 2x2 ID photos, and medical certificate."},
            {"id": 10, "question": "Can I transfer from another school?", "answer": "Yes, we accept transferees. You need to submit your transcript of records, honorable dismissal, and other required documents. Some credits may be credited depending on the program."},
        ]

        if self.faqs:
            # Compute and normalize embeddings for all questions
            self.faq_embeddings = self._encode_questions(self.faqs)

    def _encode_questions(self, faqs):
        """Return an array with one normalized embedding row per FAQ question."""
        questions = [faq['question'] for faq in faqs]
        return np.array(self.model.encode(questions, normalize_embeddings=True))

    def _semantic_similarities(self, question: str):
        """Return the similarity of ``question`` to every stored FAQ question.

        Both sides are encoded with ``normalize_embeddings=True``, so the dot
        product is the cosine similarity.
        """
        question_embedding = self.model.encode([question], normalize_embeddings=True)[0]
        return np.asarray(np.dot(self.faq_embeddings, question_embedding))

    def save_unanswered_question(self, question):
        """Log unanswered questions to console (can be extended to save to file)"""
        print(f"Unanswered question logged: {question}")
        # In a real implementation, you could save this to a file or send to an admin

    def _tokenize(self, text: str):
        """Lower-case ``text`` and return its ASCII alphanumeric tokens longer than 2 chars."""
        if not text:
            return []
        return [tok for tok in re.findall(r"[a-z0-9]+", text.lower()) if len(tok) > 2]

    def _overlap_ratio(self, q_tokens, faq_tokens):
        """Return the fraction of distinct query tokens that also occur in the FAQ.

        Returns 0.0 when either token list is empty.
        """
        if not q_tokens or not faq_tokens:
            return 0.0
        q_set = set(q_tokens)
        shared = q_set & set(faq_tokens)
        return len(shared) / max(len(q_set), 1)

    def _wh_class(self, text: str) -> str:
        """Classify ``text`` by its WH-word, or return '' when none is found.

        A WH-word that starts the text (followed by a space or '?') takes
        precedence; otherwise the text is searched for a space-delimited
        WH-word, trying the words in ``_WH_WORDS`` order.
        """
        if not text:
            return ''
        s = text.strip().lower()
        # simple heuristic classification by leading wh-word
        for key in self._WH_WORDS:
            if s.startswith((key + ' ', key + '?')):
                return key
        # also check presence if not leading
        padded = f' {s} '
        for key in self._WH_WORDS:
            if f' {key} ' in padded:
                return key
        return ''

    def find_best_match(self, question: str, threshold: float = 0.7) -> Tuple[str, float]:
        """Return the best FAQ answer for ``question`` together with its score.

        Args:
            question: The user's question.
            threshold: Minimum score for accepting a match. The pure semantic
                path never accepts below 0.7, even if ``threshold`` is lower.

        Returns:
            ``(answer, combined_score)`` when a FAQ is accepted, otherwise a
            fallback message with the best (rejected) combined score. A blank
            or non-string question, or an empty FAQ store, yields a score of
            0.0 without consulting the model.
        """
        print(f"find_best_match called with: {question}")  # Debug print
        if not self.faqs or self.faq_embeddings is None:
            return "I'm sorry, I couldn't find any FAQs in the database.", 0.0
        # Nothing to match: skip the model call and do not log an empty
        # "unanswered question" for the admins.
        if not isinstance(question, str) or not question.strip():
            return "Please enter a question.", 0.0

        similarities = self._semantic_similarities(question)

        # Compute keyword overlap with each FAQ question
        q_tokens = self._tokenize(question)
        overlap_scores = np.array([
            self._overlap_ratio(q_tokens, self._tokenize(faq['question']))
            for faq in self.faqs
        ])

        # Combined score to reduce false positives
        combined = 0.7 * similarities + 0.3 * overlap_scores

        # Apply WH-word intent consistency penalty
        q_wh = self._wh_class(question)
        faq_wh = [self._wh_class(faq['question']) for faq in self.faqs]
        if q_wh:
            mismatched = np.array([bool(wh) and wh != q_wh for wh in faq_wh])
            combined[mismatched] *= 0.6  # penalize mismatched intent significantly

        best_idx = int(np.argmax(combined))
        best_semantic = float(similarities[best_idx])
        best_overlap = float(overlap_scores[best_idx])
        best_combined = float(combined[best_idx])
        best_wh = faq_wh[best_idx]

        # Acceptance criteria: require good semantic OR strong combined with overlap
        accept = (
            best_semantic >= max(0.7, threshold)
            or (best_combined >= threshold and best_overlap >= 0.3)
        )
        # Enforce WH intent match when present
        if accept and q_wh and best_wh and q_wh != best_wh:
            accept = False

        if accept:
            return self.faqs[best_idx]['answer'], best_combined

        # Log as unanswered so admins can curate. Logging is best-effort: a
        # failure is reported but must never prevent the user getting a reply.
        try:
            self.save_unanswered_question(question)
        except Exception as exc:
            print(f"Could not log unanswered question: {exc}")
        return self._FALLBACK_ANSWER, best_combined

    def get_suggested_questions(self, question: str, num_suggestions: int = 3,
                                min_similarity: float = 0.3) -> List[str]:
        """Get suggested questions based on the input question

        Args:
            question: The user's question.
            num_suggestions: Maximum number of suggestions; values <= 0 yield
                an empty list.
            min_similarity: Suggestions must score strictly above this cosine
                similarity.

        Returns:
            Up to ``num_suggestions`` FAQ questions, most similar first.
        """
        if not self.faqs or self.faq_embeddings is None:
            return []
        # Guard explicitly: slicing with [-0:] would select EVERY FAQ.
        if num_suggestions <= 0:
            return []
        if not isinstance(question, str) or not question.strip():
            return []

        # Calculate cosine similarity
        similarities = self._semantic_similarities(question)

        # Get top N similar questions
        top_indices = np.argsort(similarities)[::-1][:num_suggestions]
        return [
            self.faqs[idx]['question']
            for idx in top_indices
            if similarities[idx] > min_similarity
        ]

    def add_faq(self, question: str, answer: str) -> bool:
        """Add a new FAQ to the static list (for demonstration purposes)

        The FAQ list and its embeddings are only updated once the new
        embeddings were computed successfully, so a failure leaves both
        untouched and still aligned.

        Returns:
            True when the FAQ was added, False when the input was blank or
            the embeddings could not be computed.
        """
        for label, value in (("question", question), ("answer", answer)):
            if not isinstance(value, str) or not value.strip():
                print(f"Error adding FAQ: {label} must be a non-empty string")
                return False

        try:
            existing = self.faqs or []
            new_id = max((faq['id'] for faq in existing), default=0) + 1
            new_faq = {"id": new_id, "question": question, "answer": answer}

            # Recompute embeddings BEFORE touching any state
            embeddings = self._encode_questions(existing + [new_faq])
        except Exception as e:
            print(f"Error adding FAQ: {e}")
            return False

        # Commit: keep mutating the existing list in place so outside
        # references to it stay valid.
        if self.faqs is None:
            self.faqs = []
        self.faqs.append(new_faq)
        self.faq_embeddings = embeddings

        print(f"FAQ added: {question}")
        return True