import logging
import re
from typing import List, Dict, Tuple

import numpy as np

try:
    from sentence_transformers import SentenceTransformer
except ImportError:
    # Only needed when AIChatbot has to build its own default model; callers
    # that inject an encoder can use this module without the package.
    SentenceTransformer = None

logger = logging.getLogger(__name__)

# Lower-case alphanumeric runs; used for keyword-overlap scoring.
_TOKEN_RE = re.compile(r"[a-z0-9]+")
# Whole-word WH question words; the earliest match in the text wins.
_WH_RE = re.compile(r"\b(who|where|when|what|how|why|which)\b")

# Static FAQ data. load_faqs() copies these entries so that add_faq() never
# mutates the shared defaults.
_DEFAULT_FAQS = (
    {
        "id": 1,
        "question": "What are the admission requirements?",
        "answer": "To apply for admission, you need to submit your high school diploma, transcript of records, 2x2 ID photo, and completed application form. You also need to take the entrance examination.",
    },
    {
        "id": 2,
        "question": "When is the application deadline?",
        "answer": "The application deadline is usually in March for the first semester and October for the second semester. Please check our website for the exact dates.",
    },
    {
        "id": 3,
        "question": "What courses are available?",
        "answer": "We offer various courses including BS Computer Science, BS Information Technology, BS Business Administration, BS Education, BS Nursing, BS Architecture, and more. Check our course catalog for the complete list.",
    },
    {
        "id": 4,
        "question": "How much is the tuition fee?",
        "answer": "Tuition fees vary by program. For undergraduate programs, it ranges from ₱15,000 to ₱25,000 per semester. Please contact the registrar's office for specific program fees.",
    },
    {
        "id": 5,
        "question": "Do you offer scholarships?",
        "answer": "Yes, we offer various scholarships including academic scholarships, athletic scholarships, and need-based financial aid. Applications are available at the student affairs office.",
    },
    {
        "id": 6,
        "question": "What is the minimum GWA requirement?",
        "answer": "The minimum GWA requirement is 80% for most programs. Some programs may have higher requirements. Please check the specific requirements for your chosen program.",
    },
    {
        "id": 7,
        "question": "How can I contact the admissions office?",
        "answer": "You can contact the admissions office at (02) 123-4567 or email admissions@psau.edu.ph. Office hours are Monday to Friday, 8:00 AM to 5:00 PM.",
    },
    {
        "id": 8,
        "question": "Is there a dormitory available?",
        "answer": "Yes, we have dormitory facilities for both male and female students. Dormitory fees are separate from tuition. Please contact the housing office for availability and rates.",
    },
    {
        "id": 9,
        "question": "What documents do I need for enrollment?",
        "answer": "For enrollment, you need your admission letter, original and photocopy of birth certificate, original and photocopy of high school diploma, 2x2 ID photos, and medical certificate.",
    },
    {
        "id": 10,
        "question": "Can I transfer from another school?",
        "answer": "Yes, we accept transferees. You need to submit your transcript of records, honorable dismissal, and other required documents. Some credits may be credited depending on the program.",
    },
)


class AIChatbot:
    """Answer questions by matching them against a small static FAQ list.

    A question is scored against every FAQ question with a weighted mix of
    embedding similarity (dot product of embeddings requested with
    ``normalize_embeddings=True``) and keyword overlap, then filtered by a
    WH-word ("what", "when", ...) consistency check.

    Attributes are plain instance state:
        model: encoder exposing ``encode(texts, normalize_embeddings=...)``.
        faqs: list of ``{"id", "question", "answer"}`` dicts, or None.
        faq_embeddings: array with one row per FAQ question, or None.
    """

    DEFAULT_MODEL_NAME = 'all-MiniLM-L6-v2'

    # Weights of the two signals in the combined score.
    SEMANTIC_WEIGHT = 0.7
    OVERLAP_WEIGHT = 0.3
    # Multiplier applied to FAQs whose WH-word differs from the question's.
    WH_MISMATCH_PENALTY = 0.6
    # Floor for accepting a match on semantic similarity alone.
    MIN_SEMANTIC_SCORE = 0.7
    # Minimum keyword overlap for accepting a match on the combined score.
    MIN_OVERLAP = 0.3
    # Suggestions at or below this similarity are dropped.
    MIN_SUGGESTION_SIMILARITY = 0.3

    NO_FAQS_ANSWER = "I'm sorry, I couldn't find any FAQs in the database."
    FALLBACK_ANSWER = (
        "Sorry, I don’t have the knowledge to answer that yet.\n"
        "I’ll notify an admin about your question and we’ll add the answer soon.\n"
        "Please come back in a while."
    )

    def __init__(self, model=None, model_name=DEFAULT_MODEL_NAME):
        """Create the chatbot and embed the static FAQs.

        Args:
            model: Optional ready-made encoder. When omitted, a
                ``SentenceTransformer(model_name)`` is loaded.
            model_name: Name of the pre-trained model to load when ``model``
                is not supplied.

        Raises:
            ImportError: if no ``model`` is given and sentence-transformers
                is not installed.
        """
        if model is None:
            if SentenceTransformer is None:
                raise ImportError(
                    "sentence-transformers is required unless a model is "
                    "passed to AIChatbot explicitly"
                )
            model = SentenceTransformer(model_name)
        self.model = model
        # Warm up the model to avoid first-request slowness.
        self.model.encode(["Hello, world!"])
        self.faq_embeddings = None
        self.faqs = None
        self.load_faqs()

    def _embed(self, texts):
        """Return embeddings for ``texts`` as an array, one row per text."""
        return np.asarray(
            self.model.encode(list(texts), normalize_embeddings=True)
        )

    def load_faqs(self):
        """Load the static FAQs and compute their normalized embeddings."""
        # Copy each entry so later add_faq() calls do not touch the defaults.
        self.faqs = [dict(faq) for faq in _DEFAULT_FAQS]
        if self.faqs:
            self.faq_embeddings = self._embed(
                faq['question'] for faq in self.faqs
            )

    def save_unanswered_question(self, question):
        """Log an unanswered question to the console.

        Can be extended to save to a file or notify an admin.
        """
        print(f"Unanswered question logged: {question}")

    def _tokenize(self, text: str):
        """Return lower-cased alphanumeric tokens longer than two characters."""
        if not text:
            return []
        return [tok for tok in _TOKEN_RE.findall(text.lower()) if len(tok) > 2]

    def _overlap_ratio(self, q_tokens, faq_tokens):
        """Return the fraction of distinct question tokens found in the FAQ.

        Returns 0.0 when either token list is empty.
        """
        if not q_tokens or not faq_tokens:
            return 0.0
        q_set = set(q_tokens)
        return len(q_set & set(faq_tokens)) / len(q_set)

    def _wh_class(self, text: str) -> str:
        """Return the first WH-word in ``text``, or '' when there is none.

        Matching is on whole words, so "What's ..." and "... when, ..." are
        recognised while "whatever" and "somehow" are not.
        """
        if not text:
            return ''
        found = _WH_RE.search(text.strip().lower())
        return found.group(1) if found else ''

    def find_best_match(self, question: str, threshold: float = 0.7) -> Tuple[str, float]:
        """Return the best FAQ answer for ``question`` and its combined score.

        Args:
            question: The user's question.
            threshold: Minimum combined score for acceptance; semantic-only
                acceptance additionally requires at least MIN_SEMANTIC_SCORE.

        Returns:
            ``(answer, score)``. When no FAQ is accepted, ``answer`` is the
            fallback message and the question is passed to
            ``save_unanswered_question``. A blank question yields the
            fallback with score 0.0 and is not recorded.
        """
        logger.debug("find_best_match called with: %r", question)
        if not self.faqs or self.faq_embeddings is None:
            return self.NO_FAQS_ANSWER, 0.0
        if not question or not question.strip():
            return self.FALLBACK_ANSWER, 0.0

        question_embedding = self._embed([question])[0]
        similarities = np.dot(self.faq_embeddings, question_embedding)

        # Keyword overlap with each FAQ question, to reduce false positives.
        q_tokens = self._tokenize(question)
        overlap_scores = np.array([
            self._overlap_ratio(q_tokens, self._tokenize(faq['question']))
            for faq in self.faqs
        ])
        combined = (
            self.SEMANTIC_WEIGHT * similarities
            + self.OVERLAP_WEIGHT * overlap_scores
        )

        # Penalize FAQs whose WH-word disagrees with the question's.
        q_wh = self._wh_class(question)
        faq_wh = [self._wh_class(faq['question']) for faq in self.faqs]
        if q_wh:
            mismatch = np.array([bool(wh) and wh != q_wh for wh in faq_wh])
            combined[mismatch] *= self.WH_MISMATCH_PENALTY

        best_idx = int(np.argmax(combined))
        best_semantic = float(similarities[best_idx])
        best_overlap = float(overlap_scores[best_idx])
        best_combined = float(combined[best_idx])
        best_wh = faq_wh[best_idx]

        # Accept on good semantic similarity OR a strong combined score that
        # is backed by keyword overlap.
        accept = (
            best_semantic >= max(self.MIN_SEMANTIC_SCORE, threshold)
            or (best_combined >= threshold and best_overlap >= self.MIN_OVERLAP)
        )
        # A WH-word mismatch vetoes the match even when the score is high.
        if accept and q_wh and best_wh and q_wh != best_wh:
            accept = False

        if accept:
            return self.faqs[best_idx]['answer'], best_combined

        # Recording is best-effort: a failure must not break the reply, but
        # it should not vanish silently either.
        try:
            self.save_unanswered_question(question)
        except Exception:
            logger.exception("Failed to record unanswered question")
        return self.FALLBACK_ANSWER, best_combined

    def get_suggested_questions(self, question: str, num_suggestions: int = 3) -> List[str]:
        """Return up to ``num_suggestions`` FAQ questions similar to ``question``.

        Results are ordered from most to least similar and only include FAQs
        whose similarity exceeds MIN_SUGGESTION_SIMILARITY. Returns an empty
        list for a blank question, a non-positive ``num_suggestions``, or
        when no FAQs are loaded.
        """
        # A slice of [-0:] would select everything, so reject n <= 0 up front.
        if num_suggestions <= 0:
            return []
        if not self.faqs or self.faq_embeddings is None:
            return []
        if not question or not question.strip():
            return []

        question_embedding = self._embed([question])[0]
        similarities = np.dot(self.faq_embeddings, question_embedding)
        top_indices = np.argsort(similarities)[::-1][:num_suggestions]
        return [
            self.faqs[int(idx)]['question']
            for idx in top_indices
            if similarities[idx] > self.MIN_SUGGESTION_SIMILARITY
        ]

    def add_faq(self, question: str, answer: str) -> bool:
        """Add a new FAQ to the in-memory list (for demonstration purposes).

        The FAQ list and its embeddings are only updated after encoding has
        succeeded, so a failure leaves both unchanged.

        Returns:
            True when the FAQ was added; False for blank input or when
            anything goes wrong while adding it.
        """
        try:
            if not question.strip() or not answer.strip():
                logger.warning("Refusing to add FAQ with blank question or answer")
                return False

            current = self.faqs if self.faqs is not None else []
            new_id = max((faq['id'] for faq in current), default=0) + 1
            new_faq = {"id": new_id, "question": question, "answer": answer}

            # Encode before mutating any state.
            questions = [faq['question'] for faq in current] + [question]
            embeddings = self._embed(questions)

            if self.faqs is None:
                self.faqs = []
            self.faqs.append(new_faq)
            self.faq_embeddings = embeddings
            logger.info("FAQ added: %s", question)
            return True
        except Exception:
            logger.exception("Error adding FAQ")
            return False