Spaces:
Sleeping
Sleeping
File size: 8,907 Bytes
d1e5d7d |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 |
from sentence_transformers import SentenceTransformer
import numpy as np
from typing import List, Dict, Tuple
import re
class AIChatbot:
    """FAQ chatbot that answers questions by matching them to canned FAQs.

    Each FAQ question is embedded once with a sentence-transformers model.
    An incoming question is scored against every FAQ with a blend of
    embedding similarity and keyword overlap, and mismatched WH-word intent
    (e.g. a "when" question against a "what" FAQ) is penalised and finally
    rejected.
    """

    # Checkpoint loaded when the caller does not pick a model.
    DEFAULT_MODEL_NAME = 'all-MiniLM-L6-v2'

    # WH-words recognised by ``_wh_class``.
    _WH_WORDS = ('who', 'where', 'when', 'what', 'how', 'why', 'which')

    # Lower-case alphanumeric runs; compiled once instead of per call.
    _TOKEN_RE = re.compile(r"[a-z0-9]+")

    # Weights of the two signals in the combined score.
    _SEMANTIC_WEIGHT = 0.7
    _OVERLAP_WEIGHT = 0.3
    # Multiplier applied to the combined score of FAQs whose WH-word differs.
    _WH_MISMATCH_PENALTY = 0.6
    # Floors used by the acceptance test in ``find_best_match``.
    _MIN_SEMANTIC = 0.7
    _MIN_OVERLAP = 0.3
    # Suggestions at or below this similarity are not offered.
    _MIN_SUGGESTION_SIMILARITY = 0.3

    def __init__(self, model_name: str = DEFAULT_MODEL_NAME):
        """Load the embedding model, warm it up and index the static FAQs.

        Args:
            model_name: sentence-transformers model identifier to load.
        """
        # Load the pre-trained model (can use a smaller model for more speed)
        self.model = SentenceTransformer(model_name)
        # Warm up the model to avoid first-request slowness
        _ = self.model.encode(["Hello, world!"])
        self.faq_embeddings = None
        self.faqs = None
        self.load_faqs()

    def load_faqs(self):
        """Load static FAQs and compute their normalized embeddings.

        Replaces ``self.faqs`` with a fresh copy of the built-in list and
        ``self.faq_embeddings`` with one embedding row per FAQ question, in
        the same order.
        """
        # Static FAQ data
        self.faqs = [
            {"id": 1, "question": "What are the admission requirements?", "answer": "To apply for admission, you need to submit your high school diploma, transcript of records, 2x2 ID photo, and completed application form. You also need to take the entrance examination."},
            {"id": 2, "question": "When is the application deadline?", "answer": "The application deadline is usually in March for the first semester and October for the second semester. Please check our website for the exact dates."},
            {"id": 3, "question": "What courses are available?", "answer": "We offer various courses including BS Computer Science, BS Information Technology, BS Business Administration, BS Education, BS Nursing, BS Architecture, and more. Check our course catalog for the complete list."},
            {"id": 4, "question": "How much is the tuition fee?", "answer": "Tuition fees vary by program. For undergraduate programs, it ranges from ₱15,000 to ₱25,000 per semester. Please contact the registrar's office for specific program fees."},
            {"id": 5, "question": "Do you offer scholarships?", "answer": "Yes, we offer various scholarships including academic scholarships, athletic scholarships, and need-based financial aid. Applications are available at the student affairs office."},
            {"id": 6, "question": "What is the minimum GWA requirement?", "answer": "The minimum GWA requirement is 80% for most programs. Some programs may have higher requirements. Please check the specific requirements for your chosen program."},
            {"id": 7, "question": "How can I contact the admissions office?", "answer": "You can contact the admissions office at (02) 123-4567 or email admissions@psau.edu.ph. Office hours are Monday to Friday, 8:00 AM to 5:00 PM."},
            {"id": 8, "question": "Is there a dormitory available?", "answer": "Yes, we have dormitory facilities for both male and female students. Dormitory fees are separate from tuition. Please contact the housing office for availability and rates."},
            {"id": 9, "question": "What documents do I need for enrollment?", "answer": "For enrollment, you need your admission letter, original and photocopy of birth certificate, original and photocopy of high school diploma, 2x2 ID photos, and medical certificate."},
            {"id": 10, "question": "Can I transfer from another school?", "answer": "Yes, we accept transferees. You need to submit your transcript of records, honorable dismissal, and other required documents. Some credits may be credited depending on the program."},
        ]
        # Compute and normalize embeddings for all questions
        questions = [faq['question'] for faq in self.faqs]
        embeddings = self.model.encode(questions, normalize_embeddings=True)
        self.faq_embeddings = np.array(embeddings)

    def save_unanswered_question(self, question):
        """Log unanswered questions to console (can be extended to save to file)."""
        print(f"Unanswered question logged: {question}")
        # In a real implementation, you could save this to a file or send to an admin

    def _tokenize(self, text: str):
        """Return the lower-cased alphanumeric tokens of *text* longer than 2 chars."""
        if not text:
            return []
        return [tok for tok in self._TOKEN_RE.findall(text.lower()) if len(tok) > 2]

    def _overlap_ratio(self, q_tokens, faq_tokens):
        """Return the fraction of distinct query tokens that also occur in the FAQ.

        Returns 0.0 when either token list is empty.
        """
        if not q_tokens or not faq_tokens:
            return 0.0
        q_set = set(q_tokens)
        # q_set is non-empty here, so the division is safe.
        return len(q_set & set(faq_tokens)) / len(q_set)

    def _wh_class(self, text: str) -> str:
        """Return the first WH-word appearing in *text*, or '' if there is none.

        The text is split into alphanumeric tokens before matching, so a
        WH-word followed by punctuation ("tell me why?") is still recognised
        and words merely containing one ("however") are not.
        """
        if not text:
            return ''
        for word in self._TOKEN_RE.findall(text.lower()):
            if word in self._WH_WORDS:
                return word
        return ''

    def find_best_match(self, question: str, threshold: float = 0.7) -> Tuple[str, float]:
        """Return the answer of the best-matching FAQ and its combined score.

        Args:
            question: the user's question.
            threshold: minimum combined score for the overlap-backed
                acceptance path; the pure-semantic path additionally never
                accepts below ``_MIN_SEMANTIC``.

        Returns:
            ``(answer, combined_score)`` when a FAQ is accepted, otherwise
            ``(fallback_message, combined_score)`` after the question has
            been passed to ``save_unanswered_question``. When no FAQs are
            loaded, an apology message and ``0.0``.
        """
        print(f"find_best_match called with: {question}")  # Debug print
        if not self.faqs or self.faq_embeddings is None:
            return "I'm sorry, I couldn't find any FAQs in the database.", 0.0

        # Embeddings are normalized, so the dot product is the cosine similarity.
        question_embedding = self.model.encode([question], normalize_embeddings=True)[0]
        similarities = np.asarray(np.dot(self.faq_embeddings, question_embedding))

        # Keyword overlap with each FAQ question
        q_tokens = self._tokenize(question)
        overlap_scores = np.array([
            self._overlap_ratio(q_tokens, self._tokenize(faq['question']))
            for faq in self.faqs
        ])

        # Combined score to reduce false positives
        combined = (
            self._SEMANTIC_WEIGHT * similarities
            + self._OVERLAP_WEIGHT * overlap_scores
        )

        # Apply WH-word intent consistency penalty
        q_wh = self._wh_class(question)
        faq_wh = [self._wh_class(faq['question']) for faq in self.faqs]
        if q_wh:
            for i, f_wh in enumerate(faq_wh):
                if f_wh and f_wh != q_wh:
                    combined[i] *= self._WH_MISMATCH_PENALTY

        best_idx = int(np.argmax(combined))
        best_semantic = float(similarities[best_idx])
        best_overlap = float(overlap_scores[best_idx])
        best_combined = float(combined[best_idx])
        best_wh = faq_wh[best_idx]

        # Acceptance criteria: require good semantic OR strong combined with overlap
        accept = (
            best_semantic >= max(self._MIN_SEMANTIC, threshold)
            or (best_combined >= threshold and best_overlap >= self._MIN_OVERLAP)
        )
        # Enforce WH intent match when present
        if accept and q_wh and best_wh and q_wh != best_wh:
            accept = False

        if accept:
            return self.faqs[best_idx]['answer'], best_combined

        # Log as unanswered so admins can curate. This is best-effort: a
        # logging failure must not prevent the user from getting a reply,
        # but it is reported instead of being silently discarded.
        try:
            self.save_unanswered_question(question)
        except Exception as exc:
            print(f"Failed to log unanswered question: {exc}")
        fallback = (
            "Sorry, I don’t have the knowledge to answer that yet.\n"
            "I’ll notify an admin about your question and we’ll add the answer soon.\n"
            "Please come back in a while."
        )
        return (fallback, best_combined)

    def get_suggested_questions(self, question: str, num_suggestions: int = 3) -> List[str]:
        """Return up to *num_suggestions* FAQ questions similar to *question*.

        Only FAQs whose cosine similarity exceeds
        ``_MIN_SUGGESTION_SIMILARITY`` are returned, most similar first.
        Returns an empty list when no FAQs are loaded or when
        *num_suggestions* is not positive.
        """
        if not self.faqs or self.faq_embeddings is None:
            return []
        # A slice of ``[-0:]`` would select every FAQ, so handle it explicitly.
        if num_suggestions <= 0:
            return []
        # Compute and normalize embedding for the input question
        question_embedding = self.model.encode([question], normalize_embeddings=True)[0]
        # Calculate cosine similarity
        similarities = np.dot(self.faq_embeddings, question_embedding)
        # Get top N similar questions, best first
        top_indices = np.argsort(similarities)[-num_suggestions:][::-1]
        return [
            self.faqs[idx]['question']
            for idx in top_indices
            if similarities[idx] > self._MIN_SUGGESTION_SIMILARITY
        ]

    def add_faq(self, question: str, answer: str) -> bool:
        """Add a new FAQ to the in-memory list (for demonstration purposes).

        The embeddings for the extended list are computed first and the new
        entry is only committed once that succeeded, so ``self.faqs`` and
        ``self.faq_embeddings`` always stay aligned.

        Returns:
            True if the FAQ was added, False if anything went wrong (the
            error is printed and the existing FAQs are left untouched).
        """
        try:
            existing = self.faqs if self.faqs is not None else []
            new_id = max((faq['id'] for faq in existing), default=0) + 1
            new_faq = {"id": new_id, "question": question, "answer": answer}
            # Recompute embeddings for the would-be list before mutating state
            questions = [faq['question'] for faq in existing] + [question]
            embeddings = self.model.encode(questions, normalize_embeddings=True)
            # Commit: encoding succeeded, so both attributes change together
            existing.append(new_faq)
            self.faqs = existing
            self.faq_embeddings = np.array(embeddings)
            print(f"FAQ added: {question}")
            return True
        except Exception as e:
            print(f"Error adding FAQ: {e}")
            return False