Spaces:
Sleeping
Sleeping
| import torch | |
| from transformers import BertTokenizer, BertModel | |
| from AI_Model_architecture import BertLSTM_CNN_Classifier | |
| import re | |
| import os | |
| import requests | |
| # ✅ 使用 CPU 模式(部署環境通用) | |
| device = torch.device("cuda" if torch.cuda.is_available() else "cpu") | |
| # ✅ 模型權重與儲存位置 | |
| model_path = "/tmp/model.pth" | |
| model_url = "https://huggingface.co/jerrynnms/scam-model/resolve/main/model.pth" | |
| # ✅ 初次下載模型(如果不存在) | |
| if not os.path.exists(model_path): | |
| print("📦 下載 model.pth 中...") | |
| response = requests.get(model_url) | |
| if response.status_code == 200: | |
| with open(model_path, "wb") as f: | |
| f.write(response.content) | |
| print("✅ 模型下載完成") | |
| else: | |
| raise FileNotFoundError("❌ 無法下載 model.pth,請檢查網址是否正確") | |
| # ✅ 初始化 tokenizer | |
| tokenizer = BertTokenizer.from_pretrained("ckiplab/bert-base-chinese") | |
| # ✅ 初始化自訂分類模型 | |
| model = BertLSTM_CNN_Classifier() | |
| model.load_state_dict(torch.load(model_path, map_location=device)) | |
| model.to(device) | |
| model.eval() | |
| # ✅ 初始化 ckiplab BERT 模型,用於抽取 attention 可疑詞(與分類模型無關) | |
| bert_model = BertModel.from_pretrained("ckiplab/bert-base-chinese", output_attentions=True) | |
| bert_model.to(device) | |
| bert_model.eval() | |
| # ✅ 單句推論(輸出預測結果與信心值) | |
| def predict_single_sentence(text: str, max_len=256): | |
| text = re.sub(r"\s+", "", text) | |
| text = re.sub(r"[^\u4e00-\u9fffA-Za-z0-9。,!?:/.\-]", "", text) | |
| encoded = tokenizer(text, return_tensors="pt", truncation=True, padding="max_length", max_length=max_len) | |
| input_ids = encoded["input_ids"].to(device) | |
| attention_mask = encoded["attention_mask"].to(device) | |
| token_type_ids = encoded["token_type_ids"].to(device) | |
| with torch.no_grad(): | |
| output = model(input_ids, attention_mask, token_type_ids) | |
| prob = output.item() | |
| label = int(prob > 0.5) | |
| return label, prob | |
| # ✅ 擷取 BERT attention 權重最高的詞(作為可疑詞) | |
| def extract_attention_keywords(text, top_k=5): | |
| cleaned = re.sub(r"\s+", "", text) | |
| inputs = tokenizer(cleaned, return_tensors="pt", truncation=True, padding="max_length", max_length=128) | |
| input_ids = inputs["input_ids"].to(device) | |
| attention_mask = inputs["attention_mask"].to(device) | |
| with torch.no_grad(): | |
| outputs = bert_model(input_ids=input_ids, attention_mask=attention_mask) | |
| attentions = outputs.attentions # [layers][batch, heads, seq, seq] | |
| # 取最後一層,平均所有 head 與所有 attention 給 token 的權重 | |
| attn = attentions[-1][0].mean(dim=0).mean(dim=0) # [seq] | |
| tokens = tokenizer.convert_ids_to_tokens(input_ids[0]) | |
| top_indices = attn.topk(top_k).indices.tolist() | |
| top_tokens = [tokens[i] for i in top_indices if tokens[i] not in ["[CLS]", "[SEP]", "[PAD]"]] | |
| return top_tokens | |
| # ✅ 主函式:整合推論與關鍵詞標註 | |
| def analyze_text(text: str): | |
| label, prob = predict_single_sentence(text) | |
| prob_percent = round(prob * 100, 2) | |
| status = "詐騙" if label == 1 else "正常" | |
| # 風險說明(僅作為備用顯示) | |
| if prob > 0.9: | |
| risk = "🔴 高風險(極可能是詐騙)" | |
| elif prob > 0.5: | |
| risk = "🟡 中風險(可疑)" | |
| else: | |
| risk = "🟢 低風險(正常)" | |
| # 自動抽取可疑詞 | |
| attention_keywords = extract_attention_keywords(text) | |
| return { | |
| "status": status, | |
| "confidence": prob_percent, | |
| "suspicious_keywords": attention_keywords or [risk] | |
| } | |