Spaces:

jerrynnms
/

scam-detector

Sleeping

File size: 2,349 Bytes

310d6ab
995e13a
310d6ab
 
 
995e13a
310d6ab
995e13a
310d6ab
 
995e13a
310d6ab
995e13a
310d6ab
995e13a
 
 
310d6ab
 
 
 
995e13a
 
 
310d6ab
995e13a
310d6ab
 
 
 
 
 
 
995e13a
 
 
 
310d6ab
995e13a
 
 
 
310d6ab
995e13a
310d6ab
 
 
 
995e13a
310d6ab
995e13a
 
 
 
310d6ab
995e13a
 
 
 
 
 
 
 
310d6ab
 
995e13a
 
 
310d6ab
995e13a

import torch
from AI_Model_architecture import BertLSTM_CNN_Classifier
from transformers import BertTokenizer
import re
import os
import requests

# ✅ 使用 CPU 模式（如果你只部署在 Hugging Face）
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# ✅ Hugging Face 建議路徑（防止 cache 錯誤）
model_path = "/tmp/model.pth"
model_url = "https://huggingface.co/jerrynnms/scam-model/resolve/main/model.pth"

# ✅ 快取模型檔（僅首次下載）
if not os.path.exists(model_path):
    print("📦 下載 model.pth 中...")
    response = requests.get(model_url)
    if response.status_code == 200:
        with open(model_path, "wb") as f:
            f.write(response.content)
        print("✅ 模型下載完成")
    else:
        raise FileNotFoundError("❌ 無法下載 model.pth，請檢查網址")

# ✅ 全域快取模型與 tokenizer
model = BertLSTM_CNN_Classifier()
model.load_state_dict(torch.load(model_path, map_location=device))
model.to(device)
model.eval()

tokenizer = BertTokenizer.from_pretrained("ckiplab/bert-base-chinese")

# ✅ 預測單句文字
def predict_single_sentence(text: str, max_len=256):
    text = re.sub(r"\s+", "", text)  # 移除空白
    text = re.sub(r"[^\u4e00-\u9fffA-Za-z0-9。，！？:/.\-]", "", text)  # 清洗非標點與文字

    encoded = tokenizer(text, return_tensors="pt", truncation=True, padding="max_length", max_length=max_len)
    input_ids = encoded["input_ids"].to(device)
    attention_mask = encoded["attention_mask"].to(device)
    token_type_ids = encoded["token_type_ids"].to(device)

    with torch.no_grad():
        output = model(input_ids, attention_mask, token_type_ids)
        prob = output.item()
        label = int(prob > 0.5)

    return label, prob

# ✅ 封裝為 API 可用格式
def analyze_text(text: str):
    label, prob = predict_single_sentence(text)
    prob_percent = round(prob * 100, 2)

    if prob > 0.9:
        risk = "🔴 高風險（極可能是詐騙）"
    elif prob > 0.5:
        risk = "🟡 中風險（可疑）"
    else:
        risk = "🟢 低風險（正常）"

    status = "詐騙" if label == 1 else "正常"

    return {
        "status": status,
        "confidence": prob_percent,
        "suspicious_keywords": [risk]  # 這裡之後可進一步做關鍵字標註
    }