Spaces:

Bennie12
/

Project-PredictScamInfo

Sleeping

App Files Files Community

Project-PredictScamInfo / bert_explainer.py

Bennie12

Upload 4 files

6709d22 verified 6 months ago

raw

history blame

12.8 kB


	# 引入重要套件Import Library
	# PyTorch 主模組，和Tensorflow很像
	# 共通點：都是深度學習框架，支援建構神經網路、訓練與推論，都支援GPU加速、載入模型，和處理tensor等。
	# 操作比較直覺，接近Python本身的風格，動態圖架構(每一次forward都即時計算)，更容易除錯、快速迭代，在研究領域非常流行。
	# re是Python內建的正則表示式(regular expression)模組，在這專案中用來"用關鍵規則篩選文字內容"。
	# requests是一個非常好用的 HTTP 請求套件，能讓你從Python發送GET/POST請求，在專案中用來從Google Drive下載模型檔案(model.pth)。
	# BertTokenizer:從Hugging Face的transformers套件載入一個專用的「分詞器（Tokenizer）」。
	import os
	os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"


	import torch
	import re
	import easyocr
	import io
	import numpy as np

	from PIL import Image
	from huggingface_hub import hf_hub_download
	from transformers import BertTokenizer




	# 設定裝置（GPU 優先）
	device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
	# 預設模型與 tokenizer 為 None，直到首次請求才載入（延遲載入）
	model = None
	tokenizer = None
	# ✅ 延遲載入模型與 tokenizer
	def load_model_and_tokenizer():
	global model, tokenizer
	if os.path.exists("model.pth"):
	model_path = "model.pth"
	else:
	model_path = hf_hub_download(repo_id="Bennie12/Bert-Lstm-Cnn-ScamDetecter", filename="model.pth")
	# 匯入模型架構（避免在模組初始化階段就占用大量記憶體）
	from AI_Model_architecture import BertLSTM_CNN_Classifier
	"""
	file_id = "19t6NlRFMc1i8bGtngRwIRtRcCmibdP9q"

	url = f"https://drive.google.com/uc?export=download&id={file_id}"
	if not os.path.exists(model_path): # 如果本地還沒有這個檔案 → 才下載（避免重複）
	print("📥 Downloading model from Google Drive...")
	r = requests.get(url) # 用requests發送GET請求到Google Drive
	with open(model_path, 'wb')as f: # 把下載的檔案內容寫入到 model.pth 本地檔案
	f.write(r.content)
	print("✅ Model downloaded.")
	else:
	print("📦 Model already exists.")
	"""
	# 載入模型架構與參數，初始化模型架構並載入訓練權重
	model = BertLSTM_CNN_Classifier()

	# 這行的功能是：「從 model_path把.pth 權重檔案讀進來，載入進模型裡」。
	# model.load_state_dict(...)把上面載入的權重「套進模型架構裡」
	# torch.load(...)載入.pth 權重檔案，會變成一份 Python 字典
	# map_location=device指定模型載入到 CPU 還是 GPU，避免報錯
	model.load_state_dict(torch.load(model_path, map_location=device))

	model.to(device)

	# 這是PyTorch中的「推論模式」設定
	# model.eval()模型處於推論狀態（關掉 Dropout 等隨機操作）
	# 只要是用來「預測」而不是訓練，一定要加 .eval()！
	model.eval()

	# 初始化 tokenizer(不要從 build_bert_inputs 中取)
	# 載入預訓練好的CKIP中文BERT分詞器
	# 能把中文句子轉成 BERT 模型需要的 input 格式（input_ids, attention_mask, token_type_ids）
	tokenizer = BertTokenizer.from_pretrained("ckiplab/bert-base-chinese")

	return model, tokenizer

	all_preds = []
	all_labels = []

	# 預測單一句子的分類結果（詐騙 or 正常）
	# model: 訓練好的PyTorch模型
	# tokenizer: 分詞器，負責把中文轉成 BERT 能處理的數值格式
	# sentence: 使用者輸入的文字句子
	# max_len: 限制最大輸入長度（預設 256 個 token）
	def predict_single_sentence(model, tokenizer, sentence, max_len=256):

	device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

	# 使用 with torch.no_grad()，代表這段程式「不需要記錄梯度」
	# 這樣可以加速推論並節省記憶體
	with torch.no_grad():
	# ----------- 文字前處理：清洗輸入句子 -----------
	sentence = re.sub(r"\s+", "", sentence) # 移除所有空白字元（空格、換行等）
	sentence = re.sub(r"[^\u4e00-\u9fffA-Za-z0-9。，！？]", "", sentence)
	# 保留常見中文字、英數字與標點符號，其他奇怪符號都移除
	# ----------- 使用 BERT Tokenizer 將句子編碼 -----------
	encoded = tokenizer(sentence,
	return_tensors="pt", # 回傳 PyTorch tensor 格式（預設是 numpy 或 list）
	truncation=True, # 超過最大長度就截斷
	padding="max_length", # 不足最大長度則補空白（PAD token）
	max_length=max_len) # 設定最大長度為 256
	# 把 tokenizer 回傳的資料送進模型前，to(device)轉到指定的裝置（GPU or CPU）
	input_ids = encoded["input_ids"].to(device)
	attention_mask = encoded["attention_mask"].to(device)
	token_type_ids = encoded["token_type_ids"].to(device)
	# ----------- 模型推論：輸出詐騙的機率值 -----------
	output = model(input_ids, attention_mask, token_type_ids)# 回傳的是一個機率值（float）
	prob = output.item() # 從 tensor 取出純數字，例如 0.86
	label = int(prob > 0.5) # 如果機率 > 0.5，標為「詐騙」（1），否則為「正常」（0）
	# ----------- 根據機率進行風險分級 -----------
	if prob > 0.9:
	risk = "🔴 高風險（極可能是詐騙）"
	elif prob > 0.5:
	risk = "🟡 中風險（可疑）"
	else:
	risk = "🟢 低風險（正常）"
	# ----------- 根據 label 判斷文字結果 -----------
	pre_label ='詐騙'if label == 1 else '正常'
	# ----------- 顯示推論資訊（後端終端機） -----------
	print(f"\n📩 訊息內容：{sentence}")
	print(f"✅ 預測結果：{'詐騙' if label == 1 else '正常'}")
	print(f"📊 信心值：{round(prob*100, 2)}")
	print(f"⚠️ 風險等級：{risk}")
	# ----------- 回傳結果給呼叫端（通常是 API） -----------
	# 組成一個 Python 字典（對應 API 的 JSON 輸出格式）
	return {
	"label" : pre_label, # 預測分類（"詐騙" or "正常"）
	"prob" : prob, # 預測分類（"詐騙" or "正常"）
	"risk" : risk # 用風險分級當作"可疑提示"放進 list（名稱為 suspicious_keywords）
	}

	# analyze_text(text)對應app.py第117行
	# 這個函式是「對外的簡化版本」：輸入一句文字 → 回傳詐騙判定結果
	# 用在主程式或 FastAPI 後端中，是整個模型預測流程的入口點


	#------------ CNN ------------
	def extract_suspicious_tokens_cnn(model, tokenizer, text, top_k=3):
	model.eval()
	model.to(device)

	# 清理與編碼輸入文字
	sentence = re.sub(r"\s+", "", text)
	sentence = re.sub(r"[^\u4e00-\u9fffA-Za-z0-9。，！？]", "", sentence)

	encoded = tokenizer(sentence,
	return_tensors="pt",
	truncation=True,
	padding="max_length",
	max_length=128)

	input_ids = encoded["input_ids"].to(device)
	attention_mask = encoded["attention_mask"].to(device)
	token_type_ids = encoded["token_type_ids"].to(device)

	# 前向傳遞直到 CNN 輸出
	with torch.no_grad():
	hidden_states = model.bert(input_ids=input_ids,
	attention_mask=attention_mask,
	token_type_ids=token_type_ids).last_hidden_state
	lstm_out, _ = model.LSTM(hidden_states)
	conv_input = lstm_out.transpose(1, 2)
	conv_out = model.conv1(conv_input) # conv_out = [batch, 128, seq_len]

	# 這裡會將conv_out的輸出[batch, 128, seq_len]，壓縮成[seq_len]，也就是轉換成bert編碼形勢的句子。
	token_scores = conv_out.mean(dim=1).squeeze()

	# torch.topk(token_scores, top_k)會得到分數高的token，和對應索引位置，.indices只留下索引，.cpu()把結果從GPU移到CPU（必要才能轉為 list），
	# .tolist()轉化成list格式。挑出重要性最高的幾個 token 的位置索引。
	topk_indices = torch.topk(token_scores, top_k).indices.cpu().tolist()

	"""
	tokenizer.convert_ids_to_tokens(input_ids.squeeze())將bert編碼還原成原始文字
	這段input_ids = encoded["input_ids"].to(device)輸出的編碼，還原成文字
	.squeeze() 去掉 batch 維度，得到 [seq_len]。
	[tokens[i] for i in topk_indices if tokens[i] not in ["[PAD]", "[CLS]", "[SEP]"]]
	上面的程式碼為，i為topk_indices挑出的索引，token[i]為分數最高的文字，也就是可疑的詞句。
	not in 就能避免選到就能避免選到[CLS]、[SEP]、 [PAD]
	[CLS] 開始符號 = 101
	[SEP] 結束符號 = 102
	[PAD] 補空白 = 0
	"""
	tokens = tokenizer.convert_ids_to_tokens(input_ids.squeeze())
	suspicious_tokens = [tokens[i] for i in topk_indices if tokens[i] not in ["[PAD]", "[CLS]", "[SEP]"]]

	return suspicious_tokens


	#------------ Bert Attention ------------
	def extract_suspicious_tokens_attention(model, tokenizer, text, top_k=3):
	from transformers import BertModel # 避免重複 import

	sentence = re.sub(r"\s+", "", text)
	sentence = re.sub(r"[^\u4e00-\u9fffA-Za-z0-9。，！？]", "", sentence)

	encoded = tokenizer(sentence,
	return_tensors="pt",
	truncation=True,
	padding="max_length",
	max_length=128)

	input_ids = encoded["input_ids"].to(device)
	attention_mask = encoded["attention_mask"].to(device)
	token_type_ids = encoded["token_type_ids"].to(device)

	with torch.no_grad():
	bert_outputs = model.bert(input_ids=input_ids,
	attention_mask=attention_mask,
	token_type_ids=token_type_ids,
	output_attentions=True)
	# 取第一層第0個 head 的 attention（CLS → all tokens）
	"""
	attentions[0]第 0 層 attention（BERT 第 1 層），[0, 0, 0, :]取出第 0 個 batch、第 0 個 head、第 0 個 token（CLS）對所有 token 的注意力分數

	"""
	attention_scores = bert_outputs.attentions[0][0, 0, 0, :] # [seq_len]

	topk_indices = torch.topk(attention_scores, top_k).indices.cpu().tolist()

	tokens = tokenizer.convert_ids_to_tokens(input_ids.squeeze())
	suspicious_tokens = [tokens[i] for i in topk_indices if tokens[i] not in ["[PAD]", "[CLS]", "[SEP]"]]

	return suspicious_tokens



	def analyze_text(text, explain_mode="cnn"):
	model, tokenizer = load_model_and_tokenizer()
	model.eval()

	# 預測標籤與信心分數
	result = predict_single_sentence(model, tokenizer, text)
	label = result["label"]
	prob = result["prob"]
	risk = result["risk"]
	# 根據模式擷取可疑詞
	if explain_mode == "cnn":
	suspicious = extract_suspicious_tokens_cnn(model, tokenizer, text)
	elif explain_mode == "bert":
	suspicious = extract_suspicious_tokens_attention(model, tokenizer, text)
	elif explain_mode == "both":
	cnn_tokens = extract_suspicious_tokens_cnn(model, tokenizer, text)
	bert_tokens = extract_suspicious_tokens_attention(model, tokenizer, text)
	suspicious = list(set(cnn_tokens + bert_tokens))
	else:
	suspicious = [risk]

	return {
	"status": label,
	"confidence": round(prob * 100, 2),
	"suspicious_keywords": [str(s) for s in suspicious]
	}

	def analyze_image(file_bytes, explain_mode = "cnn"):
	image = Image.open(io.BytesIO(file_bytes))
	image_np = np.array(image)
	reader = easyocr.Reader(['ch_tra', 'en'], gpu=torch.cuda.is_available())
	results = reader.readtext(image_np)

	text = ' '.join([res[1] for res in results]).strip()

	if not text:
	return{
	"status" : "無法辨識文字",
	"confidence" : 0.0,
	"suspicious_keywords" : ["圖片中無可辨識的中文英文"]
	}
	return analyze_text(text, explain_mode=explain_mode)