CHUNYU0505 committed on
Commit 1740855 · verified · 1 Parent(s): 2aa3d8b

Update app.py

Files changed (1)
  1. app.py +44 -0
app.py CHANGED
@@ -1,3 +1,47 @@
+ # app.py
+ # -------------------------------
+ # 1. Package imports
+ # -------------------------------
+ import os, glob
+ from langchain.docstore.document import Document
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+ from langchain_community.vectorstores import FAISS
+ from langchain_huggingface import HuggingFaceEmbeddings
+ from docx import Document as DocxDocument
+ from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
+ import gradio as gr
+
+ # -------------------------------
+ # 2. Environment variables and data paths
+ # -------------------------------
+ TXT_FOLDER = "./out_texts"
+ DB_PATH = "./faiss_db"
+ os.makedirs(DB_PATH, exist_ok=True)
+ os.makedirs(TXT_FOLDER, exist_ok=True)
+
+ # -------------------------------
+ # 3. Build or load the vector database
+ # -------------------------------
+ EMBEDDINGS_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
+ embeddings_model = HuggingFaceEmbeddings(model_name=EMBEDDINGS_MODEL_NAME)
+
+ if os.path.exists(os.path.join(DB_PATH, "index.faiss")):
+     print("Loading existing vector database...")
+     db = FAISS.load_local(DB_PATH, embeddings_model, allow_dangerous_deserialization=True)
+ else:
+     print("No database found, building a new vector database...")
+     txt_files = glob.glob(f"{TXT_FOLDER}/*.txt")
+     docs = []
+     for filepath in txt_files:
+         with open(filepath, "r", encoding="utf-8") as f:
+             docs.append(Document(page_content=f.read(), metadata={"source": os.path.basename(filepath)}))
+     splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
+     split_docs = splitter.split_documents(docs)
+     db = FAISS.from_documents(split_docs, embeddings_model)
+     db.save_local(DB_PATH)
+
+ retriever = db.as_retriever(search_type="similarity", search_kwargs={"k": 5})
+
  # -------------------------------
  # 4. Local inference model setup
  # -------------------------------
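For reference, a minimal sketch of how the retriever created in this diff could be exercised once the Space is running. The query string, the 80-character preview, and the joined context variable are illustrative assumptions, not part of this commit:

    # Illustrative usage of the retriever built above (not part of this commit).
    # Assumes the FAISS index in ./faiss_db has been built from the .txt files in ./out_texts.
    query = "example question about the indexed documents"  # hypothetical query

    # retriever.invoke returns the top k=5 chunks by similarity
    # (older LangChain versions use retriever.get_relevant_documents(query) instead)
    hits = retriever.invoke(query)
    for doc in hits:
        print(doc.metadata["source"], doc.page_content[:80])

    # A RAG prompt would typically join the retrieved chunks into a single context string:
    context = "\n\n".join(doc.page_content for doc in hits)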