Spaces:
Sleeping
Sleeping
# app.py
# -------------------------------
# 1. Imports
# -------------------------------
import glob
import os
import tempfile
import textwrap

import gradio as gr
import requests
from docx import Document as DocxDocument
from langchain.chains import RetrievalQA
from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings, HuggingFaceEndpoint
# -------------------------------
# 2. Environment variables and data paths
# -------------------------------
TXT_FOLDER = "./out_texts"
DB_PATH = "./faiss_db"

# Create both directories up front so later glob/save calls never hit a
# missing folder (e.g. a fresh Space with no txt files yet).
os.makedirs(DB_PATH, exist_ok=True)
os.makedirs(TXT_FOLDER, exist_ok=True)

# The HF token must come from the Space's repository secrets; fail fast
# at startup with an actionable message if it is absent.
HF_TOKEN = os.environ.get("HUGGINGFACEHUB_API_TOKEN")
if not HF_TOKEN:
    raise ValueError(
        "請在 Hugging Face Space 的 Settings → Repository secrets 設定 HUGGINGFACEHUB_API_TOKEN"
    )
# -------------------------------
# 3. Build or load the vector database
# -------------------------------
EMBEDDINGS_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
embeddings_model = HuggingFaceEmbeddings(model_name=EMBEDDINGS_MODEL_NAME)

if os.path.exists(os.path.join(DB_PATH, "index.faiss")):
    print("載入現有向量資料庫...")
    # allow_dangerous_deserialization: the pickle was written by this app
    # itself (save_local below), so loading it back is acceptable.
    db = FAISS.load_local(DB_PATH, embeddings_model, allow_dangerous_deserialization=True)
else:
    print("沒有資料庫,開始建立新向量資料庫...")
    txt_files = glob.glob(f"{TXT_FOLDER}/*.txt")
    if not txt_files:
        print("注意:TXT 資料夾中沒有任何文字檔,向量資料庫將為空。")

    # One Document per txt file, tagged with its filename for citations.
    docs = []
    for filepath in txt_files:
        with open(filepath, "r", encoding="utf-8") as f:
            docs.append(
                Document(page_content=f.read(), metadata={"source": os.path.basename(filepath)})
            )

    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
    split_docs = splitter.split_documents(docs)

    # BUGFIX: FAISS.from_documents raises on an empty document list, so a
    # Space with no txt files would crash at startup right after the
    # warning above. Seed the index with a single placeholder chunk so
    # the app still comes up (retrieval just returns the placeholder).
    if not split_docs:
        split_docs = [Document(page_content="(empty)", metadata={"source": "placeholder"})]
    db = FAISS.from_documents(split_docs, embeddings_model)
    db.save_local(DB_PATH)

# Top-5 similarity retriever shared by the QA chain below.
retriever = db.as_retriever(search_type="similarity", search_kwargs={"k": 5})
# -------------------------------
# 4. LLM setup (Hugging Face Endpoint)
# -------------------------------
llm = HuggingFaceEndpoint(
    repo_id="google/flan-t5-large",
    task="text2text-generation",
    huggingfacehub_api_token=HF_TOKEN,
    temperature=0.7,
    max_new_tokens=512,
)

# Retrieval-augmented QA chain over the FAISS retriever; source chunks
# are returned alongside the answer for traceability.
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    return_source_documents=True,
)
# -------------------------------
# 5. Query remaining API quota
# -------------------------------
def get_hf_rate_limit():
    """Best-effort lookup of the remaining hourly Hugging Face API quota.

    Returns a human-readable status string. Never raises: any network or
    parsing failure yields a fallback message, so this is safe to call
    from the UI generator.
    """
    headers = {"Authorization": f"Bearer {HF_TOKEN}"}
    try:
        # BUGFIX: a timeout is mandatory here — without one a slow or
        # unreachable API would hang the whole generation UI forever.
        r = requests.get(
            "https://huggingface.co/api/whoami", headers=headers, timeout=10
        )
        r.raise_for_status()
        data = r.json()
        # NOTE(review): assumes the response carries a rate_limit object;
        # falls back to "未知" when the key is absent.
        remaining = data.get("rate_limit", {}).get("remaining", "未知")
        return f"本小時剩餘 API 次數:約 {remaining}"
    except Exception:
        return "無法取得 API 速率資訊"
# -------------------------------
# 6. Generate the article (progress + wrapped display text)
# -------------------------------
def generate_article_with_progress_bar(query, segments=5):
    """Generate a multi-paragraph article about *query* via the RAG chain.

    A Gradio generator: yields ``(progress_percent, textbox_update,
    docx_path_or_None)`` tuples so partial results stream into the UI.
    The final yield carries the path of a .docx file with the full text.

    Args:
        query: Article topic entered by the user.
        segments: Number of paragraphs to generate (coerced to int >= 1).
    """
    # BUGFIX: a fixed "/tmp/generated_article.docx" is clobbered when two
    # users generate concurrently — create a unique temp file per call.
    fd, docx_file = tempfile.mkstemp(suffix=".docx")
    os.close(fd)

    doc = DocxDocument()
    doc.add_heading(query, level=1)

    n_segments = max(1, int(segments))  # slider may deliver a float
    all_text = []
    prompt = f"請依據下列主題生成段落:{query}\n\n每段約150-200字。"
    rate_info = get_hf_rate_limit()

    # Initial state before any generation starts.
    # gr.update works on both Gradio 3.x and 4.x (gr.Textbox.update was
    # removed in 4.x).
    yield 0, gr.update(value=f"{rate_info}\n\n開始生成文章...\n"), None

    for i in range(n_segments):
        try:
            # .invoke is the current LangChain entry point; calling the
            # chain object directly is deprecated.
            result = qa_chain.invoke({"query": prompt})
            paragraph = result.get("result", "").strip()
            if not paragraph:
                paragraph = "(本段生成失敗,請稍後再試。)"
        except Exception as e:
            # Best-effort: a failed segment becomes an inline error note
            # instead of aborting the whole article.
            paragraph = f"(本段生成失敗:{e})"

        # Wrap display text at ~60 chars/line for readability in the UI.
        wrapped_paragraph = "\n".join(textwrap.wrap(paragraph, width=60))
        all_text.append(wrapped_paragraph)
        doc.add_paragraph(paragraph)  # DOCX keeps the unwrapped original

        # Feed the previous paragraph back in so segments stay coherent.
        prompt = f"請接續上一段生成下一段:\n{paragraph}\n\n下一段:"

        progress = int((i + 1) / n_segments * 100)
        current_text = "\n\n".join(all_text)
        yield progress, gr.update(
            value=f"{rate_info}\n\n{current_text}\n\n正在生成第 {i+1} 段 / {n_segments} ..."
        ), None

    # Persist the DOCX and emit the final, complete state.
    doc.save(docx_file)
    full_text = "\n\n".join(all_text)
    yield 100, gr.update(value=f"{rate_info}\n\n{full_text}"), docx_file
# -------------------------------
# 7. Gradio UI (progress + text + file)
# -------------------------------
with gr.Blocks() as demo:
    gr.Markdown("# 佛教經論 RAG 系統 (HF API)")
    gr.Markdown("使用 Hugging Face Endpoint LLM + FAISS RAG,生成文章並提示 API 剩餘額度。")

    query_input = gr.Textbox(lines=2, placeholder="請輸入文章主題", label="文章主題")
    segments_input = gr.Slider(minimum=1, maximum=10, step=1, value=5, label="段落數")

    # BUGFIX: gr.Progress() is not a component and cannot be listed in
    # .click(outputs=...) — it is only usable as a keyword parameter of
    # the handler. A read-only slider displays the 0-100 integers the
    # generator already yields as its first value.
    progress_bar = gr.Slider(
        minimum=0, maximum=100, value=0, interactive=False, label="進度 (%)"
    )
    output_text = gr.Textbox(label="生成文章 + API 剩餘次數")
    output_file = gr.File(label="下載 DOCX")
    btn = gr.Button("生成文章")

    # Generator handler: each yield updates (progress, text, file).
    btn.click(
        generate_article_with_progress_bar,
        [query_input, segments_input],
        [progress_bar, output_text, output_file],
    )
# -------------------------------
# 8. Launch Gradio
# -------------------------------
if __name__ == "__main__":
    # Entry point when run as a script (Spaces also imports this module).
    demo.launch()