# Spaces: Sleeping
"""Build a persistent Chroma vector store from the Markdown files next to this script.

Every ``*.md`` file found in this script's directory is read as UTF-8, split
into overlapping text chunks, embedded with a multilingual
sentence-transformers model, and written to ``../vector_store`` (resolved
relative to this script).
"""
import os

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma

# 1. Paths
BASE_DIR = os.path.dirname(os.path.abspath(__file__))  # directory containing this script
PERSIST_DIR = os.path.abspath(os.path.join(BASE_DIR, "../vector_store"))  # where the vector store is persisted
SOURCE_DIR = BASE_DIR  # the .md files live in this same vector_build/ directory

# 2. Embedding model
embed_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/paraphrase-multilingual-mpnet-base-v2"
)

# 3. Load the Markdown files and split them into small chunks
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500, chunk_overlap=50
)
docs = []
# os.listdir() order is platform-dependent; sorting keeps chunk order (and the
# chunk ids derived below) identical from one run to the next.
for fname in sorted(os.listdir(SOURCE_DIR)):
    if not fname.endswith(".md"):
        continue
    with open(os.path.join(SOURCE_DIR, fname), "r", encoding="utf-8") as f:
        raw_text = f.read()
    for index, chunk in enumerate(text_splitter.split_text(raw_text)):
        docs.append({
            # Deterministic id: re-running the build overwrites the same
            # records instead of appending duplicates under new random ids.
            "id": f"{fname}:{index}",
            "text": chunk,
            "source": fname,
        })

# Stop with a clear message instead of handing empty lists to Chroma, which
# would otherwise fail with an obscure backend error.
if not docs:
    raise SystemExit(f"⚠️ 在 {SOURCE_DIR} 中没有找到可切分的 .md 内容,未生成向量库。")

print(f"🐣 共切分出 {len(docs)} 个文本块,准备向量化...")

# 4. Create the Chroma vector store
texts = [d["text"] for d in docs]
metas = [{"source": d["source"]} for d in docs]
ids = [d["id"] for d in docs]
vectordb = Chroma.from_texts(
    texts=texts,
    embedding=embed_model,
    metadatas=metas,
    ids=ids,
    persist_directory=PERSIST_DIR,
)
# NOTE(review): newer Chroma releases are documented to persist automatically
# and treat this call as deprecated; it is kept for older installations.
# Chunks left over from a file that has since shrunk are not removed here —
# delete PERSIST_DIR for a completely clean rebuild.
vectordb.persist()
print(f"🎉 向量库生成完毕,已保存在:{PERSIST_DIR}")