# build_index.py
import glob
import hashlib
import os
import sys

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_chroma import Chroma
from langchain_community.document_loaders import TextLoader
from langchain_huggingface import HuggingFaceEmbeddings
def load_markdown_files(docs_dir):
    """Load every Markdown file found under ``docs_dir``.

    The directory is searched recursively for ``*.md`` entries. Only regular
    files are loaded, and they are visited in sorted path order so that the
    progress log and the order of the returned documents are the same on
    every run.

    Args:
        docs_dir: Directory to scan. Relative paths are resolved against the
            current working directory.

    Returns:
        A flat list of everything returned by ``TextLoader(...).load()`` for
        each file, in path order.

    Raises:
        SystemExit: With exit code 1 when no Markdown file is found.
    """
    abs_docs_dir = os.path.abspath(docs_dir)
    print(f"👉 正在扫描目录: {abs_docs_dir}")

    pattern = os.path.join(abs_docs_dir, "**", "*.md")
    # glob yields matches in arbitrary order and also matches directories
    # whose name happens to end in ".md"; keep regular files only and sort
    # them so the run is reproducible.
    file_paths = sorted(
        path
        for path in glob.glob(pattern, recursive=True)
        if os.path.isfile(path)
    )
    if not file_paths:
        print("❌ 没有找到任何 Markdown 文件,请检查 docs_dir 配置!")
        sys.exit(1)

    docs = []
    for path in file_paths:
        loaded = TextLoader(path, encoding="utf-8").load()
        print(f" - {os.path.basename(path)}: 加载 {len(loaded)} 段原始文档")
        docs.extend(loaded)

    print(f"✅ 总共加载 {len(docs)} 段原始文档")
    return docs
def split_documents(docs, chunk_size=1000, chunk_overlap=200):
    """Split loaded documents into chunks and report the resulting count.

    Args:
        docs: Documents to split; passed unchanged to
            ``RecursiveCharacterTextSplitter.split_documents``.
        chunk_size: ``chunk_size`` setting handed to the splitter.
        chunk_overlap: ``chunk_overlap`` setting handed to the splitter.

    Returns:
        The list of chunks produced by the splitter.
    """
    splitter_options = {
        "chunk_size": chunk_size,
        "chunk_overlap": chunk_overlap,
    }
    chunks = RecursiveCharacterTextSplitter(**splitter_options).split_documents(docs)

    before, after = len(docs), len(chunks)
    print(f"✅ 分片完成:从 {before} 段 -> {after} 个 chunk")
    return chunks
# Upper bound on the number of chunks sent to the vector store per write.
# Chroma enforces a maximum batch size per call, so a large corpus has to be
# written in slices.
_WRITE_BATCH_SIZE = 1000


def _stable_chunk_ids(docs):
    """Return one deterministic, unique ID per chunk in ``docs``.

    An ID is the SHA-256 of the chunk's ``metadata["source"]`` (empty when
    absent) and its ``page_content``, followed by an occurrence counter so
    that identical chunks from the same source still get distinct IDs.
    """
    occurrences = {}
    ids = []
    for doc in docs:
        source = str((doc.metadata or {}).get("source", ""))
        hasher = hashlib.sha256()
        hasher.update(source.encode("utf-8"))
        hasher.update(b"\0")  # separator: keeps (source, content) unambiguous
        hasher.update(doc.page_content.encode("utf-8"))
        digest = hasher.hexdigest()

        seen = occurrences.get(digest, 0)
        occurrences[digest] = seen + 1
        ids.append(f"{digest}-{seen}")
    return ids


def build_index(docs, persist_dir, model_name):
    """Embed ``docs`` and write them to a Chroma store in ``persist_dir``.

    Chunks are written under content-derived IDs instead of random ones, so
    running the script again over unchanged documents overwrites the existing
    entries rather than appending a second copy of the whole corpus.

    NOTE(review): chunks written by earlier runs with random IDs, and chunks
    belonging to files that were since edited or deleted, are not removed
    here; clear ``persist_dir`` when a clean rebuild is required.

    Args:
        docs: Chunks to index; each must expose ``page_content`` and
            ``metadata``.
        persist_dir: Directory of the Chroma store; created when missing.
        model_name: Model name passed to ``HuggingFaceEmbeddings``.

    Raises:
        SystemExit: With exit code 1 when ``docs`` is empty.
    """
    if not docs:
        print("❌ 没有任何 chunk 可供写入,请检查前面步骤!")
        sys.exit(1)

    os.makedirs(persist_dir, exist_ok=True)
    emb = HuggingFaceEmbeddings(model_name=model_name)
    db = Chroma(
        persist_directory=persist_dir,
        embedding_function=emb,
    )

    print("👉 正在写入向量库(自动持久化)……")
    ids = _stable_chunk_ids(docs)
    for start in range(0, len(docs), _WRITE_BATCH_SIZE):
        stop = start + _WRITE_BATCH_SIZE
        db.add_documents(docs[start:stop], ids=ids[start:stop])

    # Read the total straight from the underlying collection.
    count = db._collection.count()
    print(f"✅ 已写入 {count} 条 embeddings 到 '{persist_dir}'")
def main(
    docs_dir="./markdown_docs",
    persist_dir="./vector_store",
    model_name="sentence-transformers/paraphrase-multilingual-mpnet-base-v2",
):
    """Run the full pipeline: load Markdown, split into chunks, build the index.

    All arguments default to the values the script has always used, so a bare
    ``main()`` behaves as before; callers may override them to index another
    directory or to use a different embedding model.

    Args:
        docs_dir: Directory scanned for Markdown files.
        persist_dir: Directory in which the vector store is persisted.
        model_name: Embedding model name passed to ``build_index``.
    """
    raw_docs = load_markdown_files(docs_dir)
    docs = split_documents(raw_docs, chunk_size=1000, chunk_overlap=200)
    build_index(docs, persist_dir, model_name)


if __name__ == "__main__":
    main()