Spaces:

ljy5946
/

Smart_Learning_Assistant

Sleeping

App Files Files Community

ljy5946 committed on Jun 10

Commit

0fc94ca

verified ·

1 Parent(s): a4752c8

Upload build_index.py

Browse files

Files changed (1) hide show

build_index.py +68 -0

build_index.py ADDED Viewed

	@@ -0,0 +1,68 @@

+# build_index.py
+import os
+import glob
+import sys
+from langchain_community.document_loaders import TextLoader
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain_huggingface import HuggingFaceEmbeddings
+from langchain_chroma import Chroma
+def load_markdown_files(docs_dir):
+    """Load every Markdown file found recursively under ``docs_dir``.
+
+    Args:
+        docs_dir: Directory to scan. Relative paths are resolved against the
+            current working directory.
+
+    Returns:
+        A list containing the documents ``TextLoader`` produced for each
+        ``*.md`` file, visited in sorted path order.
+
+    Raises:
+        SystemExit: With status 1 when no Markdown file is found.
+    """
+    abs_docs_dir = os.path.abspath(docs_dir)
+    print(f"👉 正在扫描目录: {abs_docs_dir}")
+    pattern = os.path.join(abs_docs_dir, "**", "*.md")
+    # glob returns matches in arbitrary filesystem order and also matches
+    # directories whose name ends in ".md". Sort so repeated builds see the
+    # files in the same order, and keep regular files only so TextLoader is
+    # never handed a directory.
+    file_paths = sorted(
+        path for path in glob.glob(pattern, recursive=True) if os.path.isfile(path)
+    )
+    if not file_paths:
+        print("❌ 没有找到任何 Markdown 文件，请检查 docs_dir 配置！")
+        sys.exit(1)
+    docs = []
+    for path in file_paths:
+        loader = TextLoader(path, encoding="utf-8")
+        loaded = loader.load()
+        print(f"  - {os.path.basename(path)}: 加载 {len(loaded)} 段原始文档")
+        docs.extend(loaded)
+    print(f"✅ 总共加载 {len(docs)} 段原始文档")
+    return docs
+def split_documents(docs, chunk_size=1000, chunk_overlap=200):
+    """Split ``docs`` into chunks with ``RecursiveCharacterTextSplitter``.
+
+    Args:
+        docs: Documents to split.
+        chunk_size: Passed to the splitter as ``chunk_size``.
+        chunk_overlap: Passed to the splitter as ``chunk_overlap``.
+
+    Returns:
+        The list of chunk documents produced by the splitter.
+    """
+    splitter_options = {"chunk_size": chunk_size, "chunk_overlap": chunk_overlap}
+    new_docs = RecursiveCharacterTextSplitter(**splitter_options).split_documents(docs)
+    # Report how many source documents went in and how many chunks came out.
+    print(f"✅ 分片完成：从 {len(docs)} 段 -> {len(new_docs)} 个 chunk")
+    return new_docs
+def build_index(docs, persist_dir, model_name, batch_size=1000):
+    """Embed ``docs`` and write them to a persistent Chroma store.
+
+    Each chunk gets a deterministic id derived from its source path, its
+    position within that source, and its text. Re-running the build over
+    unchanged documents therefore overwrites the existing entries instead of
+    appending duplicates under fresh random ids. Entries for chunks that no
+    longer exist are not removed; delete ``persist_dir`` for a clean rebuild.
+
+    Args:
+        docs: Chunk documents to index.
+        persist_dir: Directory of the Chroma store; created if missing.
+        model_name: Hugging Face embedding model name.
+        batch_size: Maximum number of chunks sent to the store per write, so
+            a large corpus does not exceed the store's per-call batch limit.
+
+    Raises:
+        SystemExit: With status 1 when ``docs`` is empty.
+        ValueError: When ``batch_size`` is smaller than 1.
+    """
+    import hashlib  # local import keeps this function self-contained
+    if batch_size < 1:
+        raise ValueError("batch_size must be at least 1")
+    if not docs:
+        print("❌ 没有任何 chunk 可供写入，请检查前面步骤！")
+        sys.exit(1)
+    docs = list(docs)
+    os.makedirs(persist_dir, exist_ok=True)
+    emb = HuggingFaceEmbeddings(model_name=model_name)
+    db = Chroma(
+        persist_directory=persist_dir,
+        embedding_function=emb,
+    )
+    # Number chunks per source so two identical chunks from the same file
+    # still receive distinct ids within one batch.
+    next_position = {}
+    ids = []
+    for doc in docs:
+        source = str((doc.metadata or {}).get("source", ""))
+        position = next_position.get(source, 0)
+        next_position[source] = position + 1
+        key = f"{source}\x00{position}\x00{doc.page_content}"
+        ids.append(hashlib.sha256(key.encode("utf-8")).hexdigest())
+    print("👉 正在写入向量库（自动持久化）……")
+    for start in range(0, len(docs), batch_size):
+        stop = start + batch_size
+        db.add_documents(docs[start:stop], ids=ids[start:stop])
+    # Read the entry count straight from the underlying collection; this
+    # relies on a private attribute of the Chroma wrapper.
+    count = db._collection.count()
+    print(f"✅ 已写入 {count} 条 embeddings 到 '{persist_dir}'")
+def main():
+    """Run the full pipeline: load Markdown, split into chunks, build the index."""
+    markdown_root = "./markdown_docs"
+    store_path = "./vector_store"
+    embedding_model = "sentence-transformers/paraphrase-multilingual-mpnet-base-v2"
+    chunks = split_documents(
+        load_markdown_files(markdown_root),
+        chunk_size=1000,
+        chunk_overlap=200,
+    )
+    build_index(chunks, store_path, embedding_model)
+# Run the indexing pipeline only when executed as a script, not on import.
+if __name__ == "__main__":
+    main()