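Before running the indexer below, it can help to confirm that the Ollama endpoint and embedding model actually respond. Here is a minimal sketch, assuming `OLLAMA_SERVER` is defined in a local `.env` (e.g. the URL of a locally running Ollama instance) and that the `nomic-embed-text` model has already been pulled:

```python
import os

from dotenv import load_dotenv
from langchain_ollama import OllamaEmbeddings

load_dotenv()

# Embed a throwaway string to verify the server URL and model name are valid.
probe = OllamaEmbeddings(
    base_url=os.getenv("OLLAMA_SERVER"),
    model="nomic-embed-text:latest",
)
vector = probe.embed_query("connectivity check")
print(f"Embedding dimension: {len(vector)}")
```

If this raises a connection error, fix the `.env` entry before indexing anything; the main script below reads the same variable.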
```python
from dotenv import load_dotenv
load_dotenv()

import os
from glob import glob

from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_ollama import OllamaEmbeddings
from langchain_chroma import Chroma
from langchain.schema import Document

# ─── CONFIG ───
DOCS_FOLDER = "docs/"                    # folder with .txt, .md, etc.
OLLAMA_URL = os.getenv("OLLAMA_SERVER")
EMBED_MODEL = "nomic-embed-text:latest"
PERSIST_DIR = "chroma_db/"               # on-disk Chroma store
CHUNK_SIZE = 2000
CHUNK_OVERLAP = 10
# ─────────────

def embed_all_docs():
    all_chunks = []
    files = glob(os.path.join(DOCS_FOLDER, "*.*"))

    for path in files:
        try:
            # 1) Try loading with UTF-8 + autodetect fallback
            loader = TextLoader(
                path,
                encoding="utf-8",
                autodetect_encoding=True
            )
            docs = loader.load()
        except UnicodeDecodeError:
            # 2) If that still fails, fall back to a lenient read
            print(f"⚠️ Decoding error on {path}, falling back to ignore-errors mode")
            with open(path, "r", encoding="utf-8", errors="ignore") as f:
                text = f.read()
            docs = [Document(page_content=text, metadata={"source": path})]

        # 3) Split into chunks
        splitter = RecursiveCharacterTextSplitter(
            chunk_size=CHUNK_SIZE,
            chunk_overlap=CHUNK_OVERLAP
        )
        chunks = splitter.split_documents(docs)
        print(f"✅ {len(chunks)} chunks from {os.path.basename(path)}")
        all_chunks.extend(chunks)

    # 4) Embed & persist on-disk Chroma
    embedder = OllamaEmbeddings(base_url=OLLAMA_URL, model=EMBED_MODEL)
    vectordb = Chroma(
        embedding_function=embedder,
        persist_directory=PERSIST_DIR,
        collection_name="my_docs"
    )
    vectordb.add_documents(all_chunks)
    print(f"✅ Persisted {len(all_chunks)} chunks to '{PERSIST_DIR}'")

if __name__ == "__main__":
    embed_all_docs()
```
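Once the store is built, it can be queried from a separate process. Below is a minimal retrieval sketch, assuming the same `OLLAMA_SERVER` variable, `chroma_db/` directory, and `my_docs` collection used above; the query string is purely illustrative:

```python
import os

from dotenv import load_dotenv
from langchain_chroma import Chroma
from langchain_ollama import OllamaEmbeddings

load_dotenv()

embedder = OllamaEmbeddings(
    base_url=os.getenv("OLLAMA_SERVER"),
    model="nomic-embed-text:latest",
)

# Re-open the persisted collection; no re-indexing happens here.
vectordb = Chroma(
    embedding_function=embedder,
    persist_directory="chroma_db/",
    collection_name="my_docs",
)

# Return the 3 chunks closest to the query, along with their source files.
for doc in vectordb.similarity_search("example question about the docs", k=3):
    print(doc.metadata.get("source"), "->", doc.page_content[:120])
```

The query is embedded with the same model that produced the stored vectors, so `EMBED_MODEL` must not change between indexing and retrieval.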