Spaces:
				
			
			
	
			
			
		Runtime error
		
	
	
	
			
			
	
	
	
	
		
		
		Runtime error
		
	| from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter | |
| from langchain_community.document_loaders import UnstructuredFileLoader, DirectoryLoader | |
| from langchain_community.vectorstores import FAISS | |
| from langchain_community.embeddings import GPT4AllEmbeddings | |
# Module-level configuration.
pdf_data_path = "/data"  # directory scanned for source documents
vector_db_path = "vectorstores/db_faiss"  # where the FAISS index is saved
# Function 1: build a vector DB from a single piece of text.
def create_db_from_text():
    """Build a FAISS vector store from a hard-coded text and save it.

    The text is split on newlines with ``CharacterTextSplitter``, the
    resulting chunks are embedded with ``GPT4AllEmbeddings`` and indexed
    with ``FAISS.from_texts``. The index is then written to
    ``vector_db_path``.

    Returns:
        The FAISS vector store built from the text chunks.
    """
    raw_text = """
Thinh created you who is a chatbox at Resvu,
"""

    # Split the text into small chunks.
    text_splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=100,
        chunk_overlap=20,
        length_function=len,
    )
    chunks = text_splitter.split_text(raw_text)

    # Embedding model.
    embedding_model = GPT4AllEmbeddings(model_name="all-MiniLM-L6-v2.gguf2.f16.gguf")

    # Index the chunks in a FAISS vector DB and persist it to disk.
    db = FAISS.from_texts(texts=chunks, embedding=embedding_model)
    db.save_local(vector_db_path)
    return db
# Glob patterns for the file types to load from the data directory.
file_types = ["*.pdf", "*.txt", "*.doc", "*.docx"]
def create_db_from_files():
    """Build a FAISS vector store from the files in ``pdf_data_path``.

    One ``DirectoryLoader`` (using ``UnstructuredFileLoader``) is run per
    glob pattern in ``file_types``. The loaded documents are split with
    ``RecursiveCharacterTextSplitter``, embedded with ``GPT4AllEmbeddings``
    and indexed with ``FAISS.from_documents``. The index is then written
    to ``vector_db_path``.

    Returns:
        The FAISS vector store built from the document chunks.

    Raises:
        ValueError: If no document matching ``file_types`` was loaded.
    """
    # Scan the data directory once per file type and collect every document.
    documents = []
    for file_type in file_types:
        loader = DirectoryLoader(
            pdf_data_path,
            glob=file_type,
            loader_cls=UnstructuredFileLoader,
        )
        documents.extend(loader.load())

    # Fail early with a clear message: an empty document list would
    # otherwise be handed to FAISS, which cannot build an index from it.
    if not documents:
        raise ValueError(
            f"No documents matching {file_types} found in {pdf_data_path!r}"
        )

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=200, chunk_overlap=50)
    chunks = text_splitter.split_documents(documents)

    # Embedding model.
    embedding_model = GPT4AllEmbeddings(model_name="all-MiniLM-L6-v2.gguf2.f16.gguf")

    # Index the chunks in a FAISS vector DB and persist it to disk.
    db = FAISS.from_documents(chunks, embedding_model)
    db.save_local(vector_db_path)
    return db