Spaces:
				
			
			
	
			
			
					
		Running
		
	
	
	
			
			
	
	
	
	
		
		
					
		Running
		
	Update retrival.py
Browse files- retrival.py +5 -13
    	
        retrival.py
    CHANGED
    
    | @@ -21,14 +21,16 @@ pytesseract.pytesseract.tesseract_cmd = (r'/usr/bin/tesseract') | |
| 21 | 
             
            # Configurations
         | 
| 22 | 
             
            UPLOAD_FOLDER = "./uploads"
         | 
| 23 | 
             
            VECTOR_DB_FOLDER = "./VectorDB"
         | 
|  | |
| 24 | 
             
            os.makedirs(UPLOAD_FOLDER, exist_ok=True)
         | 
| 25 | 
             
            os.makedirs(VECTOR_DB_FOLDER, exist_ok=True)
         | 
|  | |
| 26 |  | 
| 27 | 
             
            ########################################################################################################################################################
         | 
| 28 | 
             
            ####--------------------------------------------------------------  Document Loader  ---------------------------------------------------------------####
         | 
| 29 | 
             
            ########################################################################################################################################################
         | 
| 30 | 
             
            # Loaders for loading Document text, tables and images from any file format.
         | 
| 31 | 
            -
             | 
| 32 | 
             
            def load_document(data_path):
         | 
| 33 | 
             
                processed_documents = []
         | 
| 34 | 
             
                #element_content = []
         | 
| @@ -44,7 +46,7 @@ def load_document(data_path): | |
| 44 | 
             
                        try:
         | 
| 45 | 
             
                            # Determine the file type based on extension
         | 
| 46 | 
             
                            filename, file_extension = os.path.splitext(file.lower())
         | 
| 47 | 
            -
                            image_output = f" | 
| 48 | 
             
                            # Use specific partition techniques based on file extension
         | 
| 49 | 
             
                            if file_extension == ".pdf":
         | 
| 50 | 
             
                                elements = partition_pdf(
         | 
| @@ -217,11 +219,6 @@ def load_document(data_path): | |
| 217 | 
             
                        )
         | 
| 218 | 
             
                    )
         | 
| 219 |  | 
| 220 | 
            -
                # Output the grouped documents
         | 
| 221 | 
            -
                # for document in grouped_documents:
         | 
| 222 | 
            -
                #     print(document)
         | 
| 223 | 
            -
             | 
| 224 | 
            -
             | 
| 225 | 
             
                #Directory loader for loading the text data only to specific db
         | 
| 226 | 
             
                loader = DirectoryLoader(data_path, glob="*.*")
         | 
| 227 | 
             
                documents = loader.load()
         | 
| @@ -235,6 +232,7 @@ def load_document(data_path): | |
| 235 | 
             
                    doc.metadata.update({"filename":match.group(1)})
         | 
| 236 |  | 
| 237 | 
             
                return grouped_documents,documents,table_document
         | 
|  | |
| 238 | 
             
            #grouped_documents = load_document(data_path)
         | 
| 239 | 
             
            #documents,processed_documents,table_document = load_document(data_path)
         | 
| 240 |  | 
| @@ -307,12 +305,6 @@ async def save_to_chroma(chunks: list[Document], name: str, tables: list[Documen | |
| 307 | 
             
                    print("Error while saving to Chroma:", e)
         | 
| 308 | 
             
                    return None
         | 
| 309 |  | 
| 310 | 
            -
            # def get_unique_sources(chroma_path):
         | 
| 311 | 
            -
            #     db = Chroma(persist_directory=chroma_path)
         | 
| 312 | 
            -
            #     metadata_list = db.get()["metadatas"]
         | 
| 313 | 
            -
            #     unique_sources = {metadata["source"] for metadata in metadata_list if "source" in metadata}
         | 
| 314 | 
            -
            #     return list(unique_sources)
         | 
| 315 | 
            -
             | 
| 316 | 
             
            ########################################################################################################################################################
         | 
| 317 | 
             
            ####----------------------------------------------------------- Updating Existing Data in Vector DB  -----------------------------------------------####
         | 
| 318 | 
             
            ########################################################################################################################################################
         | 
|  | |
| 21 | 
             
            # Configurations
         | 
| 22 | 
             
            UPLOAD_FOLDER = "./uploads"
         | 
| 23 | 
             
            VECTOR_DB_FOLDER = "./VectorDB"
         | 
| 24 | 
            +
            IMAGE_DB_FOLDER = "./ImageDB"
         | 
| 25 | 
             
            os.makedirs(UPLOAD_FOLDER, exist_ok=True)
         | 
| 26 | 
             
            os.makedirs(VECTOR_DB_FOLDER, exist_ok=True)
         | 
| 27 | 
            +
            os.makedirs(IMAGE_DB_FOLDER, exist_ok=True)
         | 
| 28 |  | 
| 29 | 
             
            ########################################################################################################################################################
         | 
| 30 | 
             
            ####--------------------------------------------------------------  Document Loader  ---------------------------------------------------------------####
         | 
| 31 | 
             
            ########################################################################################################################################################
         | 
| 32 | 
             
            # Loaders for loading Document text, tables and images from any file format.
         | 
| 33 | 
            +
             | 
| 34 | 
             
            def load_document(data_path):
         | 
| 35 | 
             
                processed_documents = []
         | 
| 36 | 
             
                #element_content = []
         | 
|  | |
| 46 | 
             
                        try:
         | 
| 47 | 
             
                            # Determine the file type based on extension
         | 
| 48 | 
             
                            filename, file_extension = os.path.splitext(file.lower())
         | 
| 49 | 
            +
                            image_output = f"./ImageDB/{filename}/"
         | 
| 50 | 
             
                            # Use specific partition techniques based on file extension
         | 
| 51 | 
             
                            if file_extension == ".pdf":
         | 
| 52 | 
             
                                elements = partition_pdf(
         | 
|  | |
| 219 | 
             
                        )
         | 
| 220 | 
             
                    )
         | 
| 221 |  | 
|  | |
|  | |
|  | |
|  | |
|  | |
| 222 | 
             
                #Directory loader for loading the text data only to specific db
         | 
| 223 | 
             
                loader = DirectoryLoader(data_path, glob="*.*")
         | 
| 224 | 
             
                documents = loader.load()
         | 
|  | |
| 232 | 
             
                    doc.metadata.update({"filename":match.group(1)})
         | 
| 233 |  | 
| 234 | 
             
                return grouped_documents,documents,table_document
         | 
| 235 | 
            +
                
         | 
| 236 | 
             
            #grouped_documents = load_document(data_path)
         | 
| 237 | 
             
            #documents,processed_documents,table_document = load_document(data_path)
         | 
| 238 |  | 
|  | |
| 305 | 
             
                    print("Error while saving to Chroma:", e)
         | 
| 306 | 
             
                    return None
         | 
| 307 |  | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 308 | 
             
            ########################################################################################################################################################
         | 
| 309 | 
             
            ####----------------------------------------------------------- Updating Existing Data in Vector DB  -----------------------------------------------####
         | 
| 310 | 
             
            ########################################################################################################################################################
         |