RAG_AI_V2

Running

App Files Files Community

WebashalarForML commited on Jan 30

Commit

5eb0b04

verified ·

1 Parent(s): 5c6971a

Update N.TXT

Browse files

Files changed (1) hide show

N.TXT +427 -0

N.TXT CHANGED Viewed

@@ -296,4 +296,431 @@ def update_db(db_name):
 if __name__ == "__main__":
     app.run(debug=False, use_reloader=False)

 if __name__ == "__main__":
     app.run(debug=False, use_reloader=False)
+RETRIVAL PY
+from langchain_community.document_loaders import DirectoryLoader
+from langchain.embeddings import HuggingFaceEmbeddings
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain.schema import Document
+from langchain_core.documents import Document
+from langchain_community.vectorstores import Chroma
+import os
+import shutil
+import asyncio
+from unstructured.partition.pdf import partition_pdf
+from unstructured.partition.auto import partition
+import pytesseract
+import os
+import re
+import uuid
+from collections import defaultdict
+pytesseract.pytesseract.tesseract_cmd = (r'/usr/bin/tesseract')
+# Configurations
+UPLOAD_FOLDER = "./uploads"
+VECTOR_DB_FOLDER = "./VectorDB"
+IMAGE_DB_FOLDER = "./Images"
+os.makedirs(UPLOAD_FOLDER, exist_ok=True)
+os.makedirs(VECTOR_DB_FOLDER, exist_ok=True)
+########################################################################################################################################################
+####--------------------------------------------------------------  Documnet Loader  ---------------------------------------------------------------####
+########################################################################################################################################################
+# Loaders for loading Document text, tables and images from any file format.
+#data_path=r"H:\DEV PATEL\2025\RAG Project\test_data\google data"
+def load_document(data_path):
+    processed_documents = []
+    element_content = []
+    table_document = []
+    #having different process for the pdf
+    for root, _, files in os.walk(data_path):
+        for file in files:
+            file_path = os.path.join(root, file)
+            doc_id = str(uuid.uuid4())  # Generate a unique ID for the document
+            print(f"Processing document ID: {doc_id}, Path: {file_path}")
+            try:
+                # Determine the file type based on extension
+                filename, file_extension = os.path.splitext(file.lower())
+                image_output = f"./Images/{filename}/"
+                # Use specific partition techniques based on file extension
+                if file_extension == ".pdf":
+                    elements = partition_pdf(
+                        filename=file_path,
+                        strategy="hi_res",  # Use layout detection
+                        infer_table_structure=True,
+                        hi_res_model_name="yolox",
+                        extract_images_in_pdf=True,
+                        extract_image_block_types=["Image","Table"],
+                        extract_image_block_output_dir=image_output,
+                        show_progress=True,
+                        #chunking_strategy="by_title",
+                    )
+                else:
+                    # Default to auto partition if no specific handler is found
+                    elements = partition(
+                        filename=file_path,
+                        strategy="hi_res",
+                        infer_table_structure=True,
+                        show_progress=True,
+                        #chunking_strategy="by_title"
+                    )
+            except Exception as e:
+                print(f"Failed to process document {file_path}: {e}")
+                continue
+            categorized_content = {
+                "tables": {"content": [], "Metadata": []},
+                "images": {"content": [], "Metadata": []},
+                "text": {"content": [], "Metadata": []},
+                "text2": {"content": [], "Metadata": []}
+            }
+            element_content.append(elements)
+            CNT=1
+            for chunk in elements:
+                # Safely extract metadata and text
+                chunk_type = str(type(chunk))
+                chunk_metadata = chunk.metadata.to_dict() if chunk.metadata else {}
+                chunk_text = getattr(chunk, "text", None)
+                # Separate content into categories
+                #if "Table" in chunk_type:
+                if any(
+                    keyword in chunk_type
+                    for keyword in [
+                        "Table",
+                        "TableChunk"]):
+                    categorized_content["tables"]["content"].append(chunk_text)
+                    categorized_content["tables"]["Metadata"].append(chunk_metadata)
+                    #test1
+                    TABLE_DATA=f"Table number {CNT} "+chunk_metadata.get("text_as_html", "")+" "
+                    CNT+=1
+                    categorized_content["text"]["content"].append(TABLE_DATA)
+                    categorized_content["text"]["Metadata"].append(chunk_metadata)
+                elif "Image" in chunk_type:
+                    categorized_content["images"]["content"].append(chunk_text)
+                    categorized_content["images"]["Metadata"].append(chunk_metadata)
+                elif any(
+                    keyword in chunk_type
+                    for keyword in [
+                        "CompositeElement",
+                        "Text",
+                        "NarrativeText",
+                        "Title",
+                        "Header",
+                        "Footer",
+                        "FigureCaption",
+                        "ListItem",
+                        "UncategorizedText",
+                        "Formula",
+                        "CodeSnippet",
+                        "Address",
+                        "EmailAddress",
+                        "PageBreak",
+                    ]
+                ):
+                    categorized_content["text"]["content"].append(chunk_text)
+                    categorized_content["text"]["Metadata"].append(chunk_metadata)
+                else:
+                    continue
+            # Append processed document
+            processed_documents.append({
+                "doc_id": doc_id,
+                "source": file_path,
+                **categorized_content,
+            })
+    # Loop over tables and match text from the same document and page
+    '''
+    for doc in processed_documents:
+        cnt=1 # count for storing number of the table
+        for table_metadata in doc.get("tables", {}).get("Metadata", []):
+            page_number = table_metadata.get("page_number")
+            source = doc.get("source")
+            page_content = ""
+            for text_metadata, text_content in zip(
+                doc.get("text", {}).get("Metadata", []),
+                doc.get("text", {}).get("content", [])
+            ):
+                page_number2 = text_metadata.get("page_number")
+                source2 = doc.get("source")
+                if source == source2 and page_number == page_number2:
+                    print(f"Matching text found for source: {source}, page: {page_number}")
+                    page_content += f"{text_content} "  # Concatenate text with a space
+            # Add the matched content to the table metadata
+            table_metadata["page_content"] =f"Table number {cnt} "+table_metadata.get("text_as_html", "")+" "+page_content.strip()  # Remove trailing spaces and have the content proper here
+            table_metadata["text_as_html"] = table_metadata.get("text_as_html", "") # we are also storing it seperatly
+            table_metadata["Table_number"] = cnt  # addiing the table number it will be use in retrival
+            cnt+=1
+            # Custom loader of document which will store the table along with the text on that page specifically
+            # making document of each table with its content
+            unique_id = str(uuid.uuid4())
+            table_document.append(
+                Document(
+                    id =unique_id, # Add doc_id directly
+                    page_content=table_metadata.get("page_content", ""),  # Get page_content from metadata, default to empty string if missing
+                    metadata={
+                        "source": doc["source"],
+                        "text_as_html": table_metadata.get("text_as_html", ""),
+                        "filetype": table_metadata.get("filetype", ""),
+                        "page_number": str(table_metadata.get("page_number", 0)),  # Default to 0 if missing
+                        "image_path": table_metadata.get("image_path", ""),
+                        "file_directory": table_metadata.get("file_directory", ""),
+                        "filename": table_metadata.get("filename", ""),
+                        "Table_number": str(table_metadata.get("Table_number", 0))  # Default to 0 if missing
+                    }
+                )
+            )
+    '''
+    # Initialize a structure to group content by doc_id
+    grouped_by_doc_id = defaultdict(lambda: {
+        "text_content": [],
+        "metadata": None,  # Metadata will only be set once per doc_id
+    })
+    for doc in processed_documents:
+        doc_id = doc.get("doc_id")
+        source = doc.get("source")
+        text_content = doc.get("text", {}).get("content", [])
+        metadata_list = doc.get("text", {}).get("Metadata", [])
+        # Merge text content
+        grouped_by_doc_id[doc_id]["text_content"].extend(text_content)
+        # Set metadata (if not already set)
+        if grouped_by_doc_id[doc_id]["metadata"] is None and metadata_list:
+            metadata = metadata_list[0]  # Assuming metadata is consistent
+            grouped_by_doc_id[doc_id]["metadata"] = {
+                "source": source,
+                "filetype": metadata.get("filetype"),
+                "file_directory": metadata.get("file_directory"),
+                "filename": metadata.get("filename"),
+                "languages": str(metadata.get("languages")),
+            }
+    # Convert grouped content into Document objects
+    grouped_documents = []
+    for doc_id, data in grouped_by_doc_id.items():
+        grouped_documents.append(
+            Document(
+                id=doc_id,
+                page_content=" ".join(data["text_content"]).strip(),
+                metadata=data["metadata"],
+            )
+        )
+    # Output the grouped documents
+    for document in grouped_documents:
+        print(document)
+    #Dirctory loader for loading the text data only to specific db
+    '''
+    loader = DirectoryLoader(data_path, glob="*.*")
+    documents = loader.load()
+    # update the metadata adding filname to the met
+    for doc in documents:
+        unique_id = str(uuid.uuid4())
+        doc.id = unique_id
+        path=doc.metadata.get("source")
+        match = re.search(r'([^\\]+\.[^\\]+)$', path)
+        doc.metadata.update({"filename":match.group(1)})
+    return documents,
+    '''
+    return grouped_documents
+#documents,processed_documents,table_document = load_document(data_path)
+########################################################################################################################################################
+####-------------------------------------------------------------- Chunking the Text  --------------------------------------------------------------####
+########################################################################################################################################################
+def split_text(documents: list[Document]):
+    text_splitter = RecursiveCharacterTextSplitter(
+        chunk_size=1000,
+        chunk_overlap=500,
+        length_function=len,
+        add_start_index=True,
+    )
+    chunks = text_splitter.split_documents(documents) # splitting the document into chunks
+    for index in chunks:
+        index.metadata["start_index"]=str(index.metadata["start_index"]) # the converstion of int metadata to str was done to store it in sqlite3
+    print(f"Split {len(documents)} documents into {len(chunks)} chunks.")
+    return chunks
+########################################################################################################################################################
+####---------------------------------------------------- Creating and Storeing Data in Vector DB  --------------------------------------------------####
+########################################################################################################################################################
+#def save_to_chroma(chunks: list[Document], name: str, tables: list[Document]):
+def save_to_chroma(chunks: list[Document], name: str):
+    CHROMA_PATH = f"./VectorDB/chroma_{name}"
+    #TABLE_PATH = f"./TableDB/chroma_{name}"
+    if os.path.exists(CHROMA_PATH):
+        shutil.rmtree(CHROMA_PATH)
+    # if os.path.exists(TABLE_PATH):
+    #     shutil.rmtree(TABLE_PATH)
+    try:
+        # Load the embedding model
+        embedding_function = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
+        #embedding_function = HuggingFaceEmbeddings(model_name="mixedbread-ai/mxbai-embed-large-v1")
+        # Create Chroma DB for documents using from_documents [NOTE: Some of the data is converted to string because int and float show null if added]
+        print("Creating document vector database...")
+        db = Chroma.from_documents(
+            documents=chunks,
+            embedding=embedding_function,
+            persist_directory=CHROMA_PATH,
+        )
+        print("Document database successfully saved.")
+        # # Create Chroma DB for tables if available [NOTE: Some of the data is converted to string because int and float show null if added]
+        # if tables:
+        #     print("Creating table vector database...")
+        #     tdb = Chroma.from_documents(
+        #         documents=tables,
+        #         embedding=embedding_function,
+        #         persist_directory=TABLE_PATH,
+        #     )
+        #     print("Table database successfully saved.")
+        # else:
+        #     tdb = None
+        #return db, tdb
+        return db
+    except Exception as e:
+        print("Error while saving to Chroma:", e)
+        return None
+# def get_unique_sources(chroma_path):
+#     db = Chroma(persist_directory=chroma_path)
+#     metadata_list = db.get()["metadatas"]
+#     unique_sources = {metadata["source"] for metadata in metadata_list if "source" in metadata}
+#     return list(unique_sources)
+########################################################################################################################################################
+####----------------------------------------------------------- Updating Existing Data in Vector DB  -----------------------------------------------####
+########################################################################################################################################################
+# def add_document_to_existing_db(new_documents: list[Document], db_name: str):
+#     CHROMA_PATH = f"./VectorDB/chroma_{db_name}"
+#     if not os.path.exists(CHROMA_PATH):
+#         print(f"Database '{db_name}' does not exist. Please create it first.")
+#         return
+#     try:
+#         embedding_function = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
+#         #embedding_function = HuggingFaceEmbeddings(model_name="mixedbread-ai/mxbai-embed-large-v1")
+#         db = Chroma(persist_directory=CHROMA_PATH, embedding_function=embedding_function)
+#         print("Adding new documents to the existing database...")
+#         chunks = split_text(new_documents)
+#         db.add_documents(chunks)
+#         db.persist()
+#         print("New documents added and database updated successfully.")
+#     except Exception as e:
+#         print("Error while adding documents to existing database:", e)
+# def delete_chunks_by_source(chroma_path, source_to_delete):
+#     if not os.path.exists(chroma_path):
+#         print(f"Database at path '{chroma_path}' does not exist.")
+#         return
+#     try:
+#         #embedding_function = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
+#         embedding_function = HuggingFaceEmbeddings(model_name="mixedbread-ai/mxbai-embed-large-v1")
+#         db = Chroma(persist_directory=chroma_path, embedding_function=embedding_function)
+#         print(f"Retrieving all metadata to identify chunks with source '{source_to_delete}'...")
+#         metadata_list = db.get()["metadatas"]
+#         # Identify indices of chunks to delete
+#         indices_to_delete = [
+#             idx for idx, metadata in enumerate(metadata_list) if metadata.get("source") == source_to_delete
+#         ]
+#         if not indices_to_delete:
+#             print(f"No chunks found with source '{source_to_delete}'.")
+#             return
+#         print(f"Deleting {len(indices_to_delete)} chunks with source '{source_to_delete}'...")
+#         db.delete(indices=indices_to_delete)
+#         db.persist()
+#         print("Chunks deleted and database updated successfully.")
+#     except Exception as e:
+#         print(f"Error while deleting chunks by source: {e}")
+# # update a data store
+# def update_data_store(file_path, db_name):
+#     CHROMA_PATH = f"./VectorDB/chroma_{db_name}"
+#     print(f"Filepath ===> {file_path}  DB Name ====> {db_name}")
+#     try:
+#         documents,table_document = load_document(file_path)
+#         print("Documents loaded successfully.")
+#     except Exception as e:
+#         print(f"Error loading documents: {e}")
+#         return
+#     try:
+#         chunks = split_text(documents)
+#         print(f"Text split into {len(chunks)} chunks.")
+#     except Exception as e:
+#         print(f"Error splitting text: {e}")
+#         return
+#     try:
+#         asyncio.run(save_to_chroma(save_to_chroma(chunks, db_name, table_document)))
+#         print(f"Data saved to Chroma for database {db_name}.")
+#     except Exception as e:
+#         print(f"Error saving to Chroma: {e}")
+#         return
+########################################################################################################################################################
+####------------------------------------------------------- Combine Process of Load, Chunk and Store  ----------------------------------------------####
+########################################################################################################################################################
+def generate_data_store(file_path, db_name):
+    CHROMA_PATH = f"./VectorDB/chroma_{db_name}"
+    print(f"Filepath ===> {file_path}  DB Name ====> {db_name}")
+    try:
+        #documents,grouped_documents = load_document(file_path)
+        grouped_documents = load_document(file_path)
+        print("Documents loaded successfully.")
+    except Exception as e:
+        print(f"Error loading documents: {e}")
+        return
+    try:
+        chunks = split_text(grouped_documents)
+        print(f"Text split into {len(chunks)} chunks.")
+    except Exception as e:
+        print(f"Error splitting text: {e}")
+        return
+    try:
+        #asyncio.run(save_to_chroma(save_to_chroma(chunks, db_name, table_document)))
+        asyncio.run(save_to_chroma(chunks, db_name))
+        print(f"Data saved to Chroma for database {db_name}.")
+    except Exception as e:
+        print(f"Error saving to Chroma: {e}")
+        return