cd@bziiit.com
		
	committed on
		
		
					Commit 
							
							·
						
						a3d26e6
	
1
								Parent(s):
							
							4e6d9da
								
First commit
Browse files- .gitignore +4 -0
 - app.py +45 -0
 - pages/chatbot.py +36 -0
 - pages/documents.py +35 -0
 - pages/form.py +6 -0
 - pages/persistent_documents.py +35 -0
 - pages/prompt_system.py +12 -0
 - prompt_template.py +8 -0
 - rag.py +100 -0
 - requirements.txt +19 -0
 - vectore_store/ConnectorStrategy.py +14 -0
 - vectore_store/PineconeConnector.py +110 -0
 - vectore_store/VectoreStoreManager.py +15 -0
 - vectore_store/__init__.py +0 -0
 
    	
        .gitignore
    ADDED
    
    | 
         @@ -0,0 +1,4 @@ 
     | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
| 
         | 
|
| 1 | 
         
            +
            */__pycache__/*
         
     | 
| 2 | 
         
            +
            __pycache__
         
     | 
| 3 | 
         
            +
             
     | 
| 4 | 
         
            +
            .env
         
     | 
    	
        app.py
    ADDED
    
    | 
         @@ -0,0 +1,45 @@ 
     | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
| 
         | 
|
| 1 | 
         
            +
            import streamlit as st
         
     | 
| 2 | 
         
            +
            import dotenv
         
     | 
| 3 | 
         
            +
            import os
         
     | 
| 4 | 
         
            +
             
     | 
| 5 | 
         
            +
            from rag import Rag
         
     | 
| 6 | 
         
            +
            from vectore_store.PineconeConnector import PineconeConnector
         
     | 
| 7 | 
         
            +
            from vectore_store.VectoreStoreManager import VectoreStoreManager
         
     | 
| 8 | 
         
            +
             
     | 
| 9 | 
         
            +
            GROUP_NAME = "Groupe 1"
         
     | 
| 10 | 
         
            +
             
     | 
| 11 | 
         
            +
            def main():
                """Seed the session state, configure the page and run the navigation.

                On a fresh session this stores an empty chat history under
                ``"messages"`` and a ``Rag`` assistant (wired to a Pinecone-backed
                ``VectoreStoreManager``) under ``"assistant"``. It then declares the
                five pages of the app and hands control to the selected one.
                """
                # Seed each key on its own instead of testing for a completely empty
                # session state: if building the assistant fails after "messages" was
                # stored, an emptiness test would skip the set-up on every later rerun
                # and leave the pages without an assistant.
                if "messages" not in st.session_state:
                    st.session_state["messages"] = []

                if "assistant" not in st.session_state:
                    # Define the vector store strategy (Pinecone) used for shared documents.
                    pinecone_connector = PineconeConnector()
                    vs_manager = VectoreStoreManager(pinecone_connector)
                    st.session_state["assistant"] = Rag(vectore_store=vs_manager)

                st.set_page_config(page_title=GROUP_NAME)

                st.title(GROUP_NAME)

                prompt_system = st.Page("pages/prompt_system.py", title="Prompt système", icon="📋", default=True)
                saved_documents = st.Page("pages/persistent_documents.py", title="Documents Communs", icon="📋")
                documents = st.Page("pages/documents.py", title="Documents", icon="📋")
                form = st.Page("pages/form.py", title="Formulaire", icon="📋")
                chatbot = st.Page("pages/chatbot.py", title="Chatbot", icon="📋")

                # The list order is the order passed to the navigation; the prompt
                # page is the one flagged as default above.
                pg = st.navigation(
                    [
                        saved_documents,
                        prompt_system,
                        documents,
                        form,
                        chatbot,
                    ]
                )

                pg.run()
         
     | 
| 42 | 
         
            +
             
     | 
| 43 | 
         
            +
             
     | 
| 44 | 
         
            +
            if __name__ == "__main__":
         
     | 
| 45 | 
         
            +
                main()
         
     | 
    	
        pages/chatbot.py
    ADDED
    
    | 
         @@ -0,0 +1,36 @@ 
     | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
| 
         | 
|
| 1 | 
         
            +
            import streamlit as st
         
     | 
| 2 | 
         
            +
            from streamlit_chat import message
         
     | 
| 3 | 
         
            +
             
     | 
| 4 | 
         
            +
            def display_messages():
                """Render the stored chat history and reserve a slot for the spinner.

                Each history entry is a ``(text, is_user)`` pair; its position in the
                list is used as the widget key.
                """
                history = st.session_state["messages"]
                for position, entry in enumerate(history):
                    text, from_user = entry
                    message(text, is_user=from_user, key=str(position))

                # Empty placeholder stored in the session so the input callback can
                # show its spinner inside it.
                spinner_slot = st.empty()
                st.session_state["thinking_spinner"] = spinner_slot
         
     | 
| 8 | 
         
            +
             
     | 
| 9 | 
         
            +
             
     | 
| 10 | 
         
            +
            def process_input():
                """Send the text typed by the user to the assistant and record the exchange.

                Used as the ``on_change`` callback of the chat text input. Blank or
                whitespace-only input is ignored. Otherwise the question and the
                assistant's answer are appended to ``st.session_state["messages"]`` as
                ``(text, is_user)`` pairs.
                """
                user_text = (st.session_state["user_input"] or "").strip()
                if not user_text:
                    return

                # Make sure the history exists before it is both read and appended to;
                # guarding only the read would raise KeyError after the answer has
                # already been computed.
                if "messages" not in st.session_state:
                    st.session_state["messages"] = []
                history = st.session_state["messages"]

                with st.session_state["thinking_spinner"], st.spinner("Je réfléchis"):
                    # The assistant receives the history as it was before this exchange.
                    agent_text = st.session_state["assistant"].ask(user_text, history)

                history.append((user_text, True))
                history.append((agent_text, False))
         
     | 
| 20 | 
         
            +
             
     | 
| 21 | 
         
            +
             
     | 
| 22 | 
         
            +
            def page():
                """Render the chatbot page: current system prompt, history and input box."""
                st.subheader("Posez vos questions")

                if "assistant" not in st.session_state:
                    st.text("Assistant non initialisé")
                    # Without an assistant the input callback cannot answer, so do
                    # not render the rest of the page.
                    return

                # Fall back to a hint when no system prompt has been entered yet
                # (missing key or empty value).
                prompt_sys = st.session_state.get("prompt_system") or "Renseignez votre prompt system"

                st.text("Prompt system : " + prompt_sys)

                display_messages()
                st.text_input("Message", key="user_input", on_change=process_input)
         
     | 
| 35 | 
         
            +
             
     | 
| 36 | 
         
            +
            page()
         
     | 
    	
        pages/documents.py
    ADDED
    
    | 
         @@ -0,0 +1,35 @@ 
     | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
| 
         | 
|
| 1 | 
         
            +
            import os
         
     | 
| 2 | 
         
            +
            import tempfile
         
     | 
| 3 | 
         
            +
            import streamlit as st
         
     | 
| 4 | 
         
            +
             
     | 
| 5 | 
         
            +
            def read_and_save_file():
                """Ingest every uploaded PDF into the session's assistant.

                Used as the ``on_change`` callback of the ``"file_uploader"`` widget.
                The chat history and the pending chat input are reset first. Each
                upload is then written to a temporary file, because the assistant's
                ``ingest`` method takes a file path, and the temporary file is removed
                afterwards.
                """
                st.session_state["messages"] = []
                st.session_state["user_input"] = ""

                for file in st.session_state["file_uploader"]:
                    # delete=False: the file must survive the ``with`` block so it can
                    # be reopened by path during ingestion.
                    with tempfile.NamedTemporaryFile(delete=False) as tf:
                        tf.write(file.getbuffer())
                        file_path = tf.name

                    try:
                        with st.session_state["ingestion_spinner"], st.spinner(f"Chargement {file.name}"):
                            st.session_state["assistant"].ingest(file_path)
                    finally:
                        # Always clean up, even when ingestion raises, so failed
                        # uploads do not accumulate in the temp directory.
                        os.remove(file_path)
         
     | 
| 17 | 
         
            +
             
     | 
| 18 | 
         
            +
             
     | 
| 19 | 
         
            +
             
     | 
| 20 | 
         
            +
            def page():
                """Render the page used to upload PDF documents for the current session."""
                st.subheader("Charger vos documents")

                # Upload widget: PDF only, several files allowed; the callback runs
                # whenever the selection changes.
                uploader_options = {
                    "type": ["pdf"],
                    "key": "file_uploader",
                    "accept_multiple_files": True,
                    "on_change": read_and_save_file,
                }
                st.file_uploader("Télécharger un ou plusieurs documents", **uploader_options)

                # Placeholder kept in the session so the callback can show its spinner here.
                st.session_state["ingestion_spinner"] = st.empty()
         
     | 
| 34 | 
         
            +
             
     | 
| 35 | 
         
            +
            page()
         
     | 
    	
        pages/form.py
    ADDED
    
    | 
         @@ -0,0 +1,6 @@ 
     | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
| 
         | 
|
| 1 | 
         
            +
            import streamlit as st
         
     | 
| 2 | 
         
            +
             
     | 
| 3 | 
         
            +
            def page():
                """Render the settings page; for now it only shows its heading."""
                heading = "Définissez vos paramètres"
                st.subheader(heading)
         
     | 
| 5 | 
         
            +
             
     | 
| 6 | 
         
            +
            page()
         
     | 
    	
        pages/persistent_documents.py
    ADDED
    
    | 
         @@ -0,0 +1,35 @@ 
     | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
| 
         | 
|
| 1 | 
         
            +
            import os
         
     | 
| 2 | 
         
            +
            import tempfile
         
     | 
| 3 | 
         
            +
            import streamlit as st
         
     | 
| 4 | 
         
            +
             
     | 
| 5 | 
         
            +
            def uploadToDb():
                """Push every uploaded PDF to the shared vector store.

                Used as the ``on_change`` callback of the ``"file_uploader_commun"``
                widget. Each upload is written to a temporary file, passed to the
                assistant's ``ingestToDb`` together with its original file name, and
                the temporary file is removed afterwards.
                """
                for file in st.session_state["file_uploader_commun"]:
                    # delete=False: the file must survive the ``with`` block so it can
                    # be reopened by path during ingestion.
                    with tempfile.NamedTemporaryFile(delete=False) as tf:
                        tf.write(file.getbuffer())
                        file_path = tf.name

                    try:
                        with st.session_state["ingestion_spinner"], st.spinner(f"Chargement {file.name}"):
                            st.session_state["assistant"].ingestToDb(file_path, filename=file.name)
                    finally:
                        # Always clean up, even when ingestion raises, so failed
                        # uploads do not accumulate in the temp directory.
                        os.remove(file_path)
         
     | 
| 15 | 
         
            +
             
     | 
| 16 | 
         
            +
            def page():
                """Render the shared-documents page: upload widget and stored document list."""
                st.subheader("Montez des documents communs")

                st.file_uploader(
                    "Télécharger un documents",
                    type=["pdf"],
                    key="file_uploader_commun",
                    accept_multiple_files=True,
                    on_change=uploadToDb,
                )

                # Placeholder kept in the session so the callback can show its spinner here.
                st.session_state["ingestion_spinner"] = st.empty()

                st.divider()
                # Bold is expressed in Markdown; a ``bold=True`` keyword passed to
                # ``st.write`` does not style the text.
                st.markdown("**Documents dans la base de données**")

                for doc in st.session_state["assistant"].vector_store.getDocs():
                    st.write(f" - {doc}")
         
     | 
| 34 | 
         
            +
                
         
     | 
| 35 | 
         
            +
            page()
         
     | 
    	
        pages/prompt_system.py
    ADDED
    
    | 
         @@ -0,0 +1,12 @@ 
     | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
| 
         | 
|
| 1 | 
         
            +
            import streamlit as st
         
     | 
| 2 | 
         
            +
             
     | 
| 3 | 
         
            +
            def page():
                """Render the system-prompt editor and keep its content in the session."""
                st.subheader("Renseignez votre prompt system")

                # Pre-fill the editor with the previously stored prompt, if any.
                if 'prompt_system' in st.session_state:
                    current_value = st.session_state.prompt_system
                else:
                    current_value = ""

                # Store whatever the editor currently holds back into the session.
                st.session_state['prompt_system'] = st.text_area("Prompt system", current_value)
         
     | 
| 11 | 
         
            +
             
     | 
| 12 | 
         
            +
            page()
         
     | 
    	
        prompt_template.py
    ADDED
    
    | 
         @@ -0,0 +1,8 @@ 
     | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
| 
         | 
|
| 1 | 
         
            +
            # Prompt skeleton with four placeholders: {commonContext} (shared
            # documents), {documentContext} (reference document), {messages}
            # (conversation history) and {query} (the user's request).
            base_template = '''

            Documents partagées : {commonContext}
            Document de référence : {documentContext}

            Voici l'historique des messages : {messages}
            Les attentes de l'utilisateur sont : {query}
            '''
         
     | 
    	
        rag.py
    ADDED
    
    | 
         @@ -0,0 +1,100 @@ 
     | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
| 
         | 
|
| 1 | 
         
            +
            import os
         
     | 
| 2 | 
         
            +
             
     | 
| 3 | 
         
            +
            from dotenv import load_dotenv
         
     | 
| 4 | 
         
            +
            from langchain_community.vectorstores import FAISS
         
     | 
| 5 | 
         
            +
            from langchain_mistralai.chat_models import ChatMistralAI
         
     | 
| 6 | 
         
            +
            from langchain_mistralai.embeddings import MistralAIEmbeddings
         
     | 
| 7 | 
         
            +
            from langchain.schema.output_parser import StrOutputParser
         
     | 
| 8 | 
         
            +
            from langchain_community.document_loaders import PyPDFLoader
         
     | 
| 9 | 
         
            +
            from langchain.text_splitter import RecursiveCharacterTextSplitter
         
     | 
| 10 | 
         
            +
            from langchain.schema.runnable import RunnablePassthrough
         
     | 
| 11 | 
         
            +
            from langchain.prompts import PromptTemplate
         
     | 
| 12 | 
         
            +
            from langchain_community.vectorstores.utils import filter_complex_metadata
         
     | 
| 13 | 
         
            +
            #add new import
         
     | 
| 14 | 
         
            +
            from langchain_community.document_loaders.csv_loader import CSVLoader
         
     | 
| 15 | 
         
            +
             
     | 
| 16 | 
         
            +
            from prompt_template import base_template
         
     | 
| 17 | 
         
            +
             
     | 
| 18 | 
         
            +
             
     | 
| 19 | 
         
            +
            # Load variables from a local .env file (local development).
            load_dotenv()
            # Mistral API key read from the environment; None when the variable is unset.
            env_api_key = os.environ.get("MISTRAL_API_KEY")
            # Name of the chat model passed to ChatMistralAI.
            llm_model = "open-mixtral-8x7b"
         
     | 
| 23 | 
         
            +
             
     | 
| 24 | 
         
            +
            class Rag:
                """Question-answering assistant built on a Mistral chat model.

                A PDF ingested with ``ingest`` is indexed in an in-memory FAISS store
                and queried by ``ask``. PDFs ingested with ``ingestToDb`` are chunked
                and handed to the external vector store given to the constructor.
                """

                # Class-level defaults so these attributes exist before any ingestion.
                document_vector_store = None
                retriever = None
                chain = None

                def __init__(self, vectore_store=None):
                    """Create the model, embeddings, splitter and prompt.

                    Args:
                        vectore_store: Object exposing ``addDoc`` and ``getDocs``, used
                            for the shared documents. May be None, in which case
                            ``ingestToDb`` and ``getDbFiles`` cannot be used.
                    """
                    self.model = ChatMistralAI(model=llm_model)
                    self.embedding = MistralAIEmbeddings(model="mistral-embed", mistral_api_key=env_api_key)

                    self.text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100, length_function=len)
                    self.prompt = PromptTemplate.from_template(base_template)

                    self.vector_store = vectore_store

                def _build_chain(self):
                    """Return the prompt -> model -> string-parser pipeline for the current model."""
                    return self.prompt | self.model | StrOutputParser()

                def setModel(self, model):
                    """Replace the chat model.

                    If a chain has already been built it is rebuilt, so that later
                    calls to ``ask`` use the new model rather than the previous one.
                    """
                    self.model = model
                    if self.chain is not None:
                        self.chain = self._build_chain()

                def ingestToDb(self, file_path: str, filename: str):
                    """Chunk a PDF and add it to the external vector store.

                    Args:
                        file_path: Path of the PDF to read.
                        filename: Name under which the document is registered.

                    Returns:
                        Whatever the vector store's ``addDoc`` returns.
                    """
                    docs = PyPDFLoader(file_path=file_path).load()

                    # Concatenate the text of all pages, then split it into chunks.
                    text = "".join(page.page_content for page in docs)
                    chunks = self.text_splitter.split_text(text)

                    return self.vector_store.addDoc(filename=filename, text_chunks=chunks, embedding=self.embedding)

                def getDbFiles(self):
                    """Return the documents listed by the external vector store."""
                    return self.vector_store.getDocs()

                def ingest(self, pdf_file_path: str):
                    """Index a PDF in a new FAISS store and prepare the answer chain.

                    Each call builds a fresh index, so the index and retriever from any
                    earlier call are replaced.
                    """
                    docs = PyPDFLoader(file_path=pdf_file_path).load()

                    chunks = self.text_splitter.split_documents(docs)
                    chunks = filter_complex_metadata(chunks)

                    # Keep the index on the instance: ``clear`` resets this attribute,
                    # so it has to be the one actually holding the store.
                    self.document_vector_store = FAISS.from_documents(chunks, self.embedding)

                    self.retriever = self.document_vector_store.as_retriever(
                        search_type="similarity_score_threshold",
                        search_kwargs={
                            "k": 3,
                            "score_threshold": 0.5,
                        },
                    )

                    self.chain = self._build_chain()

                def ask(self, query: str, messages: list):
                    """Answer ``query`` using the ingested document and the chat history.

                    Args:
                        query: The user's question.
                        messages: Conversation history inserted into the prompt.

                    Returns:
                        The model's answer as a string, or a fixed hint when no
                        document has been ingested yet.
                    """
                    if not self.chain:
                        return "Ajouter un document PDF d'abord."

                    print("messages ", messages)

                    # Retrieve the context document
                    documentContext = self.retriever.invoke(query)

                    # The shared vector store is not queried here, so the template's
                    # {commonContext} slot receives None.
                    contextCommon = None

                    return self.chain.invoke({
                        "query": query,
                        "documentContext": documentContext,
                        "commonContext": contextCommon,
                        "messages": messages
                    })

                def clear(self):
                    """Drop the document index, the external store reference, retriever and chain."""
                    self.document_vector_store = None
                    # NOTE(review): this also discards the store passed to the
                    # constructor; confirm that is intended.
                    self.vector_store = None
                    self.retriever = None
                    self.chain = None
         
     | 
    	
        requirements.txt
    ADDED
    
    | 
         @@ -0,0 +1,19 @@ 
     | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
| 
         | 
|
| 1 | 
         
            +
            # UI
            streamlit==1.37.0
            streamlit_chat
            # Configuration and utilities
            python-dotenv
            python-multipart
            pydantic
            async-timeout
            typing-extensions
            # PDF parsing (pypdf is needed by PyPDFLoader)
            pymupdf
            pypdf
            # Vector stores (faiss-cpu is needed by the FAISS vector store)
            faiss-cpu
            pinecone-notebooks
            pinecone-client[grpc]
            # LangChain
            langchain
            langchain-openai
            langchain-community
            langchain-pinecone
            langchain_mistralai
         
     | 
    	
        vectore_store/ConnectorStrategy.py
    ADDED
    
    | 
         @@ -0,0 +1,14 @@ 
     | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
| 
         | 
|
| 1 | 
         
            +
            from abc import ABC, abstractmethod
         
     | 
| 2 | 
         
            +
             
     | 
| 3 | 
         
            +
            class ConnectorStrategy(ABC):
                """Abstract interface every vector-store connector has to implement."""

                @abstractmethod
                def getDocs(self):
                    """Return the documents known to the underlying store."""

                @abstractmethod
                def addDoc(self, filename, text_chunks, embedding):
                    """Store the chunks of one document using the given embedding."""

                @abstractmethod
                def retriever(self, query, embedding):
                    """Retrieve stored content relevant to ``query`` using the given embedding."""
         
     | 
    	
        vectore_store/PineconeConnector.py
    ADDED
    
    | 
         @@ -0,0 +1,110 @@ 
     | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
| 
         | 
|
| 1 | 
         
            +
            import os 
         
     | 
| 2 | 
         
            +
            from dotenv import load_dotenv
         
     | 
| 3 | 
         
            +
             
     | 
| 4 | 
         
            +
            from .ConnectorStrategy import ConnectorStrategy
         
     | 
| 5 | 
         
            +
             
     | 
| 6 | 
         
            +
            from pinecone import Pinecone, ServerlessSpec
         
     | 
| 7 | 
         
            +
            from langchain_openai import OpenAIEmbeddings
         
     | 
| 8 | 
         
            +
            from langchain_pinecone import PineconeVectorStore
         
     | 
| 9 | 
         
            +
            from langchain_core.documents import Document
         
     | 
| 10 | 
         
            +
             
     | 
| 11 | 
         
            +
            import unicodedata
         
     | 
| 12 | 
         
            +
            import time
         
     | 
| 13 | 
         
            +
             
     | 
| 14 | 
         
            +
            class PineconeConnector(ConnectorStrategy):
                """Connector strategy backed by a Pinecone index.

                The API key, index name and namespace are read from the environment
                variables ``PINECONE_API_KEY``, ``PINECONE_INDEX_NAME`` and
                ``PINECONE_NAMESPACE`` after loading a ``.env`` file.
                """

                def __init__(self):
                    """Connect to Pinecone, creating the configured index if it is missing."""
                    load_dotenv()

                    pinecone_api_key = os.environ.get("PINECONE_API_KEY")

                    self.index_name = os.environ.get("PINECONE_INDEX_NAME")
                    self.namespace = os.environ.get("PINECONE_NAMESPACE")

                    print(f"Index name: {self.index_name}")
                    print(f"Namespace: {self.namespace}")
                    # The API key is a secret and must never be echoed to stdout or logs.

                    pc = Pinecone(api_key=pinecone_api_key)

                    existing_indexes = [index_info["name"] for index_info in pc.list_indexes()]

                    if self.index_name not in existing_indexes:
                        pc.create_index(
                            name=self.index_name,
                            # NOTE(review): presumably sized for the embedding model
                            # passed to addDoc/retriever -- confirm against callers.
                            dimension=3072,
                            metric="cosine",
                            spec=ServerlessSpec(cloud="aws", region="us-east-1"),
                        )
                        # Poll until the freshly created index reports it is ready.
                        while not pc.describe_index(self.index_name).status["ready"]:
                            time.sleep(1)

                    self.index = pc.Index(self.index_name)

                def getDocs(self):
                    """Return the distinct document names found in the namespace.

                    ``addDoc`` writes vector IDs as ``<name>_<chunk index>``, so the
                    document name is everything before the last underscore of an ID.

                    Returns:
                        A list of names in first-seen order, without duplicates.
                    """
                    print("Fetching documents from Pinecone")

                    # Dict keys de-duplicate in O(1) per ID while keeping insertion order.
                    docs_names = {}
                    for ids in self.index.list(namespace=self.namespace):
                        for vector_id in ids:
                            docs_names[vector_id.rpartition("_")[0]] = None

                    return list(docs_names)

                def addDoc(self, filename, text_chunks, embedding):
                    """Embed the chunks of one document and store them in the index.

                    Args:
                        filename: Original file name; sanitized to build the vector IDs.
                        text_chunks: Iterable of text chunks, one vector per chunk.
                        embedding: Embedding object handed to ``PineconeVectorStore``.

                    Returns:
                        ``{"filename_id": <sanitized name>}`` on success, or ``False``
                        when there is nothing to add or any step raised.
                    """
                    try:
                        vector_store = PineconeVectorStore(index=self.index, embedding=embedding, namespace=self.namespace)

                        file_name = filename.split(".")[0].replace(" ", "_").replace("-", "_").replace(".", "_").replace("/", "_").replace("\\", "_").strip()

                        print(file_name)

                        # Computed once: it does not depend on the chunk, and it must
                        # exist for the return value even when the loop body never runs.
                        clean_filename = remove_non_standard_ascii(file_name)

                        documents = []
                        chunk_ids = []

                        for i, chunk in enumerate(text_chunks):
                            chunk_id = f"{clean_filename}_{i}"

                            print(f"Adding document with ID {chunk_id}")

                            documents.append(
                                Document(
                                    page_content=chunk,
                                    metadata={"filename": filename, "chunk_id": chunk_id},
                                )
                            )
                            chunk_ids.append(chunk_id)

                        if not documents:
                            # Report the empty input explicitly instead of failing later.
                            print(f"No text chunks to add for '{filename}'")
                            return False

                        vector_store.add_documents(documents=documents, ids=chunk_ids)

                        return {"filename_id": clean_filename}

                    except Exception as e:
                        # Best-effort boundary: callers get False rather than an exception.
                        print(e)
                        return False

                def retriever(self, query, embedding):
                    """Return the stored documents most similar to ``query``.

                    Args:
                        query: Text to search for.
                        embedding: Embedding object handed to ``PineconeVectorStore``.

                    Returns:
                        The result of invoking a similarity-score-threshold retriever
                        (``k=3``, ``score_threshold=0.6``) with ``query``.
                    """
                    print(f"Retrieving documents from Pinecone for query '{query}'")

                    vector_store = PineconeVectorStore(index=self.index, embedding=embedding, namespace=self.namespace)

                    retriever = vector_store.as_retriever(
                        search_type="similarity_score_threshold",
                        search_kwargs={"k": 3, "score_threshold": 0.6},
                    )

                    return retriever.invoke(query)
         
     | 
| 105 | 
         
            +
                
         
     | 
| 106 | 
         
            +
             
     | 
| 107 | 
         
            +
            def remove_non_standard_ascii(input_string: str) -> str:
                """Reduce ``input_string`` to ASCII letters, ASCII digits and `` .,!?``.

                The string is NFKD-normalized first, so an accented letter is split
                into its base letter plus combining marks; the base letter survives
                and the marks are dropped. Every other character is removed.

                Args:
                    input_string: Text to sanitize.

                Returns:
                    The sanitized string, containing ASCII characters only.
                """
                normalized_string = unicodedata.normalize('NFKD', input_string)
                # Explicit '0'..'9' range: str.isdigit() is also true for non-ASCII
                # digits (e.g. Arabic-Indic), which would leak into the output.
                return ''.join(
                    char
                    for char in normalized_string
                    if 'a' <= char <= 'z' or 'A' <= char <= 'Z' or '0' <= char <= '9' or char in ' .,!?'
                )
         
     | 
| 110 | 
         
            +
             
     | 
    	
        vectore_store/VectoreStoreManager.py
    ADDED
    
    | 
         @@ -0,0 +1,15 @@ 
     | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
| 
         | 
|
| 1 | 
         
            +
            from vectore_store import ConnectorStrategy
         
     | 
| 2 | 
         
            +
             
     | 
| 3 | 
         
            +
             
     | 
| 4 | 
         
            +
            class VectoreStoreManager:
                """Thin facade that forwards vector-store operations to a connector strategy."""

                # The annotation is quoted so it is never evaluated at definition time.
                def __init__(self, strategy: "ConnectorStrategy"):
                    """Keep the connector that every call is delegated to.

                    Args:
                        strategy: Object providing ``getDocs``, ``addDoc`` and ``retriever``.
                    """
                    self.strategy = strategy

                def getDocs(self):
                    """Return whatever the strategy's ``getDocs`` returns."""
                    return self.strategy.getDocs()

                def addDoc(self, filename, text_chunks, embedding):
                    """Add one document through the strategy and return its result.

                    The strategy's return value is passed back so callers can tell
                    whether the document was actually stored.
                    """
                    return self.strategy.addDoc(filename, text_chunks, embedding)

                def retriever(self, query, embedding):
                    """Return whatever the strategy's ``retriever`` returns for ``query``."""
                    return self.strategy.retriever(query, embedding)
         
     | 
    	
        vectore_store/__init__.py
    ADDED
    
    | 
         
            File without changes
         
     |