fahmiaziz98 committed on
Commit
ba900f0
·
1 Parent(s): 8bb0a69

commit 1

app.py CHANGED
@@ -1,4 +1,58 @@
+ import os
  import streamlit as st
+ from src.indexing.document_processor import DocumentProcessor
+ from src.indexing.vectore_store import VectorStoreManager
+ from src.retriever.retriever import RetrieverManager
  
- x = st.slider('Select a value')
- st.write(x, 'squared is', x * x)
+ UPLOAD_FOLDER = "uploads/"
+ PERSIST_DIRECTORY = "chroma_db/"
+ os.makedirs(UPLOAD_FOLDER, exist_ok=True)
+ os.makedirs(PERSIST_DIRECTORY, exist_ok=True)
+
+ if "messages" not in st.session_state:
+     st.session_state.messages = []
+ if "retriever" not in st.session_state:
+     st.session_state.retriever = None
+ if "vector_store" not in st.session_state:
+     st.session_state.vector_store = None
+
+ st.set_page_config(
+     page_title="RAG Chatbot",
+     layout="wide",
+     page_icon="📘",
+ )
+ st.title("Agentic RAG Chatbot")
+
+
+ with st.sidebar:
+     st.header("PDF Upload")
+     uploaded_file = st.file_uploader("Upload your PDF", type=["pdf"])
+     st.info("Supported file type: PDF")
+
+     if uploaded_file:
+         with st.spinner("Processing PDF..."):
+
+             file_path = os.path.join(UPLOAD_FOLDER, uploaded_file.name)
+             with open(file_path, "wb") as f:
+                 f.write(uploaded_file.getbuffer())
+
+
+             doc_processor = DocumentProcessor()
+             chunks = doc_processor.load_and_split_pdf(file_path)
+
+             # Create the vector store
+             vector_store_manager = VectorStoreManager()
+             vector_store = vector_store_manager.index_documents(
+                 documents=chunks,
+                 collection_name=uploaded_file.name,
+                 persist_directory=PERSIST_DIRECTORY
+             )
+             st.session_state.vector_store = vector_store
+
+             # Set up the retriever
+             retriever_manager = RetrieverManager(vector_store)
+             base_retriever = retriever_manager.create_base_retriever()
+             compression_retriever = retriever_manager.create_compression_retriever(base_retriever)
+             st.session_state.retriever = compression_retriever
+
+         st.success("File processed successfully!")
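This commit only initializes st.session_state.messages and st.session_state.retriever in app.py; no chat UI consumes them yet. Below is a minimal sketch, not part of this commit, of how a follow-up could render the conversation with the stored retriever. The prompt text is illustrative, and the answer step simply echoes retrieved chunks; a real implementation would pass them to an LLM chain (e.g. via langchain-google-genai from requirements.txt).

# Hypothetical chat loop built on the session state initialized above (not part of this commit).
for message in st.session_state.messages:
    with st.chat_message(message["role"]):
        st.markdown(message["content"])

if prompt := st.chat_input("Ask a question about the uploaded PDF"):
    st.session_state.messages.append({"role": "user", "content": prompt})
    with st.chat_message("user"):
        st.markdown(prompt)

    if st.session_state.retriever is not None:
        # Retrieve reranked chunks; shown raw here as a placeholder answer.
        docs = st.session_state.retriever.invoke(prompt)
        answer = "\n\n".join(doc.page_content for doc in docs)
    else:
        answer = "Please upload a PDF first."

    with st.chat_message("assistant"):
        st.markdown(answer)
    st.session_state.messages.append({"role": "assistant", "content": answer})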
requirements.txt CHANGED
@@ -0,0 +1,11 @@
+ langchain
+ langgraph
+ langchain-huggingface
+ langchain-google-genai
+ google-ai-generativelanguage==0.6.15
+ langchain-community
+ langchain-chroma
+ pypdf
+ tiktoken
+ rank_bm25
+ flashrank
src/indexing/__init__.py ADDED
File without changes
src/indexing/document_processor.py ADDED
@@ -0,0 +1,16 @@
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+ from langchain.document_loaders import PyPDFLoader
+
+ class DocumentProcessor:
+     def __init__(self, chunk_size=500, chunk_overlap=100):
+         self.text_splitter = RecursiveCharacterTextSplitter(
+             chunk_size=chunk_size,
+             chunk_overlap=chunk_overlap
+         )
+
+     def load_and_split_pdf(self, file_path: str):
+         """Load PDF and split into chunks"""
+         loader = PyPDFLoader(file_path)
+         docs = loader.load()
+         chunks = self.text_splitter.split_documents(docs)
+         return chunks
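A short usage sketch for DocumentProcessor, assuming a hypothetical local file uploads/sample.pdf. One caveat: on recent langchain releases PyPDFLoader is imported from langchain_community.document_loaders, so the import in the file above may need adjusting depending on which version resolves from the unpinned requirements.

from src.indexing.document_processor import DocumentProcessor

# Hypothetical example file; any local PDF path works.
processor = DocumentProcessor(chunk_size=500, chunk_overlap=100)
chunks = processor.load_and_split_pdf("uploads/sample.pdf")
print(f"{len(chunks)} chunks; first chunk starts with: {chunks[0].page_content[:100]!r}")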
src/indexing/vectore_store.py ADDED
@@ -0,0 +1,23 @@
+ from langchain_huggingface import HuggingFaceEmbeddings
+ from langchain_chroma import Chroma
+
+ class VectorStoreManager:
+     def __init__(self, embedding_model="intfloat/multilingual-e5-small"):
+         self.embeddings = HuggingFaceEmbeddings(model_name=embedding_model)
+
+     def create_vector_store(self, collection_name="my_collection", persist_directory=None):
+         """Create a new vector store"""
+         store_params = {
+             "collection_name": collection_name,
+             "embedding_function": self.embeddings,
+         }
+         if persist_directory:
+             store_params["persist_directory"] = persist_directory
+
+         return Chroma(**store_params)
+
+     def index_documents(self, documents, collection_name="my_collection", persist_directory=None):
+         """Index documents into vector store"""
+         vector_store = self.create_vector_store(collection_name, persist_directory)
+         vector_store.add_documents(documents=documents)
+         return vector_store
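A usage sketch for VectorStoreManager; the PDF path, collection name, and query string are illustrative, not taken from this commit.

from src.indexing.document_processor import DocumentProcessor
from src.indexing.vectore_store import VectorStoreManager

# Hypothetical example: chunk a local PDF, then index it into Chroma.
chunks = DocumentProcessor().load_and_split_pdf("uploads/sample.pdf")
store = VectorStoreManager().index_documents(
    documents=chunks,
    collection_name="sample.pdf",
    persist_directory="chroma_db/",
)

# Sanity check: plain similarity search over the indexed chunks.
for doc in store.similarity_search("What is this document about?", k=3):
    print(doc.page_content[:100])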
src/retriever/__init__.py ADDED
File without changes
src/retriever/retriever.py ADDED
@@ -0,0 +1,34 @@
+ from langchain.retrievers import BM25Retriever, EnsembleRetriever
+ from langchain.retrievers import ContextualCompressionRetriever
+ from langchain.retrievers.document_compressors import FlashrankRerank
+
+
+ class RetrieverManager:
+     def __init__(self, vector_store):
+         self.vector_store = vector_store
+
+     def create_base_retriever(self, search_type="similarity", k=3):
+         """Create basic vector store retriever"""
+         return self.vector_store.as_retriever(
+             search_type=search_type,
+             search_kwargs={"k": k}
+         )
+
+     def create_ensemble_retriever(self, texts, vector_weight=0.5, keyword_weight=0.5):
+         """Create ensemble retriever combining vector and keyword search"""
+         vector_retriever = self.create_base_retriever()
+         keyword_retriever = BM25Retriever.from_documents(texts)
+         keyword_retriever.k = 3
+
+         return EnsembleRetriever(
+             retrievers=[vector_retriever, keyword_retriever],
+             weights=[vector_weight, keyword_weight]
+         )
+
+     def create_compression_retriever(self, base_retriever, top_n=5):
+         """Create compression retriever with reranking"""
+         compressor = FlashrankRerank(top_n=top_n)
+         return ContextualCompressionRetriever(
+             base_compressor=compressor,
+             base_retriever=base_retriever
+         )
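A sketch wiring the pieces together the same way app.py does in the sidebar, assuming the hypothetical PDF from the earlier examples. app.py does not exercise create_ensemble_retriever yet; the compression retriever below reranks the base retriever's hits with FlashrankRerank (flashrank in requirements.txt backs the reranker, rank_bm25 backs BM25Retriever).

from src.indexing.document_processor import DocumentProcessor
from src.indexing.vectore_store import VectorStoreManager
from src.retriever.retriever import RetrieverManager

# Hypothetical end-to-end wiring, mirroring the sidebar flow in app.py.
chunks = DocumentProcessor().load_and_split_pdf("uploads/sample.pdf")
store = VectorStoreManager().index_documents(chunks, collection_name="sample.pdf")

manager = RetrieverManager(store)
base = manager.create_base_retriever(k=3)
reranked = manager.create_compression_retriever(base, top_n=5)

for doc in reranked.invoke("What is this document about?"):
    print(doc.metadata.get("page"), doc.page_content[:80])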