Spaces:

dl4ds
/

dl4ds_tutor

Build error

App Files Community

XThomasBU committed on Aug 19, 2024

Commit

c82efb6

1 Parent(s): e488f16

format changes

Browse files

Files changed (10) hide show

code/app.py +3 -3
code/main.py +0 -2
code/modules/chat/helpers.py +0 -2
code/modules/chat/langchain/langchain_rag.py +0 -1
code/modules/chat/langchain/utils.py +0 -3
code/modules/chat_processor/helpers.py +0 -2
code/modules/config/project_config.yml +1 -1
code/modules/dataloader/data_loader.py +11 -9
code/modules/retriever/helpers.py +0 -1
code/modules/vectorstore/store_manager.py +6 -7

code/app.py CHANGED Viewed

@@ -242,9 +242,9 @@ async def post_signin(request: Request):
     user_details.metadata["last_login"] = current_datetime
     # if new user, set the number of tries
     if "tokens_left" not in user_details.metadata:
-        user_details.metadata["tokens_left"] = (
-            TOKENS_LEFT  # set the number of tokens left for the new user
-        )
     if "all_time_tokens_allocated" not in user_details.metadata:
         user_details.metadata["all_time_tokens_allocated"] = ALL_TIME_TOKENS_ALLOCATED
     if "in_cooldown" not in user_details.metadata:

     user_details.metadata["last_login"] = current_datetime
     # if new user, set the number of tries
     if "tokens_left" not in user_details.metadata:
+        user_details.metadata[
+            "tokens_left"
+        ] = TOKENS_LEFT  # set the number of tokens left for the new user
     if "all_time_tokens_allocated" not in user_details.metadata:
         user_details.metadata["all_time_tokens_allocated"] = ALL_TIME_TOKENS_ALLOCATED
     if "in_cooldown" not in user_details.metadata:

code/main.py CHANGED Viewed

@@ -505,7 +505,6 @@ class Chatbot:
             token_count += token_count_cb.total_tokens
             for question in list_of_questions:
                 actions.append(
                     cl.Action(
                         name="follow up question",
@@ -549,7 +548,6 @@ class Chatbot:
     @cl.header_auth_callback
     def header_auth_callback(headers: dict) -> Optional[cl.User]:
         print("\n\n\nI am here\n\n\n")
         # try: # TODO: Add try-except block after testing
         # TODO: Implement to get the user information from the headers (not the cookie)

             token_count += token_count_cb.total_tokens
             for question in list_of_questions:
                 actions.append(
                     cl.Action(
                         name="follow up question",
     @cl.header_auth_callback
     def header_auth_callback(headers: dict) -> Optional[cl.User]:
         print("\n\n\nI am here\n\n\n")
         # try: # TODO: Add try-except block after testing
         # TODO: Implement to get the user information from the headers (not the cookie)

code/modules/chat/helpers.py CHANGED Viewed

@@ -42,7 +42,6 @@ def get_sources(res, answer, stream=True, view_sources=False):
         full_answer += answer
     if view_sources:
         # Then, display the sources
         # check if the answer has sources
         if len(source_dict) == 0:
@@ -51,7 +50,6 @@ def get_sources(res, answer, stream=True, view_sources=False):
         else:
             full_answer += "\n\n**Sources:**\n"
             for idx, (url_name, source_data) in enumerate(source_dict.items()):
                 full_answer += f"\nSource {idx + 1} (Score: {source_data['score']}): {source_data['url']}\n"
                 name = f"Source {idx + 1} Text\n"

         full_answer += answer
     if view_sources:
         # Then, display the sources
         # check if the answer has sources
         if len(source_dict) == 0:
         else:
             full_answer += "\n\n**Sources:**\n"
             for idx, (url_name, source_data) in enumerate(source_dict.items()):
                 full_answer += f"\nSource {idx + 1} (Score: {source_data['score']}): {source_data['url']}\n"
                 name = f"Source {idx + 1} Text\n"

code/modules/chat/langchain/langchain_rag.py CHANGED Viewed

@@ -19,7 +19,6 @@ from .utils import (
 class Langchain_RAG_V1(BaseRAG):
     def __init__(
         self,
         llm,

 class Langchain_RAG_V1(BaseRAG):
     def __init__(
         self,
         llm,

code/modules/chat/langchain/utils.py CHANGED Viewed

@@ -26,7 +26,6 @@ CHAT_TURN_TYPE = Union[Tuple[str, str], BaseMessage]
 class CustomConversationalRetrievalChain(ConversationalRetrievalChain):
     def _get_chat_history(self, chat_history: List[CHAT_TURN_TYPE]) -> str:
         _ROLE_MAP = {"human": "Student: ", "ai": "AI Tutor: "}
         buffer = ""
@@ -139,7 +138,6 @@ class CustomConversationalRetrievalChain(ConversationalRetrievalChain):
 class CustomRunnableWithHistory(RunnableWithMessageHistory):
     def _get_chat_history(self, chat_history: List[CHAT_TURN_TYPE]) -> str:
         _ROLE_MAP = {"human": "Student: ", "ai": "AI Tutor: "}
         buffer = ""
@@ -282,7 +280,6 @@ def create_retrieval_chain(
 # TODO: Remove Hard-coded values
 async def return_questions(query, response, chat_history_str, context, config):
     system = (
         "You are someone that suggests a question based on the student's input and chat history. "
         "Generate a question that is relevant to the student's input and chat history. "

 class CustomConversationalRetrievalChain(ConversationalRetrievalChain):
     def _get_chat_history(self, chat_history: List[CHAT_TURN_TYPE]) -> str:
         _ROLE_MAP = {"human": "Student: ", "ai": "AI Tutor: "}
         buffer = ""
 class CustomRunnableWithHistory(RunnableWithMessageHistory):
     def _get_chat_history(self, chat_history: List[CHAT_TURN_TYPE]) -> str:
         _ROLE_MAP = {"human": "Student: ", "ai": "AI Tutor: "}
         buffer = ""
 # TODO: Remove Hard-coded values
 async def return_questions(query, response, chat_history_str, context, config):
     system = (
         "You are someone that suggests a question based on the student's input and chat history. "
         "Generate a question that is relevant to the student's input and chat history. "

code/modules/chat_processor/helpers.py CHANGED Viewed

@@ -156,7 +156,6 @@ async def update_user_info(user_info):
 async def check_user_cooldown(user_info, current_time):
     # # Check if no tokens left
     tokens_left = user_info.metadata.get("tokens_left", 0)
     if tokens_left > 0 and not user_info.metadata.get("in_cooldown", False):
@@ -214,7 +213,6 @@ async def reset_tokens_for_user(user_info):
     # Calculate how many tokens should have been regenerated proportionally
     if current_tokens < max_tokens:
         # Calculate the regeneration rate per second based on REGEN_TIME for full regeneration
         regeneration_rate_per_second = max_tokens / REGEN_TIME

 async def check_user_cooldown(user_info, current_time):
     # # Check if no tokens left
     tokens_left = user_info.metadata.get("tokens_left", 0)
     if tokens_left > 0 and not user_info.metadata.get("in_cooldown", False):
     # Calculate how many tokens should have been regenerated proportionally
     if current_tokens < max_tokens:
         # Calculate the regeneration rate per second based on REGEN_TIME for full regeneration
         regeneration_rate_per_second = max_tokens / REGEN_TIME

code/modules/config/project_config.yml CHANGED Viewed

@@ -3,5 +3,5 @@ retriever:
     RAGatouille: "XThomasBU/Colbert_Index"
 metadata:
-  metada_links: ["https://dl4ds.github.io/sp2024/lectures/", "https://dl4ds.github.io/sp2024/schedule/"]
   slide_base_link: "https://dl4ds.github.io"

     RAGatouille: "XThomasBU/Colbert_Index"
 metadata:
+  metadata_links: ["https://dl4ds.github.io/sp2024/lectures/", "https://dl4ds.github.io/sp2024/schedule/"]
   slide_base_link: "https://dl4ds.github.io"

code/modules/dataloader/data_loader.py CHANGED Viewed

@@ -222,7 +222,7 @@ class ChunkProcessor:
     def chunk_docs(self, file_reader, uploaded_files, weblinks):
         addl_metadata = get_metadata(
-            *self.config["metadata"]["metada_links"], self.config
         )  # For any additional metadata
         # remove already processed files if reparse_files is False
@@ -324,7 +324,6 @@ class ChunkProcessor:
             return
         try:
             if file_path in self.document_data:
                 self.logger.warning(f"File {file_name} already processed")
                 documents = [
@@ -440,13 +439,16 @@ if __name__ == "__main__":
     data_loader = DataLoader(config, logger=logger)
     # Just for testing
-    document_chunks, document_names, documents, document_metadata = (
-        data_loader.get_chunks(
-            [
-                "https://dl4ds.github.io/fa2024/static_files/discussion_slides/00_discussion.pdf"
-            ],
-            [],
-        )
     )
     print(document_names[:5])

     def chunk_docs(self, file_reader, uploaded_files, weblinks):
         addl_metadata = get_metadata(
+            *self.config["metadata"]["metadata_links"], self.config
         )  # For any additional metadata
         # remove already processed files if reparse_files is False
             return
         try:
             if file_path in self.document_data:
                 self.logger.warning(f"File {file_name} already processed")
                 documents = [
     data_loader = DataLoader(config, logger=logger)
     # Just for testing
+    (
+        document_chunks,
+        document_names,
+        documents,
+        document_metadata,
+    ) = data_loader.get_chunks(
+        [
+            "https://dl4ds.github.io/fa2024/static_files/discussion_slides/00_discussion.pdf"
+        ],
+        [],
     )
     print(document_names[:5])

code/modules/retriever/helpers.py CHANGED Viewed

@@ -6,7 +6,6 @@ from typing import List
 class VectorStoreRetrieverScore(VectorStoreRetriever):
     # See https://github.com/langchain-ai/langchain/blob/61dd92f8215daef3d9cf1734b0d1f8c70c1571c3/libs/langchain/langchain/vectorstores/base.py#L500
     def _get_relevant_documents(
         self, query: str, *, run_manager: CallbackManagerForRetrieverRun

 class VectorStoreRetrieverScore(VectorStoreRetriever):
     # See https://github.com/langchain-ai/langchain/blob/61dd92f8215daef3d9cf1734b0d1f8c70c1571c3/libs/langchain/langchain/vectorstores/base.py#L500
     def _get_relevant_documents(
         self, query: str, *, run_manager: CallbackManagerForRetrieverRun

code/modules/vectorstore/store_manager.py CHANGED Viewed

@@ -47,7 +47,6 @@ class VectorStoreManager:
         return logger
     def load_files(self):
         files = os.listdir(self.config["vectorstore"]["data_path"])
         files = [
             os.path.join(self.config["vectorstore"]["data_path"], file)
@@ -69,7 +68,6 @@ class VectorStoreManager:
         return files, urls
     def create_embedding_model(self):
         self.logger.info("Creating embedding function")
         embedding_model_loader = EmbeddingModelLoader(self.config)
         embedding_model = embedding_model_loader.load_embedding_model()
@@ -100,7 +98,6 @@ class VectorStoreManager:
         )
     def create_database(self):
         start_time = time.time()  # Start time for creating database
         data_loader = DataLoader(self.config, self.logger)
         self.logger.info("Loading data")
@@ -110,9 +107,12 @@ class VectorStoreManager:
         self.logger.info(f"Number of webpages: {len(webpages)}")
         if f"{self.config['vectorstore']['url_file_path']}" in files:
             files.remove(f"{self.config['vectorstores']['url_file_path']}")  # cleanup
-        document_chunks, document_names, documents, document_metadata = (
-            data_loader.get_chunks(files, webpages)
-        )
         num_documents = len(document_chunks)
         self.logger.info(f"Number of documents in the DB: {num_documents}")
         metadata_keys = list(document_metadata[0].keys()) if document_metadata else []
@@ -128,7 +128,6 @@ class VectorStoreManager:
         )
     def load_database(self):
         start_time = time.time()  # Start time for loading database
         if self.config["vectorstore"]["db_option"] in ["FAISS", "Chroma", "RAPTOR"]:
             self.embedding_model = self.create_embedding_model()

         return logger
     def load_files(self):
         files = os.listdir(self.config["vectorstore"]["data_path"])
         files = [
             os.path.join(self.config["vectorstore"]["data_path"], file)
         return files, urls
     def create_embedding_model(self):
         self.logger.info("Creating embedding function")
         embedding_model_loader = EmbeddingModelLoader(self.config)
         embedding_model = embedding_model_loader.load_embedding_model()
         )
     def create_database(self):
         start_time = time.time()  # Start time for creating database
         data_loader = DataLoader(self.config, self.logger)
         self.logger.info("Loading data")
         self.logger.info(f"Number of webpages: {len(webpages)}")
         if f"{self.config['vectorstore']['url_file_path']}" in files:
             files.remove(f"{self.config['vectorstores']['url_file_path']}")  # cleanup
+        (
+            document_chunks,
+            document_names,
+            documents,
+            document_metadata,
+        ) = data_loader.get_chunks(files, webpages)
         num_documents = len(document_chunks)
         self.logger.info(f"Number of documents in the DB: {num_documents}")
         metadata_keys = list(document_metadata[0].keys()) if document_metadata else []
         )
     def load_database(self):
         start_time = time.time()  # Start time for loading database
         if self.config["vectorstore"]["db_option"] in ["FAISS", "Chroma", "RAPTOR"]:
             self.embedding_model = self.create_embedding_model()