import os
import pickle
import re
from typing import Any, Callable, List, Optional, Tuple, Union

import faiss
from pydantic import BaseModel, Field

import langchain
from langchain import HuggingFaceHub, PromptTemplate
from langchain.agents import (
    AgentExecutor,
    AgentOutputParser,
    AgentType,
    BaseMultiActionAgent,
    LLMSingleActionAgent,
    Tool,
    initialize_agent,
)
from langchain.cache import InMemoryCache
from langchain.chains import ConversationalRetrievalChain, LLMChain, create_tagging_chain
from langchain.chat_models import ChatGooglePalm, ChatOpenAI
from langchain.document_loaders import (
    DirectoryLoader,
    JSONLoader,
    TextLoader,
    UnstructuredHTMLLoader,
)
from langchain.embeddings import HuggingFaceHubEmbeddings, OpenAIEmbeddings
from langchain.llms import OpenAI
from langchain.memory import ConversationBufferWindowMemory
from langchain.output_parsers import PydanticOutputParser
from langchain.prompts import StringPromptTemplate
from langchain.prompts.chat import (
    ChatPromptTemplate,
    HumanMessagePromptTemplate,
    SystemMessagePromptTemplate,
)
from langchain.schema import AgentAction, AgentFinish
from langchain.text_splitter import CharacterTextSplitter
from langchain.tools import StructuredTool
from langchain.tools.json.tool import JsonSpec

from custom_faiss import MyFAISS

class ToolArgsSchema(BaseModel):
    """Arguments the grading tools expect, extracted from a user question."""

    student_name: Optional[str] = Field(description="The name of the student")
    question: str = Field(description="The question being asked")
    question_type: str = Field(description="The type of question being asked")
    interest: Optional[str] = Field(description="The interest of the student")

    class Config:
        schema_extra = {
            "required": ["question", "question_type"]
        }

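# A parsed instance might look like this (the student name and question are
# hypothetical, for illustration only):
#   ToolArgsSchema(student_name="Alice",
#                  question="What total score did Alice receive?",
#                  question_type="grade-based", interest=None)
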
langchain.llm_cache = InMemoryCache()

model_name = "GPT-4"
pickle_file = "_vs.pkl"
index_file = "_vs.index"
models_folder = "models/"
os.environ["LANGCHAIN_TRACING"] = "true"
discussions_file_path = "discussion_entries.json"

# "gpt-3.5-turbo-16k" is a chat model; the OpenAI wrapper redirects chat-model
# names to the chat endpoint with a warning, but ChatOpenAI is the idiomatic class.
llm = OpenAI(model_name="gpt-3.5-turbo-16k", temperature=0, verbose=True)
embeddings = OpenAIEmbeddings(model='text-embedding-ada-002')

chat_history = []
memory = ConversationBufferWindowMemory(memory_key="chat_history", k=10)
vectorstore_index = None

agent_prompt = """
I am the LLM AI canvas discussion grading assistant.
I can answer two types of questions: grade-based questions and interest-based questions.
Grade-based questions are about the grades of a certain student or a group of students, based on the rubric below, for the canvas discussion on the topic "8 nouns". ALWAYS return the total score when it is a grade-based question.
Interest-based questions are about the interests or skills of a certain student or a group of students, based on their discussion posts.
You have access to the following tools:
{tools}
Use the following format:
Question: the input question you must answer
Thought: you should always think about what type of question it is
Action: the action to take, should be one of [{tool_names}]
Action Input: the input to the action
Observation: the result of the action
... (this Thought/Action/Action Input/Observation can repeat N times)
Thought: I now know the final answer
Final Answer: the final answer to the original input question
Begin!
Question: {input}
{agent_scratchpad}
"""

# Set up a prompt template that renders the agent scratchpad and tool list.
class CustomPromptTemplate(StringPromptTemplate):
    # The template to use
    template: str
    # Callable that returns the tools available for a given input
    tools_getter: Callable

    def format(self, **kwargs) -> str:
        # Get the intermediate steps (AgentAction, Observation tuples)
        # and format them into the running agent scratchpad
        intermediate_steps = kwargs.pop("intermediate_steps")
        thoughts = ""
        for action, observation in intermediate_steps:
            thoughts += action.log
            thoughts += f"\nObservation: {observation}\nThought: "
        # Set the agent_scratchpad variable to that value
        kwargs["agent_scratchpad"] = thoughts
        tools = self.tools_getter(kwargs["input"])
        # Create a tools variable from the list of tools provided
        kwargs["tools"] = "\n".join(
            [f"{tool.name}: {tool.description}" for tool in tools]
        )
        # Create a list of tool names for the tools provided
        kwargs["tool_names"] = ", ".join([tool.name for tool in tools])
        return self.template.format(**kwargs)

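# Hypothetical wiring of the template above, assuming a `tools` list is in
# scope (this tools_getter ignores the input and always returns the same list):
#   prompt = CustomPromptTemplate(template=agent_prompt,
#                                 tools_getter=lambda q: tools,
#                                 input_variables=["input", "intermediate_steps"])
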
class CustomOutputParser(AgentOutputParser):
    def parse(self, llm_output: str) -> Union[AgentAction, AgentFinish]:
        print("llm_output")
        print(llm_output)
        # Check if agent should finish
        if "Final Answer:" in llm_output:
            return AgentFinish(
                # Return values is generally always a dictionary with a single `output` key
                # It is not recommended to try anything else at the moment :)
                return_values={"output": llm_output.split("Final Answer:")[-1].strip()},
                log=llm_output,
            )
        # Parse out the action and action input
        regex = r"Action\s*\d*\s*:(.*?)\nAction\s*\d*\s*Input\s*\d*\s*:[\s]*(.*)"
        match = re.search(regex, llm_output, re.DOTALL)
        if not match:
            raise ValueError(f"Could not parse LLM output: `{llm_output}`")
        action = match.group(1).strip()
        action_input = match.group(2)
        # Return the action and action input
        return AgentAction(tool=action, tool_input=action_input.strip(" ").strip('"'), log=llm_output)

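# Examples of what the parser accepts (the model outputs are hypothetical):
#   "Thought: this is a grade-based question\nAction: Grade\nAction Input: Alice"
#     -> AgentAction(tool="Grade", tool_input="Alice", log=<full output>)
#   "Final Answer: Total: 6 points"
#     -> AgentFinish(return_values={"output": "Total: 6 points"}, log=<full output>)
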
system_template = """
I am the LLM AI canvas discussion grading assistant.
I can answer two types of questions: grade-based questions and interest-based questions.
Grade-based questions are about the grades of a certain student or a group of students, based on the rubric below, for the canvas discussion on the topic "8 nouns".
Interest-based questions are about the interests or skills of a certain student or a group of students, based on their discussion posts.
To grade student discussions, I will follow the rubric below.
Student Post
3 points: Post includes 8 nouns and text describing how these nouns relate to the student.
2 points: Student's post includes 8 nouns but does not offer how those nouns relate to the student.
1 point: Student's post has significant missing details.
0 points: The student does not provide an initial post, or otherwise does not follow assignment instructions.
Response to Others
3 points: Student responds to at least 3 other student discussion threads AND responds to questions asked of them. Student posts insightful comments that prompt on-topic discussion. These posts also avoid throwaway comments such as "I agree", "Me too", "Good idea".
2 points: Student was notably lacking in one criterion.
1 point: Student was notably lacking in two criteria.
0 points: The student does not interact in the threads of other students.
I will be able to identify each student by name, and I will be able to share their likings, interests, and other characteristics. I will also be able to filter out students based on their interests.
I will not deviate from the grading scheme. I will grade each discussion entry and reply carefully, and I will share the grades of all individuals by name on the basis of the rubric. I will ALWAYS return the total score when it is a grade-based question.
The discussions and their replies are in the following format:
Student Post: Student Name
Reply to: Another Student Discussion ID
Your answer to grade-based questions should be in the following format:
Student Post: X points
Response to Others: X points
Total: X points
The following are the relevant discussions for grading or answering interest-based questions.
----------------
Discussions:
{context}"""

messages = [
    SystemMessagePromptTemplate.from_template(system_template),
    HumanMessagePromptTemplate.from_template("{question}"),
]
CHAT_PROMPT = ChatPromptTemplate.from_messages(messages)

def set_model_and_embeddings():
    global chat_history
    # set_model(model)
    # set_embeddings(model)
    chat_history = []

def set_embeddings(model):
    global embeddings
    if model == "GPT-3.5" or model == "GPT-4":
        print("Loading OpenAI embeddings")
        embeddings = OpenAIEmbeddings(model='text-embedding-ada-002')
    elif model == "Flan UL2" or model == "Flan T5":
        print("Loading Hugging Face embeddings")
        embeddings = HuggingFaceHubEmbeddings(repo_id="sentence-transformers/all-MiniLM-L6-v2")

def get_search_index():
    global vectorstore_index, model_name
    pickle_path = get_file_path(model_name, pickle_file)
    index_path = get_file_path(model_name, index_file)
    if os.path.isfile(pickle_path) and os.path.isfile(index_path) and os.path.getsize(pickle_path) > 0:
        # Load index from pickle file
        with open(pickle_path, "rb") as f:
            # search_index = Chroma(persist_directory=models_folder, embedding_function=embeddings)
            search_index = pickle.load(f)
        print("Loaded index")
    else:
        search_index = create_index(model_name)
        print("Created index")
    vectorstore_index = search_index
    return search_index

def create_index(model):
    source_chunks = create_chunk_documents()
    search_index = search_index_from_docs(source_chunks)
    # search_index.persist()
    faiss.write_index(search_index.index, get_file_path(model, index_file))
    # Save index to pickle file
    with open(get_file_path(model, pickle_file), "wb") as f:
        pickle.dump(search_index, f)
    return search_index

def get_file_path(model, file):
    # OpenAI-based models share one on-disk index; Hugging Face models share another.
    if model == "GPT-3.5" or model == "GPT-4":
        return models_folder + "openai" + file
    else:
        return models_folder + "hf" + file

def search_index_from_docs(source_chunks):
    # print("source chunks: " + str(len(source_chunks)))
    # print("embeddings: " + str(embeddings))
    search_index = MyFAISS.from_documents(source_chunks, embeddings)
    return search_index

def get_html_files():
    loader = DirectoryLoader('docs', glob="**/*.html", loader_cls=UnstructuredHTMLLoader, recursive=True)
    document_list = loader.load()
    for document in document_list:
        document.metadata["name"] = document.metadata["source"].split("/")[-1].split(".")[0]
    return document_list

def metadata_func(record: dict, metadata: dict) -> dict:
    metadata["name"] = record.get("name")
    return metadata

def get_json_file():
    global discussions_file_path
    loader = JSONLoader(
        file_path=discussions_file_path,
        jq_schema='.[]', metadata_func=metadata_func, content_key="message")
    return loader.load()

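# Hypothetical shape of one record in discussion_entries.json, implied by the
# jq_schema '.[]', content_key "message", and metadata_func above:
#   [{"name": "Alice", "message": "My 8 nouns are ..."}, ...]
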
def fetch_data_for_embeddings():
    # document_list = get_text_files()
    document_list = get_html_files()
    # document_list = get_json_file()
    print("document list: " + str(len(document_list)))
    return document_list

def get_text_files():
    loader = DirectoryLoader('docs', glob="**/*.txt", loader_cls=TextLoader, recursive=True)
    document_list = loader.load()
    return document_list

def create_chunk_documents():
    sources = fetch_data_for_embeddings()
    splitter = CharacterTextSplitter(separator=" ", chunk_size=800, chunk_overlap=0)
    source_chunks = splitter.split_documents(sources)
    print("chunks: " + str(len(source_chunks)))
    # Return the split chunks, not the unsplit source documents.
    return source_chunks

def get_qa_chain(vectorstore_index, question, metadata):
    global llm, model_name
    print(llm)
    filter_dict = {"name": metadata.student_name}
    # embeddings_filter = EmbeddingsFilter(embeddings=embeddings, similarity_threshold=0.76)
    # compression_retriever = ContextualCompressionRetriever(base_compressor=embeddings_filter, base_retriever=gpt_3_5_index.as_retriever())
    retriever = get_retriever(filter_dict, vectorstore_index, metadata)
    print(retriever.get_relevant_documents(question))
    chain = ConversationalRetrievalChain.from_llm(llm, retriever, return_source_documents=True,
                                                  verbose=True, get_chat_history=get_chat_history,
                                                  combine_docs_chain_kwargs={"prompt": CHAT_PROMPT})
    return chain

def get_retriever(filter_dict, vectorstore_index, metadata):
    # Grade-based questions are filtered to the named student's documents;
    # other questions search the whole index.
    if metadata.question_type == "grade-based":
        retriever = vectorstore_index.as_retriever(search_type='mmr',
                                                   search_kwargs={'lambda_mult': 1, 'fetch_k': 20, 'k': 10,
                                                                  'filter': filter_dict})
    else:
        retriever = vectorstore_index.as_retriever(search_type='mmr',
                                                   search_kwargs={'lambda_mult': 1, 'fetch_k': 20, 'k': 10})
    return retriever

def get_chat_history(inputs) -> str:
    res = []
    for human, ai in inputs:
        res.append(f"Human:{human}\nAI:{ai}")
    return "\n".join(res)

def generate_answer(question, metadata: ToolArgsSchema) -> str:
    # print("filter: " + filter)
    global chat_history, vectorstore_index
    chain = get_qa_chain(vectorstore_index, question, metadata)
    result = chain(
        {"question": question, "chat_history": chat_history})
    chat_history.extend([(question, result["answer"])])
    sources = []
    print(result)
    for document in result['source_documents']:
        source = document.metadata['source']
        sources.append(source.split('/')[-1].split('.')[0])
    print(sources)
    source = ',\n'.join(set(sources))
    # return result['answer'] + '\nSOURCES: ' + source
    return result['answer']

def get_question_type(question):
    parser = PydanticOutputParser(pydantic_object=ToolArgsSchema)
    prompt_template = """I can answer two types of questions: grade-based questions and interest-based questions.
Grade-based questions are about the grades of a certain student or a group of students, based on the rubric, for the canvas discussion on the topic "8 nouns".
Interest-based questions are about the interests or skills of a certain student or a group of students, based on their discussion posts.
Question: {question}
Find the following information about the question asked. Leave optional fields empty if the information is not available:
Format instructions: {format_instructions}"""
    llm = OpenAI(temperature=0)
    prompt = PromptTemplate(template=prompt_template, input_variables=["question"], output_parser=parser,
                            partial_variables={"format_instructions": parser.get_format_instructions()})
    llm_chain = LLMChain(
        llm=llm,
        prompt=prompt,
    )
    output = llm_chain.run(question)
    output = parser.parse(output)
    output = generate_answer(question, output)
    return output

# class FakeAgent(BaseMultiActionAgent):
#     """Fake Custom Agent."""
#
#     @property
#     def input_keys(self):
#         return ["input"]
#
#     def plan(
#         self, intermediate_steps: List[Tuple[AgentAction, str]], **kwargs: Any
#     ) -> Union[List[AgentAction], AgentFinish]:
#         print("input keys")
#         print(self.input_keys)
#         print("intermediate steps")
#         print(intermediate_steps)
#         print("kwargs")
#         print(kwargs)
#
#         """Given input, decide what to do.
#
#         Args:
#             intermediate_steps: Steps the LLM has taken to date,
#                 along with observations
#             **kwargs: User inputs.
#
#         Returns:
#             Actions specifying what tools to use.
#         """
#         if len(intermediate_steps) == 0:
#             first_action = AgentAction(tool="question type", tool_input=kwargs["input"], log="")
#             print("first action")
#             print(first_action)
#             second_action = AgentAction(tool="Grade", tool_input=kwargs["input"], log="")
#             print("second action")
#             print(second_action)
#             return [
#                 first_action,
#                 second_action,
#             ]
#         else:
#             return AgentFinish(return_values={"output": "bar"}, log="")
#
#     async def aplan(
#         self, intermediate_steps: List[Tuple[AgentAction, str]], **kwargs: Any
#     ) -> Union[List[AgentAction], AgentFinish]:
#         """Given input, decide what to do.
#
#         Args:
#             intermediate_steps: Steps the LLM has taken to date,
#                 along with observations
#             **kwargs: User inputs.
#
#         Returns:
#             Actions specifying what tools to use.
#         """
#         if len(intermediate_steps) == 0:
#             return [
#                 AgentAction(tool="question type", tool_input=kwargs["input"], log=""),
#                 AgentAction(tool="Grade",
#                             tool_input={
#                                 "student_name": kwargs["student_name"],
#                                 "question": kwargs["question"],
#                                 "question_type": kwargs["question_type"],
#                                 "interest": kwargs["interest"]
#                             }, log=""),
#             ]
#         else:
#             return AgentFinish(return_values={"output": "bar"}, log="")
#
#
# schema = {
#     "properties": {
#         "student_name": {"type": "string", "description": "The name of the student"},
#         "question": {"type": "string", "description": "The question being asked"},
#         "question type": {"type": "string",
#                           "enum": ["student grades", "student specific", "interest specific"],
#                           "description": "The type of question being asked"},
#         "interest": {"type": "string", "description": "The interest of the student"},
#     },
#     "required": ["question", "question type"]
# }
#
#
# def get_tagging_chain(question) -> str:
#     global schema
#     chain = create_tagging_chain(schema, llm)
#     first_answer = chain.run(question)
#     print("first answer:")
#     print(first_answer)
#     return first_answer
#
#
# def get_grading_agent():
#     tools = [
#         Tool(
#             name="question type",
#             func=get_tagging_chain,
#             description="Useful when you need to understand the type of the input."
#         ),
#         StructuredTool(
#             name="Grade",
#             func=generate_answer,
#             description="Useful when you need to answer questions about students, grades, interests, etc. from the context of canvas discussion posts. If the question is student specific, the student name is required.",
#             args_schema=ToolArgsSchema
#         )
#     ]
#     # agent = initialize_agent(tools, llm, agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION, verbose=True)
#
#     agent = FakeAgent(output_parser=CustomOutputParser())
#     # prompt = CustomPromptTemplate(template=agent_prompt, tools_getter=lambda q: tools, input_variables=["input", "intermediate_steps"])
#     # output_parser = CustomOutputParser()
#     # tool_names = [tool.name for tool in tools]
#     # llm_chain = LLMChain(llm=llm, prompt=prompt)
#     # agent = LLMSingleActionAgent(
#     #     llm_chain=llm_chain,
#     #     output_parser=output_parser,
#     #     stop=["\nObservation:"],
#     #     allowed_tools=tool_names,
#     # )
#     agent_executor = AgentExecutor.from_agent_and_tools(
#         agent=agent, tools=tools, verbose=True
#     )
#
#     # return initialize_agent(tools, llm, agent=AgentType.OPENAI_FUNCTIONS, verbose=True)
#     return agent_executor
#
#
# def grade_answer(question) -> str:
#     global chat_history, vectorstore_index
#     agent = get_grading_agent()
#     return agent.run(question)
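

# Minimal usage sketch (an assumption, not part of the original app wiring):
# requires OPENAI_API_KEY to be set and a docs/ folder of exported discussion
# HTML files; the question below is hypothetical.
if __name__ == "__main__":
    set_model_and_embeddings()
    get_search_index()
    print(get_question_type("What total score did the student Alice receive?"))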