# TODO: return all pages used to form answer
# TODO: question samples
# TEST: with and without GPU instance
# TODO: visual questions on page image (in same app)?
# expose more parameters
import torch
from llama_index.llms.huggingface import HuggingFaceLLM
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core import SimpleDirectoryReader
from llama_index.core import VectorStoreIndex, SummaryIndex
from llama_index.core.prompts import PromptTemplate
from llama_index.core import Settings
from PIL import Image
import gradio as gr

# CHEAPMODE: fall back to a smaller model when no GPU is available
CHEAPMODE = not torch.cuda.is_available()
# LLM = "HuggingFaceH4/zephyr-7b-alpha" if not CHEAPMODE else "microsoft/phi-2"
config = {
    # "LLM": "meta-llama/Meta-Llama-3-8B",
    # "LLM": "google/gemma-2b",
    # "LLM": "microsoft/phi-2",
    "LLM": "HuggingFaceH4/zephyr-7b-alpha",
    "embeddings": "BAAI/bge-small-en-v1.5",
    "similarity_top_k": 2,
    "context_window": 2048,
    "max_new_tokens": 200,
    "temperature": 0.7,
    "top_k": 5,
    "top_p": 0.95,
    "chunk_size": 512,
    "chunk_overlap": 50,
}

def center_element(el):
    return f"<div style='text-align: center;'>{el}</div>"


title = "Ask my thesis: Intelligent Automation for AI-Driven Document Understanding"
title = center_element(title)
description = """Chat with the thesis manuscript: ask a question and receive an answer with a reference to the page it is based on.
<div class="center">
<a href="https://jordy-vl.github.io/assets/phdthesis/VanLandeghem_Jordy_PhD-thesis.pdf">
<img src="https://ideogram.ai/api/images/direct/cc3Um6ClQkWJpVdXx6pWVA.png"
title="Thesis.pdf" alt="Ideogram image generated with prompt engineering" width="500" class="center"/></a>
</div> Click the image above to open the PDF of the manuscript.
Technology used: [Llama-index](https://www.llamaindex.ai/), open-source LLMs from Hugging Face.
Spoiler: a quickly hacked-together RAG application with a >1B-parameter LLM and an online vector store can be quite slow on a 290-page document ⏳ (10s+).
"""
description = center_element(description)

def messages_to_prompt(messages):
    """Format chat messages into the Zephyr prompt template."""
    prompt = ""
    for message in messages:
        if message.role == "system":
            # any incoming system message is replaced by a fixed domain-expert persona
            m = "You are an expert in the research field of document understanding, Bayesian deep learning and neural networks."
            prompt += f"<|system|>\n{m}</s>\n"
        elif message.role == "user":
            prompt += f"<|user|>\n{message.content}</s>\n"
        elif message.role == "assistant":
            prompt += f"<|assistant|>\n{message.content}</s>\n"

    # ensure we start with a system prompt, insert blank if needed
    if not prompt.startswith("<|system|>\n"):
        prompt = "<|system|>\n</s>\n" + prompt

    # add final assistant prompt
    prompt = prompt + "<|assistant|>\n"
    return prompt
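
# Illustrative sketch of the prompt this function produces (Zephyr chat template, example question only):
#   <|system|>
#   You are an expert in the research field of document understanding, ...</s>
#   <|user|>
#   What is ANLS?</s>
#   <|assistant|>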

def load_RAG_pipeline(config):
    # LLM
    quantization_config = None  # dirty fix for CPU/GPU support
    if torch.cuda.is_available():
        from transformers import BitsAndBytesConfig

        quantization_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_compute_dtype=torch.float16,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_use_double_quant=True,
        )

    llm = HuggingFaceLLM(
        model_name=config["LLM"],
        tokenizer_name=config["LLM"],
        query_wrapper_prompt=PromptTemplate("<|system|>\n</s>\n<|user|>\n{query_str}</s>\n<|assistant|>\n"),
        context_window=config["context_window"],
        max_new_tokens=config["max_new_tokens"],
        model_kwargs={"quantization_config": quantization_config},
        # tokenizer_kwargs={},
        generate_kwargs={"temperature": config["temperature"], "top_k": config["top_k"], "top_p": config["top_p"]},
        # NOTE: these sampling parameters only take effect if the model's generation config enables sampling;
        # adding "do_sample": True to generate_kwargs may be needed.
        messages_to_prompt=messages_to_prompt,
        device_map="auto",
    )

    # llama-index global settings
    Settings.llm = llm
    Settings.embed_model = HuggingFaceEmbedding(model_name=config["embeddings"])
    print(Settings)  # debug
    Settings.chunk_size = config["chunk_size"]
    Settings.chunk_overlap = config["chunk_overlap"]
    # raw data
    documents = SimpleDirectoryReader("assets/txts").load_data()
    vector_index = VectorStoreIndex.from_documents(documents)
    # summary_index = SummaryIndex.from_documents(documents)
    # vector_index.persist(persist_dir="vectors")
    # https://docs.llamaindex.ai/en/v0.10.17/understanding/storing/storing.html
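    # Persistence sketch (not wired in; "vectors" is just the persist_dir hinted at above):
    #   vector_index.storage_context.persist(persist_dir="vectors")
    # and to reload without re-embedding:
    #   from llama_index.core import StorageContext, load_index_from_storage
    #   vector_index = load_index_from_storage(StorageContext.from_defaults(persist_dir="vectors"))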
    query_engine = vector_index.as_query_engine(response_mode="compact", similarity_top_k=config["similarity_top_k"])
    return query_engine

default_query_engine = load_RAG_pipeline(config)


# The functions below wrap the query engine for the Gradio interface:
# they retrieve relevant pages and generate an answer grounded in the document.
def get_answer(question, query_engine=default_query_engine):
    # Query the vector index and let the LLM compose an answer from the retrieved pages.
    # https://docs.llamaindex.ai/en/stable/module_guides/supporting_modules/settings/#setting-local-configurations
    response = query_engine.query(question)
    print(f"A: {response}")
    return response

def get_answer_page(response):
    # Retrieve the page image for the best-matching source node.
    # Assumes per-page files whose names end in a 4-digit page number, e.g. ".../<name>_0042.txt",
    # with a matching PNG render reachable by swapping "txt" for "png" in the path.
    best_match = response.source_nodes[0].metadata["file_path"]
    answer_page = int(best_match[-8:-4])
    image = Image.open(best_match.replace("txt", "png"))
    return image, f"Navigate to page {answer_page}"

# Gradio handler: answer a question, rebuilding the RAG pipeline when parameters change.
def ask_my_thesis(
    question,
    similarity_top_k=config["similarity_top_k"],
    context_window=config["context_window"],
    max_new_tokens=config["max_new_tokens"],
    temperature=config["temperature"],
    top_k=config["top_k"],
    top_p=config["top_p"],
    chunk_size=config["chunk_size"],
    chunk_overlap=config["chunk_overlap"],
):
    global default_query_engine

    print(f"Got Q: {question}")
    query_engine = default_query_engine

    # Check if any of the kwargs have changed
    if (
        temperature != config["temperature"]
        or top_p != config["top_p"]
        or max_new_tokens != config["max_new_tokens"]
        # or LLM != config["LLM"]
        # or embeddings != config["embeddings"]
        or similarity_top_k != config["similarity_top_k"]
        or context_window != config["context_window"]
        or top_k != config["top_k"]
        or chunk_size != config["chunk_size"]
        or chunk_overlap != config["chunk_overlap"]
    ):
        # Update the config dictionary with the new values
        config["temperature"] = temperature
        config["top_p"] = top_p
        config["max_new_tokens"] = max_new_tokens
        # config["LLM"] = LLM
        # config["embeddings"] = embeddings
        config["similarity_top_k"] = similarity_top_k
        config["context_window"] = context_window
        config["top_k"] = top_k
        config["chunk_size"] = chunk_size
        config["chunk_overlap"] = chunk_overlap
        # Rebuilding re-loads the LLM and re-embeds all pages, so the first answer after a change is slow.
        query_engine = load_RAG_pipeline(config)
        # Keep the rebuilt engine as the new default so later calls with the same settings reuse it.
        default_query_engine = query_engine

    answer = get_answer(question, query_engine=query_engine)
    image, answer_page = get_answer_page(answer)
    return answer.response, image, answer_page

# Set up the interface options based on the design in the image.
output_image = gr.Image(label="Answer Page")

# examples
examples = [
    ["What model is state-of-the-art on DUDE?"],
    ["Why is knowledge distillation interesting?"],
    ["What is ANLS?"],
]
# Define additional Gradio input elements
additional_inputs = [
    # gr.Input("text", label="Question"),
    # gr.Input("text", label="LLM", value=config["LLM"]),
    # gr.Input("text", label="Embeddings", value=config["embeddings"]),
    gr.Slider(1, 5, value=config["similarity_top_k"], label="Similarity Top K", step=1),
    gr.Slider(512, 8048, value=config["context_window"], label="Context Window"),
    gr.Slider(20, 500, value=config["max_new_tokens"], label="Max New Tokens"),
    gr.Slider(0, 1, value=config["temperature"], label="Temperature"),
    gr.Slider(1, 10, value=config["top_k"], label="Top K", step=1),
    gr.Slider(0, 1, value=config["top_p"], label="Nucleus Sampling"),
    gr.Slider(128, 4024, value=config["chunk_size"], label="Chunk Size"),
    gr.Slider(0, 200, value=config["chunk_overlap"], label="Chunk Overlap"),
]
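# NOTE: additional_inputs are passed to ask_my_thesis positionally after the main input, so the slider
# order above must match the function's keyword parameters (similarity_top_k, context_window,
# max_new_tokens, temperature, top_k, top_p, chunk_size, chunk_overlap).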
iface = gr.Interface(
    fn=ask_my_thesis,
    inputs=[gr.Textbox(label="Question", placeholder="Type your question here...")],
    additional_inputs=additional_inputs,
    outputs=[gr.Textbox(label="Answer"), output_image, gr.Label()],
    examples=examples,
    title=title,
    description=description,
    allow_flagging="auto",
    cache_examples=True,
)
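# Optional: calling iface.queue() before launch enables Gradio's request queue, which may help when
# responses are slow (10s+) and several users query at once; the app works without it.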
# https://github.com/gradio-app/gradio/issues/4309
# https://discuss.huggingface.co/t/add-background-image/16381/4 background image

# Start the application.
if __name__ == "__main__":
    iface.launch()