Update app.py
app.py
CHANGED
@@ -28,6 +28,10 @@ from openai import OpenAI
 # =============================
 api_key_env = os.getenv("OPENAI_API_KEY", "").strip()

+from datasets import load_dataset
+
+
+
 ds: List[torch.Tensor] = []  # page embeddings
 images: List[Image.Image] = []  # PIL images in page order
 current_pdf_path: Optional[str] = None
@@ -174,7 +178,7 @@ def image_search(query: str, k: int = 5) -> List[int]:
     """
     Search within a PDF document for the most relevant pages to answer a query and return the page indexes as a list.
     MCP tool description:
-    - name:
+    - name: test_deepsearch_image_search
     - description: Search within a PDF document for the most relevant pages to answer a query.
     - input_schema:
         type: object
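Note: with the name field filled in, MCP clients now see this tool as test_deepsearch_image_search. A minimal sketch of the underlying call (the query string is illustrative, and a PDF must already be indexed so the module-level images list is populated):

    # Hypothetical usage of image_search as declared in this diff.
    indices = image_search("What are the company's scope 1 emission targets?", k=5)
    print(indices)                    # e.g. [12, 3, 47, 5, 30] -- 0-based page indexes
    pages = [i + 1 for i in indices]  # the gallery and citations use 1-based page numbers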
@@ -212,7 +216,7 @@ def search_synthetize(query: str, k: int = 5) -> List[int]:
     """
     Search within a PDF document for the most relevant pages to answer a query and synthetizes a short grounded answer using only those pages.
    MCP tool description:
-    - name:
+    - name: test_deepsearch_search_synthetize
     - description: Search within a PDF document for the most relevant pages to answer a query and synthetizes a short grounded answer using only those pages.
     - input_schema:
         type: object
@@ -227,13 +231,13 @@ def search_synthetize(query: str, k: int = 5) -> List[int]:
         ai_response (str): Text answer to the query grounded in content from the PDF, with citations (page numbers).
     """
     top_k_indices = image_search(query, k)
-    expanded = set(top_k_indices)
-    for i in top_k_indices:
-        expanded.add(i - 1)
-        expanded.add(i + 1)
-    expanded = {i for i in expanded if 0 <= i < len(images)}
-    expanded = sorted(expanded)
-    expanded =
+    # expanded = set(top_k_indices)
+    # for i in top_k_indices:
+    #     expanded.add(i - 1)
+    #     expanded.add(i + 1)
+    # expanded = {i for i in expanded if 0 <= i < len(images)}
+    # expanded = sorted(expanded)
+    expanded = top_k_indices


     # Build gallery results with 1-based page numbering
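Review note: the disabled block added each hit's immediate neighbors (i - 1 and i + 1) and clamped them to document bounds before the diff switched to the raw top-k list. If that behavior is wanted again, it could be factored into a single helper instead of living inline twice; a sketch only, expand_with_neighbors is not a function in this app:

    from typing import List

    def expand_with_neighbors(indices: List[int], n_pages: int) -> List[int]:
        # Add the pages just before and after each hit, clamp to
        # [0, n_pages), and return sorted unique indexes.
        expanded = set(indices)
        for i in indices:
            expanded.add(i - 1)
            expanded.add(i + 1)
        return sorted(i for i in expanded if 0 <= i < n_pages)

    # expanded = expand_with_neighbors(top_k_indices, len(images))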
@@ -270,12 +274,12 @@ def _build_image_parts_from_indices(indices: List[int]) -> List[Dict[str, Any]]:

 SYSTEM1 = (
     """
-You are a PDF research agent with a single tool:
+You are a PDF research agent with a single tool: test_deepsearch_image_search(query: string, k: int).
 Act iteratively:
 1) If you are given images, analyze them to find the information you were looking for. If you are confident that you have all the information needed for a complete response, provide a final answer. Most often, you should run new search calls using the tool to find additional missing information.
 2) To run new searches, split the query into 1–3 focused sub-queries. You can use the provided page images, if any, to help you ask relevant follow-up queries. Sub-queries should be asked as natural-language questions, not just keywords.
-3) For each sub-query, call
-4) You will receive the output of
+3) For each sub-query, call test_deepsearch_image_search (k=5 by default; increase up to 10 if you need to go deep).
+4) You will receive the output of test_deepsearch_image_search as a list of indices corresponding to page numbers. Print the page numbers out and stop generating. An external system will take over and convert the indices into images for you.
 5) Back to step 1. Analyze the images received to find the information you were looking for. If you are confident that you have all the information needed for a complete response, provide a final answer. Otherwise run new search calls using the tool to find additional missing information.

 Workflow:
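Step 4 of SYSTEM1 relies on an external loop turning the returned indices into images; _build_image_parts_from_indices (visible in the hunk header above) plays that role here. A rough sketch of what such a helper can look like, assuming pages are kept as PIL images and sent as base64 data URLs (the app's real implementation may differ):

    import base64
    import io
    from typing import Any, Dict, List
    from PIL import Image

    def build_image_parts(indices: List[int], pages: List[Image.Image]) -> List[Dict[str, Any]]:
        parts = []
        for i in indices:
            buf = io.BytesIO()
            pages[i].save(buf, format="PNG")  # encode the rendered page
            b64 = base64.b64encode(buf.getvalue()).decode("ascii")
            parts.append({"type": "image_url",
                          "image_url": {"url": f"data:image/png;base64,{b64}"}})
        return parts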
@@ -290,10 +294,10 @@ Deliverable:


 SYSTEM2 = """
-You are a PDF research agent with a single tool:
+You are a PDF research agent with a single tool: test_deepsearch_search_synthetize(query: string, k: int).
 Act iteratively:
 1) Split the user question into 1–4 focused sub-queries. Sub-queries should be asked as natural-language questions, not just keywords.
-2) For each sub-query, call
+2) For each sub-query, call test_deepsearch_search_synthetize (k=5 by default; increase up to 20 if you need to go deep).
 3) Stop early when confident; otherwise refine and repeat, up to 4 iterations and 20 searches in total. If info is missing, keep searching with new keywords and queries.

 Grounding & citations:
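The 4-iteration / 20-search budget in SYSTEM2 is enforced only through the prompt. A driver loop mirroring it could look like the sketch below; make_subqueries and answer_is_confident are placeholders, not functions in this app:

    MAX_ROUNDS, MAX_SEARCHES = 4, 20
    searches_done = 0
    answer = ""
    for round_idx in range(MAX_ROUNDS):
        for q in make_subqueries(question)[: MAX_SEARCHES - searches_done]:
            answer = search_synthetize(q, k=5)  # grounded answer with page citations
            searches_done += 1
        if answer_is_confident(answer) or searches_done >= MAX_SEARCHES:
            break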
@@ -342,7 +346,7 @@ def stream_agent(question: str,

     visual_reasoning: bool = True if "Visual Reasoning" in visual_reasoning else False

-    allowed_tools = "
+    allowed_tools = "test_deepsearch_image_search" if visual_reasoning else "test_deepsearch_search_synthetize"
     SYSTEM = SYSTEM1 if visual_reasoning else SYSTEM2

     if not api_key:
@@ -498,12 +502,12 @@ def stream_agent(question: str,
         if next_indices and visual_reasoning:
             # Neighbor expansion for context
             base = set(next_indices)
-            expanded = set(base)
-            for i in base:
-                expanded.add(i - 1)
-                expanded.add(i + 1)
-            expanded = {i for i in expanded if 0 <= i < len(images)}
-            pending_indices = sorted(expanded)
+            # expanded = set(base)
+            # for i in base:
+            #     expanded.add(i - 1)
+            #     expanded.add(i + 1)
+            # expanded = {i for i in expanded if 0 <= i < len(images)}
+            pending_indices = sorted(base)
             round_idx += 1
             continue

@@ -592,41 +596,41 @@ def build_ui():
     )

     # ---- Tab 1: Index & Preview
-    with gr.Tab("1) Index & Preview"):
-        with gr.Row():
-            with gr.Column(scale=1):
-                pdf_input = gr.File(label="Upload PDF", file_types=[".pdf"])
-                index_btn = gr.Button("📥 Index Uploaded PDF", variant="secondary")
-                url_box = gr.Textbox(
-                    label="Or index from URL",
-                    placeholder="https://example.com/file.pdf",
-                    value="",
-                )
-                index_url_btn = gr.Button("🌐 Load From URL", variant="secondary")
-                status_box = gr.Textbox(label="Status", interactive=False)
-            with gr.Column(scale=2):
-                pdf_view = PDF(label="PDF Preview")
-
-        # wiring
-        def handle_upload(file):
-            global current_pdf_path
-            if file is None:
-                return "Please upload a PDF.", None
-            path = getattr(file, "name", file)
-            status = index_from_path(path)
-            current_pdf_path = path
-            return status, path
-
-        def handle_url(url: str):
-            global current_pdf_path
-            if not url or not url.lower().endswith(".pdf"):
-                return "Please provide a direct PDF URL ending in .pdf", None
-            status, path = index_from_url(url)
-            current_pdf_path = path
-            return status, path
-
-        index_btn.click(handle_upload, inputs=[pdf_input], outputs=[status_box, pdf_view])
-        index_url_btn.click(handle_url, inputs=[url_box], outputs=[status_box, pdf_view])
+    # with gr.Tab("1) Index & Preview"):
+    #     with gr.Row():
+    #         with gr.Column(scale=1):
+    #             pdf_input = gr.File(label="Upload PDF", file_types=[".pdf"])
+    #             index_btn = gr.Button("📥 Index Uploaded PDF", variant="secondary")
+    #             url_box = gr.Textbox(
+    #                 label="Or index from URL",
+    #                 placeholder="https://example.com/file.pdf",
+    #                 value="",
+    #             )
+    #             index_url_btn = gr.Button("🌐 Load From URL", variant="secondary")
+    #             status_box = gr.Textbox(label="Status", interactive=False)
+    #         with gr.Column(scale=2):
+    #             pdf_view = PDF(label="PDF Preview")
+
+    #     # wiring
+    #     def handle_upload(file):
+    #         global current_pdf_path
+    #         if file is None:
+    #             return "Please upload a PDF.", None
+    #         path = getattr(file, "name", file)
+    #         status = index_from_path(path)
+    #         current_pdf_path = path
+    #         return status, path
+
+    #     def handle_url(url: str):
+    #         global current_pdf_path
+    #         if not url or not url.lower().endswith(".pdf"):
+    #             return "Please provide a direct PDF URL ending in .pdf", None
+    #         status, path = index_from_url(url)
+    #         current_pdf_path = path
+    #         return status, path
+
+    #     index_btn.click(handle_upload, inputs=[pdf_input], outputs=[status_box, pdf_view])
+    #     index_url_btn.click(handle_url, inputs=[url_box], outputs=[status_box, pdf_view])

     # ---- Tab 2: Ask (Direct → returns indices)
     with gr.Tab("2) Direct Search"):
@@ -712,6 +716,9 @@ def build_ui():

 if __name__ == "__main__":
     demo = build_ui()
+    images = load_dataset("vidore/esg_reports_human_labeled_v2", "corpus", split="test")["image"]
+    print("Indexing")
+    print(index_gpu(images))
     # mcp_server=True exposes this app's MCP endpoint at /gradio_api/mcp/
     # We keep the MCP server available, but the agent never uses MCP to pass images.
     demo.queue(max_size=5).launch(debug=True, mcp_server=True)
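The new startup path indexes a fixed corpus instead of waiting for an upload. A quick sanity check of what load_dataset returns before it is handed to index_gpu (a sketch; the dataset id and the image column are taken from the diff, while index_gpu is defined elsewhere in app.py):

    from datasets import load_dataset

    corpus = load_dataset("vidore/esg_reports_human_labeled_v2", "corpus", split="test")
    pages = corpus["image"]           # decoded as a list of PIL.Image pages
    print(len(pages), pages[0].size)  # page count and first-page resolution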