Update app.py
app.py
CHANGED
@@ -28,6 +28,10 @@ from openai import OpenAI
 # =============================
 api_key_env = os.getenv("OPENAI_API_KEY", "").strip()

+from datasets import load_dataset
+
+
+
 ds: List[torch.Tensor] = []  # page embeddings
 images: List[Image.Image] = []  # PIL images in page order
 current_pdf_path: Optional[str] = None
@@ -174,7 +178,7 @@ def image_search(query: str, k: int = 5) -> List[int]:
     """
     Search within a PDF document for the most relevant pages to answer a query and return the page indexes as a list.
     MCP tool description:
-    - name:
+    - name: test_deepsearch_image_search
     - description: Search within a PDF document for the most relevant pages to answer a query.
     - input_schema:
         type: object
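Note: with the name field filled in, MCP clients now see this tool as test_deepsearch_image_search. A minimal sketch of the underlying call (the query string is illustrative, and a PDF must already be indexed so the module-level images list is populated):

    # Hypothetical usage of image_search as declared in this diff.
    indices = image_search("What are the company's scope 1 emission targets?", k=5)
    print(indices)                    # e.g. [12, 3, 47, 5, 30] -- 0-based page indexes
    pages = [i + 1 for i in indices]  # the gallery and citations use 1-based page numbers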
@@ -212,7 +216,7 @@ def search_synthetize(query: str, k: int = 5) -> List[int]:
     """
     Search within a PDF document for the most relevant pages to answer a query and synthetizes a short grounded answer using only those pages.
    MCP tool description:
-    - name:
+    - name: test_deepsearch_search_synthetize
     - description: Search within a PDF document for the most relevant pages to answer a query and synthetizes a short grounded answer using only those pages.
     - input_schema:
         type: object
@@ -227,13 +231,13 @@ def search_synthetize(query: str, k: int = 5) -> List[int]:
         ai_response (str): Text answer to the query grounded in content from the PDF, with citations (page numbers).
     """
     top_k_indices = image_search(query, k)
-    expanded = set(top_k_indices)
-    for i in top_k_indices:
-        expanded.add(i - 1)
-        expanded.add(i + 1)
-    expanded = {i for i in expanded if 0 <= i < len(images)}
-    expanded = sorted(expanded)
-    expanded =
+    # expanded = set(top_k_indices)
+    # for i in top_k_indices:
+    #     expanded.add(i - 1)
+    #     expanded.add(i + 1)
+    # expanded = {i for i in expanded if 0 <= i < len(images)}
+    # expanded = sorted(expanded)
+    expanded = top_k_indices


     # Build gallery results with 1-based page numbering
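Review note: the disabled block added each hit's immediate neighbors (i - 1 and i + 1) and clamped them to document bounds before the diff switched to the raw top-k list. If that behavior is wanted again, it could be factored into a single helper instead of living inline twice; a sketch only, expand_with_neighbors is not a function in this app:

    from typing import List

    def expand_with_neighbors(indices: List[int], n_pages: int) -> List[int]:
        # Add the pages just before and after each hit, clamp to
        # [0, n_pages), and return sorted unique indexes.
        expanded = set(indices)
        for i in indices:
            expanded.add(i - 1)
            expanded.add(i + 1)
        return sorted(i for i in expanded if 0 <= i < n_pages)

    # expanded = expand_with_neighbors(top_k_indices, len(images))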
@@ -270,12 +274,12 @@ def _build_image_parts_from_indices(indices: List[int]) -> List[Dict[str, Any]]:

 SYSTEM1 = (
     """
-You are a PDF research agent with a single tool:
+You are a PDF research agent with a single tool: test_deepsearch_image_search(query: string, k: int).
 Act iteratively:
 1) If you are given images, analyze them to find the information you were looking for. If you are confident that you have all the information needed for a complete response, provide a final answer. Most often, you should run new search calls using the tool to find additional missing information.
 2) To run new searches, split the query into 1–3 focused sub-queries. You can use the provided page images, if any, to help you ask relevant follow-up queries. Sub-queries should be asked as natural-language questions, not just keywords.
-3) For each sub-query, call
-4) You will receive the output of
+3) For each sub-query, call test_deepsearch_image_search (k=5 by default; increase up to 10 if you need to go deep).
+4) You will receive the output of test_deepsearch_image_search as a list of indices corresponding to page numbers. Print the page numbers out and stop generating. An external system will take over and convert the indices into images for you.
 5) Back to step 1. Analyze the images received to find the information you were looking for. If you are confident that you have all the information needed for a complete response, provide a final answer. Otherwise run new search calls using the tool to find additional missing information.

 Workflow:
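Step 4 of SYSTEM1 relies on an external loop turning the returned indices into images; _build_image_parts_from_indices (visible in the hunk header above) plays that role here. A rough sketch of what such a helper can look like, assuming pages are kept as PIL images and sent as base64 data URLs (the app's real implementation may differ):

    import base64
    import io
    from typing import Any, Dict, List
    from PIL import Image

    def build_image_parts(indices: List[int], pages: List[Image.Image]) -> List[Dict[str, Any]]:
        parts = []
        for i in indices:
            buf = io.BytesIO()
            pages[i].save(buf, format="PNG")  # encode the rendered page
            b64 = base64.b64encode(buf.getvalue()).decode("ascii")
            parts.append({"type": "image_url",
                          "image_url": {"url": f"data:image/png;base64,{b64}"}})
        return parts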
@@ -290,10 +294,10 @@ Deliverable:


 SYSTEM2 = """
-You are a PDF research agent with a single tool:
+You are a PDF research agent with a single tool: test_deepsearch_search_synthetize(query: string, k: int).
 Act iteratively:
 1) Split the user question into 1–4 focused sub-queries. Sub-queries should be asked as natural-language questions, not just keywords.
-2) For each sub-query, call
+2) For each sub-query, call test_deepsearch_search_synthetize (k=5 by default; increase up to 20 if you need to go deep).
 3) Stop early when confident; otherwise refine and repeat, up to 4 iterations and 20 searches in total. If info is missing, keep searching with new keywords and queries.

 Grounding & citations:
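The 4-iteration / 20-search budget in SYSTEM2 is enforced only through the prompt. A driver loop mirroring it could look like the sketch below; make_subqueries and answer_is_confident are placeholders, not functions in this app:

    MAX_ROUNDS, MAX_SEARCHES = 4, 20
    searches_done = 0
    answer = ""
    for round_idx in range(MAX_ROUNDS):
        for q in make_subqueries(question)[: MAX_SEARCHES - searches_done]:
            answer = search_synthetize(q, k=5)  # grounded answer with page citations
            searches_done += 1
        if answer_is_confident(answer) or searches_done >= MAX_SEARCHES:
            break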
@@ -342,7 +346,7 @@ def stream_agent(question: str,

     visual_reasoning: bool = True if "Visual Reasoning" in visual_reasoning else False

-    allowed_tools = "
+    allowed_tools = "test_deepsearch_image_search" if visual_reasoning else "test_deepsearch_search_synthetize"
     SYSTEM = SYSTEM1 if visual_reasoning else SYSTEM2

     if not api_key:
@@ -498,12 +502,12 @@ def stream_agent(question: str,
         if next_indices and visual_reasoning:
             # Neighbor expansion for context
             base = set(next_indices)
-            expanded = set(base)
-            for i in base:
-                expanded.add(i - 1)
-                expanded.add(i + 1)
-            expanded = {i for i in expanded if 0 <= i < len(images)}
-            pending_indices = sorted(expanded)
+            # expanded = set(base)
+            # for i in base:
+            #     expanded.add(i - 1)
+            #     expanded.add(i + 1)
+            # expanded = {i for i in expanded if 0 <= i < len(images)}
+            pending_indices = sorted(base)
             round_idx += 1
             continue

@@ -592,41 +596,41 @@ def build_ui():
     )

     # ---- Tab 1: Index & Preview
-    with gr.Tab("1) Index & Preview"):
-        with gr.Row():
-            with gr.Column(scale=1):
-                pdf_input = gr.File(label="Upload PDF", file_types=[".pdf"])
-                index_btn = gr.Button("📥 Index Uploaded PDF", variant="secondary")
-                url_box = gr.Textbox(
-                    label="Or index from URL",
-                    placeholder="https://example.com/file.pdf",
-                    value="",
-                )
-                index_url_btn = gr.Button("🌐 Load From URL", variant="secondary")
-                status_box = gr.Textbox(label="Status", interactive=False)
-            with gr.Column(scale=2):
-                pdf_view = PDF(label="PDF Preview")
-
-        # wiring
-        def handle_upload(file):
-            global current_pdf_path
-            if file is None:
-                return "Please upload a PDF.", None
-            path = getattr(file, "name", file)
-            status = index_from_path(path)
-            current_pdf_path = path
-            return status, path
-
-        def handle_url(url: str):
-            global current_pdf_path
-            if not url or not url.lower().endswith(".pdf"):
-                return "Please provide a direct PDF URL ending in .pdf", None
-            status, path = index_from_url(url)
-            current_pdf_path = path
-            return status, path
-
-        index_btn.click(handle_upload, inputs=[pdf_input], outputs=[status_box, pdf_view])
-        index_url_btn.click(handle_url, inputs=[url_box], outputs=[status_box, pdf_view])
+    # with gr.Tab("1) Index & Preview"):
+    #     with gr.Row():
+    #         with gr.Column(scale=1):
+    #             pdf_input = gr.File(label="Upload PDF", file_types=[".pdf"])
+    #             index_btn = gr.Button("📥 Index Uploaded PDF", variant="secondary")
+    #             url_box = gr.Textbox(
+    #                 label="Or index from URL",
+    #                 placeholder="https://example.com/file.pdf",
+    #                 value="",
+    #             )
+    #             index_url_btn = gr.Button("🌐 Load From URL", variant="secondary")
+    #             status_box = gr.Textbox(label="Status", interactive=False)
+    #         with gr.Column(scale=2):
+    #             pdf_view = PDF(label="PDF Preview")
+
+    #     # wiring
+    #     def handle_upload(file):
+    #         global current_pdf_path
+    #         if file is None:
+    #             return "Please upload a PDF.", None
+    #         path = getattr(file, "name", file)
+    #         status = index_from_path(path)
+    #         current_pdf_path = path
+    #         return status, path
+
+    #     def handle_url(url: str):
+    #         global current_pdf_path
+    #         if not url or not url.lower().endswith(".pdf"):
+    #             return "Please provide a direct PDF URL ending in .pdf", None
+    #         status, path = index_from_url(url)
+    #         current_pdf_path = path
+    #         return status, path
+
+    #     index_btn.click(handle_upload, inputs=[pdf_input], outputs=[status_box, pdf_view])
+    #     index_url_btn.click(handle_url, inputs=[url_box], outputs=[status_box, pdf_view])

     # ---- Tab 2: Ask (Direct → returns indices)
     with gr.Tab("2) Direct Search"):
@@ -712,6 +716,9 @@ def build_ui():

 if __name__ == "__main__":
     demo = build_ui()
+    images = load_dataset("vidore/esg_reports_human_labeled_v2", "corpus", split="test")["image"]
+    print("Indexing")
+    print(index_gpu(images))
     # mcp_server=True exposes this app's MCP endpoint at /gradio_api/mcp/
     # We keep the MCP server available, but the agent never uses MCP to pass images.
     demo.queue(max_size=5).launch(debug=True, mcp_server=True)
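The new startup path indexes a fixed corpus instead of waiting for an upload. A quick sanity check of what load_dataset returns before it is handed to index_gpu (a sketch; the dataset id and the image column are taken from the diff, while index_gpu is defined elsewhere in app.py):

    from datasets import load_dataset

    corpus = load_dataset("vidore/esg_reports_human_labeled_v2", "corpus", split="test")
    pages = corpus["image"]           # decoded as a list of PIL.Image pages
    print(len(pages), pages[0].size)  # page count and first-page resolution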