Spaces:
Running
Running
Synced repo using 'sync_with_huggingface' Github Action
Browse files
original:
- remote: "https://github.com/xiaoyao9184/docker-marker"
- commit: "53922256b5d246d3f11148c1a3dac0048a6fef4f"
sync_with_huggingface:
- repository: ""
- ref: ""
- gradio_app.py +40 -3
- requirements.txt +1 -1
gradio_app.py
CHANGED
|
@@ -117,7 +117,7 @@ with gr.Blocks(title="Marker") as demo:
|
|
| 117 |
)
|
| 118 |
|
| 119 |
page_range_txt = gr.Textbox(label="Page range to parse, comma separated like 0,5-10,20", value=f"")
|
| 120 |
-
output_format_dd = gr.Dropdown(label="Output format", choices=["markdown", "json", "html"], value="markdown")
|
| 121 |
|
| 122 |
use_llm_ckb = gr.Checkbox(label="Use LLM", value=False, info="Use LLM for higher quality processing")
|
| 123 |
force_ocr_ckb = gr.Checkbox(label="Force OCR", value=True, info="Force OCR on all pages")
|
|
@@ -186,7 +186,7 @@ with gr.Blocks(title="Marker") as demo:
|
|
| 186 |
)
|
| 187 |
|
| 188 |
output_format_dd.change(
|
| 189 |
-
fn=lambda x: gr.update(interactive=x == "json", value=x == "json"),
|
| 190 |
inputs=[output_format_dd],
|
| 191 |
outputs=[show_blocks_ckb],
|
| 192 |
api_name=False
|
|
@@ -201,7 +201,7 @@ with gr.Blocks(title="Marker") as demo:
|
|
| 201 |
filename (str): Path to the input PDF file.
|
| 202 |
page_range (str): Page range to process (e.g., "0-5").
|
| 203 |
force_ocr (bool, optional): If True (default), force OCR even on text-based PDFs.
|
| 204 |
-
output_format (str, optional): Output format. One of: "markdown", "html", "json".
|
| 205 |
Defaults to "markdown".
|
| 206 |
show_blocks (bool, optional): If True, show blocks in preview image with JSON output.
|
| 207 |
Defaults to False.
|
|
@@ -314,6 +314,43 @@ with gr.Blocks(title="Marker") as demo:
|
|
| 314 |
gr_debug_lay,
|
| 315 |
gr_img
|
| 316 |
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 317 |
|
| 318 |
run_marker_btn.click(
|
| 319 |
fn=run_marker_img,
|
|
|
|
| 117 |
)
|
| 118 |
|
| 119 |
page_range_txt = gr.Textbox(label="Page range to parse, comma separated like 0,5-10,20", value=f"")
|
| 120 |
+
output_format_dd = gr.Dropdown(label="Output format", choices=["markdown", "json", "html", "chunks"], value="markdown")
|
| 121 |
|
| 122 |
use_llm_ckb = gr.Checkbox(label="Use LLM", value=False, info="Use LLM for higher quality processing")
|
| 123 |
force_ocr_ckb = gr.Checkbox(label="Force OCR", value=True, info="Force OCR on all pages")
|
|
|
|
| 186 |
)
|
| 187 |
|
| 188 |
output_format_dd.change(
|
| 189 |
+
fn=lambda x: gr.update(interactive=x == "json" or x == "chunks", value=x == "json" or x == "chunks",),
|
| 190 |
inputs=[output_format_dd],
|
| 191 |
outputs=[show_blocks_ckb],
|
| 192 |
api_name=False
|
|
|
|
| 201 |
filename (str): Path to the input PDF file.
|
| 202 |
page_range (str): Page range to process (e.g., "0-5").
|
| 203 |
force_ocr (bool, optional): If True (default), force OCR even on text-based PDFs.
|
| 204 |
+
output_format (str, optional): Output format. One of: "markdown", "html", "json", "chunks".
|
| 205 |
Defaults to "markdown".
|
| 206 |
show_blocks (bool, optional): If True, show blocks in preview image with JSON output.
|
| 207 |
Defaults to False.
|
|
|
|
| 314 |
gr_debug_lay,
|
| 315 |
gr_img
|
| 316 |
]
|
| 317 |
+
elif output_format == "chunks":
|
| 318 |
+
if show_blocks:
|
| 319 |
+
doc_json = json.loads(text)
|
| 320 |
+
color_map = {}
|
| 321 |
+
sections = []
|
| 322 |
+
def traverse(block):
|
| 323 |
+
if "block_type" in block:
|
| 324 |
+
try:
|
| 325 |
+
index = list(BlockTypes.__members__).index(block["block_type"])
|
| 326 |
+
color = COLORS[index % len(COLORS)]
|
| 327 |
+
except (ValueError, IndexError):
|
| 328 |
+
color = "#cccccc" # fallback color
|
| 329 |
+
|
| 330 |
+
label = block["id"].replace("/page/0/", "")
|
| 331 |
+
color_map[label] = color
|
| 332 |
+
|
| 333 |
+
bbox = tuple(int(x) for x in block["bbox"])
|
| 334 |
+
sections.append((bbox, label))
|
| 335 |
+
if "blocks" in block and isinstance(block["blocks"], list):
|
| 336 |
+
for child in block["blocks"]:
|
| 337 |
+
traverse(child)
|
| 338 |
+
traverse(doc_json)
|
| 339 |
+
|
| 340 |
+
page_range = config_parser.generate_config_dict()["page_range"]
|
| 341 |
+
first_page = page_range[0] if page_range else 0
|
| 342 |
+
img = get_page_image(filename, first_page + 1, dpi=72)
|
| 343 |
+
|
| 344 |
+
gr_img = gr.update(value=(img, sections), color_map=color_map)
|
| 345 |
+
|
| 346 |
+
return [
|
| 347 |
+
gr.update(visible=False),
|
| 348 |
+
gr.update(visible=True, value=text),
|
| 349 |
+
gr.update(visible=False),
|
| 350 |
+
gr_debug_pdf,
|
| 351 |
+
gr_debug_lay,
|
| 352 |
+
gr_img
|
| 353 |
+
]
|
| 354 |
|
| 355 |
run_marker_btn.click(
|
| 356 |
fn=run_marker_img,
|
requirements.txt
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
torch==2.7.0
|
| 2 |
-
marker-pdf[full]==1.
|
| 3 |
gradio[mcp]==5.28.0
|
| 4 |
|
| 5 |
# transformers 4.52.4 depends on huggingface-hub>=0.30.0
|
|
|
|
| 1 |
torch==2.7.0
|
| 2 |
+
marker-pdf[full]==1.8.0
|
| 3 |
gradio[mcp]==5.28.0
|
| 4 |
|
| 5 |
# transformers 4.52.4 depends on huggingface-hub>=0.30.0
|