Spaces:
Running
Running
Synced repo using 'sync_with_huggingface' GitHub Action
Browse files
- gradio_app.py +12 -5
- requirements.txt +1 -1
gradio_app.py
CHANGED
|
@@ -119,11 +119,13 @@ with gr.Blocks(title="Marker") as demo:
|
|
| 119 |
page_range_txt = gr.Textbox(label="Page range to parse, comma separated like 0,5-10,20", value=f"")
|
| 120 |
output_format_dd = gr.Dropdown(label="Output format", choices=["markdown", "json", "html"], value="markdown")
|
| 121 |
|
|
|
|
| 122 |
force_ocr_ckb = gr.Checkbox(label="Force OCR", value=True, info="Force OCR on all pages")
|
| 123 |
show_blocks_ckb = gr.Checkbox(label="Show Blocks", info="Display detected blocks, only when output is JSON", value=False, interactive=False)
|
| 124 |
debug_ckb = gr.Checkbox(label="Debug", value=False, info="Show debug information")
|
| 125 |
-
use_llm_ckb = gr.Checkbox(label="Use LLM", value=False, info="Use LLM for higher quality processing")
|
| 126 |
strip_existing_ocr_ckb = gr.Checkbox(label="Strip existing OCR", value=False, info="Strip existing OCR text from the PDF and re-OCR.")
|
|
|
|
|
|
|
| 127 |
run_marker_btn = gr.Button("Run Marker", interactive=False)
|
| 128 |
with gr.Column():
|
| 129 |
result_md = gr.Markdown(label="Result markdown", visible=False)
|
|
@@ -191,7 +193,7 @@ with gr.Blocks(title="Marker") as demo:
|
|
| 191 |
)
|
| 192 |
|
| 193 |
# Run Marker
|
| 194 |
-
def run_marker_img(filename, page_range, force_ocr, output_format, show_blocks, debug, use_llm, strip_existing_ocr):
|
| 195 |
"""
|
| 196 |
Run marker on the given PDF file and return processed results in multiple formats.
|
| 197 |
|
|
@@ -209,7 +211,10 @@ with gr.Blocks(title="Marker") as demo:
|
|
| 209 |
Defaults to False.
|
| 210 |
strip_existing_ocr (bool, optional): If True, strip embedded OCR text and re-run OCR.
|
| 211 |
Defaults to False.
|
| 212 |
-
|
|
|
|
|
|
|
|
|
|
| 213 |
Returns:
|
| 214 |
tuple:
|
| 215 |
- markdown_result (str): Markdown output string.
|
|
@@ -226,7 +231,9 @@ with gr.Blocks(title="Marker") as demo:
|
|
| 226 |
"debug": debug,
|
| 227 |
"output_dir": settings.DEBUG_DATA_FOLDER if debug else None,
|
| 228 |
"use_llm": use_llm,
|
| 229 |
-
"strip_existing_ocr": strip_existing_ocr
|
|
|
|
|
|
|
| 230 |
}
|
| 231 |
config_parser = ConfigParser(cli_options)
|
| 232 |
rendered = convert_pdf(
|
|
@@ -310,7 +317,7 @@ with gr.Blocks(title="Marker") as demo:
|
|
| 310 |
|
| 311 |
run_marker_btn.click(
|
| 312 |
fn=run_marker_img,
|
| 313 |
-
inputs=[in_file, page_range_txt, force_ocr_ckb, output_format_dd, show_blocks_ckb, debug_ckb, use_llm_ckb, strip_existing_ocr_ckb],
|
| 314 |
outputs=[result_md, result_json, result_html, debug_img_pdf, debug_img_layout, in_img]
|
| 315 |
)
|
| 316 |
|
|
|
|
| 119 |
page_range_txt = gr.Textbox(label="Page range to parse, comma separated like 0,5-10,20", value=f"")
|
| 120 |
output_format_dd = gr.Dropdown(label="Output format", choices=["markdown", "json", "html"], value="markdown")
|
| 121 |
|
| 122 |
+
use_llm_ckb = gr.Checkbox(label="Use LLM", value=False, info="Use LLM for higher quality processing")
|
| 123 |
force_ocr_ckb = gr.Checkbox(label="Force OCR", value=True, info="Force OCR on all pages")
|
| 124 |
show_blocks_ckb = gr.Checkbox(label="Show Blocks", info="Display detected blocks, only when output is JSON", value=False, interactive=False)
|
| 125 |
debug_ckb = gr.Checkbox(label="Debug", value=False, info="Show debug information")
|
|
|
|
| 126 |
strip_existing_ocr_ckb = gr.Checkbox(label="Strip existing OCR", value=False, info="Strip existing OCR text from the PDF and re-OCR.")
|
| 127 |
+
format_lines_ckb = gr.Checkbox(label="Format lines", value=False, info="Format lines in the document with OCR model")
|
| 128 |
+
disable_ocr_math_ckb = gr.Checkbox(label="Disable math", value=False, info="Disable math in OCR output - no inline math")
|
| 129 |
run_marker_btn = gr.Button("Run Marker", interactive=False)
|
| 130 |
with gr.Column():
|
| 131 |
result_md = gr.Markdown(label="Result markdown", visible=False)
|
|
|
|
| 193 |
)
|
| 194 |
|
| 195 |
# Run Marker
|
| 196 |
+
def run_marker_img(filename, page_range, force_ocr, output_format, show_blocks, debug, use_llm, strip_existing_ocr, format_lines, disable_ocr_math):
|
| 197 |
"""
|
| 198 |
Run marker on the given PDF file and return processed results in multiple formats.
|
| 199 |
|
|
|
|
| 211 |
Defaults to False.
|
| 212 |
strip_existing_ocr (bool, optional): If True, strip embedded OCR text and re-run OCR.
|
| 213 |
Defaults to False.
|
| 214 |
+
format_lines (bool, optional): If True, format lines in the document with OCR model.
|
| 215 |
+
Defaults to False.
|
| 216 |
+
disable_ocr_math (bool, optional): If True, disable math in OCR output - no inline math.
|
| 217 |
+
Defaults to False.
|
| 218 |
Returns:
|
| 219 |
tuple:
|
| 220 |
- markdown_result (str): Markdown output string.
|
|
|
|
| 231 |
"debug": debug,
|
| 232 |
"output_dir": settings.DEBUG_DATA_FOLDER if debug else None,
|
| 233 |
"use_llm": use_llm,
|
| 234 |
+
"strip_existing_ocr": strip_existing_ocr,
|
| 235 |
+
"format_lines": format_lines,
|
| 236 |
+
"disable_ocr_math": disable_ocr_math,
|
| 237 |
}
|
| 238 |
config_parser = ConfigParser(cli_options)
|
| 239 |
rendered = convert_pdf(
|
|
|
|
| 317 |
|
| 318 |
run_marker_btn.click(
|
| 319 |
fn=run_marker_img,
|
| 320 |
+
inputs=[in_file, page_range_txt, force_ocr_ckb, output_format_dd, show_blocks_ckb, debug_ckb, use_llm_ckb, strip_existing_ocr_ckb, format_lines_ckb, disable_ocr_math_ckb],
|
| 321 |
outputs=[result_md, result_json, result_html, debug_img_pdf, debug_img_layout, in_img]
|
| 322 |
)
|
| 323 |
|
requirements.txt
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
torch==2.5.1
|
| 2 |
-
marker-pdf[full]==1.7.0
|
| 3 |
gradio[mcp]==5.28.0
|
| 4 |
huggingface-hub==0.28.1
|
| 5 |
|
|
|
|
| 1 |
torch==2.5.1
|
| 2 |
+
marker-pdf[full]==1.7.1
|
| 3 |
gradio[mcp]==5.28.0
|
| 4 |
huggingface-hub==0.28.1
|
| 5 |
|