taprosoft
committed on
Commit
·
394280f
1
Parent(s):
cb2ed5c
feat: add sycamore
Browse files
- Dockerfile +1 -1
- app.py +4 -0
- backends/__init__.py +2 -0
- backends/syca.py +51 -0
- requirements.txt +1 -0
Dockerfile
CHANGED
|
@@ -31,7 +31,7 @@ ENV HOME=/home/user \
|
|
| 31 |
PYTHONUNBUFFERED=1 \
|
| 32 |
GRADIO_SERVER_NAME=0.0.0.0
|
| 33 |
|
| 34 |
-
RUN pip3 install --no-cache-dir --upgrade -r /code/requirements.txt
|
| 35 |
|
| 36 |
# Set the working directory to the user's home directory
|
| 37 |
WORKDIR $HOME/app
|
|
|
|
| 31 |
PYTHONUNBUFFERED=1 \
|
| 32 |
GRADIO_SERVER_NAME=0.0.0.0
|
| 33 |
|
| 34 |
+
RUN pip3 install --use-deprecated=legacy-resolver --no-cache-dir --upgrade -r /code/requirements.txt
|
| 35 |
|
| 36 |
# Set the working directory to the user's home directory
|
| 37 |
WORKDIR $HOME/app
|
app.py
CHANGED
|
@@ -22,6 +22,7 @@ from backends import ( # convert_zerox,
|
|
| 22 |
convert_img2table,
|
| 23 |
convert_marker,
|
| 24 |
convert_mineru,
|
|
|
|
| 25 |
convert_unstructured,
|
| 26 |
)
|
| 27 |
from backends.settings import ENABLE_DEBUG_MODE
|
|
@@ -65,6 +66,8 @@ def convert_document(path, method, start_page=0, enabled=True):
|
|
| 65 |
text, debug_image_paths = convert_mineru(path, file_name)
|
| 66 |
elif method == "Gemini (API)":
|
| 67 |
text, debug_image_paths = convert_gemini(path, file_name)
|
|
|
|
|
|
|
| 68 |
# elif method == "Zerox":
|
| 69 |
# text, debug_image_paths = convert_zerox(path, file_name)
|
| 70 |
elif method == "Img2Table":
|
|
@@ -155,6 +158,7 @@ SUPPORTED_METHODS = [
|
|
| 155 |
"Gemini (API)",
|
| 156 |
"Img2Table",
|
| 157 |
"GMFT",
|
|
|
|
| 158 |
# "Zerox"
|
| 159 |
]
|
| 160 |
|
|
|
|
| 22 |
convert_img2table,
|
| 23 |
convert_marker,
|
| 24 |
convert_mineru,
|
| 25 |
+
convert_sycamore,
|
| 26 |
convert_unstructured,
|
| 27 |
)
|
| 28 |
from backends.settings import ENABLE_DEBUG_MODE
|
|
|
|
| 66 |
text, debug_image_paths = convert_mineru(path, file_name)
|
| 67 |
elif method == "Gemini (API)":
|
| 68 |
text, debug_image_paths = convert_gemini(path, file_name)
|
| 69 |
+
elif method == "Sycamore":
|
| 70 |
+
text, debug_image_paths = convert_sycamore(path, file_name)
|
| 71 |
# elif method == "Zerox":
|
| 72 |
# text, debug_image_paths = convert_zerox(path, file_name)
|
| 73 |
elif method == "Img2Table":
|
|
|
|
| 158 |
"Gemini (API)",
|
| 159 |
"Img2Table",
|
| 160 |
"GMFT",
|
| 161 |
+
"Sycamore",
|
| 162 |
# "Zerox"
|
| 163 |
]
|
| 164 |
|
backends/__init__.py
CHANGED
|
@@ -4,6 +4,7 @@ from .gmft import convert_gmft
|
|
| 4 |
from .img2table import convert_img2table
|
| 5 |
from .marker import convert_marker
|
| 6 |
from .mineru import convert_mineru
|
|
|
|
| 7 |
from .unstructured import convert_unstructured
|
| 8 |
|
| 9 |
# from .zerox import convert_zerox
|
|
@@ -17,4 +18,5 @@ __all__ = [
|
|
| 17 |
# "convert_zerox",
|
| 18 |
"convert_img2table",
|
| 19 |
"convert_gmft",
|
|
|
|
| 20 |
]
|
|
|
|
| 4 |
from .img2table import convert_img2table
|
| 5 |
from .marker import convert_marker
|
| 6 |
from .mineru import convert_mineru
|
| 7 |
+
from .syca import convert_sycamore
|
| 8 |
from .unstructured import convert_unstructured
|
| 9 |
|
| 10 |
# from .zerox import convert_zerox
|
|
|
|
| 18 |
# "convert_zerox",
|
| 19 |
"convert_img2table",
|
| 20 |
"convert_gmft",
|
| 21 |
+
"convert_sycamore",
|
| 22 |
]
|
backends/syca.py
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import logging
|
| 2 |
+
from pathlib import Path
|
| 3 |
+
|
| 4 |
+
import sycamore
|
| 5 |
+
from sycamore import ExecMode
|
| 6 |
+
from sycamore.data import Document
|
| 7 |
+
from sycamore.data.document import DocumentPropertyTypes
|
| 8 |
+
from sycamore.functions.document import DrawBoxes, split_and_convert_to_image
|
| 9 |
+
from sycamore.transforms.partition import ArynPartitioner
|
| 10 |
+
from sycamore.utils.markdown import elements_to_markdown
|
| 11 |
+
|
| 12 |
+
from .settings import ENABLE_DEBUG_MODE
|
| 13 |
+
|
| 14 |
+
logging.getLogger().setLevel(logging.INFO)
|
| 15 |
+
SYCAMORE_DEBUG_PATH = Path("/tmp/sycamore")
|
| 16 |
+
SYCAMORE_DEBUG_PATH.mkdir(exist_ok=True)
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
paritioner = ArynPartitioner(
|
| 20 |
+
use_partitioning_service=False,
|
| 21 |
+
extract_table_structure=True,
|
| 22 |
+
use_ocr=True,
|
| 23 |
+
extract_images=True,
|
| 24 |
+
)
|
| 25 |
+
context = sycamore.init(
|
| 26 |
+
exec_mode=ExecMode.LOCAL,
|
| 27 |
+
)
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
def image_page_filename_fn(doc: Document) -> str:
|
| 31 |
+
page_num = doc.properties[DocumentPropertyTypes.PAGE_NUMBER]
|
| 32 |
+
return f"page_{page_num}.png"
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
def convert_sycamore(path: str, file_name: str):
|
| 36 |
+
docset = context.read.binary(paths=path, binary_format="pdf").partition(
|
| 37 |
+
partitioner=paritioner,
|
| 38 |
+
)
|
| 39 |
+
debug_path = SYCAMORE_DEBUG_PATH / file_name
|
| 40 |
+
debug_path.mkdir(exist_ok=True)
|
| 41 |
+
image_paths = []
|
| 42 |
+
|
| 43 |
+
doc = docset.take_all()[0]
|
| 44 |
+
md = elements_to_markdown(doc.elements)
|
| 45 |
+
|
| 46 |
+
if ENABLE_DEBUG_MODE:
|
| 47 |
+
docset.flat_map(split_and_convert_to_image).map_batch(
|
| 48 |
+
DrawBoxes, f_constructor_kwargs={"draw_table_cells": True}
|
| 49 |
+
).write.files(str(debug_path), filename_fn=image_page_filename_fn)
|
| 50 |
+
image_paths = [str(path) for path in debug_path.glob("*.png")]
|
| 51 |
+
return md, image_paths
|
requirements.txt
CHANGED
|
@@ -19,3 +19,4 @@ openai
|
|
| 19 |
opencv-contrib-python
|
| 20 |
gmft
|
| 21 |
img2table
|
|
|
|
|
|
| 19 |
opencv-contrib-python
|
| 20 |
gmft
|
| 21 |
img2table
|
| 22 |
+
sycamore-ai[local-inference]
|