Spaces:
Sleeping
Sleeping
Lucas ARRIESSE
committed on
Commit
·
256eefa
1
Parent(s):
4edd44f
Hopefully fix task hang issues
Browse files
- api/docs.py +42 -43
api/docs.py
CHANGED
|
@@ -41,7 +41,7 @@ KREUZBERG_CONFIG: ExtractionConfig = ExtractionConfig(
|
|
| 41 |
force_ocr=False, ocr_backend=None, extract_tables=True)
|
| 42 |
|
| 43 |
# Unfortunately needs to be kept to 1, as libreoffice isn't built to support parallel instances
|
| 44 |
-
LO_CONVERSION_MUTEX = asyncio.
|
| 45 |
|
| 46 |
# Supported file types for text extraction and their MIME type
|
| 47 |
FORMAT_MIME_TYPES = {
|
|
@@ -65,54 +65,53 @@ async def convert_file_type(contents: io.BytesIO, filename: str, input_ext: str,
|
|
| 65 |
filter: The conversion filter to use.
|
| 66 |
"""
|
| 67 |
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
)
|
| 96 |
|
| 97 |
-
|
| 98 |
|
| 99 |
-
|
| 100 |
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
|
| 108 |
|
| 109 |
-
|
| 110 |
|
| 111 |
-
|
| 112 |
-
|
| 113 |
|
| 114 |
-
|
| 115 |
-
|
| 116 |
|
| 117 |
|
| 118 |
async def extract_text_contents(filename: str, ext: str, bytes: io.BytesIO) -> list[str]:
|
|
|
|
| 41 |
force_ocr=False, ocr_backend=None, extract_tables=True)
|
| 42 |
|
| 43 |
# Unfortunately needs to be kept to 1, as libreoffice isn't built to support parallel instances
|
| 44 |
+
LO_CONVERSION_MUTEX = asyncio.Lock()
|
| 45 |
|
| 46 |
# Supported file types for text extraction and their MIME type
|
| 47 |
FORMAT_MIME_TYPES = {
|
|
|
|
| 65 |
filter: The conversion filter to use.
|
| 66 |
"""
|
| 67 |
|
| 68 |
+
async with LO_CONVERSION_MUTEX:
|
| 69 |
+
with tempfile.TemporaryDirectory() as tmpdir:
|
| 70 |
+
dir_path = Path(tmpdir)
|
| 71 |
+
input_file_path = dir_path / f"{filename}.{input_ext}"
|
| 72 |
+
output_file_path = dir_path / f"{filename}.{output_ext}"
|
| 73 |
+
|
| 74 |
+
# write the memory contents to the input file
|
| 75 |
+
with open(input_file_path, "wb") as in_file:
|
| 76 |
+
in_file.write(contents.read())
|
| 77 |
+
|
| 78 |
+
out_bytes = io.BytesIO()
|
| 79 |
+
|
| 80 |
+
# construct the command
|
| 81 |
+
command = [
|
| 82 |
+
"libreoffice",
|
| 83 |
+
"--headless",
|
| 84 |
+
"--convert-to", f"{output_ext}:{filter}" if filter else output_ext,
|
| 85 |
+
"--outdir", tmpdir,
|
| 86 |
+
str(input_file_path) # Ensure path is a string for subprocess
|
| 87 |
+
]
|
| 88 |
+
|
| 89 |
+
# convert using libreoffice asynchronously
|
| 90 |
+
process = await asyncio.create_subprocess_exec(
|
| 91 |
+
*command,
|
| 92 |
+
stdout=asyncio.subprocess.PIPE,
|
| 93 |
+
stderr=asyncio.subprocess.PIPE
|
| 94 |
+
)
|
|
|
|
| 95 |
|
| 96 |
+
stdout, stderr = await process.communicate()
|
| 97 |
|
| 98 |
+
exit_code = await process.wait()
|
| 99 |
|
| 100 |
+
if exit_code != 0 and not output_file_path.exists():
|
| 101 |
+
raise subprocess.CalledProcessError(
|
| 102 |
+
exit_code,
|
| 103 |
+
command,
|
| 104 |
+
output=stdout,
|
| 105 |
+
stderr=stderr
|
| 106 |
+
)
|
| 107 |
|
| 108 |
+
# LO_CONVERSION_MUTEX.release()
|
| 109 |
|
| 110 |
+
with open(output_file_path, mode="rb") as out:
|
| 111 |
+
out_bytes.write(out.read())
|
| 112 |
|
| 113 |
+
out_bytes.seek(0)
|
| 114 |
+
return out_bytes
|
| 115 |
|
| 116 |
|
| 117 |
async def extract_text_contents(filename: str, ext: str, bytes: io.BytesIO) -> list[str]:
|