Spaces:

OrganizedProgrammers
/

Docxtract

Sleeping

App Files Files Community

Lucas ARRIESSE committed on Aug 8

Commit

6607a5c

1 Parent(s): 46800f4

Make doc extraction async

Browse files

Files changed (2) hide show

api/docs.py +57 -26
app.py +1 -1

api/docs.py CHANGED Viewed

@@ -34,10 +34,13 @@ NSMAP = {
 # ================================== Converting of files to .txt ====================================
-def convert_file(contents: io.BytesIO, filename: str, input_ext: str, output_ext: str, filter: str = None) -> io.BytesIO:
     """
     Converts the given file bytes using Libreoffice headless to the specified file type.
     Args:
         contents: File contents
@@ -46,6 +49,9 @@ def convert_file(contents: io.BytesIO, filename: str, input_ext: str, output_ext
         output_ext: Output extension (WITHOUT THE DOT)
         filter: The conversion filter to use.
     """
     with tempfile.TemporaryDirectory() as tmpdir:
         dir_path = Path(tmpdir)
         input_file_path = dir_path / f"{filename}.{input_ext}"
@@ -57,14 +63,35 @@ def convert_file(contents: io.BytesIO, filename: str, input_ext: str, output_ext
         out_bytes = io.BytesIO()
-        # convert using libreoffice
-        subprocess.run([
             "libreoffice",
             "--headless",
             "--convert-to", f"{output_ext}:{filter}" if filter else output_ext,
             "--outdir", tmpdir,
-            input_file_path
-        ], check=True)
         with open(output_file_path, mode="rb") as out:
             out_bytes.write(out.read())
@@ -73,27 +100,31 @@ def convert_file(contents: io.BytesIO, filename: str, input_ext: str, output_ext
         return out_bytes
-def get_docx_archive(url: str) -> zipfile.ZipFile:
     """Récupère le docx depuis l'URL et le retourne comme objet ZipFile"""
     if not url.endswith("zip"):
         raise ValueError("URL doit pointer vers un fichier ZIP")
     doc_id = os.path.splitext(os.path.basename(url))[0]
-    resp = requests.get(url, verify=False, headers={
         "User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
     })
     resp.raise_for_status()
     with zipfile.ZipFile(io.BytesIO(resp.content)) as zf:
         for file_name in zf.namelist():
             if file_name.endswith(".docx"):
                 docx_bytes = zf.read(file_name)
                 return zipfile.ZipFile(io.BytesIO(docx_bytes))
             elif file_name.endswith(".doc"):
                 in_bytes = io.BytesIO(zf.read(file_name))
-                docx_bytes = convert_file(in_bytes, doc_id, "doc", "docx")
                 return zipfile.ZipFile(docx_bytes)
     raise ValueError("Aucun fichier docx/doc trouvé dans l'archive")
@@ -157,11 +188,14 @@ def apply_docx_revisions(docx_zip: zipfile.ZipFile) -> io.BytesIO:
     return output
-def docx_to_txt(doc_id: str, url: str) -> str:
-    docx_zip = get_docx_archive(url)
     modified_bytes = apply_docx_revisions(docx_zip)
-    final_bytes = convert_file(
         modified_bytes, f"{doc_id}", "docx", "txt")
     final_bytes_text = str(final_bytes.read(), encoding="utf-8")
@@ -262,7 +296,7 @@ async def get_docs_df(req: DataRequest, http_client: AsyncClient = Depends(get_h
 @router.post("/download_tdocs")
-def download_tdocs(req: DocDownloadRequest):
     """Download the specified TDocs and zips them in a single archive"""
     # Document IDs to download
@@ -270,31 +304,27 @@ def download_tdocs(req: DocDownloadRequest):
     logging.info(f"Downloading TDocs: {document_ids}")
-    documents_content: Dict[str, bytes] = {}
-    failed_documents: List[str] = []
-    def _process_single_document(doc_id: str, doc_url: str) -> Tuple[bool, bytes]:
         """Attempts to convert a document to text and returns success status and content."""
         try:
-            text_lines = docx_to_txt(doc_id, doc_url)
             content_bytes = "\n".join(text_lines).encode("utf-8")
-            return content_bytes
         except Exception as e:
             logging.warning(
                 f"Failed to process document '{doc_id}' from URL '{doc_url}': {e}")
             error_message = f"Document '{doc_id}' text extraction failed: {e}".encode(
                 "utf-8")
-            return error_message
-    for doc in req.documents:
-        content = _process_single_document(doc.document, doc.url)
-        documents_content[doc.document] = content
     zip_buffer = io.BytesIO()
     with zipfile.ZipFile(zip_buffer, mode='w', compression=zipfile.ZIP_DEFLATED) as zip_file:
-        for doc_id, content_bytes in documents_content.items():
             safe_filename = f"{doc_id}.txt"
-            zip_file.writestr(safe_filename, content_bytes)
     zip_buffer.seek(0)
@@ -316,7 +346,7 @@ class ProgressUpdate(BaseModel):
 @router.post("/generate_requirements/sse")
-async def gen_reqs(req: ExtractRequirementsRequest, llm_router: Router = Depends(get_llm_router)):
     """Extract requirements from the specified xxxxCR docs using a LLM and returns SSE events about the progress of ongoing operations"""
     documents = req.documents
@@ -337,7 +367,8 @@ async def gen_reqs(req: ExtractRequirementsRequest, llm_router: Router = Depends
         # convert the docx to txt for use
         try:
-            full = "\n".join(docx_to_txt(doc_id, url))
         except Exception as e:
             fmt = "".join(traceback.format_exception(e))
             logging.error(f"Failed to process doc {doc_id} : {fmt}")

 # ================================== Converting of files to .txt ====================================
+# Unfortunately needs to be kept to 1, as libreoffice isn't built to support parallel instances
+CONVERSION_MUTEX = asyncio.Semaphore(1)
+async def convert_file(contents: io.BytesIO, filename: str, input_ext: str, output_ext: str, filter: str = None) -> io.BytesIO:
     """
     Converts the given file bytes using Libreoffice headless to the specified file type.
+    This is an asynchronous version.
     Args:
         contents: File contents
         output_ext: Output extension (WITHOUT THE DOT)
         filter: The conversion filter to use.
     """
+    await CONVERSION_MUTEX.acquire()
     with tempfile.TemporaryDirectory() as tmpdir:
         dir_path = Path(tmpdir)
         input_file_path = dir_path / f"{filename}.{input_ext}"
         out_bytes = io.BytesIO()
+        # construct the command
+        command = [
             "libreoffice",
             "--headless",
             "--convert-to", f"{output_ext}:{filter}" if filter else output_ext,
             "--outdir", tmpdir,
+            str(input_file_path)  # Ensure path is a string for subprocess
+        ]
+        # convert using libreoffice asynchronously
+        process = await asyncio.create_subprocess_exec(
+            *command,
+            stdout=asyncio.subprocess.PIPE,
+            stderr=asyncio.subprocess.PIPE
+        )
+        stdout, stderr = await process.communicate()
+        exit_code = await process.wait()
+        if exit_code != 0 and not output_file_path.exists():
+            raise subprocess.CalledProcessError(
+                exit_code,
+                command,
+                output=stdout,
+                stderr=stderr
+            )
+        CONVERSION_MUTEX.release()
         with open(output_file_path, mode="rb") as out:
             out_bytes.write(out.read())
         return out_bytes
+async def get_doc_archive(url: str, client: AsyncClient) -> zipfile.ZipFile:
     """Récupère le docx depuis l'URL et le retourne comme objet ZipFile"""
     if not url.endswith("zip"):
         raise ValueError("URL doit pointer vers un fichier ZIP")
     doc_id = os.path.splitext(os.path.basename(url))[0]
+    resp = await client.get(url, headers={
         "User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
     })
     resp.raise_for_status()
     with zipfile.ZipFile(io.BytesIO(resp.content)) as zf:
+        # there should be a single document file per archive
         for file_name in zf.namelist():
             if file_name.endswith(".docx"):
                 docx_bytes = zf.read(file_name)
                 return zipfile.ZipFile(io.BytesIO(docx_bytes))
             elif file_name.endswith(".doc"):
                 in_bytes = io.BytesIO(zf.read(file_name))
+                docx_bytes = await convert_file(in_bytes, doc_id, "doc", "docx")
                 return zipfile.ZipFile(docx_bytes)
+            elif file_name.endswith(".pptx"):
+                in_bytes = io.BytesIO(zf.read())
     raise ValueError("Aucun fichier docx/doc trouvé dans l'archive")
     return output
+async def doc_to_txt(doc_id: str, url: str, client: AsyncClient) -> str:
+    # Grab the document archive
+    docx_zip = await get_doc_archive(url, client)
+    # Apply the docx revisions to the document
     modified_bytes = apply_docx_revisions(docx_zip)
+    final_bytes = await convert_file(
         modified_bytes, f"{doc_id}", "docx", "txt")
     final_bytes_text = str(final_bytes.read(), encoding="utf-8")
 @router.post("/download_tdocs")
+async def download_tdocs(req: DocDownloadRequest, http_client: AsyncClient = Depends(get_http_client)):
     """Download the specified TDocs and zips them in a single archive"""
     # Document IDs to download
     logging.info(f"Downloading TDocs: {document_ids}")
+    async def _process_single_document(doc_id: str, doc_url: str) -> Tuple[bool, bytes]:
         """Attempts to convert a document to text and returns success status and content."""
         try:
+            text_lines = await doc_to_txt(doc_id, doc_url, http_client)
             content_bytes = "\n".join(text_lines).encode("utf-8")
+            return {"doc_id": doc_id, "content": content_bytes}
         except Exception as e:
             logging.warning(
                 f"Failed to process document '{doc_id}' from URL '{doc_url}': {e}")
             error_message = f"Document '{doc_id}' text extraction failed: {e}".encode(
                 "utf-8")
+            return {"doc_id": doc_id, "content": error_message}
+    convert_tasks = await asyncio.gather(*[_process_single_document(doc.document, doc.url) for doc in req.documents], return_exceptions=False)
     zip_buffer = io.BytesIO()
     with zipfile.ZipFile(zip_buffer, mode='w', compression=zipfile.ZIP_DEFLATED) as zip_file:
+        for task in convert_tasks:
+            doc_id = task["doc_id"]
             safe_filename = f"{doc_id}.txt"
+            zip_file.writestr(safe_filename, task["content"])
     zip_buffer.seek(0)
 @router.post("/generate_requirements/sse")
+async def gen_reqs(req: ExtractRequirementsRequest, llm_router: Router = Depends(get_llm_router), http_client: AsyncClient = Depends(get_http_client)):
     """Extract requirements from the specified xxxxCR docs using a LLM and returns SSE events about the progress of ongoing operations"""
     documents = req.documents
         # convert the docx to txt for use
         try:
+            doc = await doc_to_txt(doc_id, url, http_client)
+            full = "\n".join(doc)
         except Exception as e:
             fmt = "".join(traceback.format_exception(e))
             logging.error(f"Failed to process doc {doc_id} : {fmt}")

app.py CHANGED Viewed

@@ -11,7 +11,7 @@ import api.solutions
 from dependencies import get_llm_router, get_prompt_templates, init_dependencies
 import api.docs
 import api.requirements
-from api.docs import docx_to_txt
 from schemas import *
 from fastapi.middleware.cors import CORSMiddleware
 from fastapi.responses import FileResponse, StreamingResponse

 from dependencies import get_llm_router, get_prompt_templates, init_dependencies
 import api.docs
 import api.requirements
+from api.docs import doc_to_txt
 from schemas import *
 from fastapi.middleware.cors import CORSMiddleware
 from fastapi.responses import FileResponse, StreamingResponse