Spaces:

OrganizedProgrammers
/

Docxtract

Sleeping

App Files Files Community

Lucas ARRIESSE committed on Aug 28

Commit

4edd44f

1 Parent(s): 7e24a88

Add debug statements to narrow issue

Browse files

Files changed (1) hide show

api/docs.py +11 -0

api/docs.py CHANGED Viewed

@@ -125,21 +125,32 @@ async def extract_text_contents(filename: str, ext: str, bytes: io.BytesIO) -> l
     if ext == ".doc":
         logging.debug(f"Converting {filename} .doc --> .docx")
         docx_bytes = await convert_file_type(bytes, filename, "doc", "docx")
         extracted_data = await extract_bytes(docx_bytes.read(), FORMAT_MIME_TYPES[".docx"], config=KREUZBERG_CONFIG)
         final_text = extracted_data.content
     elif ext == ".docx":
         # Applying doc revisions to docx files (especially for pCR / draftCR files)
         logging.debug(f"Updating .docx revisions for {filename}.")
         applied_revision = apply_docx_revisions(zipfile.ZipFile(bytes))
         extracted_data = await extract_bytes(applied_revision.read(), FORMAT_MIME_TYPES[".docx"], config=KREUZBERG_CONFIG)
         final_text = extracted_data.content
     elif ext == ".ppt":
         logging.debug(f"Converting {filename} .ppt --> .pptx")
         docx_bytes = await convert_file_type(bytes, filename, "ppt", "pptx")
         extracted_data = await extract_bytes(docx_bytes.read(), FORMAT_MIME_TYPES[".pptx"], config=KREUZBERG_CONFIG)
         final_text = extracted_data.content
     else:
         if ext in FORMAT_MIME_TYPES:  # check if file extension is supported
             extracted_data = await extract_bytes(bytes.read(), FORMAT_MIME_TYPES[ext], config=KREUZBERG_CONFIG)
             final_text = extracted_data.content
         else:

     if ext == ".doc":
         logging.debug(f"Converting {filename} .doc --> .docx")
         docx_bytes = await convert_file_type(bytes, filename, "doc", "docx")
+        logging.debug(
+            f"Extracting content for filename: {filename}, ext: {ext} with converted doc")
         extracted_data = await extract_bytes(docx_bytes.read(), FORMAT_MIME_TYPES[".docx"], config=KREUZBERG_CONFIG)
         final_text = extracted_data.content
+        logging.debug(f"Got text content for filename: {filename}, ext: {ext}")
     elif ext == ".docx":
         # Applying doc revisions to docx files (especially for pCR / draftCR files)
         logging.debug(f"Updating .docx revisions for {filename}.")
         applied_revision = apply_docx_revisions(zipfile.ZipFile(bytes))
+        logging.debug(
+            f"Extracting content for filename: {filename}, ext: {ext} with converted docx")
         extracted_data = await extract_bytes(applied_revision.read(), FORMAT_MIME_TYPES[".docx"], config=KREUZBERG_CONFIG)
         final_text = extracted_data.content
+        logging.debug(f"Got text content for filename: {filename}, ext: {ext}")
     elif ext == ".ppt":
         logging.debug(f"Converting {filename} .ppt --> .pptx")
         docx_bytes = await convert_file_type(bytes, filename, "ppt", "pptx")
+        logging.debug(
+            f"Extracting content for filename: {filename}, ext: {ext} with converted ppt")
         extracted_data = await extract_bytes(docx_bytes.read(), FORMAT_MIME_TYPES[".pptx"], config=KREUZBERG_CONFIG)
         final_text = extracted_data.content
+        logging.debug(f"Got text content for filename: {filename}, ext: {ext}")
     else:
         if ext in FORMAT_MIME_TYPES:  # check if file extension is supported
+            logging.debug(
+                f"Extracting content for filename: {filename}, ext: {ext}")
             extracted_data = await extract_bytes(bytes.read(), FORMAT_MIME_TYPES[ext], config=KREUZBERG_CONFIG)
             final_text = extracted_data.content
         else: