Spaces:
Sleeping
Sleeping
Lucas ARRIESSE
committed on
Commit
·
4edd44f
1
Parent(s):
7e24a88
Add debug statements to narrow issue
Browse files - api/docs.py +11 -0
api/docs.py
CHANGED
|
@@ -125,21 +125,32 @@ async def extract_text_contents(filename: str, ext: str, bytes: io.BytesIO) -> l
|
|
| 125 |
if ext == ".doc":
|
| 126 |
logging.debug(f"Converting {filename} .doc --> .docx")
|
| 127 |
docx_bytes = await convert_file_type(bytes, filename, "doc", "docx")
|
|
|
|
|
|
|
| 128 |
extracted_data = await extract_bytes(docx_bytes.read(), FORMAT_MIME_TYPES[".docx"], config=KREUZBERG_CONFIG)
|
| 129 |
final_text = extracted_data.content
|
|
|
|
| 130 |
elif ext == ".docx":
|
| 131 |
# Applying doc revisions to docx files (especially for pCR / draftCR files)
|
| 132 |
logging.debug(f"Updating .docx revisions for {filename}.")
|
| 133 |
applied_revision = apply_docx_revisions(zipfile.ZipFile(bytes))
|
|
|
|
|
|
|
| 134 |
extracted_data = await extract_bytes(applied_revision.read(), FORMAT_MIME_TYPES[".docx"], config=KREUZBERG_CONFIG)
|
| 135 |
final_text = extracted_data.content
|
|
|
|
| 136 |
elif ext == ".ppt":
|
| 137 |
logging.debug(f"Converting {filename} .ppt --> .pptx")
|
| 138 |
docx_bytes = await convert_file_type(bytes, filename, "ppt", "pptx")
|
|
|
|
|
|
|
| 139 |
extracted_data = await extract_bytes(docx_bytes.read(), FORMAT_MIME_TYPES[".pptx"], config=KREUZBERG_CONFIG)
|
| 140 |
final_text = extracted_data.content
|
|
|
|
| 141 |
else:
|
| 142 |
if ext in FORMAT_MIME_TYPES: # check if file extension is supported
|
|
|
|
|
|
|
| 143 |
extracted_data = await extract_bytes(bytes.read(), FORMAT_MIME_TYPES[ext], config=KREUZBERG_CONFIG)
|
| 144 |
final_text = extracted_data.content
|
| 145 |
else:
|
|
|
|
| 125 |
if ext == ".doc":
|
| 126 |
logging.debug(f"Converting {filename} .doc --> .docx")
|
| 127 |
docx_bytes = await convert_file_type(bytes, filename, "doc", "docx")
|
| 128 |
+
logging.debug(
|
| 129 |
+
f"Extracting content for filename: {filename}, ext: {ext} with converted doc")
|
| 130 |
extracted_data = await extract_bytes(docx_bytes.read(), FORMAT_MIME_TYPES[".docx"], config=KREUZBERG_CONFIG)
|
| 131 |
final_text = extracted_data.content
|
| 132 |
+
logging.debug(f"Got text content for filename: {filename}, ext: {ext}")
|
| 133 |
elif ext == ".docx":
|
| 134 |
# Applying doc revisions to docx files (especially for pCR / draftCR files)
|
| 135 |
logging.debug(f"Updating .docx revisions for {filename}.")
|
| 136 |
applied_revision = apply_docx_revisions(zipfile.ZipFile(bytes))
|
| 137 |
+
logging.debug(
|
| 138 |
+
f"Extracting content for filename: {filename}, ext: {ext} with converted docx")
|
| 139 |
extracted_data = await extract_bytes(applied_revision.read(), FORMAT_MIME_TYPES[".docx"], config=KREUZBERG_CONFIG)
|
| 140 |
final_text = extracted_data.content
|
| 141 |
+
logging.debug(f"Got text content for filename: {filename}, ext: {ext}")
|
| 142 |
elif ext == ".ppt":
|
| 143 |
logging.debug(f"Converting {filename} .ppt --> .pptx")
|
| 144 |
docx_bytes = await convert_file_type(bytes, filename, "ppt", "pptx")
|
| 145 |
+
logging.debug(
|
| 146 |
+
f"Extracting content for filename: {filename}, ext: {ext} with converted ppt")
|
| 147 |
extracted_data = await extract_bytes(docx_bytes.read(), FORMAT_MIME_TYPES[".pptx"], config=KREUZBERG_CONFIG)
|
| 148 |
final_text = extracted_data.content
|
| 149 |
+
logging.debug(f"Got text content for filename: {filename}, ext: {ext}")
|
| 150 |
else:
|
| 151 |
if ext in FORMAT_MIME_TYPES: # check if file extension is supported
|
| 152 |
+
logging.debug(
|
| 153 |
+
f"Extracting content for filename: {filename}, ext: {ext}")
|
| 154 |
extracted_data = await extract_bytes(bytes.read(), FORMAT_MIME_TYPES[ext], config=KREUZBERG_CONFIG)
|
| 155 |
final_text = extracted_data.content
|
| 156 |
else:
|