Lucas ARRIESSE committed
Commit 46800f4 · 1 Parent(s): 5f1cdfa
wip

Files changed:
- api/docs.py (+29 -44)
- api/solutions.py (+1 -2)
api/docs.py CHANGED
@@ -6,7 +6,6 @@ from fastapi.routing import APIRouter
 import logging
 import io
 import zipfile
-import json
 import os
 from httpx import AsyncClient
 from pydantic import BaseModel
@@ -17,8 +16,8 @@ import re
 import tempfile
 from lxml import etree
 from bs4 import BeautifulSoup
-from fastapi import Depends,
-from dependencies import
+from fastapi import Depends, HTTPException
+from dependencies import get_http_client, get_llm_router
 from fastapi.responses import StreamingResponse
 from litellm.router import Router
@@ -99,15 +98,20 @@ def get_docx_archive(url: str) -> zipfile.ZipFile:
     raise ValueError("Aucun fichier docx/doc trouvé dans l'archive")
 
 
-def
-    """
-
-
-    return etree.fromstring(xml_bytes, parser=parser)
-
-
-def clean_document_xml(root: etree._Element) -> None:
-    """Clean the XML by modifying the tree directly"""
+def apply_docx_revisions(docx_zip: zipfile.ZipFile) -> io.BytesIO:
+    """
+    Apply the .docx revisions before returning the content
+    """
+    try:
+        xml_bytes = docx_zip.read('word/document.xml')
+    except KeyError:
+        raise FileNotFoundError(
+            "word/document.xml not found in the DOCX archive.")
+
+    parser = etree.XMLParser(remove_blank_text=True)
+    root = etree.fromstring(xml_bytes, parser=parser)
+
     # Remove <w:del> tags and their content
     for del_elem in root.xpath('//w:del', namespaces=NSMAP):
         parent = del_elem.getparent()
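Note: the xpath lookups in this hunk rely on an NSMAP constant defined earlier in api/docs.py, outside this diff. For the w: prefix to resolve, it presumably maps to the WordprocessingML main namespace, along these lines:

# Assumed definition (not shown in this diff): the w: prefix used by the
# //w:del, //w:ins and comment-marker queries must map to the
# WordprocessingML main namespace.
NSMAP = {'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'}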
@@ -117,11 +121,12 @@ def clean_document_xml(root: etree._Element) -> None:
     # Unwrap <w:ins> tags
     for ins_elem in root.xpath('//w:ins', namespaces=NSMAP):
         parent = ins_elem.getparent()
-
-
-
-
-
+        if parent is not None:
+            index = parent.index(ins_elem)
+            for child in ins_elem.iterchildren():
+                parent.insert(index, child)
+                index += 1
+            parent.remove(ins_elem)
 
     # Clean up comments
     for tag in ['w:commentRangeStart', 'w:commentRangeEnd', 'w:commentReference']:
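The unwrap logic added above can be checked in isolation: each <w:ins> wrapper is replaced by its children at the same position, so accepted insertions survive as plain runs. A minimal standalone sketch (the commit iterates iterchildren() directly; this sketch copies the children to a list first, since reparenting nodes mid-iteration can skip siblings in lxml):

from lxml import etree

W = 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'
NSMAP = {'w': W}

# A paragraph with one plain run and one tracked insertion.
xml = (f'<w:p xmlns:w="{W}">'
       f'<w:r><w:t>kept </w:t></w:r>'
       f'<w:ins w:id="1"><w:r><w:t>inserted</w:t></w:r></w:ins>'
       f'</w:p>')
root = etree.fromstring(xml)

for ins_elem in root.xpath('//w:ins', namespaces=NSMAP):
    parent = ins_elem.getparent()
    if parent is not None:
        index = parent.index(ins_elem)
        for child in list(ins_elem):  # copy first, then reparent
            parent.insert(index, child)
            index += 1
        parent.remove(ins_elem)

# The <w:ins> wrapper is gone; its run is now a direct child of <w:p>.
print(etree.tostring(root).decode())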
@@ -130,20 +135,18 @@ def clean_document_xml(root: etree._Element) -> None:
         if parent is not None:
             parent.remove(elem)
 
-
-def create_modified_docx(original_zip: zipfile.ZipFile, modified_root: etree._Element) -> io.BytesIO:
-    """Create a new docx with the modified XML"""
+    # 3. Create a new docx with the modified XML
     output = io.BytesIO()
 
     with zipfile.ZipFile(output, 'w', compression=zipfile.ZIP_DEFLATED) as new_zip:
         # Copy all unmodified files
-        for
-        if
-        new_zip.writestr(
+        for file_info in docx_zip.infolist():
+            if file_info.filename != 'word/document.xml':
+                new_zip.writestr(file_info, docx_zip.read(file_info.filename))
 
         # Add the modified document.xml
         xml_str = etree.tostring(
-
+            root,
             xml_declaration=True,
             encoding='UTF-8',
             pretty_print=True
@@ -156,10 +159,7 @@ def create_modified_docx(original_zip: zipfile.ZipFile, modified_root: etree._Element) -> io.BytesIO:
 
 def docx_to_txt(doc_id: str, url: str) -> str:
     docx_zip = get_docx_archive(url)
-
-    clean_document_xml(root)
-
-    modified_bytes = create_modified_docx(docx_zip, root)
+    modified_bytes = apply_docx_revisions(docx_zip)
 
     final_bytes = convert_file(
         modified_bytes, f"{doc_id}", "docx", "txt")
@@ -278,32 +278,17 @@ def download_tdocs(req: DocDownloadRequest):
         try:
             text_lines = docx_to_txt(doc_id, doc_url)
             content_bytes = "\n".join(text_lines).encode("utf-8")
-            return
+            return content_bytes
         except Exception as e:
             logging.warning(
                 f"Failed to process document '{doc_id}' from URL '{doc_url}': {e}")
             error_message = f"Document '{doc_id}' text extraction failed: {e}".encode(
                 "utf-8")
-            return
+            return error_message
 
     for doc in req.documents:
-
+        content = _process_single_document(doc.document, doc.url)
         documents_content[doc.document] = content
-        if not success:
-            failed_documents.append(doc.doc_id)
-
-    # sanity check to ensure all requested documents are accounted for, adding error messages for any missing ones
-    for requested_doc_id in document_ids:
-        if requested_doc_id not in documents_content:
-            error_msg = (
-                f"Failed to retrieve or process document '{requested_doc_id}'. "
-            ).encode("utf-8")
-
-            documents_content[requested_doc_id] = error_msg
-            logging.warning(
-                f"Document '{requested_doc_id}' was requested but not found or processed.")
-            if requested_doc_id not in failed_documents:
-                failed_documents.append(requested_doc_id)
 
     zip_buffer = io.BytesIO()
     with zipfile.ZipFile(zip_buffer, mode='w', compression=zipfile.ZIP_DEFLATED) as zip_file:
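Net effect of the api/docs.py changes: the old parse / clean_document_xml / create_modified_docx trio is collapsed into the single apply_docx_revisions helper that docx_to_txt now calls, and the download loop returns the extracted bytes (or an error message) per document instead of tracking a separate success flag. A minimal sketch of how the new helper could be exercised on its own, assuming a local sample.docx (hypothetical filename) and that the module is importable as api.docs:

import zipfile

from api.docs import apply_docx_revisions  # assumed import path

# Hypothetical round trip: open a .docx, flatten tracked revisions and
# comments via the new helper, and write the cleaned archive back out.
with zipfile.ZipFile('sample.docx') as docx_zip:
    cleaned = apply_docx_revisions(docx_zip)  # returns io.BytesIO

with open('sample_clean.docx', 'wb') as out:
    out.write(cleaned.getvalue())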
api/solutions.py CHANGED

@@ -1,7 +1,6 @@
 import asyncio
 import json
-import
-from fastapi import APIRouter, Depends, HTTPException, Response
+from fastapi import APIRouter, Depends
 from httpx import AsyncClient
 from jinja2 import Environment, TemplateNotFound
 from litellm.router import Router