Spaces:

OrganizedProgrammers
/

Docxtract

Running

App Files Files Community

om4r932 committed on Jun 19

Commit

040cfa1

1 Parent(s): 1b96641

Add requirements-query capability + TODO: categorization

Browse files

Files changed (2) hide show

app.py +136 -52
index.html +111 -18

app.py CHANGED Viewed

@@ -1,62 +1,68 @@
-from fastapi import FastAPI
 from fastapi.middleware.cors import CORSMiddleware
 from fastapi.responses import FileResponse
-import litellm
 import pandas as pd
-from pydantic import BaseModel, Field
-from typing import Any, List, Dict, Optional
 import re
 import subprocess
 import requests
 import os
 from lxml import etree
 import zipfile
 import io
 import warnings
 warnings.filterwarnings("ignore")
 from bs4 import BeautifulSoup
 app = FastAPI(title="Requirements Extractor")
 app.add_middleware(CORSMiddleware, allow_credentials=True, allow_headers=["*"], allow_methods=["*"], allow_origins=["*"])
-class MeetingsRequest(BaseModel):
-    working_group: str
-class MeetingsResponse(BaseModel):
-    meetings: Dict[str, str]
-class DataRequest(BaseModel):
-    working_group: str
-    meeting: str
-class DataResponse(BaseModel):
-    data: List[Dict[Any, Any]]
-class DocRequirements(BaseModel):
-    doc_id: str
-    context: str
-    requirements: List[str]
-class DocInfo(BaseModel):
-    document: str
-    url: str
-class RequirementsRequest(BaseModel):
-    documents: List[DocInfo]
-class RequirementsResponse(BaseModel):
-    requirements: List[DocRequirements]
 NSMAP = {
     'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main',
     'v': 'urn:schemas-microsoft-com:vml'
 }
 def get_docx_archive(url: str) -> zipfile.ZipFile:
     """Récupère le docx depuis l'URL et le retourne comme objet ZipFile"""
     if not url.endswith("zip"):
         raise ValueError("URL doit pointer vers un fichier ZIP")
     resp = requests.get(url, verify=False, headers={
         "User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
     })
@@ -64,10 +70,33 @@ def get_docx_archive(url: str) -> zipfile.ZipFile:
     with zipfile.ZipFile(io.BytesIO(resp.content)) as zf:
         for file_name in zf.namelist():
-            if file_name.endswith((".docx", ".doc")):
                 docx_bytes = zf.read(file_name)
                 return zipfile.ZipFile(io.BytesIO(docx_bytes))
     raise ValueError("Aucun fichier docx/doc trouvé dans l'archive")
 def parse_document_xml(docx_zip: zipfile.ZipFile) -> etree._ElementTree:
@@ -210,25 +239,80 @@ def get_change_request_dataframe(req: DataRequest):
     return DataResponse(data=df[["TDoc", "Title", "Type", "TDoc Status", "Agenda item description", "URL"]].to_dict(orient="records"))
 @app.post("/generate_requirements", response_model=RequirementsResponse)
-def gen_reqs(req: RequirementsRequest):
     documents = req.documents
-    output = []
-    for doc in documents:
         doc_id = doc.document
         url = doc.url
-        full = "\n".join(docx_to_txt(doc_id, url))
-        resp_ai = litellm.completion(
-            model="gemini/gemini-2.0-flash",
-            api_key=os.environ.get("GEMINI"),
-            messages=[{"role":"user","content": f"Here's the document whose ID is {doc_id} with requirements : {full}\n\nI want you to extract all the requirements and give me a context (not giving the section or whatever, a sentence is needed) where that calls for those requirements. If multiples covered contexts is present, make as many requirements list by context as you want."}],
-            response_format=DocRequirements
-        )
-        reqs = DocRequirements.model_validate_json(resp_ai.choices[0].message.content)
-        output.append(reqs)
-    return RequirementsResponse(requirements=output)

+import traceback
+from fastapi import FastAPI, BackgroundTasks
+from schemas import *
 from fastapi.middleware.cors import CORSMiddleware
 from fastapi.responses import FileResponse
+from litellm.router import Router
+from aiolimiter import AsyncLimiter
 import pandas as pd
+import asyncio
 import re
+import nltk
+nltk.download('stopwords')
+nltk.download('punkt_tab')
+nltk.download('wordnet')
+from nltk.stem import WordNetLemmatizer
+from nltk.corpus import stopwords
+from nltk.tokenize import word_tokenize
+import string
 import subprocess
 import requests
+from dotenv import load_dotenv
+load_dotenv()
 import os
 from lxml import etree
 import zipfile
 import io
 import warnings
 warnings.filterwarnings("ignore")
 from bs4 import BeautifulSoup
 app = FastAPI(title="Requirements Extractor")
 app.add_middleware(CORSMiddleware, allow_credentials=True, allow_headers=["*"], allow_methods=["*"], allow_origins=["*"])
+# LiteLLM router exposing two aliases that share the GEMINI API key.
+# NOTE(review): per LiteLLM's router docs, `fallbacks` should send failed
+# "gemini-v2" calls to "gemini-v1" — confirm against the installed version.
+llm_router = Router(
+    model_list=[
+        {
+            "model_name": "gemini-v1",
+            "litellm_params": {
+                "model": "gemini/gemini-2.0-flash",
+                "api_key": os.environ.get("GEMINI"),
+                "max_retries": 10,
+                "rpm": 15,
+            },
+        },
+        {
+            "model_name": "gemini-v2",
+            "litellm_params": {
+                "model": "gemini/gemini-2.5-flash",
+                "api_key": os.environ.get("GEMINI"),
+                "max_retries": 10,
+                "rpm": 10,
+            },
+        },
+    ],
+    fallbacks=[{"gemini-v2": ["gemini-v1"]}],
+    num_retries=10,
+)
+# One limiter per alias, sized from that alias's "rpm" value over a 60-second window.
+limiter_mapping = {
+    deployment["model_name"]: AsyncLimiter(deployment["litellm_params"]["rpm"], 60)
+    for deployment in llm_router.model_list
+}
+# Shared WordNet lemmatizer instance used by lemma().
+lemmatizer = WordNetLemmatizer()
 NSMAP = {
     'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main',
     'v': 'urn:schemas-microsoft-com:vml'
 }
+def lemma(text: str):
+    """Turn ``text`` into a list of lemmatized tokens without English stop words.
+
+    Punctuation (``string.punctuation``) is removed and the text is
+    lower-cased before tokenization; every remaining token that is not an
+    NLTK English stop word is passed through the module-level ``lemmatizer``.
+    """
+    english_stop_words = set(stopwords.words('english'))
+    punctuation_remover = str.maketrans('', '', string.punctuation)
+    cleaned = text.translate(punctuation_remover).strip()
+    lemmas = []
+    for word in word_tokenize(cleaned.lower()):
+        if word in english_stop_words:
+            continue
+        lemmas.append(lemmatizer.lemmatize(word))
+    return lemmas
 def get_docx_archive(url: str) -> zipfile.ZipFile:
+    """Download the ZIP at ``url`` and return the Word document inside it as a ZipFile.
+
+    The first archive member ending in ``.docx`` is returned directly. A
+    member ending in ``.doc`` is first converted to ``.docx`` with a headless
+    LibreOffice run. Extensions are matched case-insensitively.
+
+    Args:
+        url: Address of the archive; must end with "zip".
+
+    Returns:
+        The ``.docx`` package opened as an in-memory ``zipfile.ZipFile``.
+
+    Raises:
+        ValueError: If ``url`` does not end with "zip" or the archive holds
+            no ``.docx``/``.doc`` member.
+        subprocess.CalledProcessError: If the LibreOffice conversion fails.
+    """
+    # Local import: only the .doc conversion path needs it.
+    import tempfile
+
     if not url.endswith("zip"):
         raise ValueError("URL doit pointer vers un fichier ZIP")
+    doc_id = os.path.splitext(os.path.basename(url))[0]
+    # NOTE(review): verify=False disables TLS certificate validation — confirm this is intended.
     resp = requests.get(url, verify=False, headers={
         "User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
     })
     with zipfile.ZipFile(io.BytesIO(resp.content)) as zf:
         for file_name in zf.namelist():
+            lowered_name = file_name.lower()
+            if lowered_name.endswith(".docx"):
                 docx_bytes = zf.read(file_name)
                 return zipfile.ZipFile(io.BytesIO(docx_bytes))
+            if lowered_name.endswith(".doc"):
+                # A private temporary directory keeps concurrent conversions of
+                # the same document from overwriting each other, and is removed
+                # even when LibreOffice fails or produces no output.
+                with tempfile.TemporaryDirectory() as workdir:
+                    input_path = os.path.join(workdir, f"{doc_id}.doc")
+                    output_path = os.path.join(workdir, f"{doc_id}.docx")
+                    with open(input_path, "wb") as f:
+                        f.write(zf.read(file_name))
+                    subprocess.run([
+                        "libreoffice",
+                        "--headless",
+                        "--convert-to", "docx",
+                        "--outdir", workdir,
+                        input_path
+                    ], check=True)
+                    with open(output_path, "rb") as f:
+                        docx_bytes = f.read()
+                return zipfile.ZipFile(io.BytesIO(docx_bytes))
     raise ValueError("Aucun fichier docx/doc trouvé dans l'archive")
 def parse_document_xml(docx_zip: zipfile.ZipFile) -> etree._ElementTree:
     return DataResponse(data=df[["TDoc", "Title", "Type", "TDoc Status", "Agenda item description", "URL"]].to_dict(orient="records"))
 @app.post("/generate_requirements", response_model=RequirementsResponse)
+async def gen_reqs(req: RequirementsRequest, background_tasks: BackgroundTasks):
+    """Extract requirements from every requested document with the LLM.
+
+    Documents are handled concurrently in batches of 30, and the handler
+    waits 60 seconds between two consecutive batches. A document whose text
+    cannot be obtained, or whose LLM call fails, contributes a single
+    placeholder entry with context "Error LLM" and no requirements.
+
+    Args:
+        req: Request body; ``req.documents`` lists the documents (each with
+            ``document`` and ``url``) to process.
+        background_tasks: No longer used; kept so the endpoint signature is
+            unchanged.
+
+    Returns:
+        A ``RequirementsResponse`` holding the per-context requirement
+        groups of all documents, in batch order.
+    """
     documents = req.documents
+    batch_size = 30
+    batch_pause_seconds = 60
+    # Router alias that is queried, and whose limiter is used.
+    # To be adapted if fallback is enabled.
+    model_used = "gemini-v2"
+
+    def error_result(doc_id):
+        # Placeholder returned in place of real results when a document fails.
+        return [DocRequirements(document=doc_id, context="Error LLM", requirements=[])]
+
+    async def ask_llm(doc_id, full):
+        # Single rate-limited LLM call; returns the validated requirement groups.
+        async with limiter_mapping[model_used]:
+            resp_ai = await llm_router.acompletion(
+                model=model_used,
+                messages=[{"role":"user","content": f"Here's the document whose ID is {doc_id} : {full}\n\nExtract all requirements and group them by context, returning a list of objects where each object includes a document ID, a concise description of the context where the requirements apply (not a chapter title or copied text), and a list of associated requirements; always return the result as a list, even if only one context is found."}],
+                response_format=RequirementsResponse
+            )
+        return RequirementsResponse.model_validate_json(resp_ai.choices[0].message.content).requirements
+
+    async def process_document(doc):
         doc_id = doc.document
         url = doc.url
+        try:
+            # NOTE(review): docx_to_txt is called synchronously here; if it
+            # downloads/converts the file it blocks the event loop — confirm
+            # before moving it to a worker thread.
+            full = "\n".join(docx_to_txt(doc_id, url))
+        except Exception as e:
+            traceback.print_exception(e)
+            return error_result(doc_id)
+        try:
+            return await ask_llm(doc_id, full)
+        except Exception as e:
+            # Only a rate-limit error earns one more attempt.
+            if "rate limit" not in str(e).lower():
+                traceback.print_exception(e)
+                return error_result(doc_id)
+        try:
+            return await ask_llm(doc_id, full)
+        except Exception as fallback_e:
+            traceback.print_exception(fallback_e)
+            return error_result(doc_id)
+
+    all_requirements = []
+    for start in range(0, len(documents), batch_size):
+        if start:
+            # Pause inside the request. A BackgroundTasks job would only run
+            # after the response has been sent, so it could not space batches.
+            await asyncio.sleep(batch_pause_seconds)
+        batch = documents[start:start + batch_size]
+        results = await asyncio.gather(*(process_document(doc) for doc in batch))
+        for doc_requirements in results:
+            all_requirements.extend(doc_requirements)
+    return RequirementsResponse(requirements=all_requirements)
+@app.post("/get_reqs_from_query", response_model=ReqSearchResponse)
+def find_requirements_from_problem_description(req: ReqSearchRequest):
+    """Ask the LLM which of the supplied requirements best match a problem description.
+
+    Each entry of ``req.requirements`` is rendered as one bracketed line
+    (document, context, requirement) and sent together with ``req.query`` to
+    the "gemini-v2" router alias. The model's JSON answer is validated into a
+    ``ReqSearchResponse`` and returned.
+    """
+    query = req.query
+    rendered_lines = []
+    for r in req.requirements:
+        rendered_lines.append(f"[Document: {r.document} | Context: {r.context} | Requirement: {r.requirement}]")
+    requirements_text = "\n".join(rendered_lines)
+    print("Called the LLM")
+    prompt = f"Given all the requirements : \n {requirements_text} \n and the problem description \"{query}\", return a list of objects each with document ID, context, and requirement for the most relevant requirements that reference or best cover the problem."
+    resp_ai = llm_router.completion(
+        model="gemini-v2",
+        messages=[{"role": "user", "content": prompt}],
+        response_format=ReqSearchResponse,
+    )
+    print("Answered")
+    answer_json = resp_ai.choices[0].message.content
+    return ReqSearchResponse.model_validate_json(answer_json)

index.html CHANGED Viewed

@@ -10,7 +10,7 @@
 <body class="p-8 bg-base-100">
     <div class="container mx-auto">
         <h1 class="text-4xl font-bold text-center mb-8">Requirements Extractor</h1>
-        <div>
             <div class="grid grid-cols-1 md:grid-cols-3 gap-4 mb-6">
                 <select class="select select-bordered" id="workingGroupSelect">
                     <option disabled selected value="">Working Group</option>
@@ -39,22 +39,38 @@
                     <option disabled selected value="">Type</option>
                     <option>Tous</option>
                 </select>
                 <select class="select select-bordered" id="docStatus">
                     <option disabled selected value="">Status</option>
                     <option>Tous</option>
                 </select>
                 <select class="select select-bordered" id="agendaItem">
-                    <option disabled selected value = "">Agenda</option>
                     <option>Tous</option>
                 </select>
             </div>
         </div>
         <!-- Tableau des données -->
-        <div class="max-h-[65vh] overflow-y-auto">
             <table class="table table-zebra w-full" id="dataFrame">
                 <thead class="sticky top-0 bg-base-200 z-10">
                     <tr class="bg-base-200">
@@ -71,11 +87,23 @@
             </table>
         </div>
-        <center><button class="btn mt-6 gap-4" id="getReqs">Get Requirements</button></center>
     </div>
     <script>
         function getDataFrame(){
             const wg = document.getElementById('workingGroupSelect').value;
             const meeting = document.getElementById('meetingSelect').value;
             document.getElementById('docType').innerHTML = `
@@ -84,21 +112,23 @@
             `
             document.getElementById('docStatus').innerHTML = `
-                <option disabled selected value="">Type</option>
                 <option>Tous</option>
             `
             document.getElementById('agendaItem').innerHTML = `
-                <option disabled selected value="">Type</option>
                 <option>Tous</option>
             `
             const dataFrame = document.getElementById("dataFrame");
-            document.getElementById("getTDocs").setAttribute('disabled', 'true')
-            document.getElementById("getTDocs").innerHTML = "Loading ...";
             fetch("/get_dataframe", {method: "POST", headers: {"Content-Type": "application/json"}, body: JSON.stringify({"working_group": wg, "meeting": meeting})})
             .then(resp => resp.json())
             .then(data => {
                 document.getElementById("filters").classList.remove("hidden")
                 const dataframeBody = dataFrame.querySelector("tbody");
                 dataframeBody.innerHTML = "";
                 const setType = new Set();
@@ -147,8 +177,8 @@
                 })
             })
-            document.getElementById("getTDocs").removeAttribute("disabled")
-            document.getElementById("getTDocs").innerHTML = "Get TDocs";
         }
         function filterTable() {
@@ -186,18 +216,72 @@
             })
         }
-        function tableToGenBody(tableSelector) {
             // columnsMap : { "NomHeaderDansTable": "nom_voulu", ... }
-            let columnsMap = {"TDoc": "doc_id", "URL": "url"};
-            const table = document.querySelector(tableSelector);
-            const headers = Array.from(table.querySelectorAll('thead th')).map(th => th.innerText.trim());
             // Indices des colonnes à extraire
             const selectedIndices = headers
                 .map((header, idx) => columnsMap[header] ? idx : -1)
                 .filter(idx => idx !== -1);
-            return Array.from(table.querySelectorAll('tbody tr'))
                 .filter(row => getComputedStyle(row).display !== 'none')
                 .map(row => {
                     const cells = Array.from(row.querySelectorAll('td'));
@@ -218,6 +302,15 @@
         document.getElementById('agendaItem').addEventListener('change', filterTable)
         document.getElementById("workingGroupSelect").addEventListener('change', getMeetings)
         document.getElementById('getTDocs').addEventListener('click', getDataFrame)
     </script>
 </body>
 </html>

 <body class="p-8 bg-base-100">
     <div class="container mx-auto">
         <h1 class="text-4xl font-bold text-center mb-8">Requirements Extractor</h1>
+        <div id="dataFrameForm">
             <div class="grid grid-cols-1 md:grid-cols-3 gap-4 mb-6">
                 <select class="select select-bordered" id="workingGroupSelect">
                     <option disabled selected value="">Working Group</option>
                     <option disabled selected value="">Type</option>
                     <option>Tous</option>
                 </select>
                 <select class="select select-bordered" id="docStatus">
                     <option disabled selected value="">Status</option>
                     <option>Tous</option>
                 </select>
                 <select class="select select-bordered" id="agendaItem">
+                    <option disabled selected value="">Agenda Item</option>
                     <option>Tous</option>
                 </select>
             </div>
         </div>
+        <!-- Problem-description form; hidden until requirements have been extracted -->
+        <div class="flex justify-center mt-12 min-h-screen hidden" id="queryReqForm">
+            <div class="w-full max-w-md">
+                <div class="grid grid-cols-1 gap-4">
+                    <!-- textarea is not a void element: it needs an explicit end tag,
+                         otherwise the rest of the page is parsed as its text content -->
+                    <textarea placeholder="Enter your problem description here ..."
+                        class="w-full mx-auto px-4 py-2 border rounded" id="problemDescription"></textarea>
+                    <button class="w-1/2 mx-auto px-4 py-2 bg-blue-600 text-white rounded hover:bg-blue-700" id="queryReq">
+                        Find requirements
+                    </button>
+                </div>
+            </div>
+        </div>
+        <center>
+            <span class="loading loading-bars loading-xl hidden" id="loadingBar"></span>
+            <p class="hidden" id="progressText"></p>
+        </center>
         <!-- Tableau des données -->
+        <div class="max-h-[65vh] overflow-y-auto" id="dataFrameDiv">
             <table class="table table-zebra w-full" id="dataFrame">
                 <thead class="sticky top-0 bg-base-200 z-10">
                     <tr class="bg-base-200">
             </table>
         </div>
+        <center>
+            <div id="buttons">
+                <p id="reqStatus" class="mt-6 hidden">Requirements extracted</p>
+                <div class="grid grid-cols-1 md:grid-cols-3 gap-4 mb-6">
+                    <button class="btn mt-6" id="getReqs">Get Requirements</button>
+                    <button class="btn mt-6 hidden" id="searchReq">Query requirements</button>
+                    <button class="btn mt-6 hidden" id="categorizeReq">Categorize requirements</button>
+                </div>
+            </div>
+        </center>
     </div>
     <script>
+        let requirements;
         function getDataFrame(){
+            document.getElementById("loadingBar").classList.remove("hidden");
             const wg = document.getElementById('workingGroupSelect').value;
             const meeting = document.getElementById('meetingSelect').value;
             document.getElementById('docType').innerHTML = `
             `
             document.getElementById('docStatus').innerHTML = `
+                <option disabled selected value="">Status</option>
                 <option>Tous</option>
             `
             document.getElementById('agendaItem').innerHTML = `
+                <option disabled selected value="">Agenda Item</option>
                 <option>Tous</option>
             `
             const dataFrame = document.getElementById("dataFrame");
+            document.getElementById("progressText").classList.remove('hidden')
+            document.getElementById("progressText").innerHTML = "Loading ...";
+            document.getElementById("loadingBar").classList.remove("hidden")
             fetch("/get_dataframe", {method: "POST", headers: {"Content-Type": "application/json"}, body: JSON.stringify({"working_group": wg, "meeting": meeting})})
             .then(resp => resp.json())
             .then(data => {
                 document.getElementById("filters").classList.remove("hidden")
+                document.getElementById("loadingBar").classList.add("hidden");
                 const dataframeBody = dataFrame.querySelector("tbody");
                 dataframeBody.innerHTML = "";
                 const setType = new Set();
                 })
             })
+            document.getElementById("progressText").classList.add('hidden')
+            document.getElementById("loadingBar").classList.add("hidden")
         }
         function filterTable() {
             })
         }
+        /**
+         * Sends the visible table rows to /generate_requirements and stores the
+         * flattened result (one entry per requirement) in the global `requirements`.
+         * On success the query/categorize buttons are revealed; on failure an
+         * error message replaces the progress text. The spinner is hidden in
+         * both cases.
+         */
+        function generateRequirements(){
+            const bodyreq = tableToGenBody();
+            const progressText = document.getElementById("progressText");
+            const loadingBar = document.getElementById("loadingBar");
+            progressText.classList.remove('hidden');
+            progressText.innerHTML = "Generating requirements, please wait, it may take a while ...";
+            loadingBar.classList.remove("hidden");
+            fetch("/generate_requirements", {method: "POST", headers: {"Content-Type": "application/json"}, body: JSON.stringify({"documents": bodyreq})})
+            .then(resp => {
+                // fetch() only rejects on network errors; surface HTTP errors too.
+                if (!resp.ok) {
+                    throw new Error(`HTTP ${resp.status}`);
+                }
+                return resp.json();
+            })
+            .then(data => {
+                // Flatten {document, context, requirements[]} groups into one row per requirement.
+                requirements = data.requirements.flatMap(obj =>
+                    obj.requirements.map(req => ({"document": obj.document, "context": obj.context, "requirement": req}))
+                );
+                progressText.classList.add("hidden");
+                document.getElementById("reqStatus").classList.remove("hidden");
+                document.getElementById("getReqs").classList.add("hidden");
+                document.getElementById("searchReq").classList.remove("hidden");
+                document.getElementById("categorizeReq").classList.remove("hidden");
+            })
+            .catch(err => {
+                console.error("Requirement generation failed:", err);
+                progressText.textContent = "Requirement generation failed, please try again.";
+            })
+            .finally(() => {
+                // Always stop the spinner, whether the request succeeded or not.
+                loadingBar.classList.add("hidden");
+            });
+        }
+        /**
+         * Posts the problem description and the extracted `requirements` to
+         * /get_reqs_from_query and renders the returned matches in the table
+         * inside #dataFrameDiv (columns: TDoc, Context, Requirement).
+         */
+        function queryRequirements(){
+            fetch("/get_reqs_from_query", {method: "POST", headers: {"Content-Type": "application/json"}, body: JSON.stringify({query: document.getElementById("problemDescription").value, requirements})})
+            .then(resp => {
+                // fetch() only rejects on network errors; surface HTTP errors too.
+                if (!resp.ok) {
+                    throw new Error(`HTTP ${resp.status}`);
+                }
+                return resp.json();
+            })
+            .then(data => {
+                const dataFrame = document.getElementById("dataFrameDiv");
+                const dataFrameHead = dataFrame.querySelector("thead");
+                const dataFrameBody = dataFrame.querySelector("tbody");
+                dataFrame.classList.remove("hidden");
+                dataFrameHead.innerHTML = `
+                    <th>TDoc</th>
+                    <th>Context</th>
+                    <th>Requirement</th>
+                `;
+                dataFrameBody.innerHTML = "";
+                data.requirements.forEach(req => {
+                    const tr = document.createElement("tr");
+                    // The cell text comes from documents/LLM output: insert it as
+                    // text, never as HTML, so embedded markup cannot execute.
+                    ["document", "context", "requirement"].forEach(key => {
+                        const td = document.createElement("td");
+                        td.textContent = req[key];
+                        tr.appendChild(td);
+                    });
+                    dataFrameBody.appendChild(tr);
+                })
+            })
+            .catch(err => {
+                console.error("Requirement query failed:", err);
+            });
+        }
+        function tableToGenBody() {
             // columnsMap : { "NomHeaderDansTable": "nom_voulu", ... }
+            let columnsMap = {"TDoc": "document", "URL": "url"};
+            const headers = Array.from(dataFrame.querySelectorAll('thead th')).map(th => th.innerText.trim());
             // Indices des colonnes à extraire
             const selectedIndices = headers
                 .map((header, idx) => columnsMap[header] ? idx : -1)
                 .filter(idx => idx !== -1);
+            return Array.from(dataFrame.querySelectorAll('tbody tr'))
                 .filter(row => getComputedStyle(row).display !== 'none')
                 .map(row => {
                     const cells = Array.from(row.querySelectorAll('td'));
         document.getElementById('agendaItem').addEventListener('change', filterTable)
         document.getElementById("workingGroupSelect").addEventListener('change', getMeetings)
         document.getElementById('getTDocs').addEventListener('click', getDataFrame)
+        // Click handlers for requirement extraction and requirement querying.
+        for (const [buttonId, handler] of [["getReqs", generateRequirements], ["queryReq", queryRequirements]]) {
+            document.getElementById(buttonId).addEventListener("click", handler);
+        }
+        // "Query requirements": hide the TDoc browsing UI and show the query form instead.
+        document.getElementById('searchReq').addEventListener('click', () => {
+            ['dataFrameForm', 'filters', 'dataFrameDiv', 'buttons'].forEach(id => {
+                document.getElementById(id).classList.add('hidden');
+            });
+            document.getElementById('queryReqForm').classList.remove('hidden');
+        })
     </script>
 </body>
 </html>