Lucas ARRIESSE committed
Commit d2dc29e · 1 Parent(s): 8ac47d4

Remove usages of NLTK (which is actually unused)
Files changed:
- api/docs.py +3 -16
- app.py +0 -6
- requirements.txt +0 -1
api/docs.py CHANGED

@@ -2,9 +2,7 @@ import asyncio
 from typing import Dict, List, Literal, Tuple
 from fastapi.routing import APIRouter
 import logging
-import string
 import io
-import traceback
 import zipfile
 import json
 import os
@@ -15,10 +13,7 @@ import subprocess
 import pandas as pd
 import re
 from lxml import etree
-from nltk.tokenize import word_tokenize
 from bs4 import BeautifulSoup
-from nltk.corpus import stopwords
-from nltk.stem import WordNetLemmatizer
 from fastapi import Depends, BackgroundTasks, HTTPException, Request
 from dependencies import DOC_FINDER_BASE_URL, get_http_client, get_llm_router
 from fastapi.responses import StreamingResponse
@@ -30,21 +25,12 @@ from schemas import DataRequest, DataResponse, DocRequirements, DocDownloadReque
 router = APIRouter(tags=["document extraction"])
 
 # ==================================================== Utilities =================================================================
-
-lemmatizer = WordNetLemmatizer()
-
 NSMAP = {
     'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main',
     'v': 'urn:schemas-microsoft-com:vml'
 }
 
-
-def lemma(text: str):
-    stop_words = set(stopwords.words('english'))
-    txt = text.translate(str.maketrans('', '', string.punctuation)).strip()
-    tokens = [token for token in word_tokenize(
-        txt.lower()) if token not in stop_words]
-    return [lemmatizer.lemmatize(token) for token in tokens]
+# ================================== Converting of files to .txt ====================================
 
 
 def get_docx_archive(url: str) -> zipfile.ZipFile:
@@ -358,7 +344,8 @@ async def gen_reqs(req: ExtractRequirementsRequest, llm_router: Router = Depends
     documents = req.documents
     n_docs = len(documents)
 
-    logging.info("Generating requirements for documents: {}".format(req.documents))
+    logging.info(
+        "Generating requirements for documents: {}".format(req.documents))
 
     # limit max concurrency of LLM requests to prevent a huge pile of errors because of small rate limits
     concurrency_sema = asyncio.Semaphore(4)
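The comment kept in the last hunk explains the design choice in gen_reqs: LLM calls are gated behind asyncio.Semaphore(4) so that small provider rate limits don't produce a pile of errors when many documents are processed at once. A minimal sketch of that pattern, assuming a hypothetical per-document coroutine process_document (not code from this commit):

import asyncio

# At most 4 coroutines may hold the semaphore at once, so at most
# 4 LLM requests are ever in flight at the same time.
concurrency_sema = asyncio.Semaphore(4)

async def process_document(doc: str) -> str:
    async with concurrency_sema:
        await asyncio.sleep(0.1)  # stand-in for the real LLM request
        return f"requirements for {doc}"

async def main() -> None:
    documents = ["doc_a", "doc_b", "doc_c"]
    results = await asyncio.gather(*(process_document(d) for d in documents))
    print(results)

asyncio.run(main())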
app.py CHANGED

@@ -3,7 +3,6 @@ import logging
 from dotenv import load_dotenv
 from typing import Literal
 from jinja2 import Environment, TemplateNotFound
-import nltk
 import warnings
 import os
 from fastapi import Depends, FastAPI, BackgroundTasks, HTTPException, Request, Response
@@ -30,11 +29,6 @@ logging.basicConfig(
 # Initialize global dependencies
 init_dependencies()
 
-# Download required packages for NLTK
-nltk.download('stopwords')
-nltk.download('punkt_tab')
-nltk.download('wordnet')
-
 warnings.filterwarnings("ignore")
 
 app = FastAPI(title="Requirements Extractor", docs_url="/apidocs")
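Dropping the import in app.py also makes the three nltk.download(...) startup calls dead weight, which is why the whole block goes with it. Before a removal like this, one way to confirm nothing else still imports the package is a short AST scan over the repo; a hypothetical checker, not part of this commit:

import ast
import pathlib

# Flag any remaining "import nltk" or "from nltk... import ..." statements.
for path in pathlib.Path(".").rglob("*.py"):
    tree = ast.parse(path.read_text(encoding="utf-8"))
    for node in ast.walk(tree):
        if isinstance(node, ast.Import):
            names = [alias.name for alias in node.names]
        elif isinstance(node, ast.ImportFrom):
            names = [node.module or ""]
        else:
            continue
        if any(n == "nltk" or n.startswith("nltk.") for n in names):
            print(f"{path}:{node.lineno}: still imports nltk")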
requirements.txt CHANGED

@@ -9,6 +9,5 @@ lxml
 openpyxl
 beautifulsoup4
 aiolimiter
-nltk
 httpx
 Jinja2
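With nltk gone from requirements.txt, a fresh environment no longer installs the package, so any overlooked reference would surface as an ImportError. A quick smoke test, assuming the touched files import as modules app and api.docs (i.e. that api is an importable package):

import importlib

# An ImportError here would mean some module still references nltk.
for module_name in ("app", "api.docs"):
    importlib.import_module(module_name)
    print(f"{module_name}: imports cleanly")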