Lucas ARRIESSE committed
Commit d2dc29e · 1 Parent(s): 8ac47d4

Remove usages of NLTK (which is actually unused)
Files changed:
- api/docs.py +3 -16
- app.py +0 -6
- requirements.txt +0 -1
api/docs.py CHANGED

@@ -2,9 +2,7 @@ import asyncio
 from typing import Dict, List, Literal, Tuple
 from fastapi.routing import APIRouter
 import logging
-import string
 import io
-import traceback
 import zipfile
 import json
 import os
@@ -15,10 +13,7 @@ import subprocess
 import pandas as pd
 import re
 from lxml import etree
-from nltk.tokenize import word_tokenize
 from bs4 import BeautifulSoup
-from nltk.corpus import stopwords
-from nltk.stem import WordNetLemmatizer
 from fastapi import Depends, BackgroundTasks, HTTPException, Request
 from dependencies import DOC_FINDER_BASE_URL, get_http_client, get_llm_router
 from fastapi.responses import StreamingResponse
@@ -30,21 +25,12 @@ from schemas import DataRequest, DataResponse, DocRequirements, DocDownloadReque
 router = APIRouter(tags=["document extraction"])
 
 # ==================================================== Utilities =================================================================
-
-lemmatizer = WordNetLemmatizer()
-
 NSMAP = {
     'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main',
     'v': 'urn:schemas-microsoft-com:vml'
 }
 
-
-def lemma(text: str):
-    stop_words = set(stopwords.words('english'))
-    txt = text.translate(str.maketrans('', '', string.punctuation)).strip()
-    tokens = [token for token in word_tokenize(
-        txt.lower()) if token not in stop_words]
-    return [lemmatizer.lemmatize(token) for token in tokens]
+# ================================== Converting of files to .txt ====================================
 
 
 def get_docx_archive(url: str) -> zipfile.ZipFile:
@@ -358,7 +344,8 @@ async def gen_reqs(req: ExtractRequirementsRequest, llm_router: Router = Depends
     documents = req.documents
     n_docs = len(documents)
 
-    logging.info("Generating requirements for documents: {}".format(req.documents))
+    logging.info(
+        "Generating requirements for documents: {}".format(req.documents))
 
     # limit max concurrency of LLM requests to prevent a huge pile of errors because of small rate limits
     concurrency_sema = asyncio.Semaphore(4)
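The comment kept in the last hunk explains the design choice in gen_reqs: LLM calls are gated behind asyncio.Semaphore(4) so that small provider rate limits don't produce a pile of errors when many documents are processed at once. A minimal sketch of that pattern, assuming a hypothetical per-document coroutine process_document (not code from this commit):

import asyncio

# At most 4 coroutines may hold the semaphore at once, so at most
# 4 LLM requests are ever in flight at the same time.
concurrency_sema = asyncio.Semaphore(4)

async def process_document(doc: str) -> str:
    async with concurrency_sema:
        await asyncio.sleep(0.1)  # stand-in for the real LLM request
        return f"requirements for {doc}"

async def main() -> None:
    documents = ["doc_a", "doc_b", "doc_c"]
    results = await asyncio.gather(*(process_document(d) for d in documents))
    print(results)

asyncio.run(main())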
app.py CHANGED

@@ -3,7 +3,6 @@ import logging
 from dotenv import load_dotenv
 from typing import Literal
 from jinja2 import Environment, TemplateNotFound
-import nltk
 import warnings
 import os
 from fastapi import Depends, FastAPI, BackgroundTasks, HTTPException, Request, Response
@@ -30,11 +29,6 @@ logging.basicConfig(
 # Initialize global dependencies
 init_dependencies()
 
-# Download required packages for NLTK
-nltk.download('stopwords')
-nltk.download('punkt_tab')
-nltk.download('wordnet')
-
 warnings.filterwarnings("ignore")
 
 app = FastAPI(title="Requirements Extractor", docs_url="/apidocs")
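Dropping the import in app.py also makes the three nltk.download(...) startup calls dead weight, which is why the whole block goes with it. Before a removal like this, one way to confirm nothing else still imports the package is a short AST scan over the repo; a hypothetical checker, not part of this commit:

import ast
import pathlib

# Flag any remaining "import nltk" or "from nltk... import ..." statements.
for path in pathlib.Path(".").rglob("*.py"):
    tree = ast.parse(path.read_text(encoding="utf-8"))
    for node in ast.walk(tree):
        if isinstance(node, ast.Import):
            names = [alias.name for alias in node.names]
        elif isinstance(node, ast.ImportFrom):
            names = [node.module or ""]
        else:
            continue
        if any(n == "nltk" or n.startswith("nltk.") for n in names):
            print(f"{path}:{node.lineno}: still imports nltk")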
requirements.txt CHANGED

@@ -9,6 +9,5 @@ lxml
 openpyxl
 beautifulsoup4
 aiolimiter
-nltk
 httpx
 Jinja2
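With nltk gone from requirements.txt, a fresh environment no longer installs the package, so any overlooked reference would surface as an ImportError. A quick smoke test, assuming the touched files import as modules app and api.docs (i.e. that api is an importable package):

import importlib

# An ImportError here would mean some module still references nltk.
for module_name in ("app", "api.docs"):
    importlib.import_module(module_name)
    print(f"{module_name}: imports cleanly")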