Commit 4fc2bf8
XThomasBU committed
Parent(s): 1ef2150

added timeout
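This commit adds a module-level `TIMEOUT` constant (60 seconds) to `code/modules/config/constants.py` and passes it as the `timeout` argument to every outgoing `requests` call in the data-loader modules, so a stalled HTTP request fails after a bounded wait instead of hanging. A minimal sketch of the resulting behavior, assuming the same constant; the URL handling and the helper name `fetch_page` are illustrative, not part of the commit:

import requests
from modules.config.constants import TIMEOUT  # TIMEOUT = 60 (seconds)

def fetch_page(url: str) -> str | None:
    # Illustrative helper: with timeout=TIMEOUT, requests raises
    # requests.exceptions.Timeout if the server does not respond in time,
    # instead of blocking the caller indefinitely.
    try:
        response = requests.get(url, timeout=TIMEOUT)
    except requests.exceptions.Timeout:
        return None
    return response.text if response.status_code == 200 else None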
code/modules/config/constants.py
CHANGED
@@ -3,6 +3,8 @@ import os
 
 load_dotenv()
 
+TIMEOUT = 60
+
 # API Keys - Loaded from the .env file
 
 OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
code/modules/dataloader/data_loader.py
CHANGED
@@ -22,6 +22,7 @@ from modules.dataloader.pdf_readers.base import PDFReader
 from modules.dataloader.pdf_readers.llama import LlamaParser
 from modules.dataloader.pdf_readers.gpt import GPTParser
 from modules.dataloader.helpers import get_metadata
+from modules.config.constants import TIMEOUT
 
 logger = logging.getLogger(__name__)
 BASE_DIR = os.getcwd()
@@ -32,7 +33,7 @@ class HTMLReader:
         pass
 
     def read_url(self, url):
-        response = requests.get(url)
+        response = requests.get(url, timeout=TIMEOUT)
         if response.status_code == 200:
             return response.text
         else:
@@ -52,7 +53,7 @@ class HTMLReader:
             absolute_url = urljoin(base_url, href)
             link["href"] = absolute_url
 
-            resp = requests.head(absolute_url)
+            resp = requests.head(absolute_url, timeout=TIMEOUT)
             if resp.status_code != 200:
                 logger.warning(
                     f"Link {absolute_url} is broken. Status code: {resp.status_code}"
@@ -127,7 +128,7 @@ class FileReader:
         return [Document(page_content=self.web_reader.read_html(url))]
 
     def read_tex_from_url(self, tex_url):
-        response = requests.get(tex_url)
+        response = requests.get(tex_url, timeout=TIMEOUT)
         if response.status_code == 200:
             return [Document(page_content=response.text)]
         else:
code/modules/dataloader/helpers.py
CHANGED
@@ -2,6 +2,7 @@ import requests
 from bs4 import BeautifulSoup
 from urllib.parse import urlparse
 import tempfile
+from modules.config.constants import TIMEOUT
 
 
 def get_urls_from_file(file_path: str):
@@ -27,11 +28,11 @@ def get_metadata(lectures_url, schedule_url):
     lecture_metadata = {}
 
     # Get the main lectures page content
-    r_lectures = requests.get(lectures_url)
+    r_lectures = requests.get(lectures_url, timeout=TIMEOUT)
     soup_lectures = BeautifulSoup(r_lectures.text, "html.parser")
 
     # Get the main schedule page content
-    r_schedule = requests.get(schedule_url)
+    r_schedule = requests.get(schedule_url, timeout=TIMEOUT)
     soup_schedule = BeautifulSoup(r_schedule.text, "html.parser")
 
     # Find all lecture blocks
@@ -119,7 +120,7 @@ def download_pdf_from_url(pdf_url):
     Returns:
         str: The local file path of the downloaded PDF file.
     """
-    response = requests.get(pdf_url)
+    response = requests.get(pdf_url, timeout=TIMEOUT)
     if response.status_code == 200:
         with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_file:
             temp_file.write(response.content)
code/modules/dataloader/pdf_readers/gpt.py
CHANGED
@@ -6,6 +6,7 @@ from io import BytesIO
 from openai import OpenAI
 from pdf2image import convert_from_path
 from langchain.schema import Document
+from modules.config.constants import TIMEOUT
 
 
 class GPTParser:
@@ -59,6 +60,7 @@ class GPTParser:
             "https://api.openai.com/v1/chat/completions",
             headers=headers,
             json=payload,
+            timeout=TIMEOUT,
         )
 
         resp = response.json()
code/modules/dataloader/pdf_readers/llama.py
CHANGED
@@ -2,7 +2,7 @@ import os
 import requests
 from llama_parse import LlamaParse
 from langchain.schema import Document
-from modules.config.constants import OPENAI_API_KEY, LLAMA_CLOUD_API_KEY
+from modules.config.constants import OPENAI_API_KEY, LLAMA_CLOUD_API_KEY, TIMEOUT
 from modules.dataloader.helpers import download_pdf_from_url
 
 
@@ -52,7 +52,11 @@ class LlamaParser:
         files = [
             (
                 "file",
-                (
+                (
+                    "file",
+                    requests.get(pdf_url, timeout=TIMEOUT).content,
+                    "application/octet-stream",
+                ),
             )
         ]
 
code/modules/dataloader/webpage_crawler.py
CHANGED
@@ -4,6 +4,7 @@ import asyncio
 import requests
 from bs4 import BeautifulSoup
 from urllib.parse import urljoin, urldefrag
+from modules.config.constants import TIMEOUT
 
 
 class WebpageCrawler:
@@ -19,7 +20,7 @@ class WebpageCrawler:
 
     def url_exists(self, url: str) -> bool:
         try:
-            response = requests.head(url)
+            response = requests.head(url, timeout=TIMEOUT)
             return response.status_code == 200
         except requests.ConnectionError:
             return False
@@ -89,7 +90,7 @@
 
     def is_webpage(self, url: str) -> bool:
         try:
-            response = requests.head(url, allow_redirects=True)
+            response = requests.head(url, allow_redirects=True, timeout=TIMEOUT)
             content_type = response.headers.get("Content-Type", "").lower()
             return "text/html" in content_type
         except requests.RequestException: