Spaces:
Runtime error
Runtime error
Commit
·
101dfab
1
Parent(s):
0b8eb3e
Update app.py
Browse files
app.py
CHANGED
|
@@ -8,8 +8,10 @@ from langchain.llms import HuggingFaceHub
|
|
| 8 |
from langchain.embeddings import HuggingFaceHubEmbeddings
|
| 9 |
from langchain.vectorstores import Chroma
|
| 10 |
from langchain.chains import RetrievalQA
|
|
|
|
| 11 |
from trafilatura import fetch_url, extract
|
| 12 |
from trafilatura.spider import focused_crawler
|
|
|
|
| 13 |
|
| 14 |
|
| 15 |
|
|
@@ -20,11 +22,14 @@ def url_changes(url, pages_to_visit, urls_to_scrape, repo_id):
|
|
| 20 |
to_visit, links = focused_crawler(url, max_seen_urls=pages_to_visit, max_known_urls=urls_to_scrape)
|
| 21 |
print(f"{len(links)} to be crawled")
|
| 22 |
|
|
|
|
|
|
|
|
|
|
| 23 |
results_df = pd.DataFrame()
|
| 24 |
for url in links:
|
| 25 |
downloaded = fetch_url(url)
|
| 26 |
if downloaded:
|
| 27 |
-
result = extract(downloaded, output_format='json')
|
| 28 |
result = json.loads(result)
|
| 29 |
|
| 30 |
results_df = pd.concat([results_df, pd.DataFrame.from_records([result])])
|
|
|
|
| 8 |
from langchain.embeddings import HuggingFaceHubEmbeddings
|
| 9 |
from langchain.vectorstores import Chroma
|
| 10 |
from langchain.chains import RetrievalQA
|
| 11 |
+
|
| 12 |
from trafilatura import fetch_url, extract
|
| 13 |
from trafilatura.spider import focused_crawler
|
| 14 |
+
from trafilatura.settings import use_config
|
| 15 |
|
| 16 |
|
| 17 |
|
|
|
|
| 22 |
to_visit, links = focused_crawler(url, max_seen_urls=pages_to_visit, max_known_urls=urls_to_scrape)
|
| 23 |
print(f"{len(links)} to be crawled")
|
| 24 |
|
| 25 |
+
config = use_config()
|
| 26 |
+
config.set("DEFAULT", "EXTRACTION_TIMEOUT", "0")
|
| 27 |
+
|
| 28 |
results_df = pd.DataFrame()
|
| 29 |
for url in links:
|
| 30 |
downloaded = fetch_url(url)
|
| 31 |
if downloaded:
|
| 32 |
+
result = extract(downloaded, output_format='json', config=config)
|
| 33 |
result = json.loads(result)
|
| 34 |
|
| 35 |
results_df = pd.concat([results_df, pd.DataFrame.from_records([result])])
|