Added missing dependencies for the Wikipedia reader and transcription tools; also removed truncation in the webpage extraction tool.
Files changed:
- requirements.txt: +3 -0
- tools.py: +2 -2
requirements.txt
CHANGED

@@ -11,3 +11,6 @@ pandas
 av
 yt-dlp
 beautifulsoup4
+torch
+transformers
+lxml
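The three new packages line up with the commit message: lxml is a parser backend that beautifulsoup4 can use for the Wikipedia reader, while torch and transformers back the transcription tool. A minimal sketch of what a transformers-based transcriber could look like; the function name and Whisper model choice are assumptions, not taken from this repo:

```python
# Hedged sketch: one plausible use of the new torch/transformers dependencies.
# The model choice and function name are illustrative, not from this repo.
from transformers import pipeline

def transcribe_audio(audio_path: str) -> str:
    # The ASR pipeline runs on torch under the hood, hence the torch dependency;
    # audio decoding leans on ffmpeg/av, which requirements.txt already lists.
    asr = pipeline("automatic-speech-recognition", model="openai/whisper-small")
    return asr(audio_path)["text"]
```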
tools.py
CHANGED

@@ -162,8 +162,8 @@ def extract_url_content(url_list: list[str]) -> str:
     if extract_results and 'results' in extract_results and len(extract_results['results']) > 0:
         for i, page_content in enumerate(extract_results['results']):
             del extract_results['results'][i]['images']
-            if len(page_content['raw_content']) > 40000:
-                extract_results['results'][i]['raw_content'] = page_content['raw_content'][:40000] + '... [truncated]'
+            # if len(page_content['raw_content']) > 40000:
+            #     extract_results['results'][i]['raw_content'] = page_content['raw_content'][:40000] + '... [truncated]'
         return json.dumps(extract_results['results'], indent=2)
     else:
         return f"No content could be extracted from the provided URLs: {url_list}"
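With the 40,000-character cap commented out, extract_url_content now returns each page's raw_content in full. If a cap is ever wanted back as an option rather than a hard-coded limit, one possible sketch; the max_chars helper below is hypothetical and not part of this repo:

```python
# Hedged sketch: optional truncation helper (hypothetical, not in this repo).
def _maybe_truncate(raw_content: str, max_chars: int | None = None) -> str:
    # None preserves the new default: return the full extracted page.
    if max_chars is not None and len(raw_content) > max_chars:
        return raw_content[:max_chars] + '... [truncated]'
    return raw_content
```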