Move utility code into separate modules and add MarkdownWebBaseLoader implementation
Browse files
- app.py +1 -1
- requirements.txt +10 -5
- retrieval.py +80 -0
- tools.py +20 -174
- web_utilities.py +297 -0
app.py
CHANGED
|
@@ -148,7 +148,7 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
|
|
| 148 |
)
|
| 149 |
else:
|
| 150 |
print(f"Running agent on {len(questions_data)} questions...")
|
| 151 |
-
for item in questions_data:
|
| 152 |
result = solve_question(item)
|
| 153 |
results_log.append(result)
|
| 154 |
with open(results_file_path, "w") as results_file:
|
|
|
|
| 148 |
)
|
| 149 |
else:
|
| 150 |
print(f"Running agent on {len(questions_data)} questions...")
|
| 151 |
+
for item in filtered_questions_data:
|
| 152 |
result = solve_question(item)
|
| 153 |
results_log.append(result)
|
| 154 |
with open(results_file_path, "w") as results_file:
|
requirements.txt
CHANGED
|
@@ -1,3 +1,4 @@
|
|
|
|
|
| 1 |
beautifulsoup4==4.13.4
|
| 2 |
datasets==3.5.1
|
| 3 |
duckduckgo-search==8.0.1
|
|
@@ -6,17 +7,21 @@ gradio==5.29.0
|
|
| 6 |
hf_xet==1.1.2
|
| 7 |
huggingface-hub==0.30.2
|
| 8 |
langchain==0.3.25
|
| 9 |
-
langchain-
|
| 10 |
-
langchain-
|
| 11 |
-
|
| 12 |
-
langchain-
|
| 13 |
-
langchain-
|
|
|
|
|
|
|
| 14 |
langfuse==2.60.5
|
| 15 |
langgraph==0.4.1
|
|
|
|
| 16 |
numpy==2.2.5
|
| 17 |
openai-whisper==20240930
|
| 18 |
openpyxl==3.1.5
|
| 19 |
pandas==2.2.3
|
|
|
|
| 20 |
pyrootutils~=1.0.4
|
| 21 |
python-dotenv~=1.1.0
|
| 22 |
requests==2.32.3
|
|
|
|
| 1 |
+
anthropic==0.52.2
|
| 2 |
beautifulsoup4==4.13.4
|
| 3 |
datasets==3.5.1
|
| 4 |
duckduckgo-search==8.0.1
|
|
|
|
| 7 |
hf_xet==1.1.2
|
| 8 |
huggingface-hub==0.30.2
|
| 9 |
langchain==0.3.25
|
| 10 |
+
langchain-anthropic==0.3.15
|
| 11 |
+
langchain-community==0.3.24
|
| 12 |
+
langchain-core==0.3.64
|
| 13 |
+
langchain-groq==0.3.2
|
| 14 |
+
langchain-huggingface==0.2.0
|
| 15 |
+
langchain-openai==0.3.21
|
| 16 |
+
langchain-tavily==0.1.6
|
| 17 |
langfuse==2.60.5
|
| 18 |
langgraph==0.4.1
|
| 19 |
+
markdownify==1.1.0
|
| 20 |
numpy==2.2.5
|
| 21 |
openai-whisper==20240930
|
| 22 |
openpyxl==3.1.5
|
| 23 |
pandas==2.2.3
|
| 24 |
+
playwright==1.52.0
|
| 25 |
pyrootutils~=1.0.4
|
| 26 |
python-dotenv~=1.1.0
|
| 27 |
requests==2.32.3
|
retrieval.py
ADDED
|
@@ -0,0 +1,80 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import List, Union
|
| 2 |
+
from dotenv import find_dotenv, load_dotenv
|
| 3 |
+
from langchain.chains import RetrievalQA
|
| 4 |
+
from langchain.chat_models import init_chat_model
|
| 5 |
+
from langchain.schema import Document
|
| 6 |
+
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
| 7 |
+
from langchain_community.vectorstores import FAISS
|
| 8 |
+
from langchain_huggingface.embeddings import HuggingFaceEmbeddings
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
def get_default_splitter() -> RecursiveCharacterTextSplitter:
|
| 12 |
+
"""Returns a pre-configured text splitter."""
|
| 13 |
+
return RecursiveCharacterTextSplitter(
|
| 14 |
+
# Using markdown headers as separators is a good strategy
|
| 15 |
+
separators=["\n### ", "\n## ", "\n# ", "\n\n", "\n", " "],
|
| 16 |
+
chunk_size=1000,
|
| 17 |
+
chunk_overlap=200,
|
| 18 |
+
)
|
| 19 |
+
|
| 20 |
+
def get_default_embeddings() -> HuggingFaceEmbeddings:
|
| 21 |
+
"""Returns a pre-configured embedding model."""
|
| 22 |
+
return HuggingFaceEmbeddings(
|
| 23 |
+
model_name="sentence-transformers/all-MiniLM-L6-v2",
|
| 24 |
+
model_kwargs={'device': 'cpu'}
|
| 25 |
+
)
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
def build_retriever(
|
| 29 |
+
data: Union[str, List[Document]],
|
| 30 |
+
splitter: RecursiveCharacterTextSplitter = None,
|
| 31 |
+
embeddings: HuggingFaceEmbeddings = None,
|
| 32 |
+
top_k: int = 5):
|
| 33 |
+
"""Builds a retriever from either a raw text string or a list of documents.
|
| 34 |
+
|
| 35 |
+
Args:
|
| 36 |
+
Args:
|
| 37 |
+
data (Union[str, List[Document]]): The source data to build the retriever from.
|
| 38 |
+
splitter (RecursiveCharacterTextSplitter, optional): The text splitter to use.
|
| 39 |
+
Defaults to get_default_splitter().
|
| 40 |
+
embeddings (HuggingFaceEmbeddings, optional): The embedding model to use.
|
| 41 |
+
Defaults to get_default_embeddings().
|
| 42 |
+
top_k (int, optional): The number of top results to return. Defaults to 5.
|
| 43 |
+
"""
|
| 44 |
+
splitter = splitter or get_default_splitter()
|
| 45 |
+
embeddings = embeddings or get_default_embeddings()
|
| 46 |
+
if isinstance(data, str):
|
| 47 |
+
# If the input is a raw string, split it into chunks first
|
| 48 |
+
chunks = splitter.split_text(data)
|
| 49 |
+
# Then convert those chunks into Document objects
|
| 50 |
+
docs = [Document(page_content=chunk) for chunk in chunks]
|
| 51 |
+
elif isinstance(data, list):
|
| 52 |
+
# If the input is already a list of documents, split them directly
|
| 53 |
+
docs = splitter.split_documents(data)
|
| 54 |
+
else:
|
| 55 |
+
raise ValueError(f"Unsupported data type: {type(data)}. Must be str or List[Document].")
|
| 56 |
+
|
| 57 |
+
index = FAISS.from_documents(docs, embeddings)
|
| 58 |
+
return index.as_retriever(search_kwargs={"k": top_k})
|
| 59 |
+
|
| 60 |
+
|
| 61 |
+
def create_retrieval_qa(
|
| 62 |
+
retriever,
|
| 63 |
+
llm=None
|
| 64 |
+
) -> RetrievalQA:
|
| 65 |
+
"""Creates a RetrievalQA instance from a given retriever and LLM.
|
| 66 |
+
|
| 67 |
+
Args:
|
| 68 |
+
retriever (BaseRetriever): The retriever to be used by the QA chain.
|
| 69 |
+
llm (LLM, optional): The language model to use. If not provided,
|
| 70 |
+
a default model will be initialized.
|
| 71 |
+
"""
|
| 72 |
+
if llm is None:
|
| 73 |
+
load_dotenv(find_dotenv())
|
| 74 |
+
llm = init_chat_model("groq:meta-llama/llama-4-scout-17b-16e-instruct")
|
| 75 |
+
return RetrievalQA.from_chain_type(
|
| 76 |
+
llm=llm,
|
| 77 |
+
chain_type="stuff",
|
| 78 |
+
retriever=retriever,
|
| 79 |
+
return_source_documents=True,
|
| 80 |
+
)
|
tools.py
CHANGED
|
@@ -1,42 +1,30 @@
|
|
| 1 |
import base64
|
| 2 |
import json
|
| 3 |
import os
|
| 4 |
-
import re
|
| 5 |
-
from typing import Optional, Dict
|
| 6 |
-
|
| 7 |
import pandas as pd
|
|
|
|
| 8 |
import requests
|
| 9 |
import whisper
|
| 10 |
|
| 11 |
-
from bs4 import BeautifulSoup
|
| 12 |
from datetime import datetime
|
| 13 |
from dotenv import find_dotenv, load_dotenv
|
| 14 |
from langchain.chains import RetrievalQA
|
| 15 |
from langchain.chat_models import init_chat_model
|
| 16 |
-
from langchain.schema import Document
|
| 17 |
-
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
| 18 |
from langchain_community.document_loaders import (
|
| 19 |
UnstructuredPDFLoader, UnstructuredPowerPointLoader,
|
| 20 |
UnstructuredWordDocumentLoader, WebBaseLoader)
|
| 21 |
-
from langchain_community.tools import DuckDuckGoSearchResults
|
| 22 |
from langchain_community.utilities import GoogleSerperAPIWrapper
|
| 23 |
-
from langchain_community.vectorstores import FAISS
|
| 24 |
from langchain_core.prompts import ChatPromptTemplate
|
| 25 |
from langchain_core.tools import tool
|
| 26 |
-
from langchain_huggingface.embeddings import HuggingFaceEmbeddings
|
| 27 |
from langchain_tavily import TavilySearch
|
| 28 |
-
from markdownify import markdownify as md
|
| 29 |
from youtube_transcript_api import YouTubeTranscriptApi
|
| 30 |
from yt_dlp import YoutubeDL
|
| 31 |
|
|
|
|
|
|
|
| 32 |
|
| 33 |
-
UNWANTED_SECTIONS = {
|
| 34 |
-
"references",
|
| 35 |
-
"external links",
|
| 36 |
-
"further reading",
|
| 37 |
-
"see also",
|
| 38 |
-
"notes",
|
| 39 |
-
}
|
| 40 |
|
| 41 |
@tool
|
| 42 |
def get_weather_info(location: str) -> str:
|
|
@@ -147,153 +135,6 @@ def reverse_text(text: str) -> str:
|
|
| 147 |
return text[::-1]
|
| 148 |
|
| 149 |
|
| 150 |
-
def build_retriever(text: str):
|
| 151 |
-
"""Builds a retriever from the given text.
|
| 152 |
-
|
| 153 |
-
Args:
|
| 154 |
-
text (str): The text to be used for retrieval.
|
| 155 |
-
"""
|
| 156 |
-
splitter = RecursiveCharacterTextSplitter(
|
| 157 |
-
separators=["\n### ", "\n## ", "\n# "],
|
| 158 |
-
chunk_size=1000,
|
| 159 |
-
chunk_overlap=200,
|
| 160 |
-
)
|
| 161 |
-
chunks = splitter.split_text(text)
|
| 162 |
-
docs = [
|
| 163 |
-
Document(page_content=chunk)
|
| 164 |
-
for chunk in chunks
|
| 165 |
-
]
|
| 166 |
-
hf_embed = HuggingFaceEmbeddings(
|
| 167 |
-
model_name="sentence-transformers/all-MiniLM-L6-v2"
|
| 168 |
-
)
|
| 169 |
-
index = FAISS.from_documents(docs, hf_embed)
|
| 170 |
-
return index.as_retriever(search_kwargs={"k": 3})
|
| 171 |
-
|
| 172 |
-
|
| 173 |
-
def get_retrieval_qa(text: str):
|
| 174 |
-
"""Creates a RetrievalQA instance for the given text.
|
| 175 |
-
Args:
|
| 176 |
-
text (str): The text to be used for retrieval.
|
| 177 |
-
"""
|
| 178 |
-
retriever = build_retriever(text)
|
| 179 |
-
llm = init_chat_model("groq:meta-llama/llama-4-scout-17b-16e-instruct")
|
| 180 |
-
return RetrievalQA.from_chain_type(
|
| 181 |
-
llm=llm,
|
| 182 |
-
chain_type="stuff",
|
| 183 |
-
retriever=retriever,
|
| 184 |
-
return_source_documents=True,
|
| 185 |
-
)
|
| 186 |
-
|
| 187 |
-
|
| 188 |
-
def clean_html(html: str) -> str:
|
| 189 |
-
soup = BeautifulSoup(html, "html.parser")
|
| 190 |
-
|
| 191 |
-
# 1. Remove <script> & <style>
|
| 192 |
-
for tag in soup(["script", "style"]):
|
| 193 |
-
tag.decompose()
|
| 194 |
-
|
| 195 |
-
# 2. Drop whole <section> blocks whose first heading is unwanted
|
| 196 |
-
for sec in soup.find_all("section"):
|
| 197 |
-
h = sec.find(["h1","h2","h3","h4","h5","h6"])
|
| 198 |
-
if h and any(h.get_text(strip=True).lower().startswith(u) for u in UNWANTED_SECTIONS):
|
| 199 |
-
sec.decompose()
|
| 200 |
-
|
| 201 |
-
# 3. Additional filtering by CSS selector
|
| 202 |
-
for selector in [".toc", ".navbox", ".vertical-navbox", ".hatnote", ".reflist", ".mw-references-wrap"]:
|
| 203 |
-
for el in soup.select(selector):
|
| 204 |
-
el.decompose()
|
| 205 |
-
|
| 206 |
-
# 4. Isolate the main content container if present
|
| 207 |
-
main = soup.find("div", class_="mw-parser-output")
|
| 208 |
-
return str(main or soup)
|
| 209 |
-
|
| 210 |
-
|
| 211 |
-
def fetch_page_markdown(page_key: str, lang: str="en") -> str:
|
| 212 |
-
"""Fetches the page HTML and returns the <body> as Markdown.
|
| 213 |
-
Args:
|
| 214 |
-
page_key (str): The unique key of the Wikipedia page.
|
| 215 |
-
lang (str): The language code for the Wikipedia edition to fetch (default: "en").
|
| 216 |
-
"""
|
| 217 |
-
url = f"https://api.wikimedia.org/core/v1/wikipedia/{lang}/page/{page_key}/html"
|
| 218 |
-
resp = requests.get(url, timeout=15)
|
| 219 |
-
resp.raise_for_status()
|
| 220 |
-
html = clean_html(resp.text) # Optional, but recommended: clean the HTML to remove unwanted sections
|
| 221 |
-
|
| 222 |
-
markdown = md(
|
| 223 |
-
html,
|
| 224 |
-
heading_style="ATX",
|
| 225 |
-
bullets="*+-",
|
| 226 |
-
table_infer_header=True,
|
| 227 |
-
strip=['a', 'span']
|
| 228 |
-
)
|
| 229 |
-
return markdown
|
| 230 |
-
|
| 231 |
-
|
| 232 |
-
def get_wikipedia_article(query: str) -> Dict[str, str]:
|
| 233 |
-
"""Fetches a Wikipedia article for a given query and returns its content in Markdown format.
|
| 234 |
-
|
| 235 |
-
Args:
|
| 236 |
-
query (str): The search query.
|
| 237 |
-
"""
|
| 238 |
-
headers = {
|
| 239 |
-
'User-Agent': 'MyLLMAgent (llm_agent@example.com)'
|
| 240 |
-
}
|
| 241 |
-
|
| 242 |
-
# Step 1: Search
|
| 243 |
-
search_url = f"https://api.wikimedia.org/core/v1/wikipedia/en/search/page"
|
| 244 |
-
search_params = {'q': query, 'limit': 1}
|
| 245 |
-
search_response = requests.get(search_url, headers=headers, params=search_params, timeout=15)
|
| 246 |
-
|
| 247 |
-
if search_response.status_code != 200:
|
| 248 |
-
raise Exception(f"Search error: {search_response.status_code} - {search_response.text}")
|
| 249 |
-
|
| 250 |
-
results = search_response.json().get("pages", [])
|
| 251 |
-
if not results:
|
| 252 |
-
raise Exception(f"No results found for query: {query}")
|
| 253 |
-
|
| 254 |
-
page = results[0]
|
| 255 |
-
page_key = page["key"]
|
| 256 |
-
|
| 257 |
-
# Step 2: Get the wiki page, only keep relevant content and convert to Markdown
|
| 258 |
-
markdown = fetch_page_markdown(page_key)
|
| 259 |
-
return {
|
| 260 |
-
"page_key": page_key,
|
| 261 |
-
"markdown": markdown,
|
| 262 |
-
}
|
| 263 |
-
|
| 264 |
-
|
| 265 |
-
def parse_sections(markdown_text: str) -> Dict[str, Dict]:
|
| 266 |
-
"""
|
| 267 |
-
Parses markdown into a nested dict:
|
| 268 |
-
{ section_title: {
|
| 269 |
-
"full": full_section_md,
|
| 270 |
-
"subsections": { sub_title: sub_md, ... }
|
| 271 |
-
}, ... }
|
| 272 |
-
"""
|
| 273 |
-
# First split top-level sections
|
| 274 |
-
top_pat = re.compile(r"^##\s+(.*)$", re.MULTILINE)
|
| 275 |
-
top_matches = list(top_pat.finditer(markdown_text))
|
| 276 |
-
sections: Dict[str, Dict] = {}
|
| 277 |
-
for i, m in enumerate(top_matches):
|
| 278 |
-
sec_title = m.group(1).strip()
|
| 279 |
-
start = m.start()
|
| 280 |
-
end = top_matches[i+1].start() if i+1 < len(top_matches) else len(markdown_text)
|
| 281 |
-
sec_md = markdown_text[start:end].strip()
|
| 282 |
-
|
| 283 |
-
# Now split subsections within this block
|
| 284 |
-
sub_pat = re.compile(r"^###\s+(.*)$", re.MULTILINE)
|
| 285 |
-
subs: Dict[str, str] = {}
|
| 286 |
-
sub_matches = list(sub_pat.finditer(sec_md))
|
| 287 |
-
for j, sm in enumerate(sub_matches):
|
| 288 |
-
sub_title = sm.group(1).strip()
|
| 289 |
-
sub_start = sm.start()
|
| 290 |
-
sub_end = sub_matches[j+1].start() if j+1 < len(sub_matches) else len(sec_md)
|
| 291 |
-
subs[sub_title] = sec_md[sub_start:sub_end].strip()
|
| 292 |
-
|
| 293 |
-
sections[sec_title] = {"full": sec_md, "subsections": subs}
|
| 294 |
-
return sections
|
| 295 |
-
|
| 296 |
-
|
| 297 |
@tool
|
| 298 |
def wiki_search_qa(query: str, question: str) -> str:
|
| 299 |
"""Searches Wikipedia for a specific article and answers a question based on its content.
|
|
@@ -304,10 +145,13 @@ def wiki_search_qa(query: str, question: str) -> str:
|
|
| 304 |
Args:
|
| 305 |
query (str): A concise topic name with optional keywords, ideally matching the relevant Wikipedia page title.
|
| 306 |
question (str): The question to answer using the article.
|
|
|
|
|
|
|
| 307 |
"""
|
| 308 |
article = get_wikipedia_article(query)
|
| 309 |
markdown = article["markdown"]
|
| 310 |
-
|
|
|
|
| 311 |
return qa.invoke(question)
|
| 312 |
|
| 313 |
|
|
@@ -344,8 +188,8 @@ def wiki_get_section(
|
|
| 344 |
Returns:
|
| 345 |
Markdown string of either the entire section or just the named subsection.
|
| 346 |
"""
|
| 347 |
-
|
| 348 |
-
markdown =
|
| 349 |
sections = parse_sections(markdown)
|
| 350 |
|
| 351 |
sec_info = sections.get(section)
|
|
@@ -368,7 +212,7 @@ def web_search(query: str, max_results: int = 5) -> str:
|
|
| 368 |
|
| 369 |
Args:
|
| 370 |
query (str): The search query.
|
| 371 |
-
max_results (int): The maximum number of results to return. Default is
|
| 372 |
"""
|
| 373 |
if os.getenv("SERPER_API_KEY"):
|
| 374 |
# Preferred choice: Use Google Serper API for search
|
|
@@ -400,6 +244,8 @@ def web_search(query: str, max_results: int = 5) -> str:
|
|
| 400 |
search_tool = DuckDuckGoSearchResults()
|
| 401 |
results = search_tool.invoke(query)
|
| 402 |
if results:
|
|
|
|
|
|
|
| 403 |
return results
|
| 404 |
else:
|
| 405 |
return "No results found."
|
|
@@ -412,12 +258,12 @@ def visit_website(url: str) -> str:
|
|
| 412 |
Args:
|
| 413 |
url (str): The URL of the website to visit.
|
| 414 |
"""
|
| 415 |
-
|
| 416 |
-
|
| 417 |
-
|
| 418 |
-
return
|
| 419 |
-
|
| 420 |
-
return "
|
| 421 |
|
| 422 |
|
| 423 |
@tool
|
|
|
|
| 1 |
import base64
|
| 2 |
import json
|
| 3 |
import os
|
|
|
|
|
|
|
|
|
|
| 4 |
import pandas as pd
|
| 5 |
+
import re
|
| 6 |
import requests
|
| 7 |
import whisper
|
| 8 |
|
|
|
|
| 9 |
from datetime import datetime
|
| 10 |
from dotenv import find_dotenv, load_dotenv
|
| 11 |
from langchain.chains import RetrievalQA
|
| 12 |
from langchain.chat_models import init_chat_model
|
|
|
|
|
|
|
| 13 |
from langchain_community.document_loaders import (
|
| 14 |
UnstructuredPDFLoader, UnstructuredPowerPointLoader,
|
| 15 |
UnstructuredWordDocumentLoader, WebBaseLoader)
|
| 16 |
+
from langchain_community.tools import DuckDuckGoSearchResults
|
| 17 |
from langchain_community.utilities import GoogleSerperAPIWrapper
|
|
|
|
| 18 |
from langchain_core.prompts import ChatPromptTemplate
|
| 19 |
from langchain_core.tools import tool
|
|
|
|
| 20 |
from langchain_tavily import TavilySearch
|
| 21 |
+
from typing import Optional
|
| 22 |
from youtube_transcript_api import YouTubeTranscriptApi
|
| 23 |
from yt_dlp import YoutubeDL
|
| 24 |
|
| 25 |
+
from retrieval import build_retriever, create_retrieval_qa
|
| 26 |
+
from web_utilities import get_wikipedia_article, parse_sections, fetch_wikipedia_page, MarkdownWebBaseLoader
|
| 27 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 28 |
|
| 29 |
@tool
|
| 30 |
def get_weather_info(location: str) -> str:
|
|
|
|
| 135 |
return text[::-1]
|
| 136 |
|
| 137 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 138 |
@tool
|
| 139 |
def wiki_search_qa(query: str, question: str) -> str:
|
| 140 |
"""Searches Wikipedia for a specific article and answers a question based on its content.
|
|
|
|
| 145 |
Args:
|
| 146 |
query (str): A concise topic name with optional keywords, ideally matching the relevant Wikipedia page title.
|
| 147 |
question (str): The question to answer using the article.
|
| 148 |
+
Returns:
|
| 149 |
+
str: The answer to the question based on the retrieved article.
|
| 150 |
"""
|
| 151 |
article = get_wikipedia_article(query)
|
| 152 |
markdown = article["markdown"]
|
| 153 |
+
retriever = build_retriever(markdown)
|
| 154 |
+
qa = create_retrieval_qa(retriever=retriever)
|
| 155 |
return qa.invoke(question)
|
| 156 |
|
| 157 |
|
|
|
|
| 188 |
Returns:
|
| 189 |
Markdown string of either the entire section or just the named subsection.
|
| 190 |
"""
|
| 191 |
+
result_dict = fetch_wikipedia_page(page_key=page_key)
|
| 192 |
+
markdown = result_dict.get("markdown")
|
| 193 |
sections = parse_sections(markdown)
|
| 194 |
|
| 195 |
sec_info = sections.get(section)
|
|
|
|
| 212 |
|
| 213 |
Args:
|
| 214 |
query (str): The search query.
|
| 215 |
+
max_results (int): The maximum number of results to return. Default is 5.
|
| 216 |
"""
|
| 217 |
if os.getenv("SERPER_API_KEY"):
|
| 218 |
# Preferred choice: Use Google Serper API for search
|
|
|
|
| 244 |
search_tool = DuckDuckGoSearchResults()
|
| 245 |
results = search_tool.invoke(query)
|
| 246 |
if results:
|
| 247 |
+
# Clean up the results to remove any unnecessary spaces or newlines, e.g. \n\n\n
|
| 248 |
+
results = re.sub(r"\n{2,}", "\n", results.strip())
|
| 249 |
return results
|
| 250 |
else:
|
| 251 |
return "No results found."
|
|
|
|
| 258 |
Args:
|
| 259 |
url (str): The URL of the website to visit.
|
| 260 |
"""
|
| 261 |
+
try:
|
| 262 |
+
page_content = MarkdownWebBaseLoader(url).load()[0].page_content
|
| 263 |
+
# TODO: use a retrieval chain when page_content is large; for now the full page is returned
|
| 264 |
+
return page_content
|
| 265 |
+
except Exception as e:
|
| 266 |
+
return f"Could not retrieve website content. Error: {e}"
|
| 267 |
|
| 268 |
|
| 269 |
@tool
|
web_utilities.py
ADDED
|
@@ -0,0 +1,297 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import asyncio
|
| 2 |
+
import logging
|
| 3 |
+
import re
|
| 4 |
+
import requests
|
| 5 |
+
|
| 6 |
+
from bs4 import BeautifulSoup
|
| 7 |
+
from langchain.chains import RetrievalQA
|
| 8 |
+
from langchain_community.document_loaders import WebBaseLoader
|
| 9 |
+
from langchain_core.documents import Document
|
| 10 |
+
from markdownify import markdownify as md
|
| 11 |
+
from playwright.async_api import async_playwright
|
| 12 |
+
from typing import Any, AsyncIterator, Dict, List, Iterator, Optional, Sequence, Union
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
logger = logging.getLogger(__name__)
|
| 16 |
+
|
| 17 |
+
UNWANTED_SECTIONS = {
|
| 18 |
+
"references",
|
| 19 |
+
"external links",
|
| 20 |
+
"further reading",
|
| 21 |
+
"see also",
|
| 22 |
+
"notes",
|
| 23 |
+
}
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
def build_metadata(soup: Any, url: str) -> dict:
|
| 27 |
+
"""Build metadata from BeautifulSoup output."""
|
| 28 |
+
metadata = {"source": url}
|
| 29 |
+
if title := soup.find("title"):
|
| 30 |
+
metadata["title"] = title.get_text()
|
| 31 |
+
if description := soup.find("meta", attrs={"name": "description"}):
|
| 32 |
+
metadata["description"] = description.get("content", "No description found.")
|
| 33 |
+
if html := soup.find("html"):
|
| 34 |
+
metadata["language"] = html.get("lang", "No language found.")
|
| 35 |
+
return metadata
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
class MarkdownWebBaseLoader(WebBaseLoader):
|
| 39 |
+
"""
|
| 40 |
+
A WebBaseLoader subclass that uses Playwright to render JS, then
|
| 41 |
+
strips boilerplate and converts structured pieces to Markdown.
|
| 42 |
+
"""
|
| 43 |
+
def __init__(
|
| 44 |
+
self,
|
| 45 |
+
web_path: Union[str, Sequence[str]] = "",
|
| 46 |
+
header_template: Optional[dict] = None,
|
| 47 |
+
verify_ssl: bool = True,
|
| 48 |
+
proxies: Optional[dict] = None,
|
| 49 |
+
continue_on_failure: bool = False,
|
| 50 |
+
autoset_encoding: bool = True,
|
| 51 |
+
encoding: Optional[str] = None,
|
| 52 |
+
web_paths: Sequence[str] = (),
|
| 53 |
+
requests_per_second: int = 2,
|
| 54 |
+
default_parser: str = "html.parser",
|
| 55 |
+
requests_kwargs: Optional[Dict[str, Any]] = None,
|
| 56 |
+
raise_for_status: bool = False,
|
| 57 |
+
bs_get_text_kwargs: Optional[Dict[str, Any]] = None,
|
| 58 |
+
bs_kwargs: Optional[Dict[str, Any]] = None,
|
| 59 |
+
session: Any = None,
|
| 60 |
+
markdown_kwargs: Optional[Dict[str, Any]] = None,
|
| 61 |
+
unwanted_css: Optional[List[str]] = None,
|
| 62 |
+
unwanted_headings: Optional[List[str]] = None,
|
| 63 |
+
render_wait: float = 1.0,
|
| 64 |
+
*,
|
| 65 |
+
show_progress: bool = True,
|
| 66 |
+
trust_env: bool = False,
|
| 67 |
+
) -> None:
|
| 68 |
+
"""Initialize loader.
|
| 69 |
+
|
| 70 |
+
Args:
|
| 71 |
+
markdown_kwargs: Optional[Dict[str, Any]]: Arguments for markdownify.
|
| 72 |
+
unwanted_css: Optional[List[str]]: CSS selectors to remove from the page.
|
| 73 |
+
unwanted_headings: Optional[List[str]]: Headings to remove from the page.
|
| 74 |
+
render_wait: float: Time to wait for JS rendering (default: 1.0 seconds).
|
| 75 |
+
"""
|
| 76 |
+
super().__init__(
|
| 77 |
+
web_path=web_path,
|
| 78 |
+
header_template=header_template,
|
| 79 |
+
verify_ssl=verify_ssl,
|
| 80 |
+
proxies=proxies,
|
| 81 |
+
continue_on_failure=continue_on_failure,
|
| 82 |
+
autoset_encoding=autoset_encoding,
|
| 83 |
+
encoding=encoding,
|
| 84 |
+
web_paths=web_paths,
|
| 85 |
+
requests_per_second=requests_per_second,
|
| 86 |
+
default_parser=default_parser,
|
| 87 |
+
requests_kwargs=requests_kwargs,
|
| 88 |
+
raise_for_status=raise_for_status,
|
| 89 |
+
bs_get_text_kwargs=bs_get_text_kwargs,
|
| 90 |
+
bs_kwargs=bs_kwargs,
|
| 91 |
+
session=session,
|
| 92 |
+
show_progress=show_progress,
|
| 93 |
+
trust_env=trust_env,
|
| 94 |
+
)
|
| 95 |
+
self.markdown_kwargs = markdown_kwargs or {
|
| 96 |
+
"heading_style": "ATX",
|
| 97 |
+
"bullets": "*+-",
|
| 98 |
+
"strip": ["a", "span"],
|
| 99 |
+
"table_infer_header": True
|
| 100 |
+
}
|
| 101 |
+
self.unwanted_css = unwanted_css or [
|
| 102 |
+
".toc", ".navbox", ".sidebar", ".advertisement", ".cookie-banner", ".vertical-navbox",
|
| 103 |
+
".hatnote", ".reflist", ".mw-references-wrap"
|
| 104 |
+
]
|
| 105 |
+
self.unwanted_headings = [h.lower() for h in (unwanted_headings or UNWANTED_SECTIONS)]
|
| 106 |
+
self.render_wait = render_wait
|
| 107 |
+
|
| 108 |
+
@staticmethod
|
| 109 |
+
def _should_render(html: str, soup: Any) -> bool:
|
| 110 |
+
low_text = len(soup.get_text(strip=True)) < 100
|
| 111 |
+
has_noscript = bool(soup.find("noscript"))
|
| 112 |
+
cf_challenge = "just a moment" in html.lower() or "enable javascript" in html.lower()
|
| 113 |
+
many_scripts = len(soup.find_all("script")) > 20
|
| 114 |
+
return has_noscript or cf_challenge or low_text or many_scripts
|
| 115 |
+
|
| 116 |
+
async def _fetch_with_playwright(self, url: str) -> str:
|
| 117 |
+
async with async_playwright() as pw:
|
| 118 |
+
browser = await pw.chromium.launch(headless=True)
|
| 119 |
+
page = await browser.new_page()
|
| 120 |
+
# If you need cookies/auth, you can do:
|
| 121 |
+
# await page.set_extra_http_headers(self.session.headers)
|
| 122 |
+
await page.goto(url)
|
| 123 |
+
await asyncio.sleep(self.render_wait) # allow JS to finish
|
| 124 |
+
content = await page.content()
|
| 125 |
+
await browser.close()
|
| 126 |
+
return content
|
| 127 |
+
|
| 128 |
+
def _scrape(
|
| 129 |
+
self,
|
| 130 |
+
url: str,
|
| 131 |
+
parser: Union[str, None] = None,
|
| 132 |
+
bs_kwargs: Optional[dict] = None,
|
| 133 |
+
) -> Any:
|
| 134 |
+
if parser is None:
|
| 135 |
+
parser = "xml" if url.endswith(".xml") else self.default_parser
|
| 136 |
+
self._check_parser(parser)
|
| 137 |
+
|
| 138 |
+
resp = self.session.get(url, **self.requests_kwargs)
|
| 139 |
+
if self.raise_for_status:
|
| 140 |
+
resp.raise_for_status()
|
| 141 |
+
if self.encoding is not None:
|
| 142 |
+
resp.encoding = self.encoding
|
| 143 |
+
elif self.autoset_encoding:
|
| 144 |
+
resp.encoding = resp.apparent_encoding
|
| 145 |
+
html = resp.text
|
| 146 |
+
|
| 147 |
+
soup = BeautifulSoup(html, parser, **(bs_kwargs or {}))
|
| 148 |
+
|
| 149 |
+
# If the html looks JS-heavy, re-render with Playwright
|
| 150 |
+
if not url.endswith(".xml") and self._should_render(html, soup):
|
| 151 |
+
try:
|
| 152 |
+
rendered = asyncio.run(self._fetch_with_playwright(url))
|
| 153 |
+
soup = BeautifulSoup(rendered, parser, **(bs_kwargs or {}))
|
| 154 |
+
except Exception as e:
|
| 155 |
+
logger.warning("Playwright rendering failed for %s: %s. Falling back to requests.", url, e)
|
| 156 |
+
|
| 157 |
+
return soup
|
| 158 |
+
|
| 159 |
+
@staticmethod
|
| 160 |
+
def normalize_whitespace(text: str) -> str:
|
| 161 |
+
"""
|
| 162 |
+
Collapse runs of spaces, tabs, etc. down to single spaces—but skip
|
| 163 |
+
inside fenced code blocks ```…``` or inline code `…`.
|
| 164 |
+
"""
|
| 165 |
+
# Replace non-breaking and invisible spaces with regular spaces
|
| 166 |
+
text = text.replace("\u00A0", " ")
|
| 167 |
+
# Strip zero-width spaces:
|
| 168 |
+
text = re.sub(r"[\u200B\u200C\u200D\uFEFF]", "", text)
|
| 169 |
+
|
| 170 |
+
# Split out fenced code -> keep code blocks intact while normalizing other text
|
| 171 |
+
parts = re.split(r'(```.*?```)', text, flags=re.S)
|
| 172 |
+
for i, part in enumerate(parts):
|
| 173 |
+
if not part.startswith("```"):
|
| 174 |
+
# further split out inline code
|
| 175 |
+
subparts = re.split(r'(`[^`\n]+`)', part)
|
| 176 |
+
for j, sp in enumerate(subparts):
|
| 177 |
+
if not sp.startswith("`"):
|
| 178 |
+
# collapse whitespace, strip edges of each segment
|
| 179 |
+
subparts[j] = re.sub(r'[ \t\r\f\v]+', ' ', sp).strip()
|
| 180 |
+
parts[i] = "".join(subparts)
|
| 181 |
+
# Rejoin and ensure paragraphs are separated by a single blank line
|
| 182 |
+
normalized = "\n\n".join(p for p in parts if p.strip() != "")
|
| 183 |
+
return normalized
|
| 184 |
+
|
| 185 |
+
def _convert_soup_to_text(self, soup: Any) -> str:
|
| 186 |
+
# Strip scripts & styles
|
| 187 |
+
for tag in soup(["script", "style"]):
|
| 188 |
+
tag.decompose()
|
| 189 |
+
# Drop blocks whose first heading matches unwanted
|
| 190 |
+
for sec in soup.find_all(["section", "div", "aside"]):
|
| 191 |
+
h = sec.find(["h1", "h2", "h3", "h4", "h5", "h6"])
|
| 192 |
+
if h and any(h.get_text(strip=True).lower().startswith(u) for u in self.unwanted_headings):
|
| 193 |
+
sec.decompose()
|
| 194 |
+
# Drop by CSS selector
|
| 195 |
+
for sel in self.unwanted_css:
|
| 196 |
+
for el in soup.select(sel):
|
| 197 |
+
el.decompose()
|
| 198 |
+
# Isolate the main content container if present
|
| 199 |
+
soup = soup.find("div", class_="mw-parser-output") or soup.find("main") or soup.find("article") or soup
|
| 200 |
+
|
| 201 |
+
# Convert to Markdown text with markdownify
|
| 202 |
+
markdown = md(str(soup), **self.markdown_kwargs)
|
| 203 |
+
markdown = self.normalize_whitespace(markdown)
|
| 204 |
+
return markdown
|
| 205 |
+
|
| 206 |
+
def lazy_load(self) -> Iterator[Document]:
|
| 207 |
+
"""Lazy load text from the url(s) in web_path."""
|
| 208 |
+
for path in self.web_paths:
|
| 209 |
+
soup = self._scrape(path, bs_kwargs=self.bs_kwargs)
|
| 210 |
+
text = self._convert_soup_to_text(soup)
|
| 211 |
+
metadata = build_metadata(soup, path)
|
| 212 |
+
yield Document(page_content=text, metadata=metadata)
|
| 213 |
+
|
| 214 |
+
async def alazy_load(self) -> AsyncIterator[Document]:
|
| 215 |
+
"""Async lazy load text from the url(s) in web_path."""
|
| 216 |
+
results = await self.ascrape_all(self.web_paths)
|
| 217 |
+
for path, soup in zip(self.web_paths, results):
|
| 218 |
+
text = self._convert_soup_to_text(soup)
|
| 219 |
+
metadata = build_metadata(soup, path)
|
| 220 |
+
yield Document(page_content=text, metadata=metadata)
|
| 221 |
+
|
| 222 |
+
|
| 223 |
+
def fetch_wikipedia_page(page_key: str, lang: str = "en") -> Dict[str, str]:
|
| 224 |
+
"""Fetches a Wikipedia page by its key and returns its content in Markdown format.
|
| 225 |
+
|
| 226 |
+
Args:
|
| 227 |
+
page_key (str): The unique key of the Wikipedia page.
|
| 228 |
+
lang (str): The language code for the Wikipedia edition to fetch (default: "en").
|
| 229 |
+
"""
|
| 230 |
+
page_key = page_key.replace(" ", "_") # Ensure the page key is URL-safe
|
| 231 |
+
page_url = f"https://api.wikimedia.org/core/v1/wikipedia/{lang}/page/{page_key}/html"
|
| 232 |
+
visit_website_tool = MarkdownWebBaseLoader(page_url)
|
| 233 |
+
markdown = visit_website_tool.load()[0].page_content
|
| 234 |
+
return {
|
| 235 |
+
"page_key": page_key,
|
| 236 |
+
"markdown": markdown,
|
| 237 |
+
}
|
| 238 |
+
|
| 239 |
+
|
| 240 |
+
def get_wikipedia_article(query: str, lang: str = "en") -> Dict[str, str]:
|
| 241 |
+
"""Searches and fetches a Wikipedia article for a given query and returns its content in Markdown format.
|
| 242 |
+
|
| 243 |
+
Args:
|
| 244 |
+
query (str): The search query.
|
| 245 |
+
lang (str): The language code for the Wikipedia edition to search (default: "en").
|
| 246 |
+
"""
|
| 247 |
+
headers = {
|
| 248 |
+
'User-Agent': 'MyLLMAgent (llm_agent@example.com)'
|
| 249 |
+
}
|
| 250 |
+
|
| 251 |
+
search_url = f"https://api.wikimedia.org/core/v1/wikipedia/en/search/page"
|
| 252 |
+
search_params = {'q': query, 'limit': 1}
|
| 253 |
+
search_response = requests.get(search_url, headers=headers, params=search_params, timeout=15)
|
| 254 |
+
|
| 255 |
+
if search_response.status_code != 200:
|
| 256 |
+
raise Exception(f"Search error: {search_response.status_code} - {search_response.text}")
|
| 257 |
+
|
| 258 |
+
results = search_response.json().get("pages", [])
|
| 259 |
+
if not results:
|
| 260 |
+
raise Exception(f"No results found for query: {query}")
|
| 261 |
+
|
| 262 |
+
page = results[0]
|
| 263 |
+
page_key = page["key"]
|
| 264 |
+
|
| 265 |
+
return fetch_wikipedia_page(page_key, lang)
|
| 266 |
+
|
| 267 |
+
|
| 268 |
+
def parse_sections(markdown_text: str) -> Dict[str, Dict]:
|
| 269 |
+
"""
|
| 270 |
+
Parses markdown into a nested dict:
|
| 271 |
+
{ section_title: {
|
| 272 |
+
"full": full_section_md,
|
| 273 |
+
"subsections": { sub_title: sub_md, ... }
|
| 274 |
+
}, ... }
|
| 275 |
+
"""
|
| 276 |
+
# First split top-level sections
|
| 277 |
+
top_pat = re.compile(r"^##\s+(.*)$", re.MULTILINE)
|
| 278 |
+
top_matches = list(top_pat.finditer(markdown_text))
|
| 279 |
+
sections: Dict[str, Dict] = {}
|
| 280 |
+
for i, m in enumerate(top_matches):
|
| 281 |
+
sec_title = m.group(1).strip()
|
| 282 |
+
start = m.start()
|
| 283 |
+
end = top_matches[i+1].start() if i+1 < len(top_matches) else len(markdown_text)
|
| 284 |
+
sec_md = markdown_text[start:end].strip()
|
| 285 |
+
|
| 286 |
+
# Now split subsections within this block
|
| 287 |
+
sub_pat = re.compile(r"^###\s+(.*)$", re.MULTILINE)
|
| 288 |
+
subs: Dict[str, str] = {}
|
| 289 |
+
sub_matches = list(sub_pat.finditer(sec_md))
|
| 290 |
+
for j, sm in enumerate(sub_matches):
|
| 291 |
+
sub_title = sm.group(1).strip()
|
| 292 |
+
sub_start = sm.start()
|
| 293 |
+
sub_end = sub_matches[j+1].start() if j+1 < len(sub_matches) else len(sec_md)
|
| 294 |
+
subs[sub_title] = sec_md[sub_start:sub_end].strip()
|
| 295 |
+
|
| 296 |
+
sections[sec_title] = {"full": sec_md, "subsections": subs}
|
| 297 |
+
return sections
|