bivalve committed · Commit f86b3aa · 1 Parent(s): 079d2d9

first attempt
Files changed (4)
  1. agent.py +148 -6
  2. app.py +4 -14
  3. requirements.txt +11 -1
  4. tools.py +531 -0
agent.py CHANGED
@@ -1,7 +1,149 @@
- from typing import TypedDict, Annotated
+ from typing import TypedDict, Annotated, Optional
  from langgraph.graph.message import add_messages
- from langchain_core.messages import AnyMessage, HumanMessage, AIMessage
- from langgraph.prebuilt import ToolNode
- from langgraph.graph import START, StateGraph
- from langgraph.prebuilt import tools_condition
- from langchain_huggingface import HuggingFaceEndpoint, ChatHuggingFace
+ from langchain_core.messages import AnyMessage, HumanMessage, SystemMessage, ToolMessage
+ from langgraph.prebuilt import ToolNode, tools_condition
+ from langgraph.graph import START, StateGraph, END
+ from langchain_openai import ChatOpenAI
+ from pydantic import SecretStr
+ import os
+ from dotenv import load_dotenv
+ from tools import download_file_from_url, basic_web_search, extract_url_content, wikipedia_reader, transcribe_audio_file, question_youtube_video
+
+ # Load environment variables from .env file
+ load_dotenv()
+
+ OPENROUTER_API_KEY = os.getenv("OPENROUTER_API_KEY", "")
+ MAIN_LLM_MODEL = os.getenv("MAIN_LLM_MODEL", "google/gemini-2.0-flash-lite-001")
+
+ # Fail fast if the API key is missing
+ if not OPENROUTER_API_KEY:
+     raise ValueError("OPENROUTER_API_KEY is not set. Please ensure it is defined in your .env file or environment variables.")
+
+
+ def create_agent_graph():
+     # The chat model, served through OpenRouter's OpenAI-compatible endpoint
+     main_llm = ChatOpenAI(
+         model=MAIN_LLM_MODEL,  # e.g., "mistralai/mistral-7b-instruct"
+         api_key=SecretStr(OPENROUTER_API_KEY),  # Your OpenRouter API key
+         base_url="https://openrouter.ai/api/v1",  # Standard OpenRouter API base
+         verbose=True  # Optional: for debugging
+     )
+
+     tools = [download_file_from_url, basic_web_search, extract_url_content, wikipedia_reader, transcribe_audio_file, question_youtube_video]
+     chat_with_tools = main_llm.bind_tools(tools)
+
+     class AgentState(TypedDict):
+         messages: Annotated[list[AnyMessage], add_messages]
+         file_url: Optional[str]
+         file_ext: Optional[str]
+         local_file_path: Optional[str]
+         final_answer: Optional[str]
+
+     def assistant(state: AgentState):
+         return {
+             "messages": [chat_with_tools.invoke(state["messages"])],
+             "file_url": state.get("file_url", None),
+             "file_ext": state.get("file_ext", None),
+             "local_file_path": state.get("local_file_path", None),
+             "final_answer": state.get("final_answer", None)
+         }
+
+     def file_path_updater_node(state: AgentState):
+         # The download tool ends its report with "Local File Path: <path>"; parse the path out
+         download_tool_response = state["messages"][-1].content
+         file_path = download_tool_response.split("Local File Path: ")[-1].strip()
+         return {
+             "local_file_path": file_path
+         }
+
+     def file_path_condition(state: AgentState) -> str:
+         if state["messages"] and isinstance(state["messages"][-1], ToolMessage):
+             tool_response = state["messages"][-1]
+             if tool_response.name == "download_file_from_url":
+                 return "update_file_path"  # Route to the file path updater if a file was downloaded
+         return "assistant"  # Otherwise, continue with the assistant node
+
+     def format_final_answer_node(state: AgentState) -> AgentState:
+         """
+         Formats the final answer based on the state.
+         This node is reached when the assistant has completed its task.
+         """
+         final_answer = state["messages"][-1].content if state["messages"] else None
+         if final_answer:
+             state["final_answer"] = final_answer.split("FINAL ANSWER:")[-1].strip()  # if "FINAL ANSWER:" isn't present, keep the whole string
+         return state
+
+     # The graph
+     builder = StateGraph(AgentState)
+
+     builder.add_node("assistant", assistant)
+     builder.add_edge(START, "assistant")
+     builder.add_node("tools", ToolNode(tools))
+     builder.add_node("file_path_updater_node", file_path_updater_node)
+     builder.add_node("format_final_answer_node", format_final_answer_node)
+
+     builder.add_conditional_edges(
+         "assistant",
+         tools_condition,
+         {
+             "tools": "tools",
+             "__end__": "format_final_answer_node"  # The assistant is done; format its answer
+         }
+     )
+     builder.add_conditional_edges(
+         "tools",
+         file_path_condition,
+         {
+             "update_file_path": "file_path_updater_node",
+             "assistant": "assistant"
+         }
+     )
+
+     builder.add_edge("file_path_updater_node", "assistant")
+     builder.add_edge("format_final_answer_node", END)
+     graph = builder.compile()
+     # Optional visualization; IPython is only available in notebooks, so guard the import
+     try:
+         from IPython.display import Image, display
+         display(Image(graph.get_graph().draw_mermaid_png()))
+     except ImportError:
+         pass
+     return graph
+
+
+ class BasicAgent:
+     """
+     A basic agent that can answer questions and download files.
+     Requires a system message to be defined in 'system_prompt.txt'.
+     """
+     def __init__(self, graph=None):
+         with open("system_prompt.txt", "r", encoding="utf-8") as f:
+             self.system_message = SystemMessage(content=f.read())
+
+         if graph is None:
+             self.graph = create_agent_graph()
+         else:
+             self.graph = graph
+
+     def __call__(self, question: str, file_url: Optional[str] = None, file_ext: Optional[str] = None) -> str:
+         """
+         Call the agent with a question and optional file URL and extension.
+
+         Args:
+             question (str): The user's question.
+             file_url (Optional[str]): The URL of the file to download.
+             file_ext (Optional[str]): The file extension for the downloaded file.
+
+         Returns:
+             str: The agent's response.
+         """
+         if file_url and file_ext:
+             question += f"\nREFERENCE FILE MUST BE RETRIEVED\nFile URL: {file_url}, File Extension: {file_ext}\nUSE A TOOL TO DOWNLOAD THIS FILE."
+         state = {
+             "messages": [self.system_message, HumanMessage(content=question)],
+             "file_url": file_url,
+             "file_ext": file_ext,
+             "local_file_path": None,
+             "final_answer": None
+         }
+         response = self.graph.invoke(state)
+         for m in response["messages"]:
+             m.pretty_print()
+         return response["final_answer"] if response["final_answer"] else "No final answer generated."
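
For reference, a minimal usage sketch of the new class (the question and file values here are hypothetical):

    from agent import BasicAgent

    agent = BasicAgent()  # builds the graph and loads system_prompt.txt
    answer = agent(
        "How many rows are in the attached spreadsheet?",
        file_url="https://agents-course-unit4-scoring.hf.space/files/some-task-id",
        file_ext="xlsx",
    )
    print(answer)  # the text after "FINAL ANSWER:" in the model's last message
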
app.py CHANGED
@@ -3,23 +3,12 @@ import gradio as gr
  import requests
  import inspect
  import pandas as pd
+ from agent import BasicAgent

  # (Keep Constants as is)
  # --- Constants ---
  DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"

- # --- Basic Agent Definition ---
- # ----- THIS IS WERE YOU CAN BUILD WHAT YOU WANT ------
- class BasicAgent:
-     def __init__(self):
-         print("BasicAgent initialized.")
-     def __call__(self, question: str, file_name: str | None, file_ext: str | None) -> str:
-         print(f"Agent received question (first 50 chars): {question[:50]}...")
-         if file_name:
-             print(f"\tAssociated File URL: {file_name}\tFile Extension: {file_ext}")
-         fixed_answer = "This is a default answer."
-         print(f"Agent returning fixed answer: {fixed_answer}")
-         return fixed_answer

  def run_and_submit_all( profile: gr.OAuthProfile | None):
      """
@@ -83,11 +72,12 @@ def run_and_submit_all( profile: gr.OAuthProfile | None):
              continue
          file_name = item.get("file_name")
          file_ext = None
+         file_url = None
          if file_name:
              file_ext = file_name.split(".")[-1]
-             file_name = f"{api_url}/files/{task_id}"
+             file_url = f"{api_url}/files/{task_id}"
          try:
-             submitted_answer = agent(question_text, file_name)
+             submitted_answer = agent(question_text, file_url, file_ext)
              answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
              results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": submitted_answer})
          except Exception as e:
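
To trace the fix above with a hypothetical item whose file_name is "data.csv": file_ext becomes "csv", and the agent now receives a downloadable URL instead of the bare file name:

    file_name = "data.csv"                   # hypothetical item.get("file_name")
    file_ext = file_name.split(".")[-1]      # "csv"
    file_url = f"{api_url}/files/{task_id}"  # the scoring API's file endpoint
    submitted_answer = agent(question_text, file_url, file_ext)
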
requirements.txt CHANGED
@@ -1,2 +1,12 @@
  gradio
- requests
+ requests
+ langgraph
+ langchain-core
+ langchain-community
+ langchain-openai
+ langchain-tavily
+ pydantic
+ python-dotenv
+ pandas
+ yt-dlp
+ beautifulsoup4
+ # needed by tools.py (PyAV frame decoding and Whisper transcription)
+ av
+ torch
+ transformers
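
A sketch of the .env file these changes expect (keys as read in agent.py and tools.py; values are placeholders, and the model and interval settings are optional overrides of the in-code defaults):

    OPENROUTER_API_KEY=sk-or-...
    TAVILY_API_KEY=tvly-...
    MAIN_LLM_MODEL=google/gemini-2.0-flash-lite-001
    YOUTUBE_FRAME_ASSESSMENT_MODEL=google/gemini-2.5-flash-preview-05-20
    YOUTUBE_CONFIRMATION_MODEL=google/gemini-2.5-pro-preview
    CAPTURE_INTERVAL_SEC=2
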
tools.py ADDED
@@ -0,0 +1,531 @@
+ import base64
+ import io
+ import json
+ import os
+ import subprocess
+ from email.message import Message
+ from io import StringIO
+ from pathlib import Path
+ from typing import List
+ import av
+ import pandas as pd
+ import requests
+ import yt_dlp
+ from bs4 import BeautifulSoup
+ from dotenv import load_dotenv
+ from langchain_core.messages import BaseMessage, HumanMessage, SystemMessage
+ from langchain_core.tools import tool
+ from langchain_openai import ChatOpenAI
+ from langchain_tavily import TavilyExtract, TavilySearch
+ from pydantic import SecretStr
+
+ # Load .env here as well: this module reads its keys at import time, which
+ # happens before agent.py gets a chance to call load_dotenv()
+ load_dotenv()
+
+ TAVILY_API_KEY = os.getenv("TAVILY_API_KEY", "")
+ OPENROUTER_API_KEY = os.getenv("OPENROUTER_API_KEY", "")
+ YOUTUBE_FRAME_ASSESSMENT_MODEL = os.getenv("YOUTUBE_FRAME_ASSESSMENT_MODEL", "google/gemini-2.5-flash-preview-05-20")
+ YOUTUBE_CONFIRMATION_MODEL = os.getenv("YOUTUBE_CONFIRMATION_MODEL", "google/gemini-2.5-pro-preview")
+
+ # Define Tools for the Agent
+ @tool(parse_docstring=True)
+ def download_file_from_url(url: str, filename_override: str | None = None) -> str:
+     """
+     Downloads a file from a URL to a directory in the cwd. Prefer the filename associated with the URL, but it can be overridden if directed to.
+
+     Filename Logic:
+     1. If `filename_override` is provided, it is used directly.
+     2. Otherwise, the filename is extracted from the 'Content-Disposition' HTTP header
+        using Python's `email.message.Message` parser. The result is sanitized.
+     3. If no filename is provided via override and none can be determined from
+        the header, a ValueError is raised.
+
+     Args:
+         url: The URL of the file to download.
+         filename_override: Optional. If provided, this exact name is used for the downloaded file. Using the name associated with the URL is recommended (but may require identifying the extension).
+
+     Returns:
+         The full path to the downloaded file.
+
+     Raises:
+         requests.exceptions.RequestException: For HTTP errors (e.g., 404, network issues).
+         IOError: If the file cannot be written.
+         ValueError: If no filename can be determined (neither provided via override
+             nor found in the Content-Disposition header).
+     """
+     try:
+         with requests.Session() as session:
+             with session.get(url, stream=True, allow_redirects=True, timeout=30) as response:
+                 response.raise_for_status()
+
+                 final_filename = None
+
+                 if filename_override:
+                     final_filename = filename_override
+                     print(f"Using provided filename: {final_filename}")
+                 else:
+                     content_disposition = response.headers.get('content-disposition')
+                     if content_disposition:
+                         msg = Message()
+                         msg['Content-Disposition'] = content_disposition
+                         filename_from_header = msg.get_filename()  # Handles various encodings
+
+                         if filename_from_header:
+                             # Sanitize by taking only the basename to prevent path traversal
+                             final_filename = os.path.basename(filename_from_header)
+                             print(f"Using filename from Content-Disposition: {final_filename}")
+
+                 if not final_filename:
+                     raise ValueError(
+                         "No filename could be determined. "
+                         "None was provided as an override, and it could not be "
+                         "extracted from the Content-Disposition header."
+                     )
+
+                 current_dir = Path.cwd()
+                 temp_dir = current_dir / "temp_downloads"
+                 temp_dir.mkdir(parents=True, exist_ok=True)
+
+                 local_filepath = os.path.join(temp_dir, final_filename)
+
+                 with open(local_filepath, 'wb') as f:
+                     for chunk in response.iter_content(chunk_size=8192):
+                         if chunk:
+                             f.write(chunk)
+
+                 return f"File downloaded successfully. Local File Path: {local_filepath}"
+
+     except requests.exceptions.RequestException as e:
+         print(f"Error during download from {url}: {e}")
+         raise
+     except IOError as e:
+         print(f"Error writing file: {e}")
+         raise
+     # ValueError will propagate if raised
+
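+ # Usage sketch (hypothetical URL): @tool-wrapped functions are invoked with a
+ # dict of their arguments rather than called directly, e.g.
+ #   download_file_from_url.invoke({"url": "https://example.com/report.pdf"})
+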
+ @tool(parse_docstring=True)
+ def basic_web_search(query: str, search_domains: list[str] | None = None) -> str:
+     """
+     Perform a web search using Tavily. Useful for retrieving relevant URLs and content summaries based on a search query.
+     The content returned by this tool is limited. For more detailed content extraction, use the `extract_url_content` tool.
+     To limit the search to specific domains, pass a list of domains (e.g., ['wikipedia.org', 'example.com']).
+
+     Args:
+         query (str): The search query to perform.
+         search_domains (None | list[str]): Optional. A list of domains (e.g., ['wikipedia.org', 'example.com']) to restrict the search to. If None, searches across all domains.
+
+     Returns:
+         str: A JSON-formatted string containing the search results, including titles, content snippets, and URLs.
+     """
+     search_tool = TavilySearch(
+         api_key=SecretStr(TAVILY_API_KEY),
+         max_results=5,
+         include_raw_content=False,
+         # include_answer=True,
+         include_domains=search_domains
+     )
+
+     results = search_tool.invoke({"query": query})
+
+     if results and isinstance(results, dict) and len(results["results"]) > 0:
+         return_dict = {
+             # "answer": "The following is an unconfirmed answer. Confirm it by extracting content from a URL." + results.get("answer", ""),
+             "results": []
+         }
+         for result in results["results"]:
+             if "title" in result and "content" in result and result['score'] > 0.25:  # Filter results based on score
+                 return_dict["results"].append({
+                     "title": result["title"],
+                     "url": result["url"],
+                     "content": result["content"],
+                 })
+         if len(return_dict["results"]) == 0:
+             return "No results found. If the query is too specific, try a more general search term."
+         return json.dumps(return_dict, indent=2)
+     else:
+         return "No results found. If the query is too specific, try a more general search term."
+
+ @tool(parse_docstring=True)
+ def extract_url_content(url_list: list[str]) -> str:
+     """
+     Extracts the content from URLs using Tavily's extract tool.
+     This tool is useful for retrieving content from web pages.
+     It will most likely be used after a web search, to extract content from the URLs returned by the search.
+
+     Args:
+         url_list (list[str]): The URLs to extract content from.
+
+     Returns:
+         str: The extracted content or an error message if extraction fails.
+     """
+     extract_tool = TavilyExtract(api_key=SecretStr(TAVILY_API_KEY))
+     extract_results = extract_tool.invoke({'urls': url_list})
+
+     if extract_results and 'results' in extract_results and len(extract_results['results']) > 0:
+         for i, page_content in enumerate(extract_results['results']):
+             extract_results['results'][i].pop('images', None)  # Drop image data; only text is returned
+             if len(page_content['raw_content']) > 40000:
+                 extract_results['results'][i]['raw_content'] = page_content['raw_content'][:40000] + '... [truncated]'
+         return json.dumps(extract_results['results'], indent=2)
+     else:
+         return f"No content could be extracted from the provided URLs: {url_list}"
+
+
+ def bs_html_parser(url):
+     response = requests.get(url)  # Send a GET request to the URL
+
+     # Check if the request was successful
+     if response.status_code == 200:
+         return BeautifulSoup(response.text, "html.parser")  # Parse and return the HTML
+     else:
+         return None  # Return None if the request fails
+
+ def get_table_title(table_tag):
+     """
+     Extracts a title for a given table tag.
+     It looks for a <caption>, then for the closest preceding <h1>-<h6> tag.
+     """
+     title = "Untitled Table"
+
+     # 1. Check for a <caption> element within the table
+     caption = table_tag.find('caption')
+     if caption:
+         caption_text = caption.get_text(strip=True)
+         if caption_text:  # Ensure caption is not empty and use it
+             return caption_text
+
+     # 2. If no caption, look for the closest preceding heading tag (h1-h6)
+     headings = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']
+     # find_all_previous gets all previous tags matching criteria, in reverse document order.
+     # limit=1 gets the closest one (the last one encountered before the table).
+     preceding_headings = table_tag.find_all_previous(headings, limit=1)
+
+     if preceding_headings:
+         heading_tag = preceding_headings[0]
+
+         # To get the cleanest text, prefer 'mw-headline' if it exists;
+         # otherwise, clone the heading, remove edit sections, and then get text.
+
+         # Try to find a specific 'mw-headline' span first (common in Wikipedia)
+         headline_span = heading_tag.find("span", class_="mw-headline")
+         if headline_span:
+             title_text = headline_span.get_text(strip=True)
+         else:
+             # Fallback: create a temporary copy of the heading tag to modify it
+             # without affecting the main soup.
+             temp_heading_soup = BeautifulSoup(str(heading_tag), 'html.parser')
+             temp_heading_tag = temp_heading_soup.find(heading_tag.name)
+
+             if temp_heading_tag:
+                 # Remove "edit" links (span with class "mw-editsection")
+                 for span in temp_heading_tag.find_all("span", class_="mw-editsection"):
+                     span.decompose()
+                 title_text = temp_heading_tag.get_text(strip=True)
+             else:
+                 # If cloning somehow failed, take raw text (less ideal)
+                 title_text = heading_tag.get_text(strip=True)
+
+         if title_text:  # Ensure title_text is not empty
+             title = title_text
+
+     return title
+
+ @tool(parse_docstring=True)
+ def wikipedia_reader(url: str) -> str:
+     """
+     Extracts sections, paragraphs, and tables from a Wikipedia page.
+
+     Args:
+         url (str): The URL of the Wikipedia page to extract content from.
+
+     Returns:
+         str: A JSON string containing sections, paragraphs, and tables.
+     """
+     soup = bs_html_parser(url)
+     if not soup:
+         return f"Failed to retrieve or parse the page at {url}."
+
+     def extract_links(soup_obj):
+         links = []
+         for link in soup_obj.find_all('a', href=True):
+             href = link.get('href')
+             # Filter for internal page links (sections)
+             if href and href.startswith("#") and "#cite_" not in href and len(href) > 1:
+                 links.append(url + href)
+             # Original logic for other links starting with the base URL (might need adjustment based on desired links)
+             # elif href and href.startswith(url):
+             #     links.append(href)
+         return links
+
+     links = extract_links(soup)
+
+     def extract_paragraphs(soup_obj):
+         paragraphs_text = [p.get_text(strip=True) for p in soup_obj.find_all("p")]
+         return [p for p in paragraphs_text if p and len(p) > 10]
+
+     paragraphs = extract_paragraphs(soup)
+
+     def extract_tables(soup_obj):
+         tables_with_titles = []
+         for table_tag in soup_obj.find_all("table", {"class": "wikitable"}):
+             title = get_table_title(table_tag)  # Get the title
+             try:
+                 # Pandas read_html expects a string or file-like object;
+                 # StringIO simulates a file, as read_html can be sensitive
+                 table_html_str = str(table_tag)
+                 df_list = pd.read_html(StringIO(table_html_str))
+                 if df_list:
+                     df = df_list[0]  # read_html returns a list of DataFrames
+                     tables_with_titles.append({"title": title, "table_data": df.to_dict(orient='records')})
+                 else:
+                     tables_with_titles.append({"title": title, "table_data": None, "error": "pd.read_html returned empty list"})
+             except Exception as e:
+                 tables_with_titles.append({"title": title, "table_data": None, "error": str(e)})
+         return tables_with_titles
+
+     tables = extract_tables(soup)  # This returns a list of dicts
+
+     return_dict = {
+         "sections": links,
+         "paragraphs": paragraphs,
+         "tables": tables
+     }
+
+     return json.dumps(return_dict, indent=2, ensure_ascii=False)  # Return as JSON string
+
+
+ # Singleton class for the Whisper model:
+ # the model is loaded once, the first time the tool is used, and reused afterwards
+ class WhisperTranscriber:
+     _instance = None
+
+     def __new__(cls):
+         if cls._instance is None:
+             import torch
+             from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor
+             from transformers.pipelines import pipeline
+
+             device = "cuda:0" if torch.cuda.is_available() else "cpu"
+             torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
+             model_id = "openai/whisper-large-v3"
+
+             model = AutoModelForSpeechSeq2Seq.from_pretrained(
+                 model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
+             )
+             model.to(device)
+
+             processor = AutoProcessor.from_pretrained(model_id)
+             pipe = pipeline(
+                 "automatic-speech-recognition",
+                 model=model,
+                 tokenizer=processor.tokenizer,
+                 feature_extractor=processor.feature_extractor,
+                 torch_dtype=torch_dtype,
+                 device=device,
+             )
+
+             # Note: the cached "instance" is the transformers pipeline itself,
+             # so WhisperTranscriber() returns a callable pipeline
+             cls._instance = pipe
+         return cls._instance
+
+
+ @tool(parse_docstring=True)
+ def transcribe_audio_file(file_path: str) -> str:
+     """
+     Transcribes an audio file to text using OpenAI's Whisper-large-v3 model, caching the model after the first load.
+
+     Args:
+         file_path (str): The path to the audio file to transcribe.
+
+     Returns:
+         str: The transcription of the audio file.
+     """
+     pipe = WhisperTranscriber()
+     transcription = pipe(file_path)["text"]
+     return transcription.strip() if transcription else "No transcription available."
+
+
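+ # Usage sketch (hypothetical path): the first call loads Whisper via the
+ # singleton above; subsequent calls reuse the cached pipeline, e.g.
+ #   transcribe_audio_file.invoke({"file_path": "temp_downloads/interview.mp3"})
+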
+ @tool(parse_docstring=True)
+ def question_youtube_video(video_url: str, query: str) -> str:
+     """
+     Returns an answer to a question about a YouTube video.
+     The video is streamed and one frame is captured every CAPTURE_INTERVAL_SEC seconds (an environment variable, default 2).
+     These frames are sent sequentially to a multimodal model to answer the question about the video.
+     The final answer is aggregated from the answers for each frame.
+
+     Args:
+         video_url (str): The URL of the video to capture frames from.
+         query (str): The question to answer about the video.
+
+     Returns:
+         str: The answer to the question about the video.
+     """
+     CAPTURE_INTERVAL_SEC = int(os.getenv("CAPTURE_INTERVAL_SEC", 2))  # Default to 2 seconds if not set
+
+     # First, we need to get the video stream URL using yt-dlp
+     ydl_opts = {
+         "quiet": True,
+         "skip_download": True,
+         "format": "mp4[ext=mp4]+bestaudio/best",
+         "forceurl": True,
+         "noplaylist": True,
+         "writesubtitles": True,
+         "writeautomaticsub": True,
+         "subtitlesformat": "vtt",
+         "subtitleslangs": ['en'],
+     }
+
+     with yt_dlp.YoutubeDL(ydl_opts) as ydl:
+         info_dict = ydl.extract_info(video_url, download=False)
+         assert isinstance(info_dict, dict), "Failed to extract video information. Please check the video URL."
+         stream_url = info_dict.get("url", None)
+
+     # Validate the stream URL before handing it to FFmpeg
+     if stream_url is None:
+         raise ValueError("Could not retrieve video stream URL. Please check the video URL and try again.")
+
+     # Second, we use FFmpeg to pipe the video stream to stdout
+     ffmpeg_cmd = [
+         "ffmpeg",
+         "-i",
+         stream_url,
+         "-f",
+         "matroska",  # container format
+         "-",
+     ]
+
+     process = subprocess.Popen(
+         ffmpeg_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE
+     )
+
+     container = av.open(process.stdout)
+     stream = container.streams.video[0]
+     time_base = stream.time_base
+     if time_base is None:
+         raise ValueError("Could not determine time base for the video stream. Please check the video URL and try again.")
+     else:
+         time_base = float(time_base)
+
+     # Third, we use a multimodal model to analyze the video frames
+     image_model = ChatOpenAI(
+         model=YOUTUBE_FRAME_ASSESSMENT_MODEL,  # Multimodal model for per-frame assessment
+         api_key=SecretStr(OPENROUTER_API_KEY),  # Your OpenRouter API key
+         base_url="https://openrouter.ai/api/v1",  # Standard OpenRouter API base
+         verbose=True  # Optional: for debugging
+     )
+     image_model_system_prompt = SystemMessage(
+         content="You will be shown a frame from a video along with a question about that video and an answer based on the previous frames in the video. "
+                 "Your task is to analyze the frame and provide an answer to the question using both the current frame and the previous answer. "
+                 "If the previous answer is reasonable and the current frame cannot answer the question, return the previous answer. "
+                 "For example, if the question is about the color of a car and the previous answer is 'red' but the current frame shows no car, you should return 'red'. "
+                 "If the question is about the greatest number of something in the video, you should return the number counted in the current frame or the previous answer, whichever is greater. "
+                 "For example, if the current frame has 5 objects but the previous answer is 10 objects, you should return '10'. "
+                 "Be concise and clear in your answers, and do not repeat the question. "
+     )
+
+     # Then, we loop through the frames and analyze them one by one, skipping frames based on the capture interval
+     next_capture_time = 0
+     aggregated_answer = ''
+
+     answers_list: List[dict] = []
+
+     for frame in container.decode(stream):
+         if frame.pts is None:
+             continue
+
+         timestamp = float(frame.pts * time_base)
+         if timestamp >= next_capture_time:
+             # Convert the frame to an image format that the model can process
+             buf = io.BytesIO()
+             img = frame.to_image()
+             img.save(buf, format="JPEG")  # using PIL.Image.save
+             jpeg_bytes = buf.getvalue()
+             frame_base64 = base64.b64encode(jpeg_bytes).decode("utf-8")
+
+             # Explicitly type the list to hold instances of BaseMessage
+             msgs: List[BaseMessage] = [image_model_system_prompt]
+
+             frame_query = query
+
+             if aggregated_answer:
+                 frame_query += f"\nPrevious Answer: {aggregated_answer}"
+                 frame_query += "\nProvide a concise answer based on the previous answer and the current frame. " \
+                     "If the current frame does not answer the question but there is a previous answer, return the previous answer. " \
+                     "REMEMBER: This question is not about the current frame! It is about the video as a whole. ALWAYS PAY ATTENTION TO THE PREVIOUS ANSWER!"
+
+             msgs.append(HumanMessage(content=[
+                 {
+                     "type": "text",
+                     "text": frame_query
+                 },
+                 {
+                     "type": "image",
+                     "source_type": "base64",
+                     "mime_type": "image/jpeg",
+                     "data": frame_base64
+                 }
+             ]))
+
+             response = image_model.invoke(msgs)  # Pass the frame and query to the model
+             # Extract the answer from the model's response
+             assert isinstance(response.content, str), "The model's response should be a string."
+             answer = response.content.strip()
+             answers_list.append({"timestamp": timestamp, "answer": answer})
+             if answer:
+                 aggregated_answer = answer
+             next_capture_time += CAPTURE_INTERVAL_SEC
+
+     process.terminate()
+
+     final_answer_model = ChatOpenAI(
+         model=YOUTUBE_CONFIRMATION_MODEL,  # Model that confirms or corrects the aggregated answer
+         api_key=SecretStr(OPENROUTER_API_KEY),  # Your OpenRouter API key
+         base_url="https://openrouter.ai/api/v1",  # Standard OpenRouter API base
+         verbose=True  # Optional: for debugging
+     )
+
+     final_answer_system_message = SystemMessage(
+         "You are a brilliant assistant who is eager to help and extremely detail-oriented. "
+         "A group of individuals have been asked the same question about a video. "
+         "None of the individuals have seen the entire video. "
+         "Each individual, when asked the question, was provided a frame from the video, as well as the previously reported answer based on the previous frame. "
+         "Your job is to report a final answer for the question about the video. "
+         "Ideally, the final answer has already been reported correctly by the last individual. "
+         "However, this is similar to the game of telephone, where the true answer can become corrupted along the way. "
+         "Assess all of the answers. If you can confirm the final answer is correct, simply return it. "
+         "If you notice that the final answer is incorrect, then identify the correct answer and report that. "
+         "You will also have access to the video title and description, which may help you identify the correct answer. "
+         "Be concise and only respond with the correct final answer!"
+     )
+
+     answers_list_str = "\n".join([f"Answer {i+1} at {ans['timestamp']:.2f}s: {ans['answer']}" for i, ans in enumerate(answers_list)])
+
+     final_query = (
+         f"Video Title: {info_dict.get('title', 'No title found')}. "
+         f"Video Description: {info_dict.get('description', 'No description found')}. "
+         f"Question about video: {query} "
+         f"Answers provided by individuals: \n{answers_list_str}\n\n "
+         "Provide a concise final answer to the question about the video based on the previous answers. "
+         "Include a short explanation of why you chose this answer. "
+         "Format the answer like so: "
+         "Explanation: <your explanation here>. "
+         "Final Answer: <your answer here>. "
+     )
+
+     final_msgs = [
+         final_answer_system_message,
+         HumanMessage(content=[
+             {
+                 "type": "text",
+                 "text": final_query
+             }
+         ])
+     ]
+     final_response = final_answer_model.invoke(final_msgs)
+     assert isinstance(final_response.content, str), "The final model's response should be a string."
+     final_answer = final_response.content.strip()
+
+     return final_answer
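
A closing usage sketch (URL and question are hypothetical). Note the cost profile implied by the loop above: one multimodal call per captured frame (roughly 30 calls for a 60-second video at the default 2-second interval) plus one final confirmation call:

    answer = question_youtube_video.invoke({
        "video_url": "https://www.youtube.com/watch?v=VIDEO_ID",
        "query": "What is the highest number of birds visible at once?",
    })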