import base64
import os

from langchain_core.messages import AnyMessage, SystemMessage, HumanMessage
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.tools import Tool
from langchain_core.tools import tool
from langchain_openai import ChatOpenAI
provider = os.getenv("PROVIDER", "GOOGLE")

if provider == "GOOGLE":
    api_key = os.getenv("GEMINI_API_KEY")
    vision_llm = ChatGoogleGenerativeAI(
        model="gemini-2.5-pro-preview-05-06",
        temperature=0,
        max_retries=2,
        google_api_key=api_key,
        thinking_budget=0,
    )
elif provider == "OPENAI":
    vision_llm = ChatOpenAI(
        model="gpt-4o",
        temperature=0,
        max_retries=2,
    )
else:
    raise ValueError(f"Invalid provider: {provider}")
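
# Usage note (a sketch, not part of the original file): the backend is picked
# via environment variables before launch, e.g.
#   export PROVIDER=GOOGLE     # or OPENAI
#   export GEMINI_API_KEY=...  # the OPENAI branch lets ChatOpenAI read
#                              # OPENAI_API_KEY from the environment itself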
def extract_text(img_path: str) -> str:
    """
    Extract text from an image file using a multimodal model.

    Args:
        img_path: The path to the image file from which to extract text.

    Returns:
        The extracted text from the image, or an empty string if an error occurs.
    """
    all_text = ""
    try:
        # Read the image and encode it as base64
        with open(img_path, "rb") as image_file:
            image_bytes = image_file.read()
        image_base64 = base64.b64encode(image_bytes).decode("utf-8")

        # Build a multimodal prompt embedding the base64 image data
        message = [
            HumanMessage(
                content=[
                    {
                        "type": "text",
                        "text": (
                            "Extract all the text from this image. "
                            "Return only the extracted text, no explanations."
                        ),
                    },
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:image/png;base64,{image_base64}"
                        },
                    },
                ]
            )
        ]

        # Call the vision-capable model
        response = vision_llm.invoke(message)

        # Append the extracted text
        all_text += response.content + "\n\n"
        return all_text.strip()
    except Exception as e:
        # Fail gracefully: log the error and return an empty string
        error_msg = f"Error extracting text: {str(e)}"
        print(error_msg)
        return ""
def analyze_image_tool(user_query: str, img_path: str) -> str:
    """
    Answer the question by reasoning on the image.

    Args:
        user_query: The question to be answered based on the image.
        img_path: Path to the image file to be analyzed.

    Returns:
        The answer to the query based on image content, or an empty string if an error occurs.
    """
    all_text = ""
    try:
        # Read the image and encode it as base64
        with open(img_path, "rb") as image_file:
            image_bytes = image_file.read()
        image_base64 = base64.b64encode(image_bytes).decode("utf-8")

        # Build a multimodal prompt embedding the base64 image data
        message = [
            HumanMessage(
                content=[
                    {
                        "type": "text",
                        "text": f"User query: {user_query}",
                    },
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:image/png;base64,{image_base64}"
                        },
                    },
                ]
            )
        ]

        # Call the vision-capable model
        response = vision_llm.invoke(message)

        # Append the model's answer
        all_text += response.content + "\n\n"
        return all_text.strip()
    except Exception as e:
        # Fail gracefully: log the error and return an empty string
        error_msg = f"Error analyzing image: {str(e)}"
        print(error_msg)
        return ""
def analyze_audio_tool(user_query: str, audio_path: str) -> str:
    """Answer the question by reasoning on the provided audio file.

    Args:
        user_query: The question to be answered based on the audio content.
        audio_path: Path to the audio file (e.g., .mp3, .wav, .flac, .aac, .ogg).

    Returns:
        The answer to the query based on audio content, or an error message/empty string if an error occurs.
    """
    try:
        # Determine the MIME type from the file extension
        _filename, file_extension = os.path.splitext(audio_path)
        file_extension = file_extension.lower()
        supported_formats = {
            ".mp3": "audio/mp3", ".wav": "audio/wav", ".flac": "audio/flac",
            ".aac": "audio/aac", ".ogg": "audio/ogg"
        }
        if file_extension not in supported_formats:
            return (f"Error: Unsupported audio file format '{file_extension}'. "
                    f"Supported extensions: {', '.join(supported_formats.keys())}.")
        mime_type = supported_formats[file_extension]

        # Read the audio file and encode it as base64
        with open(audio_path, "rb") as audio_file:
            audio_bytes = audio_file.read()
        audio_base64 = base64.b64encode(audio_bytes).decode("utf-8")

        # Build a multimodal prompt embedding the base64 audio data
        message = [
            HumanMessage(
                content=[
                    {
                        "type": "text",
                        "text": f"User query: {user_query}",
                    },
                    {
                        "type": "audio",
                        "source_type": "base64",
                        "mime_type": mime_type,
                        "data": audio_base64,
                    },
                ]
            )
        ]

        # Call the multimodal model
        response = vision_llm.invoke(message)
        return response.content.strip()
    except Exception as e:
        error_msg = f"Error analyzing audio: {str(e)}"
        print(error_msg)
        return ""