from smolagents import CodeAgent, OpenAIServerModel from pathlib import Path from PIL import Image from dotenv import load_dotenv import os import json import re load_dotenv() class ImageAnalyzerAgent: """Agent for describing images using Gemini with smolagents""" def __init__(self): """Initialize the agent with Gemini configuration""" # Configure Gemini with smolagents using OpenAI-compatible endpoint api_key = os.getenv("GEMINI_API_KEY") if not api_key: raise ValueError("GEMINI_API_KEY not found in environment variables. Get one at https://aistudio.google.com/apikey") # Use Gemini with smolagents via OpenAI-compatible API self.model = OpenAIServerModel( model_id="gemini-2.0-flash", api_base="https://generativelanguage.googleapis.com/v1beta/openai/", api_key=api_key ) # Instantiate the agent self.agent = CodeAgent( tools=[], model=self.model, max_steps=5, verbosity_level=1 ) def retrieve_similar_images(self, image_path): """ Describe a given image. Args: image_path: Path object or string pointing to an image file Returns: Description text of the image """ image_path = Path(image_path) if isinstance(image_path, str) else image_path if not image_path.exists(): raise FileNotFoundError(f"Image not found at {image_path}") image = Image.open(image_path) print(f"Loaded image: {image_path.name}\n") response = self.agent.run( """ Describe this image in a way that could be used as a prompt for generating a new image inspired by it. Focus on the main subjects, composition, style, mood, and colors. Avoid mentioning specific names or exact details — instead, describe the overall aesthetic and atmosphere so the result feels similar but not identical. """, images=[image] ) return response def analyze_image_metadata(self, image_path): """ Analyze an image and extract structured metadata (tags, description, caption). Args: image_path: Path object or string pointing to an image file Returns: Dictionary with keys: tags (list), description (str), caption (str) Raises: FileNotFoundError: If image file doesn't exist ValueError: If response cannot be parsed into valid JSON """ image_path = Path(image_path) if isinstance(image_path, str) else image_path if not image_path.exists(): raise FileNotFoundError(f"Image not found at {image_path}") image = Image.open(image_path) print(f"Loaded image: {image_path.name}\n") prompt = """ Describe this image in the following exact format: result: { "tags": [list of tags related to the image], "description": "a 5-line descriptive description for the image", "caption": "a short description for the image" } """ response = self.agent.run(prompt, images=[image]) # If response is already a dict, return it directly if isinstance(response, dict): return response # Safely convert to string, handling non-string types if response is None: text_content = "" else: text_content = str(response).strip() if not text_content: raise ValueError("Model returned empty response") # Try to extract JSON-like dict from model output try: if "{" not in text_content: raise ValueError("Response does not contain valid JSON structure (missing opening brace)") start = text_content.index("{") # Try to find closing brace if "}" not in text_content[start:]: # No closing brace found, try adding one print(f"[Warning] No closing brace found in response, attempting to add closing brace...") json_str = text_content[start:] + "}" else: end = text_content.rindex("}") + 1 json_str = text_content[start:end] result = json.loads(json_str) return result except ValueError as ve: raise ValueError(f"Failed to parse model output: {text_content}\nError: {ve}") except json.JSONDecodeError as je: raise ValueError(f"Invalid JSON in model output: {text_content}\nError: {je}") except Exception as e: raise ValueError(f"Failed to parse model output: {text_content}\nError: {e}") # Test with sample images if __name__ == "__main__": uploads_dir = Path(__file__).parent.parent.parent / "uploads" sample_image_paths = [ uploads_dir / "img_1_20251024_180707_942.jpg", uploads_dir / "img_2_20251024_180749_372.jpeg", uploads_dir / "img_3_20251024_180756_356.jpeg", ] agent = ImageAnalyzerAgent() # Test with first sample image result = agent.retrieve_similar_images(sample_image_paths[0]) print(f"\n=== Results ===") print(f"Description: {result}") # print(f"Similar images found: {len(result['similar_images'])}")