Spaces:
Sleeping
Sleeping
| import os | |
| import re | |
| from typing import Dict, Optional | |
| import google.generativeai as genai | |
| import logging | |
| from dotenv import load_dotenv | |
| from urllib.parse import urlparse | |
| from cachetools import TTLCache | |
# Load environment variables; the summarizers below read GEMINI_API_KEY
# through os.getenv().
load_dotenv()
# Set up logging: INFO level, "timestamp - level - message" format.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
# In-memory cache shared by summarize_text() and quick_summarize():
# up to 1000 items, each kept for 3600 seconds (1 hour).
cache = TTLCache(maxsize=1000, ttl=3600)
async def summarize_text(text: str, url: str = "") -> Dict[str, str]:
    """Summarize text into a title and description using Gemini 1.5 Flash.

    At most the first two 1000-character chunks of ``text`` are sent to the
    model, one request per chunk. The first reply that parses into a
    title/description pair is used, and no further requests are made after
    that. The result (including a fallback) is stored in the module-level
    ``cache`` under a key derived from ``text`` and ``url``.

    Args:
        text: Content to summarize. ``None``/empty text, or text shorter
            than 20 characters after stripping, is replaced by a generic
            sentence built from ``url``.
        url: Source URL, used as prompt context and to derive the domain
            shown in the fallback description. An empty value is replaced
            by ``https://example.com``.

    Returns:
        A dict with ``"title"`` and ``"description"`` keys. Failures
        (missing ``GEMINI_API_KEY``, API errors, invalid arguments) are
        logged and converted into a generic fallback summary instead of
        being raised.
    """
    # Bound before the try block so the error handler can always build a
    # fallback, even when input validation itself is what failed.
    domain = "example.com"
    cache_key = None
    try:
        # Validate inputs. The URL is handled first so that the fallback
        # description can use the real domain even if `text` is unusable.
        if not url:
            url = "https://example.com"
        try:
            domain = urlparse(url).netloc or "example.com"
        except Exception:
            logging.warning(f"Invalid URL: {url}. Using default domain.")
            domain = "example.com"
        text = text.strip() if text else ""

        # Check cache. A single lookup avoids the window in which a TTL
        # entry could expire between an `in` test and the subsequent read.
        cache_key = f"summarize_{hash(text + url)}"
        try:
            cached = cache[cache_key]
        except KeyError:
            pass
        else:
            logging.info(f"Cache hit for {cache_key}")
            return cached

        # Get Gemini API key
        api_key = os.getenv("GEMINI_API_KEY")
        if not api_key:
            logging.error("Gemini API key not found. Please set GEMINI_API_KEY in .env file.")
            raise ValueError("Gemini API key is required for summarization.")

        # Configure Gemini client
        genai.configure(api_key=api_key)
        model = genai.GenerativeModel('gemini-1.5-flash')

        # Handle short or empty text
        if len(text) < 20:
            logging.warning(f"Text too short ({len(text)} chars): '{text}'. Using URL context.")
            text = f"Content from {url} about news, products, or services."

        # Only the first `max_chunks` chunks of `chunk_size` characters are
        # ever sent, to limit request size and count.
        chunk_size = 1000
        max_chunks = 2
        summary_pattern = re.compile(
            r'\{[\s\S]*"title":\s*"([^"]+)"[\s\S]*"description":\s*"([^"]+)"[\s\S]*\}'
        )

        result = None
        for start in range(0, min(len(text), chunk_size * max_chunks), chunk_size):
            chunk = text[start:start + chunk_size]
            prompt = (
                f"Summarize the following text into a title (30-50 characters) and a description (80-100 characters) "
                f"for RCS messaging. Ensure titles are catchy and descriptions are engaging, relevant to the content, "
                f"and suitable for a news, product, or service context inferred from the URL ({url}). "
                f"Output as JSON:\n{{\"title\": \"[title]\", \"description\": \"[description]\"}}\n\nText: {chunk}"
            )
            response = await model.generate_content_async(prompt)
            raw_content = response.text.strip()
            logging.info(f"Raw Gemini response: {raw_content}")

            # Parse response with regex
            match = summary_pattern.search(raw_content)
            if match is None:
                logging.warning(
                    "Failed to parse Gemini response: "
                    "Invalid JSON format in Gemini response. Skipping chunk."
                )
                continue

            # Only the first valid summary is ever returned, so stop here
            # rather than spending another API request on the next chunk.
            result = {"title": match.group(1), "description": match.group(2)}
            break

        if result is None:
            logging.warning("No valid summaries generated. Using fallback.")
            result = {
                "title": "News Summary",
                "description": f"Discover news and insights from {domain}."[:100],
            }

        # Ensure non-empty outputs (the regex accepts whitespace-only values)
        if not result["title"].strip():
            result["title"] = "News Summary"
        if not result["description"].strip():
            result["description"] = f"Discover news and insights from {domain}."

        cache[cache_key] = result
        logging.info(f"Summary - Title: {result['title']}, Description: {result['description']}")
        return result
    except Exception as e:
        logging.error(f"Error summarizing text: {e}")
        # `domain` was resolved (or defaulted) above; re-parsing the URL
        # here could raise again and escape this handler.
        result = {
            "title": "News Summary",
            "description": f"Discover news and insights from {domain}."[:100],
        }
        # The key is missing when the failure happened before it was built.
        if cache_key is not None:
            cache[cache_key] = result
        return result
async def quick_summarize(text: str, url: str = "") -> Dict[str, str]:
    """Quickly summarize text with a lightweight prompt using Gemini 1.5 Pro.

    Only the first 1000 characters of ``text`` are sent, in a single
    request. The result (including a fallback) is stored in the
    module-level ``cache`` under a key derived from ``text`` and ``url``.

    NOTE(review): this function instantiates ``gemini-1.5-pro`` although it
    was previously documented as using Flash; confirm which model is
    intended for the "quick" path.

    Args:
        text: Content to summarize. ``None``/empty text, or text shorter
            than 20 characters after stripping, is replaced by a generic
            sentence built from ``url``.
        url: Source URL, used as prompt context and to derive the domain
            shown in the fallback description. An empty value is replaced
            by ``https://example.com``.

    Returns:
        A dict with ``"title"`` and ``"description"`` keys. Failures
        (missing ``GEMINI_API_KEY``, API errors, invalid arguments) are
        logged and converted into a generic fallback summary instead of
        being raised.
    """
    # Bound before the try block so the error handler can always build a
    # fallback, even when input validation itself is what failed.
    domain = "example.com"
    cache_key = None
    try:
        # Validate inputs. The URL is handled first so that the fallback
        # description can use the real domain even if `text` is unusable.
        if not url:
            url = "https://example.com"
        try:
            domain = urlparse(url).netloc or "example.com"
        except Exception:
            logging.warning(f"Invalid URL: {url}. Using default domain.")
            domain = "example.com"
        text = text.strip() if text else ""

        # Check cache. A single lookup avoids the window in which a TTL
        # entry could expire between an `in` test and the subsequent read.
        cache_key = f"quick_summarize_{hash(text + url)}"
        try:
            cached = cache[cache_key]
        except KeyError:
            pass
        else:
            logging.info(f"Cache hit for {cache_key}")
            return cached

        # Get Gemini API key
        api_key = os.getenv("GEMINI_API_KEY")
        if not api_key:
            logging.error("Gemini API key not found. Please set GEMINI_API_KEY in .env file.")
            raise ValueError("Gemini API key is required for summarization.")

        # Configure Gemini client
        genai.configure(api_key=api_key)
        model = genai.GenerativeModel('gemini-1.5-pro')

        # Handle short or empty text
        if len(text) < 20:
            logging.warning(f"Text too short ({len(text)} chars): '{text}'. Using URL context.")
            text = f"Content from {url} about news, products, or services."

        # Lightweight prompt: only the first chunk is summarized. `text`
        # is never empty here because short input was replaced above.
        chunk_size = 1000
        chunk = text[:chunk_size]
        prompt = (
            f"Create a title (30-50 chars) and description (80-100 chars) for RCS messaging from this text. "
            f"Keep it engaging and relevant to {url}. Output as JSON:\n{{\"title\": \"[title]\", \"description\": \"[description]\"}}\n\nText: {chunk}"
        )
        response = await model.generate_content_async(prompt)
        raw_content = response.text.strip()
        logging.info(f"Raw Gemini response (quick): {raw_content}")

        # Parse response with regex
        match = re.search(
            r'\{[\s\S]*"title":\s*"([^"]+)"[\s\S]*"description":\s*"([^"]+)"[\s\S]*\}',
            raw_content,
        )
        if match is not None:
            result = {"title": match.group(1), "description": match.group(2)}
        else:
            logging.warning(
                "Failed to parse Gemini response: "
                "Invalid JSON format in Gemini response. Skipping chunk."
            )
            logging.warning("No valid summaries generated. Using fallback.")
            result = {
                "title": "Quick Summary",
                "description": f"Check out content from {domain}.",
            }

        # Ensure non-empty outputs (the regex accepts whitespace-only values)
        if not result["title"].strip():
            result["title"] = "Quick Summary"
        if not result["description"].strip():
            result["description"] = f"Check out content from {domain}."

        cache[cache_key] = result
        logging.info(f"Quick summary - Title: {result['title']}, Description: {result['description']}")
        return result
    except Exception as e:
        logging.error(f"Error in quick summarize: {e}")
        # `domain` was resolved (or defaulted) above; re-parsing the URL
        # here could raise again and escape this handler.
        result = {
            "title": "Quick Summary",
            "description": f"Check out content from {domain}.",
        }
        # The key is missing when the failure happened before it was built.
        if cache_key is not None:
            cache[cache_key] = result
        return result