# RCS / summarizer.py
# AiDeveloper1's picture
# Update summarizer.py
# 5c7501f verified
import json
import logging
import os
import re
from typing import Dict, Optional
from urllib.parse import urlparse

import google.generativeai as genai
from cachetools import TTLCache
from dotenv import load_dotenv
# Read configuration (e.g. GEMINI_API_KEY) from a local .env file, if present.
load_dotenv()

# Root logger: INFO and above, formatted as "timestamp - LEVEL - message".
_LOG_FORMAT = '%(asctime)s - %(levelname)s - %(message)s'
logging.basicConfig(format=_LOG_FORMAT, level=logging.INFO)

# Shared in-memory result cache: at most 1000 entries, each kept for one hour.
_CACHE_MAX_ITEMS = 1000
_CACHE_TTL_SECONDS = 3600
cache = TTLCache(maxsize=_CACHE_MAX_ITEMS, ttl=_CACHE_TTL_SECONDS)
async def summarize_text(text: str, url: str = "") -> Dict[str, str]:
    """Summarize text into an RCS title and description using Gemini 1.5 Flash.

    Up to the first two 1000-character chunks of ``text`` are tried in order.
    The first chunk whose response yields a title and a description wins, and
    no further request is sent after that. The result (including the generic
    fallback) is stored in the module-level ``cache`` under a key derived
    from the stripped text and the URL, so a failing input is only retried
    once its cache entry has expired.

    Args:
        text: Source text. ``None``, non-string or very short (< 20 chars)
            input is replaced by a generic sentence built from ``url``.
        url: Page the text came from. It is quoted in the prompt and its host
            is used in the fallback description. An empty value is replaced
            by ``https://example.com``.

    Returns:
        A dict with ``"title"`` and ``"description"`` string values. Errors
        while summarizing (missing ``GEMINI_API_KEY``, a failing API call, an
        unusable response) are logged and answered with the generic fallback
        summary instead of being propagated.
    """
    # Everything the error path needs (domain, fallback text, cache key) is
    # computed up front so the ``except`` branch can never hit an unbound name.
    text = text.strip() if isinstance(text, str) else ""
    if not isinstance(url, str) or not url:
        url = "https://example.com"
    try:
        domain = urlparse(url).netloc or "example.com"
    except ValueError:
        # urlparse rejects some malformed hosts, e.g. an unclosed "[" bracket.
        logging.warning(f"Invalid URL: {url}. Using default domain.")
        domain = "example.com"
    fallback_description = f"Discover news and insights from {domain}."[:100]

    # Hash the (text, url) pair rather than the concatenation so that
    # ("ab", "c") and ("a", "bc") cannot share a key.
    cache_key = f"summarize_{hash((text, url))}"
    # Single lookup: an entry cannot expire between a membership test and the
    # read that follows it.
    cached = cache.get(cache_key)
    if cached is not None:
        logging.info(f"Cache hit for {cache_key}")
        return cached

    result = None
    try:
        api_key = os.getenv("GEMINI_API_KEY")
        if not api_key:
            logging.error("Gemini API key not found. Please set GEMINI_API_KEY in .env file.")
            raise ValueError("Gemini API key is required for summarization.")

        genai.configure(api_key=api_key)
        model = genai.GenerativeModel('gemini-1.5-flash')

        # Too little content to summarize: fall back to a URL-based sentence.
        if len(text) < 20:
            logging.warning(f"Text too short ({len(text)} chars): '{text}'. Using URL context.")
            text = f"Content from {url} about news, products, or services."

        chunk_size = 1000
        max_chunks = 2  # Only the first 2000 characters are ever sent.
        for start in range(0, min(len(text), chunk_size * max_chunks), chunk_size):
            chunk = text[start:start + chunk_size]
            prompt = (
                f"Summarize the following text into a title (30-50 characters) and a description (80-100 characters) "
                f"for RCS messaging. Ensure titles are catchy and descriptions are engaging, relevant to the content, "
                f"and suitable for a news, product, or service context inferred from the URL ({url}). "
                f"Output as JSON:\n{{\"title\": \"[title]\", \"description\": \"[description]\"}}\n\nText: {chunk}"
            )
            response = await model.generate_content_async(prompt)
            raw_content = response.text.strip()
            logging.info(f"Raw Gemini response: {raw_content}")

            # Prefer real JSON decoding: it copes with escaped quotes and with
            # either key order, both of which the regex fallback gets wrong.
            summary = None
            first_brace, last_brace = raw_content.find("{"), raw_content.rfind("}")
            if 0 <= first_brace < last_brace:
                try:
                    payload = json.loads(raw_content[first_brace:last_brace + 1])
                except ValueError:
                    payload = None
                if (
                    isinstance(payload, dict)
                    and isinstance(payload.get("title"), str)
                    and isinstance(payload.get("description"), str)
                ):
                    summary = {"title": payload["title"], "description": payload["description"]}
            if summary is None:
                # Lenient fallback for responses that are almost-JSON.
                match = re.search(
                    r'\{[\s\S]*"title":\s*"([^"]+)"[\s\S]*"description":\s*"([^"]+)"[\s\S]*\}',
                    raw_content,
                )
                if match:
                    summary = {"title": match.group(1), "description": match.group(2)}
            if summary is None:
                logging.warning(
                    "Failed to parse Gemini response: Invalid JSON format in Gemini response. Skipping chunk."
                )
                continue

            # Only the first valid summary is used, so stop here instead of
            # spending another API call on the next chunk.
            result = summary
            break

        if result is None:
            logging.warning("No valid summaries generated. Using fallback.")
            result = {"title": "News Summary", "description": fallback_description}

        # Guard against blank strings coming back from the model.
        if not result["title"].strip():
            result["title"] = "News Summary"
        if not result["description"].strip():
            result["description"] = fallback_description
        logging.info(f"Summary - Title: {result['title']}, Description: {result['description']}")
    except Exception as e:
        # Top-level boundary: callers always get a usable summary back.
        logging.error(f"Error summarizing text: {e}")
        result = {"title": "News Summary", "description": fallback_description}

    cache[cache_key] = result
    return result
async def quick_summarize(text: str, url: str = "") -> Dict[str, str]:
    """Produce a quick RCS title and description from the start of ``text``.

    Only the first 1000 characters are sent, with a shorter prompt than
    ``summarize_text``. The request goes to the ``gemini-1.5-pro`` model. The
    result (including the generic fallback) is stored in the module-level
    ``cache`` under a key derived from the stripped text and the URL, so a
    failing input is only retried once its cache entry has expired.

    Args:
        text: Source text. ``None``, non-string or very short (< 20 chars)
            input is replaced by a generic sentence built from ``url``.
        url: Page the text came from. It is quoted in the prompt and its host
            is used in the fallback description. An empty value is replaced
            by ``https://example.com``.

    Returns:
        A dict with ``"title"`` and ``"description"`` string values. Errors
        while summarizing (missing ``GEMINI_API_KEY``, a failing API call, an
        unusable response) are logged and answered with the generic fallback
        summary instead of being propagated.
    """
    # Everything the error path needs (domain, fallback text, cache key) is
    # computed up front so the ``except`` branch can never hit an unbound name.
    text = text.strip() if isinstance(text, str) else ""
    if not isinstance(url, str) or not url:
        url = "https://example.com"
    try:
        domain = urlparse(url).netloc or "example.com"
    except ValueError:
        # urlparse rejects some malformed hosts, e.g. an unclosed "[" bracket.
        logging.warning(f"Invalid URL: {url}. Using default domain.")
        domain = "example.com"
    fallback_description = f"Check out content from {domain}."

    # Hash the (text, url) pair rather than the concatenation so that
    # ("ab", "c") and ("a", "bc") cannot share a key.
    cache_key = f"quick_summarize_{hash((text, url))}"
    # Single lookup: an entry cannot expire between a membership test and the
    # read that follows it.
    cached = cache.get(cache_key)
    if cached is not None:
        logging.info(f"Cache hit for {cache_key}")
        return cached

    try:
        api_key = os.getenv("GEMINI_API_KEY")
        if not api_key:
            logging.error("Gemini API key not found. Please set GEMINI_API_KEY in .env file.")
            raise ValueError("Gemini API key is required for summarization.")

        genai.configure(api_key=api_key)
        # NOTE(review): the previous docstring said "Gemini-1.5 Flash" but the
        # code has always requested the pro model; confirm which is intended.
        model = genai.GenerativeModel('gemini-1.5-pro')

        # Too little content to summarize: fall back to a URL-based sentence.
        if len(text) < 20:
            logging.warning(f"Text too short ({len(text)} chars): '{text}'. Using URL context.")
            text = f"Content from {url} about news, products, or services."

        # Quick mode looks at the first 1000 characters only.
        chunk = text[:1000]
        prompt = (
            f"Create a title (30-50 chars) and description (80-100 chars) for RCS messaging from this text. "
            f"Keep it engaging and relevant to {url}. Output as JSON:\n{{\"title\": \"[title]\", \"description\": \"[description]\"}}\n\nText: {chunk}"
        )
        response = await model.generate_content_async(prompt)
        raw_content = response.text.strip()
        logging.info(f"Raw Gemini response (quick): {raw_content}")

        # Prefer real JSON decoding: it copes with escaped quotes and with
        # either key order, both of which the regex fallback gets wrong.
        result = None
        first_brace, last_brace = raw_content.find("{"), raw_content.rfind("}")
        if 0 <= first_brace < last_brace:
            try:
                payload = json.loads(raw_content[first_brace:last_brace + 1])
            except ValueError:
                payload = None
            if (
                isinstance(payload, dict)
                and isinstance(payload.get("title"), str)
                and isinstance(payload.get("description"), str)
            ):
                result = {"title": payload["title"], "description": payload["description"]}
        if result is None:
            # Lenient fallback for responses that are almost-JSON.
            match = re.search(
                r'\{[\s\S]*"title":\s*"([^"]+)"[\s\S]*"description":\s*"([^"]+)"[\s\S]*\}',
                raw_content,
            )
            if match:
                result = {"title": match.group(1), "description": match.group(2)}

        if result is None:
            logging.warning(
                "Failed to parse Gemini response: Invalid JSON format in Gemini response. Skipping chunk."
            )
            logging.warning("No valid summaries generated. Using fallback.")
            result = {"title": "Quick Summary", "description": fallback_description}

        # Guard against blank strings coming back from the model.
        if not result["title"].strip():
            result["title"] = "Quick Summary"
        if not result["description"].strip():
            result["description"] = fallback_description
        logging.info(f"Quick summary - Title: {result['title']}, Description: {result['description']}")
    except Exception as e:
        # Top-level boundary: callers always get a usable summary back.
        logging.error(f"Error in quick summarize: {e}")
        result = {"title": "Quick Summary", "description": fallback_description}

    cache[cache_key] = result
    return result