Spaces:

AiDeveloper1
/

RCS

Sleeping

App Files Files Community

RCS / summarizer.py

AiDeveloper1

Update summarizer.py

5c7501f verified 6 months ago

raw

history blame contribute delete

8.38 kB

	import json
	import logging
	import os
	import re
	from typing import Dict, Optional
	from urllib.parse import urlparse

	import google.generativeai as genai
	from cachetools import TTLCache
	from dotenv import load_dotenv

	# Read the local .env file so settings such as GEMINI_API_KEY are visible
	# through os.getenv() below.
	load_dotenv()

	# Root logger: INFO and above, timestamped "<time> - <level> - <message>" lines.
	logging.basicConfig(
	    format='%(asctime)s - %(levelname)s - %(message)s',
	    level=logging.INFO,
	)

	# Shared in-memory result cache: at most 1000 entries, each kept for
	# 3600 seconds (one hour).
	cache = TTLCache(ttl=3600, maxsize=1000)

	# Number of input characters sent to the model per request.
	_CHUNK_SIZE = 1000

	# Fallback descriptions are clipped to the 100-character upper bound that the
	# prompts request for descriptions.
	_MAX_DESCRIPTION_CHARS = 100

	# Lenient pattern used only when strict JSON decoding fails. It tolerates any
	# amount of whitespace or surrounding text between the tokens (pretty-printed
	# JSON, Markdown code fences, prose around the object).
	_SUMMARY_OBJECT_RE = re.compile(
	    r'\{[\s\S]*?"title"\s*:\s*"([^"]+)"[\s\S]*?"description"\s*:\s*"([^"]+)"[\s\S]*?\}'
	)


	def _domain_from_url(url: str) -> str:
	    """Return the network location of *url*, or ``example.com`` if it has none."""
	    try:
	        return urlparse(url).netloc or "example.com"
	    except Exception:
	        # Best effort: a malformed URL must never prevent a summary from
	        # being returned.
	        logging.warning(f"Invalid URL: {url}. Using default domain.")
	        return "example.com"


	def _fallback_summary(title: str, description: str) -> Dict[str, str]:
	    """Build a generic summary, clipping the description to the allowed length."""
	    return {"title": title, "description": description[:_MAX_DESCRIPTION_CHARS]}


	def _parse_summary(raw_content: str) -> Optional[Dict[str, str]]:
	    """Extract ``title`` and ``description`` from a model reply.

	    The outermost ``{...}`` span is decoded as JSON first, which copes with
	    arbitrary whitespace and escaped quotes. If that does not produce two
	    string fields, a lenient regular expression is tried instead.

	    Returns:
	        A dict with ``title`` and ``description`` keys, or ``None`` when the
	        reply contains no usable summary.
	    """
	    start, end = raw_content.find("{"), raw_content.rfind("}")
	    if start != -1 and end > start:
	        try:
	            payload = json.loads(raw_content[start:end + 1])
	        except ValueError:  # json.JSONDecodeError is a ValueError subclass
	            payload = None
	        if isinstance(payload, dict):
	            title = payload.get("title")
	            description = payload.get("description")
	            if isinstance(title, str) and isinstance(description, str):
	                return {"title": title, "description": description}

	    match = _SUMMARY_OBJECT_RE.search(raw_content)
	    if match:
	        return {"title": match.group(1), "description": match.group(2)}
	    return None


	def _gemini_model(model_name: str):
	    """Configure the Gemini client from ``GEMINI_API_KEY`` and return the model.

	    Raises:
	        ValueError: If ``GEMINI_API_KEY`` is not set.
	    """
	    api_key = os.getenv("GEMINI_API_KEY")
	    if not api_key:
	        logging.error("Gemini API key not found. Please set GEMINI_API_KEY in .env file.")
	        raise ValueError("Gemini API key is required for summarization.")
	    genai.configure(api_key=api_key)
	    return genai.GenerativeModel(model_name)


	def _text_or_url_context(text: str, url: str) -> str:
	    """Return *text*, or a URL-based placeholder when it is under 20 characters."""
	    if len(text) < 20:
	        logging.warning(f"Text too short ({len(text)} chars): '{text}'. Using URL context.")
	        return f"Content from {url} about news, products, or services."
	    return text


	async def _first_valid_summary(model, text: str, max_chunks: int, build_prompt, log_label: str):
	    """Query *model* chunk by chunk and return the first parseable summary.

	    Only the first ``max_chunks`` chunks of ``_CHUNK_SIZE`` characters are
	    considered. The loop stops at the first reply that parses, so no further
	    requests are sent once a usable summary exists.

	    Args:
	        model: Object exposing ``generate_content_async(prompt)``.
	        text: Full input text.
	        max_chunks: Maximum number of chunks (and therefore requests) to try.
	        build_prompt: Callable mapping a chunk to the prompt string.
	        log_label: Prefix used when logging the raw model reply.

	    Returns:
	        The parsed summary dict, or ``None`` if no chunk yielded one.
	    """
	    span = max_chunks * _CHUNK_SIZE
	    for start in range(0, min(len(text), span), _CHUNK_SIZE):
	        chunk = text[start:start + _CHUNK_SIZE]
	        response = await model.generate_content_async(build_prompt(chunk))
	        raw_content = response.text.strip()
	        logging.info(f"{log_label}: {raw_content}")

	        summary = _parse_summary(raw_content)
	        if summary is not None:
	            return summary
	        logging.warning(
	            "Failed to parse Gemini response: Invalid JSON format in Gemini response. "
	            "Skipping chunk."
	        )
	    return None


	def _merge_with_fallback(summary: Optional[Dict[str, str]], fallback: Dict[str, str]) -> Dict[str, str]:
	    """Return *summary* with blank fields replaced, or *fallback* if it is ``None``."""
	    if summary is None:
	        logging.warning("No valid summaries generated. Using fallback.")
	        return dict(fallback)
	    return {
	        key: summary[key] if summary[key].strip() else fallback[key]
	        for key in ("title", "description")
	    }


	async def summarize_text(text: Optional[str], url: str = "") -> Dict[str, str]:
	    """Summarize text into a title and description using Gemini-1.5 Flash.

	    At most the first two 1000-character chunks of *text* are sent to the
	    model, and the first reply that parses is used. Results, including
	    fallbacks produced after an error, are stored in the module-level ``cache``.

	    Args:
	        text: Content to summarize. ``None``, empty or very short text is
	            replaced by a generic placeholder built from *url*.
	        url: Source URL used for prompt context and for the fallback
	            description. Defaults to ``https://example.com`` when empty.

	    Returns:
	        A dict with non-empty ``title`` and ``description`` strings. Exceptions
	        raised while summarizing (missing API key, API errors, ...) are logged
	        and turned into a generic "News Summary" result rather than propagated.
	    """
	    # Assigned before the try block so the error handler can tell whether a
	    # key was computed before the failure.
	    cache_key = None
	    try:
	        # Validate inputs
	        text = text.strip() if text else ""
	        if not url:
	            url = "https://example.com"
	        domain = _domain_from_url(url)

	        # Check cache (single lookup: an entry cannot expire between a
	        # membership test and the read)
	        cache_key = f"summarize_{hash(text + url)}"
	        cached = cache.get(cache_key)
	        if cached is not None:
	            logging.info(f"Cache hit for {cache_key}")
	            return cached

	        model = _gemini_model('gemini-1.5-flash')
	        text = _text_or_url_context(text, url)

	        def build_prompt(chunk: str) -> str:
	            return (
	                f"Summarize the following text into a title (30-50 characters) and a description (80-100 characters) "
	                f"for RCS messaging. Ensure titles are catchy and descriptions are engaging, relevant to the content, "
	                f"and suitable for a news, product, or service context inferred from the URL ({url}). "
	                f"Output as JSON:\n{{\"title\": \"[title]\", \"description\": \"[description]\"}}\n\nText: {chunk}"
	            )

	        # Limit to first 2000 chars (two chunks) for efficiency
	        summary = await _first_valid_summary(model, text, 2, build_prompt, "Raw Gemini response")

	        fallback = _fallback_summary("News Summary", f"Discover news and insights from {domain}.")
	        result = _merge_with_fallback(summary, fallback)

	        cache[cache_key] = result
	        logging.info(f"Summary - Title: {result['title']}, Description: {result['description']}")
	        return result

	    except Exception as e:
	        logging.error(f"Error summarizing text: {e}")
	        result = _fallback_summary(
	            "News Summary",
	            f"Discover news and insights from {_domain_from_url(url)}.",
	        )
	        # The failure may have happened before a cache key existed.
	        if cache_key is not None:
	            cache[cache_key] = result
	        return result


	async def quick_summarize(text: Optional[str], url: str = "") -> Dict[str, str]:
	    """Quickly summarize text with a lightweight prompt using Gemini 1.5.

	    Only the first 1000-character chunk of *text* is sent to the model.
	    Results, including fallbacks produced after an error, are stored in the
	    module-level ``cache``.

	    Args:
	        text: Content to summarize. ``None``, empty or very short text is
	            replaced by a generic placeholder built from *url*.
	        url: Source URL used for prompt context and for the fallback
	            description. Defaults to ``https://example.com`` when empty.

	    Returns:
	        A dict with non-empty ``title`` and ``description`` strings. Exceptions
	        raised while summarizing are logged and turned into a generic
	        "Quick Summary" result rather than propagated.
	    """
	    # Assigned before the try block so the error handler can tell whether a
	    # key was computed before the failure.
	    cache_key = None
	    try:
	        # Validate inputs
	        text = text.strip() if text else ""
	        if not url:
	            url = "https://example.com"
	        domain = _domain_from_url(url)

	        # Check cache (single lookup: an entry cannot expire between a
	        # membership test and the read)
	        cache_key = f"quick_summarize_{hash(text + url)}"
	        cached = cache.get(cache_key)
	        if cached is not None:
	            logging.info(f"Cache hit for {cache_key}")
	            return cached

	        # NOTE(review): this "quick" path requests the 'gemini-1.5-pro' model
	        # while summarize_text uses 'gemini-1.5-flash' - confirm this is
	        # intended before changing it.
	        model = _gemini_model('gemini-1.5-pro')
	        text = _text_or_url_context(text, url)

	        def build_prompt(chunk: str) -> str:
	            return (
	                f"Create a title (30-50 chars) and description (80-100 chars) for RCS messaging from this text. "
	                f"Keep it engaging and relevant to {url}. Output as JSON:\n{{\"title\": \"[title]\", \"description\": \"[description]\"}}\n\nText: {chunk}"
	            )

	        # Limit to first 1000 chars (one chunk) for quick summary
	        summary = await _first_valid_summary(model, text, 1, build_prompt, "Raw Gemini response (quick)")

	        fallback = _fallback_summary("Quick Summary", f"Check out content from {domain}.")
	        result = _merge_with_fallback(summary, fallback)

	        cache[cache_key] = result
	        logging.info(f"Quick summary - Title: {result['title']}, Description: {result['description']}")
	        return result

	    except Exception as e:
	        logging.error(f"Error in quick summarize: {e}")
	        result = _fallback_summary(
	            "Quick Summary",
	            f"Check out content from {_domain_from_url(url)}.",
	        )
	        # The failure may have happened before a cache key existed.
	        if cache_key is not None:
	            cache[cache_key] = result
	        return result