# Page header captured from the hosting site's file viewer (not part of the program):
#   Spaces: Sleeping
#   File size: 8,377 Bytes
#   5c7501f 95f63e4
#   (line-number gutter 1-205 omitted)
import os
import re
from typing import Dict, Optional
import google.generativeai as genai
import logging
from dotenv import load_dotenv
from urllib.parse import urlparse
from cachetools import TTLCache
# Pull settings such as GEMINI_API_KEY from a local .env file into the environment.
load_dotenv()

# Root logger configuration: INFO and above, prefixed with timestamp and level.
_LOG_FORMAT = '%(asctime)s - %(levelname)s - %(message)s'
logging.basicConfig(format=_LOG_FORMAT, level=logging.INFO)

# Shared in-memory result cache used by the summarizer functions in this module.
_CACHE_MAX_ITEMS = 1000
_CACHE_TTL_SECONDS = 3600  # one hour
cache = TTLCache(maxsize=_CACHE_MAX_ITEMS, ttl=_CACHE_TTL_SECONDS)
async def summarize_text(text: str, url: str = "") -> Dict[str, str]:
    """Summarize text into an RCS title and description using 'gemini-1.5-flash'.

    At most the first two 1000-character chunks of ``text`` are considered.
    Chunks are sent to the model one at a time and the first response that
    parses into a title/description pair is used; later chunks are only
    queried when earlier ones produced nothing usable. The outcome (including
    a generic fallback) is stored in the module-level ``cache``.

    Args:
        text: Source text. After stripping, text shorter than 20 characters
            is replaced by a generic sentence built from ``url``.
        url: Page the text came from; ``https://example.com`` is used when
            empty. Its network location appears in fallback descriptions.

    Returns:
        A dict with ``"title"`` and ``"description"`` keys. Failures
        (missing ``GEMINI_API_KEY``, API errors, unparsable output) are
        logged and turned into a generic "News Summary" result instead of
        being raised.
    """
    # Bound before the try block so the error handler can always rely on them.
    domain = "example.com"
    cache_key = None  # stays None if we fail before a key can be built
    try:
        # Resolve the URL first: the fallback description needs the domain
        # even when validating ``text`` fails.
        if not url:
            url = "https://example.com"
        try:
            domain = urlparse(url).netloc or "example.com"
        except Exception:
            logging.warning(f"Invalid URL: {url}. Using default domain.")
            domain = "example.com"

        text = text.strip() if text else ""

        # Hash the pair rather than the concatenation so that different
        # (text, url) splits of the same characters get different keys.
        cache_key = f"summarize_{hash((text, url))}"
        try:
            cached = cache[cache_key]
        except KeyError:
            cached = None
        if cached is not None:
            logging.info(f"Cache hit for {cache_key}")
            return cached

        # Get Gemini API key
        api_key = os.getenv("GEMINI_API_KEY")
        if not api_key:
            logging.error("Gemini API key not found. Please set GEMINI_API_KEY in .env file.")
            raise ValueError("Gemini API key is required for summarization.")

        # Configure Gemini client
        genai.configure(api_key=api_key)
        model = genai.GenerativeModel('gemini-1.5-flash')

        # Handle short or empty text
        if len(text) < 20:
            logging.warning(f"Text too short ({len(text)} chars): '{text}'. Using URL context.")
            text = f"Content from {url} about news, products, or services."

        # Only the first 2000 characters are used, in 1000-character chunks.
        chunk_size = 1000
        max_chunks = 2
        summary_pattern = r'\{[\s\S]*"title":\s*"([^"]+)"[\s\S]*"description":\s*"([^"]+)"[\s\S]*\}'

        result = None
        for start in range(0, min(len(text), chunk_size * max_chunks), chunk_size):
            chunk = text[start:start + chunk_size]
            prompt = (
                f"Summarize the following text into a title (30-50 characters) and a description (80-100 characters) "
                f"for RCS messaging. Ensure titles are catchy and descriptions are engaging, relevant to the content, "
                f"and suitable for a news, product, or service context inferred from the URL ({url}). "
                f"Output as JSON:\n{{\"title\": \"[title]\", \"description\": \"[description]\"}}\n\nText: {chunk}"
            )
            response = await model.generate_content_async(prompt)
            raw_content = response.text.strip()
            logging.info(f"Raw Gemini response: {raw_content}")

            match = re.search(summary_pattern, raw_content)
            if match:
                result = {"title": match.group(1), "description": match.group(2)}
                # Only the first valid summary is ever returned, so stop here
                # instead of spending another request on the next chunk (whose
                # failure would otherwise discard this valid result).
                break
            logging.warning(
                "Failed to parse Gemini response: Invalid JSON format in Gemini response. Skipping chunk."
            )

        if result is None:
            logging.warning("No valid summaries generated. Using fallback.")
            result = {
                "title": "News Summary",
                "description": f"Discover news and insights from {domain}."[:100]
            }

        # Ensure non-empty outputs
        if not result["title"].strip():
            result["title"] = "News Summary"
        if not result["description"].strip():
            result["description"] = f"Discover news and insights from {domain}."[:100]

        cache[cache_key] = result
        logging.info(f"Summary - Title: {result['title']}, Description: {result['description']}")
        return result
    except Exception as e:
        logging.error(f"Error summarizing text: {e}")
        result = {
            "title": "News Summary",
            "description": f"Discover news and insights from {domain}."[:100]
        }
        # A key only exists if input validation got far enough to build one.
        if cache_key is not None:
            cache[cache_key] = result
        return result
async def quick_summarize(text: str, url: str = "") -> Dict[str, str]:
    """Produce an RCS title and description from the first 1000 characters of text.

    A single short prompt is sent to the 'gemini-1.5-pro' model and the
    response is parsed for a title/description pair. The outcome (including
    a generic fallback) is stored in the module-level ``cache``.

    Args:
        text: Source text. After stripping, text shorter than 20 characters
            is replaced by a generic sentence built from ``url``.
        url: Page the text came from; ``https://example.com`` is used when
            empty. Its network location appears in fallback descriptions.

    Returns:
        A dict with ``"title"`` and ``"description"`` keys. Failures
        (missing ``GEMINI_API_KEY``, API errors, unparsable output) are
        logged and turned into a generic "Quick Summary" result instead of
        being raised.
    """
    # Bound before the try block so the error handler can always rely on them.
    domain = "example.com"
    cache_key = None  # stays None if we fail before a key can be built
    try:
        # Resolve the URL first: the fallback description needs the domain
        # even when validating ``text`` fails.
        if not url:
            url = "https://example.com"
        try:
            domain = urlparse(url).netloc or "example.com"
        except Exception:
            logging.warning(f"Invalid URL: {url}. Using default domain.")
            domain = "example.com"

        text = text.strip() if text else ""

        # Hash the pair rather than the concatenation so that different
        # (text, url) splits of the same characters get different keys.
        cache_key = f"quick_summarize_{hash((text, url))}"
        try:
            cached = cache[cache_key]
        except KeyError:
            cached = None
        if cached is not None:
            logging.info(f"Cache hit for {cache_key}")
            return cached

        # Get Gemini API key
        api_key = os.getenv("GEMINI_API_KEY")
        if not api_key:
            logging.error("Gemini API key not found. Please set GEMINI_API_KEY in .env file.")
            raise ValueError("Gemini API key is required for summarization.")

        # Configure Gemini client
        # NOTE(review): the name "quick" suggests a lighter model may have been
        # intended here; the model identifier is left exactly as found.
        genai.configure(api_key=api_key)
        model = genai.GenerativeModel('gemini-1.5-pro')

        # Handle short or empty text
        if len(text) < 20:
            logging.warning(f"Text too short ({len(text)} chars): '{text}'. Using URL context.")
            text = f"Content from {url} about news, products, or services."

        # Quick mode looks at the first 1000 characters only (one request).
        chunk_size = 1000
        chunk = text[:chunk_size]
        prompt = (
            f"Create a title (30-50 chars) and description (80-100 chars) for RCS messaging from this text. "
            f"Keep it engaging and relevant to {url}. Output as JSON:\n{{\"title\": \"[title]\", \"description\": \"[description]\"}}\n\nText: {chunk}"
        )
        response = await model.generate_content_async(prompt)
        raw_content = response.text.strip()
        logging.info(f"Raw Gemini response (quick): {raw_content}")

        match = re.search(
            r'\{[\s\S]*"title":\s*"([^"]+)"[\s\S]*"description":\s*"([^"]+)"[\s\S]*\}',
            raw_content,
        )
        if match:
            result = {"title": match.group(1), "description": match.group(2)}
        else:
            logging.warning(
                "Failed to parse Gemini response: Invalid JSON format in Gemini response. Skipping chunk."
            )
            logging.warning("No valid summaries generated. Using fallback.")
            result = {
                "title": "Quick Summary",
                "description": f"Check out content from {domain}."
            }

        # Ensure non-empty outputs
        if not result["title"].strip():
            result["title"] = "Quick Summary"
        if not result["description"].strip():
            result["description"] = f"Check out content from {domain}."

        cache[cache_key] = result
        logging.info(f"Quick summary - Title: {result['title']}, Description: {result['description']}")
        return result
    except Exception as e:
        logging.error(f"Error in quick summarize: {e}")
        result = {
            "title": "Quick Summary",
            "description": f"Check out content from {domain}."
        }
        # A key only exists if input validation got far enough to build one.
        if cache_key is not None:
            cache[cache_key] = result
        return result