Spaces:

AiDeveloper1
/

RCS

Sleeping

RCS

File size: 8,377 Bytes

import json
import logging
import os
import re
from typing import Dict, Optional
from urllib.parse import urlparse

import google.generativeai as genai
from cachetools import TTLCache
from dotenv import load_dotenv

# Pull settings such as GEMINI_API_KEY from a local .env file into os.environ.
load_dotenv()

# Root logger: INFO and above, each record prefixed with timestamp and level.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

# Shared in-memory result cache: at most 1000 entries, each kept for one hour.
cache = TTLCache(ttl=3600, maxsize=1000)

async def summarize_text(text: str, url: str = "") -> Dict[str, str]:
    """Summarize text into an RCS title and description using Gemini 1.5 Flash.

    The text is cut into 1000-character chunks and at most the first two are
    sent to the model, one request per chunk. The first response that can be
    parsed wins and no further requests are made. Results produced on the
    normal path (including the generic fallback used when no response could be
    parsed) are stored in the module-level ``cache``.

    Args:
        text: Source content. ``None``, empty, or fewer than 20 characters
            (after stripping) is replaced by a generic sentence built from
            ``url``.
        url: Page the text came from. Defaults to ``https://example.com`` when
            empty; its host name is used in fallback descriptions.

    Returns:
        A dict with non-empty ``"title"`` and ``"description"`` strings.
        Errors (missing ``GEMINI_API_KEY``, API failure, unusable input) are
        logged and answered with a generic fallback instead of being raised.
        That error fallback is deliberately not cached, so a later call with
        the same input can retry.
    """
    chunk_size = 1000  # characters sent to the model per request
    max_chunks = 2  # only the first 2000 characters are ever considered

    # Bound before the try block so the error handler can always build a
    # fallback, even when the failure happens during input validation.
    domain = "example.com"

    try:
        # Validate inputs
        text = text.strip() if text else ""
        if not url:
            url = "https://example.com"
        try:
            domain = urlparse(url).netloc or "example.com"
        except Exception:
            logging.warning(f"Invalid URL: {url}. Using default domain.")
            domain = "example.com"
        fallback_description = f"Discover news and insights from {domain}."[:100]

        # Check cache. The (text, url) pair is hashed as a tuple so that two
        # different pairs cannot collapse into the same concatenated string.
        cache_key = f"summarize_{hash((text, url))}"
        cached = cache.get(cache_key)
        if cached is not None:
            logging.info(f"Cache hit for {cache_key}")
            return cached

        # Get Gemini API key
        api_key = os.getenv("GEMINI_API_KEY")
        if not api_key:
            logging.error("Gemini API key not found. Please set GEMINI_API_KEY in .env file.")
            raise ValueError("Gemini API key is required for summarization.")

        # Configure Gemini client
        genai.configure(api_key=api_key)
        model = genai.GenerativeModel('gemini-1.5-flash')

        # Handle short or empty text
        if len(text) < 20:
            logging.warning(f"Text too short ({len(text)} chars): '{text}'. Using URL context.")
            text = f"Content from {url} about news, products, or services."

        result = None
        for start in range(0, min(len(text), max_chunks * chunk_size), chunk_size):
            chunk = text[start:start + chunk_size]
            prompt = (
                f"Summarize the following text into a title (30-50 characters) and a description (80-100 characters) "
                f"for RCS messaging. Ensure titles are catchy and descriptions are engaging, relevant to the content, "
                f"and suitable for a news, product, or service context inferred from the URL ({url}). "
                f"Output as JSON:\n{{\"title\": \"[title]\", \"description\": \"[description]\"}}\n\nText: {chunk}"
            )

            response = await model.generate_content_async(prompt)
            raw_content = response.text.strip()
            logging.info(f"Raw Gemini response: {raw_content}")

            parsed = None

            # Preferred path: decode the outermost {...} as real JSON, which
            # copes with escaped quotes and with keys in any order.
            json_match = re.search(r'\{[\s\S]*\}', raw_content)
            if json_match:
                try:
                    payload = json.loads(json_match.group(0))
                except ValueError:
                    payload = None
                if isinstance(payload, dict):
                    title = payload.get("title")
                    description = payload.get("description")
                    if isinstance(title, str) and isinstance(description, str) and title and description:
                        parsed = {"title": title, "description": description}

            # Lenient path for almost-JSON output that json.loads rejects.
            if parsed is None:
                match = re.search(r'\{[\s\S]*"title":\s*"([^"]+)"[\s\S]*"description":\s*"([^"]+)"[\s\S]*\}', raw_content)
                if match:
                    parsed = {"title": match.group(1), "description": match.group(2)}

            if parsed is None:
                logging.warning("Failed to parse Gemini response: Invalid JSON format in Gemini response. Skipping chunk.")
                continue

            # Only the first valid summary is ever used, so stop here rather
            # than spending another request on the next chunk.
            result = parsed
            break

        if result is None:
            logging.warning("No valid summaries generated. Using fallback.")
            result = {
                "title": "News Summary",
                "description": fallback_description
            }

        # Ensure non-empty outputs
        if not result["title"].strip():
            result["title"] = "News Summary"
        if not result["description"].strip():
            result["description"] = fallback_description

        cache[cache_key] = result
        logging.info(f"Summary - Title: {result['title']}, Description: {result['description']}")
        return result

    except Exception as e:
        logging.error(f"Error summarizing text: {e}")
        # Reuse the domain resolved above instead of re-parsing the URL (which
        # can raise again), and do not cache: the failure may be transient.
        return {
            "title": "News Summary",
            "description": f"Discover news and insights from {domain}."[:100]
        }

async def quick_summarize(text: str, url: str = "") -> Dict[str, str]:
    """Produce an RCS title and description from a single short Gemini request.

    Only the first 1000 characters of ``text`` are sent to the model, using a
    shorter prompt than ``summarize_text``. The request goes to the
    ``gemini-1.5-pro`` model. Results produced on the normal path (including
    the generic fallback used when the response could not be parsed) are
    stored in the module-level ``cache``.

    Args:
        text: Source content. ``None``, empty, or fewer than 20 characters
            (after stripping) is replaced by a generic sentence built from
            ``url``.
        url: Page the text came from. Defaults to ``https://example.com`` when
            empty; its host name is used in fallback descriptions.

    Returns:
        A dict with non-empty ``"title"`` and ``"description"`` strings.
        Errors (missing ``GEMINI_API_KEY``, API failure, unusable input) are
        logged and answered with a generic fallback instead of being raised.
        That error fallback is deliberately not cached, so a later call with
        the same input can retry.
    """
    chunk_size = 1000  # only this many leading characters are summarized

    # Bound before the try block so the error handler can always build a
    # fallback, even when the failure happens during input validation.
    domain = "example.com"

    try:
        # Validate inputs
        text = text.strip() if text else ""
        if not url:
            url = "https://example.com"
        try:
            domain = urlparse(url).netloc or "example.com"
        except Exception:
            logging.warning(f"Invalid URL: {url}. Using default domain.")
            domain = "example.com"

        # Check cache. The (text, url) pair is hashed as a tuple so that two
        # different pairs cannot collapse into the same concatenated string.
        cache_key = f"quick_summarize_{hash((text, url))}"
        cached = cache.get(cache_key)
        if cached is not None:
            logging.info(f"Cache hit for {cache_key}")
            return cached

        # Get Gemini API key
        api_key = os.getenv("GEMINI_API_KEY")
        if not api_key:
            logging.error("Gemini API key not found. Please set GEMINI_API_KEY in .env file.")
            raise ValueError("Gemini API key is required for summarization.")

        # Configure Gemini client
        genai.configure(api_key=api_key)
        # NOTE(review): the previous docstring said "Gemini-1.5 Flash" but the
        # code has always requested the Pro model; confirm which is intended.
        model = genai.GenerativeModel('gemini-1.5-pro')

        # Handle short or empty text
        if len(text) < 20:
            logging.warning(f"Text too short ({len(text)} chars): '{text}'. Using URL context.")
            text = f"Content from {url} about news, products, or services."

        # Lightweight prompt over the leading chunk only
        chunk = text[:chunk_size]
        prompt = (
            f"Create a title (30-50 chars) and description (80-100 chars) for RCS messaging from this text. "
            f"Keep it engaging and relevant to {url}. Output as JSON:\n{{\"title\": \"[title]\", \"description\": \"[description]\"}}\n\nText: {chunk}"
        )

        response = await model.generate_content_async(prompt)
        raw_content = response.text.strip()
        logging.info(f"Raw Gemini response (quick): {raw_content}")

        result = None

        # Preferred path: decode the outermost {...} as real JSON, which copes
        # with escaped quotes and with keys in any order.
        json_match = re.search(r'\{[\s\S]*\}', raw_content)
        if json_match:
            try:
                payload = json.loads(json_match.group(0))
            except ValueError:
                payload = None
            if isinstance(payload, dict):
                title = payload.get("title")
                description = payload.get("description")
                if isinstance(title, str) and isinstance(description, str) and title and description:
                    result = {"title": title, "description": description}

        # Lenient path for almost-JSON output that json.loads rejects.
        if result is None:
            match = re.search(r'\{[\s\S]*"title":\s*"([^"]+)"[\s\S]*"description":\s*"([^"]+)"[\s\S]*\}', raw_content)
            if match:
                result = {"title": match.group(1), "description": match.group(2)}

        if result is None:
            logging.warning("Failed to parse Gemini response: Invalid JSON format in Gemini response. Skipping chunk.")
            logging.warning("No valid summaries generated. Using fallback.")
            result = {
                "title": "Quick Summary",
                "description": f"Check out content from {domain}."
            }

        # Ensure non-empty outputs
        if not result["title"].strip():
            result["title"] = "Quick Summary"
        if not result["description"].strip():
            result["description"] = f"Check out content from {domain}."

        cache[cache_key] = result
        logging.info(f"Quick summary - Title: {result['title']}, Description: {result['description']}")
        return result

    except Exception as e:
        logging.error(f"Error in quick summarize: {e}")
        # Reuse the domain resolved above instead of re-parsing the URL (which
        # can raise again), and do not cache: the failure may be transient.
        return {
            "title": "Quick Summary",
            "description": f"Check out content from {domain}."
        }