File size: 8,377 Bytes
5c7501f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
95f63e4
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
import json
import logging
import os
import re
from typing import Dict, Optional
from urllib.parse import urlparse

import google.generativeai as genai
from cachetools import TTLCache
from dotenv import load_dotenv

# Load environment variables at import time. The summarizers below read
# GEMINI_API_KEY via os.getenv(), so this must run before they are called.
load_dotenv()

# Set up logging on the root logger: INFO and above, formatted as
# "<timestamp> - <level> - <message>".
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# In-memory cache: 1000 items, 1-hour TTL (ttl is given in seconds).
# Shared by summarize_text and quick_summarize; each prefixes its own keys
# ("summarize_" / "quick_summarize_") so their entries do not clash.
cache = TTLCache(maxsize=1000, ttl=3600)

async def summarize_text(text: str, url: str = "") -> Dict[str, str]:
    """Summarize text into a title and description using Gemini-1.5 Flash.

    At most the first 2000 characters of ``text`` are considered, in chunks
    of 1000 characters. The first chunk whose response can be parsed wins and
    no further API calls are made. The result (including any fallback) is
    stored in the module-level ``cache`` under a key derived from the text
    and the URL.

    Args:
        text: Source content. Surrounding whitespace is stripped; when fewer
            than 20 characters remain, a generic sentence built from ``url``
            is summarized instead.
        url: Page the text came from. Used as prompt context and to derive
            the domain shown in fallback descriptions. An empty value is
            replaced by ``https://example.com``.

    Returns:
        A dict with non-empty ``"title"`` and ``"description"`` strings. When
        the API key is missing, the request fails, or no response can be
        parsed, a generic "News Summary" fallback mentioning the domain is
        returned; errors are logged rather than propagated.
    """
    # Bound before the try-block so the error handler can always build a
    # fallback, even when input validation itself is what failed.
    domain = "example.com"
    cache_key: Optional[str] = None

    try:
        # Validate inputs
        text = text.strip() if text else ""
        if not url:
            url = "https://example.com"
        try:
            domain = urlparse(url).netloc or "example.com"
        except (ValueError, TypeError, AttributeError):
            # e.g. a malformed IPv6 literal, or a non-string URL
            logging.warning(f"Invalid URL: {url}. Using default domain.")
            domain = "example.com"

        # Check cache. EAFP lookup: a TTL entry may expire between a
        # membership test and the read, which would surface as a KeyError.
        cache_key = f"summarize_{hash(text + url)}"
        try:
            cached = cache[cache_key]
        except KeyError:
            pass
        else:
            logging.info(f"Cache hit for {cache_key}")
            return cached

        # Get Gemini API key
        api_key = os.getenv("GEMINI_API_KEY")
        if not api_key:
            logging.error("Gemini API key not found. Please set GEMINI_API_KEY in .env file.")
            raise ValueError("Gemini API key is required for summarization.")

        # Configure Gemini client
        genai.configure(api_key=api_key)
        model = genai.GenerativeModel('gemini-1.5-flash')

        # Handle short or empty text
        if len(text) < 20:
            logging.warning(f"Text too short ({len(text)} chars): '{text}'. Using URL context.")
            text = f"Content from {url} about news, products, or services."

        # Only the first `max_chunks` chunks are ever sent (quota/efficiency),
        # so slice them lazily instead of chunking the whole text up front.
        chunk_size = 1000
        max_chunks = 2
        result: Optional[Dict[str, str]] = None

        for start in range(0, min(len(text), chunk_size * max_chunks), chunk_size):
            chunk = text[start:start + chunk_size]
            prompt = (
                f"Summarize the following text into a title (30-50 characters) and a description (80-100 characters) "
                f"for RCS messaging. Ensure titles are catchy and descriptions are engaging, relevant to the content, "
                f"and suitable for a news, product, or service context inferred from the URL ({url}). "
                f"Output as JSON:\n{{\"title\": \"[title]\", \"description\": \"[description]\"}}\n\nText: {chunk}"
            )

            response = await model.generate_content_async(prompt)
            raw_content = response.text.strip()
            logging.info(f"Raw Gemini response: {raw_content}")

            # Prefer a real JSON parse of the outermost {...} span: it copes
            # with escaped quotes and any key order. Fall back to the regex
            # for responses that are only approximately JSON.
            title = description = None
            try:
                payload = json.loads(raw_content[raw_content.index("{"):raw_content.rindex("}") + 1])
                title, description = payload["title"], payload["description"]
            except (ValueError, KeyError, TypeError):
                match = re.search(r'\{[\s\S]*"title":\s*"([^"]+)"[\s\S]*"description":\s*"([^"]+)"[\s\S]*\}', raw_content)
                if match:
                    title, description = match.groups()

            if isinstance(title, str) and title and isinstance(description, str) and description:
                result = {"title": title, "description": description}
                # Only the first valid summary is used; stop here rather than
                # paying for another API call whose result would be dropped.
                break

            logging.warning("Failed to parse Gemini response: Invalid JSON format in Gemini response. Skipping chunk.")

        fallback_description = f"Discover news and insights from {domain}."[:100]

        if result is None:
            logging.warning("No valid summaries generated. Using fallback.")
            result = {"title": "News Summary", "description": fallback_description}

        # Ensure non-empty outputs
        if not result["title"].strip():
            result["title"] = "News Summary"
        if not result["description"].strip():
            result["description"] = fallback_description

        cache[cache_key] = result
        logging.info(f"Summary - Title: {result['title']}, Description: {result['description']}")
        return result

    except Exception as e:
        logging.error(f"Error summarizing text: {e}")
        result = {
            "title": "News Summary",
            "description": f"Discover news and insights from {domain}."[:100]
        }
        # The fallback is cached (as before) so a failing input is not retried
        # on every call, but only if a key could be computed at all.
        if cache_key is not None:
            cache[cache_key] = result
        return result

async def quick_summarize(text: str, url: str = "") -> Dict[str, str]:
    """Quickly summarize text with a lightweight prompt using Gemini-1.5 Pro.

    Only the first 1000 characters of ``text`` are sent, in a single request.
    The result (including any fallback) is stored in the module-level
    ``cache`` under a key derived from the text and the URL.

    NOTE(review): the model requested is 'gemini-1.5-pro', although this is
    the "quick" path and its docstring previously said Flash. Confirm which
    model is intended.

    Args:
        text: Source content. Surrounding whitespace is stripped; when fewer
            than 20 characters remain, a generic sentence built from ``url``
            is summarized instead.
        url: Page the text came from. Used as prompt context and to derive
            the domain shown in fallback descriptions. An empty value is
            replaced by ``https://example.com``.

    Returns:
        A dict with non-empty ``"title"`` and ``"description"`` strings. When
        the API key is missing, the request fails, or the response cannot be
        parsed, a generic "Quick Summary" fallback mentioning the domain is
        returned; errors are logged rather than propagated.
    """
    # Bound before the try-block so the error handler can always build a
    # fallback, even when input validation itself is what failed.
    domain = "example.com"
    cache_key: Optional[str] = None

    try:
        # Validate inputs
        text = text.strip() if text else ""
        if not url:
            url = "https://example.com"
        try:
            domain = urlparse(url).netloc or "example.com"
        except (ValueError, TypeError, AttributeError):
            # e.g. a malformed IPv6 literal, or a non-string URL
            logging.warning(f"Invalid URL: {url}. Using default domain.")
            domain = "example.com"

        # Check cache. EAFP lookup: a TTL entry may expire between a
        # membership test and the read, which would surface as a KeyError.
        cache_key = f"quick_summarize_{hash(text + url)}"
        try:
            cached = cache[cache_key]
        except KeyError:
            pass
        else:
            logging.info(f"Cache hit for {cache_key}")
            return cached

        # Get Gemini API key
        api_key = os.getenv("GEMINI_API_KEY")
        if not api_key:
            logging.error("Gemini API key not found. Please set GEMINI_API_KEY in .env file.")
            raise ValueError("Gemini API key is required for summarization.")

        # Configure Gemini client
        genai.configure(api_key=api_key)
        model = genai.GenerativeModel('gemini-1.5-pro')

        # Handle short or empty text
        if len(text) < 20:
            logging.warning(f"Text too short ({len(text)} chars): '{text}'. Using URL context.")
            text = f"Content from {url} about news, products, or services."

        # Quick path: a single request over the first 1000 characters.
        chunk_size = 1000
        chunk = text[:chunk_size]
        prompt = (
            f"Create a title (30-50 chars) and description (80-100 chars) for RCS messaging from this text. "
            f"Keep it engaging and relevant to {url}. Output as JSON:\n{{\"title\": \"[title]\", \"description\": \"[description]\"}}\n\nText: {chunk}"
        )

        response = await model.generate_content_async(prompt)
        raw_content = response.text.strip()
        logging.info(f"Raw Gemini response (quick): {raw_content}")

        # Prefer a real JSON parse of the outermost {...} span: it copes with
        # escaped quotes and any key order. Fall back to the regex for
        # responses that are only approximately JSON.
        title = description = None
        try:
            payload = json.loads(raw_content[raw_content.index("{"):raw_content.rindex("}") + 1])
            title, description = payload["title"], payload["description"]
        except (ValueError, KeyError, TypeError):
            match = re.search(r'\{[\s\S]*"title":\s*"([^"]+)"[\s\S]*"description":\s*"([^"]+)"[\s\S]*\}', raw_content)
            if match:
                title, description = match.groups()

        # Truncated to 100 chars to stay within the description length the
        # prompt asks for, matching summarize_text's fallback handling.
        fallback_description = f"Check out content from {domain}."[:100]

        if isinstance(title, str) and title and isinstance(description, str) and description:
            result = {"title": title, "description": description}
        else:
            logging.warning("Failed to parse Gemini response: Invalid JSON format in Gemini response. Skipping chunk.")
            logging.warning("No valid summaries generated. Using fallback.")
            result = {"title": "Quick Summary", "description": fallback_description}

        # Ensure non-empty outputs
        if not result["title"].strip():
            result["title"] = "Quick Summary"
        if not result["description"].strip():
            result["description"] = fallback_description

        cache[cache_key] = result
        logging.info(f"Quick summary - Title: {result['title']}, Description: {result['description']}")
        return result

    except Exception as e:
        logging.error(f"Error in quick summarize: {e}")
        result = {
            "title": "Quick Summary",
            "description": f"Check out content from {domain}."[:100]
        }
        # The fallback is cached (as before) so a failing input is not retried
        # on every call, but only if a key could be computed at all.
        if cache_key is not None:
            cache[cache_key] = result
        return result