#!/home/tom/miniconda3/envs/fake_news_detection/bin/python
"""
main.py - Server for the Fake News Detection system

This script creates a Flask server that exposes API endpoints to:
1. Take user input (news query) from the UI
2. Process the request through the fake news detection pipeline
3. Return the results to the UI for display
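
Example request (illustrative; assumes the server is running on the default
port 5000):

    curl -X POST http://localhost:5000/api/detect \
         -H "Content-Type: application/json" \
         -d '{"query": "Some news statement to verify"}'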
"""

import os
import json
import time
from dotenv import load_dotenv
from flask import Flask, request, jsonify
from flask_cors import CORS

# Import required functions from modules
from gdelt_api import (
    fetch_articles_from_gdelt,
    filter_by_whitelisted_domains,
    normalize_gdelt_articles
)
from ranker import ArticleRanker
from gdelt_query_builder import generate_query, GEMINI_MODEL
import bias_analyzer
from google_search import google_search

# Load environment variables up front so the preloaded ranker can pick up any
# model settings from .env (main() also calls load_dotenv(); the call is idempotent)
load_dotenv()

# Preload the embedding model at server startup so it is cached as a
# module-level global and reused across requests
print("Preloading embedding model for faster request processing...")
global_ranker = ArticleRanker()


# The function has been removed since bias category descriptions are provided directly by the Gemini model
# and stored in the bias_analysis["descriptions"] dictionary


def format_results(query, ranked_articles):
    """
    Format the ranked results in a structured way for the UI.
    
    Args:
        query (str): The original query
        ranked_articles (list): List of ranked article dictionaries
        
    Returns:
        dict: Dictionary with formatted results
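
    Example ("no_results" shape, mirroring the branch below):
        {
            "status": "no_results",
            "message": "⚠️ No news found. Possibly Fake.",
            "details": "No reliable sources could verify this information.",
            "articles": []
        }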
    """
    result = {}
    
    if not ranked_articles:
        result = {
            "status": "no_results",
            "message": "⚠️ No news found. Possibly Fake.",
            "details": "No reliable sources could verify this information.",
            "articles": []
        }
    else:
        # Get display configuration from environment variables
        show_scores = os.getenv('SHOW_SIMILARITY_SCORES', 'true').lower() == 'true'
        show_date = os.getenv('SHOW_PUBLISH_DATE', 'true').lower() == 'true'
        show_url = os.getenv('SHOW_URL', 'true').lower() == 'true'
        
        formatted_articles = []
        for article in ranked_articles:
            formatted_article = {
                "rank": article['rank'],
                "title": article['title'],
                "source": article['source']
            }
            
            if show_scores:
                formatted_article["similarity_score"] = round(article['similarity_score'], 4)
                
            if show_url:
                formatted_article["url"] = article['url']
                
            if show_date:
                formatted_article["published_at"] = article['published_at']
                
            formatted_articles.append(formatted_article)
        
        result = {
            "status": "success",
            "message": f"✅ Found {len(ranked_articles)} relevant articles for: '{query}'",
            "articles": formatted_articles,
            "footer": "If the news matches these reliable sources, it's likely true. If it contradicts them or no sources are found, it might be fake."
        }
    
    return result


def remove_duplicates(articles):
    """
    Remove duplicate articles based on URL.
    
    Args:
        articles (list): List of article dictionaries
        
    Returns:
        list: List with duplicate articles removed
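
    Example:
        >>> remove_duplicates([
        ...     {"url": "https://a.com", "title": "A"},
        ...     {"url": "https://a.com", "title": "A (duplicate)"},
        ...     {"url": "https://b.com", "title": "B"},
        ... ])
        [{'url': 'https://a.com', 'title': 'A'}, {'url': 'https://b.com', 'title': 'B'}]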
    """
    unique_urls = set()
    unique_articles = []
    
    for article in articles:
        if article['url'] not in unique_urls:
            unique_urls.add(article['url'])
            unique_articles.append(article)
    
    return unique_articles


# This function has been removed since Gemini is a cloud API service
# that does not require local caching - models are instantiated as needed


def main():
    """Main function to run the fake news detection pipeline as a server."""
    # Load environment variables
    load_dotenv()
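
    # Illustrative .env entries (all optional; each key falls back to the
    # default shown where it is read below):
    #   PORT=5000
    #   DEBUG=false
    #   SHOW_SIMILARITY_SCORES=true
    #   SHOW_PUBLISH_DATE=true
    #   SHOW_URL=true
    #   USE_WHITELIST_ONLY=false
    #   SIMILARITY_MODEL=intfloat/multilingual-e5-base
    #   TOP_K_ARTICLES=250
    #   MIN_SIMILARITY_THRESHOLD=0.1
    #   TOP_N_PER_CATEGORY=5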
    
    # Create Flask app
    app = Flask(__name__, static_folder='static')
    CORS(app)  # Enable CORS for all routes
    
    @app.route('/static/')
    def index():
        """Serve the main page."""
        return app.send_static_file('front.html')

    
    @app.route('/api/detect', methods=['POST'])
    def detect_fake_news():
        """API endpoint to check if news is potentially fake."""
        # Start timing the request processing
        start_time = time.time()
        
        # Parse silently so a missing or malformed JSON body yields an empty
        # dict instead of raising an exception
        data = request.get_json(silent=True) or {}
        query = data.get('query', '')
        
        if not query:
            return jsonify({
                "status": "error",
                "message": "Please provide a news statement to verify."
            })
        
        # =====================================================
        # 1. Input Handling
        # =====================================================
        # Generate three variations of the query using Gemini
        query_variations = generate_query(query)
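        # Illustrative (hypothetical) output for a query like "X resigned":
        #   ["X resignation", "X steps down", "X leaves office"]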
        
        # Check if the query was flagged as inappropriate
        if query_variations == ["INAPPROPRIATE_QUERY"]:
            return jsonify({
                "status": "error",
                "message": "I cannot provide information on this topic as it appears to contain sensitive or inappropriate content."
            })
        
        # =====================================================
        # 2. Data Fetching
        # =====================================================
        # Fetch articles from GDELT API for each query variation
        all_articles = []
        
        # Google search fetching is currently disabled; uncomment the block
        # below to prepend results from the original query to the article pool:
        # print(f"Fetching Google search results for: {query}")
        # google_results = google_search(query, num_results=25)
        # if google_results:
        #     all_articles.extend(google_results)
        #     print(f"Added {len(google_results)} Google search results to articles")
        
        # Fetch GDELT results for each query variation
        for query_var in query_variations:
            articles = fetch_articles_from_gdelt(query_var)
            if articles:
                all_articles.extend(articles)
        
        # After the loop, check if any articles were found
        if not all_articles:
            return jsonify({
                "status": "no_results",
                "message": "No articles found on this topic.",
                "details": "No reliable sources could be found covering this information.",
                "articles": []
            })
        
        # Remove duplicate articles, deduplicating by URL
        unique_articles = remove_duplicates(all_articles)
        
        # Apply domain whitelist filtering if enabled in .env
        use_whitelist_only = os.getenv('USE_WHITELIST_ONLY', 'false').lower() == 'true'
        if use_whitelist_only:
            print(f"Filtering articles to only include whitelisted domains...")
            unique_articles = filter_by_whitelisted_domains(unique_articles)
            print(f"After whitelist filtering: {len(unique_articles)} articles remain")
        
        # Normalize the articles to a standard format
        normalized_articles = normalize_gdelt_articles(unique_articles)
        
        if not normalized_articles:
            return jsonify(format_results(query, []))
        
        # =====================================================
        # 3. Embedding & Ranking
        # =====================================================
        # Initialize the ranker with model from environment variable
        model_name = os.getenv('SIMILARITY_MODEL', 'intfloat/multilingual-e5-base')
        
        # Use global ranker if it matches the requested model, otherwise create a new instance
        if global_ranker.model_name == model_name:
            ranker = global_ranker
        else:
            ranker = ArticleRanker(model_name)
        
        # Get TOP_K_ARTICLES from .env file
        TOP_K_ARTICLES = int(os.getenv('TOP_K_ARTICLES', 250))
        min_threshold = float(os.getenv('MIN_SIMILARITY_THRESHOLD', 0.1))
        
        # Prepare article texts for embedding
        article_texts = [
            f"{article['title']} {article.get('description') or ''}"
            for article in normalized_articles
        ]
        
        # Create embeddings and calculate similarities
        query_embedding, article_embeddings = ranker.create_embeddings(query, article_texts)
        similarities = ranker.calculate_similarities(query_embedding, article_embeddings)
        
        # Get top articles based on similarity
        top_indices = ranker.get_top_articles(similarities, normalized_articles, TOP_K_ARTICLES, min_threshold)
        top_articles = ranker.format_results(top_indices, similarities, normalized_articles)
        
        # =====================================================
        # 4. Bias Categorization
        # =====================================================
        # Extract outlet names from the TOP_K_ARTICLES
        # In top_articles, the source is already extracted as a string
        outlet_names = [article['source'] for article in top_articles]
        unique_outlets = list(set(outlet_names))
        print(f"Analyzing {len(unique_outlets)} unique news outlets for bias...")
        
        # Analyze bias using Gemini - send just the outlet names, not the whole articles
        bias_analysis = bias_analyzer.analyze_bias(query, unique_outlets, GEMINI_MODEL)
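        # bias_analysis carries per-category descriptions under "descriptions"
        # and a "reasoning" string alongside the category assignments; both are
        # passed through to the categorization and summary steps below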
        
        # =====================================================
        # 5. Category Embeddings
        # =====================================================
        print("\n" + "=" * 80)
        print("EMBEDDING VECTORS BY BIAS CATEGORY")
        print("=" * 80)
        
        # Create embedding vectors for each bias category
        # 1. Group articles based on their outlet's bias category
        # 2. Create an embedding vector for each category using ONLY article titles
        # 3. Rank articles within each category by similarity to query
        category_rankings = bias_analyzer.categorize_and_rank_by_bias(
            query, normalized_articles, bias_analysis, ranker, min_threshold
        )
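        # category_rankings maps each bias category to a ranked article list,
        # plus "descriptions" and "reasoning" metadata keys that are skipped
        # when iterating categories below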
        
        # =====================================================
        # 6. Top N Selection per Category
        # =====================================================
        # Get TOP_N_PER_CATEGORY from .env file (default: 5)
        TOP_N_PER_CATEGORY = int(os.getenv('TOP_N_PER_CATEGORY', 5))
        
        # Count articles per category before Top-N filtering (informational
        # only; not used further below)
        category_article_counts = {
            category: len(articles) 
            for category, articles in category_rankings.items() 
            if category not in ["descriptions", "reasoning"]
        }
        
        # For each bias category, select the top N articles
        # These are the most relevant articles within each bias perspective
        filtered_category_rankings = {}
        for category, articles in category_rankings.items():
            # Skip non-category keys like "descriptions" or "reasoning"
            if category in ["descriptions", "reasoning"]:
                continue
                
            filtered_category_rankings[category] = articles[:TOP_N_PER_CATEGORY]
            
            # Only print if there are articles in this category
            if len(filtered_category_rankings[category]) > 0:
                print(f"\n===== Top {len(filtered_category_rankings[category])} articles from {category} category =====")
                
                # Print detailed information about each selected article
                for i, article in enumerate(filtered_category_rankings[category], 1):
                    print(f"Article #{i}:")
                    print(f"  Title: {article['title']}")
                    print(f"  Source: {article['source']}")
                    print(f"  Similarity Score: {article['similarity_score']:.4f}")
                    print(f"  Rank: {article['rank']}")
                    print(f"  URL: {article['url']}")
                    print(f"  Published: {article['published_at']}")
                    print("-" * 50)
        
        # =====================================================
        # 7. Summarization
        # =====================================================
        # Generate summary from articles in all categories
        print("\nGenerating factual summary using top articles from all categories...")
        
        # Pass the original bias_analysis to include the reasoning in the summary
        # We need to add the reasoning to filtered_category_rankings since that's what gets passed to generate_summary
        filtered_category_rankings["reasoning"] = bias_analysis.get("reasoning", "No reasoning provided")
        
        # Call the bias_analyzer's generate_summary function with articles from all categories
        summary = bias_analyzer.generate_summary(
            query, 
            normalized_articles, 
            filtered_category_rankings, 
            GEMINI_MODEL
        )
        
        # Print the summary to terminal (already includes its own formatting)
        print(summary)
        
        # Prepare response with ONLY the combined summary (reasoning already appended at end)
        # Removed separate 'reasoning' key to avoid it showing at the top in the UI
        result = {
            "summary": summary
        }
        
        return jsonify(result)
    
    @app.route('/api/health', methods=['GET'])
    def health_check():
        """API endpoint to check if the server is running."""
        return jsonify({
            "status": "ok", 
            "message": "Fake News Detection API is running"
        })
    
    # Get port from environment variable or use default 5000
    port = int(os.getenv('PORT', 5000))
    debug = os.getenv('DEBUG', 'false').lower() == 'true'
    
    print(f"Starting Fake News Detection API server on port {port}...")
    # Start the Flask server
    app.run(host='0.0.0.0', port=port, debug=debug)


if __name__ == "__main__":
    main()