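"""Augmented evidence retrieval: search the Google Custom Search API for a pledge
claim and its evidence questions, and save the collected URLs as a TSV file for
later pipeline stages."""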
import json
import os

import pandas as pd
import requests
from pathlib import Path
def google_search(query, api_key, search_engine_id, start_date, end_date):
    """Query the Google Custom Search API for UK results within a date range."""
    sort = f"date:r:{start_date}:{end_date}"
    url = "https://www.googleapis.com/customsearch/v1"
    params = {
        "q": query,
        "key": api_key,
        "cx": search_engine_id,
        "num": 10,          # maximum results per request allowed by the API
        "sort": sort,       # restrict results to the given date range
        "cr": "countryUK",  # country restrict
        "gl": "uk"          # geolocation bias
    }
    try:
        response = requests.get(url, params=params, timeout=30)
        response.raise_for_status()
        return response.json().get("items", [])
    except requests.RequestException as e:
        print(f"[ERROR] Google Search failed: {e}")
        return []
def save_tsv(file_name, id_value, string_value, value_list, query):
    """Write parallel lists of IDs, labels, URLs, and queries to a headerless TSV."""
    data = {
        'ID': id_value,
        'String': string_value,
        'ListValue': value_list,
        'query': query
    }
    df = pd.DataFrame(data)
    df.to_csv(file_name, sep='\t', index=False, header=False)
def ensure_directory_exists(path):
    """Create the parent directory of `path`, refusing locations outside the allowed roots."""
    dir_path = Path(path).expanduser().resolve().parent
    # dir_path is absolute after resolve(), so the relative "outputs" root must
    # also be resolved before the prefix comparison.
    allowed_roots = ("/home", "/data", str(Path("outputs").resolve()))
    if not str(dir_path).startswith(allowed_roots):
        raise ValueError(f"[ERROR] Unsafe path: {dir_path}")
    dir_path.mkdir(parents=True, exist_ok=True)
def run_augmented_searching(qa_file, pipeline_base_dir, suggestion_meta, pledge_author, pledge_date, start_date, end_date):
    """Run Google searches for a pledge claim and its evidence questions; save the URLs to a TSV."""
    if suggestion_meta is None:
        # qa_file holds a single JSON object with the claim and its evidence.
        with open(qa_file, "r") as f:
            qa_lines = json.load(f)
        claim_text = f"{pledge_author}: {qa_lines['claim']} ({pledge_date})"
        claim_query = claim_text  # author and date are already part of the claim text
        idx = 0
    else:
        # qa_file is a JSONL file; pick the line indicated by the suggestion metadata.
        idx = suggestion_meta["index"]
        with open(qa_file, "r") as f:
            qa_lines = json.loads(f.readlines()[idx])
        claim_text = qa_lines["claim"]
        claim_query = f"{pledge_author}: {claim_text}"
    api_key = os.environ.get("GOOGLE_API_KEY")
    search_engine_id = os.environ.get("GOOGLE_SEARCH_CX")
    if not api_key or not search_engine_id:
        raise EnvironmentError("[ERROR] GOOGLE_API_KEY and GOOGLE_SEARCH_CX must be set in environment.")
    tsv_file_path = os.path.join(pipeline_base_dir, "augmented_search_results.tsv")
    ensure_directory_exists(tsv_file_path)
    urls = []
    string_values = []
    queries = []
    # Deduplicate evidence questions while preserving order, then cap at 10.
    questions = list(dict.fromkeys(evidence["question"] for evidence in qa_lines["evidence"]))[:10]
    # Search for the claim itself, skipping duplicate links and Full Fact tracker pages.
    results = google_search(claim_text, api_key, search_engine_id, start_date, end_date)
    for result in results:
        if result["link"] not in urls and "fullfact.org/government-tracker" not in result["link"]:
            string_values.append("claim")
            urls.append(result["link"])
            queries.append(claim_query)
    # Search for each evidence question.
    for question in questions:
        results = google_search(question, api_key, search_engine_id, start_date, end_date)
        for result in results:
            if result["link"] not in urls and "fullfact.org/government-tracker" not in result["link"]:
                string_values.append("question")
                urls.append(result["link"])
                queries.append(question)
    # URLs are already unique (checked on insertion), so the parallel lists stay aligned.
    save_tsv(str(tsv_file_path), [0] * len(urls), string_values, urls, queries)
    print(f"[SYSTEM] Saved {len(urls)} URLs for claim {idx} to {tsv_file_path}")
    return str(tsv_file_path)
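

# Hedged usage sketch (assumptions, not part of the original pipeline): the
# paths, author label, and dates below are hypothetical placeholders, and the
# dates use the YYYYMMDD form expected by the Custom Search `sort` parameter.
# GOOGLE_API_KEY and GOOGLE_SEARCH_CX must be exported before running.
if __name__ == "__main__":
    tsv_path = run_augmented_searching(
        qa_file="data/pledge_qa.json",         # hypothetical single-claim JSON file
        pipeline_base_dir="outputs/pipeline",  # hypothetical output directory
        suggestion_meta=None,                  # None -> read qa_file as one JSON object
        pledge_author="UK Government",         # hypothetical pledge author
        pledge_date="2019",                    # hypothetical pledge date
        start_date="20190101",
        end_date="20231231",
    )
    print(f"[SYSTEM] Results written to {tsv_path}")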