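"""Augmented evidence retrieval: search the Google Custom Search API for a pledge
claim and its evidence questions, and save the collected URLs as a TSV file for
later pipeline stages."""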
import json
import os

import pandas as pd
import requests
from pathlib import Path
def google_search(query, api_key, search_engine_id, start_date, end_date):
    """Query the Google Custom Search API for UK results within a date range."""
    sort = f"date:r:{start_date}:{end_date}"
    url = "https://www.googleapis.com/customsearch/v1"
    params = {
        "q": query,
        "key": api_key,
        "cx": search_engine_id,
        "num": 10,          # maximum results per request allowed by the API
        "sort": sort,       # restrict results to the given date range
        "cr": "countryUK",  # country restrict
        "gl": "uk"          # geolocation bias
    }
    try:
        response = requests.get(url, params=params, timeout=30)
        response.raise_for_status()
        return response.json().get("items", [])
    except requests.RequestException as e:
        print(f"[ERROR] Google Search failed: {e}")
        return []
def save_tsv(file_name, id_value, string_value, value_list, query):
    """Write parallel lists of IDs, labels, URLs, and queries to a headerless TSV."""
    data = {
        'ID': id_value,
        'String': string_value,
        'ListValue': value_list,
        'query': query
    }
    df = pd.DataFrame(data)
    df.to_csv(file_name, sep='\t', index=False, header=False)
def ensure_directory_exists(path):
    """Create the parent directory of `path`, refusing locations outside the allowed roots."""
    dir_path = Path(path).expanduser().resolve().parent
    # dir_path is absolute after resolve(), so the relative "outputs" root must
    # also be resolved before the prefix comparison.
    allowed_roots = ("/home", "/data", str(Path("outputs").resolve()))
    if not str(dir_path).startswith(allowed_roots):
        raise ValueError(f"[ERROR] Unsafe path: {dir_path}")
    dir_path.mkdir(parents=True, exist_ok=True)
def run_augmented_searching(qa_file, pipeline_base_dir, suggestion_meta, pledge_author, pledge_date, start_date, end_date):
    """Run Google searches for a pledge claim and its evidence questions; save the URLs to a TSV."""
    if suggestion_meta is None:
        # qa_file holds a single JSON object with the claim and its evidence.
        with open(qa_file, "r") as f:
            qa_lines = json.load(f)
        claim_text = f"{pledge_author}: {qa_lines['claim']} ({pledge_date})"
        claim_query = claim_text  # author and date are already part of the claim text
        idx = 0
    else:
        # qa_file is a JSONL file; pick the line indicated by the suggestion metadata.
        idx = suggestion_meta["index"]
        with open(qa_file, "r") as f:
            qa_lines = json.loads(f.readlines()[idx])
        claim_text = qa_lines["claim"]
        claim_query = f"{pledge_author}: {claim_text}"
    api_key = os.environ.get("GOOGLE_API_KEY")
    search_engine_id = os.environ.get("GOOGLE_SEARCH_CX")
    if not api_key or not search_engine_id:
        raise EnvironmentError("[ERROR] GOOGLE_API_KEY and GOOGLE_SEARCH_CX must be set in environment.")
    tsv_file_path = os.path.join(pipeline_base_dir, "augmented_search_results.tsv")
    ensure_directory_exists(tsv_file_path)
    urls = []
    string_values = []
    queries = []
    # Deduplicate evidence questions while preserving order, then cap at 10.
    questions = list(dict.fromkeys(evidence["question"] for evidence in qa_lines["evidence"]))[:10]
    # Search for the claim itself, skipping duplicate links and Full Fact tracker pages.
    results = google_search(claim_text, api_key, search_engine_id, start_date, end_date)
    for result in results:
        if result["link"] not in urls and "fullfact.org/government-tracker" not in result["link"]:
            string_values.append("claim")
            urls.append(result["link"])
            queries.append(claim_query)
    # Search for each evidence question.
    for question in questions:
        results = google_search(question, api_key, search_engine_id, start_date, end_date)
        for result in results:
            if result["link"] not in urls and "fullfact.org/government-tracker" not in result["link"]:
                string_values.append("question")
                urls.append(result["link"])
                queries.append(question)
    # URLs are already unique (checked on insertion), so the parallel lists stay aligned.
    save_tsv(str(tsv_file_path), [0] * len(urls), string_values, urls, queries)
    print(f"[SYSTEM] Saved {len(urls)} URLs for claim {idx} to {tsv_file_path}")
    return str(tsv_file_path)
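

# Hedged usage sketch (assumptions, not part of the original pipeline): the
# paths, author label, and dates below are hypothetical placeholders, and the
# dates use the YYYYMMDD form expected by the Custom Search `sort` parameter.
# GOOGLE_API_KEY and GOOGLE_SEARCH_CX must be exported before running.
if __name__ == "__main__":
    tsv_path = run_augmented_searching(
        qa_file="data/pledge_qa.json",         # hypothetical single-claim JSON file
        pipeline_base_dir="outputs/pipeline",  # hypothetical output directory
        suggestion_meta=None,                  # None -> read qa_file as one JSON object
        pledge_author="UK Government",         # hypothetical pledge author
        pledge_date="2019",                    # hypothetical pledge date
        start_date="20190101",
        end_date="20231231",
    )
    print(f"[SYSTEM] Results written to {tsv_path}")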