import json
import os
import argparse

from tqdm import tqdm
import tiktoken
from openai import OpenAI
from huggingface_hub import hf_hub_download
def gpt_4o(input_text):
    """Send a prompt to GPT-4o in JSON mode and return the raw response text."""
    client = OpenAI(api_key=os.environ.get("OAI"))
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "user", "content": [{"type": "text", "text": input_text}]}
        ],
        response_format={"type": "json_object"},
        temperature=0,
        max_tokens=4096,
        top_p=0,
        frequency_penalty=0,
        presence_penalty=0,
    )
    return response.choices[0].message.content
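
# Note (general OpenAI API behavior, not specific to this repo): JSON mode
# (response_format={"type": "json_object"}) requires the word "JSON" to appear
# somewhere in the prompt; the extraction prompt built below satisfies this by
# asking for output "in the JSON format". A minimal sketch of calling the
# helper, with a hypothetical prompt:
#
#   raw = gpt_4o("List the key events in this text as JSON: ...")
#   events = json.loads(raw)  # JSON mode returns a JSON-formatted string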
def run_gpt4_event_extraction(data_dir, max_tokens=100000):
    all_info_path = os.path.join(data_dir, "all_info_with_txt.json")
    output_dir = os.path.join(data_dir, "gpt4_event_extraction")
    os.makedirs(output_dir, exist_ok=True)

    # Fetch the in-context-learning (ICL) prompt shipped with the dataset repo.
    icl_path = hf_hub_download(
        repo_id="PledgeTracker/demo_feedback",
        filename="icl.txt",
        repo_type="dataset",
        token=os.environ["HF_TOKEN"],
    )
    with open(icl_path, "r") as f:
        ICL = f.read()
    with open(all_info_path, "r") as f:
        all_info = f.readlines()

    enc = tiktoken.encoding_for_model("gpt-4o")

    for i, line in enumerate(all_info):
        ID = i
        urls = []
        results = []
        data = json.loads(line)
        docs = data["evidence"]
        claim = data["claim"]
        output_path = os.path.join(output_dir, f"gpt4o_results_{ID}_claim.json")
        if os.path.exists(output_path):
            print(f"Already exists: {output_path}")
        else:
            for doc in tqdm(docs):
                # Skip evidence documents whose URL was already processed.
                if doc["url"] in urls:
                    continue
                text = " ".join(doc["text"])
                input_text = (
                    f"{ICL}\nNow please only summarize events that are useful for verifying the pledge '{claim}', and their dates in the JSON format.\n\nInput:\n\nTitle: {doc['metadata']['title']}\n"
                    f"Date: {doc['metadata']['date']}\nArticle: {text}\nPledge: {claim}\n\n"
                    f"Output:\n"
                )
                urls.append(doc["url"])
                # Truncate over-long prompts to the configured token budget.
                text_tokens = enc.encode(input_text)
                if len(text_tokens) > max_tokens:
                    input_text = enc.decode(text_tokens[:max_tokens])
                try:
                    output = gpt_4o(input_text)
                    # print(f"GPT-4o Response: {output}")
                    results.append({
                        "url": doc["url"],
                        "title": doc["metadata"]["title"],
                        "date": doc["metadata"]["date"],
                        "article": text,
                        "output": json.loads(output),
                    })
                except Exception as e:
                    print(f"Error processing doc: {e}")
                    continue
            with open(output_path, "w", encoding="utf-8") as f:
                json.dump(results, f, ensure_ascii=False, indent=4)
    return output_path
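
# Expected structure of all_info_with_txt.json (one JSON object per line),
# reconstructed from the field accesses above; the values are hypothetical:
#
#   {"claim": "Build 40 new hospitals",
#    "evidence": [{"url": "https://example.com/article",
#                  "text": ["First paragraph.", "Second paragraph."],
#                  "metadata": {"title": "Hospital plan update",
#                               "date": "2024-05-01"}}]}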
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Run GPT-4o event extraction")
    parser.add_argument("--data_dir", type=str, required=True, help="Root data directory")
    parser.add_argument("--max_tokens", type=int, default=100000, help="Maximum token limit for input")
    args = parser.parse_args()
    run_gpt4_event_extraction(
        data_dir=args.data_dir,
        max_tokens=args.max_tokens,
    )
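
# Example invocation (script filename assumed for illustration); requires the
# OAI and HF_TOKEN environment variables to be set:
#
#   OAI=sk-... HF_TOKEN=hf_... python gpt4_event_extraction.py \
#       --data_dir ./data --max_tokens 100000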