Spaces:
Sleeping
Sleeping
| import json | |
| import os | |
| import argparse | |
| from system.html2lines import html2metadata | |
| from lxml.etree import tostring | |
| import lxml.etree | |
def process_manifesto_data_with_metadata(input_base_dir: str) -> str:
    """Join reranked QA evidence with scraped page text and URL metadata.

    Reads one JSON record per line from
    ``<input_base_dir>/hero/manifesto_icl_reranking_top_k_QA.json``; for record
    ``i`` looks up page text in ``<input_base_dir>/augmented_data_store/<i>.jsonl``
    and fetches title/date metadata via ``html2metadata``.  Writes one JSON line
    per claim, ``{"claim": ..., "evidence": [QA + text + metadata, ...]}``, to
    ``<input_base_dir>/all_info_with_txt.json``.

    Args:
        input_base_dir: Base directory containing the input file and the
            ``augmented_data_store`` directory of per-claim JSONL stores.

    Returns:
        Path of the output file that was written.
    """
    input_file_path = os.path.join(input_base_dir, "hero/manifesto_icl_reranking_top_k_QA.json")
    output_file_path = os.path.join(input_base_dir, "all_info_with_txt.json")
    url2text_dir = os.path.join(input_base_dir, "augmented_data_store")

    with open(input_file_path, "r", encoding="utf-8") as f:
        input_lines = f.readlines()

    # Context manager guarantees the output handle is closed even if a record fails.
    with open(output_file_path, "w", encoding="utf-8") as out_file:
        for record_idx, raw_line in enumerate(input_lines):
            record = json.loads(raw_line)
            claim = record["claim"]
            QAs = record["top_50"]
            new_line = {"claim": claim, "evidence": []}

            # Data store files are keyed by the record's line index.
            json_path = os.path.join(url2text_dir, f"{record_idx}.jsonl")
            if not os.path.exists(json_path):
                print(f"Warning: {json_path} not found")
                continue
            with open(json_path, "r", encoding="utf-8") as f:
                # The store may be a single JSON array or JSONL; try the former,
                # rewind and parse line-by-line on failure.
                try:
                    data_store = json.load(f)
                except json.JSONDecodeError:
                    f.seek(0)
                    data_store = [json.loads(store_line) for store_line in f]
            url_txt = {entry["url"]: entry["url2text"] for entry in data_store}

            # Fetch metadata at most once per URL; the original dedup list was
            # never appended to, so every duplicate URL triggered a re-fetch.
            meta_cache: dict = {}
            for QA in QAs:
                newQA = QA.copy()
                URL = QA["url"]
                newQA["text"] = url_txt.get(URL, "")
                if URL not in meta_cache:
                    try:
                        meta = html2metadata(URL)
                        if isinstance(meta, lxml.etree._Element):
                            # Serialized element trees carry no title/date
                            # mapping; fall through to the empty default below.
                            meta = tostring(meta, encoding="unicode", pretty_print=True)
                        if isinstance(meta, dict):
                            # .get avoids a KeyError when a field is absent.
                            meta_cache[URL] = {
                                "title": meta.get("title", ""),
                                "date": meta.get("date", ""),
                            }
                        else:
                            meta_cache[URL] = {"title": "", "date": ""}
                    except Exception as e:
                        # Best-effort: metadata failures degrade to empty fields.
                        print(f"Metadata extraction failed for URL: {URL}, error: {e}")
                        meta_cache[URL] = {"title": "", "date": ""}
                newQA["metadata"] = meta_cache[URL]
                new_line["evidence"].append(newQA)
            out_file.write(json.dumps(new_line) + "\n")
    return output_file_path