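"""Concurrent web scraper: reads search-result URLs from a headerless TSV file
and writes the extracted text (HTML or PDF) to a JSONL file, one record per URL."""
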
from concurrent.futures import ThreadPoolExecutor, as_completed

import json

import fitz  # PyMuPDF, used for PDF text extraction
import requests
import pandas as pd

from time import sleep
from pathlib import Path

from system.html2lines import url2lines, line_correction, html2metadata

MAX_RETRIES = 3
TIMEOUT = 5  # seconds per HTTP request

def scrape_text_from_url(url, temp_name):
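    """Fetch a URL with retries and return its text as a list of lines.

    PDF responses are saved to a temp file (named after ``temp_name``) and
    extracted with PyMuPDF; other pages go through ``url2lines``.
    """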
    response = None
    # Retry transient network failures a few times before giving up.
    for attempt in range(MAX_RETRIES):
        try:
            response = requests.get(url, timeout=TIMEOUT)
            break
        except requests.RequestException:
            if attempt < MAX_RETRIES - 1:
                sleep(3)  # back off briefly before retrying

    # Give up if every attempt failed or the server is unavailable.
    if response is None or response.status_code == 503:
        return []

    if url.endswith(".pdf"):
        # Save the PDF locally so PyMuPDF can open it.
        pdf_dir = Path("/tmp/pdf_dir")
        pdf_dir.mkdir(parents=True, exist_ok=True)
        pdf_path = pdf_dir / f"{temp_name}.pdf"
        with open(pdf_path, "wb") as f:
            f.write(response.content)

        extracted_text = ""
        # Use a context manager so the document is closed once all pages are read.
        with fitz.open(str(pdf_path)) as doc:
            for page in doc:
                extracted_text += page.get_text() or ""
        return line_correction(extracted_text.split("\n"))

    # Non-PDF content: extract text lines from the page itself.
    return line_correction(url2lines(url))

def process_row(row, claim_id, row_index):
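    """Scrape one search-result row and return a JSON-serializable record.

    ``row_index`` keeps the temp PDF filename unique per row so concurrent
    workers do not overwrite each other's downloads.
    """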
    try:
        url = row[2]
        json_data = {
            "claim_id": claim_id,
            "type": row[1],
            "query": row[3],
            # Give each row its own temp name to avoid clashes between threads.
            "url": url,
            "url2text": scrape_text_from_url(url, f"{claim_id}_{row_index}"),
            "metadata": {},
        }
        # Fetch page metadata (title and publication date) separately.
        meta = html2metadata(url)
        json_data["metadata"] = {
            "title": meta.get("title"),
            "date": meta.get("date"),
        }
        return json_data
    except Exception as e:
        print(f"[WARN] Failed to scrape {row[2]}: {e}")
        return None

def run_scraper(tsv_file_path: str, output_jsonl_path: str, max_workers: int = 10):
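    """Scrape every URL in a headerless TSV of search results into a JSONL file.

    The TSV filename (without extension) is used as the claim ID. An existing
    output file is left untouched, so repeated runs skip completed work.
    """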
    claim_id = Path(tsv_file_path).stem
    output_jsonl_path = Path(output_jsonl_path)
    output_jsonl_path.parent.mkdir(parents=True, exist_ok=True)

    if output_jsonl_path.exists():
        print(f"[INFO] Skipping processing as output file already exists: {output_jsonl_path}")
        return str(output_jsonl_path)

    try:
        df = pd.read_csv(tsv_file_path, sep="\t", header=None)
        print("[INFO] Data loaded successfully with Pandas.")
    except Exception as e:
        raise RuntimeError(f"[ERROR] Failed to load TSV: {e}") from e

    results = []
    # Scrape rows concurrently; collect results as each future completes.
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [
            executor.submit(process_row, row, claim_id, i)
            for i, row in df.iterrows()
        ]
        for future in as_completed(futures):
            result = future.result()
            if result:
                results.append(result)

    # Write one JSON object per line (JSONL).
    with open(output_jsonl_path, "w", encoding="utf-8") as json_file:
        for item in results:
            json_file.write(json.dumps(item, ensure_ascii=False) + "\n")

    print(f"[SYSTEM] Output saved to {output_jsonl_path}")
    return str(output_jsonl_path)
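
if __name__ == "__main__":
    # Example invocation: the TSV and output paths below are illustrative
    # placeholders, not paths shipped with this module.
    run_scraper(
        tsv_file_path="data/search_results/0.tsv",
        output_jsonl_path="data/scraped/0.jsonl",
        max_workers=10,
    )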