|
|
""" |
|
|
Paper parsing module for ProjectPageAgent. |
|
|
Reuses the parsing capabilities from Paper2Poster. |
|
|
""" |
|
|
|
|
|
from ProjectPageAgent.parse_raw import parse_raw, gen_image_and_table |
|
|
from utils.wei_utils import get_agent_config |
|
|
import json |
|
|
import os |
|
|
import argparse |
|
|
|
|
|
def parse_paper_for_project_page(args, agent_config_t, version=2):
    """
    Parse a research paper PDF and extract the content needed to build a
    project page.

    Thin orchestration layer: delegates text parsing to ``parse_raw`` and
    asset extraction to ``gen_image_and_table``, normalizing the argument
    names they expect (``poster_path``/``poster_name``) from the project-page
    naming (``paper_path``/``paper_name``) beforehand.

    Args:
        args: Command line arguments (must carry ``paper_path`` and
            ``paper_name``; ``poster_path``/``poster_name`` are filled in
            from them when absent).
        agent_config_t: Text model configuration passed through to the parser.
        version: Parser version to use.

    Returns:
        tuple: (input_tokens, output_tokens, raw_result, images, tables)
    """
    print("Step 1: Parsing the research paper...")

    # The underlying Paper2Poster parser expects poster_* attribute names;
    # default them from the paper_* equivalents without clobbering values
    # the caller already supplied.
    args.poster_path = getattr(args, 'poster_path', args.paper_path)
    args.poster_name = getattr(args, 'poster_name', args.paper_name)

    # Main text parse, then image/table extraction from the same raw result.
    input_token, output_token, raw_result = parse_raw(
        args, agent_config_t, version=version
    )
    _, _, images, tables = gen_image_and_table(args, raw_result)

    print(f"Parsing completed. Tokens: {input_token} -> {output_token}")
    print(f"Extracted {len(images)} images and {len(tables)} tables")

    return input_token, output_token, raw_result, images, tables
|
|
|
|
|
def save_parsed_content(args, raw_result, images, tables, input_token, output_token):
    """
    Save parsed content to files for later use.

    Writes two JSON files under ``project_contents/`` (created if missing):
    the parsed paper content and a small token/asset log.

    Args:
        args: Command line arguments (must carry ``paper_name``, used to
            name the output files).
        raw_result: Parsed raw content. Either a docling-style result with a
            ``document`` attribute (exported to markdown) or an already
            JSON-serializable object saved as-is.
        images: Extracted images (must be JSON-serializable).
        tables: Extracted tables (must be JSON-serializable).
        input_token: Input token count from parsing.
        output_token: Output token count from parsing.

    Returns:
        tuple: (raw_content_path, token_log_path) of the written files.
    """
    os.makedirs('project_contents', exist_ok=True)
    raw_content_path = f'project_contents/{args.paper_name}_raw_content.json'

    if hasattr(raw_result, 'document'):
        # Docling-style result: flatten to markdown plus the extracted assets.
        raw_markdown = raw_result.document.export_to_markdown()
        content_json = {
            'markdown_content': raw_markdown,
            'images': images,
            'tables': tables
        }
    else:
        # Already a plain serializable structure; persist unchanged.
        content_json = raw_result

    # Fix: write explicitly as UTF-8. The previous code used the locale
    # encoding, which raises UnicodeEncodeError for non-ASCII paper content
    # on platforms where the default is not UTF-8 (e.g. Windows cp1252).
    with open(raw_content_path, 'w', encoding='utf-8') as f:
        json.dump(content_json, f, indent=4)

    token_log = {
        'parse_input_tokens': input_token,
        'parse_output_tokens': output_token,
        'total_images': len(images),
        'total_tables': len(tables)
    }

    token_log_path = f'project_contents/{args.paper_name}_parse_log.json'
    with open(token_log_path, 'w', encoding='utf-8') as f:
        json.dump(token_log, f, indent=4)

    print(f"Parsed content saved to {raw_content_path}")
    return raw_content_path, token_log_path