AutoPage / ProjectPageAgent /parse_paper.py
Mqleet's picture
upd code
fcaa164
raw
history blame
2.93 kB
"""
Paper parsing module for ProjectPageAgent.
Reuses the parsing capabilities from Paper2Poster.
"""
from ProjectPageAgent.parse_raw import parse_raw, gen_image_and_table
from utils.wei_utils import get_agent_config
import json
import os
import argparse
def parse_paper_for_project_page(args, agent_config_t, version=2):
    """
    Run the paper parser and collect the images and tables found in the paper.

    Args:
        args: Command line arguments. ``paper_path`` / ``paper_name`` are
            copied onto ``poster_path`` / ``poster_name`` when those
            attributes are missing (``args`` is mutated in place).
        agent_config_t: Text model configuration, forwarded to ``parse_raw``.
        version: Parser version, forwarded to ``parse_raw``.

    Returns:
        tuple: (input_tokens, output_tokens, raw_result, images, tables)
    """
    print("Step 1: Parsing the research paper...")

    # parse_raw is shared with the poster pipeline, so give args the
    # poster_* attributes for compatibility, without overriding any that
    # the caller already set.
    attribute_aliases = (
        ('poster_path', 'paper_path'),
        ('poster_name', 'paper_name'),
    )
    for poster_attr, paper_attr in attribute_aliases:
        if not hasattr(args, poster_attr):
            setattr(args, poster_attr, getattr(args, paper_attr))

    # Raw paper content plus the token usage reported by the parser.
    tokens_in, tokens_out, raw_result = parse_raw(
        args, agent_config_t, version=version
    )

    # Only the last two of the four returned values are needed here.
    _first, _second, images, tables = gen_image_and_table(args, raw_result)

    print(f"Parsing completed. Tokens: {tokens_in} -> {tokens_out}")
    print(f"Extracted {len(images)} images and {len(tables)} tables")

    return tokens_in, tokens_out, raw_result, images, tables
def _write_json_file(path, payload):
    """Write *payload* to *path* as indented JSON, creating parent folders."""
    parent_dir = os.path.dirname(path)
    if parent_dir:
        # paper_name may itself contain a path separator, which places the
        # target inside a sub-folder that does not exist yet.
        os.makedirs(parent_dir, exist_ok=True)
    # Explicit encoding so the file does not depend on the platform locale.
    with open(path, 'w', encoding='utf-8') as f:
        json.dump(payload, f, indent=4)


def save_parsed_content(args, raw_result, images, tables, input_token, output_token):
    """
    Save parsed content and a parse log as JSON files under ``project_contents/``.

    Two files are written, both named after ``args.paper_name``:
    ``<paper_name>_raw_content.json`` and ``<paper_name>_parse_log.json``.

    Args:
        args: Command line arguments; only ``args.paper_name`` is read here.
        raw_result: Parsed raw content. If it has a ``document`` attribute,
            the document is exported to markdown and stored together with
            ``images`` and ``tables``; otherwise it is dumped as-is.
        images: Extracted images (must support ``len()``).
        tables: Extracted tables (must support ``len()``).
        input_token: Input token count
        output_token: Output token count

    Returns:
        tuple: (raw_content_path, token_log_path)
    """
    # Save raw content
    os.makedirs('project_contents', exist_ok=True)
    raw_content_path = f'project_contents/{args.paper_name}_raw_content.json'

    # Convert raw_result to JSON format if needed
    if hasattr(raw_result, 'document'):
        # Extract text content from docling result
        raw_markdown = raw_result.document.export_to_markdown()
        content_json = {
            'markdown_content': raw_markdown,
            'images': images,
            'tables': tables
        }
    else:
        content_json = raw_result

    _write_json_file(raw_content_path, content_json)

    # Save token usage
    token_log = {
        'parse_input_tokens': input_token,
        'parse_output_tokens': output_token,
        'total_images': len(images),
        'total_tables': len(tables)
    }
    token_log_path = f'project_contents/{args.paper_name}_parse_log.json'
    _write_json_file(token_log_path, token_log)

    print(f"Parsed content saved to {raw_content_path}")
    return raw_content_path, token_log_path