"""
Paper parsing module for ProjectPageAgent.
Reuses the parsing capabilities from Paper2Poster.
"""

import argparse
import json
import os

from ProjectPageAgent.parse_raw import parse_raw, gen_image_and_table
from utils.wei_utils import get_agent_config

def parse_paper_for_project_page(args, agent_config_t, version=2):
    """
    Parse a research paper PDF and extract content for project page generation.

    Args:
        args: Command line arguments
        agent_config_t: Text model configuration
        version: Parser version to use

    Returns:
        tuple: (input_tokens, output_tokens, raw_result, images, tables)
    """
    print("Step 1: Parsing the research paper...")

    # Add poster_path and poster_name attributes to args for compatibility with parse_raw
    if not hasattr(args, 'poster_path'):
        args.poster_path = args.paper_path
    if not hasattr(args, 'poster_name'):
        args.poster_name = args.paper_name

    # Parse the raw paper content
    input_token, output_token, raw_result = parse_raw(args, agent_config_t, version=version)

    # Extract images and tables
    _, _, images, tables = gen_image_and_table(args, raw_result)

    print(f"Parsing completed. Tokens: {input_token} -> {output_token}")
    print(f"Extracted {len(images)} images and {len(tables)} tables")

    return input_token, output_token, raw_result, images, tables

def save_parsed_content(args, raw_result, images, tables, input_token, output_token):
    """
    Save parsed content to files for later use.

    Args:
        args: Command line arguments
        raw_result: Parsed raw content
        images: Extracted images
        tables: Extracted tables
        input_token: Input token count
        output_token: Output token count
    """
    # Save raw content
    os.makedirs('project_contents', exist_ok=True)
    raw_content_path = f'project_contents/{args.paper_name}_raw_content.json'

    # Convert raw_result to JSON format if needed
    if hasattr(raw_result, 'document'):
        # Extract text content from docling result
        raw_markdown = raw_result.document.export_to_markdown()
        content_json = {
            'markdown_content': raw_markdown,
            'images': images,
            'tables': tables
        }
    else:
        content_json = raw_result

    with open(raw_content_path, 'w') as f:
        json.dump(content_json, f, indent=4)

    # Save token usage
    token_log = {
        'parse_input_tokens': input_token,
        'parse_output_tokens': output_token,
        'total_images': len(images),
        'total_tables': len(tables)
    }
    token_log_path = f'project_contents/{args.paper_name}_parse_log.json'
    with open(token_log_path, 'w') as f:
        json.dump(token_log, f, indent=4)

    print(f"Parsed content saved to {raw_content_path}")
    return raw_content_path, token_log_path
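

if __name__ == '__main__':
    # Minimal CLI sketch showing how the two functions above chain together,
    # and putting the argparse and get_agent_config imports to use. The flag
    # names (--paper_path, --paper_name, --model_name_t) and the single-argument
    # get_agent_config call are assumptions for illustration; adjust them to
    # match the rest of ProjectPageAgent.
    parser = argparse.ArgumentParser(
        description='Parse a research paper PDF for project page generation.'
    )
    parser.add_argument('--paper_path', required=True, help='Path to the paper PDF')
    parser.add_argument('--paper_name', required=True, help='Short name used for output file names')
    parser.add_argument('--model_name_t', default='4o', help='Text model identifier (assumed default)')
    args = parser.parse_args()

    agent_config_t = get_agent_config(args.model_name_t)
    input_token, output_token, raw_result, images, tables = parse_paper_for_project_page(
        args, agent_config_t
    )
    save_parsed_content(args, raw_result, images, tables, input_token, output_token)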