|
|
from utils.poster_eval_utils import *
from utils.wei_utils import get_agent_config
from dotenv import load_dotenv
from PIL import Image

import argparse
import glob
import json
import math
import os
import re
import shutil
import tempfile

import numpy as np

load_dotenv()
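# The .env loaded above presumably holds the API credentials / endpoints that
# get_agent_config() needs for the evaluation models (assumption, not verified here).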
|
|
|
|
|
def run_qa_and_update_results(
    args,
    raw_folder,
    gen_poster_path,
    save_path,
    single_model_name=None,
    del_model_name=None,
):
    """
    If single_model_name is provided, run QA for that one model only,
    but update the existing JSON file (which already contains the other
    models' results) and re-compute the overall averages.

    The full multi-model run (no --fix) is handled in the __main__ block
    below and writes a fresh JSON file; this function only implements the
    single-model update path.

    :param args: Parsed command-line arguments (poster_method, paper_name, ...).
    :param raw_folder: Path to folder with 'o3_qa.json'.
    :param gen_poster_path: Path to the generated poster image.
    :param save_path: Directory where overall_qa_result.json is saved or should be written.
    :param single_model_name: Optional single model name (e.g. 'vllm_qwen_vl', '4o', 'o3').
    :param del_model_name: Optional model name to remove from the existing results first.
    """
    qa_dict = json.load(open(os.path.join(raw_folder, 'o3_qa.json'), 'r'))
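    # o3_qa.json is expected to contain a 'detail' and an 'understanding' block, each
    # with parallel 'questions', 'answers', and 'aspects' lists (see eval_qa_get_answer).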
|
|
    detail_qa = qa_dict['detail']
    understanding_qa = qa_dict['understanding']

    if single_model_name is not None:
        qa_input_token, qa_output_token = 0, 0

        existing_path = os.path.join(save_path, "overall_qa_result.json")
        with open(existing_path, 'r') as f:
            overall_qa_result = json.load(f)

        if del_model_name is not None:
            if del_model_name in overall_qa_result['qa_result']:
                del overall_qa_result['qa_result'][del_model_name]
                print(f"Removed model {del_model_name} from existing results.")

        if single_model_name in overall_qa_result['qa_result']:
            print(f"Model {single_model_name} already evaluated. Skipping.")
            return

        print(f"Running QA for single model: {single_model_name}")
        agent_config = get_agent_config(single_model_name)

        if args.poster_method == 'paper':
            # Note: relies on the module-level `gen_folder` set up in __main__ for the
            # 'paper' baseline (the per-page images copied into the temp folder).
            poster_images = open_folder_images(gen_folder, args.paper_name.replace(' ', '_'), format='jpg')
        else:
            poster_images = [Image.open(gen_poster_path)]

        poster_images = [ensure_under_limit_pil(image) for image in poster_images]
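        # ensure_under_limit_pil presumably downsizes each image so it stays within the
        # VLM's input size limits before being sent to the QA agent (assumption).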
|
|
|
|
|
        detail_accuracy, detail_aspect_accuracy, detail_agent_answers, input_token, output_token = eval_qa_get_answer(
            poster_input=poster_images,
            questions=detail_qa['questions'],
            answers=detail_qa['answers'],
            aspects=detail_qa['aspects'],
            input_type='image',
            agent_config=agent_config
        )
        qa_input_token += input_token
        qa_output_token += output_token
        print('Detail QA accuracy:', detail_accuracy)

        understanding_accuracy, understanding_aspect_accuracy, understanding_agent_answers, input_token, output_token = eval_qa_get_answer(
            poster_input=poster_images,
            questions=understanding_qa['questions'],
            answers=understanding_qa['answers'],
            aspects=understanding_qa['aspects'],
            input_type='image',
            agent_config=agent_config
        )
        qa_input_token += input_token
        qa_output_token += output_token
        print('Understanding QA accuracy:', understanding_accuracy)

        overall_qa_result['qa_result'][single_model_name] = {
            'detail_accuracy': detail_accuracy,
            'detail_aspect_accuracy': detail_aspect_accuracy,
            'detail_agent_answers': detail_agent_answers,
            'understanding_accuracy': understanding_accuracy,
            'understanding_aspect_accuracy': understanding_aspect_accuracy,
            'understanding_agent_answers': understanding_agent_answers
        }

        all_models_in_file = list(overall_qa_result['qa_result'].keys())
        detail_accs = []
        understanding_accs = []
        for m in all_models_in_file:
            detail_accs.append(overall_qa_result['qa_result'][m]['detail_accuracy'])
            understanding_accs.append(overall_qa_result['qa_result'][m]['understanding_accuracy'])

        avg_detail_accuracy = float(np.mean(detail_accs)) if detail_accs else 0.0
        avg_understanding_accuracy = float(np.mean(understanding_accs)) if understanding_accs else 0.0

        overall_qa_result['avg_detail_accuracy'] = avg_detail_accuracy
        overall_qa_result['avg_understanding_accuracy'] = avg_understanding_accuracy

        with open(existing_path, 'w') as f:
            json.dump(overall_qa_result, f, indent=4)

        print(f'Input tokens: {qa_input_token}')
        print(f'Output tokens: {qa_output_token}')

        print('Updated overall_qa_result.json with single-model results.')
        print('New average detail accuracy:', avg_detail_accuracy)
        print('New average understanding accuracy:', avg_understanding_accuracy)
|
|
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--paper_name', type=str)
    parser.add_argument('--base_dir', type=str, default='Paper2Poster-data')
    parser.add_argument('--poster_method', type=str)
    parser.add_argument('--poster_image_name', type=str, default='poster.png', choices=['poster.png'])
    parser.add_argument('--metric', type=str, choices=['stats', 'qa', 'judge', 'word_count', 'token_count', 'figure_count', 'aesthetic_judge'], default='stats')
    parser.add_argument('--fix', type=str, default=None)
    parser.add_argument('--del_model_name', type=str, default=None)

    args = parser.parse_args()
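    # Example invocations (script name and paper/method values are illustrative only):
    #   python <this_script>.py --paper_name <paper> --poster_method <method> --metric stats
    #   python <this_script>.py --paper_name <paper> --poster_method <method> --metric qa --fix 4o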
|
|
|
|
|
    raw_poster_path = f'{args.base_dir}/{args.paper_name}/poster.png'
    raw_folder = f'{args.base_dir}/{args.paper_name}'

    gen_poster_path = f'{args.poster_method}/{args.base_dir}/{args.paper_name}/{args.poster_image_name}'
    gen_folder = f'{args.poster_method}/{args.base_dir}/{args.paper_name}'

    save_path = f'eval_results/{args.paper_name}/{args.poster_method}'
    os.makedirs(save_path, exist_ok=True)

    if args.poster_method == 'paper':
        if args.metric == 'qa' and args.fix is not None:
            overall_qa_result = json.load(open(f'{save_path}/overall_qa_result.json', 'r'))
            if args.fix in overall_qa_result['qa_result']:
                print(f"Model {args.fix} already evaluated. Skipping.")
                exit(0)

        temp_dir = tempfile.mkdtemp(prefix="eval_temp", suffix="_data")

        paper_slug = args.paper_name.replace(' ', '_')
        source_dir = os.path.join('<4o_vllm_qwen>_images_and_tables', paper_slug)
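        # NOTE: '<4o_vllm_qwen>' looks like a placeholder left in the source; it presumably
        # has to match the actual name of the folder produced by the image/table extraction step.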
|
|
|
|
|
|
|
|
        index = 1
        while True:
            filename = f"{paper_slug}-{index}.png"
            src_path = os.path.join(source_dir, filename)
            if not os.path.isfile(src_path):
                break
            shutil.copy2(src_path, os.path.join(temp_dir, filename))
            index += 1
            # Cap at 20 page images unless counting words/tokens, which need every page.
            if index > 20 and args.metric not in ('word_count', 'token_count'):
                break

        gen_folder = temp_dir
        gen_poster_path = f'{args.base_dir}/{args.paper_name}/paper.pdf'
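        # For the 'paper' baseline, downstream code now reads the per-page images from the
        # temp folder and treats the original PDF as the "poster".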
|
|
|
|
|
|
|
|
    print('Evaluating poster:', args.paper_name)

    if args.metric == 'stats':
        stats_file = os.path.join(save_path, 'stats_result.json')

        if os.path.exists(stats_file):
            with open(stats_file, 'r') as f:
                stats_result = json.load(f)
            print(f"Loaded existing stats from {stats_file}")
        else:
            stats_result = {}
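        # Each metric below is computed only if its key is missing from stats_result,
        # so the script can be re-run to fill in gaps without redoing finished work.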
|
|
|
|
|
|
|
|
        if 'CLIP_similarity' not in stats_result:
            _, cos_sim = compare_folders_with_clip(raw_folder, gen_folder)
            stats_result['CLIP_similarity'] = cos_sim
            print(f'CLIP similarity: {cos_sim}')
        else:
            print(f"Skipping CLIP similarity (already {stats_result['CLIP_similarity']})")

        need_eval = any(k not in stats_result for k in ('textual_ppl', 'mixtual_ppl', 'visual_relevance', 'visual_ppl'))
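        # gen_eval_markdown is presumably the most expensive step here, so it is only run
        # when at least one of the four markdown-derived metrics is still missing.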
|
|
        if need_eval:
            images, poster_text, raw_markdown, new_markdown = gen_eval_markdown(
                args.paper_name,
                args.poster_method,
                gen_poster_path
            )
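            # Assumed return structure: `images` maps each extracted figure path to a dict
            # with CLIP embeddings of the figure and of its section text (used below for
            # visual_relevance); `poster_text` / `new_markdown` are the text-only and
            # text-plus-figure reconstructions of the poster.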
|
|
|
|
|
|
|
|
            if 'textual_ppl' not in stats_result:
                textual_ppl = get_ppl(poster_text)
                stats_result['textual_ppl'] = textual_ppl
                print(f'Textual PPL: {textual_ppl}')
            else:
                print(f"Skipping textual PPL (already {stats_result['textual_ppl']})")

            if 'mixtual_ppl' not in stats_result:
                mixtual_ppl = get_ppl(new_markdown)
                stats_result['mixtual_ppl'] = mixtual_ppl
                print(f'Mixtual PPL: {mixtual_ppl}')
            else:
                print(f"Skipping mixtual PPL (already {stats_result['mixtual_ppl']})")

            if 'visual_relevance' not in stats_result:
                if images:
                    sims = [
                        compute_cosine_similarity(v['image_clip_embedding'],
                                                  v['section_text_clip_embedding'])
                        for v in images.values()
                    ]
                    avg_sim = float(np.mean(sims))
                    stats_result['visual_relevance'] = avg_sim
                    print(f'Average cosine similarity: {avg_sim}')
                else:
                    stats_result['visual_relevance'] = 0.0
                    print('No images found in the poster. Set visual_relevance to 0.')
            else:
                print(f"Skipping visual relevance (already {stats_result['visual_relevance']})")

            if 'visual_ppl' not in stats_result or math.isnan(stats_result['visual_ppl']):
                visual_ppls = []
                for relative_path, v in images.items():
                    image_path = os.path.join('eval_poster_markdown', args.paper_name, args.poster_method, relative_path)
                    image = Image.open(image_path)
                    visual_ppl = get_visual_ppl(image, poster_text)
                    visual_ppls.append(visual_ppl)
                avg_visual_ppl = float(np.mean(visual_ppls))
                stats_result['visual_ppl'] = avg_visual_ppl
                print(f'Average visual PPL: {avg_visual_ppl}')
        else:
            print("All textual and visual metrics already computed; skipping gen_eval_markdown.")
|
|
        if 'interleaved_ppl' not in stats_result:
            interleaved_ppl = compute_interleaved_ppl(args.paper_name, args.poster_method)
            stats_result['interleaved_ppl'] = interleaved_ppl
            print(f'Interleaved PPL: {interleaved_ppl}')
        else:
            print(f"Skipping interleaved PPL (already {stats_result['interleaved_ppl']})")

        if 'poster_image_ppl' not in stats_result:
            if args.poster_method == 'paper':
                poster_images = open_folder_images(gen_folder, args.paper_name.replace(' ', '_'), format='jpg')
            else:
                poster_images = [Image.open(gen_poster_path)]
            poster_image_ppl = compute_poster_image_ppl(poster_images)
            stats_result['poster_image_ppl'] = poster_image_ppl
            print(f'Poster image PPL: {poster_image_ppl}')
        else:
            print(f"Skipping poster image PPL (already {stats_result['poster_image_ppl']})")

        with open(stats_file, 'w') as f:
            json.dump(stats_result, f, indent=4)
        print(f"Updated stats written to {stats_file}")
|
|
    elif args.metric == 'figure_count':
        save_file_path = os.path.join(save_path, 'figure_count.json')
        if os.path.exists(save_file_path):
            print(f"Figure count already exists at {save_file_path}. Skipping.")
        else:
            figure_count = gen_eval_markdown(
                args.paper_name,
                args.poster_method,
                gen_poster_path,
                figure_count_only=True
            )
            with open(save_file_path, 'w') as f:
                json.dump({'figure_count': figure_count}, f, indent=4)
            print(f"Figure count saved to {save_file_path}")
|
|
    elif args.metric == 'qa':
        if args.fix is not None:
            run_qa_and_update_results(
                args,
                raw_folder,
                gen_poster_path,
                save_path,
                single_model_name=args.fix,
                del_model_name=args.del_model_name
            )
        else:
            overall_qa_result = {}
            qa_result = {}
            qa_dict = json.load(open(os.path.join(raw_folder, 'o3_qa.json'), 'r'))
            detail_qa = qa_dict['detail']
            understanding_qa = qa_dict['understanding']
            model_names = [
                '4o',
                'o3',
                '4o-mini'
            ]
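            # Additional answer models (e.g. 'vllm_qwen_vl') can be evaluated afterwards
            # with --fix, which appends to the JSON written at the end of this branch.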
|
|
            if args.poster_method == 'paper':
                poster_images = open_folder_images(gen_folder, args.paper_name.replace(' ', '_'))
            else:
                poster_images = [Image.open(gen_poster_path)]

            poster_images = [ensure_under_limit_pil(image) for image in poster_images]

            for model_name in model_names:
                qa_input_token, qa_output_token = 0, 0
                print('QA model:', model_name)
                agent_config = get_agent_config(model_name)
                detail_accuracy, detail_aspect_accuracy, detail_agent_answers, input_token, output_token = eval_qa_get_answer(
                    poster_input=poster_images,
                    questions=detail_qa['questions'],
                    answers=detail_qa['answers'],
                    aspects=detail_qa['aspects'],
                    input_type='image',
                    agent_config=agent_config
                )
                print(f'{model_name} Detail QA accuracy:', detail_accuracy)
                qa_input_token += input_token
                qa_output_token += output_token

                understanding_accuracy, understanding_aspect_accuracy, understanding_agent_answers, input_token, output_token = eval_qa_get_answer(
                    poster_input=poster_images,
                    questions=understanding_qa['questions'],
                    answers=understanding_qa['answers'],
                    aspects=understanding_qa['aspects'],
                    input_type='image',
                    agent_config=agent_config
                )
                print(f'{model_name} Understanding QA accuracy:', understanding_accuracy)
                qa_input_token += input_token
                qa_output_token += output_token

                qa_result[model_name] = {
                    'detail_accuracy': detail_accuracy,
                    'detail_aspect_accuracy': detail_aspect_accuracy,
                    'detail_agent_answers': detail_agent_answers,
                    'understanding_accuracy': understanding_accuracy,
                    'understanding_aspect_accuracy': understanding_aspect_accuracy,
                    'understanding_agent_answers': understanding_agent_answers
                }

                print(f'{model_name} Input tokens:', qa_input_token)
                print(f'{model_name} Output tokens:', qa_output_token)

            avg_detail_accuracy = np.mean([qa_result[model_name]['detail_accuracy'] for model_name in model_names])
            avg_understanding_accuracy = np.mean([qa_result[model_name]['understanding_accuracy'] for model_name in model_names])

            print('Average detail accuracy:', avg_detail_accuracy)
            print('Average understanding accuracy:', avg_understanding_accuracy)

            overall_qa_result['avg_detail_accuracy'] = avg_detail_accuracy
            overall_qa_result['avg_understanding_accuracy'] = avg_understanding_accuracy
            overall_qa_result['qa_result'] = qa_result

            with open(f'{save_path}/overall_qa_result.json', 'w') as f:
                json.dump(overall_qa_result, f, indent=4)
|
|
    elif args.metric == 'word_count':
        if args.poster_method == 'paper':
            image_paths = open_folder_images(gen_folder, args.paper_name.replace(' ', '_'), return_path=True)
            word_count = 0
            for image_path in image_paths:
                word_count += count_words_in_image(image_path)
        else:
            word_count = count_words_in_image(gen_poster_path)

        with open(f'{save_path}/word_count.json', 'w') as f:
            json.dump({'word_count': word_count}, f, indent=4)

    elif args.metric == 'token_count':
        if args.poster_method == 'paper':
            image_paths = open_folder_images(gen_folder, args.paper_name.replace(' ', '_'), return_path=True)
            token_count = 0
            for image_path in image_paths:
                token_count += count_tokens_in_image(image_path)
        else:
            token_count = count_tokens_in_image(gen_poster_path)

        with open(f'{save_path}/token_count.json', 'w') as f:
            json.dump({'token_count': token_count}, f, indent=4)
|
|
    elif args.metric == 'judge':
        agent_config = get_agent_config('4o')

        if args.poster_method == 'paper':
            poster_images = open_folder_images(gen_folder, args.paper_name.replace(' ', '_'))
        else:
            poster_images = [Image.open(gen_poster_path)]

        results = eval_vlm_as_judge(
            poster_image_list=poster_images,
            agent_config=agent_config,
        )
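        # `results` is expected to map each judged aspect name to a dict containing at
        # least a 'score' entry; the averaging below relies on that shape.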
|
|
|
|
|
        aesthetic_aspects = [
            'aesthetic_element',
            'aesthetic_engagement',
            'aesthetic_layout'
        ]

        information_aspects = [
            'information_low_level',
            'information_logic',
            'information_content',
        ]

        overall_average = np.mean([results[aspect]['score'] for aspect in results])
        aesthetic_average = np.mean([results[aspect]['score'] for aspect in results if aspect in aesthetic_aspects])
        information_average = np.mean([results[aspect]['score'] for aspect in results if aspect in information_aspects])

        judge_result = {
            'overall_average': overall_average,
            'aesthetic_average': aesthetic_average,
            'information_average': information_average,
            'results': results
        }

        with open(f'{save_path}/judge_result.json', 'w') as f:
            json.dump(judge_result, f, indent=4)
|
|
    elif args.metric == 'aesthetic_judge':
        agent_config = get_agent_config('4o')

        if args.poster_method == 'paper':
            poster_images = open_folder_images(gen_folder, args.paper_name.replace(' ', '_'))
        else:
            poster_images = [Image.open(gen_poster_path)]

        results = eval_vlm_as_judge(
            poster_image_list=poster_images,
            agent_config=agent_config,
            aspect='aesthetic'
        )

        aesthetic_aspects = [
            'aesthetic_element',
            'aesthetic_engagement',
            'aesthetic_layout'
        ]

        aesthetic_average = np.mean([results[aspect]['score'] for aspect in results if aspect in aesthetic_aspects])

        judge_result = {
            'aesthetic_average': aesthetic_average,
            'results': results
        }

        with open(f'{save_path}/aesthetic_judge_result.json', 'w') as f:
            json.dump(judge_result, f, indent=4)
|
|
    if args.poster_method == 'paper':
        # Clean up the temporary folder of per-page images created above.
        shutil.rmtree(temp_dir)
        print(f"Removed temporary folder {temp_dir}")
|
|
|