|
|
from utils.poster_eval_utils import *
from utils.wei_utils import get_agent_config
from dotenv import load_dotenv
from PIL import Image

import argparse
import glob
import json
import math
import os
import re
import shutil
import tempfile

import numpy as np

load_dotenv()
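# The .env loaded above presumably holds the API credentials / endpoints that
# get_agent_config() needs for the evaluation models (assumption, not verified here).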
|
|
|
|
|
def run_qa_and_update_results(
    args,
    raw_folder,
    gen_poster_path,
    save_path,
    single_model_name=None,
    del_model_name=None,
):
    """
    If single_model_name is provided, run QA for that one model only,
    but update the existing JSON file (which already contains the other
    models' results) and re-compute the overall averages.

    The full multi-model run (no --fix) is handled in the __main__ block
    below and writes a fresh JSON file; this function only implements the
    single-model update path.

    :param args: Parsed command-line arguments (poster_method, paper_name, ...).
    :param raw_folder: Path to folder with 'o3_qa.json'.
    :param gen_poster_path: Path to the generated poster image.
    :param save_path: Directory where overall_qa_result.json is saved or should be written.
    :param single_model_name: Optional single model name (e.g. 'vllm_qwen_vl', '4o', 'o3').
    :param del_model_name: Optional model name to remove from the existing results first.
    """
    qa_dict = json.load(open(os.path.join(raw_folder, 'o3_qa.json'), 'r'))
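    # o3_qa.json is expected to contain a 'detail' and an 'understanding' block, each
    # with parallel 'questions', 'answers', and 'aspects' lists (see eval_qa_get_answer).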
|
|
    detail_qa = qa_dict['detail']
    understanding_qa = qa_dict['understanding']

    if single_model_name is not None:
        qa_input_token, qa_output_token = 0, 0

        existing_path = os.path.join(save_path, "overall_qa_result.json")
        with open(existing_path, 'r') as f:
            overall_qa_result = json.load(f)

        if del_model_name is not None:
            if del_model_name in overall_qa_result['qa_result']:
                del overall_qa_result['qa_result'][del_model_name]
                print(f"Removed model {del_model_name} from existing results.")

        if single_model_name in overall_qa_result['qa_result']:
            print(f"Model {single_model_name} already evaluated. Skipping.")
            return

        print(f"Running QA for single model: {single_model_name}")
        agent_config = get_agent_config(single_model_name)

        if args.poster_method == 'paper':
            # Note: relies on the module-level `gen_folder` set up in __main__ for the
            # 'paper' baseline (the per-page images copied into the temp folder).
            poster_images = open_folder_images(gen_folder, args.paper_name.replace(' ', '_'), format='jpg')
        else:
            poster_images = [Image.open(gen_poster_path)]

        poster_images = [ensure_under_limit_pil(image) for image in poster_images]
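        # ensure_under_limit_pil presumably downsizes each image so it stays within the
        # VLM's input size limits before being sent to the QA agent (assumption).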
|
|
|
|
|
        detail_accuracy, detail_aspect_accuracy, detail_agent_answers, input_token, output_token = eval_qa_get_answer(
            poster_input=poster_images,
            questions=detail_qa['questions'],
            answers=detail_qa['answers'],
            aspects=detail_qa['aspects'],
            input_type='image',
            agent_config=agent_config
        )
        qa_input_token += input_token
        qa_output_token += output_token
        print('Detail QA accuracy:', detail_accuracy)

        understanding_accuracy, understanding_aspect_accuracy, understanding_agent_answers, input_token, output_token = eval_qa_get_answer(
            poster_input=poster_images,
            questions=understanding_qa['questions'],
            answers=understanding_qa['answers'],
            aspects=understanding_qa['aspects'],
            input_type='image',
            agent_config=agent_config
        )
        qa_input_token += input_token
        qa_output_token += output_token
        print('Understanding QA accuracy:', understanding_accuracy)

        overall_qa_result['qa_result'][single_model_name] = {
            'detail_accuracy': detail_accuracy,
            'detail_aspect_accuracy': detail_aspect_accuracy,
            'detail_agent_answers': detail_agent_answers,
            'understanding_accuracy': understanding_accuracy,
            'understanding_aspect_accuracy': understanding_aspect_accuracy,
            'understanding_agent_answers': understanding_agent_answers
        }

        all_models_in_file = list(overall_qa_result['qa_result'].keys())
        detail_accs = []
        understanding_accs = []
        for m in all_models_in_file:
            detail_accs.append(overall_qa_result['qa_result'][m]['detail_accuracy'])
            understanding_accs.append(overall_qa_result['qa_result'][m]['understanding_accuracy'])

        avg_detail_accuracy = float(np.mean(detail_accs)) if detail_accs else 0.0
        avg_understanding_accuracy = float(np.mean(understanding_accs)) if understanding_accs else 0.0

        overall_qa_result['avg_detail_accuracy'] = avg_detail_accuracy
        overall_qa_result['avg_understanding_accuracy'] = avg_understanding_accuracy

        with open(existing_path, 'w') as f:
            json.dump(overall_qa_result, f, indent=4)

        print(f'Input tokens: {qa_input_token}')
        print(f'Output tokens: {qa_output_token}')

        print('Updated overall_qa_result.json with single-model results.')
        print('New average detail accuracy:', avg_detail_accuracy)
        print('New average understanding accuracy:', avg_understanding_accuracy)
|
|
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--paper_name', type=str)
    parser.add_argument('--base_dir', type=str, default='Paper2Poster-data')
    parser.add_argument('--poster_method', type=str)
    parser.add_argument('--poster_image_name', type=str, default='poster.png', choices=['poster.png'])
    parser.add_argument('--metric', type=str, choices=['stats', 'qa', 'judge', 'word_count', 'token_count', 'figure_count', 'aesthetic_judge'], default='stats')
    parser.add_argument('--fix', type=str, default=None)
    parser.add_argument('--del_model_name', type=str, default=None)

    args = parser.parse_args()
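    # Example invocations (script name and paper/method values are illustrative only):
    #   python <this_script>.py --paper_name <paper> --poster_method <method> --metric stats
    #   python <this_script>.py --paper_name <paper> --poster_method <method> --metric qa --fix 4o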
|
|
|
|
|
    raw_poster_path = f'{args.base_dir}/{args.paper_name}/poster.png'
    raw_folder = f'{args.base_dir}/{args.paper_name}'

    gen_poster_path = f'{args.poster_method}/{args.base_dir}/{args.paper_name}/{args.poster_image_name}'
    gen_folder = f'{args.poster_method}/{args.base_dir}/{args.paper_name}'

    save_path = f'eval_results/{args.paper_name}/{args.poster_method}'
    os.makedirs(save_path, exist_ok=True)

    if args.poster_method == 'paper':
        if args.metric == 'qa' and args.fix is not None:
            overall_qa_result = json.load(open(f'{save_path}/overall_qa_result.json', 'r'))
            if args.fix in overall_qa_result['qa_result']:
                print(f"Model {args.fix} already evaluated. Skipping.")
                exit(0)

        temp_dir = tempfile.mkdtemp(prefix="eval_temp", suffix="_data")

        paper_slug = args.paper_name.replace(' ', '_')
        source_dir = os.path.join('<4o_vllm_qwen>_images_and_tables', paper_slug)
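        # NOTE: '<4o_vllm_qwen>' looks like a placeholder left in the source; it presumably
        # has to match the actual name of the folder produced by the image/table extraction step.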
|
|
|
|
|
|
|
|
        index = 1
        while True:
            filename = f"{paper_slug}-{index}.png"
            src_path = os.path.join(source_dir, filename)
            if not os.path.isfile(src_path):
                break
            shutil.copy2(src_path, os.path.join(temp_dir, filename))
            index += 1
            # Cap at 20 page images unless counting words/tokens, which need every page.
            if index > 20 and args.metric not in ('word_count', 'token_count'):
                break

        gen_folder = temp_dir
        gen_poster_path = f'{args.base_dir}/{args.paper_name}/paper.pdf'
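        # For the 'paper' baseline, downstream code now reads the per-page images from the
        # temp folder and treats the original PDF as the "poster".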
|
|
|
|
|
|
|
|
    print('Evaluating poster:', args.paper_name)

    if args.metric == 'stats':
        stats_file = os.path.join(save_path, 'stats_result.json')

        if os.path.exists(stats_file):
            with open(stats_file, 'r') as f:
                stats_result = json.load(f)
            print(f"Loaded existing stats from {stats_file}")
        else:
            stats_result = {}
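        # Each metric below is computed only if its key is missing from stats_result,
        # so the script can be re-run to fill in gaps without redoing finished work.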
|
|
|
|
|
|
|
|
        if 'CLIP_similarity' not in stats_result:
            _, cos_sim = compare_folders_with_clip(raw_folder, gen_folder)
            stats_result['CLIP_similarity'] = cos_sim
            print(f'CLIP similarity: {cos_sim}')
        else:
            print(f"Skipping CLIP similarity (already {stats_result['CLIP_similarity']})")

        need_eval = any(k not in stats_result for k in ('textual_ppl', 'mixtual_ppl', 'visual_relevance', 'visual_ppl'))
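        # gen_eval_markdown is presumably the most expensive step here, so it is only run
        # when at least one of the four markdown-derived metrics is still missing.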
|
|
        if need_eval:
            images, poster_text, raw_markdown, new_markdown = gen_eval_markdown(
                args.paper_name,
                args.poster_method,
                gen_poster_path
            )
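            # Assumed return structure: `images` maps each extracted figure path to a dict
            # with CLIP embeddings of the figure and of its section text (used below for
            # visual_relevance); `poster_text` / `new_markdown` are the text-only and
            # text-plus-figure reconstructions of the poster.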
|
|
|
|
|
|
|
|
            if 'textual_ppl' not in stats_result:
                textual_ppl = get_ppl(poster_text)
                stats_result['textual_ppl'] = textual_ppl
                print(f'Textual PPL: {textual_ppl}')
            else:
                print(f"Skipping textual PPL (already {stats_result['textual_ppl']})")

            if 'mixtual_ppl' not in stats_result:
                mixtual_ppl = get_ppl(new_markdown)
                stats_result['mixtual_ppl'] = mixtual_ppl
                print(f'Mixtual PPL: {mixtual_ppl}')
            else:
                print(f"Skipping mixtual PPL (already {stats_result['mixtual_ppl']})")

            if 'visual_relevance' not in stats_result:
                if images:
                    sims = [
                        compute_cosine_similarity(v['image_clip_embedding'],
                                                  v['section_text_clip_embedding'])
                        for v in images.values()
                    ]
                    avg_sim = float(np.mean(sims))
                    stats_result['visual_relevance'] = avg_sim
                    print(f'Average cosine similarity: {avg_sim}')
                else:
                    stats_result['visual_relevance'] = 0.0
                    print('No images found in the poster. Set visual_relevance to 0.')
            else:
                print(f"Skipping visual relevance (already {stats_result['visual_relevance']})")

            if 'visual_ppl' not in stats_result or math.isnan(stats_result['visual_ppl']):
                visual_ppls = []
                for relative_path, v in images.items():
                    image_path = os.path.join('eval_poster_markdown', args.paper_name, args.poster_method, relative_path)
                    image = Image.open(image_path)
                    visual_ppl = get_visual_ppl(image, poster_text)
                    visual_ppls.append(visual_ppl)
                avg_visual_ppl = float(np.mean(visual_ppls))
                stats_result['visual_ppl'] = avg_visual_ppl
                print(f'Average visual PPL: {avg_visual_ppl}')
        else:
            print("All textual and visual metrics already computed; skipping gen_eval_markdown.")
|
|
        if 'interleaved_ppl' not in stats_result:
            interleaved_ppl = compute_interleaved_ppl(args.paper_name, args.poster_method)
            stats_result['interleaved_ppl'] = interleaved_ppl
            print(f'Interleaved PPL: {interleaved_ppl}')
        else:
            print(f"Skipping interleaved PPL (already {stats_result['interleaved_ppl']})")

        if 'poster_image_ppl' not in stats_result:
            if args.poster_method == 'paper':
                poster_images = open_folder_images(gen_folder, args.paper_name.replace(' ', '_'), format='jpg')
            else:
                poster_images = [Image.open(gen_poster_path)]
            poster_image_ppl = compute_poster_image_ppl(poster_images)
            stats_result['poster_image_ppl'] = poster_image_ppl
            print(f'Poster image PPL: {poster_image_ppl}')
        else:
            print(f"Skipping poster image PPL (already {stats_result['poster_image_ppl']})")

        with open(stats_file, 'w') as f:
            json.dump(stats_result, f, indent=4)
        print(f"Updated stats written to {stats_file}")
|
|
    elif args.metric == 'figure_count':
        save_file_path = os.path.join(save_path, 'figure_count.json')
        if os.path.exists(save_file_path):
            print(f"Figure count already exists at {save_file_path}. Skipping.")
        else:
            figure_count = gen_eval_markdown(
                args.paper_name,
                args.poster_method,
                gen_poster_path,
                figure_count_only=True
            )
            with open(save_file_path, 'w') as f:
                json.dump({'figure_count': figure_count}, f, indent=4)
            print(f"Figure count saved to {save_file_path}")
|
|
    elif args.metric == 'qa':
        if args.fix is not None:
            run_qa_and_update_results(
                args,
                raw_folder,
                gen_poster_path,
                save_path,
                single_model_name=args.fix,
                del_model_name=args.del_model_name
            )
        else:
            overall_qa_result = {}
            qa_result = {}
            qa_dict = json.load(open(os.path.join(raw_folder, 'o3_qa.json'), 'r'))
            detail_qa = qa_dict['detail']
            understanding_qa = qa_dict['understanding']
            model_names = [
                '4o',
                'o3',
                '4o-mini'
            ]
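            # Additional answer models (e.g. 'vllm_qwen_vl') can be evaluated afterwards
            # with --fix, which appends to the JSON written at the end of this branch.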
|
|
            if args.poster_method == 'paper':
                poster_images = open_folder_images(gen_folder, args.paper_name.replace(' ', '_'))
            else:
                poster_images = [Image.open(gen_poster_path)]

            poster_images = [ensure_under_limit_pil(image) for image in poster_images]

            for model_name in model_names:
                qa_input_token, qa_output_token = 0, 0
                print('QA model:', model_name)
                agent_config = get_agent_config(model_name)
                detail_accuracy, detail_aspect_accuracy, detail_agent_answers, input_token, output_token = eval_qa_get_answer(
                    poster_input=poster_images,
                    questions=detail_qa['questions'],
                    answers=detail_qa['answers'],
                    aspects=detail_qa['aspects'],
                    input_type='image',
                    agent_config=agent_config
                )
                print(f'{model_name} Detail QA accuracy:', detail_accuracy)
                qa_input_token += input_token
                qa_output_token += output_token

                understanding_accuracy, understanding_aspect_accuracy, understanding_agent_answers, input_token, output_token = eval_qa_get_answer(
                    poster_input=poster_images,
                    questions=understanding_qa['questions'],
                    answers=understanding_qa['answers'],
                    aspects=understanding_qa['aspects'],
                    input_type='image',
                    agent_config=agent_config
                )
                print(f'{model_name} Understanding QA accuracy:', understanding_accuracy)
                qa_input_token += input_token
                qa_output_token += output_token

                qa_result[model_name] = {
                    'detail_accuracy': detail_accuracy,
                    'detail_aspect_accuracy': detail_aspect_accuracy,
                    'detail_agent_answers': detail_agent_answers,
                    'understanding_accuracy': understanding_accuracy,
                    'understanding_aspect_accuracy': understanding_aspect_accuracy,
                    'understanding_agent_answers': understanding_agent_answers
                }

                print(f'{model_name} Input tokens:', qa_input_token)
                print(f'{model_name} Output tokens:', qa_output_token)

            avg_detail_accuracy = np.mean([qa_result[model_name]['detail_accuracy'] for model_name in model_names])
            avg_understanding_accuracy = np.mean([qa_result[model_name]['understanding_accuracy'] for model_name in model_names])

            print('Average detail accuracy:', avg_detail_accuracy)
            print('Average understanding accuracy:', avg_understanding_accuracy)

            overall_qa_result['avg_detail_accuracy'] = avg_detail_accuracy
            overall_qa_result['avg_understanding_accuracy'] = avg_understanding_accuracy
            overall_qa_result['qa_result'] = qa_result

            with open(f'{save_path}/overall_qa_result.json', 'w') as f:
                json.dump(overall_qa_result, f, indent=4)
|
|
    elif args.metric == 'word_count':
        if args.poster_method == 'paper':
            image_paths = open_folder_images(gen_folder, args.paper_name.replace(' ', '_'), return_path=True)
            word_count = 0
            for image_path in image_paths:
                word_count += count_words_in_image(image_path)
        else:
            word_count = count_words_in_image(gen_poster_path)

        with open(f'{save_path}/word_count.json', 'w') as f:
            json.dump({'word_count': word_count}, f, indent=4)

    elif args.metric == 'token_count':
        if args.poster_method == 'paper':
            image_paths = open_folder_images(gen_folder, args.paper_name.replace(' ', '_'), return_path=True)
            token_count = 0
            for image_path in image_paths:
                token_count += count_tokens_in_image(image_path)
        else:
            token_count = count_tokens_in_image(gen_poster_path)

        with open(f'{save_path}/token_count.json', 'w') as f:
            json.dump({'token_count': token_count}, f, indent=4)
|
|
    elif args.metric == 'judge':
        agent_config = get_agent_config('4o')

        if args.poster_method == 'paper':
            poster_images = open_folder_images(gen_folder, args.paper_name.replace(' ', '_'))
        else:
            poster_images = [Image.open(gen_poster_path)]

        results = eval_vlm_as_judge(
            poster_image_list=poster_images,
            agent_config=agent_config,
        )
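        # `results` is expected to map each judged aspect name to a dict containing at
        # least a 'score' entry; the averaging below relies on that shape.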
|
|
|
|
|
        aesthetic_aspects = [
            'aesthetic_element',
            'aesthetic_engagement',
            'aesthetic_layout'
        ]

        information_aspects = [
            'information_low_level',
            'information_logic',
            'information_content',
        ]

        overall_average = np.mean([results[aspect]['score'] for aspect in results])
        aesthetic_average = np.mean([results[aspect]['score'] for aspect in results if aspect in aesthetic_aspects])
        information_average = np.mean([results[aspect]['score'] for aspect in results if aspect in information_aspects])

        judge_result = {
            'overall_average': overall_average,
            'aesthetic_average': aesthetic_average,
            'information_average': information_average,
            'results': results
        }

        with open(f'{save_path}/judge_result.json', 'w') as f:
            json.dump(judge_result, f, indent=4)
|
|
    elif args.metric == 'aesthetic_judge':
        agent_config = get_agent_config('4o')

        if args.poster_method == 'paper':
            poster_images = open_folder_images(gen_folder, args.paper_name.replace(' ', '_'))
        else:
            poster_images = [Image.open(gen_poster_path)]

        results = eval_vlm_as_judge(
            poster_image_list=poster_images,
            agent_config=agent_config,
            aspect='aesthetic'
        )

        aesthetic_aspects = [
            'aesthetic_element',
            'aesthetic_engagement',
            'aesthetic_layout'
        ]

        aesthetic_average = np.mean([results[aspect]['score'] for aspect in results if aspect in aesthetic_aspects])

        judge_result = {
            'aesthetic_average': aesthetic_average,
            'results': results
        }

        with open(f'{save_path}/aesthetic_judge_result.json', 'w') as f:
            json.dump(judge_result, f, indent=4)
|
|
    if args.poster_method == 'paper':
        # Clean up the temporary folder of per-page images created above.
        shutil.rmtree(temp_dir)
        print(f"Removed temporary folder {temp_dir}")
|
|
|