# Spaces: JaceWei / PaperShow (Sleeping)
# File size: 21,174 Bytes
# 7c08dc3

from utils.poster_eval_utils import *
import json
from utils.wei_utils import get_agent_config
import argparse
from dotenv import load_dotenv
import tempfile
import shutil
import os
import glob
import re

load_dotenv()

def _recompute_averages_and_save(overall_qa_result, result_path):
    """Refresh the cross-model averages in *overall_qa_result* and write it to *result_path*.

    The averages are taken over every model entry currently present under
    ``overall_qa_result['qa_result']``; an empty result set yields 0.0.

    :return: Tuple ``(avg_detail_accuracy, avg_understanding_accuracy)``.
    """
    per_model = overall_qa_result['qa_result'].values()
    detail_accs = [entry['detail_accuracy'] for entry in per_model]
    understanding_accs = [entry['understanding_accuracy'] for entry in per_model]

    avg_detail_accuracy = float(np.mean(detail_accs)) if detail_accs else 0.0
    avg_understanding_accuracy = float(np.mean(understanding_accs)) if understanding_accs else 0.0

    overall_qa_result['avg_detail_accuracy'] = avg_detail_accuracy
    overall_qa_result['avg_understanding_accuracy'] = avg_understanding_accuracy

    with open(result_path, 'w') as f:
        json.dump(overall_qa_result, f, indent=4)

    return avg_detail_accuracy, avg_understanding_accuracy


def run_qa_and_update_results(
    args,
    raw_folder,
    gen_poster_path,
    save_path,
    single_model_name=None,
    del_model_name=None,
    gen_folder=None,
):
    """
    Run QA for one model and merge the outcome into an existing
    ``overall_qa_result.json``, then re-compute the cross-model averages.

    If ``single_model_name`` is None the function only loads ``o3_qa.json``
    and returns without evaluating or writing anything.

    If ``single_model_name`` is already present in the existing results the
    evaluation is skipped; a deletion requested via ``del_model_name`` is
    still written back (with refreshed averages) so that the file matches the
    "Removed model ..." message.

    :param args: Parsed CLI namespace; ``poster_method`` and ``paper_name`` are read.
    :param raw_folder: Path to folder with 'o3_qa.json'.
    :param gen_poster_path: Path to the generated poster image.
    :param save_path: Directory that already contains overall_qa_result.json.
    :param single_model_name: Name of the model to evaluate (passed to get_agent_config).
    :param del_model_name: Optional model entry to drop from the existing results first.
    :param gen_folder: Folder of page images used when ``args.poster_method == 'paper'``.
        When omitted, the module-level ``gen_folder`` set by the script entry
        point is used.
    :raises ValueError: In 'paper' mode when no ``gen_folder`` can be resolved.
    """

    # Load the QA data (questions, answers, aspects)
    with open(os.path.join(raw_folder, 'o3_qa.json'), 'r') as f:
        qa_dict = json.load(f)
    detail_qa = qa_dict['detail']
    understanding_qa = qa_dict['understanding']

    if single_model_name is None:
        return

    qa_input_token, qa_output_token = 0, 0

    # Load the existing JSON with all previously computed results
    existing_path = os.path.join(save_path, "overall_qa_result.json")
    with open(existing_path, 'r') as f:
        overall_qa_result = json.load(f)
    qa_result = overall_qa_result['qa_result']

    removed = False
    if del_model_name is not None and del_model_name in qa_result:
        del qa_result[del_model_name]
        removed = True
        print(f"Removed model {del_model_name} from existing results.")

    if single_model_name in qa_result:
        print(f"Model {single_model_name} already evaluated. Skipping.")
        if removed:
            # Persist the deletion; otherwise it would be silently discarded.
            _recompute_averages_and_save(overall_qa_result, existing_path)
        return

    # Evaluate QA for the single_model_name
    print(f"Running QA for single model: {single_model_name}")
    agent_config = get_agent_config(single_model_name)

    if args.poster_method == 'paper':
        # Prefer the explicit argument; fall back to the script-level global
        # so existing callers that do not pass gen_folder keep working.
        page_folder = gen_folder if gen_folder is not None else globals().get('gen_folder')
        if page_folder is None:
            raise ValueError("gen_folder is required when args.poster_method == 'paper'")
        poster_images = open_folder_images(page_folder, args.paper_name.replace(' ', '_'), format='jpg')
    else:
        poster_images = [Image.open(gen_poster_path)]

    poster_images = [ensure_under_limit_pil(image) for image in poster_images]

    detail_accuracy, detail_aspect_accuracy, detail_agent_answers, input_token, output_token = eval_qa_get_answer(
        poster_input=poster_images,
        questions=detail_qa['questions'],
        answers=detail_qa['answers'],
        aspects=detail_qa['aspects'],
        input_type='image',
        agent_config=agent_config
    )
    qa_input_token += input_token
    qa_output_token += output_token
    print('Detail QA accuracy:', detail_accuracy)

    understanding_accuracy, understanding_aspect_accuracy, understanding_agent_answers, input_token, output_token = eval_qa_get_answer(
        poster_input=poster_images,
        questions=understanding_qa['questions'],
        answers=understanding_qa['answers'],
        aspects=understanding_qa['aspects'],
        input_type='image',
        agent_config=agent_config
    )
    qa_input_token += input_token
    qa_output_token += output_token
    print('Understanding QA accuracy:', understanding_accuracy)

    # Add (or replace) this model's entry next to the previously stored ones
    qa_result[single_model_name] = {
        'detail_accuracy': detail_accuracy,
        'detail_aspect_accuracy': detail_aspect_accuracy,
        'detail_agent_answers': detail_agent_answers,
        'understanding_accuracy': understanding_accuracy,
        'understanding_aspect_accuracy': understanding_aspect_accuracy,
        'understanding_agent_answers': understanding_agent_answers
    }

    # Re-compute the averages across all models now present and overwrite the file
    avg_detail_accuracy, avg_understanding_accuracy = _recompute_averages_and_save(
        overall_qa_result, existing_path
    )

    print(f'Input tokens: {qa_input_token}')
    print(f'Output tokens: {qa_output_token}')

    print('Updated overall_qa_result.json with single-model results.')
    print('New average detail accuracy:', avg_detail_accuracy)
    print('New average understanding accuracy:', avg_understanding_accuracy)

if __name__ == '__main__':
    # Script entry point: evaluate one poster (or the source paper itself when
    # --poster_method is 'paper') with the metric selected by --metric and
    # store the result as JSON under eval_results/<paper_name>/<poster_method>.

    # `math.isnan` is needed by the stats branch; imported locally because the
    # file's explicit top-level imports do not include `math`.
    import math

    parser = argparse.ArgumentParser()
    parser.add_argument('--paper_name', type=str)
    parser.add_argument('--base_dir', type=str, default='Paper2Poster-data')
    parser.add_argument('--poster_method', type=str)
    parser.add_argument('--poster_image_name', type=str, default='poster.png', choices=['poster.png'])
    parser.add_argument('--metric', type=str, choices=['stats', 'qa', 'judge', 'word_count', 'token_count', 'figure_count', 'aesthetic_judge'], default='stats')
    # --fix: with --metric qa, evaluate only this model and merge it into the existing result file.
    parser.add_argument('--fix', type=str, default=None)
    # --del_model_name: model entry to drop from the existing QA results (only read together with --fix).
    parser.add_argument('--del_model_name', type=str, default=None)

    args = parser.parse_args()
    is_paper = args.poster_method == 'paper'

    # Reference poster and its folder (the folder also holds o3_qa.json).
    raw_poster_path = f'{args.base_dir}/{args.paper_name}/poster.png'
    raw_folder = f'{args.base_dir}/{args.paper_name}'

    # Generated poster and its folder. These stay module-level names on
    # purpose: run_qa_and_update_results looks `gen_folder` up at module level
    # when poster_method == 'paper'.
    gen_poster_path = f'{args.poster_method}/{args.base_dir}/{args.paper_name}/{args.poster_image_name}'
    gen_folder = f'{args.poster_method}/{args.base_dir}/{args.paper_name}'

    save_path = f'eval_results/{args.paper_name}/{args.poster_method}'
    os.makedirs(save_path, exist_ok=True)

    def _load_poster_images(**folder_kwargs):
        """Return the images to evaluate: all paper pages in 'paper' mode, else the single poster."""
        if is_paper:
            return open_folder_images(gen_folder, paper_slug, **folder_kwargs)
        return [Image.open(gen_poster_path)]

    temp_dir = None
    try:
        if is_paper:
            if args.metric == 'qa' and args.fix is not None:
                # Bail out before copying any pages if this model is already done.
                with open(f'{save_path}/overall_qa_result.json', 'r') as f:
                    overall_qa_result = json.load(f)
                if args.fix in overall_qa_result['qa_result']:
                    print(f"Model {args.fix} already evaluated. Skipping.")
                    raise SystemExit(0)

            # 1) Create a unique temp folder to hold the paper pages
            temp_dir = tempfile.mkdtemp(prefix="eval_temp", suffix="_data")

            # 2) Build the source directory path, replacing spaces
            paper_slug = args.paper_name.replace(' ', '_')
            source_dir = os.path.join('<4o_vllm_qwen>_images_and_tables', paper_slug)

            # 3) Sequentially copy files named "<paper_slug>-<index>.png".
            #    Only the word/token count metrics read every page; all other
            #    metrics stop after the first 20 pages.
            copy_all_pages = args.metric in ('word_count', 'token_count')
            index = 1
            while True:
                filename = f"{paper_slug}-{index}.png"
                src_path = os.path.join(source_dir, filename)
                if not os.path.isfile(src_path):
                    # stop once the next index is missing
                    break
                shutil.copy2(src_path, os.path.join(temp_dir, filename))
                index += 1
                if index > 20 and not copy_all_pages:
                    break

            gen_folder = temp_dir
            gen_poster_path = f'{args.base_dir}/{args.paper_name}/paper.pdf'

        print('Evaluating poster:', args.paper_name)

        if args.metric == 'stats':
            stats_file = os.path.join(save_path, 'stats_result.json')

            # 1) load existing results if there are any
            if os.path.exists(stats_file):
                with open(stats_file, 'r') as f:
                    stats_result = json.load(f)
                print(f"Loaded existing stats from {stats_file}")
            else:
                stats_result = {}

            # 2) CLIP similarity
            if 'CLIP_similarity' not in stats_result:
                _, cos_sim = compare_folders_with_clip(raw_folder, gen_folder)
                stats_result['CLIP_similarity'] = cos_sim
                print(f'CLIP similarity: {cos_sim}')
            else:
                print(f"Skipping CLIP similarity (already {stats_result['CLIP_similarity']})")

            # 3) we only need to regenerate markdown+images if any of the text/image metrics is missing
            need_eval = any(
                k not in stats_result
                for k in ('textual_ppl', 'mixtual_ppl', 'visual_relevance', 'visual_ppl')
            )
            if need_eval:
                images, poster_text, raw_markdown, new_markdown = gen_eval_markdown(
                    args.paper_name,
                    args.poster_method,
                    gen_poster_path
                )

                # textual PPL
                if 'textual_ppl' not in stats_result:
                    textual_ppl = get_ppl(poster_text)
                    stats_result['textual_ppl'] = textual_ppl
                    print(f'Textual PPL: {textual_ppl}')
                else:
                    print(f"Skipping textual PPL (already {stats_result['textual_ppl']})")

                # mixtual PPL
                if 'mixtual_ppl' not in stats_result:
                    mixtual_ppl = get_ppl(new_markdown)
                    stats_result['mixtual_ppl'] = mixtual_ppl
                    print(f'Mixtual PPL: {mixtual_ppl}')
                else:
                    print(f"Skipping mixtual PPL (already {stats_result['mixtual_ppl']})")

                # visual relevance
                if 'visual_relevance' not in stats_result:
                    if images:
                        sims = [
                            compute_cosine_similarity(v['image_clip_embedding'],
                                                      v['section_text_clip_embedding'])
                            for v in images.values()
                        ]
                        avg_sim = float(np.mean(sims))
                        stats_result['visual_relevance'] = avg_sim
                        print(f'Average cosine similarity: {avg_sim}')
                    else:
                        stats_result['visual_relevance'] = 0.0
                        print('No images found in the poster. Set visual_relevance to 0.')
                else:
                    print(f"Skipping visual relevance (already {stats_result['visual_relevance']})")

                # visual PPL (also recomputed when a stored value is NaN)
                if 'visual_ppl' not in stats_result or math.isnan(stats_result['visual_ppl']):
                    visual_ppls = []
                    # `images or {}` keeps the no-image case from crashing.
                    for relative_path in (images or {}):
                        image_path = os.path.join('eval_poster_markdown', args.paper_name, args.poster_method, relative_path)
                        image = Image.open(image_path)
                        visual_ppls.append(get_visual_ppl(image, poster_text))
                    # Same stored value as np.mean([]) (NaN) for the no-image
                    # case, but without the empty-slice RuntimeWarning.
                    avg_visual_ppl = float(np.mean(visual_ppls)) if visual_ppls else float('nan')
                    stats_result['visual_ppl'] = avg_visual_ppl
                    print(f'Average visual PPL: {avg_visual_ppl}')
                else:
                    print(f"Skipping visual PPL (already {stats_result['visual_ppl']})")
            else:
                print("All textual and visual metrics already computed; skipping gen_eval_markdown.")

            if 'interleaved_ppl' not in stats_result:
                interleaved_ppl = compute_interleaved_ppl(args.paper_name, args.poster_method)
                stats_result['interleaved_ppl'] = interleaved_ppl
                print(f'Interleaved PPL: {interleaved_ppl}')
            else:
                print(f"Skipping interleaved PPL (already {stats_result['interleaved_ppl']})")

            if 'poster_image_ppl' not in stats_result:
                # NOTE(review): pages are copied as .png above, yet format='jpg'
                # is passed here while the qa/judge branches use the default —
                # confirm what open_folder_images expects.
                poster_images = _load_poster_images(format='jpg')
                poster_image_ppl = compute_poster_image_ppl(poster_images)
                stats_result['poster_image_ppl'] = poster_image_ppl
                print(f'Poster image PPL: {poster_image_ppl}')
            else:
                print(f"Skipping poster image PPL (already {stats_result['poster_image_ppl']})")

            # 4) write back updated file
            with open(stats_file, 'w') as f:
                json.dump(stats_result, f, indent=4)
            print(f"Updated stats written to {stats_file}")

        elif args.metric == 'figure_count':
            save_file_path = os.path.join(save_path, 'figure_count.json')
            if os.path.exists(save_file_path):
                print(f"Figure count already exists at {save_file_path}. Skipping.")
            else:
                figure_count = gen_eval_markdown(
                    args.paper_name,
                    args.poster_method,
                    gen_poster_path,
                    figure_count_only=True
                )
                with open(save_file_path, 'w') as f:
                    json.dump({'figure_count': figure_count}, f, indent=4)
                print(f"Figure count saved to {save_file_path}")

        elif args.metric == 'qa':
            if args.fix is not None:
                # Single-model mode: merge into the existing result file.
                run_qa_and_update_results(
                    args,
                    raw_folder,
                    gen_poster_path,
                    save_path,
                    single_model_name=args.fix,
                    del_model_name=args.del_model_name
                )
            else:
                # Full mode: evaluate every model and write a fresh result file.
                with open(os.path.join(raw_folder, 'o3_qa.json'), 'r') as f:
                    qa_dict = json.load(f)
                detail_qa = qa_dict['detail']
                understanding_qa = qa_dict['understanding']
                model_names = [
                    '4o',
                    'o3',
                    '4o-mini'
                ]

                poster_images = [ensure_under_limit_pil(image) for image in _load_poster_images()]

                qa_result = {}
                for model_name in model_names:
                    qa_input_token, qa_output_token = 0, 0
                    print('QA model:', model_name)
                    agent_config = get_agent_config(model_name)
                    detail_accuracy, detail_aspect_accuracy, detail_agent_answers, input_token, output_token = eval_qa_get_answer(
                        poster_input=poster_images,
                        questions=detail_qa['questions'],
                        answers=detail_qa['answers'],
                        aspects=detail_qa['aspects'],
                        input_type='image',
                        agent_config=agent_config
                    )
                    print(f'{model_name} Detail QA accuracy:', detail_accuracy)
                    qa_input_token += input_token
                    qa_output_token += output_token

                    understanding_accuracy, understanding_aspect_accuracy, understanding_agent_answers, input_token, output_token = eval_qa_get_answer(
                        poster_input=poster_images,
                        questions=understanding_qa['questions'],
                        answers=understanding_qa['answers'],
                        aspects=understanding_qa['aspects'],
                        input_type='image',
                        agent_config=agent_config
                    )
                    print(f'{model_name} Understanding QA accuracy:', understanding_accuracy)
                    qa_input_token += input_token
                    qa_output_token += output_token

                    qa_result[model_name] = {
                        'detail_accuracy': detail_accuracy,
                        'detail_aspect_accuracy': detail_aspect_accuracy,
                        'detail_agent_answers': detail_agent_answers,
                        'understanding_accuracy': understanding_accuracy,
                        'understanding_aspect_accuracy': understanding_aspect_accuracy,
                        'understanding_agent_answers': understanding_agent_answers
                    }

                    print(f'{model_name} Input tokens:', qa_input_token)
                    print(f'{model_name} Output tokens:', qa_output_token)

                # average the results (plain floats, as in run_qa_and_update_results)
                avg_detail_accuracy = float(np.mean([qa_result[m]['detail_accuracy'] for m in model_names]))
                avg_understanding_accuracy = float(np.mean([qa_result[m]['understanding_accuracy'] for m in model_names]))

                print('Average detail accuracy:', avg_detail_accuracy)
                print('Average understanding accuracy:', avg_understanding_accuracy)

                overall_qa_result = {
                    'avg_detail_accuracy': avg_detail_accuracy,
                    'avg_understanding_accuracy': avg_understanding_accuracy,
                    'qa_result': qa_result,
                }

                with open(f'{save_path}/overall_qa_result.json', 'w') as f:
                    json.dump(overall_qa_result, f, indent=4)

        elif args.metric in ('word_count', 'token_count'):
            # Both metrics share one flow; only the counting helper differs.
            # The metric name doubles as the JSON key and the output file stem.
            count_in_image = count_words_in_image if args.metric == 'word_count' else count_tokens_in_image
            if is_paper:
                # sum over all page images in the folder
                image_paths = open_folder_images(gen_folder, paper_slug, return_path=True)
                total = sum(count_in_image(image_path) for image_path in image_paths)
            else:
                total = count_in_image(gen_poster_path)
            # save to json
            with open(f'{save_path}/{args.metric}.json', 'w') as f:
                json.dump({args.metric: total}, f, indent=4)

        elif args.metric == 'judge':
            agent_config = get_agent_config('4o')
            poster_images = _load_poster_images()

            results = eval_vlm_as_judge(
                poster_image_list=poster_images,
                agent_config=agent_config,
            )

            aesthetic_aspects = [
                'aesthetic_element',
                'aesthetic_engagement',
                'aesthetic_layout'
            ]

            information_aspects = [
                'information_low_level',
                'information_logic',
                'information_content',
            ]

            # compute average scores for all, for aesthetic, and for information
            overall_average = float(np.mean([results[aspect]['score'] for aspect in results]))
            aesthetic_average = float(np.mean([results[aspect]['score'] for aspect in results if aspect in aesthetic_aspects]))
            information_average = float(np.mean([results[aspect]['score'] for aspect in results if aspect in information_aspects]))

            judge_result = {
                'overall_average': overall_average,
                'aesthetic_average': aesthetic_average,
                'information_average': information_average,
                'results': results
            }

            # save to json
            with open(f'{save_path}/judge_result.json', 'w') as f:
                json.dump(judge_result, f, indent=4)

        elif args.metric == 'aesthetic_judge':
            agent_config = get_agent_config('4o')
            poster_images = _load_poster_images()

            results = eval_vlm_as_judge(
                poster_image_list=poster_images,
                agent_config=agent_config,
                aspect='aesthetic'
            )

            aesthetic_aspects = [
                'aesthetic_element',
                'aesthetic_engagement',
                'aesthetic_layout'
            ]

            aesthetic_average = float(np.mean([results[aspect]['score'] for aspect in results if aspect in aesthetic_aspects]))

            judge_result = {
                'aesthetic_average': aesthetic_average,
                'results': results
            }

            # save to json
            with open(f'{save_path}/aesthetic_judge_result.json', 'w') as f:
                json.dump(judge_result, f, indent=4)
    finally:
        # Always remove the temp folder, even when a metric raised.
        if temp_dir is not None:
            shutil.rmtree(temp_dir)
            print(f"Removed temporary folder {temp_dir}")