"""Evaluation utilities for generated presentations: basic statistics, GPT-2
perplexity, FID against source slides, and LLM-judged content/style/coherence."""

import json
import os
import random
import shutil
import tempfile
from collections import defaultdict
from glob import glob
from typing import Literal

import func_argparse
import pytorch_fid.fid_score as fid
import torch
from jinja2 import Template
from pytorch_fid.fid_score import compute_statistics_of_path
from rich import print
from tqdm import tqdm
from transformers import GPT2LMHeadModel, GPT2TokenizerFast

import llms
from presentation import Picture, Presentation, SlidePage
from utils import Config, pexists, pjoin

# Silence pytorch_fid's internal progress bar.
fid.tqdm = lambda x: x

# Candidate judge configurations: (language model, vision model, label).
judges = [
    (llms.gpt4o, llms.gpt4o, "gpt4o"),
    (llms.qwen2_5, llms.intern_vl, "qwen+intern"),
    (llms.qwen2_5, llms.qwen_vl, "Qwen"),
    (llms.qwen_vl, llms.qwen_vl, "qwen_vl"),
    (llms.intern_vl, llms.intern_vl, "intern_vl"),
]
DEVICES = torch.cuda.device_count()


def get_ppl(slide: SlidePage, model: GPT2LMHeadModel, tokenizer: GPT2TokenizerFast):
    """Return the GPT-2 perplexity of a slide's text as a (possibly empty) list."""
    ppl = []
    text = slide.to_text()
    if len(text) == 0:
        return ppl
    tokenized = tokenizer(text, return_tensors="pt").to(model.device)
    with torch.no_grad():
        outputs = model(tokenized.input_ids, labels=tokenized.input_ids)
    loss = outputs.loss
    perplexity = torch.exp(loss)
    ppl.append(perplexity.item())
    return ppl


def eval_general(presentations: list[Presentation], evals: dict[str, dict]):
    """Collect page, character, and figure counts for each presentation."""
    for prs in presentations:
        if prs.source_file in evals["pages"]:
            continue
        evals["pages"][prs.source_file] = len(prs)
        evals["characters"][prs.source_file] = sum(
            len(slide.to_text()) for slide in prs.slides
        )
        evals["figures"][prs.source_file] = sum(
            len(list(slide.shape_filter(Picture))) for slide in prs.slides
        )


def eval_feature(
    presentations: list[Presentation],
    evals: dict,
    setting: str,
    fid_eval: bool = True,
):
    """Score perplexity for every presentation and, optionally, FID per ppt folder."""
    device = f"cuda:{random.randint(0, DEVICES - 1)}"
    print("start scoring ppl")
    model = GPT2LMHeadModel.from_pretrained("gpt2").to(device)
    tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
    for prs in tqdm(presentations):
        try:
            if prs.source_file in evals["ppl"]:
                continue
            if (
                prs.source_file
                == "data/culture/pptx/ChemBio-in-the-HUB-public/PPTCrew_wo_SchemaInduction/SSRN-id2933553_Management of Systems Engineering and Technical Assistance of DARPA Research Programs/final.pptx"
            ):
                # Known problematic file; skip it.
                continue
            ppl = []
            for slide in prs.slides:
                ppl.extend(get_ppl(slide, model, tokenizer))
            if len(ppl) == 0:
                continue
            evals["ppl"][prs.source_file] = sum(ppl) / len(ppl)
        except Exception as e:
            print(e, "\n", "happened in ", prs.source_file)

    if not fid_eval:
        return
    model = fid.InceptionV3([fid.InceptionV3.BLOCK_INDEX_BY_DIM[64]]).to(device)
    for ppt_folder in tqdm(sorted(glob("data/*/pptx/*/"))):
        if ppt_folder in evals["fid"]:
            continue
        source_folder = pjoin(ppt_folder, "source_slides")
        m1, s1 = compute_statistics_of_path(source_folder, model, 128, 64, device)
        try:
            with tempfile.TemporaryDirectory(prefix="ppteval_fid_") as temp_dir:
                # Gather the generated slide images into a flat directory for FID.
                for result_folder in glob(
                    pjoin(ppt_folder, f"final_images/{setting}/*")
                ):
                    folder_base = os.path.basename(result_folder)
                    for image_file in os.listdir(result_folder):
                        image_path = os.path.join(result_folder, image_file)
                        temp_image_path = os.path.join(
                            temp_dir, folder_base + "_" + image_file
                        ).replace(" ", "_")
                        shutil.copyfile(image_path, temp_image_path)
                if len(os.listdir(temp_dir)) < 10:
                    continue
                m2, s2 = compute_statistics_of_path(temp_dir, model, 32, 64, device)

                evals["fid"][ppt_folder] = fid.calculate_frechet_distance(
                    m1, s1, m2, s2
                )
        except Exception as e:
            print(e, "\n", "happened in ", ppt_folder, "on:", setting)


def merge_evals(folders: list[str], evals: dict):
    """Merge per-folder evals.json files into the aggregate evals dict."""
    for folder in folders:
        sub_eval = json.load(open(pjoin(folder, "evals.json")))
        for dimension in ["content", "vision", "logic"]:
            evals[dimension] |= sub_eval[dimension]
    return evals


def slide_score(slide_folder: str):
    """Score each slide image in a folder on content and vision (style) quality."""
    eval_file = pjoin(slide_folder, "evals.json")
    evals = defaultdict(dict)
    if pexists(eval_file):
        evals |= json.load(open(eval_file))
    text_scorer = Template(open("prompts/ppteval_content.txt", "r").read())
    vision_scorer = Template(open("prompts/ppteval_style.txt", "r").read())
    style_descriptor = open("prompts/ppteval_describe_style.txt", "r").read()
    content_descriptor = open("prompts/ppteval_describe_content.txt", "r").read()
    for slide_image in glob(pjoin(slide_folder, "slide_*.jpg")):
        slide_descr = slide_image.replace(".jpg", ".json")
        if not os.path.exists(slide_descr):
            style_descr = llms.vision_model(style_descriptor, slide_image)
            content_descr = llms.vision_model(content_descriptor, slide_image)
            json.dump(
                {"content": content_descr, "style": style_descr},
                open(slide_descr, "w"),
                indent=4,
            )
        else:
            descr = json.load(open(slide_descr))
            style_descr = descr["style"]
            content_descr = descr["content"]
        if slide_image not in evals["vision"]:
            evals["vision"][slide_image] = llms.language_model(
                vision_scorer.render(descr=style_descr), return_json=True
            )
        if slide_image not in evals["content"]:
            evals["content"][slide_image] = llms.language_model(
                text_scorer.render(descr=content_descr), return_json=True
            )
    # Persist the scores so merge_evals can pick them up later.
    json.dump(evals, open(eval_file, "w"), indent=4)


def pres_score(prs_source: str):
    """Score a presentation's logical coherence from its extracted structure."""
    if "/pptx/" in prs_source:
        source, setting, pdf, _ = prs_source.rsplit("/", 3)
        slide_folder = os.path.join(source, "final_images", setting, pdf)
    else:
        slide_folder = os.path.dirname(prs_source)
    eval_file = pjoin(slide_folder, "evals.json")
    evals = defaultdict(dict)
    if pexists(eval_file):
        try:
            evals |= json.load(open(eval_file))
        except Exception:
            pass
    evals.pop("logic", None)

    config = Config("/tmp")
    presentation = Presentation.from_file(prs_source, config)
    slide_descr = pjoin(slide_folder, "extracted.json")
    if not pexists(slide_descr):
        ppt_extractor = Template(open("prompts/ppteval_extract.txt", "r").read())
        extracted = llms.language_model(
            ppt_extractor.render(presentation=presentation.to_text()),
            return_json=True,
        )
        json.dump(extracted, open(slide_descr, "w"), indent=4)
    else:
        extracted = json.load(open(slide_descr))
    if presentation.source_file not in evals["logic"]:
        logic_scorer = Template(open("prompts/ppteval_coherence.txt", "r").read())
        evals["logic"][presentation.source_file] = llms.language_model(
            logic_scorer.render(
                background_information=extracted.pop("metadata"),
                logical_structure=extracted,
            ),
            return_json=True,
        )
    json.dump(evals, open(eval_file, "w"), indent=4)


def eval_experiment(
    setting: str,
    general_eval: bool = False,
    feature_eval: bool = False,
    ppt_eval: bool = False,
):
    """Run the selected evaluations for a setting and write the aggregate results."""
    assert setting != "*"
    llms.language_model, llms.vision_model, judge_name = judges[0]
    print(f"evaluating {setting} under {judge_name}")
    print(
        "eval config:",
        f"general_eval: {general_eval}, feature_eval: {feature_eval}, ppt_eval: {ppt_eval}",
    )
    eval_file = f"data/evals/{setting}_{judge_name}.json"
    eval_stats = defaultdict(dict)
    if pexists(eval_file):
        eval_stats |= json.load(open(eval_file))
    config = Config("/tmp")
    prs_files = glob(f"data/*/pptx/*/{setting}/*/final.pptx")

    print("start evaluation")
    if general_eval or feature_eval:
        presentations = [Presentation.from_file(i, config) for i in prs_files]
        if general_eval:
            eval_general(presentations, eval_stats)

        if feature_eval:
            eval_feature(presentations, eval_stats, setting)

    if ppt_eval:
        slide_image_folders = glob(f"data/*/pptx/*/final_images/{setting}/*")
        for presentation in prs_files:
            pres_score(presentation)
        eval_stats = merge_evals(slide_image_folders, eval_stats)
    json.dump(eval_stats, open(eval_file, "w"), indent=4)


def eval_baseline(
    setting: str,
    model: Literal["Qwen2.5", "gpt-4o"],
    general_eval: bool = False,
    feature_eval: bool = False,
    ppt_eval: bool = False,
):
    """Evaluate baseline outputs stored under data/*/pdf/*/<setting>/<model>/."""
    evals = defaultdict(dict)
    prs_files = glob(f"data/*/pdf/*/{setting}/{model}/final.pptx")
    slide_folders = [os.path.dirname(i) for i in prs_files]

    if general_eval or feature_eval:
        config = Config("/tmp")
        presentations = [Presentation.from_file(i, config) for i in prs_files]

        if general_eval:
            eval_general(presentations, evals)
        if feature_eval:
            eval_feature(presentations, evals, setting, fid_eval=False)
    if ppt_eval:
        for slide_folder in slide_folders:
            slide_score(slide_folder)
        for presentation in prs_files:
            pres_score(presentation)

    merge_evals(slide_folders, evals)
    json.dump(evals, open(f"data/evals/{setting}_{model}.json", "w"), indent=4)


if __name__ == "__main__":
    func_argparse.main(
        eval_experiment,
        eval_baseline,
        pres_score,
        slide_score,
    )
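
# Usage sketch (assumptions: this module is saved as evaluate.py and run directly;
# func_argparse exposes each function passed to main() as a sub-command whose
# parameters become command-line flags):
#
#   python evaluate.py eval_experiment --setting <setting> --general_eval --feature_eval
#   python evaluate.py eval_baseline --setting <setting> --model gpt-4o --ppt_eval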