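"""Baseline document-to-slides generator.

Pipeline, as implemented below: (1) extract an outline of headings from the
source text with an LLM, (2) map each heading to 1-2 matching subsections of
a "bird's eye view" document summary, (3) generate bullet points per slide
from the mapped subsections, and (4) pair slides with images via CLIP
similarity before assembling the final .pptx.
"""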
import json
import os
from concurrent.futures import ThreadPoolExecutor
from glob import glob
from typing import Literal

import func_argparse
import jsonlines
from jinja2 import Template
from PIL import Image
from pptx import Presentation
import torch
from torch import cosine_similarity
from tqdm import tqdm
from transformers import CLIPModel, CLIPProcessor

import llms
from utils import edit_distance, ppt_to_images
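
# Prompt templates for the three LLM stages: outline extraction,
# heading-to-section mapping, and per-slide bullet generation.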

outline_template = Template(
    """
    From the following text, which contains a set of headings and some content under each heading:
    {{ text }}
    Extract the most important headings present in it. Shorten any lengthy heading to at most five words.
    Example Output:
    ["Heading 1", "Heading 2", "Heading 3"]
    Output: give your output as a list of strings in JSON format
    """
)
mapping_template = Template(
    """
    Think step by step and then answer the following question: You are given the following headings: {{outline_headings}}
    and a list of keys: {{document_heading_from_bird_eye_view}}
    Each key is associated with some text, as presented in the dictionary below:
    {{bird_eye_view}}
    The task is to find the 1-2 best-matching keys for each heading, based on the similarity between the text associated with each key and the heading.
    Example Output:
    thoughts...
    {"Heading 1": ["key1", "key2"], "Heading 2": ["key1", "key4"]}
    Output: give your final output as a dictionary in JSON format. All headings must be present in the output; no heading may be left out, and each heading must have at least one key.
    """
)
generation_template = Template(
    """
    You are a presentation generator working from a source of text. You have to generate slide number {{slide_index}}. Previous slide headings and slide contents are given below as a list of dictionaries: {{previous_slide}}
    Given the following slide heading and source of text, create the content of slide number {{slide_index}} such that:
    1. The slide has at most {{max_bullet}} bullet points.
    2. The content of the bullet points comes strictly from the given source of text only.
    3. The content of the slide is highly relevant to the given slide heading.
    4. Each bullet point has at most 10 words.
    5. The slide does not repeat any content from the previous slides.
    6. The overall presentation flows well.
    7. Do not prefix the slide title before the bullet points in the output.
    Slide Title: {{slide_heading}}
    Source of text: {{text}}
    Example Output:
    ["bullet point 1", "bullet point 2"]
    Output: give your output as a list of strings in JSON format
    """
)
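
# llms.language_model(prompt, return_json=True) is expected to return the
# model's reply parsed as JSON (a list or dict), per its usage below.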


def filter_aspect_ratio(images: list[str]):
    """Drop images with an extreme aspect ratio (long side >= 4x the short
    side), which are assumed to be banners or decoration rather than usable
    slide figures."""
    filtered_images = []
    for path in images:
        size = Image.open(path).size  # (width, height)
        long, short = max(size), min(size)
        if long / short < 4:
            filtered_images.append(path)
    return filtered_images


def get_indexed_sections(bird_eye: dict, keys: list[str]):
    """Collect subsections whose heading closely matches one of the given keys."""
    indexed_sections = []
    for section in bird_eye["sections"]:
        for subsection in section["subsections"]:
            # edit_distance is assumed to return a normalized similarity in [0, 1]
            if any(edit_distance(key, next(iter(subsection))) > 0.9 for key in keys):
                indexed_sections.append(subsection)
    return indexed_sections


def generate_content(source_text: str, bird_eye: dict, max_bullet: int):
    """Outline -> heading/section mapping -> per-slide bullets, via three LLM calls."""
    bird_eye_headings = []
    for section in bird_eye["sections"]:
        bird_eye_headings.extend(
            [next(iter(subsec)) for subsec in section["subsections"]]
        )
    outline: list[str] = llms.language_model(
        outline_template.render(text=source_text), return_json=True
    )
    assert len(outline) != 0, "No outline found"
    mapping = llms.language_model(
        mapping_template.render(
            outline_headings=outline,
            document_heading_from_bird_eye_view=bird_eye_headings,
            bird_eye_view=bird_eye,
        ),
        return_json=True,
    )
    slides = []
    for slide_title in outline:
        bullet_points = llms.language_model(
            generation_template.render(
                slide_heading=slide_title,
                text=get_indexed_sections(bird_eye, mapping.get(slide_title, [])),
                previous_slide=slides,
                max_bullet=max_bullet,
            ),
            return_json=True,
        )
        slides.append(
            {
                "title": slide_title,
                "bullets": bullet_points,
                "indexed_sections": mapping.get(slide_title, []),
            }
        )
    return slides
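

# Each slide dict returned above has the shape:
#   {"title": str, "bullets": list[str], "indexed_sections": list[str]}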


def generate_slides(
    output_dir: str,
    source_text: str,
    bird_eye: dict,
    images: list[str],
    model: CLIPModel,
    processor: CLIPProcessor,
):
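    """Assemble a deck: pick at most one image per slide via CLIP text-image
    similarity, then write final.pptx, final.jsonl, and rendered slide images."""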
    os.makedirs(output_dir, exist_ok=True)
    images = filter_aspect_ratio(images)
    slides = generate_content(source_text, bird_eye, max_bullet=7)
    # Embed candidate images and per-slide text with CLIP; inference only,
    # so gradient tracking is disabled.
    with torch.no_grad():
        image_embeddings = model.get_image_features(
            **processor(
                images=[Image.open(i) for i in images], return_tensors="pt"
            ).to("cuda")
        ).unsqueeze(0)  # (1, n_images, dim)
        text_embeddings = model.get_text_features(
            **processor(
                text=["\n".join(slide["bullets"]) for slide in slides],
                return_tensors="pt",
                padding=True,
                max_length=77,
                truncation=True,
            ).to("cuda")
        ).unsqueeze(1)  # (n_slides, 1, dim)
        # Broadcasting yields a (n_slides, n_images) similarity matrix.
        similarity = cosine_similarity(image_embeddings, text_embeddings, dim=-1)
    pptx = Presentation()
    for slide_idx, slide in enumerate(slides):
        title = slide["title"]
        bullets = slide["bullets"]

        subsimilarity = similarity[slide_idx]
        if subsimilarity.max() > 0.8:
            # Layout 8 ("Picture with Caption"): picture placeholder at idx 1,
            # body text placeholder at idx 2.
            pptx_slide = pptx.slides.add_slide(pptx.slide_layouts[8])
            bullets_placeholder = pptx_slide.shapes.placeholders[2]
            image = images[subsimilarity.argmax()]
            slides[slide_idx]["image"] = image
            pptx_slide.shapes.placeholders[1].insert_picture(image)
        else:
            # Layout 1 ("Title and Content"): body placeholder at idx 1.
            pptx_slide = pptx.slides.add_slide(pptx.slide_layouts[1])
            bullets_placeholder = pptx_slide.shapes.placeholders[1]
        pptx_slide.shapes.title.text = title
        text_frame = bullets_placeholder.text_frame
        for bullet in bullets:
            para = text_frame.add_paragraph()
            para.text = bullet
            para.level = 1
    with jsonlines.open(output_dir + "/final.jsonl", "w") as writer:
        writer.write_all(slides)
    pptx.save(output_dir + "/final.pptx")
    ppt_to_images(output_dir + "/final.pptx", output_dir + "/slide_images")


def generate(model: Literal["Qwen2.5", "gpt"]):
    """Run the baseline over every data/*/pdf/* folder with the chosen LLM."""
    if model == "Qwen2.5":
        llms.language_model = llms.qwen2_5
    elif model == "gpt":
        llms.language_model = llms.gpt4o

    print("Generating slides on baseline with", llms.language_model.model)
    llm_name = llms.get_simple_modelname(llms.language_model)
    clip_model = (
        CLIPModel.from_pretrained("openai/clip-vit-large-patch14").to("cuda").eval()
    )
    processor = CLIPProcessor.from_pretrained("openai/clip-vit-large-patch14")
    folders = list(glob("data/*/pdf/*"))
    progress = tqdm(total=len(folders))

    def process_folder(pdf_folder, model, processor):
        with open(f"{pdf_folder}/source.md") as f:
            source_text = f.read()
        with open(f"{pdf_folder}/refined_doc.json") as f:
            bird_eye = json.load(f)
        with open(f"{pdf_folder}/image_caption.json") as f:
            images = json.load(f).keys()
        output_dir = f"{pdf_folder}/docpres/{llm_name}"
        if os.path.exists(output_dir + "/final.jsonl"):
            progress.write(f"Skipping {pdf_folder}")
            progress.update(1)
            return
        try:
            generate_slides(
                output_dir,
                source_text,
                bird_eye,
                list(images),
                model,
                processor,
            )
        except Exception as e:
            progress.write(f"Error in {pdf_folder}: {e}")
        finally:
            progress.update(1)

    # Serial alternative, handy for debugging:
    # for folder in folders:
    #     process_folder(folder, clip_model, processor)

    with ThreadPoolExecutor() as executor:
        list(
            executor.map(lambda f: process_folder(f, clip_model, processor), folders)
        )


if __name__ == "__main__":
    func_argparse.main([generate])