JaceWei committed on
Commit
0d563bd
·
1 Parent(s): 337c99f

update: push latest content

.gitignore CHANGED
@@ -1,5 +1,6 @@
1
  input/
2
- output/Paper2Poster/assets/
 
3
  Paper2Video/assets/
4
  posterbuilder/latex_proj/figures/
5
  *.png
@@ -8,11 +9,4 @@ posterbuilder/latex_proj/figures/
8
  *.wav
9
  *.mp4
10
  __pycache__/
11
- *.png
12
- *.jpg
13
- *.pdf
14
- *.wav
15
- *.mp4
16
- Paper2Poster/assets/
17
- Paper2Video/assets/
18
- posterbuilder/latex_proj/figures/
 
1
  input/
2
+ output/
3
+ Paper2Poster/assets/
4
  Paper2Video/assets/
5
  posterbuilder/latex_proj/figures/
6
  *.png
 
9
  *.wav
10
  *.mp4
11
  __pycache__/
12
+
 
 
 
 
 
 
 
Paper2Poster/PosterAgent/new_pipeline.py CHANGED
@@ -129,63 +129,63 @@ if __name__ == '__main__':
129
  detail_log['parser_in_t'] = input_token
130
  detail_log['parser_out_t'] = output_token
131
 
132
- # Initialize LogoManager
133
- logo_manager = LogoManager()
134
- institution_logo_path = args.institution_logo_path
135
- conference_logo_path = args.conference_logo_path
136
-
137
- # Auto-detect institution from paper if not provided
138
- # Now using the raw_result directly instead of reading from file
139
- if not institution_logo_path:
140
- print("\n" + "="*60)
141
- print("🔍 AUTO-DETECTING INSTITUTION FROM PAPER")
142
- print("="*60)
143
-
144
- # Use the raw_result we already have from the parser
145
- if raw_result:
146
- print(f"📄 Using parsed paper content")
147
- # Extract text content from the ConversionResult object
148
- try:
149
- paper_text = raw_result.document.export_to_markdown()
150
- except:
151
- # Fallback: try to get text content in another way
152
- paper_text = str(raw_result)
153
-
154
- print("🔎 Searching for FIRST AUTHOR's institution...")
155
- first_author_inst = logo_manager.extract_first_author_institution(paper_text)
156
-
157
- if first_author_inst:
158
- print(f"\n✅ FIRST AUTHOR INSTITUTION: {first_author_inst}")
159
- print(f"🔍 Searching for logo: {first_author_inst}")
160
-
161
- inst_logo_path = logo_manager.get_logo_path(first_author_inst, category="institute", use_google=args.use_google_search)
162
- if inst_logo_path:
163
- institution_logo_path = str(inst_logo_path)
164
- print(f"✅ Institution logo found: {institution_logo_path}")
165
- else:
166
- print(f"❌ Could not find/download logo for: {first_author_inst}")
167
- else:
168
- print("❌ No first author institution detected or matched with available logos")
169
- else:
170
- print("❌ No parsed content available")
171
- print("="*60 + "\n")
172
-
173
- # Handle conference logo
174
- if args.conference_venue and not conference_logo_path:
175
- print("\n" + "="*60)
176
- print("🏛️ SEARCHING FOR CONFERENCE LOGO")
177
- print("="*60)
178
- print(f"📍 Conference: {args.conference_venue}")
179
- print(f"🔍 Searching for logo...")
180
-
181
- conf_logo_path = logo_manager.get_logo_path(args.conference_venue, category="conference", use_google=args.use_google_search)
182
- if conf_logo_path:
183
- conference_logo_path = str(conf_logo_path)
184
- print(f"✅ Conference logo found: {conference_logo_path}")
185
- else:
186
- print(f"❌ Could not find/download logo for: {args.conference_venue}")
187
- # Note: Web search is now handled inside get_logo_path automatically
188
- print("="*60 + "\n")
189
 
190
  # Step 2: Filter unnecessary images and tables
191
  input_token, output_token = filter_image_table(args, agent_config_t)
@@ -217,87 +217,142 @@ if __name__ == '__main__':
217
  detail_log['outline_in_t'] = input_token
218
  detail_log['outline_out_t'] = output_token
219
 
220
- if args.ablation_no_tree_layout:
221
- panel_arrangement, figure_arrangement, text_arrangement, input_token, output_token = no_tree_get_layout(
222
- poster_width,
223
- poster_height,
224
- panels,
225
- figures,
226
- agent_config_t
227
- )
228
- total_input_tokens_t += input_token
229
- total_output_tokens_t += output_token
230
- print(f'No tree layout token consumption: {input_token} -> {output_token}')
231
- detail_log['no_tree_layout_in_t'] = input_token
232
- detail_log['no_tree_layout_out_t'] = output_token
233
- else:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
234
 
235
- # Step 4: Learn and generate layout
236
- panel_model_params, figure_model_params = main_train()
237
-
238
- panel_arrangement, figure_arrangement, text_arrangement = main_inference(
239
- panels,
240
- panel_model_params,
241
- figure_model_params,
242
- poster_width,
243
- poster_height,
244
- shrink_margin=3
245
- )
246
-
247
- text_arrangement_title = text_arrangement[0]
248
- text_arrangement = text_arrangement[1:]
249
- # Split the title textbox into two parts
250
- text_arrangement_title_top, text_arrangement_title_bottom = split_textbox(
251
- text_arrangement_title,
252
- 0.8
253
- )
254
- # Add the split textboxes back to the list
255
- text_arrangement = [text_arrangement_title_top, text_arrangement_title_bottom] + text_arrangement
256
-
257
- for i in range(len(figure_arrangement)):
258
- panel_id = figure_arrangement[i]['panel_id']
259
- panel_section_name = panels[panel_id]['section_name']
260
- figure_info = figures[panel_section_name]
261
- if 'image' in figure_info:
262
- figure_id = figure_info['image']
263
- if not figure_id in images:
264
- figure_path = images[str(figure_id)]['image_path']
265
- else:
266
- figure_path = images[figure_id]['image_path']
267
- elif 'table' in figure_info:
268
- figure_id = figure_info['table']
269
- if not figure_id in tables:
270
- figure_path = tables[str(figure_id)]['table_path']
271
- else:
272
- figure_path = tables[figure_id]['table_path']
273
-
274
- figure_arrangement[i]['figure_path'] = figure_path
275
-
276
- for text_arrangement_item in text_arrangement:
277
- num_chars = char_capacity(
278
- bbox=(text_arrangement_item['x'], text_arrangement_item['y'], text_arrangement_item['height'], text_arrangement_item['width'])
279
- )
280
- text_arrangement_item['num_chars'] = num_chars
281
-
282
-
283
- width_inch, height_inch, panel_arrangement_inches, figure_arrangement_inches, text_arrangement_inches = get_arrangments_in_inches(
284
- poster_width, poster_height, panel_arrangement, figure_arrangement, text_arrangement, 25
285
- )
286
-
287
- # Save to file
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
288
  tree_split_results = {
289
- 'poster_width': poster_width,
290
- 'poster_height': poster_height,
291
- 'poster_width_inches': width_inch,
292
- 'poster_height_inches': height_inch,
293
  'panels': panels,
294
- 'panel_arrangement': panel_arrangement,
295
- 'figure_arrangement': figure_arrangement,
296
- 'text_arrangement': text_arrangement,
297
- 'panel_arrangement_inches': panel_arrangement_inches,
298
- 'figure_arrangement_inches': figure_arrangement_inches,
299
- 'text_arrangement_inches': text_arrangement_inches,
300
  }
 
301
  os.makedirs('tree_splits', exist_ok=True)
302
  with open(f'tree_splits/<{args.model_name_t}_{args.model_name_v}>_{args.poster_name}_tree_split_{args.index}.json', 'w') as f:
303
  json.dump(tree_split_results, f, indent=4)
 
129
  detail_log['parser_in_t'] = input_token
130
  detail_log['parser_out_t'] = output_token
131
 
132
+ # # Initialize LogoManager
133
+ # logo_manager = LogoManager()
134
+ # institution_logo_path = args.institution_logo_path
135
+ # conference_logo_path = args.conference_logo_path
136
+
137
+ # # Auto-detect institution from paper if not provided
138
+ # # Now using the raw_result directly instead of reading from file
139
+ # if not institution_logo_path:
140
+ # print("\n" + "="*60)
141
+ # print("🔍 AUTO-DETECTING INSTITUTION FROM PAPER")
142
+ # print("="*60)
143
+
144
+ # # Use the raw_result we already have from the parser
145
+ # if raw_result:
146
+ # print(f"📄 Using parsed paper content")
147
+ # # Extract text content from the ConversionResult object
148
+ # try:
149
+ # paper_text = raw_result.document.export_to_markdown()
150
+ # except:
151
+ # # Fallback: try to get text content in another way
152
+ # paper_text = str(raw_result)
153
+
154
+ # print("🔎 Searching for FIRST AUTHOR's institution...")
155
+ # first_author_inst = logo_manager.extract_first_author_institution(paper_text)
156
+
157
+ # if first_author_inst:
158
+ # print(f"\n✅ FIRST AUTHOR INSTITUTION: {first_author_inst}")
159
+ # print(f"🔍 Searching for logo: {first_author_inst}")
160
+
161
+ # inst_logo_path = logo_manager.get_logo_path(first_author_inst, category="institute", use_google=args.use_google_search)
162
+ # if inst_logo_path:
163
+ # institution_logo_path = str(inst_logo_path)
164
+ # print(f"✅ Institution logo found: {institution_logo_path}")
165
+ # else:
166
+ # print(f"❌ Could not find/download logo for: {first_author_inst}")
167
+ # else:
168
+ # print("❌ No first author institution detected or matched with available logos")
169
+ # else:
170
+ # print("❌ No parsed content available")
171
+ # print("="*60 + "\n")
172
+
173
+ # # Handle conference logo
174
+ # if args.conference_venue and not conference_logo_path:
175
+ # print("\n" + "="*60)
176
+ # print("🏛️ SEARCHING FOR CONFERENCE LOGO")
177
+ # print("="*60)
178
+ # print(f"📍 Conference: {args.conference_venue}")
179
+ # print(f"🔍 Searching for logo...")
180
+
181
+ # conf_logo_path = logo_manager.get_logo_path(args.conference_venue, category="conference", use_google=args.use_google_search)
182
+ # if conf_logo_path:
183
+ # conference_logo_path = str(conf_logo_path)
184
+ # print(f"✅ Conference logo found: {conference_logo_path}")
185
+ # else:
186
+ # print(f"❌ Could not find/download logo for: {args.conference_venue}")
187
+ # # Note: Web search is now handled inside get_logo_path automatically
188
+ # print("="*60 + "\n")
189
 
190
  # Step 2: Filter unnecessary images and tables
191
  input_token, output_token = filter_image_table(args, agent_config_t)
 
217
  detail_log['outline_in_t'] = input_token
218
  detail_log['outline_out_t'] = output_token
219
 
220
+ # if args.ablation_no_tree_layout:
221
+ # panel_arrangement, figure_arrangement, text_arrangement, input_token, output_token = no_tree_get_layout(
222
+ # poster_width,
223
+ # poster_height,
224
+ # panels,
225
+ # figures,
226
+ # agent_config_t
227
+ # )
228
+ # total_input_tokens_t += input_token
229
+ # total_output_tokens_t += output_token
230
+ # print(f'No tree layout token consumption: {input_token} -> {output_token}')
231
+ # detail_log['no_tree_layout_in_t'] = input_token
232
+ # detail_log['no_tree_layout_out_t'] = output_token
233
+ # else:
234
+
235
+ # # Step 4: Learn and generate layout
236
+ # panel_model_params, figure_model_params = main_train()
237
+
238
+ # panel_arrangement, figure_arrangement, text_arrangement = main_inference(
239
+ # panels,
240
+ # panel_model_params,
241
+ # figure_model_params,
242
+ # poster_width,
243
+ # poster_height,
244
+ # shrink_margin=3
245
+ # )
246
+
247
+ # text_arrangement_title = text_arrangement[0]
248
+ # text_arrangement = text_arrangement[1:]
249
+ # # Split the title textbox into two parts
250
+ # text_arrangement_title_top, text_arrangement_title_bottom = split_textbox(
251
+ # text_arrangement_title,
252
+ # 0.8
253
+ # )
254
+ # # Add the split textboxes back to the list
255
+ # text_arrangement = [text_arrangement_title_top, text_arrangement_title_bottom] + text_arrangement
256
+
257
+ # for i in range(len(figure_arrangement)):
258
+ # panel_id = figure_arrangement[i]['panel_id']
259
+ # panel_section_name = panels[panel_id]['section_name']
260
+ # figure_info = figures[panel_section_name]
261
+ # if 'image' in figure_info:
262
+ # figure_id = figure_info['image']
263
+ # if not figure_id in images:
264
+ # figure_path = images[str(figure_id)]['image_path']
265
+ # else:
266
+ # figure_path = images[figure_id]['image_path']
267
+ # elif 'table' in figure_info:
268
+ # figure_id = figure_info['table']
269
+ # if not figure_id in tables:
270
+ # figure_path = tables[str(figure_id)]['table_path']
271
+ # else:
272
+ # figure_path = tables[figure_id]['table_path']
273
+
274
+ # figure_arrangement[i]['figure_path'] = figure_path
275
+
276
+ # for text_arrangement_item in text_arrangement:
277
+ # num_chars = char_capacity(
278
+ # bbox=(text_arrangement_item['x'], text_arrangement_item['y'], text_arrangement_item['height'], text_arrangement_item['width'])
279
+ # )
280
+ # text_arrangement_item['num_chars'] = num_chars
281
+
282
 
283
+ # width_inch, height_inch, panel_arrangement_inches, figure_arrangement_inches, text_arrangement_inches = get_arrangments_in_inches(
284
+ # poster_width, poster_height, panel_arrangement, figure_arrangement, text_arrangement, 25
285
+ # )
286
+
287
+ # # Save to file
288
+ # tree_split_results = {
289
+ # 'poster_width': poster_width,
290
+ # 'poster_height': poster_height,
291
+ # 'poster_width_inches': width_inch,
292
+ # 'poster_height_inches': height_inch,
293
+ # 'panels': panels,
294
+ # 'panel_arrangement': panel_arrangement,
295
+ # 'figure_arrangement': figure_arrangement,
296
+ # 'text_arrangement': text_arrangement,
297
+ # 'panel_arrangement_inches': panel_arrangement_inches,
298
+ # 'figure_arrangement_inches': figure_arrangement_inches,
299
+ # 'text_arrangement_inches': text_arrangement_inches,
300
+ # }
301
+
302
+ # ============================
303
+ # ### NEW: only build a simple figure_arrangement with {panel_id, figure_path}
304
+ # ============================
305
+
306
+ # Some projects keep images/tables in an upstream global scope; if they are undefined here, fall back to loading them from the filtered-result JSON files
307
+ try:
308
+ images
309
+ except NameError:
310
+ images = json.load(open(f'<{args.model_name_t}_{args.model_name_v}>_images_and_tables/{args.poster_name}_images_filtered.json', 'r'))
311
+ try:
312
+ tables
313
+ except NameError:
314
+ tables = json.load(open(f'<{args.model_name_t}_{args.model_name_v}>_images_and_tables/{args.poster_name}_tables_filtered.json', 'r'))
315
+
316
+ # Build a section_name -> panel_id mapping
317
+ section2pid = {p['section_name']: p['panel_id'] for p in panels}
318
+
319
+ # Build the simplified figure_arrangement: keep only panel_id and figure_path
320
+ simple_figure_arrangement = []
321
+ for section_name, f in figures.items():
322
+ if section_name not in section2pid:
323
+ continue
324
+ pid = section2pid[section_name]
325
+
326
+ fig_path = None
327
+ if 'image' in f:
328
+ fid = str(f['image'])
329
+ info = images.get(fid) or images.get(str(fid)) or {}
330
+ fig_path = info.get('image_path')
331
+ elif 'table' in f:
332
+ tid = str(f['table'])
333
+ info = tables.get(tid) or tables.get(str(tid)) or {}
334
+ fig_path = info.get('table_path')
335
+
336
+ if fig_path: # collect only entries that have a valid path
337
+ simple_figure_arrangement.append({
338
+ 'panel_id': pid,
339
+ 'figure_path': fig_path,
340
+ })
341
+
342
+ # ============================
343
+ # ### REMOVED: no layout/train/text capacity/inches conversion
344
+ # - removed the args.ablation_no_tree_layout branch
345
+ # - removed main_train() / main_inference()
346
+ # - removed the loop that filled figure_path into figure_arrangement[i]
347
+ # - removed text_arrangement / char_capacity / get_arrangments_in_inches
348
+ # ============================
349
+
350
+ # Save to file (keep only panels + figure_arrangement)
351
  tree_split_results = {
 
 
 
 
352
  'panels': panels,
353
+ 'figure_arrangement': simple_figure_arrangement,
 
 
 
 
 
354
  }
355
+
356
  os.makedirs('tree_splits', exist_ok=True)
357
  with open(f'tree_splits/<{args.model_name_t}_{args.model_name_v}>_{args.poster_name}_tree_split_{args.index}.json', 'w') as f:
358
  json.dump(tree_split_results, f, indent=4)
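For reference, the tree_split JSON written above now reduces to the panels plus the simplified figure_arrangement (panel_id/figure_path pairs). A minimal sketch of reading it back downstream, assuming a hypothetical file name (the real name is built from the model names, poster name, and index as in the code above):

import json

# Hypothetical path; the pipeline constructs the real file name at save time.
with open('tree_splits/<gpt-5_gpt-5>_demo_tree_split_0.json') as f:
    tree_split = json.load(f)

print(len(tree_split['panels']), 'panels')
for item in tree_split['figure_arrangement']:
    # Each entry keeps only the owning panel and the asset path.
    print(item['panel_id'], '->', item['figure_path'])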
Paper2Poster/PosterAgent/parse_raw.py CHANGED
@@ -114,12 +114,12 @@ def parse_raw(args, actor_config, version=2):
114
  print(f"Ouch! The response is invalid, the LLM is not following the format :(")
115
  print('Trying again...')
116
  raise
117
- if 'title' in section['title'].lower():
118
- has_title = True
119
 
120
- if not has_title:
121
- print('Ouch! The response is invalid, the LLM is not following the format :(')
122
- raise
123
 
124
  os.makedirs('contents', exist_ok=True)
125
  json.dump(content_json, open(f'contents/<{args.model_name_t}_{args.model_name_v}>_{args.poster_name}_raw_content.json', 'w'), indent=4)
 
114
  print(f"Ouch! The response is invalid, the LLM is not following the format :(")
115
  print('Trying again...')
116
  raise
117
+ # if 'title' in section['title'].lower():
118
+ # has_title = True
119
 
120
+ # if not has_title:
121
+ # print('Ouch! The response is invalid, the LLM is not following the format :(')
122
+ # raise
123
 
124
  os.makedirs('contents', exist_ok=True)
125
  json.dump(content_json, open(f'contents/<{args.model_name_t}_{args.model_name_v}>_{args.poster_name}_raw_content.json', 'w'), indent=4)
Paper2Poster/utils/prompt_templates/poster_planner_new_v2.yaml CHANGED
@@ -20,6 +20,7 @@ system_prompt: |
20
  • The final output must be a single JSON object, mapping from section names to the chosen image/table ID plus the “reason” field.
21
  • Extra note: If multiple images or tables are suitable, select the single best one and assign only that.
22
  • If “image_information” or “table_information” is empty, you may end up assigning nothing to any section.
 
23
 
24
  template: |
25
  Instructions:
 
20
  • The final output must be a single JSON object, mapping from section names to the chosen image/table ID plus the “reason” field.
21
  • Extra note: If multiple images or tables are suitable, select the single best one and assign only that.
22
  • If “image_information” or “table_information” is empty, you may end up assigning nothing to any section.
23
+ • If there is a “teaser” or “overview” figure (often summarizing the entire paper or framework), prioritize assigning it to the first section or placing it at the beginning of the poster.
24
 
25
  template: |
26
  Instructions:
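To illustrate the new teaser/overview rule, the planner's JSON output could look roughly like the sketch below; section names, IDs, and reasons are hypothetical and depend on the paper.

# Hypothetical planner assignment: the teaser figure is given to the first section.
planner_assignment = {
    "Introduction": {"image": 1, "reason": "Teaser figure summarizing the whole framework."},
    "Experiments": {"table": 3, "reason": "Main results table with the key comparisons."},
}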
Paper2Poster/utils/prompts/gen_poster_raw_content_v2.txt CHANGED
@@ -1,28 +1,35 @@
1
- You are a document content divider and extractor specialist, expert in dividing and extracting content from various types of documents and reorganizing it into a two-level json format for later poster generation.
2
- And a LaTeX-enhanced document structuring and extraction specialist, expert in dividing academic content into logical sections and automatically emphasizing key ideas with LaTeX formatting suitable for Beamer poster generation.
 
3
 
4
  Based on given markdown document, generate a JSON output for direct insertion into a LaTeX Beamer poster, make sure the output is concise and focused.
5
 
 
 
 
6
  Step-by-Step Instructions:
7
  1. Identify Sections and Subsections in document and identify sections and subsections based on the heading levels and logical structure.
8
 
9
- 2. Divide Content: Reorganize the content into sections and subsections, ensuring that each subsection contains approximately 500 words.
10
 
11
- 3. Refine Titles: Create titles for each section with at most 3 words.
12
 
13
  4. Remove Unwanted Elements: Eliminate any unwanted elements such as headers, footers, text surrounded by `~~` indicating deletion.
14
 
15
- 5. Refine Text: For content, you should keep as much raw text as possible. Do not include citations.
16
 
17
- 6. Length: you should control the length of each section, according to their importance according to your understanding of the paper. For important sections, their content should be long.
18
 
19
  7. Make sure there is a poster title section at the beginning, and it should contain information like paper title, author, organization etc.
20
 
21
- 8. The "meta" key contains the meta information of the poster, where the title should be the raw title of the paper and is not summarized.
 
 
 
 
 
22
 
23
- 9. Ther **must** be a section for the poster title.
24
-
25
- 10. **IMPORTANT** Within the section content, use LaTeX commands to improve readability:
26
  - Use `\textbf{}` to emphasize *key terms, results, and conclusions*.
27
  - Use `\textit{}` for *concepts or variable names*.
28
  - Use `\textcolor{blue}{}` for *important statistics, numerical values, or methods*.
@@ -30,6 +37,11 @@ Step-by-Step Instructions:
30
  - NEVER output “extbf”, “extit”, or “extcolor”.
31
  - The final output must compile directly in LaTeX Beamer poster without errors.
32
 
 
 
 
 
 
33
  Example Output:
34
  {
35
  "meta": {
@@ -39,17 +51,16 @@ Example Output:
39
  },
40
  "sections": [
41
  {
42
- "title": "Poster Title & Author",
43
- "content": "content of poster title and author"
44
  },
45
  {
46
- "title": "title of section1",
47
- "content": "content of section 1 (e.g. We aim to \textbf{clarify the causal relationships} in text-to-image models. Previous studies \textcolor{red}{overlooked confounding embeddings}, leading to poor interpretability.)"
48
  },
49
  {
50
- "title": "title of section2",
51
- "content": "content of section 2 (e.g. The optimized model achieves \textcolor{blue}{+12.5\% accuracy improvement} and significantly reduces \textcolor{red}{semantic leakage}. Results demonstrate that \textbf{embedding optimization improves fidelity}.)"
52
- }
53
  ]
54
  }
55
 
 
1
+ You are a poster content designer & academic summarization expert, skilled in condensing complex ideas into visually engaging, high-impact content suitable for LaTeX Beamer posters.
2
+
3
+ Your goal is to extract, refine, and restructure the given markdown document into a 2-level JSON format that reads like a poster — concise, visually balanced, and lively.
4
 
5
  Based on given markdown document, generate a JSON output for direct insertion into a LaTeX Beamer poster, make sure the output is concise and focused.
6
 
7
+ Key goals: balance brevity, energy, and clarity — every line should look like it belongs on a poster.
8
+ Style: write with high-energy, action-oriented phrasing (e.g., “We reveal...”, “Our framework boosts...”), avoiding a passive, overly academic tone.
9
+
10
  Step-by-Step Instructions:
11
  1. Identify Sections and Subsections in document and identify sections and subsections based on the heading levels and logical structure.
12
 
13
+ 2. Divide Content: Reorganize the content into sections (each under 70 words, 7–8 sections in total), focusing on key findings, core methods, and \textbf{what’s new in this work}. Include at least one focus marker every 15–20 words.
14
 
15
+ 3. Refine Titles: Create a title for each section with at most 6 words. Make titles dynamic and memorable, neither too short nor too long.
16
 
17
  4. Remove Unwanted Elements: Eliminate any unwanted elements such as headers, footers, text surrounded by `~~` indicating deletion.
18
 
19
+ 5. Refine Text: Write as if explaining to an intelligent but busy audience at a poster session. Content should stay crisp and energetic. Remove redundant descriptions, long background context, and citation markers.
20
 
21
+ 6. You may use symbols (bullet points) for logical progression, but keep at most 3 per section and use them only when necessary.
22
 
23
  7. Make sure there is a poster title section at the beginning, and it should contain information like paper title, author, organization etc.
24
 
25
+ 8. The "meta" key contains the meta information of the poster, where the title **MUST** be the **RAW** title of the paper, not a summary.
26
+ Do NOT summarize, translate, or rephrase any of these fields.
27
+ - For authors, use LaTeX superscript notation to indicate institutional affiliation clearly.
28
+ Example: "Lily\textsuperscript{1}, Bob\textsuperscript{2}"
29
+ - Match superscripts with their corresponding institution numbers in the "affiliations" field, formatted as:
30
+ "1 Department of AI, NUS; 2 School of Computing, NTU"
31
 
32
+ 9. **IMPORTANT** Within the section content, use LaTeX commands to improve readability:
 
 
33
  - Use `\textbf{}` to emphasize *key terms, results, and conclusions*.
34
  - Use `\textit{}` for *concepts or variable names*.
35
  - Use `\textcolor{blue}{}` for *important statistics, numerical values, or methods*.
 
37
  - NEVER output “extbf”, “extit”, or “extcolor”.
38
  - The final output must compile directly in LaTeX Beamer poster without errors.
39
 
40
+ 10. Strictly skip the title and author part:
41
+ Under NO circumstances should the model include any title, author name, or affiliation text found in the first section of the input document.
42
+ These elements are pre-defined via the “meta” field and must not be repeated in the generated sections.
43
+ The model should start directly with the main content, skipping all title and author information.
44
+
45
  Example Output:
46
  {
47
  "meta": {
 
51
  },
52
  "sections": [
53
  {
54
+ "title": "Motivation of this work",
55
+ "content": "We aim to \\textbf{clarify the causal relationships} in text-to-image models. Previous studies \\textcolor{red}{overlooked confounding embeddings}, leading to poor interpretability."
56
  },
57
  {
58
+ "title": "What makes this task challenging",
59
+ "content": " \\textbf{Text-to-image generation} involves complex multi-modal dependencies.\\\\• Embeddings encode both \\textcolor{red}{semantic} and \\textcolor{blue}{stylistic} information.\\\\• Disentangle causal factors and ensure faithful image synthesis."
60
  },
61
  {
62
+ "title": "Key Contributions"
63
+ }
 
64
  ]
65
  }
66
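For clarity, a "meta" block that follows step 8 above would look roughly like the sketch below; the field names follow the instructions, and the names and affiliations are the prompt's own illustrative placeholders, not real paper data.

# Illustrative meta block: raw title, superscripted authors, numbered affiliations.
meta = {
    "title": "Raw Paper Title Goes Here",
    "authors": "Lily\\textsuperscript{1}, Bob\\textsuperscript{2}",
    "affiliations": "1 Department of AI, NUS; 2 School of Computing, NTU",
}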
 
Paper2Poster/utils/src/model_utils.py CHANGED
@@ -3,95 +3,95 @@ import os
3
  from copy import deepcopy
4
 
5
  import numpy as np
6
- import torch
7
- import torchvision.transforms as T
8
  from FlagEmbedding import BGEM3FlagModel
9
  from marker.config.parser import ConfigParser
10
  from marker.converters.pdf import PdfConverter
11
  from marker.output import text_from_rendered
12
  from PIL import Image
13
- from torchvision.transforms.functional import InterpolationMode
14
- from transformers import AutoFeatureExtractor, AutoModel
15
 
16
  from utils.src.presentation import Presentation, SlidePage
17
  from utils.src.utils import is_image_path, pjoin
18
 
19
- device_count = torch.cuda.device_count()
20
-
21
-
22
- def prs_dedup(
23
- presentation: Presentation,
24
- model: BGEM3FlagModel,
25
- batchsize: int = 32,
26
- threshold: float = 0.8,
27
- ) -> list[SlidePage]:
28
- """
29
- Deduplicate slides in a presentation based on text similarity.
30
-
31
- Args:
32
- presentation (Presentation): The presentation object containing slides.
33
- model: The model used for generating text embeddings.
34
- batchsize (int): The batch size for processing slides.
35
- threshold (float): The similarity threshold for deduplication.
36
-
37
- Returns:
38
- list: A list of removed duplicate slides.
39
- """
40
- text_embeddings = get_text_embedding(
41
- [i.to_text() for i in presentation.slides], model, batchsize
42
- )
43
- pre_embedding = text_embeddings[0]
44
- slide_idx = 1
45
- duplicates = []
46
- while slide_idx < len(presentation):
47
- cur_embedding = text_embeddings[slide_idx]
48
- if torch.cosine_similarity(pre_embedding, cur_embedding, -1) > threshold:
49
- duplicates.append(slide_idx - 1)
50
- slide_idx += 1
51
- pre_embedding = cur_embedding
52
- return [presentation.slides.pop(i) for i in reversed(duplicates)]
53
-
54
-
55
- def get_text_model(device: str = None) -> BGEM3FlagModel:
56
- """
57
- Initialize and return a text model.
58
-
59
- Args:
60
- device (str): The device to run the model on.
61
-
62
- Returns:
63
- BGEM3FlagModel: The initialized text model.
64
- """
65
- return BGEM3FlagModel(
66
- "BAAI/bge-m3",
67
- use_fp16=True,
68
- device=device,
69
- )
70
-
71
-
72
- def get_image_model(device: str = None):
73
- """
74
- Initialize and return an image model and its feature extractor.
75
-
76
- Args:
77
- device (str): The device to run the model on.
78
-
79
- Returns:
80
- tuple: A tuple containing the feature extractor and the image model.
81
- """
82
- model_base = "google/vit-base-patch16-224-in21k"
83
- return (
84
- AutoFeatureExtractor.from_pretrained(
85
- model_base,
86
- torch_dtype=torch.float16,
87
- device_map=device,
88
- ),
89
- AutoModel.from_pretrained(
90
- model_base,
91
- torch_dtype=torch.float16,
92
- device_map=device,
93
- ).eval(),
94
- )
95
 
96
 
97
  def parse_pdf(
@@ -140,158 +140,158 @@ def parse_pdf(
140
  return full_text
141
 
142
 
143
- def get_text_embedding(
144
- text: list[str], model: BGEM3FlagModel, batchsize: int = 32
145
- ) -> list[torch.Tensor]:
146
- """
147
- Generate text embeddings for a list of text strings.
148
-
149
- Args:
150
- text (list[str]): A list of text strings.
151
- model: The model used for generating embeddings.
152
- batchsize (int): The batch size for processing text.
153
-
154
- Returns:
155
- list: A list of text embeddings.
156
- """
157
- if isinstance(text, str):
158
- return torch.tensor(model.encode(text)["dense_vecs"]).to(model.device)
159
- result = []
160
- for i in range(0, len(text), batchsize):
161
- result.extend(
162
- torch.tensor(model.encode(text[i : i + batchsize])["dense_vecs"]).to(
163
- model.device
164
- )
165
- )
166
- return result
167
-
168
-
169
- def get_image_embedding(
170
- image_dir: str, extractor, model, batchsize: int = 16
171
- ) -> dict[str, torch.Tensor]:
172
- """
173
- Generate image embeddings for images in a directory.
174
-
175
- Args:
176
- image_dir (str): The directory containing images.
177
- extractor: The feature extractor for images.
178
- model: The model used for generating embeddings.
179
- batchsize (int): The batch size for processing images.
180
-
181
- Returns:
182
- dict: A dictionary mapping image filenames to their embeddings.
183
- """
184
- transform = T.Compose(
185
- [
186
- T.Resize(int((256 / 224) * extractor.size["height"])),
187
- T.CenterCrop(extractor.size["height"]),
188
- T.ToTensor(),
189
- T.Normalize(mean=extractor.image_mean, std=extractor.image_std),
190
- ]
191
- )
192
-
193
- inputs = []
194
- embeddings = []
195
- images = [i for i in sorted(os.listdir(image_dir)) if is_image_path(i)]
196
- for file in images:
197
- image = Image.open(pjoin(image_dir, file)).convert("RGB")
198
- inputs.append(transform(image))
199
- if len(inputs) % batchsize == 0 or file == images[-1]:
200
- batch = {"pixel_values": torch.stack(inputs).to(model.device)}
201
- embeddings.extend(model(**batch).last_hidden_state.detach())
202
- inputs.clear()
203
- return {image: embedding.flatten() for image, embedding in zip(images, embeddings)}
204
-
205
-
206
- def images_cosine_similarity(embeddings: list[torch.Tensor]) -> torch.Tensor:
207
- """
208
- Calculate the cosine similarity matrix for a list of embeddings.
209
- Args:
210
- embeddings (list[torch.Tensor]): A list of image embeddings.
211
-
212
- Returns:
213
- torch.Tensor: A NxN similarity matrix.
214
- """
215
- embeddings = [embedding for embedding in embeddings]
216
- sim_matrix = torch.zeros((len(embeddings), len(embeddings)))
217
- for i in range(len(embeddings)):
218
- for j in range(i + 1, len(embeddings)):
219
- sim_matrix[i, j] = sim_matrix[j, i] = torch.cosine_similarity(
220
- embeddings[i], embeddings[j], -1
221
- )
222
- return sim_matrix
223
 
224
 
225
  IMAGENET_MEAN = (0.485, 0.456, 0.406)
226
  IMAGENET_STD = (0.229, 0.224, 0.225)
227
 
228
 
229
- def average_distance(
230
- similarity: torch.Tensor, idx: int, cluster_idx: list[int]
231
- ) -> float:
232
- """
233
- Calculate the average distance between a point (idx) and a cluster (cluster_idx).
234
-
235
- Args:
236
- similarity (np.ndarray): The similarity matrix.
237
- idx (int): The index of the point.
238
- cluster_idx (list): The indices of the cluster.
239
-
240
- Returns:
241
- float: The average distance.
242
- """
243
- if idx in cluster_idx:
244
- return 0
245
- total_similarity = 0
246
- for idx_in_cluster in cluster_idx:
247
- total_similarity += similarity[idx, idx_in_cluster]
248
- return total_similarity / len(cluster_idx)
249
-
250
-
251
- def get_cluster(similarity: np.ndarray, sim_bound: float = 0.65):
252
- """
253
- Cluster points based on similarity.
254
-
255
- Args:
256
- similarity (np.ndarray): The similarity matrix.
257
- sim_bound (float): The similarity threshold for clustering.
258
-
259
- Returns:
260
- list: A list of clusters.
261
- """
262
- num_points = similarity.shape[0]
263
- clusters = []
264
- sim_copy = deepcopy(similarity)
265
- added = [False] * num_points
266
- while True:
267
- max_avg_dist = sim_bound
268
- best_cluster = None
269
- best_point = None
270
-
271
- for c in clusters:
272
- for point_idx in range(num_points):
273
- if added[point_idx]:
274
- continue
275
- avg_dist = average_distance(sim_copy, point_idx, c)
276
- if avg_dist > max_avg_dist:
277
- max_avg_dist = avg_dist
278
- best_cluster = c
279
- best_point = point_idx
280
-
281
- if best_point is not None:
282
- best_cluster.append(best_point)
283
- added[best_point] = True
284
- similarity[best_point, :] = 0
285
- similarity[:, best_point] = 0
286
- else:
287
- if similarity.max() < sim_bound:
288
- break
289
- i, j = np.unravel_index(np.argmax(similarity), similarity.shape)
290
- clusters.append([int(i), int(j)])
291
- added[i] = True
292
- added[j] = True
293
- similarity[i, :] = 0
294
- similarity[:, i] = 0
295
- similarity[j, :] = 0
296
- similarity[:, j] = 0
297
- return clusters
 
3
  from copy import deepcopy
4
 
5
  import numpy as np
6
+ # import torch
7
+ # import torchvision.transforms as T
8
  from FlagEmbedding import BGEM3FlagModel
9
  from marker.config.parser import ConfigParser
10
  from marker.converters.pdf import PdfConverter
11
  from marker.output import text_from_rendered
12
  from PIL import Image
13
+ # from torchvision.transforms.functional import InterpolationMode
14
+ # from transformers import AutoFeatureExtractor, AutoModel
15
 
16
  from utils.src.presentation import Presentation, SlidePage
17
  from utils.src.utils import is_image_path, pjoin
18
 
19
+ # device_count = torch.cuda.device_count()
20
+
21
+
22
+ # def prs_dedup(
23
+ # presentation: Presentation,
24
+ # model: BGEM3FlagModel,
25
+ # batchsize: int = 32,
26
+ # threshold: float = 0.8,
27
+ # ) -> list[SlidePage]:
28
+ # """
29
+ # Deduplicate slides in a presentation based on text similarity.
30
+
31
+ # Args:
32
+ # presentation (Presentation): The presentation object containing slides.
33
+ # model: The model used for generating text embeddings.
34
+ # batchsize (int): The batch size for processing slides.
35
+ # threshold (float): The similarity threshold for deduplication.
36
+
37
+ # Returns:
38
+ # list: A list of removed duplicate slides.
39
+ # """
40
+ # text_embeddings = get_text_embedding(
41
+ # [i.to_text() for i in presentation.slides], model, batchsize
42
+ # )
43
+ # pre_embedding = text_embeddings[0]
44
+ # slide_idx = 1
45
+ # duplicates = []
46
+ # while slide_idx < len(presentation):
47
+ # cur_embedding = text_embeddings[slide_idx]
48
+ # if torch.cosine_similarity(pre_embedding, cur_embedding, -1) > threshold:
49
+ # duplicates.append(slide_idx - 1)
50
+ # slide_idx += 1
51
+ # pre_embedding = cur_embedding
52
+ # return [presentation.slides.pop(i) for i in reversed(duplicates)]
53
+
54
+
55
+ # def get_text_model(device: str = None) -> BGEM3FlagModel:
56
+ # """
57
+ # Initialize and return a text model.
58
+
59
+ # Args:
60
+ # device (str): The device to run the model on.
61
+
62
+ # Returns:
63
+ # BGEM3FlagModel: The initialized text model.
64
+ # """
65
+ # return BGEM3FlagModel(
66
+ # "BAAI/bge-m3",
67
+ # use_fp16=True,
68
+ # device=device,
69
+ # )
70
+
71
+
72
+ # def get_image_model(device: str = None):
73
+ # """
74
+ # Initialize and return an image model and its feature extractor.
75
+
76
+ # Args:
77
+ # device (str): The device to run the model on.
78
+
79
+ # Returns:
80
+ # tuple: A tuple containing the feature extractor and the image model.
81
+ # """
82
+ # model_base = "google/vit-base-patch16-224-in21k"
83
+ # return (
84
+ # AutoFeatureExtractor.from_pretrained(
85
+ # model_base,
86
+ # torch_dtype=torch.float16,
87
+ # device_map=device,
88
+ # ),
89
+ # AutoModel.from_pretrained(
90
+ # model_base,
91
+ # torch_dtype=torch.float16,
92
+ # device_map=device,
93
+ # ).eval(),
94
+ # )
95
 
96
 
97
  def parse_pdf(
 
140
  return full_text
141
 
142
 
143
+ # def get_text_embedding(
144
+ # text: list[str], model: BGEM3FlagModel, batchsize: int = 32
145
+ # ) -> list[torch.Tensor]:
146
+ # """
147
+ # Generate text embeddings for a list of text strings.
148
+
149
+ # Args:
150
+ # text (list[str]): A list of text strings.
151
+ # model: The model used for generating embeddings.
152
+ # batchsize (int): The batch size for processing text.
153
+
154
+ # Returns:
155
+ # list: A list of text embeddings.
156
+ # """
157
+ # if isinstance(text, str):
158
+ # return torch.tensor(model.encode(text)["dense_vecs"]).to(model.device)
159
+ # result = []
160
+ # for i in range(0, len(text), batchsize):
161
+ # result.extend(
162
+ # torch.tensor(model.encode(text[i : i + batchsize])["dense_vecs"]).to(
163
+ # model.device
164
+ # )
165
+ # )
166
+ # return result
167
+
168
+
169
+ # def get_image_embedding(
170
+ # image_dir: str, extractor, model, batchsize: int = 16
171
+ # ) -> dict[str, torch.Tensor]:
172
+ # """
173
+ # Generate image embeddings for images in a directory.
174
+
175
+ # Args:
176
+ # image_dir (str): The directory containing images.
177
+ # extractor: The feature extractor for images.
178
+ # model: The model used for generating embeddings.
179
+ # batchsize (int): The batch size for processing images.
180
+
181
+ # Returns:
182
+ # dict: A dictionary mapping image filenames to their embeddings.
183
+ # """
184
+ # transform = T.Compose(
185
+ # [
186
+ # T.Resize(int((256 / 224) * extractor.size["height"])),
187
+ # T.CenterCrop(extractor.size["height"]),
188
+ # T.ToTensor(),
189
+ # T.Normalize(mean=extractor.image_mean, std=extractor.image_std),
190
+ # ]
191
+ # )
192
+
193
+ # inputs = []
194
+ # embeddings = []
195
+ # images = [i for i in sorted(os.listdir(image_dir)) if is_image_path(i)]
196
+ # for file in images:
197
+ # image = Image.open(pjoin(image_dir, file)).convert("RGB")
198
+ # inputs.append(transform(image))
199
+ # if len(inputs) % batchsize == 0 or file == images[-1]:
200
+ # batch = {"pixel_values": torch.stack(inputs).to(model.device)}
201
+ # embeddings.extend(model(**batch).last_hidden_state.detach())
202
+ # inputs.clear()
203
+ # return {image: embedding.flatten() for image, embedding in zip(images, embeddings)}
204
+
205
+
206
+ # def images_cosine_similarity(embeddings: list[torch.Tensor]) -> torch.Tensor:
207
+ # """
208
+ # Calculate the cosine similarity matrix for a list of embeddings.
209
+ # Args:
210
+ # embeddings (list[torch.Tensor]): A list of image embeddings.
211
+
212
+ # Returns:
213
+ # torch.Tensor: A NxN similarity matrix.
214
+ # """
215
+ # embeddings = [embedding for embedding in embeddings]
216
+ # sim_matrix = torch.zeros((len(embeddings), len(embeddings)))
217
+ # for i in range(len(embeddings)):
218
+ # for j in range(i + 1, len(embeddings)):
219
+ # sim_matrix[i, j] = sim_matrix[j, i] = torch.cosine_similarity(
220
+ # embeddings[i], embeddings[j], -1
221
+ # )
222
+ # return sim_matrix
223
 
224
 
225
  IMAGENET_MEAN = (0.485, 0.456, 0.406)
226
  IMAGENET_STD = (0.229, 0.224, 0.225)
227
 
228
 
229
+ # def average_distance(
230
+ # similarity: torch.Tensor, idx: int, cluster_idx: list[int]
231
+ # ) -> float:
232
+ # """
233
+ # Calculate the average distance between a point (idx) and a cluster (cluster_idx).
234
+
235
+ # Args:
236
+ # similarity (np.ndarray): The similarity matrix.
237
+ # idx (int): The index of the point.
238
+ # cluster_idx (list): The indices of the cluster.
239
+
240
+ # Returns:
241
+ # float: The average distance.
242
+ # """
243
+ # if idx in cluster_idx:
244
+ # return 0
245
+ # total_similarity = 0
246
+ # for idx_in_cluster in cluster_idx:
247
+ # total_similarity += similarity[idx, idx_in_cluster]
248
+ # return total_similarity / len(cluster_idx)
249
+
250
+
251
+ # def get_cluster(similarity: np.ndarray, sim_bound: float = 0.65):
252
+ # """
253
+ # Cluster points based on similarity.
254
+
255
+ # Args:
256
+ # similarity (np.ndarray): The similarity matrix.
257
+ # sim_bound (float): The similarity threshold for clustering.
258
+
259
+ # Returns:
260
+ # list: A list of clusters.
261
+ # """
262
+ # num_points = similarity.shape[0]
263
+ # clusters = []
264
+ # sim_copy = deepcopy(similarity)
265
+ # added = [False] * num_points
266
+ # while True:
267
+ # max_avg_dist = sim_bound
268
+ # best_cluster = None
269
+ # best_point = None
270
+
271
+ # for c in clusters:
272
+ # for point_idx in range(num_points):
273
+ # if added[point_idx]:
274
+ # continue
275
+ # avg_dist = average_distance(sim_copy, point_idx, c)
276
+ # if avg_dist > max_avg_dist:
277
+ # max_avg_dist = avg_dist
278
+ # best_cluster = c
279
+ # best_point = point_idx
280
+
281
+ # if best_point is not None:
282
+ # best_cluster.append(best_point)
283
+ # added[best_point] = True
284
+ # similarity[best_point, :] = 0
285
+ # similarity[:, best_point] = 0
286
+ # else:
287
+ # if similarity.max() < sim_bound:
288
+ # break
289
+ # i, j = np.unravel_index(np.argmax(similarity), similarity.shape)
290
+ # clusters.append([int(i), int(j)])
291
+ # added[i] = True
292
+ # added[j] = True
293
+ # similarity[i, :] = 0
294
+ # similarity[:, i] = 0
295
+ # similarity[j, :] = 0
296
+ # similarity[:, j] = 0
297
+ # return clusters
app.py CHANGED
@@ -1,22 +1,24 @@
1
- import gradio as gr
2
  import subprocess, shutil, os, zipfile, datetime
3
  from pathlib import Path
4
 
5
  ROOT = Path(__file__).resolve().parent
6
  OUTPUT_DIR = ROOT / "output"
 
 
 
7
  ZIP_PATH = ROOT / "output.zip"
8
  LOG_PATH = ROOT / "last_run.log"
9
 
10
- def run_pipeline(model_name_t, model_name_v, result_dir, paper_latex_root, arxiv_url, openai_key, gemini_key):
11
  start_time = datetime.datetime.now()
12
  logs = [f"🚀 Starting pipeline at {start_time.strftime('%Y-%m-%d %H:%M:%S')}\n"]
13
 
14
- # 🧩 确保 output 目录存在(避免 No output generated)
15
- if not OUTPUT_DIR.exists():
16
- OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
17
- logs.append(f"📁 Created output directory: {OUTPUT_DIR}\n")
18
 
19
- # 🧹 清理旧输出(但保留空目录)
20
  for item in OUTPUT_DIR.iterdir():
21
  if item.is_dir():
22
  shutil.rmtree(item)
@@ -24,26 +26,74 @@ def run_pipeline(model_name_t, model_name_v, result_dir, paper_latex_root, arxiv
24
  item.unlink()
25
  if ZIP_PATH.exists():
26
  ZIP_PATH.unlink()
27
- logs.append("🧹 Cleaned previous output and zip files.\n")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
28
 
29
- # 构造命令
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
30
  cmd = [
31
  "python", "pipeline.py",
32
- "--model_name_t", model_name_t,
33
- "--model_name_v", model_name_v,
34
- "--result_dir", result_dir,
35
- "--paper_latex_root", paper_latex_root,
36
- "--arxiv_url", arxiv_url,
 
 
37
  ]
38
 
39
- # 临时设置 API keys(供 pipeline 内部使用)
40
- os.environ["OPENAI_API_KEY"] = openai_key or ""
41
- os.environ["GEMINI_API_KEY"] = gemini_key or ""
 
42
 
43
- logs.append(f"🧠 Running command: {' '.join(cmd)}\n")
44
 
45
  try:
46
- # 同时捕获 stdout + stderr
47
  result = subprocess.run(
48
  cmd, capture_output=True, text=True, timeout=1800
49
  )
@@ -55,21 +105,20 @@ def run_pipeline(model_name_t, model_name_v, result_dir, paper_latex_root, arxiv
55
  msg = "❌ Pipeline timed out (30 min limit)."
56
  logs.append(msg)
57
  _write_logs(logs)
58
- return msg, None
59
  except Exception as e:
60
  msg = f"❌ Pipeline error: {e}"
61
  logs.append(msg)
62
  _write_logs(logs)
63
- return msg, None
64
 
65
- # 检查输出目录
66
  if not any(OUTPUT_DIR.iterdir()):
67
  msg = "❌ No output generated. Please check logs below."
68
  logs.append(msg)
69
  _write_logs(logs)
70
  return "\n".join(logs), None
71
 
72
- # 压缩 output 文件夹
73
  with zipfile.ZipFile(ZIP_PATH, 'w', zipfile.ZIP_DEFLATED) as zipf:
74
  for root, dirs, files in os.walk(OUTPUT_DIR):
75
  for file in files:
@@ -81,14 +130,11 @@ def run_pipeline(model_name_t, model_name_v, result_dir, paper_latex_root, arxiv
81
  end_time = datetime.datetime.now()
82
  logs.append(f"🏁 Completed at {end_time.strftime('%Y-%m-%d %H:%M:%S')} (Duration: {(end_time - start_time).seconds}s)\n")
83
 
84
- # 保存日志到文件
85
  _write_logs(logs)
86
-
87
  return "\n".join(logs), ZIP_PATH
88
 
89
 
90
  def _write_logs(logs):
91
- """将日志写入文件,便于 HF Logs 窗口调试"""
92
  with open(LOG_PATH, "w", encoding="utf-8") as f:
93
  f.write("\n".join(logs))
94
 
@@ -97,20 +143,20 @@ def _write_logs(logs):
97
  iface = gr.Interface(
98
  fn=run_pipeline,
99
  inputs=[
100
- gr.Textbox(label="Model Name (Text)", value="gpt-4.1"),
101
- gr.Textbox(label="Model Name (Vision)", value="gpt-4.1"),
102
- gr.Textbox(label="Result Dir", value="output"),
103
- gr.Textbox(label="Paper LaTeX Root", value="input/latex_proj"),
104
- gr.Textbox(label="ArXiv URL", value="https://arxiv.org/abs/2505.21497"),
105
- gr.Textbox(label="OpenAI API Key", placeholder="sk-...", type="password"),
106
- gr.Textbox(label="Gemini API Key", placeholder="AIza...", type="password"),
107
  ],
108
  outputs=[
109
- gr.Textbox(label="Logs", lines=30, max_lines=50),
110
- gr.File(label="Download Output (.zip)")
111
  ],
112
  title="📄 PaperShow Pipeline",
113
- description="输入 arXiv 链接和参数,自动生成 slides + poster,结果打包下载。",
 
 
 
114
  allow_flagging="never",
115
  )
116
 
 
1
+ import gradio as gr
2
  import subprocess, shutil, os, zipfile, datetime
3
  from pathlib import Path
4
 
5
  ROOT = Path(__file__).resolve().parent
6
  OUTPUT_DIR = ROOT / "output"
7
+ INPUT_DIR = ROOT / "input"
8
+ LOGO_DIR = INPUT_DIR / "logo"
9
+ POSTER_LATEX_DIR = ROOT / "posterbuilder" / "latex_proj"
10
  ZIP_PATH = ROOT / "output.zip"
11
  LOG_PATH = ROOT / "last_run.log"
12
 
13
+ def run_pipeline(arxiv_url, pdf_file, openai_key, logo_files):
14
  start_time = datetime.datetime.now()
15
  logs = [f"🚀 Starting pipeline at {start_time.strftime('%Y-%m-%d %H:%M:%S')}\n"]
16
 
17
+ # ====== Prepare directories ======
18
+ for d in [OUTPUT_DIR, LOGO_DIR, POSTER_LATEX_DIR, INPUT_DIR]:
19
+ d.mkdir(parents=True, exist_ok=True)
 
20
 
21
+ # Clean up previous output
22
  for item in OUTPUT_DIR.iterdir():
23
  if item.is_dir():
24
  shutil.rmtree(item)
 
26
  item.unlink()
27
  if ZIP_PATH.exists():
28
  ZIP_PATH.unlink()
29
+ logs.append("🧹 Cleaned previous output.\n")
30
+
31
+ # ====== Validation: institution logo upload is required ======
32
+ # Gradio may return a single file object or a list; normalize to a list here
33
+ if logo_files is None:
34
+ logo_files = []
35
+ if not isinstance(logo_files, (list, tuple)):
36
+ logo_files = [logo_files]
37
+ logo_files = [f for f in logo_files if f] # filter out None entries
38
+
39
+ if len(logo_files) == 0:
40
+ msg = "❌ You must upload the authors' institution logo(s) (multiple files allowed)."
41
+ logs.append(msg)
42
+ _write_logs(logs)
43
+ return "\n".join(logs), None
44
 
45
+ # Clear input/logo before saving the new files
46
+ for item in LOGO_DIR.iterdir():
47
+ if item.is_file():
48
+ item.unlink()
49
+ saved_logo_paths = []
50
+ for lf in logo_files:
51
+ p = LOGO_DIR / Path(lf.name).name
52
+ shutil.copy(lf.name, p)
53
+ saved_logo_paths.append(p)
54
+ logs.append(f"🏷️ Saved {len(saved_logo_paths)} logo file(s) to: {LOGO_DIR}\n")
55
+
56
+ # ====== Handle uploaded PDF (optional) ======
57
+ pdf_path = None
58
+ if pdf_file:
59
+ pdf_dir = INPUT_DIR / "pdf"
60
+ pdf_dir.mkdir(parents=True, exist_ok=True)
61
+ pdf_path = pdf_dir / Path(pdf_file.name).name
62
+ shutil.copy(pdf_file.name, pdf_path)
63
+ logs.append(f"📄 Uploaded PDF saved to: {pdf_path}\n")
64
+
65
+ # For compatibility with pipeline Step 1.5: also copy to input/paper.pdf
66
+ canonical_pdf = INPUT_DIR / "paper.pdf"
67
+ shutil.copy(pdf_file.name, canonical_pdf)
68
+ logs.append(f"🔁 Also copied PDF to: {canonical_pdf}\n")
69
+
70
+ # ====== Validate the input source ======
71
+ if not arxiv_url and not pdf_file:
72
+ msg = "❌ Please provide an arXiv URL or upload a PDF file (choose one)."
73
+ logs.append(msg)
74
+ _write_logs(logs)
75
+ return "\n".join(logs), None
76
+
77
+ # ====== Build the command ======
78
  cmd = [
79
  "python", "pipeline.py",
80
+ "--model_name_t", "gpt-5",
81
+ "--model_name_v", "gpt-5",
82
+ "--result_dir", "output",
83
+ "--paper_latex_root", "input/latex_proj",
84
+ "--openai_key", openai_key,
85
+ "--gemini_key", "AIzaSyA1wVVdlYAVs3FULSmCVD1Noulwrq7zqeo",
86
+ "--logo_dir", str(LOGO_DIR) # 👈 NEW: pass the logo directory to the pipeline
87
  ]
88
 
89
+ if arxiv_url:
90
+ cmd += ["--arxiv_url", arxiv_url]
91
+ if pdf_path:
92
+ cmd += ["--pdf_path", str(pdf_path)]
93
 
94
+ logs.append(f"🧠 Running command:\n{' '.join(cmd)}\n")
95
 
96
  try:
 
97
  result = subprocess.run(
98
  cmd, capture_output=True, text=True, timeout=1800
99
  )
 
105
  msg = "❌ Pipeline timed out (30 min limit)."
106
  logs.append(msg)
107
  _write_logs(logs)
108
+ return "\n".join(logs), None
109
  except Exception as e:
110
  msg = f"❌ Pipeline error: {e}"
111
  logs.append(msg)
112
  _write_logs(logs)
113
+ return "\n".join(logs), None
114
 
115
+ # ====== Check output & package ======
116
  if not any(OUTPUT_DIR.iterdir()):
117
  msg = "❌ No output generated. Please check logs below."
118
  logs.append(msg)
119
  _write_logs(logs)
120
  return "\n".join(logs), None
121
 
 
122
  with zipfile.ZipFile(ZIP_PATH, 'w', zipfile.ZIP_DEFLATED) as zipf:
123
  for root, dirs, files in os.walk(OUTPUT_DIR):
124
  for file in files:
 
130
  end_time = datetime.datetime.now()
131
  logs.append(f"🏁 Completed at {end_time.strftime('%Y-%m-%d %H:%M:%S')} (Duration: {(end_time - start_time).seconds}s)\n")
132
 
 
133
  _write_logs(logs)
 
134
  return "\n".join(logs), ZIP_PATH
135
 
136
 
137
  def _write_logs(logs):
 
138
  with open(LOG_PATH, "w", encoding="utf-8") as f:
139
  f.write("\n".join(logs))
140
 
 
143
  iface = gr.Interface(
144
  fn=run_pipeline,
145
  inputs=[
146
+ gr.Textbox(label="📘 ArXiv URL (choose one)", placeholder="https://arxiv.org/abs/2505.xxxxx"),
147
+ gr.File(label="📄 Upload PDF (choose one)"),
148
+ gr.Textbox(label="🔑 OpenAI API Key", placeholder="sk-...", type="password"),
149
+ gr.File(label="🏷️ Upload the authors' institution logo(s) (required, multiple files allowed)", file_count="multiple", file_types=["image"]),
 
 
 
150
  ],
151
  outputs=[
152
+ gr.Textbox(label="🧾 Logs", lines=30, max_lines=50),
153
+ gr.File(label="📦 Download output (.zip)")
154
  ],
155
  title="📄 PaperShow Pipeline",
156
+ description=(
157
+ "Uploading the institution logo(s) is required (multiple files allowed).\n"
158
+ "Provide an arXiv URL or upload a PDF (choose one); the system will generate the poster and package it for download."
159
+ ),
160
  allow_flagging="never",
161
  )
162
 
pipeline.py CHANGED
@@ -6,6 +6,7 @@ import subprocess
6
  from os import path
7
  from pdf2image import convert_from_path
8
  from pathlib import Path
 
9
 
10
  print("Initializing...")
11
 
@@ -54,8 +55,8 @@ def run_paper2poster_content_build():
54
  cmd = [
55
  sys.executable, "-m", "PosterAgent.new_pipeline",
56
  f'--poster_path={dst_pdf.relative_to(P2P_ROOT)}',
57
- '--model_name_t=4o',
58
- '--model_name_v=4o',
59
  '--poster_width_inches=48',
60
  '--poster_height_inches=36'
61
  ]
@@ -87,11 +88,104 @@ def run_paper2poster_content_build():
87
  print(" 📦 JSON copied & renamed.")
88
  print(" ✅ Step 1.5 done.\n")
89
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
90
  if __name__ == '__main__':
91
  parser = argparse.ArgumentParser(description='Paper2Video Generation Pipeline')
92
- parser.add_argument('--result_dir', type=str, default='./result/zeyu')
93
- parser.add_argument('--model_name_t', type=str, default='gpt-4.1')
94
- parser.add_argument('--model_name_v', type=str, default='gpt-4.1')
95
  parser.add_argument('--paper_latex_root', type=str, default=str(P2V_ASSETS))
96
  parser.add_argument('--ref_text', type=str, default=None)
97
  parser.add_argument('--if_tree_search', type=bool, default=True)
@@ -100,7 +194,7 @@ if __name__ == '__main__':
100
  parser.add_argument('--arxiv_url', type=str, default=None)
101
  parser.add_argument('--openai_key', type=str, required=True, help='Your OpenAI API key')
102
  parser.add_argument('--gemini_key', type=str, required=True, help='Your Gemini API key')
103
-
104
  args = parser.parse_args()
105
  print("start")
106
 
@@ -166,44 +260,44 @@ if __name__ == '__main__':
166
  # =========================
167
  # Step 1: Slide Generation
168
  # =========================
169
- try:
170
- print("🧩 Step 1: Generating Slides ...")
171
- slide_latex_path = path.join(args.paper_latex_root, "slides.tex")
172
- slide_image_dir = path.join(args.result_dir, 'slide_imgs')
173
- os.makedirs(slide_image_dir, exist_ok=True)
174
-
175
- start_time = time.time()
176
- prompt_path = "./Paper2Video/src/prompts/slide_beamer_prompt.txt"
177
-
178
- if args.if_tree_search:
179
- usage_slide, beamer_path = latex_code_gen(
180
- prompt_path=prompt_path,
181
- tex_dir=args.paper_latex_root,
182
- beamer_save_path=slide_latex_path,
183
- model_config_ll=get_agent_config(args.model_name_t),
184
- model_config_vl=get_agent_config(args.model_name_v),
185
- beamer_temp_name=args.beamer_templete_prompt
186
- )
187
- else:
188
- paper_latex_path = path.join(args.paper_latex_root, "main.tex")
189
- usage_slide = latex_code_gen(
190
- prompt_path=prompt_path,
191
- tex_dir=args.paper_latex_root,
192
- tex_path=paper_latex_path,
193
- beamer_save_path=slide_latex_path,
194
- model_config=get_agent_config(args.model_name_t)
195
- )
196
- beamer_path = slide_latex_path
197
-
198
- if not os.path.exists(beamer_path):
199
- raise FileNotFoundError(f"❌ Beamer PDF not found: {beamer_path}")
200
-
201
- slide_imgs = convert_from_path(beamer_path, dpi=400)
202
- for i, img in enumerate(slide_imgs):
203
- img.save(path.join(slide_image_dir, f"{i+1}.png"))
204
- print("✅ Step 1 done.")
205
- except Exception as e:
206
- print(f"❌ Step 1 failed: {e}")
207
 
208
  # =========================
209
  # Step 1.5: Poster2Poster content generation
@@ -224,13 +318,47 @@ if __name__ == '__main__':
224
  print(f"❌ Step 2 failed: {e}")
225
 
226
  # =========================
227
- # Step 3: 导出 latex_proj
228
  # =========================
229
  try:
230
  src_lp = PB_ROOT / "latex_proj"
231
  dst_lp = ROOT_DIR / "output" / "poster_latex_proj"
232
  copytree_overwrite(src_lp, dst_lp)
233
  print(f"📦 Exported LaTeX project → {dst_lp.relative_to(ROOT_DIR)}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
234
  except Exception as e:
235
  print(f"❌ Step 3 failed: {e}")
236
 
 
6
  from os import path
7
  from pdf2image import convert_from_path
8
  from pathlib import Path
9
+ from PIL import Image
10
 
11
  print("Initializing...")
12
 
 
55
  cmd = [
56
  sys.executable, "-m", "PosterAgent.new_pipeline",
57
  f'--poster_path={dst_pdf.relative_to(P2P_ROOT)}',
58
+ '--model_name_t=gpt-5',
59
+ '--model_name_v=gpt-5',
60
  '--poster_width_inches=48',
61
  '--poster_height_inches=36'
62
  ]
 
88
  print(" 📦 JSON copied & renamed.")
89
  print(" ✅ Step 1.5 done.\n")
90
 
91
+ def _list_logo_files(logo_dir: Path):
92
+ exts = {".png", ".jpg", ".jpeg", ".webp", ".bmp", ".tif", ".tiff"}
93
+ files = []
94
+ if logo_dir.exists():
95
+ for p in sorted(logo_dir.iterdir()):
96
+ if p.suffix.lower() in exts and p.is_file():
97
+ files.append(p)
98
+ return files
99
+
100
+ def _compose_logos_horizontally(logo_paths, out_path: Path, box_w=2000, box_h=476, gap=16):
101
+ """
102
+ Width is a hard constraint: the output image width is always exactly box_w (default 2000px).
103
+ Multiple logos are scaled by one uniform factor and concatenated so they exactly fill box_w (gaps included).
104
+ The height follows from the logos' aspect ratios; it may be smaller or larger than box_h (even > 2*box_h) and is never compressed a second time.
105
+ Transparent background, PNG output.
106
+ """
107
+ # Load the images
108
+ imgs = []
109
+ for p in logo_paths:
110
+ p = Path(p)
111
+ if p.exists() and p.is_file():
112
+ imgs.append(Image.open(p).convert("RGBA"))
113
+ n = len(imgs)
114
+ if n == 0:
115
+ raise RuntimeError("No logo images found.")
116
+
117
+ # Original total width (excluding gaps); composed width = sum(w_i) + gap*(n-1)
118
+ widths = [im.width for im in imgs]
119
+ heights = [im.height for im in imgs]
120
+ sum_w = sum(widths)
121
+ if sum_w <= 0:
122
+ raise RuntimeError("All logo images have zero width.")
123
+
124
+ # Compute a uniform scale factor s such that: sum(w_i * s) + gap*(n-1) == box_w
125
+ # => s = (box_w - gap*(n-1)) / sum_w
126
+ total_gap = max(0, gap * (n - 1))
127
+ if box_w <= total_gap:
128
+ raise ValueError(f"box_w({box_w}) too small vs total gaps({total_gap}). Increase box_w or reduce gap.")
129
+ s = (box_w - total_gap) / float(sum_w)
130
+
131
+ # Scale uniformly (round to integer pixels to avoid accumulating error)
132
+ resized = []
133
+ scaled_widths = []
134
+ scaled_heights = []
135
+ for im, w, h in zip(imgs, widths, heights):
136
+ nw = max(1, int(round(w * s)))
137
+ nh = max(1, int(round(h * s)))
138
+ resized.append(im.resize((nw, nh), Image.LANCZOS))
139
+ scaled_widths.append(nw)
140
+ scaled_heights.append(nh)
141
+
142
+ # Integer rounding may leave the total width != box_w - total_gap; nudge a few images by 1px to align exactly
143
+ current_sum_w = sum(scaled_widths)
144
+ diff = (box_w - total_gap) - current_sum_w
145
+ # Distribute the pixel error evenly, starting from the widest images
146
+ if diff != 0:
147
+ order = sorted(range(n), key=lambda i: scaled_widths[i], reverse=(diff > 0))
148
+ idx = 0
149
+ step = 1 if diff > 0 else -1
150
+ remaining = abs(diff)
151
+ while remaining > 0 and n > 0:
152
+ i = order[idx % n]
153
+ new_w = scaled_widths[i] + step
154
+ if new_w >= 1:
155
+ scaled_widths[i] = new_w
156
+ resized[i] = resized[i].resize((new_w, resized[i].height), Image.LANCZOS)
157
+ remaining -= 1
158
+ idx += 1
159
+
160
+ # Compute the final canvas size
161
+ total_w = sum(scaled_widths) + total_gap
162
+ assert total_w == box_w, f"width pack mismatch: got {total_w}, expect {box_w}"
163
+ canvas_w = box_w
164
+ canvas_h = max(im.height for im in resized)  # height follows from the scale (may exceed 2*box_h)
165
+
166
+ # Create the canvas and place the logos left to right, vertically centered
167
+ canvas = Image.new("RGBA", (canvas_w, canvas_h), (0, 0, 0, 0))
168
+ cur_x = 0
169
+ for idx, im in enumerate(resized):
170
+ y = (canvas_h - im.height) // 2
171
+ canvas.alpha_composite(im, (cur_x, y))
172
+ cur_x += im.width
173
+ if idx != n - 1:
174
+ cur_x += gap
175
+
176
+ out_path.parent.mkdir(parents=True, exist_ok=True)
177
+ canvas.save(out_path, format="PNG")
178
+ print(f" 🧩 Logos composed (width-locked) → {out_path.relative_to(ROOT_DIR)} "
179
+ f"(n={n}, final_size={canvas_w}x{canvas_h})")
180
+
181
+
182
+
183
+
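For reference, a minimal self-contained sketch of the width-locking rule implemented by _compose_logos_horizontally above; the widths are hypothetical and the per-image 1px nudge loop is omitted:

    # Illustrative only: a shared scale factor packs the logos to box_w (gaps included).
    widths = [800, 1200, 500]                      # hypothetical source logo widths in px
    gap, box_w = 16, 2000
    total_gap = gap * (len(widths) - 1)
    s = (box_w - total_gap) / sum(widths)          # shared scale factor
    scaled = [round(w * s) for w in widths]
    # Before the 1px correction, the packed width is off by at most len(widths) px.
    assert abs(sum(scaled) + total_gap - box_w) <= len(widths)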
184
  if __name__ == '__main__':
185
  parser = argparse.ArgumentParser(description='Paper2Video Generation Pipeline')
186
+ parser.add_argument('--result_dir', type=str, default='output')
187
+ parser.add_argument('--model_name_t', type=str, default='gpt-5')
188
+ parser.add_argument('--model_name_v', type=str, default='gpt-5')
189
  parser.add_argument('--paper_latex_root', type=str, default=str(P2V_ASSETS))
190
  parser.add_argument('--ref_text', type=str, default=None)
191
  parser.add_argument('--if_tree_search', type=bool, default=True)
 
194
  parser.add_argument('--arxiv_url', type=str, default=None)
195
  parser.add_argument('--openai_key', type=str, required=True, help='Your OpenAI API key')
196
  parser.add_argument('--gemini_key', type=str, required=True, help='Your Gemini API key')
197
+ parser.add_argument('--logo_dir', type=str, required=True, help='Directory containing uploaded logo image(s)')
198
  args = parser.parse_args()
199
  print("start")
200
 
 
260
  # =========================
261
  # Step 1: Slide Generation
262
  # =========================
263
+ # try:
264
+ # print("🧩 Step 1: Generating Slides ...")
265
+ # slide_latex_path = path.join(args.paper_latex_root, "slides.tex")
266
+ # slide_image_dir = path.join(args.result_dir, 'slide_imgs')
267
+ # os.makedirs(slide_image_dir, exist_ok=True)
268
+
269
+ # start_time = time.time()
270
+ # prompt_path = "./Paper2Video/src/prompts/slide_beamer_prompt.txt"
271
+
272
+ # if args.if_tree_search:
273
+ # usage_slide, beamer_path = latex_code_gen(
274
+ # prompt_path=prompt_path,
275
+ # tex_dir=args.paper_latex_root,
276
+ # beamer_save_path=slide_latex_path,
277
+ # model_config_ll=get_agent_config(args.model_name_t),
278
+ # model_config_vl=get_agent_config(args.model_name_v),
279
+ # beamer_temp_name=args.beamer_templete_prompt
280
+ # )
281
+ # else:
282
+ # paper_latex_path = path.join(args.paper_latex_root, "main.tex")
283
+ # usage_slide = latex_code_gen(
284
+ # prompt_path=prompt_path,
285
+ # tex_dir=args.paper_latex_root,
286
+ # tex_path=paper_latex_path,
287
+ # beamer_save_path=slide_latex_path,
288
+ # model_config=get_agent_config(args.model_name_t)
289
+ # )
290
+ # beamer_path = slide_latex_path
291
+
292
+ # if not os.path.exists(beamer_path):
293
+ # raise FileNotFoundError(f"❌ Beamer PDF not found: {beamer_path}")
294
+
295
+ # slide_imgs = convert_from_path(beamer_path, dpi=400)
296
+ # for i, img in enumerate(slide_imgs):
297
+ # img.save(path.join(slide_image_dir, f"{i+1}.png"))
298
+ # print("✅ Step 1 done.")
299
+ # except Exception as e:
300
+ # print(f"❌ Step 1 failed: {e}")
301
 
302
  # =========================
303
  # Step 1.5: Poster2Poster content generation
 
318
  print(f"❌ Step 2 failed: {e}")
319
 
320
  # =========================
321
+ # Step 3: Export latex_proj, handle logos & apply template
322
  # =========================
323
  try:
324
  src_lp = PB_ROOT / "latex_proj"
325
  dst_lp = ROOT_DIR / "output" / "poster_latex_proj"
326
  copytree_overwrite(src_lp, dst_lp)
327
  print(f"📦 Exported LaTeX project → {dst_lp.relative_to(ROOT_DIR)}")
328
+
329
+ logo_dir = Path(args.logo_dir)
330
+ logo_files = _list_logo_files(logo_dir)
331
+ if len(logo_files) == 0:
332
+ raise RuntimeError("❌ No logo files found in --logo_dir (must upload at least one).")
333
+
334
+ logos_out_dir = dst_lp / "logos"
335
+ logos_out_dir.mkdir(parents=True, exist_ok=True)
336
+ left_logo_path = logos_out_dir / "left_logo.png"
337
+
338
+ if len(logo_files) == 1:
339
+ # Single logo: copy it and convert to PNG (for consistency)
340
+ im = Image.open(logo_files[0]).convert("RGBA")
341
+ im.save(left_logo_path, format="PNG")
342
+ print(f"🖼️ Single logo saved → {left_logo_path.relative_to(ROOT_DIR)}")
343
+ else:
344
+ # Multiple logos: compose them horizontally
345
+ _compose_logos_horizontally(logo_files, left_logo_path, box_w=2000, box_h=476, gap=16)
346
+
347
+ template_dir = ROOT_DIR / "template"
348
+ if template_dir.exists():
349
+ for item in template_dir.iterdir():
350
+ dst_path = dst_lp / item.name
351
+ if item.is_dir():
352
+ if dst_path.exists():
353
+ shutil.rmtree(dst_path)
354
+ shutil.copytree(item, dst_path)
355
+ else:
356
+ shutil.copy2(item, dst_path)
357
+ print(f"📂 Copied all template files → {dst_lp.relative_to(ROOT_DIR)}")
358
+ else:
359
+ print("⚠️ template directory not found, skipping Step 3.5.")
360
+
361
+ print("✅ Step 3 done.")
362
  except Exception as e:
363
  print(f"❌ Step 3 failed: {e}")
364
 
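Step 3 above relies on a project helper named copytree_overwrite that is not shown in this diff. A plausible minimal equivalent, stated as an assumption rather than the repository's actual implementation:

    import shutil
    from pathlib import Path

    def copytree_overwrite(src: Path, dst: Path) -> None:
        # Assumed behavior: replace dst entirely with a fresh copy of src.
        if dst.exists():
            shutil.rmtree(dst)
        shutil.copytree(src, dst)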
posterbuilder/arrangement.json CHANGED
@@ -1,542 +1,115 @@
1
  {
2
- "poster_width": 1200,
3
- "poster_height": 900,
4
- "poster_width_inches": 48.0,
5
- "poster_height_inches": 36.0,
6
  "panels": [
7
  {
8
  "panel_id": 0,
9
- "section_name": "Poster Title & Author",
10
- "tp": 0.12971887550200803,
11
- "text_len": 323,
12
- "gp": 0,
13
- "figure_size": 0,
14
- "figure_aspect": 1,
15
- "sp": 0.06301447323913961,
16
- "rp": 2.505748069071783
17
  },
18
  {
19
  "panel_id": 1,
20
- "section_name": "Introduction",
21
- "tp": 0.1859437751004016,
22
- "text_len": 463,
23
- "gp": 0,
24
- "figure_size": 0,
25
- "figure_aspect": 1,
26
- "sp": 0.08063905395956796,
27
- "rp": 2.359873888191933
28
  },
29
  {
30
  "panel_id": 2,
31
- "section_name": "Benchmark & Metrics",
32
- "tp": 0.15903614457831325,
33
- "text_len": 396,
34
- "gp": 0.016682202105281593,
35
- "figure_size": 64769,
36
- "figure_aspect": 0.8819188191881919,
37
- "sp": 0.07756528917306713,
38
- "rp": 2.386019900332315
39
  },
40
  {
41
  "panel_id": 3,
42
- "section_name": "PosterAgent Framework",
43
- "tp": 0.1859437751004016,
44
- "text_len": 463,
45
- "gp": 0.49217196764679444,
46
  "figure_size": 1910868,
47
- "figure_aspect": 2.0350877192982457,
48
- "sp": 0.23879941088764153,
49
- "rp": 1.0716273641356449
50
  },
51
  {
52
  "panel_id": 4,
53
- "section_name": "Evaluation & Results",
54
- "tp": 0.1859437751004016,
55
- "text_len": 463,
56
- "gp": 0.49114583024792396,
57
- "figure_size": 1906884,
58
- "figure_aspect": 2.0434782608695654,
59
- "sp": 0.23846965976825418,
60
- "rp": 1.0743132504197004
61
- },
62
- {
63
- "panel_id": 5,
64
- "section_name": "Conclusion",
65
- "tp": 0.1534136546184739,
66
- "text_len": 382,
67
  "gp": 0,
68
  "figure_size": 0,
69
- "figure_aspect": 1,
70
- "sp": 0.07044197511417727,
71
- "rp": 2.4442725214152747
72
- }
73
- ],
74
- "panel_arrangement": [
75
- {
76
- "panel_name": "Poster Title & Author",
77
- "panel_id": 0,
78
- "x": 0,
79
- "y": 0,
80
- "width": 1200,
81
- "height": 90.0
82
- },
83
- {
84
- "panel_name": "Introduction",
85
- "panel_id": 1,
86
- "x": 0,
87
- "y": 90.0,
88
- "width": 550.621296168701,
89
- "height": 201.65362571734266
90
- },
91
- {
92
- "panel_name": "Benchmark & Metrics",
93
- "panel_id": 2,
94
- "x": 550.621296168701,
95
- "y": 90.0,
96
- "width": 529.6329503516805,
97
- "height": 201.65362571734266
98
- },
99
- {
100
- "panel_name": "PosterAgent Framework",
101
- "panel_id": 3,
102
- "x": 0,
103
- "y": 291.65362571734266,
104
- "width": 540.5003037876063,
105
- "height": 608.3463742826573
106
- },
107
- {
108
- "panel_name": "Evaluation & Results",
109
- "panel_id": 4,
110
- "x": 540.5003037876063,
111
- "y": 291.65362571734266,
112
- "width": 539.7539427327752,
113
- "height": 608.3463742826573
114
- },
115
- {
116
- "panel_name": "Conclusion",
117
- "panel_id": 5,
118
- "x": 1080.2542465203815,
119
- "y": 90.0,
120
- "width": 119.74575347961854,
121
- "height": 810.0
122
- }
123
- ],
124
- "figure_arrangement": [
125
- {
126
- "panel_id": 2,
127
- "x": 763.672586975783,
128
- "y": 132.13072514346854,
129
- "width": 103.53036873751637,
130
- "height": 117.39217543040559,
131
- "figure_id": 0,
132
- "figure_name": "p<Benchmark & Metrics>_f0",
133
- "figure_path": "<4o_4o>_images_and_tables/paper/paper-picture-1.png"
134
- },
135
- {
136
- "panel_id": 3,
137
- "x": 56.45003037876063,
138
- "y": 490.76985659696936,
139
- "width": 427.60024303008504,
140
- "height": 210.11391252340385,
141
- "figure_id": 0,
142
- "figure_name": "p<PosterAgent Framework>_f0",
143
- "figure_path": "<4o_4o>_images_and_tables/paper/paper-picture-8.png"
144
- },
145
- {
146
- "panel_id": 4,
147
- "x": 596.8756980608838,
148
- "y": 491.34731768544725,
149
- "width": 427.0031541862202,
150
- "height": 208.95899034644816,
151
- "figure_id": 0,
152
- "figure_name": "p<Evaluation & Results>_f0",
153
- "figure_path": "<4o_4o>_images_and_tables/paper/paper-table-1.png"
154
- }
155
- ],
156
- "text_arrangement": [
157
- {
158
- "panel_id": 0,
159
- "x": 3.0,
160
- "y": 3.0,
161
- "width": 1194.0,
162
- "height": 37.333333333333336,
163
- "textbox_id": 0,
164
- "textbox_name": "p<Poster Title & Author>_t0",
165
- "num_chars": 410
166
- },
167
- {
168
- "panel_id": 0,
169
- "x": 3.0,
170
- "y": 40.333333333333336,
171
- "width": 1194.0,
172
- "height": 46.666666666666664,
173
- "textbox_id": 0,
174
- "textbox_name": "p<Poster Title & Author>_t1",
175
- "num_chars": 410
176
- },
177
- {
178
- "panel_id": 1,
179
- "x": 3.0,
180
- "y": 93.0,
181
- "width": 544.621296168701,
182
- "height": 32.0,
183
- "textbox_id": 0,
184
- "textbox_name": "p<Introduction>_t0",
185
- "num_chars": 180
186
- },
187
- {
188
- "panel_id": 1,
189
- "x": 3.0,
190
- "y": 125.0,
191
- "width": 544.621296168701,
192
- "height": 163.65362571734266,
193
- "textbox_id": 1,
194
- "textbox_name": "p<Introduction>_t1",
195
- "num_chars": 540
196
- },
197
- {
198
- "panel_id": 2,
199
- "x": 553.621296168701,
200
- "y": 93.0,
201
- "width": 523.6329503516805,
202
- "height": 32.0,
203
- "textbox_id": 0,
204
- "textbox_name": "p<Benchmark & Metrics>_t0",
205
- "num_chars": 180
206
- },
207
- {
208
- "panel_id": 2,
209
- "x": 553.621296168701,
210
- "y": 125.0,
211
- "width": 523.6329503516805,
212
- "height": 7.130725143468538,
213
- "textbox_id": 1,
214
- "textbox_name": "p<Benchmark & Metrics>_t1",
215
- "num_chars": 180
216
- },
217
- {
218
- "panel_id": 2,
219
- "x": 553.621296168701,
220
- "y": 249.52290057387413,
221
- "width": 523.6329503516805,
222
- "height": 39.13072514346854,
223
- "textbox_id": 2,
224
- "textbox_name": "p<Benchmark & Metrics>_t2",
225
- "num_chars": 180
226
- },
227
- {
228
- "panel_id": 3,
229
- "x": 3.0,
230
- "y": 294.65362571734266,
231
- "width": 534.5003037876063,
232
- "height": 32.0,
233
- "textbox_id": 0,
234
- "textbox_name": "p<PosterAgent Framework>_t0",
235
- "num_chars": 180
236
- },
237
- {
238
- "panel_id": 3,
239
- "x": 3.0,
240
- "y": 326.65362571734266,
241
- "width": 534.5003037876063,
242
- "height": 164.1162308796267,
243
- "textbox_id": 1,
244
- "textbox_name": "p<PosterAgent Framework>_t1",
245
- "num_chars": 540
246
- },
247
- {
248
- "panel_id": 3,
249
- "x": 3.0,
250
- "y": 700.8837691203732,
251
- "width": 534.5003037876063,
252
- "height": 196.11623087962676,
253
- "textbox_id": 2,
254
- "textbox_name": "p<PosterAgent Framework>_t2",
255
- "num_chars": 540
256
- },
257
- {
258
- "panel_id": 4,
259
- "x": 543.5003037876063,
260
- "y": 294.65362571734266,
261
- "width": 533.7539427327752,
262
- "height": 32.0,
263
- "textbox_id": 0,
264
- "textbox_name": "p<Evaluation & Results>_t0",
265
- "num_chars": 180
266
- },
267
- {
268
- "panel_id": 4,
269
- "x": 543.5003037876063,
270
- "y": 326.65362571734266,
271
- "width": 533.7539427327752,
272
- "height": 164.6936919681046,
273
- "textbox_id": 1,
274
- "textbox_name": "p<Evaluation & Results>_t1",
275
- "num_chars": 540
276
- },
277
- {
278
- "panel_id": 4,
279
- "x": 543.5003037876063,
280
- "y": 700.3063080318955,
281
- "width": 533.7539427327752,
282
- "height": 196.69369196810453,
283
- "textbox_id": 2,
284
- "textbox_name": "p<Evaluation & Results>_t2",
285
- "num_chars": 540
286
  },
287
  {
288
  "panel_id": 5,
289
- "x": 1083.2542465203815,
290
- "y": 93.0,
291
- "width": 113.74575347961854,
292
- "height": 32.0,
293
- "textbox_id": 0,
294
- "textbox_name": "p<Conclusion>_t0",
295
- "num_chars": 30
296
  },
297
  {
298
- "panel_id": 5,
299
- "x": 1083.2542465203815,
300
- "y": 125.0,
301
- "width": 113.74575347961854,
302
- "height": 772.0,
303
- "textbox_id": 1,
304
- "textbox_name": "p<Conclusion>_t1",
305
- "num_chars": 420
306
  }
307
  ],
308
- "panel_arrangement_inches": [
309
  {
310
- "panel_name": "Poster Title & Author",
311
  "panel_id": 0,
312
- "x": 0.0,
313
- "y": 0.0,
314
- "width": 48.0,
315
- "height": 3.6
316
  },
317
  {
318
- "panel_name": "Introduction",
319
  "panel_id": 1,
320
- "x": 0.0,
321
- "y": 3.6,
322
- "width": 22.02485184674804,
323
- "height": 8.066145028693706
324
  },
325
  {
326
- "panel_name": "Benchmark & Metrics",
327
  "panel_id": 2,
328
- "x": 22.02485184674804,
329
- "y": 3.6,
330
- "width": 21.18531801406722,
331
- "height": 8.066145028693706
332
  },
333
  {
334
- "panel_name": "PosterAgent Framework",
335
  "panel_id": 3,
336
- "x": 0.0,
337
- "y": 11.666145028693707,
338
- "width": 21.620012151504252,
339
- "height": 24.33385497130629
340
- },
341
- {
342
- "panel_name": "Evaluation & Results",
343
- "panel_id": 4,
344
- "x": 21.620012151504252,
345
- "y": 11.666145028693707,
346
- "width": 21.590157709311008,
347
- "height": 24.33385497130629
348
  },
349
  {
350
- "panel_name": "Conclusion",
351
  "panel_id": 5,
352
- "x": 43.210169860815256,
353
- "y": 3.6,
354
- "width": 4.789830139184741,
355
- "height": 32.4
356
- }
357
- ],
358
- "figure_arrangement_inches": [
359
- {
360
- "panel_id": 2,
361
- "x": 30.54690347903132,
362
- "y": 5.285229005738741,
363
- "width": 4.141214749500655,
364
- "height": 4.6956870172162235,
365
- "figure_id": 0,
366
- "figure_name": "p<Benchmark & Metrics>_f0",
367
- "figure_path": "<4o_4o>_images_and_tables/paper/paper-picture-1.png"
368
- },
369
- {
370
- "panel_id": 3,
371
- "x": 2.258001215150425,
372
- "y": 19.630794263878773,
373
- "width": 17.1040097212034,
374
- "height": 8.404556500936154,
375
- "figure_id": 0,
376
- "figure_name": "p<PosterAgent Framework>_f0",
377
- "figure_path": "<4o_4o>_images_and_tables/paper/paper-picture-8.png"
378
- },
379
- {
380
- "panel_id": 4,
381
- "x": 23.87502792243535,
382
- "y": 19.65389270741789,
383
- "width": 17.080126167448807,
384
- "height": 8.358359613857926,
385
- "figure_id": 0,
386
- "figure_name": "p<Evaluation & Results>_f0",
387
- "figure_path": "<4o_4o>_images_and_tables/paper/paper-table-1.png"
388
- }
389
- ],
390
- "text_arrangement_inches": [
391
- {
392
- "panel_id": 0,
393
- "x": 0.12,
394
- "y": 0.12,
395
- "width": 47.76,
396
- "height": 1.4933333333333334,
397
- "textbox_id": 0,
398
- "textbox_name": "p<Poster Title & Author>_t0",
399
- "num_chars": 410
400
- },
401
- {
402
- "panel_id": 0,
403
- "x": 0.12,
404
- "y": 1.6133333333333335,
405
- "width": 47.76,
406
- "height": 1.8666666666666665,
407
- "textbox_id": 0,
408
- "textbox_name": "p<Poster Title & Author>_t1",
409
- "num_chars": 410
410
  },
411
  {
412
- "panel_id": 1,
413
- "x": 0.12,
414
- "y": 3.72,
415
- "width": 21.784851846748037,
416
- "height": 1.28,
417
- "textbox_id": 0,
418
- "textbox_name": "p<Introduction>_t0",
419
- "num_chars": 180
420
- },
421
- {
422
- "panel_id": 1,
423
- "x": 0.12,
424
- "y": 5.0,
425
- "width": 21.784851846748037,
426
- "height": 6.546145028693706,
427
- "textbox_id": 1,
428
- "textbox_name": "p<Introduction>_t1",
429
- "num_chars": 540
430
- },
431
- {
432
- "panel_id": 2,
433
- "x": 22.14485184674804,
434
- "y": 3.72,
435
- "width": 20.94531801406722,
436
- "height": 1.28,
437
- "textbox_id": 0,
438
- "textbox_name": "p<Benchmark & Metrics>_t0",
439
- "num_chars": 180
440
- },
441
- {
442
- "panel_id": 2,
443
- "x": 22.14485184674804,
444
- "y": 5.0,
445
- "width": 20.94531801406722,
446
- "height": 0.28522900573874155,
447
- "textbox_id": 1,
448
- "textbox_name": "p<Benchmark & Metrics>_t1",
449
- "num_chars": 180
450
  },
451
  {
452
- "panel_id": 2,
453
- "x": 22.14485184674804,
454
- "y": 9.980916022954965,
455
- "width": 20.94531801406722,
456
- "height": 1.5652290057387415,
457
- "textbox_id": 2,
458
- "textbox_name": "p<Benchmark & Metrics>_t2",
459
- "num_chars": 180
460
- },
461
- {
462
- "panel_id": 3,
463
- "x": 0.12,
464
- "y": 11.786145028693706,
465
- "width": 21.380012151504253,
466
- "height": 1.28,
467
- "textbox_id": 0,
468
- "textbox_name": "p<PosterAgent Framework>_t0",
469
- "num_chars": 180
470
- },
471
- {
472
- "panel_id": 3,
473
- "x": 0.12,
474
- "y": 13.066145028693706,
475
- "width": 21.380012151504253,
476
- "height": 6.564649235185068,
477
- "textbox_id": 1,
478
- "textbox_name": "p<PosterAgent Framework>_t1",
479
- "num_chars": 540
480
- },
481
- {
482
- "panel_id": 3,
483
- "x": 0.12,
484
- "y": 28.03535076481493,
485
- "width": 21.380012151504253,
486
- "height": 7.84464923518507,
487
- "textbox_id": 2,
488
- "textbox_name": "p<PosterAgent Framework>_t2",
489
- "num_chars": 540
490
- },
491
- {
492
- "panel_id": 4,
493
- "x": 21.740012151504253,
494
- "y": 11.786145028693706,
495
- "width": 21.350157709311006,
496
- "height": 1.28,
497
- "textbox_id": 0,
498
- "textbox_name": "p<Evaluation & Results>_t0",
499
- "num_chars": 180
500
- },
501
- {
502
- "panel_id": 4,
503
- "x": 21.740012151504253,
504
- "y": 13.066145028693706,
505
- "width": 21.350157709311006,
506
- "height": 6.587747678724184,
507
- "textbox_id": 1,
508
- "textbox_name": "p<Evaluation & Results>_t1",
509
- "num_chars": 540
510
- },
511
- {
512
- "panel_id": 4,
513
- "x": 21.740012151504253,
514
- "y": 28.01225232127582,
515
- "width": 21.350157709311006,
516
- "height": 7.867747678724181,
517
- "textbox_id": 2,
518
- "textbox_name": "p<Evaluation & Results>_t2",
519
- "num_chars": 540
520
- },
521
- {
522
- "panel_id": 5,
523
- "x": 43.33016986081526,
524
- "y": 3.72,
525
- "width": 4.549830139184742,
526
- "height": 1.28,
527
- "textbox_id": 0,
528
- "textbox_name": "p<Conclusion>_t0",
529
- "num_chars": 30
530
- },
531
- {
532
- "panel_id": 5,
533
- "x": 43.33016986081526,
534
- "y": 5.0,
535
- "width": 4.549830139184742,
536
- "height": 30.88,
537
- "textbox_id": 1,
538
- "textbox_name": "p<Conclusion>_t1",
539
- "num_chars": 420
540
  }
541
  ]
542
  }
 
1
  {
 
 
 
 
2
  "panels": [
3
  {
4
  "panel_id": 0,
5
+ "section_name": "Why Posters Are Hard",
6
+ "tp": 0.12082710513203787,
7
+ "text_len": 485,
8
+ "gp": 0.009888851380803912,
9
+ "figure_size": 64769,
10
+ "figure_aspect": 0.8819188191881919
 
 
11
  },
12
  {
13
  "panel_id": 1,
14
+ "section_name": "Benchmark and Data",
15
+ "tp": 0.12531141006477328,
16
+ "text_len": 503,
17
+ "gp": 0.04796373085236436,
18
+ "figure_size": 314148,
19
+ "figure_aspect": 1.0125673249551166
 
 
20
  },
21
  {
22
  "panel_id": 2,
23
+ "section_name": "PaperQuiz: What Matters",
24
+ "tp": 0.11285500747384156,
25
+ "text_len": 453,
26
+ "gp": 0.1192882298865948,
27
+ "figure_size": 781302,
28
+ "figure_aspect": 5.032994923857868
 
 
29
  },
30
  {
31
  "panel_id": 3,
32
+ "section_name": "PosterAgent Pipeline",
33
+ "tp": 0.10637767812655705,
34
+ "text_len": 427,
35
+ "gp": 0.29174897960959734,
36
  "figure_size": 1910868,
37
+ "figure_aspect": 2.0350877192982457
 
 
38
  },
39
  {
40
  "panel_id": 4,
41
+ "section_name": "Parser: Structured Assets",
42
+ "tp": 0.10612855007473841,
43
+ "text_len": 426,
44
  "gp": 0,
45
  "figure_size": 0,
46
+ "figure_aspect": 1
47
  },
48
  {
49
  "panel_id": 5,
50
+ "section_name": "Planner: Layout Mastery",
51
+ "tp": 0.10089686098654709,
52
+ "text_len": 405,
53
+ "gp": 0.08839429109643054,
54
+ "figure_size": 578956,
55
+ "figure_aspect": 1.3959627329192548
56
+ },
57
+ {
58
+ "panel_id": 6,
59
+ "section_name": "Painter\u2013Commenter Loop",
60
+ "tp": 0.10662680617837568,
61
+ "text_len": 428,
62
+ "gp": 0.15157520979208358,
63
+ "figure_size": 992772,
64
+ "figure_aspect": 1.4480676328502415
65
+ },
66
+ {
67
+ "panel_id": 7,
68
+ "section_name": "Results: Stronger, Leaner",
69
+ "tp": 0.10986547085201794,
70
+ "text_len": 441,
71
+ "gp": 0.2911407073821255,
72
+ "figure_size": 1906884,
73
+ "figure_aspect": 2.0434782608695654
74
  },
75
  {
76
+ "panel_id": 8,
77
+ "section_name": "Limits and Next Steps",
78
+ "tp": 0.1111111111111111,
79
+ "text_len": 446,
80
+ "gp": 0,
81
+ "figure_size": 0,
82
+ "figure_aspect": 1
 
83
  }
84
  ],
85
+ "figure_arrangement": [
86
  {
 
87
  "panel_id": 0,
88
+ "figure_path": "<gpt-5_gpt-5>_images_and_tables/paper/paper-picture-1.png"
 
 
 
89
  },
90
  {
 
91
  "panel_id": 1,
92
+ "figure_path": "<gpt-5_gpt-5>_images_and_tables/paper/paper-picture-6.png"
 
 
 
93
  },
94
  {
 
95
  "panel_id": 2,
96
+ "figure_path": "<gpt-5_gpt-5>_images_and_tables/paper/paper-picture-7.png"
 
 
 
97
  },
98
  {
 
99
  "panel_id": 3,
100
+ "figure_path": "<gpt-5_gpt-5>_images_and_tables/paper/paper-picture-8.png"
101
  },
102
  {
 
103
  "panel_id": 5,
104
+ "figure_path": "<gpt-5_gpt-5>_images_and_tables/paper/paper-picture-30.png"
105
  },
106
  {
107
+ "panel_id": 6,
108
+ "figure_path": "<gpt-5_gpt-5>_images_and_tables/paper/paper-picture-61.png"
109
  },
110
  {
111
+ "panel_id": 7,
112
+ "figure_path": "<gpt-5_gpt-5>_images_and_tables/paper/paper-table-1.png"
113
  }
114
  ]
115
  }
posterbuilder/build_poster.py CHANGED
@@ -3,7 +3,7 @@
3
  import json, re, pathlib, shutil, os
4
 
5
  # ===================== Auto-locate the project root =====================
6
- IMAGES_DIR_NAME = "<4o_4o>_images_and_tables"  # blue folder name
7
 
8
  def find_project_root(start: pathlib.Path) -> pathlib.Path:
9
  cur = start.resolve()
 
3
  import json, re, pathlib, shutil, os
4
 
5
  # ===================== Auto-locate the project root =====================
6
+ IMAGES_DIR_NAME = "<gpt-5_gpt-5>_images_and_tables"  # blue folder name
7
 
8
  def find_project_root(start: pathlib.Path) -> pathlib.Path:
9
  cur = start.resolve()
posterbuilder/cambridge_template.tex CHANGED
@@ -22,6 +22,13 @@
22
  \pgfplotsset{compat=1.14}
23
  \usepackage{anyfontsize}
24
 
25
  % ====================
26
  % Lengths
27
  % ====================
@@ -60,8 +67,8 @@
60
  % ====================
61
 
62
  % use this to include logos on the left and/or right side of the header:
63
- % \logoright{\includegraphics[height=7cm]{logo1.pdf}}
64
- % \logoleft{\includegraphics[height=7cm]{logo2.pdf}}
65
 
66
  % ====================
67
  % Body
@@ -75,7 +82,6 @@
75
  {
76
  \begin{tikzpicture}[remember picture,overlay]
77
  \node [anchor=north west, inner sep=3cm] at ([xshift=0.0cm,yshift=1.0cm]current page.north west)
78
- {\includegraphics[height=4.5cm]{logos/cambridge-reversed-color-logo.eps}};
79
  \end{tikzpicture}
80
  }
81
 
 
22
  \pgfplotsset{compat=1.14}
23
  \usepackage{anyfontsize}
24
 
25
+ \definecolor{nipspurple}{RGB}{94,46,145}
26
+ \setbeamercolor{headline}{bg=white, fg=black}
27
+ \setbeamercolor{block title}{bg=nipspurple, fg=white}
28
+ \addtobeamertemplate{block begin}{
29
+ \setlength{\textpaddingtop}{0.2em}%
30
+ \setlength{\textpaddingbottom}{0.2em}%
31
+ }{}
32
  % ====================
33
  % Lengths
34
  % ====================
 
67
  % ====================
68
 
69
  % use this to include logos on the left and/or right side of the header:
70
+ \logoright{\includegraphics[height=5cm]{logos/right_logo.png}}
71
+ \logoleft{\includegraphics[height=4cm]{logos/left_logo.png}}
72
 
73
  % ====================
74
  % Body
 
82
  {
83
  \begin{tikzpicture}[remember picture,overlay]
84
  \node [anchor=north west, inner sep=3cm] at ([xshift=0.0cm,yshift=1.0cm]current page.north west)
 
85
  \end{tikzpicture}
86
  }
87
 
posterbuilder/contents copy/arrangement.json DELETED
@@ -1,783 +0,0 @@
1
- {
2
- "poster_width": 1200,
3
- "poster_height": 900,
4
- "poster_width_inches": 48.0,
5
- "poster_height_inches": 36.0,
6
- "panels": [
7
- {
8
- "panel_id": 0,
9
- "section_name": "Poster Title & Author",
10
- "tp": 0.11634695579649708,
11
- "text_len": 279,
12
- "gp": 0,
13
- "figure_size": 0,
14
- "figure_aspect": 1,
15
- "sp": 0.05882283430478395,
16
- "rp": 2.5404412005449477
17
- },
18
- {
19
- "panel_id": 1,
20
- "section_name": "Abstract",
21
- "tp": 0.15804837364470392,
22
- "text_len": 379,
23
- "gp": 0,
24
- "figure_size": 0,
25
- "figure_aspect": 1,
26
- "sp": 0.07189480082267583,
27
- "rp": 2.4322478518046795
28
- },
29
- {
30
- "panel_id": 2,
31
- "section_name": "Preliminaries",
32
- "tp": 0.0963302752293578,
33
- "text_len": 231,
34
- "gp": 0.5791655366369068,
35
- "figure_size": 2697149,
36
- "figure_aspect": 1.393961179007908,
37
- "sp": 0.23866418891649963,
38
- "rp": 1.076424221135003
39
- },
40
- {
41
- "panel_id": 3,
42
- "section_name": "Experiments",
43
- "tp": 0.10758965804837364,
44
- "text_len": 258,
45
- "gp": 0,
46
- "figure_size": 0,
47
- "figure_aspect": 1,
48
- "sp": 0.056077721336026655,
49
- "rp": 2.563161803780404
50
- },
51
- {
52
- "panel_id": 4,
53
- "section_name": "TEBOpt",
54
- "tp": 0.08632193494578816,
55
- "text_len": 207,
56
- "gp": 0.1977243938477422,
57
- "figure_size": 920794,
58
- "figure_aspect": 2.3723916532905296,
59
- "sp": 0.1129501119808965,
60
- "rp": 2.1008022748899986
61
- },
62
- {
63
- "panel_id": 5,
64
- "section_name": "Qualitative & Quantitative Results",
65
- "tp": 0.09257714762301918,
66
- "text_len": 222,
67
- "gp": 0.1792402205989877,
68
- "figure_size": 834714,
69
- "figure_aspect": 1.651195499296765,
70
- "sp": 0.10897098426749077,
71
- "rp": 2.132955085244183
72
- },
73
- {
74
- "panel_id": 6,
75
- "section_name": "Introduction",
76
- "tp": 0.1542952460383653,
77
- "text_len": 370,
78
- "gp": 0,
79
- "figure_size": 0,
80
- "figure_aspect": 1,
81
- "sp": 0.07071832383606555,
82
- "rp": 2.4419852531913038
83
- },
84
- {
85
- "panel_id": 7,
86
- "section_name": "Discussion",
87
- "tp": 0.0896580483736447,
88
- "text_len": 215,
89
- "gp": 0.0438698489163632,
90
- "figure_size": 204300,
91
- "figure_aspect": 1.008888888888889,
92
- "sp": 0.06455443146973633,
93
- "rp": 2.4948568265511564
94
- },
95
- {
96
- "panel_id": 8,
97
- "section_name": "Conclusion",
98
- "tp": 0.09883236030025021,
99
- "text_len": 237,
100
- "gp": 0,
101
- "figure_size": 0,
102
- "figure_aspect": 1,
103
- "sp": 0.053332608367269364,
104
- "rp": 2.58588240701586
105
- }
106
- ],
107
- "panel_arrangement": [
108
- {
109
- "panel_name": "Poster Title & Author",
110
- "panel_id": 0,
111
- "x": 0,
112
- "y": 0,
113
- "width": 1200,
114
- "height": 90.0
115
- },
116
- {
117
- "panel_name": "Abstract",
118
- "panel_id": 1,
119
- "x": 0,
120
- "y": 90.0,
121
- "width": 479.52708207863833,
122
- "height": 187.5160294515261
123
- },
124
- {
125
- "panel_name": "Preliminaries",
126
- "panel_id": 2,
127
- "x": 0,
128
- "y": 277.5160294515261,
129
- "width": 479.52708207863833,
130
- "height": 622.483970548474
131
- },
132
- {
133
- "panel_name": "Experiments",
134
- "panel_id": 3,
135
- "x": 479.52708207863833,
136
- "y": 90.0,
137
- "width": 239.02855954850014,
138
- "height": 293.4233135625409
139
- },
140
- {
141
- "panel_name": "TEBOpt",
142
- "panel_id": 4,
143
- "x": 718.5556416271385,
144
- "y": 90.0,
145
- "width": 481.4443583728615,
146
- "height": 293.4233135625409
147
- },
148
- {
149
- "panel_name": "Qualitative & Quantitative Results",
150
- "panel_id": 5,
151
- "x": 479.52708207863833,
152
- "y": 383.4233135625409,
153
- "width": 263.8336129444118,
154
- "height": 516.5766864374591
155
- },
156
- {
157
- "panel_name": "Introduction",
158
- "panel_id": 6,
159
- "x": 743.3606950230501,
160
- "y": 383.4233135625409,
161
- "width": 456.6393049769498,
162
- "height": 193.69246285577307
163
- },
164
- {
165
- "panel_name": "Discussion",
166
- "panel_id": 7,
167
- "x": 743.3606950230501,
168
- "y": 577.115776418314,
169
- "width": 456.6393049769498,
170
- "height": 176.80999974791118
171
- },
172
- {
173
- "panel_name": "Conclusion",
174
- "panel_id": 8,
175
- "x": 743.3606950230501,
176
- "y": 753.9257761662252,
177
- "width": 456.6393049769498,
178
- "height": 146.07422383377482
179
- }
180
- ],
181
- "figure_arrangement": [
182
- {
183
- "panel_id": 2,
184
- "x": 50.35270820786383,
185
- "y": 452.87845388586913,
186
- "width": 378.8216656629107,
187
- "height": 271.7591216797879,
188
- "figure_id": 0,
189
- "figure_name": "p<Preliminaries>_f0",
190
- "figure_path": "<4o_4o>_images_and_tables/paper/paper-picture-2.png"
191
- },
192
- {
193
- "panel_id": 4,
194
- "x": 769.1000774644247,
195
- "y": 156.54877849539963,
196
- "width": 380.3554866982892,
197
- "height": 160.32575657174166,
198
- "figure_id": 0,
199
- "figure_name": "p<TEBOpt>_f0",
200
- "figure_path": "<4o_4o>_images_and_tables/paper/paper-table-4.png"
201
- },
202
- {
203
- "panel_id": 5,
204
- "x": 508.3104433730795,
205
- "y": 579.2517934751454,
206
- "width": 206.26689035552945,
207
- "height": 124.91972661224996,
208
- "figure_id": 0,
209
- "figure_name": "p<Qualitative & Quantitative Results>_f0",
210
- "figure_path": "<4o_4o>_images_and_tables/paper/paper-picture-13.png"
211
- },
212
- {
213
- "panel_id": 7,
214
- "x": 919.9818542544906,
215
- "y": 614.2777763678962,
216
- "width": 103.39698651406891,
217
- "height": 102.4859998487467,
218
- "figure_id": 0,
219
- "figure_name": "p<Discussion>_f0",
220
- "figure_path": "<4o_4o>_images_and_tables/paper/paper-picture-9.png"
221
- }
222
- ],
223
- "text_arrangement": [
224
- {
225
- "panel_id": 0,
226
- "x": 3.0,
227
- "y": 3.0,
228
- "width": 1194.0,
229
- "height": 37.333333333333336,
230
- "textbox_id": 0,
231
- "textbox_name": "p<Poster Title & Author>_t0",
232
- "num_chars": 410
233
- },
234
- {
235
- "panel_id": 0,
236
- "x": 3.0,
237
- "y": 40.333333333333336,
238
- "width": 1194.0,
239
- "height": 46.666666666666664,
240
- "textbox_id": 0,
241
- "textbox_name": "p<Poster Title & Author>_t1",
242
- "num_chars": 410
243
- },
244
- {
245
- "panel_id": 1,
246
- "x": 3.0,
247
- "y": 93.0,
248
- "width": 473.52708207863833,
249
- "height": 32.0,
250
- "textbox_id": 0,
251
- "textbox_name": "p<Abstract>_t0",
252
- "num_chars": 160
253
- },
254
- {
255
- "panel_id": 1,
256
- "x": 3.0,
257
- "y": 125.0,
258
- "width": 473.52708207863833,
259
- "height": 149.5160294515261,
260
- "textbox_id": 1,
261
- "textbox_name": "p<Abstract>_t1",
262
- "num_chars": 320
263
- },
264
- {
265
- "panel_id": 2,
266
- "x": 3.0,
267
- "y": 280.5160294515261,
268
- "width": 473.52708207863833,
269
- "height": 32.0,
270
- "textbox_id": 0,
271
- "textbox_name": "p<Preliminaries>_t0",
272
- "num_chars": 160
273
- },
274
- {
275
- "panel_id": 2,
276
- "x": 3.0,
277
- "y": 312.5160294515261,
278
- "width": 473.52708207863833,
279
- "height": 140.36242443434304,
280
- "textbox_id": 1,
281
- "textbox_name": "p<Preliminaries>_t1",
282
- "num_chars": 320
283
- },
284
- {
285
- "panel_id": 2,
286
- "x": 3.0,
287
- "y": 724.637575565657,
288
- "width": 473.52708207863833,
289
- "height": 172.36242443434298,
290
- "textbox_id": 2,
291
- "textbox_name": "p<Preliminaries>_t2",
292
- "num_chars": 480
293
- },
294
- {
295
- "panel_id": 3,
296
- "x": 482.52708207863833,
297
- "y": 93.0,
298
- "width": 233.02855954850014,
299
- "height": 32.0,
300
- "textbox_id": 0,
301
- "textbox_name": "p<Experiments>_t0",
302
- "num_chars": 80
303
- },
304
- {
305
- "panel_id": 3,
306
- "x": 482.52708207863833,
307
- "y": 125.0,
308
- "width": 233.02855954850014,
309
- "height": 255.4233135625409,
310
- "textbox_id": 1,
311
- "textbox_name": "p<Experiments>_t1",
312
- "num_chars": 320
313
- },
314
- {
315
- "panel_id": 4,
316
- "x": 721.5556416271385,
317
- "y": 93.0,
318
- "width": 475.4443583728615,
319
- "height": 32.0,
320
- "textbox_id": 0,
321
- "textbox_name": "p<TEBOpt>_t0",
322
- "num_chars": 160
323
- },
324
- {
325
- "panel_id": 4,
326
- "x": 721.5556416271385,
327
- "y": 125.0,
328
- "width": 475.4443583728615,
329
- "height": 31.54877849539963,
330
- "textbox_id": 1,
331
- "textbox_name": "p<TEBOpt>_t1",
332
- "num_chars": 160
333
- },
334
- {
335
- "panel_id": 4,
336
- "x": 721.5556416271385,
337
- "y": 316.8745350671413,
338
- "width": 475.4443583728615,
339
- "height": 63.548778495399574,
340
- "textbox_id": 2,
341
- "textbox_name": "p<TEBOpt>_t2",
342
- "num_chars": 160
343
- },
344
- {
345
- "panel_id": 5,
346
- "x": 482.52708207863833,
347
- "y": 386.4233135625409,
348
- "width": 257.8336129444118,
349
- "height": 32.0,
350
- "textbox_id": 0,
351
- "textbox_name": "p<Qualitative & Quantitative Results>_t0",
352
- "num_chars": 80
353
- },
354
- {
355
- "panel_id": 5,
356
- "x": 482.52708207863833,
357
- "y": 418.4233135625409,
358
- "width": 257.8336129444118,
359
- "height": 160.8284799126045,
360
- "textbox_id": 1,
361
- "textbox_name": "p<Qualitative & Quantitative Results>_t1",
362
- "num_chars": 240
363
- },
364
- {
365
- "panel_id": 5,
366
- "x": 482.52708207863833,
367
- "y": 704.1715200873954,
368
- "width": 257.8336129444118,
369
- "height": 192.82847991260462,
370
- "textbox_id": 2,
371
- "textbox_name": "p<Qualitative & Quantitative Results>_t2",
372
- "num_chars": 240
373
- },
374
- {
375
- "panel_id": 6,
376
- "x": 746.3606950230501,
377
- "y": 386.4233135625409,
378
- "width": 450.6393049769498,
379
- "height": 32.0,
380
- "textbox_id": 0,
381
- "textbox_name": "p<Introduction>_t0",
382
- "num_chars": 150
383
- },
384
- {
385
- "panel_id": 6,
386
- "x": 746.3606950230501,
387
- "y": 418.4233135625409,
388
- "width": 450.6393049769498,
389
- "height": 155.69246285577307,
390
- "textbox_id": 1,
391
- "textbox_name": "p<Introduction>_t1",
392
- "num_chars": 300
393
- },
394
- {
395
- "panel_id": 7,
396
- "x": 746.3606950230501,
397
- "y": 580.115776418314,
398
- "width": 450.6393049769498,
399
- "height": 32.0,
400
- "textbox_id": 0,
401
- "textbox_name": "p<Discussion>_t0",
402
- "num_chars": 150
403
- },
404
- {
405
- "panel_id": 7,
406
- "x": 746.3606950230501,
407
- "y": 612.115776418314,
408
- "width": 450.6393049769498,
409
- "height": 2.1619999495821958,
410
- "textbox_id": 1,
411
- "textbox_name": "p<Discussion>_t1",
412
- "num_chars": 150
413
- },
414
- {
415
- "panel_id": 7,
416
- "x": 746.3606950230501,
417
- "y": 716.7637762166429,
418
- "width": 450.6393049769498,
419
- "height": 34.16199994958231,
420
- "textbox_id": 2,
421
- "textbox_name": "p<Discussion>_t2",
422
- "num_chars": 150
423
- },
424
- {
425
- "panel_id": 8,
426
- "x": 746.3606950230501,
427
- "y": 756.9257761662252,
428
- "width": 450.6393049769498,
429
- "height": 32.0,
430
- "textbox_id": 0,
431
- "textbox_name": "p<Conclusion>_t0",
432
- "num_chars": 150
433
- },
434
- {
435
- "panel_id": 8,
436
- "x": 746.3606950230501,
437
- "y": 788.9257761662252,
438
- "width": 450.6393049769498,
439
- "height": 108.07422383377482,
440
- "textbox_id": 1,
441
- "textbox_name": "p<Conclusion>_t1",
442
- "num_chars": 300
443
- }
444
- ],
445
- "panel_arrangement_inches": [
446
- {
447
- "panel_name": "Poster Title & Author",
448
- "panel_id": 0,
449
- "x": 0.0,
450
- "y": 0.0,
451
- "width": 48.0,
452
- "height": 3.6
453
- },
454
- {
455
- "panel_name": "Abstract",
456
- "panel_id": 1,
457
- "x": 0.0,
458
- "y": 3.6,
459
- "width": 19.181083283145533,
460
- "height": 7.500641178061044
461
- },
462
- {
463
- "panel_name": "Preliminaries",
464
- "panel_id": 2,
465
- "x": 0.0,
466
- "y": 11.100641178061045,
467
- "width": 19.181083283145533,
468
- "height": 24.899358821938957
469
- },
470
- {
471
- "panel_name": "Experiments",
472
- "panel_id": 3,
473
- "x": 19.181083283145533,
474
- "y": 3.6,
475
- "width": 9.561142381940005,
476
- "height": 11.736932542501636
477
- },
478
- {
479
- "panel_name": "TEBOpt",
480
- "panel_id": 4,
481
- "x": 28.742225665085538,
482
- "y": 3.6,
483
- "width": 19.25777433491446,
484
- "height": 11.736932542501636
485
- },
486
- {
487
- "panel_name": "Qualitative & Quantitative Results",
488
- "panel_id": 5,
489
- "x": 19.181083283145533,
490
- "y": 15.336932542501636,
491
- "width": 10.553344517776473,
492
- "height": 20.663067457498364
493
- },
494
- {
495
- "panel_name": "Introduction",
496
- "panel_id": 6,
497
- "x": 29.734427800922003,
498
- "y": 15.336932542501636,
499
- "width": 18.265572199077994,
500
- "height": 7.7476985142309225
501
- },
502
- {
503
- "panel_name": "Discussion",
504
- "panel_id": 7,
505
- "x": 29.734427800922003,
506
- "y": 23.08463105673256,
507
- "width": 18.265572199077994,
508
- "height": 7.072399989916447
509
- },
510
- {
511
- "panel_name": "Conclusion",
512
- "panel_id": 8,
513
- "x": 29.734427800922003,
514
- "y": 30.15703104664901,
515
- "width": 18.265572199077994,
516
- "height": 5.842968953350993
517
- }
518
- ],
519
- "figure_arrangement_inches": [
520
- {
521
- "panel_id": 2,
522
- "x": 2.014108328314553,
523
- "y": 18.115138155434764,
524
- "width": 15.152866626516428,
525
- "height": 10.870364867191515,
526
- "figure_id": 0,
527
- "figure_name": "p<Preliminaries>_f0",
528
- "figure_path": "<4o_4o>_images_and_tables/paper/paper-picture-2.png"
529
- },
530
- {
531
- "panel_id": 4,
532
- "x": 30.764003098576985,
533
- "y": 6.261951139815985,
534
- "width": 15.214219467931569,
535
- "height": 6.4130302628696665,
536
- "figure_id": 0,
537
- "figure_name": "p<TEBOpt>_f0",
538
- "figure_path": "<4o_4o>_images_and_tables/paper/paper-table-4.png"
539
- },
540
- {
541
- "panel_id": 5,
542
- "x": 20.33241773492318,
543
- "y": 23.170071739005817,
544
- "width": 8.250675614221178,
545
- "height": 4.996789064489999,
546
- "figure_id": 0,
547
- "figure_name": "p<Qualitative & Quantitative Results>_f0",
548
- "figure_path": "<4o_4o>_images_and_tables/paper/paper-picture-13.png"
549
- },
550
- {
551
- "panel_id": 7,
552
- "x": 36.79927417017962,
553
- "y": 24.571111054715846,
554
- "width": 4.135879460562757,
555
- "height": 4.0994399939498685,
556
- "figure_id": 0,
557
- "figure_name": "p<Discussion>_f0",
558
- "figure_path": "<4o_4o>_images_and_tables/paper/paper-picture-9.png"
559
- }
560
- ],
561
- "text_arrangement_inches": [
562
- {
563
- "panel_id": 0,
564
- "x": 0.12,
565
- "y": 0.12,
566
- "width": 47.76,
567
- "height": 1.4933333333333334,
568
- "textbox_id": 0,
569
- "textbox_name": "p<Poster Title & Author>_t0",
570
- "num_chars": 410
571
- },
572
- {
573
- "panel_id": 0,
574
- "x": 0.12,
575
- "y": 1.6133333333333335,
576
- "width": 47.76,
577
- "height": 1.8666666666666665,
578
- "textbox_id": 0,
579
- "textbox_name": "p<Poster Title & Author>_t1",
580
- "num_chars": 410
581
- },
582
- {
583
- "panel_id": 1,
584
- "x": 0.12,
585
- "y": 3.72,
586
- "width": 18.941083283145534,
587
- "height": 1.28,
588
- "textbox_id": 0,
589
- "textbox_name": "p<Abstract>_t0",
590
- "num_chars": 160
591
- },
592
- {
593
- "panel_id": 1,
594
- "x": 0.12,
595
- "y": 5.0,
596
- "width": 18.941083283145534,
597
- "height": 5.980641178061044,
598
- "textbox_id": 1,
599
- "textbox_name": "p<Abstract>_t1",
600
- "num_chars": 320
601
- },
602
- {
603
- "panel_id": 2,
604
- "x": 0.12,
605
- "y": 11.220641178061044,
606
- "width": 18.941083283145534,
607
- "height": 1.28,
608
- "textbox_id": 0,
609
- "textbox_name": "p<Preliminaries>_t0",
610
- "num_chars": 160
611
- },
612
- {
613
- "panel_id": 2,
614
- "x": 0.12,
615
- "y": 12.500641178061043,
616
- "width": 18.941083283145534,
617
- "height": 5.614496977373721,
618
- "textbox_id": 1,
619
- "textbox_name": "p<Preliminaries>_t1",
620
- "num_chars": 320
621
- },
622
- {
623
- "panel_id": 2,
624
- "x": 0.12,
625
- "y": 28.98550302262628,
626
- "width": 18.941083283145534,
627
- "height": 6.894496977373719,
628
- "textbox_id": 2,
629
- "textbox_name": "p<Preliminaries>_t2",
630
- "num_chars": 480
631
- },
632
- {
633
- "panel_id": 3,
634
- "x": 19.301083283145534,
635
- "y": 3.72,
636
- "width": 9.321142381940005,
637
- "height": 1.28,
638
- "textbox_id": 0,
639
- "textbox_name": "p<Experiments>_t0",
640
- "num_chars": 80
641
- },
642
- {
643
- "panel_id": 3,
644
- "x": 19.301083283145534,
645
- "y": 5.0,
646
- "width": 9.321142381940005,
647
- "height": 10.216932542501636,
648
- "textbox_id": 1,
649
- "textbox_name": "p<Experiments>_t1",
650
- "num_chars": 320
651
- },
652
- {
653
- "panel_id": 4,
654
- "x": 28.86222566508554,
655
- "y": 3.72,
656
- "width": 19.01777433491446,
657
- "height": 1.28,
658
- "textbox_id": 0,
659
- "textbox_name": "p<TEBOpt>_t0",
660
- "num_chars": 160
661
- },
662
- {
663
- "panel_id": 4,
664
- "x": 28.86222566508554,
665
- "y": 5.0,
666
- "width": 19.01777433491446,
667
- "height": 1.2619511398159853,
668
- "textbox_id": 1,
669
- "textbox_name": "p<TEBOpt>_t1",
670
- "num_chars": 160
671
- },
672
- {
673
- "panel_id": 4,
674
- "x": 28.86222566508554,
675
- "y": 12.674981402685653,
676
- "width": 19.01777433491446,
677
- "height": 2.541951139815983,
678
- "textbox_id": 2,
679
- "textbox_name": "p<TEBOpt>_t2",
680
- "num_chars": 160
681
- },
682
- {
683
- "panel_id": 5,
684
- "x": 19.301083283145534,
685
- "y": 15.456932542501637,
686
- "width": 10.313344517776473,
687
- "height": 1.28,
688
- "textbox_id": 0,
689
- "textbox_name": "p<Qualitative & Quantitative Results>_t0",
690
- "num_chars": 80
691
- },
692
- {
693
- "panel_id": 5,
694
- "x": 19.301083283145534,
695
- "y": 16.736932542501634,
696
- "width": 10.313344517776473,
697
- "height": 6.433139196504181,
698
- "textbox_id": 1,
699
- "textbox_name": "p<Qualitative & Quantitative Results>_t1",
700
- "num_chars": 240
701
- },
702
- {
703
- "panel_id": 5,
704
- "x": 19.301083283145534,
705
- "y": 28.166860803495815,
706
- "width": 10.313344517776473,
707
- "height": 7.7131391965041844,
708
- "textbox_id": 2,
709
- "textbox_name": "p<Qualitative & Quantitative Results>_t2",
710
- "num_chars": 240
711
- },
712
- {
713
- "panel_id": 6,
714
- "x": 29.854427800922004,
715
- "y": 15.456932542501637,
716
- "width": 18.025572199077992,
717
- "height": 1.28,
718
- "textbox_id": 0,
719
- "textbox_name": "p<Introduction>_t0",
720
- "num_chars": 150
721
- },
722
- {
723
- "panel_id": 6,
724
- "x": 29.854427800922004,
725
- "y": 16.736932542501634,
726
- "width": 18.025572199077992,
727
- "height": 6.227698514230923,
728
- "textbox_id": 1,
729
- "textbox_name": "p<Introduction>_t1",
730
- "num_chars": 300
731
- },
732
- {
733
- "panel_id": 7,
734
- "x": 29.854427800922004,
735
- "y": 23.20463105673256,
736
- "width": 18.025572199077992,
737
- "height": 1.28,
738
- "textbox_id": 0,
739
- "textbox_name": "p<Discussion>_t0",
740
- "num_chars": 150
741
- },
742
- {
743
- "panel_id": 7,
744
- "x": 29.854427800922004,
745
- "y": 24.48463105673256,
746
- "width": 18.025572199077992,
747
- "height": 0.08647999798328783,
748
- "textbox_id": 1,
749
- "textbox_name": "p<Discussion>_t1",
750
- "num_chars": 150
751
- },
752
- {
753
- "panel_id": 7,
754
- "x": 29.854427800922004,
755
- "y": 28.670551048665715,
756
- "width": 18.025572199077992,
757
- "height": 1.3664799979832924,
758
- "textbox_id": 2,
759
- "textbox_name": "p<Discussion>_t2",
760
- "num_chars": 150
761
- },
762
- {
763
- "panel_id": 8,
764
- "x": 29.854427800922004,
765
- "y": 30.277031046649007,
766
- "width": 18.025572199077992,
767
- "height": 1.28,
768
- "textbox_id": 0,
769
- "textbox_name": "p<Conclusion>_t0",
770
- "num_chars": 150
771
- },
772
- {
773
- "panel_id": 8,
774
- "x": 29.854427800922004,
775
- "y": 31.55703104664901,
776
- "width": 18.025572199077992,
777
- "height": 4.322968953350993,
778
- "textbox_id": 1,
779
- "textbox_name": "p<Conclusion>_t1",
780
- "num_chars": 300
781
- }
782
- ]
783
- }
 
posterbuilder/contents copy/figure_caption.json DELETED
@@ -1,258 +0,0 @@
1
- {
2
- "1": {
3
- "caption": "a lion and elephant chicken and a dog an",
4
- "image_path": "<4o_4o>_images_and_tables/paper/paper-picture-1.png",
5
- "width": 947,
6
- "height": 845,
7
- "figure_size": 800215,
8
- "figure_aspect": 1.1207100591715977
9
- },
10
- "2": {
11
- "caption": "Figure 2: Overview of the text-to-image generative model, including the details of the causal manner in attention mechanism. Because of the causal nature of the embedding, information is accumulated from the starting token through the end of the sequence, resulting in bias in the earlier token. To balance the critical information, we propose text embedding optimization for purifying the object token with equal weights within their corresponding embedding dimension.",
12
- "image_path": "<4o_4o>_images_and_tables/paper/paper-picture-2.png",
13
- "width": 1939,
14
- "height": 1391,
15
- "figure_size": 2697149,
16
- "figure_aspect": 1.393961179007908
17
- },
18
- "5": {
19
- "caption": "<sot> A cat and a <eot> dog",
20
- "image_path": "<4o_4o>_images_and_tables/paper/paper-picture-5.png",
21
- "width": 453,
22
- "height": 454,
23
- "figure_size": 205662,
24
- "figure_aspect": 0.9977973568281938
25
- },
26
- "9": {
27
- "caption": "Figure 3: Masking text embedding to identify the contribution of critical tokens, e.g., cat/dog, and special tokens, e.g., <sot>, <eot>, <pad>. The first row and the second row both contain cat and dog inside prompt but in different order. The analysis shows that special tokens contain general information about the given prompt. However, the cat/dog tokens carry more weight than the special tokens. In the last two columns, where one of the animal token embeddings is masked while retaining the special tokens' embedding, the generated image is predominantly influenced by the remaining animal's token embedding.",
28
- "image_path": "<4o_4o>_images_and_tables/paper/paper-picture-9.png",
29
- "width": 454,
30
- "height": 450,
31
- "figure_size": 204300,
32
- "figure_aspect": 1.008888888888889
33
- },
34
- "10": {
35
- "caption": "<sot> A cat and a <eot> dog",
36
- "image_path": "<4o_4o>_images_and_tables/paper/paper-picture-10.png",
37
- "width": 456,
38
- "height": 457,
39
- "figure_size": 208392,
40
- "figure_aspect": 0.9978118161925602
41
- },
42
- "11": {
43
- "caption": "Figure 4: Qualitative comparison of all methods. Every prompt uses the same seed.",
44
- "image_path": "<4o_4o>_images_and_tables/paper/paper-picture-11.png",
45
- "width": 1952,
46
- "height": 644,
47
- "figure_size": 1257088,
48
- "figure_aspect": 3.031055900621118
49
- },
50
- "12": {
51
- "caption": "Figure 5: Qualitative comparison for the generated image with vs. without L TEB in Stable Diffusion 1.4. Every prompt uses the same seed.",
52
- "image_path": "<4o_4o>_images_and_tables/paper/paper-picture-12.png",
53
- "width": 1947,
54
- "height": 794,
55
- "figure_size": 1545918,
56
- "figure_aspect": 2.452141057934509
57
- },
58
- "13": {
59
- "caption": "Figure 6: (a) The cosine similarity of text embedding from single word. (b) The KL distance of cross-attention maps that are triggered by two words. The data is ordered by their text embedding similarity.",
60
- "image_path": "<4o_4o>_images_and_tables/paper/paper-picture-13.png",
61
- "width": 1174,
62
- "height": 711,
63
- "figure_size": 834714,
64
- "figure_aspect": 1.651195499296765
65
- },
66
- "14": {
67
- "caption": "Figure 8: Text-text similarity of the left one is 8.68% higher than that of the right one. It indicates that the metric cannot identify the mixture issue.",
68
- "image_path": "<4o_4o>_images_and_tables/paper/paper-picture-14.png",
69
- "width": 481,
70
- "height": 485,
71
- "figure_size": 233285,
72
- "figure_aspect": 0.9917525773195877
73
- },
74
- "18": {
75
- "caption": "Figure 9: In two images both with mixed objects, full prompt similarity , minimum object similarity , and text-text similarity all vary greatly, making the evaluation metrics unreliable for object mixture and missing.",
76
- "image_path": "<4o_4o>_images_and_tables/paper/paper-picture-18.png",
77
- "width": 488,
78
- "height": 489,
79
- "figure_size": 238632,
80
- "figure_aspect": 0.9979550102249489
81
- },
82
- "19": {
83
- "caption": "Figure 10: Demonstrating the 90% bounding box overlapping and corresponding object mixture in generated image and cross-attention maps during denoising steps.",
84
- "image_path": "<4o_4o>_images_and_tables/paper/paper-picture-19.png",
85
- "width": 1981,
86
- "height": 840,
87
- "figure_size": 1664040,
88
- "figure_aspect": 2.3583333333333334
89
- },
90
- "21": {
91
- "caption": "SD 1.4 Missing BoyA bear and a 'frog",
92
- "image_path": "<4o_4o>_images_and_tables/paper/paper-picture-21.png",
93
- "width": 660,
94
- "height": 331,
95
- "figure_size": 218460,
96
- "figure_aspect": 1.9939577039274925
97
- },
98
- "22": {
99
- "caption": "A red bench and a green bird",
100
- "image_path": "<4o_4o>_images_and_tables/paper/paper-picture-22.png",
101
- "width": 654,
102
- "height": 329,
103
- "figure_size": 215166,
104
- "figure_aspect": 1.987841945288754
105
- },
106
- "23": {
107
- "caption": "SD 1.4 Cat Missingred bird and a brown boat",
108
- "image_path": "<4o_4o>_images_and_tables/paper/paper-picture-23.png",
109
- "width": 655,
110
- "height": 332,
111
- "figure_size": 217460,
112
- "figure_aspect": 1.9728915662650603
113
- },
114
- "24": {
115
- "caption": "Figure 11: More qualitative results on SD 1.4 in complex prompts from color and spatial sets within T2I-CompBench [5].",
116
- "image_path": "<4o_4o>_images_and_tables/paper/paper-picture-24.png",
117
- "width": 658,
118
- "height": 332,
119
- "figure_size": 218456,
120
- "figure_aspect": 1.9819277108433735
121
- },
122
- "26": {
123
- "caption": "SD 1.4 + LTEB",
124
- "image_path": "<4o_4o>_images_and_tables/paper/paper-picture-26.png",
125
- "width": 660,
126
- "height": 333,
127
- "figure_size": 219780,
128
- "figure_aspect": 1.981981981981982
129
- },
130
- "27": {
131
- "caption": "SD 1.4 Car MissingA sheep near a car",
132
- "image_path": "<4o_4o>_images_and_tables/paper/paper-picture-27.png",
133
- "width": 660,
134
- "height": 328,
135
- "figure_size": 216480,
136
- "figure_aspect": 2.0121951219512195
137
- },
138
- "29": {
139
- "caption": "brown cat and a red suitcase",
140
- "image_path": "<4o_4o>_images_and_tables/paper/paper-picture-29.png",
141
- "width": 655,
142
- "height": 330,
143
- "figure_size": 216150,
144
- "figure_aspect": 1.9848484848484849
145
- },
146
- "31": {
147
- "caption": "SD 1.4 LTEB",
148
- "image_path": "<4o_4o>_images_and_tables/paper/paper-picture-31.png",
149
- "width": 654,
150
- "height": 331,
151
- "figure_size": 216474,
152
- "figure_aspect": 1.9758308157099698
153
- },
154
- "32": {
155
- "caption": "SD 1.4 +LTEB",
156
- "image_path": "<4o_4o>_images_and_tables/paper/paper-picture-32.png",
157
- "width": 657,
158
- "height": 347,
159
- "figure_size": 227979,
160
- "figure_aspect": 1.893371757925072
161
- },
162
- "33": {
163
- "caption": "Figure 12: More qualitative results on ELLA on SD 1.5 in complex prompts from color set within T2I-CompBench [5]. Reference: ELLA: Equip Diffusion Models with LLM for Enhanced Semantic Alignment (ArXiv'24)",
164
- "image_path": "<4o_4o>_images_and_tables/paper/paper-picture-33.png",
165
- "width": 979,
166
- "height": 332,
167
- "figure_size": 325028,
168
- "figure_aspect": 2.9487951807228914
169
- },
170
- "34": {
171
- "caption": "SDXL-Turbo Fork MissingA black dog and a brown cat",
172
- "image_path": "<4o_4o>_images_and_tables/paper/paper-picture-34.png",
173
- "width": 658,
174
- "height": 330,
175
- "figure_size": 217140,
176
- "figure_aspect": 1.993939393939394
177
- },
178
- "35": {
179
- "caption": "SDXL-Turbo LTEBA blue chair and a red cup",
180
- "image_path": "<4o_4o>_images_and_tables/paper/paper-picture-35.png",
181
- "width": 654,
182
- "height": 327,
183
- "figure_size": 213858,
184
- "figure_aspect": 2.0
185
- },
186
- "36": {
187
- "caption": "SDXL-Turbo LTEB",
188
- "image_path": "<4o_4o>_images_and_tables/paper/paper-picture-36.png",
189
- "width": 653,
190
- "height": 332,
191
- "figure_size": 216796,
192
- "figure_aspect": 1.966867469879518
193
- },
194
- "37": {
195
- "caption": "wooden spoon and a metal fork",
196
- "image_path": "<4o_4o>_images_and_tables/paper/paper-picture-37.png",
197
- "width": 652,
198
- "height": 327,
199
- "figure_size": 213204,
200
- "figure_aspect": 1.9938837920489296
201
- },
202
- "38": {
203
- "caption": "A green bench and a red book",
204
- "image_path": "<4o_4o>_images_and_tables/paper/paper-picture-38.png",
205
- "width": 656,
206
- "height": 333,
207
- "figure_size": 218448,
208
- "figure_aspect": 1.96996996996997
209
- },
210
- "39": {
211
- "caption": "brown bench and a clock green",
212
- "image_path": "<4o_4o>_images_and_tables/paper/paper-picture-39.png",
213
- "width": 649,
214
- "height": 326,
215
- "figure_size": 211574,
216
- "figure_aspect": 1.99079754601227
217
- },
218
- "40": {
219
- "caption": "SD 3 Color Mixture Orange MissingA blue bowl and a yellow orange",
220
- "image_path": "<4o_4o>_images_and_tables/paper/paper-picture-40.png",
221
- "width": 641,
222
- "height": 322,
223
- "figure_size": 206402,
224
- "figure_aspect": 1.9906832298136645
225
- },
226
- "41": {
227
- "caption": "Figure 14: More qualitative results on SD3 [2] in complex prompts from color set within T2ICompBench [5].",
228
- "image_path": "<4o_4o>_images_and_tables/paper/paper-picture-41.png",
229
- "width": 305,
230
- "height": 310,
231
- "figure_size": 94550,
232
- "figure_aspect": 0.9838709677419355
233
- },
234
- "43": {
235
- "caption": "brown backpack and a blue cow",
236
- "image_path": "<4o_4o>_images_and_tables/paper/paper-picture-43.png",
237
- "width": 303,
238
- "height": 302,
239
- "figure_size": 91506,
240
- "figure_aspect": 1.0033112582781456
241
- },
242
- "45": {
243
- "caption": "A green acorn and a brown leaf",
244
- "image_path": "<4o_4o>_images_and_tables/paper/paper-picture-45.png",
245
- "width": 644,
246
- "height": 321,
247
- "figure_size": 206724,
248
- "figure_aspect": 2.0062305295950154
249
- },
250
- "47": {
251
- "caption": "Figure 15: The screenshot of the human evaluation, containing the information and options that are given to participants.",
252
- "image_path": "<4o_4o>_images_and_tables/paper/paper-picture-47.png",
253
- "width": 1588,
254
- "height": 1122,
255
- "figure_size": 1781736,
256
- "figure_aspect": 1.4153297682709447
257
- }
258
- }
 
posterbuilder/contents copy/poster_content.json DELETED
@@ -1,45 +0,0 @@
1
- {
2
- "meta": {
3
- "poster_title": "A Cat Is A Cat (Not A Dog!): Unraveling Information Mix-ups in Text-to-Image Encoders through Causal Analysis and Embedding Optimization",
4
- "authors": "Chieh-Yun Chen, Chiang Tseng, Li-Wu Tsao, Hong-Han Shuai",
5
- "affiliations": "National Yang Ming Chiao Tung University, Georgia Institute of Technology"
6
- },
7
- "sections": [
8
- {
9
- "title": "Poster Title & Author",
10
- "content": "A Cat Is A Cat (Not A Dog!): Unraveling Information Mix-ups in Text-to-Image Encoders through Causal Analysis and Embedding Optimization by Chieh-Yun Chen, Chiang Tseng, Li-Wu Tsao, Hong-Han Shuai from National Yang Ming Chiao Tung University and Georgia Institute of Technology."
11
- },
12
- {
13
- "title": "Abstract",
14
- "content": "This paper analyzes the impact of causal manner in the text encoder of text-to-image (T2I) diffusion models, which can lead to information bias and loss. We propose a text embedding balance optimization method with a 125.42% improvement on information balance in stable diffusion. A new automatic evaluation metric is introduced, achieving 81% concordance with human assessments."
15
- },
16
- {
17
- "title": "Preliminaries",
18
- "content": "Text-to-image diffusion models include a text encoder, a variational autoencoder, and a denoising UNet. The causal masking manner in the text encoder causes information bias, as each token only has information from previous tokens."
19
- },
20
- {
21
- "title": "Experiments",
22
- "content": "We compare our method with baselines like Stable Diffusion and SynGen, focusing on information balance rather than surpassing existing methods. Our automatic evaluation metric, validated by human assessment, effectively measures object presence and accuracy."
23
- },
24
- {
25
- "title": "TEBOpt",
26
- "content": "TEBOpt aims to balance critical information in text embeddings by optimizing object token embeddings to prevent mixing and work alongside image latent optimization techniques to address object disappearance."
27
- },
28
- {
29
- "title": "Qualitative & Quantitative Results",
30
- "content": "TEBOpt improves object balance in generated images, reducing mixture and missing issues. It enhances token embedding similarity and cross-attention map distance, confirming its effectiveness in addressing information bias."
31
- },
32
- {
33
- "title": "Introduction",
34
- "content": "Text-to-image diffusion models have gained attention, but the role of text embedding in generating multiple objects remains underexplored. This paper investigates how text embeddings influence semantic outcomes, identifying issues of information bias and loss. We propose Text Embedding Balance Optimization (TEBOpt) to address these issues and improve image generation."
35
- },
36
- {
37
- "title": "Discussion",
38
- "content": "Text embedding similarity affects cross-attention maps' distance, with similar embeddings leading to object mixture. Our findings highlight the need for optimized text embeddings to improve image generation quality."
39
- },
40
- {
41
- "title": "Conclusion",
42
- "content": "Our study reveals that causal processing of text embedding leads to biases and loss. TEBOpt effectively eliminates problematic information, improving information balance in stable diffusion by 125.42% while preserving object coexistence."
43
- }
44
- ]
45
- }
posterbuilder/contents/arrangement.json CHANGED
@@ -1,542 +1,115 @@
1
  {
2
- "poster_width": 1200,
3
- "poster_height": 900,
4
- "poster_width_inches": 48.0,
5
- "poster_height_inches": 36.0,
6
  "panels": [
7
  {
8
  "panel_id": 0,
9
- "section_name": "Poster Title & Author",
10
- "tp": 0.12971887550200803,
11
- "text_len": 323,
12
- "gp": 0,
13
- "figure_size": 0,
14
- "figure_aspect": 1,
15
- "sp": 0.06301447323913961,
16
- "rp": 2.505748069071783
17
  },
18
  {
19
  "panel_id": 1,
20
- "section_name": "Introduction",
21
- "tp": 0.1859437751004016,
22
- "text_len": 463,
23
- "gp": 0,
24
- "figure_size": 0,
25
- "figure_aspect": 1,
26
- "sp": 0.08063905395956796,
27
- "rp": 2.359873888191933
28
  },
29
  {
30
  "panel_id": 2,
31
- "section_name": "Benchmark & Metrics",
32
- "tp": 0.15903614457831325,
33
- "text_len": 396,
34
- "gp": 0.016682202105281593,
35
- "figure_size": 64769,
36
- "figure_aspect": 0.8819188191881919,
37
- "sp": 0.07756528917306713,
38
- "rp": 2.386019900332315
39
  },
40
  {
41
  "panel_id": 3,
42
- "section_name": "PosterAgent Framework",
43
- "tp": 0.1859437751004016,
44
- "text_len": 463,
45
- "gp": 0.49217196764679444,
46
  "figure_size": 1910868,
47
- "figure_aspect": 2.0350877192982457,
48
- "sp": 0.23879941088764153,
49
- "rp": 1.0716273641356449
50
  },
51
  {
52
  "panel_id": 4,
53
- "section_name": "Evaluation & Results",
54
- "tp": 0.1859437751004016,
55
- "text_len": 463,
56
- "gp": 0.49114583024792396,
57
- "figure_size": 1906884,
58
- "figure_aspect": 2.0434782608695654,
59
- "sp": 0.23846965976825418,
60
- "rp": 1.0743132504197004
61
- },
62
- {
63
- "panel_id": 5,
64
- "section_name": "Conclusion",
65
- "tp": 0.1534136546184739,
66
- "text_len": 382,
67
  "gp": 0,
68
  "figure_size": 0,
69
- "figure_aspect": 1,
70
- "sp": 0.07044197511417727,
71
- "rp": 2.4442725214152747
72
- }
73
- ],
74
- "panel_arrangement": [
75
- {
76
- "panel_name": "Poster Title & Author",
77
- "panel_id": 0,
78
- "x": 0,
79
- "y": 0,
80
- "width": 1200,
81
- "height": 90.0
82
- },
83
- {
84
- "panel_name": "Introduction",
85
- "panel_id": 1,
86
- "x": 0,
87
- "y": 90.0,
88
- "width": 550.621296168701,
89
- "height": 201.65362571734266
90
- },
91
- {
92
- "panel_name": "Benchmark & Metrics",
93
- "panel_id": 2,
94
- "x": 550.621296168701,
95
- "y": 90.0,
96
- "width": 529.6329503516805,
97
- "height": 201.65362571734266
98
- },
99
- {
100
- "panel_name": "PosterAgent Framework",
101
- "panel_id": 3,
102
- "x": 0,
103
- "y": 291.65362571734266,
104
- "width": 540.5003037876063,
105
- "height": 608.3463742826573
106
- },
107
- {
108
- "panel_name": "Evaluation & Results",
109
- "panel_id": 4,
110
- "x": 540.5003037876063,
111
- "y": 291.65362571734266,
112
- "width": 539.7539427327752,
113
- "height": 608.3463742826573
114
- },
115
- {
116
- "panel_name": "Conclusion",
117
- "panel_id": 5,
118
- "x": 1080.2542465203815,
119
- "y": 90.0,
120
- "width": 119.74575347961854,
121
- "height": 810.0
122
- }
123
- ],
124
- "figure_arrangement": [
125
- {
126
- "panel_id": 2,
127
- "x": 763.672586975783,
128
- "y": 132.13072514346854,
129
- "width": 103.53036873751637,
130
- "height": 117.39217543040559,
131
- "figure_id": 0,
132
- "figure_name": "p<Benchmark & Metrics>_f0",
133
- "figure_path": "<4o_4o>_images_and_tables/paper/paper-picture-1.png"
134
- },
135
- {
136
- "panel_id": 3,
137
- "x": 56.45003037876063,
138
- "y": 490.76985659696936,
139
- "width": 427.60024303008504,
140
- "height": 210.11391252340385,
141
- "figure_id": 0,
142
- "figure_name": "p<PosterAgent Framework>_f0",
143
- "figure_path": "<4o_4o>_images_and_tables/paper/paper-picture-8.png"
144
- },
145
- {
146
- "panel_id": 4,
147
- "x": 596.8756980608838,
148
- "y": 491.34731768544725,
149
- "width": 427.0031541862202,
150
- "height": 208.95899034644816,
151
- "figure_id": 0,
152
- "figure_name": "p<Evaluation & Results>_f0",
153
- "figure_path": "<4o_4o>_images_and_tables/paper/paper-table-1.png"
154
- }
155
- ],
156
- "text_arrangement": [
157
- {
158
- "panel_id": 0,
159
- "x": 3.0,
160
- "y": 3.0,
161
- "width": 1194.0,
162
- "height": 37.333333333333336,
163
- "textbox_id": 0,
164
- "textbox_name": "p<Poster Title & Author>_t0",
165
- "num_chars": 410
166
- },
167
- {
168
- "panel_id": 0,
169
- "x": 3.0,
170
- "y": 40.333333333333336,
171
- "width": 1194.0,
172
- "height": 46.666666666666664,
173
- "textbox_id": 0,
174
- "textbox_name": "p<Poster Title & Author>_t1",
175
- "num_chars": 410
176
- },
177
- {
178
- "panel_id": 1,
179
- "x": 3.0,
180
- "y": 93.0,
181
- "width": 544.621296168701,
182
- "height": 32.0,
183
- "textbox_id": 0,
184
- "textbox_name": "p<Introduction>_t0",
185
- "num_chars": 180
186
- },
187
- {
188
- "panel_id": 1,
189
- "x": 3.0,
190
- "y": 125.0,
191
- "width": 544.621296168701,
192
- "height": 163.65362571734266,
193
- "textbox_id": 1,
194
- "textbox_name": "p<Introduction>_t1",
195
- "num_chars": 540
196
- },
197
- {
198
- "panel_id": 2,
199
- "x": 553.621296168701,
200
- "y": 93.0,
201
- "width": 523.6329503516805,
202
- "height": 32.0,
203
- "textbox_id": 0,
204
- "textbox_name": "p<Benchmark & Metrics>_t0",
205
- "num_chars": 180
206
- },
207
- {
208
- "panel_id": 2,
209
- "x": 553.621296168701,
210
- "y": 125.0,
211
- "width": 523.6329503516805,
212
- "height": 7.130725143468538,
213
- "textbox_id": 1,
214
- "textbox_name": "p<Benchmark & Metrics>_t1",
215
- "num_chars": 180
216
- },
217
- {
218
- "panel_id": 2,
219
- "x": 553.621296168701,
220
- "y": 249.52290057387413,
221
- "width": 523.6329503516805,
222
- "height": 39.13072514346854,
223
- "textbox_id": 2,
224
- "textbox_name": "p<Benchmark & Metrics>_t2",
225
- "num_chars": 180
226
- },
227
- {
228
- "panel_id": 3,
229
- "x": 3.0,
230
- "y": 294.65362571734266,
231
- "width": 534.5003037876063,
232
- "height": 32.0,
233
- "textbox_id": 0,
234
- "textbox_name": "p<PosterAgent Framework>_t0",
235
- "num_chars": 180
236
- },
237
- {
238
- "panel_id": 3,
239
- "x": 3.0,
240
- "y": 326.65362571734266,
241
- "width": 534.5003037876063,
242
- "height": 164.1162308796267,
243
- "textbox_id": 1,
244
- "textbox_name": "p<PosterAgent Framework>_t1",
245
- "num_chars": 540
246
- },
247
- {
248
- "panel_id": 3,
249
- "x": 3.0,
250
- "y": 700.8837691203732,
251
- "width": 534.5003037876063,
252
- "height": 196.11623087962676,
253
- "textbox_id": 2,
254
- "textbox_name": "p<PosterAgent Framework>_t2",
255
- "num_chars": 540
256
- },
257
- {
258
- "panel_id": 4,
259
- "x": 543.5003037876063,
260
- "y": 294.65362571734266,
261
- "width": 533.7539427327752,
262
- "height": 32.0,
263
- "textbox_id": 0,
264
- "textbox_name": "p<Evaluation & Results>_t0",
265
- "num_chars": 180
266
- },
267
- {
268
- "panel_id": 4,
269
- "x": 543.5003037876063,
270
- "y": 326.65362571734266,
271
- "width": 533.7539427327752,
272
- "height": 164.6936919681046,
273
- "textbox_id": 1,
274
- "textbox_name": "p<Evaluation & Results>_t1",
275
- "num_chars": 540
276
- },
277
- {
278
- "panel_id": 4,
279
- "x": 543.5003037876063,
280
- "y": 700.3063080318955,
281
- "width": 533.7539427327752,
282
- "height": 196.69369196810453,
283
- "textbox_id": 2,
284
- "textbox_name": "p<Evaluation & Results>_t2",
285
- "num_chars": 540
286
  },
287
  {
288
  "panel_id": 5,
289
- "x": 1083.2542465203815,
290
- "y": 93.0,
291
- "width": 113.74575347961854,
292
- "height": 32.0,
293
- "textbox_id": 0,
294
- "textbox_name": "p<Conclusion>_t0",
295
- "num_chars": 30
296
  },
297
  {
298
- "panel_id": 5,
299
- "x": 1083.2542465203815,
300
- "y": 125.0,
301
- "width": 113.74575347961854,
302
- "height": 772.0,
303
- "textbox_id": 1,
304
- "textbox_name": "p<Conclusion>_t1",
305
- "num_chars": 420
306
  }
307
  ],
308
- "panel_arrangement_inches": [
309
  {
310
- "panel_name": "Poster Title & Author",
311
  "panel_id": 0,
312
- "x": 0.0,
313
- "y": 0.0,
314
- "width": 48.0,
315
- "height": 3.6
316
  },
317
  {
318
- "panel_name": "Introduction",
319
  "panel_id": 1,
320
- "x": 0.0,
321
- "y": 3.6,
322
- "width": 22.02485184674804,
323
- "height": 8.066145028693706
324
  },
325
  {
326
- "panel_name": "Benchmark & Metrics",
327
  "panel_id": 2,
328
- "x": 22.02485184674804,
329
- "y": 3.6,
330
- "width": 21.18531801406722,
331
- "height": 8.066145028693706
332
  },
333
  {
334
- "panel_name": "PosterAgent Framework",
335
  "panel_id": 3,
336
- "x": 0.0,
337
- "y": 11.666145028693707,
338
- "width": 21.620012151504252,
339
- "height": 24.33385497130629
340
- },
341
- {
342
- "panel_name": "Evaluation & Results",
343
- "panel_id": 4,
344
- "x": 21.620012151504252,
345
- "y": 11.666145028693707,
346
- "width": 21.590157709311008,
347
- "height": 24.33385497130629
348
  },
349
  {
350
- "panel_name": "Conclusion",
351
  "panel_id": 5,
352
- "x": 43.210169860815256,
353
- "y": 3.6,
354
- "width": 4.789830139184741,
355
- "height": 32.4
356
- }
357
- ],
358
- "figure_arrangement_inches": [
359
- {
360
- "panel_id": 2,
361
- "x": 30.54690347903132,
362
- "y": 5.285229005738741,
363
- "width": 4.141214749500655,
364
- "height": 4.6956870172162235,
365
- "figure_id": 0,
366
- "figure_name": "p<Benchmark & Metrics>_f0",
367
- "figure_path": "<4o_4o>_images_and_tables/paper/paper-picture-1.png"
368
- },
369
- {
370
- "panel_id": 3,
371
- "x": 2.258001215150425,
372
- "y": 19.630794263878773,
373
- "width": 17.1040097212034,
374
- "height": 8.404556500936154,
375
- "figure_id": 0,
376
- "figure_name": "p<PosterAgent Framework>_f0",
377
- "figure_path": "<4o_4o>_images_and_tables/paper/paper-picture-8.png"
378
- },
379
- {
380
- "panel_id": 4,
381
- "x": 23.87502792243535,
382
- "y": 19.65389270741789,
383
- "width": 17.080126167448807,
384
- "height": 8.358359613857926,
385
- "figure_id": 0,
386
- "figure_name": "p<Evaluation & Results>_f0",
387
- "figure_path": "<4o_4o>_images_and_tables/paper/paper-table-1.png"
388
- }
389
- ],
390
- "text_arrangement_inches": [
391
- {
392
- "panel_id": 0,
393
- "x": 0.12,
394
- "y": 0.12,
395
- "width": 47.76,
396
- "height": 1.4933333333333334,
397
- "textbox_id": 0,
398
- "textbox_name": "p<Poster Title & Author>_t0",
399
- "num_chars": 410
400
- },
401
- {
402
- "panel_id": 0,
403
- "x": 0.12,
404
- "y": 1.6133333333333335,
405
- "width": 47.76,
406
- "height": 1.8666666666666665,
407
- "textbox_id": 0,
408
- "textbox_name": "p<Poster Title & Author>_t1",
409
- "num_chars": 410
410
  },
411
  {
412
- "panel_id": 1,
413
- "x": 0.12,
414
- "y": 3.72,
415
- "width": 21.784851846748037,
416
- "height": 1.28,
417
- "textbox_id": 0,
418
- "textbox_name": "p<Introduction>_t0",
419
- "num_chars": 180
420
- },
421
- {
422
- "panel_id": 1,
423
- "x": 0.12,
424
- "y": 5.0,
425
- "width": 21.784851846748037,
426
- "height": 6.546145028693706,
427
- "textbox_id": 1,
428
- "textbox_name": "p<Introduction>_t1",
429
- "num_chars": 540
430
- },
431
- {
432
- "panel_id": 2,
433
- "x": 22.14485184674804,
434
- "y": 3.72,
435
- "width": 20.94531801406722,
436
- "height": 1.28,
437
- "textbox_id": 0,
438
- "textbox_name": "p<Benchmark & Metrics>_t0",
439
- "num_chars": 180
440
- },
441
- {
442
- "panel_id": 2,
443
- "x": 22.14485184674804,
444
- "y": 5.0,
445
- "width": 20.94531801406722,
446
- "height": 0.28522900573874155,
447
- "textbox_id": 1,
448
- "textbox_name": "p<Benchmark & Metrics>_t1",
449
- "num_chars": 180
450
  },
451
  {
452
- "panel_id": 2,
453
- "x": 22.14485184674804,
454
- "y": 9.980916022954965,
455
- "width": 20.94531801406722,
456
- "height": 1.5652290057387415,
457
- "textbox_id": 2,
458
- "textbox_name": "p<Benchmark & Metrics>_t2",
459
- "num_chars": 180
460
- },
461
- {
462
- "panel_id": 3,
463
- "x": 0.12,
464
- "y": 11.786145028693706,
465
- "width": 21.380012151504253,
466
- "height": 1.28,
467
- "textbox_id": 0,
468
- "textbox_name": "p<PosterAgent Framework>_t0",
469
- "num_chars": 180
470
- },
471
- {
472
- "panel_id": 3,
473
- "x": 0.12,
474
- "y": 13.066145028693706,
475
- "width": 21.380012151504253,
476
- "height": 6.564649235185068,
477
- "textbox_id": 1,
478
- "textbox_name": "p<PosterAgent Framework>_t1",
479
- "num_chars": 540
480
- },
481
- {
482
- "panel_id": 3,
483
- "x": 0.12,
484
- "y": 28.03535076481493,
485
- "width": 21.380012151504253,
486
- "height": 7.84464923518507,
487
- "textbox_id": 2,
488
- "textbox_name": "p<PosterAgent Framework>_t2",
489
- "num_chars": 540
490
- },
491
- {
492
- "panel_id": 4,
493
- "x": 21.740012151504253,
494
- "y": 11.786145028693706,
495
- "width": 21.350157709311006,
496
- "height": 1.28,
497
- "textbox_id": 0,
498
- "textbox_name": "p<Evaluation & Results>_t0",
499
- "num_chars": 180
500
- },
501
- {
502
- "panel_id": 4,
503
- "x": 21.740012151504253,
504
- "y": 13.066145028693706,
505
- "width": 21.350157709311006,
506
- "height": 6.587747678724184,
507
- "textbox_id": 1,
508
- "textbox_name": "p<Evaluation & Results>_t1",
509
- "num_chars": 540
510
- },
511
- {
512
- "panel_id": 4,
513
- "x": 21.740012151504253,
514
- "y": 28.01225232127582,
515
- "width": 21.350157709311006,
516
- "height": 7.867747678724181,
517
- "textbox_id": 2,
518
- "textbox_name": "p<Evaluation & Results>_t2",
519
- "num_chars": 540
520
- },
521
- {
522
- "panel_id": 5,
523
- "x": 43.33016986081526,
524
- "y": 3.72,
525
- "width": 4.549830139184742,
526
- "height": 1.28,
527
- "textbox_id": 0,
528
- "textbox_name": "p<Conclusion>_t0",
529
- "num_chars": 30
530
- },
531
- {
532
- "panel_id": 5,
533
- "x": 43.33016986081526,
534
- "y": 5.0,
535
- "width": 4.549830139184742,
536
- "height": 30.88,
537
- "textbox_id": 1,
538
- "textbox_name": "p<Conclusion>_t1",
539
- "num_chars": 420
540
  }
541
  ]
542
  }
 
1
  {
 
 
 
 
2
  "panels": [
3
  {
4
  "panel_id": 0,
5
+ "section_name": "Why Posters Are Hard",
6
+ "tp": 0.12082710513203787,
7
+ "text_len": 485,
8
+ "gp": 0.009888851380803912,
9
+ "figure_size": 64769,
10
+ "figure_aspect": 0.8819188191881919
 
 
11
  },
12
  {
13
  "panel_id": 1,
14
+ "section_name": "Benchmark and Data",
15
+ "tp": 0.12531141006477328,
16
+ "text_len": 503,
17
+ "gp": 0.04796373085236436,
18
+ "figure_size": 314148,
19
+ "figure_aspect": 1.0125673249551166
 
 
20
  },
21
  {
22
  "panel_id": 2,
23
+ "section_name": "PaperQuiz: What Matters",
24
+ "tp": 0.11285500747384156,
25
+ "text_len": 453,
26
+ "gp": 0.1192882298865948,
27
+ "figure_size": 781302,
28
+ "figure_aspect": 5.032994923857868
 
 
29
  },
30
  {
31
  "panel_id": 3,
32
+ "section_name": "PosterAgent Pipeline",
33
+ "tp": 0.10637767812655705,
34
+ "text_len": 427,
35
+ "gp": 0.29174897960959734,
36
  "figure_size": 1910868,
37
+ "figure_aspect": 2.0350877192982457
 
 
38
  },
39
  {
40
  "panel_id": 4,
41
+ "section_name": "Parser: Structured Assets",
42
+ "tp": 0.10612855007473841,
43
+ "text_len": 426,
44
  "gp": 0,
45
  "figure_size": 0,
46
+ "figure_aspect": 1
47
  },
48
  {
49
  "panel_id": 5,
50
+ "section_name": "Planner: Layout Mastery",
51
+ "tp": 0.10089686098654709,
52
+ "text_len": 405,
53
+ "gp": 0.08839429109643054,
54
+ "figure_size": 578956,
55
+ "figure_aspect": 1.3959627329192548
56
+ },
57
+ {
58
+ "panel_id": 6,
59
+ "section_name": "Painter\u2013Commenter Loop",
60
+ "tp": 0.10662680617837568,
61
+ "text_len": 428,
62
+ "gp": 0.15157520979208358,
63
+ "figure_size": 992772,
64
+ "figure_aspect": 1.4480676328502415
65
+ },
66
+ {
67
+ "panel_id": 7,
68
+ "section_name": "Results: Stronger, Leaner",
69
+ "tp": 0.10986547085201794,
70
+ "text_len": 441,
71
+ "gp": 0.2911407073821255,
72
+ "figure_size": 1906884,
73
+ "figure_aspect": 2.0434782608695654
74
  },
75
  {
76
+ "panel_id": 8,
77
+ "section_name": "Limits and Next Steps",
78
+ "tp": 0.1111111111111111,
79
+ "text_len": 446,
80
+ "gp": 0,
81
+ "figure_size": 0,
82
+ "figure_aspect": 1
 
83
  }
84
  ],
85
+ "figure_arrangement": [
86
  {
 
87
  "panel_id": 0,
88
+ "figure_path": "<gpt-5_gpt-5>_images_and_tables/paper/paper-picture-1.png"
 
 
 
89
  },
90
  {
 
91
  "panel_id": 1,
92
+ "figure_path": "<gpt-5_gpt-5>_images_and_tables/paper/paper-picture-6.png"
 
 
 
93
  },
94
  {
 
95
  "panel_id": 2,
96
+ "figure_path": "<gpt-5_gpt-5>_images_and_tables/paper/paper-picture-7.png"
 
 
 
97
  },
98
  {
 
99
  "panel_id": 3,
100
+ "figure_path": "<gpt-5_gpt-5>_images_and_tables/paper/paper-picture-8.png"
101
  },
102
  {
 
103
  "panel_id": 5,
104
+ "figure_path": "<gpt-5_gpt-5>_images_and_tables/paper/paper-picture-30.png"
105
  },
106
  {
107
+ "panel_id": 6,
108
+ "figure_path": "<gpt-5_gpt-5>_images_and_tables/paper/paper-picture-61.png"
109
  },
110
  {
111
+ "panel_id": 7,
112
+ "figure_path": "<gpt-5_gpt-5>_images_and_tables/paper/paper-table-1.png"
113
  }
114
  ]
115
  }
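
Note on the new `panels` schema above: each panel's `tp` is its share of the total text length and `gp` its share of the total figure area, so each column sums to 1 across the nine panels (485 / 4014 ≈ 0.1208 and 64769 / 6549699 ≈ 0.0099 for panel 0, consistent with the values in the added hunk). A minimal sketch of that normalization, assuming plain dicts shaped like the entries above; the helper name is illustrative, not a function from this repo:

```python
def add_panel_priorities(panels):
    """Attach tp/gp shares to each panel dict (requires text_len and figure_size).

    tp = text_len / sum(text_len); gp = figure_size / sum(figure_size).
    With the nine panels from the hunk above, sum(text_len) = 4014 and
    sum(figure_size) = 6549699, reproducing tp ~ 0.1208 / gp ~ 0.0099 for panel 0.
    """
    total_text = sum(p["text_len"] for p in panels) or 1
    total_fig = sum(p["figure_size"] for p in panels) or 1
    for p in panels:
        p["tp"] = p["text_len"] / total_text
        p["gp"] = p["figure_size"] / total_fig
    return panels
```
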
posterbuilder/contents/figure_caption.json CHANGED
@@ -1,7 +1,7 @@
1
  {
2
  "1": {
3
  "caption": "Figure 1: Overview of this work. We address two core challenges in scientific poster generation: Left: How to create a poster from a paper -we propose PosterAgent (Sec. 4), a framework that transforms long-context scientific papers (20K+ tokens) into structured visual posters; and Right: How to evaluate poster quality -weintroduce the Paper2Poster benchmark (Sec. 3), which enables systematic comparison between agent-generated and author-designed posters.",
4
- "image_path": "<4o_4o>_images_and_tables/paper/paper-picture-1.png",
5
  "width": 239,
6
  "height": 271,
7
  "figure_size": 64769,
@@ -9,7 +9,7 @@
9
  },
10
  "3": {
11
  "caption": "Paper ( 20K tokens )",
12
- "image_path": "<4o_4o>_images_and_tables/paper/paper-picture-3.png",
13
  "width": 398,
14
  "height": 265,
15
  "figure_size": 105470,
@@ -17,7 +17,7 @@
17
  },
18
  "6": {
19
  "caption": "Figure 2: Data Statistics of Paper2Poster. (a) Word cloud illustrating the diversity of research topics. (b) Textual Token statistics and Figure count statistics for input papers vs. posters provided by authors. Overall, these statistics highlight that Paper2Poster is a multimodal context compression task, requiring effective abstraction of both textual and visual content.",
20
- "image_path": "<4o_4o>_images_and_tables/paper/paper-picture-6.png",
21
  "width": 564,
22
  "height": 557,
23
  "figure_size": 314148,
@@ -25,7 +25,7 @@
25
  },
26
  "7": {
27
  "caption": "Figure 3: Left : Overview of the evaluation framework in Paper2Poster. Middle : We automatically generate multiple-choice questions from each paper using an LLM (o3), forming the our PaperQuiz evaluation. Right : In PaperQuiz, we simulate multiple reader by allowing VLMs-representing different expertise levels ( e.g., student, professor)-to read each generated poster and answer the quiz. The poster that achieves the highest average score is considered the most effective in conveying the paper's content.",
28
- "image_path": "<4o_4o>_images_and_tables/paper/paper-picture-7.png",
29
  "width": 1983,
30
  "height": 394,
31
  "figure_size": 781302,
@@ -33,7 +33,7 @@
33
  },
34
  "8": {
35
  "caption": "Figure 4: Illustration of the PosterAgent pipeline. Given an input paper, PosterAgent generates a structured academic poster through three modules: 1. Parser: Extracts key textual and visual assets using a combination of tools and LLM-based summarization, resulting in a structured asset library. 2. Planner: Matches assets and arranges them into coherent layouts, iteratively generating panels with a zoom-in operation. 3. Painter-Commenter: The Painter generates panel-level bullet-content along with executable code, and renders the visual output, while the Commenter-a VLM with in-context reference-provides feedback to ensure layout coherence and prevent content overflow.",
36
- "image_path": "<4o_4o>_images_and_tables/paper/paper-picture-8.png",
37
  "width": 1972,
38
  "height": 969,
39
  "figure_size": 1910868,
@@ -41,7 +41,7 @@
41
  },
42
  "9": {
43
  "caption": "Figure 5: PaperQuiz's Avg. scores across different Reader VLMs (x-axis) for each poster type (legend lines). Refer to Append. Tab. 3 for full model names.",
44
- "image_path": "<4o_4o>_images_and_tables/paper/paper-picture-9.png",
45
  "width": 769,
46
  "height": 505,
47
  "figure_size": 388345,
@@ -49,7 +49,7 @@
49
  },
50
  "10": {
51
  "caption": "Figure 7 presents the average token cost per poster across different methods. Our PosterAgent achieves great token efficiency, using only 101 . 1 K (4o-based) and 47 . 6 K (Qwen-based) tokens-reducing cost by 60% -87% compared to OWL-4o [6]. This translates to just $0 . 55 for 4o and $0 . 0045 for Qwen per poster, highlighting its effectiveness, (see Append. E.2 for further details).",
52
- "image_path": "<4o_4o>_images_and_tables/paper/paper-picture-10.png",
53
  "width": 1948,
54
  "height": 1100,
55
  "figure_size": 2142800,
@@ -57,7 +57,7 @@
57
  },
58
  "11": {
59
  "caption": "Figure 7: Average token consumptions for different methods. Details are provided in Appendix E.1.",
60
- "image_path": "<4o_4o>_images_and_tables/paper/paper-picture-11.png",
61
  "width": 701,
62
  "height": 505,
63
  "figure_size": 354005,
@@ -65,7 +65,7 @@
65
  },
66
  "12": {
67
  "caption": "Figure 6: PaperQuiz's Avg scores across different types of posters (x-axis) for readers (colored lines) on human evaluation subset.",
68
- "image_path": "<4o_4o>_images_and_tables/paper/paper-picture-12.png",
69
  "width": 661,
70
  "height": 428,
71
  "figure_size": 282908,
@@ -73,7 +73,7 @@
73
  },
74
  "13": {
75
  "caption": "Figure 10: Posters for MuSc: Zero-Shot Industrial Anomaly Classification and Segmentation with Mutual Scoring of the Unlabeled Images.",
76
- "image_path": "<4o_4o>_images_and_tables/paper/paper-picture-13.png",
77
  "width": 960,
78
  "height": 521,
79
  "figure_size": 500160,
@@ -81,7 +81,7 @@
81
  },
82
  "15": {
83
  "caption": "(b) PosterAgent-generated poster.(a) Author-designed poster.",
84
- "image_path": "<4o_4o>_images_and_tables/paper/paper-picture-15.png",
85
  "width": 1993,
86
  "height": 810,
87
  "figure_size": 1614330,
@@ -89,7 +89,7 @@
89
  },
90
  "16": {
91
  "caption": "(a) Author-designed poster.",
92
- "image_path": "<4o_4o>_images_and_tables/paper/paper-picture-16.png",
93
  "width": 945,
94
  "height": 680,
95
  "figure_size": 642600,
@@ -97,7 +97,7 @@
97
  },
98
  "17": {
99
  "caption": "(b) PosterAgent-generated poster.",
100
- "image_path": "<4o_4o>_images_and_tables/paper/paper-picture-17.png",
101
  "width": 957,
102
  "height": 708,
103
  "figure_size": 677556,
@@ -105,7 +105,7 @@
105
  },
106
  "18": {
107
  "caption": "Figure 11: Posters for Neuroformer: Multimodal and Multitask Generative Pretraining for Brain Data.(a) Author-designed poster.",
108
- "image_path": "<4o_4o>_images_and_tables/paper/paper-picture-18.png",
109
  "width": 938,
110
  "height": 620,
111
  "figure_size": 581560,
@@ -113,7 +113,7 @@
113
  },
114
  "19": {
115
  "caption": "Figure 12: Posters for Conformal Semantic Keypoint Detection with Statistical Guarantees.(a) Author-designed poster.",
116
- "image_path": "<4o_4o>_images_and_tables/paper/paper-picture-19.png",
117
  "width": 1176,
118
  "height": 596,
119
  "figure_size": 700896,
@@ -121,7 +121,7 @@
121
  },
122
  "20": {
123
  "caption": "Figure 13: Posters for Neural Tangent Kernels for Axis-Aligned Tree Ensembles.",
124
- "image_path": "<4o_4o>_images_and_tables/paper/paper-picture-20.png",
125
  "width": 790,
126
  "height": 598,
127
  "figure_size": 472420,
@@ -129,7 +129,7 @@
129
  },
130
  "22": {
131
  "caption": "(a) Author-designed poster.",
132
- "image_path": "<4o_4o>_images_and_tables/paper/paper-picture-22.png",
133
  "width": 929,
134
  "height": 583,
135
  "figure_size": 541607,
@@ -137,7 +137,7 @@
137
  },
138
  "23": {
139
  "caption": "Figure 16: Posters for Identifying the Context Shift between Test Benchmarks and Production Data.",
140
- "image_path": "<4o_4o>_images_and_tables/paper/paper-picture-23.png",
141
  "width": 958,
142
  "height": 646,
143
  "figure_size": 618868,
@@ -145,7 +145,7 @@
145
  },
146
  "24": {
147
  "caption": "(a) Author-designed poster.",
148
- "image_path": "<4o_4o>_images_and_tables/paper/paper-picture-24.png",
149
  "width": 1190,
150
  "height": 567,
151
  "figure_size": 674730,
@@ -153,7 +153,7 @@
153
  },
154
  "29": {
155
  "caption": "(a) Direct.",
156
- "image_path": "<4o_4o>_images_and_tables/paper/paper-picture-29.png",
157
  "width": 896,
158
  "height": 323,
159
  "figure_size": 289408,
@@ -161,7 +161,7 @@
161
  },
162
  "30": {
163
  "caption": "(b) Tree.(c) Tree + Commenter.",
164
- "image_path": "<4o_4o>_images_and_tables/paper/paper-picture-30.png",
165
  "width": 899,
166
  "height": 644,
167
  "figure_size": 578956,
@@ -169,7 +169,7 @@
169
  },
170
  "31": {
171
  "caption": "Figure 17: Ablation study on Neuro-Symbolic Language Modeling with Automaton-augmented Retrieval. Text overflow areas are highlighted with red bounding boxes.",
172
- "image_path": "<4o_4o>_images_and_tables/paper/paper-picture-31.png",
173
  "width": 897,
174
  "height": 679,
175
  "figure_size": 609063,
@@ -177,7 +177,7 @@
177
  },
178
  "33": {
179
  "caption": "Figure 18: Ablation study on Visual Correspondence Hallucination. Text overflow areas are highlighted with red bounding boxes.",
180
- "image_path": "<4o_4o>_images_and_tables/paper/paper-picture-33.png",
181
  "width": 895,
182
  "height": 274,
183
  "figure_size": 245230,
@@ -185,7 +185,7 @@
185
  },
186
  "34": {
187
  "caption": "(b) Tree.",
188
- "image_path": "<4o_4o>_images_and_tables/paper/paper-picture-34.png",
189
  "width": 900,
190
  "height": 511,
191
  "figure_size": 459900,
@@ -193,7 +193,7 @@
193
  },
194
  "35": {
195
  "caption": "(c) Tree + Commenter.",
196
- "image_path": "<4o_4o>_images_and_tables/paper/paper-picture-35.png",
197
  "width": 901,
198
  "height": 513,
199
  "figure_size": 462213,
@@ -201,7 +201,7 @@
201
  },
202
  "37": {
203
  "caption": "Figure 19: Ablation study on DARTFormer: Finding The Best Type Of Attention. Text overflow areas are highlighted with red bounding boxes, large blank regions are highlighted with purple bounding boxes.",
204
- "image_path": "<4o_4o>_images_and_tables/paper/paper-picture-37.png",
205
  "width": 895,
206
  "height": 747,
207
  "figure_size": 668565,
@@ -209,7 +209,7 @@
209
  },
210
  "39": {
211
  "caption": "(c) Tree + Commenter.",
212
- "image_path": "<4o_4o>_images_and_tables/paper/paper-picture-39.png",
213
  "width": 899,
214
  "height": 1187,
215
  "figure_size": 1067113,
@@ -217,7 +217,7 @@
217
  },
218
  "41": {
219
  "caption": "Figure 20: Ablation study on CW-ERM: Improving Autonomous Driving Planning with Closed-loop Weighted Empirical Risk Minimization. Text overflow areas are highlighted with red bounding boxes, and large blank regions are highlighted with purple bounding boxes.",
220
- "image_path": "<4o_4o>_images_and_tables/paper/paper-picture-41.png",
221
  "width": 898,
222
  "height": 1345,
223
  "figure_size": 1207810,
@@ -225,7 +225,7 @@
225
  },
226
  "43": {
227
  "caption": "(c) Tree + Commenter.",
228
- "image_path": "<4o_4o>_images_and_tables/paper/paper-picture-43.png",
229
  "width": 908,
230
  "height": 1341,
231
  "figure_size": 1217628,
@@ -233,7 +233,7 @@
233
  },
234
  "45": {
235
  "caption": "Figure 21: Ablation study on DeepJoint: Robust Survival Modelling Under Clinical Presence Shift. Text overflow areas are highlighted with red bounding boxes.",
236
- "image_path": "<4o_4o>_images_and_tables/paper/paper-picture-45.png",
237
  "width": 894,
238
  "height": 1234,
239
  "figure_size": 1103196,
@@ -241,7 +241,7 @@
241
  },
242
  "48": {
243
  "caption": "(c) Tree + Commenter.",
244
- "image_path": "<4o_4o>_images_and_tables/paper/paper-picture-48.png",
245
  "width": 902,
246
  "height": 1266,
247
  "figure_size": 1141932,
@@ -249,7 +249,7 @@
249
  },
250
  "49": {
251
  "caption": "(a) A poster generated by 4o-Image , where substantial corrupted text is generated.",
252
- "image_path": "<4o_4o>_images_and_tables/paper/paper-picture-49.png",
253
  "width": 949,
254
  "height": 1409,
255
  "figure_size": 1337141,
@@ -257,7 +257,7 @@
257
  },
258
  "50": {
259
  "caption": "(b) A poster generated by PPTAgent , where meaningless template placeholder text is remained.",
260
- "image_path": "<4o_4o>_images_and_tables/paper/paper-picture-50.png",
261
  "width": 956,
262
  "height": 1433,
263
  "figure_size": 1369948,
@@ -265,7 +265,7 @@
265
  },
266
  "51": {
267
  "caption": "Figure 22: Examples of posters with corrupted text.(a) A poster generated by 4o-Image , where the poster is cutoff horizontally due to incomplete generation.",
268
- "image_path": "<4o_4o>_images_and_tables/paper/paper-picture-51.png",
269
  "width": 966,
270
  "height": 887,
271
  "figure_size": 856842,
@@ -273,7 +273,7 @@
273
  },
274
  "52": {
275
  "caption": "Figure 23: Examples of posters with cutoff.",
276
- "image_path": "<4o_4o>_images_and_tables/paper/paper-picture-52.png",
277
  "width": 948,
278
  "height": 962,
279
  "figure_size": 911976,
@@ -281,7 +281,7 @@
281
  },
282
  "53": {
283
  "caption": "(a) A poster produced by 4o-Image , featuring a figure that is low-resolution, visually corrupted, and unintelligible.",
284
- "image_path": "<4o_4o>_images_and_tables/paper/paper-picture-53.png",
285
  "width": 968,
286
  "height": 951,
287
  "figure_size": 920568,
@@ -289,7 +289,7 @@
289
  },
290
  "54": {
291
  "caption": "(b) A poster generated by PPTAgent , where figures are rendered too small to be legible.",
292
- "image_path": "<4o_4o>_images_and_tables/paper/paper-picture-54.png",
293
  "width": 958,
294
  "height": 1277,
295
  "figure_size": 1223366,
@@ -297,7 +297,7 @@
297
  },
298
  "55": {
299
  "caption": "Figure 24: Examples of posters with obscure figures.(a) A poster generated by OWL-4o , where there are large blanks on the poster.",
300
- "image_path": "<4o_4o>_images_and_tables/paper/paper-picture-55.png",
301
  "width": 954,
302
  "height": 680,
303
  "figure_size": 648720,
@@ -305,7 +305,7 @@
305
  },
306
  "56": {
307
  "caption": "Figure 25: Examples of posters with large blanks.",
308
- "image_path": "<4o_4o>_images_and_tables/paper/paper-picture-56.png",
309
  "width": 955,
310
  "height": 723,
311
  "figure_size": 690465,
@@ -313,7 +313,7 @@
313
  },
314
  "57": {
315
  "caption": "(a) A poster generated by OWL-4o , where no figures are inserted into poster.",
316
- "image_path": "<4o_4o>_images_and_tables/paper/paper-picture-57.png",
317
  "width": 959,
318
  "height": 549,
319
  "figure_size": 526491,
@@ -321,7 +321,7 @@
321
  },
322
  "58": {
323
  "caption": "Figure 26: Examples of posters without figures.",
324
- "image_path": "<4o_4o>_images_and_tables/paper/paper-picture-58.png",
325
  "width": 962,
326
  "height": 1435,
327
  "figure_size": 1380470,
@@ -329,7 +329,7 @@
329
  },
330
  "59": {
331
  "caption": "(a) A poster generated by PosterAgent-Qwen , where there is text overflowing outside textbox.",
332
- "image_path": "<4o_4o>_images_and_tables/paper/paper-picture-59.png",
333
  "width": 957,
334
  "height": 1277,
335
  "figure_size": 1222089,
@@ -337,7 +337,7 @@
337
  },
338
  "60": {
339
  "caption": "Figure 27: Examples of posters with textual overflow.",
340
- "image_path": "<4o_4o>_images_and_tables/paper/paper-picture-60.png",
341
  "width": 956,
342
  "height": 640,
343
  "figure_size": 611840,
@@ -345,7 +345,7 @@
345
  },
346
  "61": {
347
  "caption": "Figure 29: In-context references for the commenter help the VLM better identify whether the current panel falls into a failure case.",
348
- "image_path": "<4o_4o>_images_and_tables/paper/paper-picture-61.png",
349
  "width": 1199,
350
  "height": 828,
351
  "figure_size": 992772,
@@ -353,7 +353,7 @@
353
  },
354
  "63": {
355
  "caption": "Figure 28: Failure generation examples by Stable Diffusion Ultra model [28].",
356
- "image_path": "<4o_4o>_images_and_tables/paper/paper-picture-63.png",
357
  "width": 1193,
358
  "height": 785,
359
  "figure_size": 936505,
 
1
  {
2
  "1": {
3
  "caption": "Figure 1: Overview of this work. We address two core challenges in scientific poster generation: Left: How to create a poster from a paper -we propose PosterAgent (Sec. 4), a framework that transforms long-context scientific papers (20K+ tokens) into structured visual posters; and Right: How to evaluate poster quality -weintroduce the Paper2Poster benchmark (Sec. 3), which enables systematic comparison between agent-generated and author-designed posters.",
4
+ "image_path": "<gpt-5_gpt-5>_images_and_tables/paper/paper-picture-1.png",
5
  "width": 239,
6
  "height": 271,
7
  "figure_size": 64769,
 
9
  },
10
  "3": {
11
  "caption": "Paper ( 20K tokens )",
12
+ "image_path": "<gpt-5_gpt-5>_images_and_tables/paper/paper-picture-3.png",
13
  "width": 398,
14
  "height": 265,
15
  "figure_size": 105470,
 
17
  },
18
  "6": {
19
  "caption": "Figure 2: Data Statistics of Paper2Poster. (a) Word cloud illustrating the diversity of research topics. (b) Textual Token statistics and Figure count statistics for input papers vs. posters provided by authors. Overall, these statistics highlight that Paper2Poster is a multimodal context compression task, requiring effective abstraction of both textual and visual content.",
20
+ "image_path": "<gpt-5_gpt-5>_images_and_tables/paper/paper-picture-6.png",
21
  "width": 564,
22
  "height": 557,
23
  "figure_size": 314148,
 
25
  },
26
  "7": {
27
  "caption": "Figure 3: Left : Overview of the evaluation framework in Paper2Poster. Middle : We automatically generate multiple-choice questions from each paper using an LLM (o3), forming the our PaperQuiz evaluation. Right : In PaperQuiz, we simulate multiple reader by allowing VLMs-representing different expertise levels ( e.g., student, professor)-to read each generated poster and answer the quiz. The poster that achieves the highest average score is considered the most effective in conveying the paper's content.",
28
+ "image_path": "<gpt-5_gpt-5>_images_and_tables/paper/paper-picture-7.png",
29
  "width": 1983,
30
  "height": 394,
31
  "figure_size": 781302,
 
33
  },
34
  "8": {
35
  "caption": "Figure 4: Illustration of the PosterAgent pipeline. Given an input paper, PosterAgent generates a structured academic poster through three modules: 1. Parser: Extracts key textual and visual assets using a combination of tools and LLM-based summarization, resulting in a structured asset library. 2. Planner: Matches assets and arranges them into coherent layouts, iteratively generating panels with a zoom-in operation. 3. Painter-Commenter: The Painter generates panel-level bullet-content along with executable code, and renders the visual output, while the Commenter-a VLM with in-context reference-provides feedback to ensure layout coherence and prevent content overflow.",
36
+ "image_path": "<gpt-5_gpt-5>_images_and_tables/paper/paper-picture-8.png",
37
  "width": 1972,
38
  "height": 969,
39
  "figure_size": 1910868,
 
41
  },
42
  "9": {
43
  "caption": "Figure 5: PaperQuiz's Avg. scores across different Reader VLMs (x-axis) for each poster type (legend lines). Refer to Append. Tab. 3 for full model names.",
44
+ "image_path": "<gpt-5_gpt-5>_images_and_tables/paper/paper-picture-9.png",
45
  "width": 769,
46
  "height": 505,
47
  "figure_size": 388345,
 
49
  },
50
  "10": {
51
  "caption": "Figure 7 presents the average token cost per poster across different methods. Our PosterAgent achieves great token efficiency, using only 101 . 1 K (4o-based) and 47 . 6 K (Qwen-based) tokens-reducing cost by 60% -87% compared to OWL-4o [6]. This translates to just $0 . 55 for 4o and $0 . 0045 for Qwen per poster, highlighting its effectiveness, (see Append. E.2 for further details).",
52
+ "image_path": "<gpt-5_gpt-5>_images_and_tables/paper/paper-picture-10.png",
53
  "width": 1948,
54
  "height": 1100,
55
  "figure_size": 2142800,
 
57
  },
58
  "11": {
59
  "caption": "Figure 7: Average token consumptions for different methods. Details are provided in Appendix E.1.",
60
+ "image_path": "<gpt-5_gpt-5>_images_and_tables/paper/paper-picture-11.png",
61
  "width": 701,
62
  "height": 505,
63
  "figure_size": 354005,
 
65
  },
66
  "12": {
67
  "caption": "Figure 6: PaperQuiz's Avg scores across different types of posters (x-axis) for readers (colored lines) on human evaluation subset.",
68
+ "image_path": "<gpt-5_gpt-5>_images_and_tables/paper/paper-picture-12.png",
69
  "width": 661,
70
  "height": 428,
71
  "figure_size": 282908,
 
73
  },
74
  "13": {
75
  "caption": "Figure 10: Posters for MuSc: Zero-Shot Industrial Anomaly Classification and Segmentation with Mutual Scoring of the Unlabeled Images.",
76
+ "image_path": "<gpt-5_gpt-5>_images_and_tables/paper/paper-picture-13.png",
77
  "width": 960,
78
  "height": 521,
79
  "figure_size": 500160,
 
81
  },
82
  "15": {
83
  "caption": "(b) PosterAgent-generated poster.(a) Author-designed poster.",
84
+ "image_path": "<gpt-5_gpt-5>_images_and_tables/paper/paper-picture-15.png",
85
  "width": 1993,
86
  "height": 810,
87
  "figure_size": 1614330,
 
89
  },
90
  "16": {
91
  "caption": "(a) Author-designed poster.",
92
+ "image_path": "<gpt-5_gpt-5>_images_and_tables/paper/paper-picture-16.png",
93
  "width": 945,
94
  "height": 680,
95
  "figure_size": 642600,
 
97
  },
98
  "17": {
99
  "caption": "(b) PosterAgent-generated poster.",
100
+ "image_path": "<gpt-5_gpt-5>_images_and_tables/paper/paper-picture-17.png",
101
  "width": 957,
102
  "height": 708,
103
  "figure_size": 677556,
 
105
  },
106
  "18": {
107
  "caption": "Figure 11: Posters for Neuroformer: Multimodal and Multitask Generative Pretraining for Brain Data.(a) Author-designed poster.",
108
+ "image_path": "<gpt-5_gpt-5>_images_and_tables/paper/paper-picture-18.png",
109
  "width": 938,
110
  "height": 620,
111
  "figure_size": 581560,
 
113
  },
114
  "19": {
115
  "caption": "Figure 12: Posters for Conformal Semantic Keypoint Detection with Statistical Guarantees.(a) Author-designed poster.",
116
+ "image_path": "<gpt-5_gpt-5>_images_and_tables/paper/paper-picture-19.png",
117
  "width": 1176,
118
  "height": 596,
119
  "figure_size": 700896,
 
121
  },
122
  "20": {
123
  "caption": "Figure 13: Posters for Neural Tangent Kernels for Axis-Aligned Tree Ensembles.",
124
+ "image_path": "<gpt-5_gpt-5>_images_and_tables/paper/paper-picture-20.png",
125
  "width": 790,
126
  "height": 598,
127
  "figure_size": 472420,
 
129
  },
130
  "22": {
131
  "caption": "(a) Author-designed poster.",
132
+ "image_path": "<gpt-5_gpt-5>_images_and_tables/paper/paper-picture-22.png",
133
  "width": 929,
134
  "height": 583,
135
  "figure_size": 541607,
 
137
  },
138
  "23": {
139
  "caption": "Figure 16: Posters for Identifying the Context Shift between Test Benchmarks and Production Data.",
140
+ "image_path": "<gpt-5_gpt-5>_images_and_tables/paper/paper-picture-23.png",
141
  "width": 958,
142
  "height": 646,
143
  "figure_size": 618868,
 
145
  },
146
  "24": {
147
  "caption": "(a) Author-designed poster.",
148
+ "image_path": "<gpt-5_gpt-5>_images_and_tables/paper/paper-picture-24.png",
149
  "width": 1190,
150
  "height": 567,
151
  "figure_size": 674730,
 
153
  },
154
  "29": {
155
  "caption": "(a) Direct.",
156
+ "image_path": "<gpt-5_gpt-5>_images_and_tables/paper/paper-picture-29.png",
157
  "width": 896,
158
  "height": 323,
159
  "figure_size": 289408,
 
161
  },
162
  "30": {
163
  "caption": "(b) Tree.(c) Tree + Commenter.",
164
+ "image_path": "<gpt-5_gpt-5>_images_and_tables/paper/paper-picture-30.png",
165
  "width": 899,
166
  "height": 644,
167
  "figure_size": 578956,
 
169
  },
170
  "31": {
171
  "caption": "Figure 17: Ablation study on Neuro-Symbolic Language Modeling with Automaton-augmented Retrieval. Text overflow areas are highlighted with red bounding boxes.",
172
+ "image_path": "<gpt-5_gpt-5>_images_and_tables/paper/paper-picture-31.png",
173
  "width": 897,
174
  "height": 679,
175
  "figure_size": 609063,
 
177
  },
178
  "33": {
179
  "caption": "Figure 18: Ablation study on Visual Correspondence Hallucination. Text overflow areas are highlighted with red bounding boxes.",
180
+ "image_path": "<gpt-5_gpt-5>_images_and_tables/paper/paper-picture-33.png",
181
  "width": 895,
182
  "height": 274,
183
  "figure_size": 245230,
 
185
  },
186
  "34": {
187
  "caption": "(b) Tree.",
188
+ "image_path": "<gpt-5_gpt-5>_images_and_tables/paper/paper-picture-34.png",
189
  "width": 900,
190
  "height": 511,
191
  "figure_size": 459900,
 
193
  },
194
  "35": {
195
  "caption": "(c) Tree + Commenter.",
196
+ "image_path": "<gpt-5_gpt-5>_images_and_tables/paper/paper-picture-35.png",
197
  "width": 901,
198
  "height": 513,
199
  "figure_size": 462213,
 
201
  },
202
  "37": {
203
  "caption": "Figure 19: Ablation study on DARTFormer: Finding The Best Type Of Attention. Text overflow areas are highlighted with red bounding boxes, large blank regions are highlighted with purple bounding boxes.",
204
+ "image_path": "<gpt-5_gpt-5>_images_and_tables/paper/paper-picture-37.png",
205
  "width": 895,
206
  "height": 747,
207
  "figure_size": 668565,
 
209
  },
210
  "39": {
211
  "caption": "(c) Tree + Commenter.",
212
+ "image_path": "<gpt-5_gpt-5>_images_and_tables/paper/paper-picture-39.png",
213
  "width": 899,
214
  "height": 1187,
215
  "figure_size": 1067113,
 
217
  },
218
  "41": {
219
  "caption": "Figure 20: Ablation study on CW-ERM: Improving Autonomous Driving Planning with Closed-loop Weighted Empirical Risk Minimization. Text overflow areas are highlighted with red bounding boxes, and large blank regions are highlighted with purple bounding boxes.",
220
+ "image_path": "<gpt-5_gpt-5>_images_and_tables/paper/paper-picture-41.png",
221
  "width": 898,
222
  "height": 1345,
223
  "figure_size": 1207810,
 
225
  },
226
  "43": {
227
  "caption": "(c) Tree + Commenter.",
228
+ "image_path": "<gpt-5_gpt-5>_images_and_tables/paper/paper-picture-43.png",
229
  "width": 908,
230
  "height": 1341,
231
  "figure_size": 1217628,
 
233
  },
234
  "45": {
235
  "caption": "Figure 21: Ablation study on DeepJoint: Robust Survival Modelling Under Clinical Presence Shift. Text overflow areas are highlighted with red bounding boxes.",
236
+ "image_path": "<gpt-5_gpt-5>_images_and_tables/paper/paper-picture-45.png",
237
  "width": 894,
238
  "height": 1234,
239
  "figure_size": 1103196,
 
241
  },
242
  "48": {
243
  "caption": "(c) Tree + Commenter.",
244
+ "image_path": "<gpt-5_gpt-5>_images_and_tables/paper/paper-picture-48.png",
245
  "width": 902,
246
  "height": 1266,
247
  "figure_size": 1141932,
 
249
  },
250
  "49": {
251
  "caption": "(a) A poster generated by 4o-Image , where substantial corrupted text is generated.",
252
+ "image_path": "<gpt-5_gpt-5>_images_and_tables/paper/paper-picture-49.png",
253
  "width": 949,
254
  "height": 1409,
255
  "figure_size": 1337141,
 
257
  },
258
  "50": {
259
  "caption": "(b) A poster generated by PPTAgent , where meaningless template placeholder text is remained.",
260
+ "image_path": "<gpt-5_gpt-5>_images_and_tables/paper/paper-picture-50.png",
261
  "width": 956,
262
  "height": 1433,
263
  "figure_size": 1369948,
 
265
  },
266
  "51": {
267
  "caption": "Figure 22: Examples of posters with corrupted text.(a) A poster generated by 4o-Image , where the poster is cutoff horizontally due to incomplete generation.",
268
+ "image_path": "<gpt-5_gpt-5>_images_and_tables/paper/paper-picture-51.png",
269
  "width": 966,
270
  "height": 887,
271
  "figure_size": 856842,
 
273
  },
274
  "52": {
275
  "caption": "Figure 23: Examples of posters with cutoff.",
276
+ "image_path": "<gpt-5_gpt-5>_images_and_tables/paper/paper-picture-52.png",
277
  "width": 948,
278
  "height": 962,
279
  "figure_size": 911976,
 
281
  },
282
  "53": {
283
  "caption": "(a) A poster produced by 4o-Image , featuring a figure that is low-resolution, visually corrupted, and unintelligible.",
284
+ "image_path": "<gpt-5_gpt-5>_images_and_tables/paper/paper-picture-53.png",
285
  "width": 968,
286
  "height": 951,
287
  "figure_size": 920568,
 
289
  },
290
  "54": {
291
  "caption": "(b) A poster generated by PPTAgent , where figures are rendered too small to be legible.",
292
+ "image_path": "<gpt-5_gpt-5>_images_and_tables/paper/paper-picture-54.png",
293
  "width": 958,
294
  "height": 1277,
295
  "figure_size": 1223366,
 
297
  },
298
  "55": {
299
  "caption": "Figure 24: Examples of posters with obscure figures.(a) A poster generated by OWL-4o , where there are large blanks on the poster.",
300
+ "image_path": "<gpt-5_gpt-5>_images_and_tables/paper/paper-picture-55.png",
301
  "width": 954,
302
  "height": 680,
303
  "figure_size": 648720,
 
305
  },
306
  "56": {
307
  "caption": "Figure 25: Examples of posters with large blanks.",
308
+ "image_path": "<gpt-5_gpt-5>_images_and_tables/paper/paper-picture-56.png",
309
  "width": 955,
310
  "height": 723,
311
  "figure_size": 690465,
 
313
  },
314
  "57": {
315
  "caption": "(a) A poster generated by OWL-4o , where no figures are inserted into poster.",
316
+ "image_path": "<gpt-5_gpt-5>_images_and_tables/paper/paper-picture-57.png",
317
  "width": 959,
318
  "height": 549,
319
  "figure_size": 526491,
 
321
  },
322
  "58": {
323
  "caption": "Figure 26: Examples of posters without figures.",
324
+ "image_path": "<gpt-5_gpt-5>_images_and_tables/paper/paper-picture-58.png",
325
  "width": 962,
326
  "height": 1435,
327
  "figure_size": 1380470,
 
329
  },
330
  "59": {
331
  "caption": "(a) A poster generated by PosterAgent-Qwen , where there is text overflowing outside textbox.",
332
+ "image_path": "<gpt-5_gpt-5>_images_and_tables/paper/paper-picture-59.png",
333
  "width": 957,
334
  "height": 1277,
335
  "figure_size": 1222089,
 
337
  },
338
  "60": {
339
  "caption": "Figure 27: Examples of posters with textual overflow.",
340
+ "image_path": "<gpt-5_gpt-5>_images_and_tables/paper/paper-picture-60.png",
341
  "width": 956,
342
  "height": 640,
343
  "figure_size": 611840,
 
345
  },
346
  "61": {
347
  "caption": "Figure 29: In-context references for the commenter help the VLM better identify whether the current panel falls into a failure case.",
348
+ "image_path": "<gpt-5_gpt-5>_images_and_tables/paper/paper-picture-61.png",
349
  "width": 1199,
350
  "height": 828,
351
  "figure_size": 992772,
 
353
  },
354
  "63": {
355
  "caption": "Figure 28: Failure generation examples by Stable Diffusion Ultra model [28].",
356
+ "image_path": "<gpt-5_gpt-5>_images_and_tables/paper/paper-picture-63.png",
357
  "width": 1193,
358
  "height": 785,
359
  "figure_size": 936505,
posterbuilder/contents/poster_content.json CHANGED
@@ -1,33 +1,45 @@
1
  {
2
  "meta": {
3
- "poster_title": "Paper2Poster: Towards Multimodal Poster Automation from Scientific Papers",
4
- "authors": "Wei Pang, Kevin Qinghong Lin, Xiangru Jian, Xi He, Philip Torr",
5
- "affiliations": "1 University of Waterloo, 2 National University of Singapore, 3 University of Oxford"
6
  },
7
  "sections": [
8
  {
9
- "title": "Poster Title & Author",
10
- "content": "This poster presents \textbf{Paper2Poster}, a novel approach for generating academic posters from scientific papers. Authors include Wei Pang, Kevin Qinghong Lin, Xiangru Jian, Xi He, and Philip Torr, affiliated with \textit{University of Waterloo}, \textit{National University of Singapore}, and \textit{University of Oxford}."
11
  },
12
  {
13
- "title": "Introduction",
14
- "content": "Academic posters are crucial for \textbf{scientific communication}, allowing rapid dissemination of key findings. Unlike slide decks, posters must condense entire papers into a single page, requiring \textit{multi-modal context handling}, \textcolor{red}{tight text-graphics interleaving}, and \textcolor{red}{spatial constraint respect}. Existing VLM- or LLM-only approaches lack explicit visual feedback, making it difficult to maintain logical flow and legibility."
15
  },
16
  {
17
- "title": "Benchmark & Metrics",
18
- "content": "We introduce the \textbf{Paper2Poster Benchmark}, the first benchmark for poster generation, evaluating outputs on \textcolor{blue}{Visual Quality}, \textcolor{blue}{Textual Coherence}, \textcolor{blue}{Holistic Assessment}, and \textcolor{blue}{PaperQuiz}. This benchmark pairs recent conference papers with author-designed posters, enabling systematic comparison and evaluation of generated posters."
19
  },
20
  {
21
- "title": "PosterAgent Framework",
22
- "content": "Our proposed \textbf{PosterAgent} framework is a \textit{multi-agent pipeline} that transforms scientific papers into structured visual posters. It consists of three components: \textcolor{blue}{Parser}, \textcolor{blue}{Planner}, and \textcolor{blue}{Painter-Commenter}. The Parser distills the paper into a structured asset library, the Planner aligns text-visual pairs into a binary-tree layout, and the Painter-Commenter loop refines each panel using VLM feedback."
23
  },
24
  {
25
- "title": "Evaluation & Results",
26
- "content": "Our comprehensive evaluation reveals that \textbf{PosterAgent} outperforms existing systems across nearly all metrics, using \textcolor{blue}{87\\% fewer tokens}. While GPT-4o outputs are visually appealing, they suffer from \textcolor{red}{noisy text} and poor PaperQuiz scores. Our open-source variants, based on Qwen-2.5, achieve superior performance, highlighting the effectiveness of our \textit{visual-semantic-aware asset library} and \textit{layout generation}."
27
  },
28
  {
29
- "title": "Conclusion",
30
- "content": "We present \textbf{Paper2Poster}, a new benchmark for poster generation, and the \textbf{PosterAgent} framework, which significantly enhances generation quality. Our findings chart clear directions for the next generation of fully automated poster-generation models, emphasizing the importance of \textit{structured parsing}, \textit{hierarchical planning}, and \textit{visual feedback}."
31
  }
32
  ]
33
  }
 
1
  {
2
  "meta": {
3
+ "poster_title": "Paper2Poster: Towards Multimodal Poster",
4
+ "authors": "Wei Pang\\textsuperscript{1}, Kevin Qinghong Lin\\textsuperscript{2}, Xiangru Jian\\textsuperscript{1}, Xi He\\textsuperscript{1}, Philip Torr\\textsuperscript{3}",
5
+ "affiliations": "1 University of Waterloo; 2 National University of Singapore; 3 University of Oxford"
6
  },
7
  "sections": [
8
  {
9
+ "title": "Why Posters Are Hard",
10
+ "content": "We target \\textbf{single-page, multimodal compression} of \\textit{20K+ tokens} into clear panels. Posters demand \\textcolor{blue}{tight text\u2013visual coupling}, \\textbf{layout balance}, and \\textit{readable density}. Pure LLM/VLM approaches \\textcolor{red}{miss spatial feedback}, causing overflow and incoherence. We reveal that \\textbf{visual-in-the-loop planning} is essential to preserve reading order, keep figures relevant, and sustain \\textit{engagement} within hard space limits."
11
  },
12
  {
13
+ "title": "Benchmark and Data",
14
+ "content": "We launch the \\textbf{Paper2Poster Benchmark}: \\textcolor{blue}{100 paper\u2013poster pairs} spanning \\textit{280 topics}. Average input: \\textcolor{blue}{20,370 tokens, 22.6 pages}. Output posters compress text by \\textcolor{blue}{14.4\u00d7} and figures by \\textcolor{blue}{2.6\u00d7}. Evaluation covers \\textbf{Visual Quality}, \\textbf{Textual Coherence}, \\textbf{VLM-as-Judge}, and \\textbf{PaperQuiz}. This suite spotlights \\textit{semantic alignment}, \\textbf{fluency}, and \\textcolor{blue}{reader comprehension}."
15
  },
16
  {
17
+ "title": "PaperQuiz: What Matters",
18
+ "content": "We generate \\textcolor{blue}{100 MCQs/paper}: \\textbf{50 verbatim} + \\textbf{50 interpretive}. Multiple VLM readers simulate \\textit{novice-to-expert} audiences and answer from the poster only. Scores are length-penalized to reward \\textbf{dense clarity}. Results \\textbf{correlate with human judgment}, proving PaperQuiz captures \\textcolor{blue}{information delivery} beyond surface visuals and discourages \\textcolor{red}{verbose, unfocused designs}."
19
  },
20
  {
21
+ "title": "PosterAgent Pipeline",
22
+ "content": "Our \\textbf{top-down, visual-in-the-loop} agent compresses long papers into coherent posters. \u2022 \\textbf{Parser} builds a structured asset library. \u2022 \\textbf{Planner} aligns text\u2013visual pairs and produces a \\textcolor{blue}{binary-tree layout}. \u2022 \\textbf{Painter\u2013Commenter} renders panels via code and uses VLM feedback to fix \\textcolor{red}{overflow} and misalignment. The result: \\textbf{balanced, legible}, editable posters."
23
  },
24
  {
25
+ "title": "Parser: Structured Assets",
26
+ "content": "We distill PDFs into \\textbf{section synopses} and \\textit{figure/table assets} using \\textcolor{blue}{MARKER} and \\textcolor{blue}{DOCLING}, then LLM summarization. The asset library preserves \\textbf{hierarchy} and \\textit{semantics} while shrinking context for efficient planning. This step boosts \\textbf{visual-semantic matching} and reduces \\textcolor{red}{noise}, enabling reliable downstream \\textit{layout reasoning}."
27
  },
28
  {
29
+ "title": "Planner: Layout Mastery",
30
+ "content": "We semantically match \\textbf{sections \u2194 figures} and allocate space via a \\textcolor{blue}{binary-tree layout} that preserves \\textit{reading order}, aspect ratios, and \\textbf{content length} estimates. Panels are populated iteratively, ensuring \\textbf{text brevity} and \\textit{visual balance}. This strategy stabilizes coordinates and avoids \\textcolor{red}{LLM numeric drift} in absolute placements."
31
+ },
32
+ {
33
+ "title": "Painter\u2013Commenter Loop",
34
+ "content": "The \\textbf{Painter} turns section\u2013figure pairs into crisp bullets and executable \\textcolor{blue}{python-pptx} code, rendering draft panels. The \\textbf{Commenter} VLM zooms into panels, using \\textit{in-context examples} to flag \\textcolor{red}{overflow} or \\textcolor{red}{blankness}. Iterations continue until \\textbf{fit and alignment} are achieved, producing \\textit{readable, compact} panels with minimal revision cycles."
35
+ },
36
+ {
37
+ "title": "Results: Stronger, Leaner",
38
+ "content": "Our open-source variants beat \\textcolor{blue}{4o-driven multi-agents} on most metrics, with \\textcolor{blue}{87\\% fewer tokens}. We hit \\textbf{state-of-the-art figure relevance}, near-\\textit{GT} visual similarity, and \\textbf{high VLM-as-Judge} scores. PaperQuiz confirms \\textbf{better knowledge transfer}. Cost is tiny: \\textcolor{blue}{\\$0.0045\u2013\\$0.55/poster}. Key bottleneck remains \\textcolor{red}{Engagement}, guiding future design."
39
+ },
40
+ {
41
+ "title": "Limits and Next Steps",
42
+ "content": "Current bottleneck: \\textbf{sequential panel refinement} slows throughput (~\\textcolor{blue}{4.5 min/doc}). We plan \\textbf{panel-level parallelism}, \\textit{external knowledge} integration (e.g., OpenReview), and \\textbf{human-in-the-loop} editing for higher \\textcolor{blue}{engagement}. These upgrades aim to boost \\textbf{runtime, interactivity}, and \\textit{visual storytelling}, pushing toward fully automated \\textbf{author-grade posters}."
43
  }
44
  ]
45
  }
posterbuilder/convert.py CHANGED
@@ -3,7 +3,7 @@
3
  import json, re, pathlib, shutil, os, math
4
 
5
  # ===================== Auto-locate project root =====================
6
- IMAGES_DIR_NAME = "<4o_4o>_images_and_tables" # name of the blue folder
7
 
8
  def find_project_root(start: pathlib.Path) -> pathlib.Path:
9
  cur = start.resolve()
@@ -33,7 +33,7 @@ OUTPUT_PATH = OUTPUT_DIR / "poster_output.tex"
33
  IMAGES_PARENTS = [ROOT_DIR / "Paper2Poster", ROOT_DIR]
34
 
35
  # ============ 放大与排版参数 ============
36
- BEAMER_SCALE_TARGET = 1.05 # new value for the template's \usepackage{beamerposter}[... scale=...]
37
  # Title font-size policy: single line, two lines, 3+ lines
38
  TITLE_SIZE_SINGLE = r"\Huge"
39
  TITLE_SIZE_WRAP1 = r"\huge"
@@ -46,9 +46,9 @@ BLOCK_BODY_SIZE_CMD = r"\large"
46
  CAPTION_SIZE_CMD = r"\small"
47
 
48
  # Base figure-enlargement parameters (initial values)
49
- FIG_ENLARGE_FACTOR = 1.08
50
- FIG_MIN_FRAC = 0.60
51
- FIG_MAX_FRAC = 0.98
52
 
53
  # Budget control: within each section, the allowed cap on cumulative figure height as a fraction of panel height (adapts to the word count)
54
  BASE_FIG_RATIO_LIMIT = 0.58 # baseline threshold
@@ -235,9 +235,9 @@ def inject_font_tweaks(tex: str, title_size_cmd: str) -> str:
235
  f"\\setbeamerfont{{institute}}{{size={INSTITUTE_SIZE_CMD}}}\n"
236
  f"\\setbeamerfont{{block title}}{{size={BLOCK_TITLE_SIZE_CMD}}}\n"
237
  f"\\setbeamerfont{{block body}}{{size={BLOCK_BODY_SIZE_CMD}}}\n"
238
- f"\\setbeamerfont{{caption}}{{size={CAPTION_SIZE_CMD}}}\n"
239
- "\\setlength{\\abovecaptionskip}{4pt}\n"
240
- "\\setlength{\\belowcaptionskip}{3pt}\n"
241
  )
242
  pos_doc = tex.find(r"\begin{document}")
243
  return tex[:pos_doc] + tweaks + tex[pos_doc:] if pos_doc != -1 else tex + tweaks
@@ -278,7 +278,7 @@ def inject_right_logo(tex: str) -> str:
278
  # ===================== Images and captions (relative to PaperShow/) =====================
279
  def load_arrangement_and_captions():
280
  arr = json.loads(ARRANGEMENT_PATH.read_text(encoding="utf-8"))
281
- panels = arr.get("panel_arrangement", [])
282
  figures = arr.get("figure_arrangement", [])
283
  panels_by_id = {p["panel_id"]: p for p in panels if "panel_id" in p}
284
 
@@ -341,7 +341,7 @@ def build_figures_for_sections(sections, panels_by_id, figures, cap_full, cap_ba
341
  if norm_title(sec.get("title","")) != norm_title("Poster Title & Author")}
342
  panelid_to_secidx = {}
343
  for p in panels_by_id.values():
344
- pname = norm_title(p.get("panel_name",""))
345
  if pname in sec_name_to_idx:
346
  panelid_to_secidx[p["panel_id"]] = sec_name_to_idx[pname]
347
 
@@ -414,7 +414,7 @@ def figures_to_latex(fig_list, out_tex_path: pathlib.Path, images_parent: pathli
414
  "\\begin{figure}\n"
415
  +"\\centering\n"
416
  +f"\\includegraphics[width={w:.2f}\\linewidth]{{{rel}}}\n"
417
- + (f"\\caption{{{cap}}}\n" if cap else "")
418
  +"\\end{figure}\n"
419
  )
420
  return "\n".join(chunks)
@@ -429,6 +429,8 @@ def build():
429
  sections = [s for s in sections_all if norm_title(s.get("title","")) != norm_title("Poster Title & Author")]
430
 
431
  panels_by_id, figures, cap_full, cap_base = load_arrangement_and_captions()
 
 
432
  sample_paths = [pathlib.Path(f.get("figure_path","")) for f in figures if f.get("figure_path")]
433
  images_parent = resolve_images_parent_dir(sample_paths)
434
 
@@ -463,6 +465,7 @@ def build():
463
  # Note: handle the curly braces above before the backslashes, otherwise the structure is broken prematurely
464
  cleaned_tex = cleaned_tex.replace(r"\\\\", r"\\") # avoid interference from double escaping
465
  cleaned_tex = cleaned_tex.replace(r"\\", "\\") # finally turn \\ into \
 
466
 
467
  OUTPUT_PATH.write_text(cleaned_tex, encoding="utf-8")
468
  print(f"✅ Wrote: {OUTPUT_PATH.relative_to(ROOT_DIR)}")
 
3
  import json, re, pathlib, shutil, os, math
4
 
5
  # ===================== Auto-locate project root =====================
6
+ IMAGES_DIR_NAME = "<gpt-5_gpt-5>_images_and_tables" # name of the blue folder
7
 
8
  def find_project_root(start: pathlib.Path) -> pathlib.Path:
9
  cur = start.resolve()
 
33
  IMAGES_PARENTS = [ROOT_DIR / "Paper2Poster", ROOT_DIR]
34
 
35
  # ============ Scaling and layout parameters ============
36
+ BEAMER_SCALE_TARGET = 1.0 # new value for the template's \usepackage{beamerposter}[... scale=...]
37
  # Title font-size policy: single line, two lines, 3+ lines
38
  TITLE_SIZE_SINGLE = r"\Huge"
39
  TITLE_SIZE_WRAP1 = r"\huge"
 
46
  CAPTION_SIZE_CMD = r"\small"
47
 
48
  # Base figure-enlargement parameters (initial values)
49
+ FIG_ENLARGE_FACTOR = 1.18
50
+ FIG_MIN_FRAC = 0.80
51
+ FIG_MAX_FRAC = 0.90
52
 
53
  # Budget control: within each section, the allowed cap on cumulative figure height as a fraction of panel height (adapts to the word count)
54
  BASE_FIG_RATIO_LIMIT = 0.58 # baseline threshold
 
235
  f"\\setbeamerfont{{institute}}{{size={INSTITUTE_SIZE_CMD}}}\n"
236
  f"\\setbeamerfont{{block title}}{{size={BLOCK_TITLE_SIZE_CMD}}}\n"
237
  f"\\setbeamerfont{{block body}}{{size={BLOCK_BODY_SIZE_CMD}}}\n"
238
+ # f"\\setbeamerfont{{caption}}{{size={CAPTION_SIZE_CMD}}}\n"
239
+ # "\\setlength{\\abovecaptionskip}{4pt}\n"
240
+ # "\\setlength{\\belowcaptionskip}{3pt}\n"
241
  )
242
  pos_doc = tex.find(r"\begin{document}")
243
  return tex[:pos_doc] + tweaks + tex[pos_doc:] if pos_doc != -1 else tex + tweaks
 
278
  # ===================== Images and captions (relative to PaperShow/) =====================
279
  def load_arrangement_and_captions():
280
  arr = json.loads(ARRANGEMENT_PATH.read_text(encoding="utf-8"))
281
+ panels = arr.get("panels", [])
282
  figures = arr.get("figure_arrangement", [])
283
  panels_by_id = {p["panel_id"]: p for p in panels if "panel_id" in p}
284
 
 
341
  if norm_title(sec.get("title","")) != norm_title("Poster Title & Author")}
342
  panelid_to_secidx = {}
343
  for p in panels_by_id.values():
344
+ pname = norm_title(p.get("section_name",""))
345
  if pname in sec_name_to_idx:
346
  panelid_to_secidx[p["panel_id"]] = sec_name_to_idx[pname]
347
 
 
414
  "\\begin{figure}\n"
415
  +"\\centering\n"
416
  +f"\\includegraphics[width={w:.2f}\\linewidth]{{{rel}}}\n"
417
+ # + (f"\\caption{{{cap}}}\n" if cap else "")
418
  +"\\end{figure}\n"
419
  )
420
  return "\n".join(chunks)
 
429
  sections = [s for s in sections_all if norm_title(s.get("title","")) != norm_title("Poster Title & Author")]
430
 
431
  panels_by_id, figures, cap_full, cap_base = load_arrangement_and_captions()
432
+ print(f"✅ Loaded arrangement and captions.")
433
+ print(panels_by_id.keys(),figures[:2])
434
  sample_paths = [pathlib.Path(f.get("figure_path","")) for f in figures if f.get("figure_path")]
435
  images_parent = resolve_images_parent_dir(sample_paths)
436
 
 
465
  # Note: handle the curly braces above before the backslashes, otherwise the structure is broken prematurely
466
  cleaned_tex = cleaned_tex.replace(r"\\\\", r"\\") # avoid interference from double escaping
467
  cleaned_tex = cleaned_tex.replace(r"\\", "\\") # finally turn \\ into \
468
+ cleaned_tex = cleaned_tex.replace(r"\t\t", "\\t")
469
 
470
  OUTPUT_PATH.write_text(cleaned_tex, encoding="utf-8")
471
  print(f"✅ Wrote: {OUTPUT_PATH.relative_to(ROOT_DIR)}")
posterbuilder/figure_caption.json CHANGED
@@ -1,7 +1,7 @@
1
  {
2
  "1": {
3
  "caption": "Figure 1: Overview of this work. We address two core challenges in scientific poster generation: Left: How to create a poster from a paper -we propose PosterAgent (Sec. 4), a framework that transforms long-context scientific papers (20K+ tokens) into structured visual posters; and Right: How to evaluate poster quality -weintroduce the Paper2Poster benchmark (Sec. 3), which enables systematic comparison between agent-generated and author-designed posters.",
4
- "image_path": "<4o_4o>_images_and_tables/paper/paper-picture-1.png",
5
  "width": 239,
6
  "height": 271,
7
  "figure_size": 64769,
@@ -9,7 +9,7 @@
9
  },
10
  "3": {
11
  "caption": "Paper ( 20K tokens )",
12
- "image_path": "<4o_4o>_images_and_tables/paper/paper-picture-3.png",
13
  "width": 398,
14
  "height": 265,
15
  "figure_size": 105470,
@@ -17,7 +17,7 @@
17
  },
18
  "6": {
19
  "caption": "Figure 2: Data Statistics of Paper2Poster. (a) Word cloud illustrating the diversity of research topics. (b) Textual Token statistics and Figure count statistics for input papers vs. posters provided by authors. Overall, these statistics highlight that Paper2Poster is a multimodal context compression task, requiring effective abstraction of both textual and visual content.",
20
- "image_path": "<4o_4o>_images_and_tables/paper/paper-picture-6.png",
21
  "width": 564,
22
  "height": 557,
23
  "figure_size": 314148,
@@ -25,7 +25,7 @@
25
  },
26
  "7": {
27
  "caption": "Figure 3: Left : Overview of the evaluation framework in Paper2Poster. Middle : We automatically generate multiple-choice questions from each paper using an LLM (o3), forming the our PaperQuiz evaluation. Right : In PaperQuiz, we simulate multiple reader by allowing VLMs-representing different expertise levels ( e.g., student, professor)-to read each generated poster and answer the quiz. The poster that achieves the highest average score is considered the most effective in conveying the paper's content.",
28
- "image_path": "<4o_4o>_images_and_tables/paper/paper-picture-7.png",
29
  "width": 1983,
30
  "height": 394,
31
  "figure_size": 781302,
@@ -33,7 +33,7 @@
33
  },
34
  "8": {
35
  "caption": "Figure 4: Illustration of the PosterAgent pipeline. Given an input paper, PosterAgent generates a structured academic poster through three modules: 1. Parser: Extracts key textual and visual assets using a combination of tools and LLM-based summarization, resulting in a structured asset library. 2. Planner: Matches assets and arranges them into coherent layouts, iteratively generating panels with a zoom-in operation. 3. Painter-Commenter: The Painter generates panel-level bullet-content along with executable code, and renders the visual output, while the Commenter-a VLM with in-context reference-provides feedback to ensure layout coherence and prevent content overflow.",
36
- "image_path": "<4o_4o>_images_and_tables/paper/paper-picture-8.png",
37
  "width": 1972,
38
  "height": 969,
39
  "figure_size": 1910868,
@@ -41,7 +41,7 @@
41
  },
42
  "9": {
43
  "caption": "Figure 5: PaperQuiz's Avg. scores across different Reader VLMs (x-axis) for each poster type (legend lines). Refer to Append. Tab. 3 for full model names.",
44
- "image_path": "<4o_4o>_images_and_tables/paper/paper-picture-9.png",
45
  "width": 769,
46
  "height": 505,
47
  "figure_size": 388345,
@@ -49,7 +49,7 @@
49
  },
50
  "10": {
51
  "caption": "Figure 7 presents the average token cost per poster across different methods. Our PosterAgent achieves great token efficiency, using only 101 . 1 K (4o-based) and 47 . 6 K (Qwen-based) tokens-reducing cost by 60% -87% compared to OWL-4o [6]. This translates to just $0 . 55 for 4o and $0 . 0045 for Qwen per poster, highlighting its effectiveness, (see Append. E.2 for further details).",
52
- "image_path": "<4o_4o>_images_and_tables/paper/paper-picture-10.png",
53
  "width": 1948,
54
  "height": 1100,
55
  "figure_size": 2142800,
@@ -57,7 +57,7 @@
57
  },
58
  "11": {
59
  "caption": "Figure 7: Average token consumptions for different methods. Details are provided in Appendix E.1.",
60
- "image_path": "<4o_4o>_images_and_tables/paper/paper-picture-11.png",
61
  "width": 701,
62
  "height": 505,
63
  "figure_size": 354005,
@@ -65,7 +65,7 @@
65
  },
66
  "12": {
67
  "caption": "Figure 6: PaperQuiz's Avg scores across different types of posters (x-axis) for readers (colored lines) on human evaluation subset.",
68
- "image_path": "<4o_4o>_images_and_tables/paper/paper-picture-12.png",
69
  "width": 661,
70
  "height": 428,
71
  "figure_size": 282908,
@@ -73,7 +73,7 @@
73
  },
74
  "13": {
75
  "caption": "Figure 10: Posters for MuSc: Zero-Shot Industrial Anomaly Classification and Segmentation with Mutual Scoring of the Unlabeled Images.",
76
- "image_path": "<4o_4o>_images_and_tables/paper/paper-picture-13.png",
77
  "width": 960,
78
  "height": 521,
79
  "figure_size": 500160,
@@ -81,7 +81,7 @@
81
  },
82
  "15": {
83
  "caption": "(b) PosterAgent-generated poster.(a) Author-designed poster.",
84
- "image_path": "<4o_4o>_images_and_tables/paper/paper-picture-15.png",
85
  "width": 1993,
86
  "height": 810,
87
  "figure_size": 1614330,
@@ -89,7 +89,7 @@
89
  },
90
  "16": {
91
  "caption": "(a) Author-designed poster.",
92
- "image_path": "<4o_4o>_images_and_tables/paper/paper-picture-16.png",
93
  "width": 945,
94
  "height": 680,
95
  "figure_size": 642600,
@@ -97,7 +97,7 @@
97
  },
98
  "17": {
99
  "caption": "(b) PosterAgent-generated poster.",
100
- "image_path": "<4o_4o>_images_and_tables/paper/paper-picture-17.png",
101
  "width": 957,
102
  "height": 708,
103
  "figure_size": 677556,
@@ -105,7 +105,7 @@
105
  },
106
  "18": {
107
  "caption": "Figure 11: Posters for Neuroformer: Multimodal and Multitask Generative Pretraining for Brain Data.(a) Author-designed poster.",
108
- "image_path": "<4o_4o>_images_and_tables/paper/paper-picture-18.png",
109
  "width": 938,
110
  "height": 620,
111
  "figure_size": 581560,
@@ -113,7 +113,7 @@
113
  },
114
  "19": {
115
  "caption": "Figure 12: Posters for Conformal Semantic Keypoint Detection with Statistical Guarantees.(a) Author-designed poster.",
116
- "image_path": "<4o_4o>_images_and_tables/paper/paper-picture-19.png",
117
  "width": 1176,
118
  "height": 596,
119
  "figure_size": 700896,
@@ -121,7 +121,7 @@
121
  },
122
  "20": {
123
  "caption": "Figure 13: Posters for Neural Tangent Kernels for Axis-Aligned Tree Ensembles.",
124
- "image_path": "<4o_4o>_images_and_tables/paper/paper-picture-20.png",
125
  "width": 790,
126
  "height": 598,
127
  "figure_size": 472420,
@@ -129,7 +129,7 @@
129
  },
130
  "22": {
131
  "caption": "(a) Author-designed poster.",
132
- "image_path": "<4o_4o>_images_and_tables/paper/paper-picture-22.png",
133
  "width": 929,
134
  "height": 583,
135
  "figure_size": 541607,
@@ -137,7 +137,7 @@
137
  },
138
  "23": {
139
  "caption": "Figure 16: Posters for Identifying the Context Shift between Test Benchmarks and Production Data.",
140
- "image_path": "<4o_4o>_images_and_tables/paper/paper-picture-23.png",
141
  "width": 958,
142
  "height": 646,
143
  "figure_size": 618868,
@@ -145,7 +145,7 @@
145
  },
146
  "24": {
147
  "caption": "(a) Author-designed poster.",
148
- "image_path": "<4o_4o>_images_and_tables/paper/paper-picture-24.png",
149
  "width": 1190,
150
  "height": 567,
151
  "figure_size": 674730,
@@ -153,7 +153,7 @@
153
  },
154
  "29": {
155
  "caption": "(a) Direct.",
156
- "image_path": "<4o_4o>_images_and_tables/paper/paper-picture-29.png",
157
  "width": 896,
158
  "height": 323,
159
  "figure_size": 289408,
@@ -161,7 +161,7 @@
161
  },
162
  "30": {
163
  "caption": "(b) Tree.(c) Tree + Commenter.",
164
- "image_path": "<4o_4o>_images_and_tables/paper/paper-picture-30.png",
165
  "width": 899,
166
  "height": 644,
167
  "figure_size": 578956,
@@ -169,7 +169,7 @@
169
  },
170
  "31": {
171
  "caption": "Figure 17: Ablation study on Neuro-Symbolic Language Modeling with Automaton-augmented Retrieval. Text overflow areas are highlighted with red bounding boxes.",
172
- "image_path": "<4o_4o>_images_and_tables/paper/paper-picture-31.png",
173
  "width": 897,
174
  "height": 679,
175
  "figure_size": 609063,
@@ -177,7 +177,7 @@
177
  },
178
  "33": {
179
  "caption": "Figure 18: Ablation study on Visual Correspondence Hallucination. Text overflow areas are highlighted with red bounding boxes.",
180
- "image_path": "<4o_4o>_images_and_tables/paper/paper-picture-33.png",
181
  "width": 895,
182
  "height": 274,
183
  "figure_size": 245230,
@@ -185,7 +185,7 @@
185
  },
186
  "34": {
187
  "caption": "(b) Tree.",
188
- "image_path": "<4o_4o>_images_and_tables/paper/paper-picture-34.png",
189
  "width": 900,
190
  "height": 511,
191
  "figure_size": 459900,
@@ -193,7 +193,7 @@
193
  },
194
  "35": {
195
  "caption": "(c) Tree + Commenter.",
196
- "image_path": "<4o_4o>_images_and_tables/paper/paper-picture-35.png",
197
  "width": 901,
198
  "height": 513,
199
  "figure_size": 462213,
@@ -201,7 +201,7 @@
201
  },
202
  "37": {
203
  "caption": "Figure 19: Ablation study on DARTFormer: Finding The Best Type Of Attention. Text overflow areas are highlighted with red bounding boxes, large blank regions are highlighted with purple bounding boxes.",
204
- "image_path": "<4o_4o>_images_and_tables/paper/paper-picture-37.png",
205
  "width": 895,
206
  "height": 747,
207
  "figure_size": 668565,
@@ -209,7 +209,7 @@
209
  },
210
  "39": {
211
  "caption": "(c) Tree + Commenter.",
212
- "image_path": "<4o_4o>_images_and_tables/paper/paper-picture-39.png",
213
  "width": 899,
214
  "height": 1187,
215
  "figure_size": 1067113,
@@ -217,7 +217,7 @@
217
  },
218
  "41": {
219
  "caption": "Figure 20: Ablation study on CW-ERM: Improving Autonomous Driving Planning with Closed-loop Weighted Empirical Risk Minimization. Text overflow areas are highlighted with red bounding boxes, and large blank regions are highlighted with purple bounding boxes.",
220
- "image_path": "<4o_4o>_images_and_tables/paper/paper-picture-41.png",
221
  "width": 898,
222
  "height": 1345,
223
  "figure_size": 1207810,
@@ -225,7 +225,7 @@
225
  },
226
  "43": {
227
  "caption": "(c) Tree + Commenter.",
228
- "image_path": "<4o_4o>_images_and_tables/paper/paper-picture-43.png",
229
  "width": 908,
230
  "height": 1341,
231
  "figure_size": 1217628,
@@ -233,7 +233,7 @@
233
  },
234
  "45": {
235
  "caption": "Figure 21: Ablation study on DeepJoint: Robust Survival Modelling Under Clinical Presence Shift. Text overflow areas are highlighted with red bounding boxes.",
236
- "image_path": "<4o_4o>_images_and_tables/paper/paper-picture-45.png",
237
  "width": 894,
238
  "height": 1234,
239
  "figure_size": 1103196,
@@ -241,7 +241,7 @@
241
  },
242
  "48": {
243
  "caption": "(c) Tree + Commenter.",
244
- "image_path": "<4o_4o>_images_and_tables/paper/paper-picture-48.png",
245
  "width": 902,
246
  "height": 1266,
247
  "figure_size": 1141932,
@@ -249,7 +249,7 @@
249
  },
250
  "49": {
251
  "caption": "(a) A poster generated by 4o-Image , where substantial corrupted text is generated.",
252
- "image_path": "<4o_4o>_images_and_tables/paper/paper-picture-49.png",
253
  "width": 949,
254
  "height": 1409,
255
  "figure_size": 1337141,
@@ -257,7 +257,7 @@
257
  },
258
  "50": {
259
  "caption": "(b) A poster generated by PPTAgent , where meaningless template placeholder text is remained.",
260
- "image_path": "<4o_4o>_images_and_tables/paper/paper-picture-50.png",
261
  "width": 956,
262
  "height": 1433,
263
  "figure_size": 1369948,
@@ -265,7 +265,7 @@
265
  },
266
  "51": {
267
  "caption": "Figure 22: Examples of posters with corrupted text.(a) A poster generated by 4o-Image , where the poster is cutoff horizontally due to incomplete generation.",
268
- "image_path": "<4o_4o>_images_and_tables/paper/paper-picture-51.png",
269
  "width": 966,
270
  "height": 887,
271
  "figure_size": 856842,
@@ -273,7 +273,7 @@
273
  },
274
  "52": {
275
  "caption": "Figure 23: Examples of posters with cutoff.",
276
- "image_path": "<4o_4o>_images_and_tables/paper/paper-picture-52.png",
277
  "width": 948,
278
  "height": 962,
279
  "figure_size": 911976,
@@ -281,7 +281,7 @@
281
  },
282
  "53": {
283
  "caption": "(a) A poster produced by 4o-Image , featuring a figure that is low-resolution, visually corrupted, and unintelligible.",
284
- "image_path": "<4o_4o>_images_and_tables/paper/paper-picture-53.png",
285
  "width": 968,
286
  "height": 951,
287
  "figure_size": 920568,
@@ -289,7 +289,7 @@
289
  },
290
  "54": {
291
  "caption": "(b) A poster generated by PPTAgent , where figures are rendered too small to be legible.",
292
- "image_path": "<4o_4o>_images_and_tables/paper/paper-picture-54.png",
293
  "width": 958,
294
  "height": 1277,
295
  "figure_size": 1223366,
@@ -297,7 +297,7 @@
297
  },
298
  "55": {
299
  "caption": "Figure 24: Examples of posters with obscure figures.(a) A poster generated by OWL-4o , where there are large blanks on the poster.",
300
- "image_path": "<4o_4o>_images_and_tables/paper/paper-picture-55.png",
301
  "width": 954,
302
  "height": 680,
303
  "figure_size": 648720,
@@ -305,7 +305,7 @@
305
  },
306
  "56": {
307
  "caption": "Figure 25: Examples of posters with large blanks.",
308
- "image_path": "<4o_4o>_images_and_tables/paper/paper-picture-56.png",
309
  "width": 955,
310
  "height": 723,
311
  "figure_size": 690465,
@@ -313,7 +313,7 @@
313
  },
314
  "57": {
315
  "caption": "(a) A poster generated by OWL-4o , where no figures are inserted into poster.",
316
- "image_path": "<4o_4o>_images_and_tables/paper/paper-picture-57.png",
317
  "width": 959,
318
  "height": 549,
319
  "figure_size": 526491,
@@ -321,7 +321,7 @@
321
  },
322
  "58": {
323
  "caption": "Figure 26: Examples of posters without figures.",
324
- "image_path": "<4o_4o>_images_and_tables/paper/paper-picture-58.png",
325
  "width": 962,
326
  "height": 1435,
327
  "figure_size": 1380470,
@@ -329,7 +329,7 @@
329
  },
330
  "59": {
331
  "caption": "(a) A poster generated by PosterAgent-Qwen , where there is text overflowing outside textbox.",
332
- "image_path": "<4o_4o>_images_and_tables/paper/paper-picture-59.png",
333
  "width": 957,
334
  "height": 1277,
335
  "figure_size": 1222089,
@@ -337,7 +337,7 @@
337
  },
338
  "60": {
339
  "caption": "Figure 27: Examples of posters with textual overflow.",
340
- "image_path": "<4o_4o>_images_and_tables/paper/paper-picture-60.png",
341
  "width": 956,
342
  "height": 640,
343
  "figure_size": 611840,
@@ -345,7 +345,7 @@
345
  },
346
  "61": {
347
  "caption": "Figure 29: In-context references for the commenter help the VLM better identify whether the current panel falls into a failure case.",
348
- "image_path": "<4o_4o>_images_and_tables/paper/paper-picture-61.png",
349
  "width": 1199,
350
  "height": 828,
351
  "figure_size": 992772,
@@ -353,7 +353,7 @@
353
  },
354
  "63": {
355
  "caption": "Figure 28: Failure generation examples by Stable Diffusion Ultra model [28].",
356
- "image_path": "<4o_4o>_images_and_tables/paper/paper-picture-63.png",
357
  "width": 1193,
358
  "height": 785,
359
  "figure_size": 936505,
 
1
  {
2
  "1": {
3
  "caption": "Figure 1: Overview of this work. We address two core challenges in scientific poster generation: Left: How to create a poster from a paper -we propose PosterAgent (Sec. 4), a framework that transforms long-context scientific papers (20K+ tokens) into structured visual posters; and Right: How to evaluate poster quality -weintroduce the Paper2Poster benchmark (Sec. 3), which enables systematic comparison between agent-generated and author-designed posters.",
4
+ "image_path": "<gpt-5_gpt-5>_images_and_tables/paper/paper-picture-1.png",
5
  "width": 239,
6
  "height": 271,
7
  "figure_size": 64769,
 
9
  },
10
  "3": {
11
  "caption": "Paper ( 20K tokens )",
12
+ "image_path": "<gpt-5_gpt-5>_images_and_tables/paper/paper-picture-3.png",
13
  "width": 398,
14
  "height": 265,
15
  "figure_size": 105470,
 
17
  },
18
  "6": {
19
  "caption": "Figure 2: Data Statistics of Paper2Poster. (a) Word cloud illustrating the diversity of research topics. (b) Textual Token statistics and Figure count statistics for input papers vs. posters provided by authors. Overall, these statistics highlight that Paper2Poster is a multimodal context compression task, requiring effective abstraction of both textual and visual content.",
20
+ "image_path": "<gpt-5_gpt-5>_images_and_tables/paper/paper-picture-6.png",
21
  "width": 564,
22
  "height": 557,
23
  "figure_size": 314148,
 
25
  },
26
  "7": {
27
  "caption": "Figure 3: Left : Overview of the evaluation framework in Paper2Poster. Middle : We automatically generate multiple-choice questions from each paper using an LLM (o3), forming the our PaperQuiz evaluation. Right : In PaperQuiz, we simulate multiple reader by allowing VLMs-representing different expertise levels ( e.g., student, professor)-to read each generated poster and answer the quiz. The poster that achieves the highest average score is considered the most effective in conveying the paper's content.",
28
+ "image_path": "<gpt-5_gpt-5>_images_and_tables/paper/paper-picture-7.png",
29
  "width": 1983,
30
  "height": 394,
31
  "figure_size": 781302,
 
33
  },
34
  "8": {
35
  "caption": "Figure 4: Illustration of the PosterAgent pipeline. Given an input paper, PosterAgent generates a structured academic poster through three modules: 1. Parser: Extracts key textual and visual assets using a combination of tools and LLM-based summarization, resulting in a structured asset library. 2. Planner: Matches assets and arranges them into coherent layouts, iteratively generating panels with a zoom-in operation. 3. Painter-Commenter: The Painter generates panel-level bullet-content along with executable code, and renders the visual output, while the Commenter-a VLM with in-context reference-provides feedback to ensure layout coherence and prevent content overflow.",
36
+ "image_path": "<gpt-5_gpt-5>_images_and_tables/paper/paper-picture-8.png",
37
  "width": 1972,
38
  "height": 969,
39
  "figure_size": 1910868,
 
41
  },
42
  "9": {
43
  "caption": "Figure 5: PaperQuiz's Avg. scores across different Reader VLMs (x-axis) for each poster type (legend lines). Refer to Append. Tab. 3 for full model names.",
44
+ "image_path": "<gpt-5_gpt-5>_images_and_tables/paper/paper-picture-9.png",
45
  "width": 769,
46
  "height": 505,
47
  "figure_size": 388345,
 
49
  },
50
  "10": {
51
  "caption": "Figure 7 presents the average token cost per poster across different methods. Our PosterAgent achieves great token efficiency, using only 101 . 1 K (4o-based) and 47 . 6 K (Qwen-based) tokens-reducing cost by 60% -87% compared to OWL-4o [6]. This translates to just $0 . 55 for 4o and $0 . 0045 for Qwen per poster, highlighting its effectiveness, (see Append. E.2 for further details).",
52
+ "image_path": "<gpt-5_gpt-5>_images_and_tables/paper/paper-picture-10.png",
53
  "width": 1948,
54
  "height": 1100,
55
  "figure_size": 2142800,
 
57
  },
58
  "11": {
59
  "caption": "Figure 7: Average token consumptions for different methods. Details are provided in Appendix E.1.",
60
+ "image_path": "<gpt-5_gpt-5>_images_and_tables/paper/paper-picture-11.png",
61
  "width": 701,
62
  "height": 505,
63
  "figure_size": 354005,
 
65
  },
66
  "12": {
67
  "caption": "Figure 6: PaperQuiz's Avg scores across different types of posters (x-axis) for readers (colored lines) on human evaluation subset.",
68
+ "image_path": "<gpt-5_gpt-5>_images_and_tables/paper/paper-picture-12.png",
69
  "width": 661,
70
  "height": 428,
71
  "figure_size": 282908,
 
73
  },
74
  "13": {
75
  "caption": "Figure 10: Posters for MuSc: Zero-Shot Industrial Anomaly Classification and Segmentation with Mutual Scoring of the Unlabeled Images.",
76
+ "image_path": "<gpt-5_gpt-5>_images_and_tables/paper/paper-picture-13.png",
77
  "width": 960,
78
  "height": 521,
79
  "figure_size": 500160,
 
81
  },
82
  "15": {
83
  "caption": "(b) PosterAgent-generated poster.(a) Author-designed poster.",
84
+ "image_path": "<gpt-5_gpt-5>_images_and_tables/paper/paper-picture-15.png",
85
  "width": 1993,
86
  "height": 810,
87
  "figure_size": 1614330,
 
89
  },
90
  "16": {
91
  "caption": "(a) Author-designed poster.",
92
+ "image_path": "<gpt-5_gpt-5>_images_and_tables/paper/paper-picture-16.png",
93
  "width": 945,
94
  "height": 680,
95
  "figure_size": 642600,
 
97
  },
98
  "17": {
99
  "caption": "(b) PosterAgent-generated poster.",
100
+ "image_path": "<gpt-5_gpt-5>_images_and_tables/paper/paper-picture-17.png",
101
  "width": 957,
102
  "height": 708,
103
  "figure_size": 677556,
 
105
  },
106
  "18": {
107
  "caption": "Figure 11: Posters for Neuroformer: Multimodal and Multitask Generative Pretraining for Brain Data.(a) Author-designed poster.",
108
+ "image_path": "<gpt-5_gpt-5>_images_and_tables/paper/paper-picture-18.png",
109
  "width": 938,
110
  "height": 620,
111
  "figure_size": 581560,
 
113
  },
114
  "19": {
115
  "caption": "Figure 12: Posters for Conformal Semantic Keypoint Detection with Statistical Guarantees.(a) Author-designed poster.",
116
+ "image_path": "<gpt-5_gpt-5>_images_and_tables/paper/paper-picture-19.png",
117
  "width": 1176,
118
  "height": 596,
119
  "figure_size": 700896,
 
121
  },
122
  "20": {
123
  "caption": "Figure 13: Posters for Neural Tangent Kernels for Axis-Aligned Tree Ensembles.",
124
+ "image_path": "<gpt-5_gpt-5>_images_and_tables/paper/paper-picture-20.png",
125
  "width": 790,
126
  "height": 598,
127
  "figure_size": 472420,
 
129
  },
130
  "22": {
131
  "caption": "(a) Author-designed poster.",
132
+ "image_path": "<gpt-5_gpt-5>_images_and_tables/paper/paper-picture-22.png",
133
  "width": 929,
134
  "height": 583,
135
  "figure_size": 541607,
 
137
  },
138
  "23": {
139
  "caption": "Figure 16: Posters for Identifying the Context Shift between Test Benchmarks and Production Data.",
140
+ "image_path": "<gpt-5_gpt-5>_images_and_tables/paper/paper-picture-23.png",
141
  "width": 958,
142
  "height": 646,
143
  "figure_size": 618868,
 
145
  },
146
  "24": {
147
  "caption": "(a) Author-designed poster.",
148
+ "image_path": "<gpt-5_gpt-5>_images_and_tables/paper/paper-picture-24.png",
149
  "width": 1190,
150
  "height": 567,
151
  "figure_size": 674730,
 
153
  },
154
  "29": {
155
  "caption": "(a) Direct.",
156
+ "image_path": "<gpt-5_gpt-5>_images_and_tables/paper/paper-picture-29.png",
157
  "width": 896,
158
  "height": 323,
159
  "figure_size": 289408,
 
161
  },
162
  "30": {
163
  "caption": "(b) Tree.(c) Tree + Commenter.",
164
+ "image_path": "<gpt-5_gpt-5>_images_and_tables/paper/paper-picture-30.png",
165
  "width": 899,
166
  "height": 644,
167
  "figure_size": 578956,
 
169
  },
170
  "31": {
171
  "caption": "Figure 17: Ablation study on Neuro-Symbolic Language Modeling with Automaton-augmented Retrieval. Text overflow areas are highlighted with red bounding boxes.",
172
+ "image_path": "<gpt-5_gpt-5>_images_and_tables/paper/paper-picture-31.png",
173
  "width": 897,
174
  "height": 679,
175
  "figure_size": 609063,
 
177
  },
178
  "33": {
179
  "caption": "Figure 18: Ablation study on Visual Correspondence Hallucination. Text overflow areas are highlighted with red bounding boxes.",
180
+ "image_path": "<gpt-5_gpt-5>_images_and_tables/paper/paper-picture-33.png",
181
  "width": 895,
182
  "height": 274,
183
  "figure_size": 245230,
 
185
  },
186
  "34": {
187
  "caption": "(b) Tree.",
188
+ "image_path": "<gpt-5_gpt-5>_images_and_tables/paper/paper-picture-34.png",
189
  "width": 900,
190
  "height": 511,
191
  "figure_size": 459900,
 
193
  },
194
  "35": {
195
  "caption": "(c) Tree + Commenter.",
196
+ "image_path": "<gpt-5_gpt-5>_images_and_tables/paper/paper-picture-35.png",
197
  "width": 901,
198
  "height": 513,
199
  "figure_size": 462213,
 
201
  },
202
  "37": {
203
  "caption": "Figure 19: Ablation study on DARTFormer: Finding The Best Type Of Attention. Text overflow areas are highlighted with red bounding boxes, large blank regions are highlighted with purple bounding boxes.",
204
+ "image_path": "<gpt-5_gpt-5>_images_and_tables/paper/paper-picture-37.png",
205
  "width": 895,
206
  "height": 747,
207
  "figure_size": 668565,
 
209
  },
210
  "39": {
211
  "caption": "(c) Tree + Commenter.",
212
+ "image_path": "<gpt-5_gpt-5>_images_and_tables/paper/paper-picture-39.png",
213
  "width": 899,
214
  "height": 1187,
215
  "figure_size": 1067113,
 
217
  },
218
  "41": {
219
  "caption": "Figure 20: Ablation study on CW-ERM: Improving Autonomous Driving Planning with Closed-loop Weighted Empirical Risk Minimization. Text overflow areas are highlighted with red bounding boxes, and large blank regions are highlighted with purple bounding boxes.",
220
+ "image_path": "<gpt-5_gpt-5>_images_and_tables/paper/paper-picture-41.png",
221
  "width": 898,
222
  "height": 1345,
223
  "figure_size": 1207810,
 
225
  },
226
  "43": {
227
  "caption": "(c) Tree + Commenter.",
228
+ "image_path": "<gpt-5_gpt-5>_images_and_tables/paper/paper-picture-43.png",
229
  "width": 908,
230
  "height": 1341,
231
  "figure_size": 1217628,
 
233
  },
234
  "45": {
235
  "caption": "Figure 21: Ablation study on DeepJoint: Robust Survival Modelling Under Clinical Presence Shift. Text overflow areas are highlighted with red bounding boxes.",
236
+ "image_path": "<gpt-5_gpt-5>_images_and_tables/paper/paper-picture-45.png",
237
  "width": 894,
238
  "height": 1234,
239
  "figure_size": 1103196,
 
241
  },
242
  "48": {
243
  "caption": "(c) Tree + Commenter.",
244
+ "image_path": "<gpt-5_gpt-5>_images_and_tables/paper/paper-picture-48.png",
245
  "width": 902,
246
  "height": 1266,
247
  "figure_size": 1141932,
 
249
  },
250
  "49": {
251
  "caption": "(a) A poster generated by 4o-Image , where substantial corrupted text is generated.",
252
+ "image_path": "<gpt-5_gpt-5>_images_and_tables/paper/paper-picture-49.png",
253
  "width": 949,
254
  "height": 1409,
255
  "figure_size": 1337141,
 
257
  },
258
  "50": {
259
  "caption": "(b) A poster generated by PPTAgent , where meaningless template placeholder text is remained.",
260
+ "image_path": "<gpt-5_gpt-5>_images_and_tables/paper/paper-picture-50.png",
261
  "width": 956,
262
  "height": 1433,
263
  "figure_size": 1369948,
 
265
  },
266
  "51": {
267
  "caption": "Figure 22: Examples of posters with corrupted text.(a) A poster generated by 4o-Image , where the poster is cutoff horizontally due to incomplete generation.",
268
+ "image_path": "<gpt-5_gpt-5>_images_and_tables/paper/paper-picture-51.png",
269
  "width": 966,
270
  "height": 887,
271
  "figure_size": 856842,
 
273
  },
274
  "52": {
275
  "caption": "Figure 23: Examples of posters with cutoff.",
276
+ "image_path": "<gpt-5_gpt-5>_images_and_tables/paper/paper-picture-52.png",
277
  "width": 948,
278
  "height": 962,
279
  "figure_size": 911976,
 
281
  },
282
  "53": {
283
  "caption": "(a) A poster produced by 4o-Image , featuring a figure that is low-resolution, visually corrupted, and unintelligible.",
284
+ "image_path": "<gpt-5_gpt-5>_images_and_tables/paper/paper-picture-53.png",
285
  "width": 968,
286
  "height": 951,
287
  "figure_size": 920568,
 
289
  },
290
  "54": {
291
  "caption": "(b) A poster generated by PPTAgent , where figures are rendered too small to be legible.",
292
+ "image_path": "<gpt-5_gpt-5>_images_and_tables/paper/paper-picture-54.png",
293
  "width": 958,
294
  "height": 1277,
295
  "figure_size": 1223366,
 
297
  },
298
  "55": {
299
  "caption": "Figure 24: Examples of posters with obscure figures.(a) A poster generated by OWL-4o , where there are large blanks on the poster.",
300
+ "image_path": "<gpt-5_gpt-5>_images_and_tables/paper/paper-picture-55.png",
301
  "width": 954,
302
  "height": 680,
303
  "figure_size": 648720,
 
305
  },
306
  "56": {
307
  "caption": "Figure 25: Examples of posters with large blanks.",
308
+ "image_path": "<gpt-5_gpt-5>_images_and_tables/paper/paper-picture-56.png",
309
  "width": 955,
310
  "height": 723,
311
  "figure_size": 690465,
 
313
  },
314
  "57": {
315
  "caption": "(a) A poster generated by OWL-4o , where no figures are inserted into poster.",
316
+ "image_path": "<gpt-5_gpt-5>_images_and_tables/paper/paper-picture-57.png",
317
  "width": 959,
318
  "height": 549,
319
  "figure_size": 526491,
 
321
  },
322
  "58": {
323
  "caption": "Figure 26: Examples of posters without figures.",
324
+ "image_path": "<gpt-5_gpt-5>_images_and_tables/paper/paper-picture-58.png",
325
  "width": 962,
326
  "height": 1435,
327
  "figure_size": 1380470,
 
329
  },
330
  "59": {
331
  "caption": "(a) A poster generated by PosterAgent-Qwen , where there is text overflowing outside textbox.",
332
+ "image_path": "<gpt-5_gpt-5>_images_and_tables/paper/paper-picture-59.png",
333
  "width": 957,
334
  "height": 1277,
335
  "figure_size": 1222089,
 
337
  },
338
  "60": {
339
  "caption": "Figure 27: Examples of posters with textual overflow.",
340
+ "image_path": "<gpt-5_gpt-5>_images_and_tables/paper/paper-picture-60.png",
341
  "width": 956,
342
  "height": 640,
343
  "figure_size": 611840,
 
345
  },
346
  "61": {
347
  "caption": "Figure 29: In-context references for the commenter help the VLM better identify whether the current panel falls into a failure case.",
348
+ "image_path": "<gpt-5_gpt-5>_images_and_tables/paper/paper-picture-61.png",
349
  "width": 1199,
350
  "height": 828,
351
  "figure_size": 992772,
 
353
  },
354
  "63": {
355
  "caption": "Figure 28: Failure generation examples by Stable Diffusion Ultra model [28].",
356
+ "image_path": "<gpt-5_gpt-5>_images_and_tables/paper/paper-picture-63.png",
357
  "width": 1193,
358
  "height": 785,
359
  "figure_size": 936505,
posterbuilder/latex_proj/figures.zip DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:b121c93aec90deeba8b04260f15d228536bfda456001dd9499d97cb35ff3d17b
3
- size 1911874
 
 
 
 
posterbuilder/latex_proj/poster_output.tex CHANGED
@@ -11,7 +11,7 @@
11
 
12
  \usepackage[T1]{fontenc}
13
  \usepackage{lmodern}
14
- \usepackage[size=custom,width=120,height=72,scale=1.05]{beamerposter}
15
  \usetheme{gemini}
16
  \usecolortheme{cam}
17
  \usepackage{graphicx}
@@ -22,6 +22,13 @@
22
  \pgfplotsset{compat=1.14}
23
  \usepackage{anyfontsize}
24
 
 
 
 
 
 
 
 
25
  % ====================
26
  % Lengths
27
  % ====================
@@ -39,11 +46,11 @@
39
  % Title
40
  % ====================
41
 
42
- \title{Paper2Poster: \\ Towards Multimodal Poster Automation from Scientific Papers}
43
 
44
- \author{Wei Pang, Kevin Qinghong Lin, Xiangru Jian, Xi He, Philip Torr}
45
 
46
- \institute[shortinst]{1 University of Waterloo, 2 National University of Singapore, 3 University of Oxford}
47
 
48
  % ====================
49
  % Footer (optional)
@@ -60,8 +67,8 @@
60
  % ====================
61
 
62
  % use this to include logos on the left and/or right side of the header:
63
- % \logoright{\includegraphics[height=7cm]{logo1.pdf}}
64
- % \logoleft{\includegraphics[height=7cm]{logo2.pdf}}
65
 
66
  % ====================
67
  % Body
@@ -69,14 +76,11 @@
69
 
70
 
71
  % --- injected font tweaks ---
72
- \setbeamerfont{title}{size=\huge}
73
  \setbeamerfont{author}{size=\Large}
74
  \setbeamerfont{institute}{size=\large}
75
  \setbeamerfont{block title}{size=\Large}
76
  \setbeamerfont{block body}{size=\large}
77
- \setbeamerfont{caption}{size=\small}
78
- \setlength{\abovecaptionskip}{4pt}
79
- \setlength{\belowcaptionskip}{3pt}
80
  \begin{document}
81
 
82
  % Refer to https://github.com/k4rtik/uchicago-poster
@@ -85,28 +89,39 @@
85
  {
86
  \begin{tikzpicture}[remember picture,overlay]
87
  \node [anchor=north west, inner sep=3cm] at ([xshift=0.0cm,yshift=1.0cm]current page.north west)
88
- {\includegraphics[height=4.5cm]{logos/cambridge-reversed-color-logo.eps}};
89
-
90
- \node[anchor=north east, inner sep=2.0cm] at ([xshift=-2.0cm,yshift=0.0cm]current page.north east)
91
- {\includegraphics[height=6.0cm]{logo.png}};
92
- \end{tikzpicture}
93
  }
94
 
95
  \begin{frame}[t]
96
  \begin{columns}[t]
97
  \separatorcolumn
98
  \begin{column}{\colwidth}
99
- \begin{block}{Introduction}
100
- Academic posters are crucial for \textbf\{scientific communication\}, allowing rapid dissemination of key findings. Unlike slide decks, posters must condense entire papers into a single page, requiring \textit\{multi-modal context handling\}, \textcolor\{red\}\{tight text-graphics interleaving\}, and \textcolor\{red\}\{spatial constraint respect\}. Existing VLM- or LLM-only approaches lack explicit visual feedback, making it difficult to maintain logical flow and legibility.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
101
  \end{block}
102
 
103
- \begin{block}{Benchmark \& Metrics}
104
- We introduce the \textbf\{Paper2Poster Benchmark\}, the first benchmark for poster generation, evaluating outputs on \textcolor\{blue\}\{Visual Quality\}, \textcolor\{blue\}\{Textual Coherence\}, \textcolor\{blue\}\{Holistic Assessment\}, and \textcolor\{blue\}\{PaperQuiz\}. This benchmark pairs recent conference papers with author-designed posters, enabling systematic comparison and evaluation of generated posters.
105
 
106
  \begin{figure}
107
  \centering
108
- \includegraphics[width=0.60\linewidth]{figures/paper-picture-1.png}
109
- \caption{Overview of this work. We address two core challenges in scientific poster generation: Left: How to create a poster from a paper -we propose PosterAgent (Sec. 4), a framework that transforms long-context scientific papers (20K+ tokens) into structured visual posters; and Right: How to evaluate poster quality -weintroduce the Paper2Poster benchmark (Sec. 3), which enables systematic comparison between agent-generated and author-designed posters.}
110
  \end{figure}
111
 
112
  \end{block}
@@ -114,23 +129,26 @@ We introduce the \textbf\{Paper2Poster Benchmark\}, the first benchmark for pos
114
  \end{column}
115
  \separatorcolumn
116
  \begin{column}{\colwidth}
117
- \begin{block}{PosterAgent Framework}
118
- Our proposed \textbf\{PosterAgent\} framework is a \textit\{multi-agent pipeline\} that transforms scientific papers into structured visual posters. It consists of three components: \textcolor\{blue\}\{Parser\}, \textcolor\{blue\}\{Planner\}, and \textcolor\{blue\}\{Painter-Commenter\}. The Parser distills the paper into a structured asset library, the Planner aligns text-visual pairs into a binary-tree layout, and the Painter-Commenter loop refines each panel using VLM feedback.
119
 
120
  \begin{figure}
121
  \centering
122
- \includegraphics[width=0.78\linewidth]{figures/paper-picture-8.png}
123
- \caption{Illustration of the PosterAgent pipeline. Given an input paper, PosterAgent generates a structured academic poster through three modules: 1. Parser: Extracts key textual and visual assets using a combination of tools and LLM-based summarization, resulting in a structured asset library. 2. Planner: Matches assets and arranges them into coherent layouts, iteratively generating panels with a zoom-in operation. 3. Painter-Commenter: The Painter generates panel-level bullet-content along with executable code, and renders the visual output, while the Commenter-a VLM with in-context reference-provides feedback to ensure layout coherence and prevent content overflow.}
124
  \end{figure}
125
 
126
  \end{block}
127
 
128
- \begin{block}{Evaluation \& Results}
129
- Our comprehensive evaluation reveals that \textbf\{PosterAgent\} outperforms existing systems across nearly all metrics, using \textcolor\{blue\}\{87\\% fewer tokens\}. While GPT-4o outputs are visually appealing, they suffer from \textcolor\{red\}\{noisy text\} and poor PaperQuiz scores. Our open-source variants, based on Qwen-2.5, achieve superior performance, highlighting the effectiveness of our \textit\{visual-semantic-aware asset library\} and \textit\{layout generation\}.
 
 
 
 
130
 
131
  \begin{figure}
132
  \centering
133
- \includegraphics[width=0.79\linewidth]{figures/paper-table-1.png}
134
  \end{figure}
135
 
136
  \end{block}
@@ -138,8 +156,28 @@ Our comprehensive evaluation reveals that \textbf\{PosterAgent\} outperforms ex
138
  \end{column}
139
  \separatorcolumn
140
  \begin{column}{\colwidth}
141
- \begin{block}{Conclusion}
142
- We present \textbf\{Paper2Poster\}, a new benchmark for poster generation, and the \textbf\{PosterAgent\} framework, which significantly enhances generation quality. Our findings chart clear directions for the next generation of fully automated poster-generation models, emphasizing the importance of \textit\{structured parsing\}, \textit\{hierarchical planning\}, and \textit\{visual feedback\}.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
143
  \end{block}
144
 
145
  \end{column}
 
11
 
12
  \usepackage[T1]{fontenc}
13
  \usepackage{lmodern}
14
+ \usepackage[size=custom,width=120,height=72,scale=1.0]{beamerposter}
15
  \usetheme{gemini}
16
  \usecolortheme{cam}
17
  \usepackage{graphicx}
 
22
  \pgfplotsset{compat=1.14}
23
  \usepackage{anyfontsize}
24
 
25
+ \definecolor{nipspurple}{RGB}{94,46,145}
26
+ \setbeamercolor{headline}{bg=white, fg=black}
27
+ \setbeamercolor{block title}{bg=nipspurple, fg=white}
28
+ \addtobeamertemplate{block begin}{
29
+ \setlength{\textpaddingtop}{0.2em}%
30
+ \setlength{\textpaddingbottom}{0.2em}%
31
+ }{}
32
  % ====================
33
  % Lengths
34
  % ====================
 
46
  % Title
47
  % ====================
48
 
49
+ \title{Paper2Poster: Towards Multimodal Poster}
50
 
51
+ \author{Wei Pang\textsuperscript{1}, Kevin Qinghong Lin\textsuperscript{2}, Xiangru Jian\textsuperscript{1}, Xi He\textsuperscript{1}, Philip Torr\textsuperscript{3}}
52
 
53
+ \institute[shortinst]{1 University of Waterloo; 2 National University of Singapore; 3 University of Oxford}
54
 
55
  % ====================
56
  % Footer (optional)
 
67
  % ====================
68
 
69
  % use this to include logos on the left and/or right side of the header:
70
+ \logoright{\includegraphics[height=5cm]{logos/right_logo.png}}
71
+ \logoleft{\includegraphics[height=4cm]{logos/left_logo.png}}
72
 
73
  % ====================
74
  % Body
 
76
 
77
 
78
  % --- injected font tweaks ---
79
+ \setbeamerfont{title}{size=\Huge}
80
  \setbeamerfont{author}{size=\Large}
81
  \setbeamerfont{institute}{size=\large}
82
  \setbeamerfont{block title}{size=\Large}
83
  \setbeamerfont{block body}{size=\large}
 
 
 
84
  \begin{document}
85
 
86
  % Refer to https://github.com/k4rtik/uchicago-poster
 
89
  {
90
  \begin{tikzpicture}[remember picture,overlay]
91
  \node [anchor=north west, inner sep=3cm] at ([xshift=0.0cm,yshift=1.0cm]current page.north west)
92
+ \end{tikzpicture}
 
 
 
 
93
  }
94
 
95
  \begin{frame}[t]
96
  \begin{columns}[t]
97
  \separatorcolumn
98
  \begin{column}{\colwidth}
99
+ \begin{block}{Why Posters Are Hard}
100
+ We target \textbf{single-page, multimodal compression} of \textit{20K+ tokens} into clear panels. Posters demand \textcolor{blue}{tight text–visual coupling}, \textbf{layout balance}, and \textit{readable density}. Pure LLM/VLM approaches \textcolor{red}{miss spatial feedback}, causing overflow and incoherence. We reveal that \textbf{visual-in-the-loop planning} is essential to preserve reading order, keep figures relevant, and sustain \textit{engagement} within hard space limits.
101
+
102
+ \begin{figure}
103
+ \centering
104
+ \includegraphics[width=0.80\linewidth]{figures/paper-picture-1.png}
105
+ \end{figure}
106
+
107
+ \end{block}
108
+
109
+ \begin{block}{Benchmark and Data}
110
+ We launch the \textbf{Paper2Poster Benchmark}: \textcolor{blue}{100 paper–poster pairs} spanning \textit{280 topics}. Average input: \textcolor{blue}{20,370 tokens, 22.6 pages}. Output posters compress text by \textcolor{blue}{14.4×} and figures by \textcolor{blue}{2.6×}. Evaluation covers \textbf{Visual Quality}, \textbf{Textual Coherence}, \textbf{VLM-as-Judge}, and \textbf{PaperQuiz}. This suite spotlights \textit{semantic alignment}, \textbf{fluency}, and \textcolor{blue}{reader comprehension}.
111
+
112
+ \begin{figure}
113
+ \centering
114
+ \includegraphics[width=0.80\linewidth]{figures/paper-picture-6.png}
115
+ \end{figure}
116
+
117
  \end{block}
118
 
119
+ \begin{block}{PaperQuiz: What Matters}
120
+ We generate \textcolor{blue}{100 MCQs/paper}: \textbf{50 verbatim} + \textbf{50 interpretive}. Multiple VLM readers simulate \textit{novice-to-expert} audiences and answer from the poster only. Scores are length-penalized to reward \textbf{dense clarity}. Results \textbf{correlate with human judgment}, proving PaperQuiz captures \textcolor{blue}{information delivery} beyond surface visuals and discourages \textcolor{red}{verbose, unfocused designs}.
121
 
122
  \begin{figure}
123
  \centering
124
+ \includegraphics[width=0.80\linewidth]{figures/paper-picture-7.png}
 
125
  \end{figure}
126
 
127
  \end{block}
 
129
  \end{column}
130
  \separatorcolumn
131
  \begin{column}{\colwidth}
132
+ \begin{block}{PosterAgent Pipeline}
133
+ Our \textbf{top-down, visual-in-the-loop} agent compresses long papers into coherent posters. \textbf{Parser} builds a structured asset library. \textbf{Planner} aligns text–visual pairs and produces a \textcolor{blue}{binary-tree layout}. \textbf{Painter–Commenter} renders panels via code and uses VLM feedback to fix \textcolor{red}{overflow} and misalignment. The result: \textbf{balanced, legible}, editable posters.
134
 
135
  \begin{figure}
136
  \centering
137
+ \includegraphics[width=0.80\linewidth]{figures/paper-picture-8.png}
 
138
  \end{figure}
139
 
140
  \end{block}
141
 
142
+ \begin{block}{Parser: Structured Assets}
143
+ We distill PDFs into \textbf{section synopses} and \textit{figure/table assets} using \textcolor{blue}{MARKER} and \textcolor{blue}{DOCLING}, then LLM summarization. The asset library preserves \textbf{hierarchy} and \textit{semantics} while shrinking context for efficient planning. This step boosts \textbf{visual-semantic matching} and reduces \textcolor{red}{noise}, enabling reliable downstream \textit{layout reasoning}.
144
+ \end{block}
145
+
146
+ \begin{block}{Planner: Layout Mastery}
147
+ We semantically match \textbf{sections ↔ figures} and allocate space via a \textcolor{blue}{binary-tree layout} that preserves \textit{reading order}, aspect ratios, and \textbf{content length} estimates. Panels are populated iteratively, ensuring \textbf{text brevity} and \textit{visual balance}. This strategy stabilizes coordinates and avoids \textcolor{red}{LLM numeric drift} in absolute placements.
148
 
149
  \begin{figure}
150
  \centering
151
+ \includegraphics[width=0.80\linewidth]{figures/paper-picture-30.png}
152
  \end{figure}
153
 
154
  \end{block}
 
156
  \end{column}
157
  \separatorcolumn
158
  \begin{column}{\colwidth}
159
+ \begin{block}{Painter–Commenter Loop}
160
+ The \textbf{Painter} turns section–figure pairs into crisp bullets and executable \textcolor{blue}{python-pptx} code, rendering draft panels. The \textbf{Commenter} VLM zooms into panels, using \textit{in-context examples} to flag \textcolor{red}{overflow} or \textcolor{red}{blankness}. Iterations continue until \textbf{fit and alignment} are achieved, producing \textit{readable, compact} panels with minimal revision cycles.
161
+
162
+ \begin{figure}
163
+ \centering
164
+ \includegraphics[width=0.80\linewidth]{figures/paper-picture-61.png}
165
+ \end{figure}
166
+
167
+ \end{block}
168
+
169
+ \begin{block}{Results: Stronger, Leaner}
170
+ Our open-source variants beat \textcolor{blue}{4o-driven multi-agents} on most metrics, with \textcolor{blue}{87\% fewer tokens}. We hit \textbf{state-of-the-art figure relevance}, near-\textit{GT} visual similarity, and \textbf{high VLM-as-Judge} scores. PaperQuiz confirms \textbf{better knowledge transfer}. Cost is tiny: \textcolor{blue}{\$0.0045–\$0.55/poster}. Key bottleneck remains \textcolor{red}{Engagement}, guiding future design.
171
+
172
+ \begin{figure}
173
+ \centering
174
+ \includegraphics[width=0.80\linewidth]{figures/paper-table-1.png}
175
+ \end{figure}
176
+
177
+ \end{block}
178
+
179
+ \begin{block}{Limits and Next Steps}
180
+ Current bottleneck: \textbf{sequential panel refinement} slows throughput (\textasciitilde{}\textcolor{blue}{4.5 min/doc}). We plan \textbf{panel-level parallelism}, \textit{external knowledge} integration (e.g., OpenReview), and \textbf{human-in-the-loop} editing for higher \textcolor{blue}{engagement}. These upgrades aim to boost \textbf{runtime, interactivity}, and \textit{visual storytelling}, pushing toward fully automated \textbf{author-grade posters}.
181
  \end{block}
182
 
183
  \end{column}
posterbuilder/latex_proj/poster_output_fix.tex DELETED
@@ -1,139 +0,0 @@
1
- %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
2
-
3
- % LaTeX Template for IAHR YPN Congress
4
-
5
- %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
6
-
7
- %----------------------------------------------------------------------------------------
8
- % PACKAGES AND OTHER DOCUMENT CONFIGURATIONS
9
- %----------------------------------------------------------------------------------------
10
-
11
- \documentclass[landscape,a0paper,fontscale=0.31,margin=7mm]{baposter} % Adjust the font scale/size here
12
-
13
- \usepackage{graphicx} % Required for including images
14
- \graphicspath{{figures/}} % Directory in which figures are stored
15
-
16
- \usepackage{hyperref}
17
- \hypersetup{colorlinks, citecolor=blue, filecolor=blue, linkcolor=blue, urlcolor=blue}
18
-
19
- \usepackage{amsmath} % For typesetting math
20
- \usepackage{amssymb} % Adds new symbols to be used in math mode
21
-
22
- \usepackage{booktabs} % Top and bottom rules for tables
23
- \usepackage{enumitem} % Used to reduce itemize/enumerate spacing
24
- \usepackage{palatino} % Use the Palatino font
25
- \usepackage[font=small,labelfont=bf]{caption} % Required for specifying captions to tables and figures
26
-
27
- \usepackage{multicol} % Required for multiple columns
28
- \setlength{\columnsep}{1.5em} % Slightly increase the space between columns
29
- \setlength{\columnseprule}{0mm} % No horizontal rule between columns
30
-
31
- \usepackage{tikz} % Required for flow chart
32
- \usetikzlibrary{shapes,arrows} % Tikz libraries required for the flow chart in the template
33
-
34
- \newcommand{\compresslist}{ % Define a command to reduce spacing within itemize/enumerate environments, this is used right after \begin{itemize} or \begin{enumerate}
35
- \setlength{\itemsep}{1pt}
36
- \setlength{\parskip}{0pt}
37
- \setlength{\parsep}{0pt}
38
- }
39
-
40
- \definecolor{lightblue}{rgb}{0.145,0.6666,1} % Defines the color used for content box headers
41
-
42
- \begin{document}
43
-
44
- \begin{poster}
45
- {headerborder=closed, % Adds a border around the header of content boxes
46
- colspacing=0.6em, % Column spacing
47
- bgColorOne=white, % Background color for the gradient on the left side of the poster
48
- bgColorTwo=white, % Background color for the gradient on the right side of the poster
49
- borderColor=lightblue, % Border color
50
- headerColorOne=black, % Background color for the header in the content boxes (left side)
51
- headerColorTwo=lightblue, % Background color for the header in the content boxes (right side)
52
- headerFontColor=white, % Text color for the header text in the content boxes
53
- boxColorOne=white, % Background color of the content boxes
54
- textborder=roundedleft, % Format of the border around content boxes, can be: none, bars, coils, triangles, rectangle, rounded, roundedsmall, roundedright or faded
55
- eyecatcher=true, % Set to false for ignoring the left logo in the title and move the title left
56
- headerheight=0.1\textheight, % Height of the header
57
- headershape=roundedright, % Specify the rounded corner in the content box headers, can be: rectangle, small-rounded, roundedright, roundedleft or rounded
58
- headerfont=\Large\bf\textsc, % Large, bold and sans serif font in the headers of content boxes
59
- %textfont={\setlength{\parindent}{1.5em}}, % Uncomment for paragraph indentation
60
- linewidth=2pt % Width of the border lines around content boxes,
61
- columns=3}
62
- %----------------------------------------------------------------------------------------
63
- % TITLE SECTION
64
- %----------------------------------------------------------------------------------------
65
- %
66
- {\includegraphics[height=6em]{YPN_logo.jpg}} % First university/lab logo on the left
67
- {\bfseries \LARGE \textsc{A Cat Is A Cat (Not A Dog!): \\ Unraveling Information Mix-ups in Text-to-Image Encoders through Causal \\ Analysis and Embedding Optimization}} % Poster title
68
- {\textsc{Chieh-Yun Chen, Chiang Tseng, Li-Wu Tsao, Hong-Han Shuai}\\ \textsc{National Yang Ming Chiao Tung University, Georgia Institute of Technology}} % Author names and institution
69
- {\includegraphics[height=6em]{Institution_logo.png}}
70
- \headerbox{Abstract}{name=abstract,column=0,row=0,span=1}{
71
- This paper analyzes the impact of causal manner in the text encoder of text-to-image (T2I) diffusion models, which can lead to information bias and loss. We propose a text embedding balance optimization method with a 125.42\% improvement on information balance in stable diffusion. A new automatic evaluation metric is introduced, achieving 81\% concordance with human assessments.
72
- }
73
-
74
-
75
- \headerbox{Preliminaries}{name=preliminaries,column=0,below=abstract,span=1}{
76
- Text-to-image diffusion models include a text encoder, a variational autoencoder, and a denoising UNet. The causal masking manner in the text encoder causes information bias, as each token only has information from previous tokens.
77
-
78
- \begin{center}
79
- \includegraphics[width=0.90\linewidth]{paper-picture-2.png}
80
- \captionof{figure}{Overview of the text-to-image generative model, including the details of the causal manner in attention mechanism. Because of the causal nature of the embedding, information is accumulated from the starting token through the end of the sequence, resulting in bias in the earlier token. To balance the critical information, we propose text embedding optimization for purifying the object token with equal weights within their corresponding embedding dimension.}
81
- \vspace{-0.2em}
82
- \end{center}
83
-
84
- }
85
-
86
-
87
- \headerbox{Experiments}{name=experiments,column=0,below=preliminaries,span=1,above=bottom}{
88
- We compare our method with baselines like Stable Diffusion and SynGen, focusing on information balance rather than surpassing existing methods. Our automatic evaluation metric, validated by human assessment, effectively measures object presence and accuracy.
89
- }
90
-
91
-
92
- \headerbox{TEBOpt}{name=tebopt,column=1,row=0,span=1}{
93
- TEBOpt aims to balance critical information in text embeddings by optimizing object token embeddings to prevent mixing and work alongside image latent optimization techniques to address object disappearance.
94
-
95
- \begin{center}
96
- \includegraphics[width=0.90\linewidth]{paper-table-4.png}
97
- \vspace{-0.2em}
98
- \end{center}
99
-
100
- }
101
-
102
-
103
- \headerbox{Qualitative \& Quantitative \\ Results}{name=qualitativeandquantitativeresults,column=1,below=tebopt,span=1}{
104
- TEBOpt improves object balance in generated images, reducing mixture and missing issues. It enhances token embedding similarity and cross-attention map distance, confirming its effectiveness in addressing information bias.
105
-
106
- \begin{center}
107
- \includegraphics[width=0.90\linewidth]{paper-picture-13.png}
108
- \captionof{figure}{(a) The cosine similarity of text embedding from single word. (b) The KL distance of cross-attention maps that are triggered by two words. The data is ordered by their text embedding similarity.}
109
- \vspace{-0.2em}
110
- \end{center}
111
-
112
- }
113
-
114
-
115
- \headerbox{Introduction}{name=introduction,column=1,below=qualitativeandquantitativeresults,span=1}{
116
- Text-to-image diffusion models have gained attention, but the role of text embedding in generating multiple objects remains underexplored. This paper investigates how text embeddings influence semantic outcomes, identifying issues of information bias and loss. We propose Text Embedding Balance Optimization (TEBOpt) to address these issues and improve image generation.
117
- }
118
-
119
-
120
- \headerbox{Discussion}{name=discussion,column=2,row=0,span=1}{
121
- Text embedding similarity affects cross-attention maps' distance, with similar embeddings leading to object mixture. Our findings highlight the need for optimized text embeddings to improve image generation quality.
122
-
123
- \begin{center}
124
- \includegraphics[width=0.90\linewidth]{paper-picture-9.png}
125
- \captionof{figure}{Masking text embedding to identify the contribution of critical tokens, e.g., cat/dog, and special tokens, e.g., <sot>, <eot>, <pad>. The first row and the second row both contain cat and dog inside prompt but in different order. The analysis shows that special tokens contain general information about the given prompt. However, the cat/dog tokens carry more weight than the special tokens. In the last two columns, where one of the animal token embeddings is masked while retaining the special tokens' embedding, the generated image is predominantly influenced by the remaining animal's token embedding.}
126
- \vspace{-0.2em}
127
- \end{center}
128
-
129
- }
130
-
131
-
132
- \headerbox{Conclusion}{name=conclusion,column=2,below=discussion,span=1,above=bottom}{
133
- Our study reveals that causal processing of text embedding leads to biases and loss. TEBOpt effectively eliminates problematic information, improving information balance in stable diffusion by 125.42\% while preserving object coexistence.
134
- }
135
-
136
-
137
- \end{poster}
138
-
139
- \end{document}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
posterbuilder/latex_proj/poster_output_new.tex DELETED
@@ -1,193 +0,0 @@
1
- %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
2
-
3
- % LaTeX Template for IAHR YPN Congress
4
-
5
- %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
6
-
7
- %----------------------------------------------------------------------------------------
8
- % PACKAGES AND OTHER DOCUMENT CONFIGURATIONS
9
- %----------------------------------------------------------------------------------------
10
-
11
- \documentclass[landscape,a0paper,fontscale=0.31]{baposter} % Adjust the font scale/size here
12
-
13
- \usepackage{graphicx} % Required for including images
14
- \graphicspath{{figures/}} % Directory in which figures are stored
15
-
16
- \usepackage{hyperref}
17
- \hypersetup{colorlinks, citecolor=blue, filecolor=blue, linkcolor=blue, urlcolor=blue}
18
-
19
- \usepackage{amsmath} % For typesetting math
20
- \usepackage{amssymb} % Adds new symbols to be used in math mode
21
-
22
- \usepackage{booktabs} % Top and bottom rules for tables
23
- \usepackage{enumitem} % Used to reduce itemize/enumerate spacing
24
- \usepackage{palatino} % Use the Palatino font
25
- \usepackage[font=small,labelfont=bf]{caption} % Required for specifying captions to tables and figures
26
-
27
- \usepackage{multicol} % Required for multiple columns
28
- \setlength{\columnsep}{1.5em} % Slightly increase the space between columns
29
- \setlength{\columnseprule}{0mm} % No horizontal rule between columns
30
-
31
- \usepackage{tikz} % Required for flow chart
32
- \usetikzlibrary{shapes,arrows} % Tikz libraries required for the flow chart in the template
33
-
34
- \newcommand{\compresslist}{ % Define a command to reduce spacing within itemize/enumerate environments, this is used right after \begin{itemize} or \begin{enumerate}
35
- \setlength{\itemsep}{1pt}
36
- \setlength{\parskip}{0pt}
37
- \setlength{\parsep}{0pt}
38
- }
39
-
40
- \definecolor{lightblue}{rgb}{0.145,0.6666,1} % Defines the color used for content box headers
41
-
42
- \begin{document}
43
-
44
- \begin{poster}
45
- {
46
- headerborder=closed, % Adds a border around the header of content boxes
47
- colspacing=1em, % Column spacing
48
- bgColorOne=white, % Background color for the gradient on the left side of the poster
49
- bgColorTwo=white, % Background color for the gradient on the right side of the poster
50
- borderColor=lightblue, % Border color
51
- headerColorOne=black, % Background color for the header in the content boxes (left side)
52
- headerColorTwo=lightblue, % Background color for the header in the content boxes (right side)
53
- headerFontColor=white, % Text color for the header text in the content boxes
54
- boxColorOne=white, % Background color of the content boxes
55
- textborder=roundedleft, % Format of the border around content boxes, can be: none, bars, coils, triangles, rectangle, rounded, roundedsmall, roundedright or faded
56
- eyecatcher=true, % Set to false for ignoring the left logo in the title and move the title left
57
- headerheight=0.1\textheight, % Height of the header
58
- headershape=roundedright, % Specify the rounded corner in the content box headers, can be: rectangle, small-rounded, roundedright, roundedleft or rounded
59
- headerfont=\Large\bf\textsc, % Large, bold and sans serif font in the headers of content boxes
60
- %textfont={\setlength{\parindent}{1.5em}}, % Uncomment for paragraph indentation
61
- linewidth=2pt % Width of the border lines around content boxes
62
- }
63
- %----------------------------------------------------------------------------------------
64
- % TITLE SECTION
65
- %----------------------------------------------------------------------------------------
66
- %
67
- {\includegraphics[height=6em]{YPN_logo.jpg}} % First university/lab logo on the left
68
- {\bfseries \LARGE \textsc{A Cat Is A Cat (Not A Dog!): \\ Unraveling Information Mix-ups in Text-to-Image Encoders through Causal \\ Analysis and Embedding Optimization}} % Poster title
69
- {\textsc{Chieh-Yun Chen, Chiang Tseng, Li-Wu Tsao, Hong-Han Shuai}\\ \textsc{National Yang Ming Chiao Tung University, Georgia Institute of Technology}} % Author names and institution
70
- {\includegraphics[height=6em]{Institution_logo.png}
71
- \headerbox{Abstract}{name=abstract,column=0,row=0}{{
72
- This paper analyzes the impact of causal manner in the text encoder of text-to-image (T2I) diffusion models, which can lead to information bias and loss. We propose a text embedding balance optimization method with a 125.42\% improvement on information balance in stable diffusion. A new automatic evaluation metric is introduced, achieving 81\% concordance with human assessments.
73
- }}
74
-
75
-
76
- \headerbox{Preliminaries}{name=preliminaries,column=0,row=1}{{
77
- Text-to-image diffusion models include a text encoder, a variational autoencoder, and a denoising UNet. The causal masking manner in the text encoder causes information bias, as each token only has information from previous tokens.
78
-
79
- \begin{center}
80
- \includegraphics[width=0.76\linewidth]{figures/paper-picture-2.png}
81
- \captionof{figure}{Overview of the text-to-image generative model, including the details of the causal manner in attention mechanism. Because of the causal nature of the embedding, information is accumulated from the starting token through the end of the sequence, resulting in bias in the earlier token. To balance the critical information, we propose text embedding optimization for purifying the object token with equal weights within their corresponding embedding dimension.}
82
- \end{center}
83
-
84
- }}
85
-
86
-
87
- \headerbox{Experiments}{name=experiments,column=0,row=2}{{
88
- We compare our method with baselines like Stable Diffusion and SynGen, focusing on information balance rather than surpassing existing methods. Our automatic evaluation metric, validated by human assessment, effectively measures object presence and accuracy.
89
- }}
90
-
91
-
92
- \headerbox{TEBOpt}{name=tebopt,column=1,row=0}{{
93
- TEBOpt aims to balance critical information in text embeddings by optimizing object token embeddings to prevent mixing and work alongside image latent optimization techniques to address object disappearance.
94
-
95
- \begin{center}
96
- \includegraphics[width=0.61\linewidth]{figures/paper-table-4.png}
97
- \end{center}
98
-
99
- }}
100
-
101
-
102
- \headerbox{Qualitative \& Quantitative Results}{name=qualitativeandquantitativeresults,column=1,row=1}{{
103
- TEBOpt improves object balance in generated images, reducing mixture and missing issues. It enhances token embedding similarity and cross-attention map distance, confirming its effectiveness in addressing information bias.
104
-
105
- \begin{center}
106
- \includegraphics[width=0.80\linewidth]{figures/paper-picture-13.png}
107
- \captionof{figure}{(a) The cosine similarity of text embedding from single word. (b) The KL distance of cross-attention maps that are triggered by two words. The data is ordered by their text embedding similarity.}
108
- \end{center}
109
-
110
- }}
111
-
112
-
113
- \headerbox{Introduction}{name=introduction,column=1,row=2}{{
114
- Text-to-image diffusion models have gained attention, but the role of text embedding in generating multiple objects remains underexplored. This paper investigates how text embeddings influence semantic outcomes, identifying issues of information bias and loss. We propose Text Embedding Balance Optimization (TEBOpt) to address these issues and improve image generation.
115
- }}
116
-
117
-
118
- \headerbox{Discussion}{name=discussion,column=2,row=0}{{
119
- Text embedding similarity affects cross-attention maps' distance, with similar embeddings leading to object mixture. Our findings highlight the need for optimized text embeddings to improve image generation quality.
120
-
121
- \begin{center}
122
- \includegraphics[width=0.60\linewidth]{figures/paper-picture-9.png}
123
- \captionof{figure}{Masking text embedding to identify the contribution of critical tokens, e.g., cat/dog, and special tokens, e.g., <sot>, <eot>, <pad>. The first row and the second row both contain cat and dog inside prompt but in different order. The analysis shows that special tokens contain general information about the given prompt. However, the cat/dog tokens carry more weight than the special tokens. In the last two columns, where one of the animal token embeddings is masked while retaining the special tokens' embedding, the generated image is predominantly influenced by the remaining animal's token embedding.}
124
- \end{center}
125
-
126
- }}
127
-
128
-
129
- \headerbox{Conclusion}{name=conclusion,column=2,row=1}{{
130
- Our study reveals that causal processing of text embedding leads to biases and loss. TEBOpt effectively eliminates problematic information, improving information balance in stable diffusion by 125.42\% while preserving object coexistence.
131
- }}
132
-
133
- } % Second university/lab logo on the right
134
-
135
- %----------------------------------------------------------------------------------------
136
- % ABSTRACT
137
- %----------------------------------------------------------------------------------------
138
-
139
-
140
-
141
- %----------------------------------------------------------------------------------------
142
- % INTRODUCTION
143
- %----------------------------------------------------------------------------------------
144
-
145
-
146
-
147
- %----------------------------------------------------------------------------------------
148
- % RESULTS 1
149
- %----------------------------------------------------------------------------------------
150
-
151
-
152
-
153
- %----------------------------------------------------------------------------------------
154
- % REFERENCES
155
- %----------------------------------------------------------------------------------------
156
-
157
-
158
-
159
- %----------------------------------------------------------------------------------------
160
- % FUTURE RESEARCH
161
- %----------------------------------------------------------------------------------------
162
-
163
-
164
-
165
- %----------------------------------------------------------------------------------------
166
- % CONTACT INFORMATION
167
- %----------------------------------------------------------------------------------------
168
-
169
-
170
-
171
- %----------------------------------------------------------------------------------------
172
- % CONCLUSION
173
- %----------------------------------------------------------------------------------------
174
-
175
-
176
-
177
- %----------------------------------------------------------------------------------------
178
- % MATERIALS AND METHODS
179
- %----------------------------------------------------------------------------------------
180
-
181
-
182
-
183
- %----------------------------------------------------------------------------------------
184
- % RESULTS 2
185
- %----------------------------------------------------------------------------------------
186
-
187
-
188
-
189
- %----------------------------------------------------------------------------------------
190
-
191
- \end{poster}
192
-
193
- \end{document}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
posterbuilder/poster_content.json CHANGED
@@ -1,33 +1,45 @@
1
  {
2
  "meta": {
3
- "poster_title": "Paper2Poster: Towards Multimodal Poster Automation from Scientific Papers",
4
- "authors": "Wei Pang, Kevin Qinghong Lin, Xiangru Jian, Xi He, Philip Torr",
5
- "affiliations": "1 University of Waterloo, 2 National University of Singapore, 3 University of Oxford"
6
  },
7
  "sections": [
8
  {
9
- "title": "Poster Title & Author",
10
- "content": "This poster presents \textbf{Paper2Poster}, a novel approach for generating academic posters from scientific papers. Authors include Wei Pang, Kevin Qinghong Lin, Xiangru Jian, Xi He, and Philip Torr, affiliated with \textit{University of Waterloo}, \textit{National University of Singapore}, and \textit{University of Oxford}."
11
  },
12
  {
13
- "title": "Introduction",
14
- "content": "Academic posters are crucial for \textbf{scientific communication}, allowing rapid dissemination of key findings. Unlike slide decks, posters must condense entire papers into a single page, requiring \textit{multi-modal context handling}, \textcolor{red}{tight text-graphics interleaving}, and \textcolor{red}{spatial constraint respect}. Existing VLM- or LLM-only approaches lack explicit visual feedback, making it difficult to maintain logical flow and legibility."
15
  },
16
  {
17
- "title": "Benchmark & Metrics",
18
- "content": "We introduce the \textbf{Paper2Poster Benchmark}, the first benchmark for poster generation, evaluating outputs on \textcolor{blue}{Visual Quality}, \textcolor{blue}{Textual Coherence}, \textcolor{blue}{Holistic Assessment}, and \textcolor{blue}{PaperQuiz}. This benchmark pairs recent conference papers with author-designed posters, enabling systematic comparison and evaluation of generated posters."
19
  },
20
  {
21
- "title": "PosterAgent Framework",
22
- "content": "Our proposed \textbf{PosterAgent} framework is a \textit{multi-agent pipeline} that transforms scientific papers into structured visual posters. It consists of three components: \textcolor{blue}{Parser}, \textcolor{blue}{Planner}, and \textcolor{blue}{Painter-Commenter}. The Parser distills the paper into a structured asset library, the Planner aligns text-visual pairs into a binary-tree layout, and the Painter-Commenter loop refines each panel using VLM feedback."
23
  },
24
  {
25
- "title": "Evaluation & Results",
26
- "content": "Our comprehensive evaluation reveals that \textbf{PosterAgent} outperforms existing systems across nearly all metrics, using \textcolor{blue}{87\\% fewer tokens}. While GPT-4o outputs are visually appealing, they suffer from \textcolor{red}{noisy text} and poor PaperQuiz scores. Our open-source variants, based on Qwen-2.5, achieve superior performance, highlighting the effectiveness of our \textit{visual-semantic-aware asset library} and \textit{layout generation}."
27
  },
28
  {
29
- "title": "Conclusion",
30
- "content": "We present \textbf{Paper2Poster}, a new benchmark for poster generation, and the \textbf{PosterAgent} framework, which significantly enhances generation quality. Our findings chart clear directions for the next generation of fully automated poster-generation models, emphasizing the importance of \textit{structured parsing}, \textit{hierarchical planning}, and \textit{visual feedback}."
 
 
 
 
 
 
 
 
 
 
 
 
31
  }
32
  ]
33
  }
 
1
  {
2
  "meta": {
3
+ "poster_title": "Paper2Poster: Towards Multimodal Poster",
4
+ "authors": "Wei Pang\\textsuperscript{1}, Kevin Qinghong Lin\\textsuperscript{2}, Xiangru Jian\\textsuperscript{1}, Xi He\\textsuperscript{1}, Philip Torr\\textsuperscript{3}",
5
+ "affiliations": "1 University of Waterloo; 2 National University of Singapore; 3 University of Oxford"
6
  },
7
  "sections": [
8
  {
9
+ "title": "Why Posters Are Hard",
10
+ "content": "We target \\textbf{single-page, multimodal compression} of \\textit{20K+ tokens} into clear panels. Posters demand \\textcolor{blue}{tight text\u2013visual coupling}, \\textbf{layout balance}, and \\textit{readable density}. Pure LLM/VLM approaches \\textcolor{red}{miss spatial feedback}, causing overflow and incoherence. We reveal that \\textbf{visual-in-the-loop planning} is essential to preserve reading order, keep figures relevant, and sustain \\textit{engagement} within hard space limits."
11
  },
12
  {
13
+ "title": "Benchmark and Data",
14
+ "content": "We launch the \\textbf{Paper2Poster Benchmark}: \\textcolor{blue}{100 paper\u2013poster pairs} spanning \\textit{280 topics}. Average input: \\textcolor{blue}{20,370 tokens, 22.6 pages}. Output posters compress text by \\textcolor{blue}{14.4\u00d7} and figures by \\textcolor{blue}{2.6\u00d7}. Evaluation covers \\textbf{Visual Quality}, \\textbf{Textual Coherence}, \\textbf{VLM-as-Judge}, and \\textbf{PaperQuiz}. This suite spotlights \\textit{semantic alignment}, \\textbf{fluency}, and \\textcolor{blue}{reader comprehension}."
15
  },
16
  {
17
+ "title": "PaperQuiz: What Matters",
18
+ "content": "We generate \\textcolor{blue}{100 MCQs/paper}: \\textbf{50 verbatim} + \\textbf{50 interpretive}. Multiple VLM readers simulate \\textit{novice-to-expert} audiences and answer from the poster only. Scores are length-penalized to reward \\textbf{dense clarity}. Results \\textbf{correlate with human judgment}, proving PaperQuiz captures \\textcolor{blue}{information delivery} beyond surface visuals and discourages \\textcolor{red}{verbose, unfocused designs}."
19
  },
20
  {
21
+ "title": "PosterAgent Pipeline",
22
+ "content": "Our \\textbf{top-down, visual-in-the-loop} agent compresses long papers into coherent posters. \u2022 \\textbf{Parser} builds a structured asset library. \u2022 \\textbf{Planner} aligns text\u2013visual pairs and produces a \\textcolor{blue}{binary-tree layout}. \u2022 \\textbf{Painter\u2013Commenter} renders panels via code and uses VLM feedback to fix \\textcolor{red}{overflow} and misalignment. The result: \\textbf{balanced, legible}, editable posters."
23
  },
24
  {
25
+ "title": "Parser: Structured Assets",
26
+ "content": "We distill PDFs into \\textbf{section synopses} and \\textit{figure/table assets} using \\textcolor{blue}{MARKER} and \\textcolor{blue}{DOCLING}, then LLM summarization. The asset library preserves \\textbf{hierarchy} and \\textit{semantics} while shrinking context for efficient planning. This step boosts \\textbf{visual-semantic matching} and reduces \\textcolor{red}{noise}, enabling reliable downstream \\textit{layout reasoning}."
27
  },
28
  {
29
+ "title": "Planner: Layout Mastery",
30
+ "content": "We semantically match \\textbf{sections \u2194 figures} and allocate space via a \\textcolor{blue}{binary-tree layout} that preserves \\textit{reading order}, aspect ratios, and \\textbf{content length} estimates. Panels are populated iteratively, ensuring \\textbf{text brevity} and \\textit{visual balance}. This strategy stabilizes coordinates and avoids \\textcolor{red}{LLM numeric drift} in absolute placements."
31
+ },
32
+ {
33
+ "title": "Painter\u2013Commenter Loop",
34
+ "content": "The \\textbf{Painter} turns section\u2013figure pairs into crisp bullets and executable \\textcolor{blue}{python-pptx} code, rendering draft panels. The \\textbf{Commenter} VLM zooms into panels, using \\textit{in-context examples} to flag \\textcolor{red}{overflow} or \\textcolor{red}{blankness}. Iterations continue until \\textbf{fit and alignment} are achieved, producing \\textit{readable, compact} panels with minimal revision cycles."
35
+ },
36
+ {
37
+ "title": "Results: Stronger, Leaner",
38
+ "content": "Our open-source variants beat \\textcolor{blue}{4o-driven multi-agents} on most metrics, with \\textcolor{blue}{87\\% fewer tokens}. We hit \\textbf{state-of-the-art figure relevance}, near-\\textit{GT} visual similarity, and \\textbf{high VLM-as-Judge} scores. PaperQuiz confirms \\textbf{better knowledge transfer}. Cost is tiny: \\textcolor{blue}{\\$0.0045\u2013\\$0.55/poster}. Key bottleneck remains \\textcolor{red}{Engagement}, guiding future design."
39
+ },
40
+ {
41
+ "title": "Limits and Next Steps",
42
+ "content": "Current bottleneck: \\textbf{sequential panel refinement} slows throughput (~\\textcolor{blue}{4.5 min/doc}). We plan \\textbf{panel-level parallelism}, \\textit{external knowledge} integration (e.g., OpenReview), and \\textbf{human-in-the-loop} editing for higher \\textcolor{blue}{engagement}. These upgrades aim to boost \\textbf{runtime, interactivity}, and \\textit{visual storytelling}, pushing toward fully automated \\textbf{author-grade posters}."
43
  }
44
  ]
45
  }
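The inline LaTeX markup stored in each "content" string above (\textbf, \textit, \textcolor, with backslashes escaped as \\ in JSON) is passed through to the generated poster source, so each section of poster_content.json ends up inside one beamer block of the built template. As a minimal sketch of that mapping, assuming the block/column structure visible at the top of this diff, the closing "Limits and Next Steps" entry would render roughly as:

% Illustrative only: the layout planner decides the actual column widths and block order.
\begin{column}{0.32\textwidth}
  \begin{block}{Limits and Next Steps}
    Current bottleneck: \textbf{sequential panel refinement} slows throughput
    (\textasciitilde{}\textcolor{blue}{4.5 min/doc}). We plan \textbf{panel-level parallelism},
    \textit{external knowledge} integration (e.g., OpenReview), and \textbf{human-in-the-loop}
    editing for higher \textcolor{blue}{engagement}.
  \end{block}
\end{column}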
requirements.txt ADDED
@@ -0,0 +1,94 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ========= Core Runtime =========
2
+ python>=3.10
3
+ numpy==1.26.4
4
+ pandas
5
+ torch==2.2.2
6
+ torchvision==0.17.2
7
+ Pillow==10.4.0
8
+ opencv-python==4.11.0.86
9
+ pdf2image==1.17.0
10
+ PyMuPDF==1.25.2
11
+ moviepy==1.0.3
12
+ asyncio==3.4.3
13
+ playwright==1.51.0
14
+ aiohttp==3.11.11
15
+ aiofiles==24.1.0
16
+ tqdm==4.67.1
17
+ matplotlib==3.10.0
18
+ scikit-learn==1.6.1
19
+ scipy==1.15.1
20
+ sentence-transformers==3.3.1
21
+ transformers==4.48.0
22
+
23
+ # ========= ML / LLM Frameworks =========
24
+ accelerate
25
+ huggingface-hub==0.27.1
26
+ openai==1.59.8
27
+ langchain==0.3.17
28
+ langchain-community==0.3.16
29
+ langchain-core==0.3.33
30
+ langchain-openai==0.3.3
31
+
32
+ # ========= Image / Layout / OCR =========
33
+ layoutparser==0.3.4
34
+ easyocr
35
+ pytesseract==0.3.13
36
+ shapely==2.0.7
37
+ WeasyPrint==52.5
38
+ CairoSVG==2.7.1
39
+
40
+ # ========= PDF / DOC / PPT =========
41
+ python-docx==1.1.2
42
+ python-pptx @ git+https://github.com/Force1ess/python-pptx@dc356685d4d210a10abe1ffab3c21315cdfae63d
43
+ pypdf==5.2.0
44
+ pypandoc==1.15
45
+ openpyxl==3.1.5
46
+
47
+ # ========= Web / API / Async =========
48
+ fastapi==0.115.6
49
+ uvicorn==0.32.1
50
+ starlette==0.41.3
51
+ requests==2.32.3
52
+ httpx==0.27.2
53
+ aiohttp-cors==0.7.0
54
+ nest-asyncio==1.6.0
55
+
56
+ # ========= Poster2Video Specific =========
57
+ # camel-ai>=0.2.0
58
+ # f5_tts==1.1.6
59
+ # whisper==1.1.10
60
+ # whisperx
61
+ # mcp==1.10.1
62
+ # pydantic==2.10.6
63
+ # pydantic-core==2.23.4
64
+ # pyarrow==19.0.0
65
+
66
+ # ========= Poster2Poster Specific =========
67
+ agentops==0.3.26
68
+ arxiv==2.1.3
69
+ arxiv2text==0.1.14
70
+ pymilvus==2.5.4
71
+ peft==0.14.0
72
+ diffusers==0.25.1
73
+ einops==0.8.0
74
+ xformers==0.0.28.post3
75
+
76
+ # ========= Utils =========
77
+ filelock==3.16.1
78
+ regex==2024.11.6
79
+ pytz==2024.2
80
+ PyYAML==6.0.2
81
+ python-dateutil==2.9.0.post0
82
+ typing_extensions==4.12.2
83
+ uuid7==0.1.0
84
+ rich==13.9.4
85
+ coloredlogs==15.0.1
86
+ tenacity==9.0.0
87
+
88
+ # ========= Optional (Audio, OCR, etc.) =========
89
+ soundfile==0.13.1
90
+ pydub==0.25.1
91
+ ffmpeg-python==0.2.0
92
+
93
+ # ========= Required System Packages (apt install manually) =========
94
+ # sudo apt-get install -y poppler-utils libreoffice
template/LICENSE.md ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ The MIT License (MIT)
2
+ =====================
3
+
4
+ **Copyright (c) Anish Athalye (me@anishathalye.com)**
5
+
6
+ Permission is hereby granted, free of charge, to any person obtaining a copy of
7
+ this software and associated documentation files (the "Software"), to deal in
8
+ the Software without restriction, including without limitation the rights to
9
+ use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
10
+ of the Software, and to permit persons to whom the Software is furnished to do
11
+ so, subject to the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be included in all
14
+ copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22
+ SOFTWARE.
template/Makefile ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ .PHONY: main clean FORCE
2
+
3
+ main: poster.pdf
4
+
5
+ poster.pdf: FORCE
6
+ latexmk -pdflatex='lualatex -interaction nonstopmode' -pdf poster.tex
7
+
8
+ clean:
9
+ latexmk -pdf -C
template/beamercolorthemecam.sty ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ % Gemini theme
2
+ % https://github.com/anishathalye/gemini
3
+
4
+ % ====================
5
+ % Definitions
6
+ % ====================
7
+
8
+ % Colors from https://www.cam.ac.uk/brand-resources/guidelines/typography-and-colour/rgb-and-hex-references
9
+ \definecolor{camblue}{RGB}{0, 62, 114}
10
+
11
+ % Extra colors
12
+ \definecolor{lightgray}{RGB}{240, 240, 240}
13
+ \definecolor{lightorange}{RGB}{255, 245, 242}
14
+
15
+ % ====================
16
+ % Theme
17
+ % ====================
18
+
19
+ % Basic colors
20
+ \setbeamercolor{palette primary}{fg=black,bg=white}
21
+ \setbeamercolor{palette secondary}{fg=black,bg=white}
22
+ \setbeamercolor{palette tertiary}{bg=black,fg=white}
23
+ \setbeamercolor{palette quaternary}{fg=black,bg=white}
24
+ \setbeamercolor{structure}{fg=camblue}
25
+
26
+ % Headline
27
+ \setbeamercolor{headline}{fg=white,bg=camblue}
28
+
29
+ % Block
30
+ \setbeamercolor{block title}{fg=camblue,bg=white}
31
+ \setbeamercolor{block separator}{bg=black}
32
+ \setbeamercolor{block body}{fg=black,bg=white}
33
+
34
+ % Alert Block
35
+ \setbeamercolor{block alerted title}{fg=camblue,bg=lightorange}
36
+ \setbeamercolor{block alerted separator}{bg=black}
37
+ \setbeamercolor{block alerted body}{fg=black,bg=lightorange}
38
+
39
+ % Example Block
40
+ \setbeamercolor{block example title}{fg=camblue,bg=lightgray}
41
+ \setbeamercolor{block example separator}{bg=black}
42
+ \setbeamercolor{block example body}{fg=black,bg=lightgray}
43
+
44
+ % Heading
45
+ \setbeamercolor{heading}{fg=black}
46
+
47
+ % Itemize
48
+ \setbeamercolor{item}{fg=camblue}
49
+
50
+ % Bibliography
51
+ \setbeamercolor{bibliography item}{fg=black}
52
+ \setbeamercolor{bibliography entry author}{fg=black}
53
+ \setbeamercolor{bibliography entry title}{fg=black}
54
+ \setbeamercolor{bibliography entry location}{fg=black}
55
+ \setbeamercolor{bibliography entry note}{fg=black}
template/beamercolorthemegemini.sty ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ % Gemini theme
2
+ % https://github.com/anishathalye/gemini
3
+
4
+ % ====================
5
+ % Definitions
6
+ % ====================
7
+
8
+ \definecolor{lightgray}{RGB}{245, 246, 250}
9
+ \definecolor{blue}{RGB}{64, 115, 158}
10
+ \definecolor{darkblue}{RGB}{39, 60, 117}
11
+ \definecolor{lightblue}{RGB}{232, 244, 255}
12
+
13
+ % ====================
14
+ % Theme
15
+ % ====================
16
+
17
+ % Basic colors
18
+ \setbeamercolor{palette primary}{fg=black,bg=white}
19
+ \setbeamercolor{palette secondary}{fg=black,bg=white}
20
+ \setbeamercolor{palette tertiary}{bg=black,fg=white}
21
+ \setbeamercolor{palette quaternary}{fg=black,bg=white}
22
+ \setbeamercolor{structure}{fg=darkblue}
23
+
24
+ % Headline
25
+ \setbeamercolor{headline}{fg=lightgray,bg=blue}
26
+ \setbeamercolor{headline rule}{bg=darkblue}
27
+
28
+ % Block
29
+ \setbeamercolor{block title}{fg=blue,bg=white}
30
+ \setbeamercolor{block separator}{bg=black}
31
+ \setbeamercolor{block body}{fg=black,bg=white}
32
+
33
+ % Alert Block
34
+ \setbeamercolor{block alerted title}{fg=blue,bg=lightblue}
35
+ \setbeamercolor{block alerted separator}{bg=black}
36
+ \setbeamercolor{block alerted body}{fg=black,bg=lightblue}
37
+
38
+ % Example Block
39
+ \setbeamercolor{block example title}{fg=blue,bg=lightgray}
40
+ \setbeamercolor{block example separator}{bg=black}
41
+ \setbeamercolor{block example body}{fg=black,bg=lightgray}
42
+
43
+ % Heading
44
+ \setbeamercolor{heading}{fg=black}
45
+
46
+ % Itemize
47
+ \setbeamercolor{item}{fg=darkblue}
48
+
49
+ % Bibliography
50
+ \setbeamercolor{bibliography item}{fg=black}
51
+ \setbeamercolor{bibliography entry author}{fg=black}
52
+ \setbeamercolor{bibliography entry title}{fg=black}
53
+ \setbeamercolor{bibliography entry location}{fg=black}
54
+ \setbeamercolor{bibliography entry note}{fg=black}
55
+ \setbeamertemplate{bibliography entry article}{}
56
+ \setbeamertemplate{bibliography entry title}{}
57
+ \setbeamertemplate{bibliography entry location}{}
58
+ \setbeamertemplate{bibliography entry note}{}
template/beamercolorthemelabsix.sty ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ % Gemini theme
2
+ % https://github.com/anishathalye/gemini
3
+
4
+ % ====================
5
+ % Definitions
6
+ % ====================
7
+
8
+ \definecolor{labsixorange}{RGB}{243, 111, 33}
9
+
10
+ % Extra colors
11
+ \definecolor{lightgray}{RGB}{240, 240, 240}
12
+ \definecolor{lightorange}{RGB}{255, 240, 230}
13
+
14
+ % ====================
15
+ % Theme
16
+ % ====================
17
+
18
+ % Basic colors
19
+ \setbeamercolor{palette primary}{fg=black,bg=white}
20
+ \setbeamercolor{palette secondary}{fg=black,bg=white}
21
+ \setbeamercolor{palette tertiary}{bg=black,fg=white}
22
+ \setbeamercolor{palette quaternary}{fg=black,bg=white}
23
+ \setbeamercolor{structure}{fg=labsixorange}
24
+
25
+ % Headline
26
+ \setbeamercolor{headline}{fg=white,bg=labsixorange}
27
+ \setbeamercolor{headline rule}{bg=black}
28
+
29
+ % Block
30
+ \setbeamercolor{block title}{fg=labsixorange,bg=white}
31
+ \setbeamercolor{block separator}{bg=black}
32
+ \setbeamercolor{block body}{fg=black,bg=white}
33
+
34
+ % Alert Block
35
+ \setbeamercolor{block alerted title}{fg=labsixorange,bg=lightorange}
36
+ \setbeamercolor{block alerted separator}{bg=black}
37
+ \setbeamercolor{block alerted body}{fg=black,bg=lightorange}
38
+
39
+ % Example Block
40
+ \setbeamercolor{block example title}{fg=labsixorange,bg=lightgray}
41
+ \setbeamercolor{block example separator}{bg=black}
42
+ \setbeamercolor{block example body}{fg=black,bg=lightgray}
43
+
44
+ % Heading
45
+ \setbeamercolor{heading}{fg=black}
46
+
47
+ % Itemize
48
+ \setbeamercolor{item}{fg=labsixorange}
49
+
50
+ % Bibliography
51
+ \setbeamercolor{bibliography item}{fg=black}
52
+ \setbeamercolor{bibliography entry author}{fg=black}
53
+ \setbeamercolor{bibliography entry title}{fg=black}
54
+ \setbeamercolor{bibliography entry location}{fg=black}
55
+ \setbeamercolor{bibliography entry note}{fg=black}
template/beamercolorthememit.sty ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ % Gemini theme
2
+ % https://github.com/anishathalye/gemini
3
+
4
+ % ====================
5
+ % Definitions
6
+ % ====================
7
+
8
+ % Colors from http://web.mit.edu/graphicidentity/colors.html
9
+ \definecolor{mitred}{cmyk}{0.24, 1.0, 0.78, 0.17}
10
+ \definecolor{mitdarkgray}{cmyk}{0.48, 0.39, 0.39, 0.04}
11
+ \definecolor{mitlightgray}{cmyk}{0.24, 0.20, 0.20, 0.0}
12
+
13
+ % Extra colors
14
+ \definecolor{lightgray}{RGB}{240, 240, 240}
15
+ \definecolor{lightorange}{RGB}{255, 245, 242}
16
+
17
+ % ====================
18
+ % Theme
19
+ % ====================
20
+
21
+ % Basic colors
22
+ \setbeamercolor{palette primary}{fg=black,bg=white}
23
+ \setbeamercolor{palette secondary}{fg=black,bg=white}
24
+ \setbeamercolor{palette tertiary}{bg=black,fg=white}
25
+ \setbeamercolor{palette quaternary}{fg=black,bg=white}
26
+ \setbeamercolor{structure}{fg=mitred}
27
+
28
+ % Headline
29
+ \setbeamercolor{headline}{fg=black,bg=lightgray}
30
+
31
+ % Block
32
+ \setbeamercolor{block title}{fg=mitred,bg=white}
33
+ \setbeamercolor{block separator}{bg=black}
34
+ \setbeamercolor{block body}{fg=black,bg=white}
35
+
36
+ % Alert Block
37
+ \setbeamercolor{block alerted title}{fg=mitred,bg=lightorange}
38
+ \setbeamercolor{block alerted separator}{bg=black}
39
+ \setbeamercolor{block alerted body}{fg=black,bg=lightorange}
40
+
41
+ % Example Block
42
+ \setbeamercolor{block example title}{fg=mitred,bg=lightgray}
43
+ \setbeamercolor{block example separator}{bg=black}
44
+ \setbeamercolor{block example body}{fg=black,bg=lightgray}
45
+
46
+ % Heading
47
+ \setbeamercolor{heading}{fg=black}
48
+
49
+ % Itemize
50
+ \setbeamercolor{item}{fg=mitred}
51
+
52
+ % Bibliography
53
+ \setbeamercolor{bibliography item}{fg=black}
54
+ \setbeamercolor{bibliography entry author}{fg=black}
55
+ \setbeamercolor{bibliography entry title}{fg=black}
56
+ \setbeamercolor{bibliography entry location}{fg=black}
57
+ \setbeamercolor{bibliography entry note}{fg=black}
template/beamercolorthemeumich.sty ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ % Gemini theme
2
+ % https://github.com/anishathalye/gemini
3
+
4
+ % ====================
5
+ % Definitions
6
+ % ====================
7
+
8
+ \definecolor{UMichBlue}{RGB}{0, 39, 76} % #00274C
9
+ \definecolor{UMichMaize}{RGB}{255, 203, 5} % #FFCB05
10
+ \definecolor{UMichWhite}{RGB}{255, 255, 255} % #FFFFFF
11
+ \definecolor{UMichGray}{RGB}{235, 235, 235}
12
+ \definecolor{UMichLightMaize}{RGB}{242, 237, 217}
13
+
14
+ % ====================
15
+ % Theme
16
+ % ====================
17
+
18
+ % Basic colors
19
+ \setbeamercolor{palette primary}{fg=UMichBlue,bg=UMichWhite}
20
+ \setbeamercolor{palette secondary}{fg=UMichBlue,bg=UMichWhite}
21
+ \setbeamercolor{palette tertiary}{bg=UMichBlue,fg=UMichWhite}
22
+ \setbeamercolor{palette quaternary}{fg=UMichBlue,bg=UMichWhite}
23
+ \setbeamercolor{structure}{fg=UMichBlue}
24
+
25
+ % Headline
26
+ \setbeamercolor{headline}{fg=UMichGray,bg=UMichBlue}
27
+ \setbeamercolor{headline rule}{bg=UMichMaize}
28
+
29
+ % Block
30
+ \setbeamercolor{block title}{fg=UMichBlue,bg=UMichWhite}
31
+ \setbeamercolor{block separator}{bg=UMichBlue}
32
+ \setbeamercolor{block body}{fg=UMichBlue,bg=UMichWhite}
33
+
34
+ % Alert Block
35
+ \setbeamercolor{block alerted title}{fg=UMichBlue,bg=UMichLightMaize}
36
+ \setbeamercolor{block alerted separator}{bg=UMichBlue}
37
+ \setbeamercolor{block alerted body}{fg=UMichBlue,bg=UMichLightMaize}
38
+
39
+ % Example Block
40
+ \setbeamercolor{block example title}{fg=UMichBlue,bg=UMichWhite}
41
+ \setbeamercolor{block example separator}{bg=UMichBlue}
42
+ \setbeamercolor{block example body}{fg=UMichBlue,bg=UMichWhite}
43
+
44
+ % Heading
45
+ \setbeamercolor{heading}{fg=UMichBlue}
46
+
47
+ % Itemize
48
+ \setbeamercolor{item}{fg=UMichBlue}
49
+
50
+ % Bibliography
51
+ \setbeamercolor{bibliography item}{fg=UMichBlue}
52
+ \setbeamercolor{bibliography entry author}{fg=UMichBlue}
53
+ \setbeamercolor{bibliography entry title}{fg=UMichBlue}
54
+ \setbeamercolor{bibliography entry location}{fg=UMichBlue}
55
+ \setbeamercolor{bibliography entry note}{fg=UMichBlue}
template/beamerthemegemini.sty ADDED
@@ -0,0 +1,258 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ % Gemini theme
2
+ % https://github.com/anishathalye/gemini
3
+
4
+ % ====================
5
+ % Dependencies
6
+ % ====================
7
+
8
+ \RequirePackage{exscale}
9
+ \RequirePackage{ragged2e}
10
+ \RequirePackage{changepage}
11
+ \RequirePackage{fontspec}
12
+ \RequirePackage{calc}
13
+
14
+ % ====================
15
+ % Fonts
16
+ % ====================
17
+
18
+ \newfontfamily\Raleway[Ligatures=TeX]{Raleway}
19
+ \newfontfamily\Lato[Ligatures=TeX]{Lato}
20
+
21
+ \usefonttheme{professionalfonts}
22
+
23
+ \setsansfont{Lato}[
24
+ UprightFont=*-Light,
25
+ ItalicFont=*-LightItalic,
26
+ BoldFont=*-Regular,
27
+ BoldItalicFont=*-Italic
28
+ ]
29
+
30
+ \setbeamerfont{headline}{family=\Raleway}
31
+ \setbeamerfont{headline title}{size=\Huge,series=\bfseries}
32
+ \setbeamerfont{headline author}{size=\Large}
33
+ \setbeamerfont{headline institute}{size=\normalsize}
34
+ \setbeamerfont{block title}{family=\Raleway,size=\large,series=\bfseries}
35
+ \setbeamerfont{heading}{family=\Lato,series=\bfseries}
36
+ \setbeamerfont{caption}{size=\small}
37
+ \setbeamerfont{footline}{family=\Raleway,size=\normalsize}
38
+ \setbeamerfont{block body}{size=\normalsize}
39
+
40
+ % ====================
41
+ % Macros
42
+ % ====================
43
+
44
+ \newcommand{\samelineand}{\qquad}
45
+
46
+ % ====================
47
+ % Elements
48
+ % ====================
49
+
50
+ % List
51
+ \def\@listi{\leftmargin\leftmargini
52
+ \topsep 1ex % spacing before
53
+ \parsep 0\p@ \@plus\p@
54
+ \itemsep 0.5ex} % spacing between
55
+
56
+ % Itemize
57
+
58
+ \setbeamertemplate{itemize item}{\raise0.5ex \hbox{\vrule width 0.5ex height 0.5ex}}
59
+ \setbeamertemplate{itemize subitem}{\raise0.3ex \hbox{\vrule width 0.5ex height 0.5ex}}
60
+ \setbeamertemplate{itemize subsubitem}{\raise0.2ex \hbox{\vrule width 0.5ex height 0.5ex}}
61
+
62
+ % Enumerate
63
+
64
+ \setbeamertemplate{enumerate item}{\insertenumlabel.}
65
+ \setbeamertemplate{enumerate subitem}{\insertsubenumlabel.}
66
+ \setbeamertemplate{enumerate subsubitem}{\insertsubsubenumlabel.}
67
+
68
+ % Equation
69
+ \setlength\belowdisplayshortskip{2ex}
70
+
71
+ % Caption
72
+ \setbeamertemplate{caption}[numbered]
73
+ \setbeamertemplate{caption label separator}[period]
74
+ \setlength{\abovecaptionskip}{2ex}
75
+ \setlength{\belowcaptionskip}{1ex}
76
+
77
+ % Bibliography
78
+ \setbeamertemplate{bibliography item}[text]
79
+
80
+ % Navigation
81
+ \beamertemplatenavigationsymbolsempty
82
+
83
+ % ====================
84
+ % Components
85
+ % ====================
86
+
87
+ % Heading
88
+ \newcommand\heading[1]
89
+ {%
90
+ \par\bigskip
91
+ {\usebeamerfont{heading}\usebeamercolor[fg]{heading}#1}\par\smallskip
92
+ }
93
+
94
+ % logo
95
+ \newlength{\logoleftwidth}
96
+ \setlength{\logoleftwidth}{0cm}
97
+ \newlength{\logorightwidth}
98
+ \setlength{\logorightwidth}{0cm}
99
+ \newlength{\maxlogowidth} % space on both sides set to maxlogowidth to keep title centered
100
+ \setlength{\maxlogowidth}{0cm}
101
+
102
+ \newcommand{\logoright}[1]{
103
+ \newcommand{\insertlogoright}{#1}
104
+ \settowidth{\logorightwidth}{\insertlogoright}
105
+ \addtolength{\logorightwidth}{10ex}
106
+ \setlength{\maxlogowidth}{\maxof{\logoleftwidth}{\logorightwidth}}
107
+ }
108
+ \newcommand{\logoleft}[1]{
109
+ \newcommand{\insertlogoleft}{#1}
110
+ \settowidth{\logoleftwidth}{\insertlogoleft}
111
+ \addtolength{\logoleftwidth}{10ex}
112
+ \setlength{\maxlogowidth}{\maxof{\logoleftwidth}{\logorightwidth}}
113
+ }
114
+
115
+ % Headline
116
+ \setbeamertemplate{headline}
117
+ {
118
+ \begin{beamercolorbox}{headline}
119
+ \begin{columns}
120
+ \begin{column}{\maxlogowidth}
121
+ \vskip5ex
122
+ \ifdefined\insertlogoleft
123
+ \vspace*{\fill}
124
+ \hspace{10ex}
125
+ \raggedright
126
+ \insertlogoleft
127
+ \vspace*{\fill}
128
+ \else\fi
129
+ \end{column}
130
+ \begin{column}{\dimexpr\paperwidth-\maxlogowidth-\maxlogowidth}
131
+ \usebeamerfont{headline}
132
+ \vskip3ex
133
+ \centering
134
+ \ifx \inserttitle \empty \else
135
+ {\usebeamerfont{headline title}\usebeamercolor[fg]{headline title}\inserttitle\\[0.5ex]}
136
+ \fi
137
+ \ifx \beamer@shortauthor \empty \else
138
+ {\usebeamerfont{headline author}\usebeamercolor[fg]{headline author}\insertauthor\\[1ex]}
139
+ \fi
140
+ \ifx \insertinstitute \empty \else
141
+ {\usebeamerfont{headline institute}\usebeamercolor[fg]{headline institute}\insertinstitute\\[1ex]}
142
+ \fi
143
+ \end{column}
144
+ \begin{column}{\maxlogowidth}
145
+ \vskip5ex
146
+ \ifdefined\insertlogoright
147
+ \vspace*{\fill}
148
+ \raggedleft
149
+ \insertlogoright
150
+ \hspace{10ex}
151
+ \vspace*{\fill}
152
+ \else\fi
153
+ \end{column}
154
+ \end{columns}
155
+ \vspace{1ex}
156
+ \ifbeamercolorempty[bg]{headline rule}{}{
157
+ \begin{beamercolorbox}[wd=\paperwidth,colsep=0.5ex]{headline rule}\end{beamercolorbox}
158
+ }
159
+ \end{beamercolorbox}
160
+ }
161
+
162
+ % Block
163
+ \setbeamertemplate{block begin}
164
+ {
165
+ \begin{beamercolorbox}[colsep*=0ex,dp=2ex,center]{block title}
166
+ \vskip0pt
167
+ \usebeamerfont{block title}\insertblocktitle
168
+ % \vskip-1.25ex
169
+ % \begin{beamercolorbox}[colsep=0.025ex]{block separator}\end{beamercolorbox}
170
+ \end{beamercolorbox}
171
+ {\parskip0pt\par}
172
+ \usebeamerfont{block body}
173
+ \vskip1.0ex
174
+ \begin{beamercolorbox}[colsep*=0ex]{block body}
175
+ \justifying
176
+ \setlength{\parskip}{1ex}
177
+ \vskip-2ex
178
+ }
179
+ \setbeamertemplate{block end}
180
+ {
181
+ \end{beamercolorbox}
182
+ \vskip0pt
183
+ \vspace*{2ex}
184
+ }
185
+
186
+ % Alert Block
187
+ \setbeamertemplate{block alerted begin}
188
+ {
189
+ \begin{beamercolorbox}[colsep*=0ex,dp=2ex,center]{block alerted title}
190
+ \vskip0pt
191
+ \usebeamerfont{block title}\insertblocktitle
192
+ \vskip-1.25ex
193
+ \begin{beamercolorbox}[colsep=0.025ex]{block alerted separator}\end{beamercolorbox}
194
+ \end{beamercolorbox}
195
+ {\parskip0pt\par}
196
+ \usebeamerfont{block body}
197
+ \vskip1.0ex
198
+ \begin{beamercolorbox}[colsep*=0ex]{block alerted body}
199
+ \justifying
200
+ \begin{adjustwidth}{1ex}{1ex}
201
+ \setlength{\parskip}{1ex}
202
+ \vskip-2ex
203
+ }
204
+ \setbeamertemplate{block alerted end}
205
+ {
206
+ \end{adjustwidth}
207
+ \vskip1ex
208
+ \end{beamercolorbox}
209
+ \vskip0pt
210
+ \vspace*{2ex}
211
+ }
212
+
213
+ % Example Block
214
+ \setbeamertemplate{block example begin}
215
+ {
216
+ \begin{beamercolorbox}[colsep*=0ex,dp=2ex,center]{block example title}
217
+ \vskip0pt
218
+ \usebeamerfont{block title}\insertblocktitle
219
+ \vskip-1.25ex
220
+ \begin{beamercolorbox}[colsep=0.025ex]{block example separator}\end{beamercolorbox}
221
+ \end{beamercolorbox}
222
+ {\parskip0pt\par}
223
+ \usebeamerfont{block body}
224
+ \vskip1.0ex
225
+ \begin{beamercolorbox}[colsep*=0ex]{block example body}
226
+ \justifying
227
+ \begin{adjustwidth}{1ex}{1ex}
228
+ \setlength{\parskip}{1ex}
229
+ \vskip-2ex
230
+ }
231
+ \setbeamertemplate{block example end}
232
+ {
233
+ \end{adjustwidth}
234
+ \vskip1ex
235
+ \end{beamercolorbox}
236
+ \vskip0pt
237
+ \vspace*{2ex}
238
+ }
239
+
240
+ % Footer
241
+ \newcommand{\footercontent}[1]{\newcommand{\insertfootercontent}{#1}}
242
+
243
+ \setbeamertemplate{footline}{
244
+ \ifdefined\insertfootercontent
245
+ \begin{beamercolorbox}[vmode]{headline}
246
+ \ifbeamercolorempty[bg]{headline rule}{}{
247
+ \begin{beamercolorbox}[wd=\paperwidth,colsep=0.25ex]{headline rule}\end{beamercolorbox}
248
+ }
249
+ \vspace{1.5ex}
250
+ \hspace{\sepwidth}
251
+ \usebeamerfont{footline}
252
+ \centering
253
+ \insertfootercontent
254
+ \hspace{\sepwidth}
255
+ \vspace{1.5ex}
256
+ \end{beamercolorbox}
257
+ \else\fi
258
+ }
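The template/*.sty files added above are the Gemini beamerposter theme: beamerthemegemini.sty provides the layout (headline, blocks, logos, footer) and the beamercolortheme*.sty files supply the cam, gemini, labsix, mit, and umich palettes. Because the theme loads fontspec with the Raleway and Lato families, it has to be compiled with lualatex or xelatex, which is exactly what the Makefile above invokes. A minimal poster.tex exercising these files could look like the sketch below; the paper size, scale, column width, and logo paths are illustrative placeholders, not values fixed by this commit:

\documentclass[final]{beamer}
\usepackage[size=custom,width=120,height=72,scale=1.2]{beamerposter} % poster page size (illustrative)
\usetheme{gemini}        % template/beamerthemegemini.sty
\usecolortheme{mit}      % or gemini, cam, labsix, umich

% The footline in beamerthemegemini.sty references \sepwidth, so define it here.
\newlength{\sepwidth}
\setlength{\sepwidth}{0.025\paperwidth}

\title{Poster Title}
\author{Author One \and Author Two}
\institute{Some University}

\logoleft{\includegraphics[height=6em]{logos/left_logo.png}}   % hypothetical logo paths
\logoright{\includegraphics[height=6em]{logos/right_logo.png}}
\footercontent{Some Conference 2025 \hfill someone@example.com}

\begin{document}
\begin{frame}[t]
  \begin{columns}[t]
    \begin{column}{0.3\paperwidth}
      \begin{block}{First Panel}
        Panel body text goes here.
      \end{block}
    \end{column}
  \end{columns}
\end{frame}
\end{document}

Raleway and Lato must be installed on the system for fontspec to find them.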
template/latexmkrc ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ $bibtex_use = 2;
2
+ $clean_ext = "nav snm";
template/poster.bib ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ @article{shannon1948communication,
2
+ author = {Claude E. Shannon},
3
+ title = {A Mathematical Theory of Communication},
4
+ journal = {Bell System Technical Journal},
5
+ year = 1948,
6
+ volume = {27},
7
+ number = {3},
8
+ pages = {379-423},
9
+ }
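template/poster.bib ships a single placeholder entry, and template/latexmkrc sets $bibtex_use = 2 so latexmk also runs BibTeX during the build driven by the Makefile. A hedged sketch of a references panel that would consume it (the block title and bibliography style are illustrative):

\begin{block}{References}
  \nocite{*} % pull in every entry from poster.bib, here just shannon1948communication
  \footnotesize
  \bibliographystyle{plain}
  \bibliography{poster}
\end{block}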