update mj-bench

This view is limited to 50 files because it contains too many changes. See raw diff.
- app.py +263 -298
- evals/mjbench/detailed-results/AestheticsPredictor.json +47 -0
- evals/mjbench/detailed-results/BLIP-v2.json +47 -0
- evals/mjbench/detailed-results/CLIP-v2.json +47 -0
- evals/mjbench/detailed-results/Claude 3 Opus.json +47 -0
- evals/mjbench/detailed-results/GPT-4-vision.json +47 -0
- evals/mjbench/detailed-results/GPT-4o.json +47 -0
- evals/mjbench/detailed-results/Gemini Ultra.json +47 -0
- evals/mjbench/detailed-results/HPS-v2.1.json +47 -0
- evals/mjbench/detailed-results/Idefics2-8b.json +47 -0
- evals/mjbench/detailed-results/ImageReward.json +47 -0
- evals/mjbench/detailed-results/Instructblip-7b.json +47 -0
- evals/mjbench/detailed-results/InternVL-Chat-V1-5.json +47 -0
- evals/mjbench/detailed-results/LLaVA-1.5-13b.json +47 -0
- evals/mjbench/detailed-results/LLaVA-1.5-7b.json +47 -0
- evals/mjbench/detailed-results/LLaVA-NeXT-mistral-7b.json +47 -0
- evals/mjbench/detailed-results/LLaVA-NeXT-vicuna-13b.json +35 -0
- evals/mjbench/detailed-results/MiniGPT4-v2.json +47 -0
- evals/mjbench/detailed-results/PickScore-v1.json +47 -0
- evals/mjbench/detailed-results/Prometheus-Vision-13b.json +47 -0
- evals/mjbench/detailed-results/Prometheus-Vision-7b.json +47 -0
- evals/mjbench/detailed-results/Qwen-VL-Chat.json +47 -0
- evals/mjbench/latex_reults/alignment_narrative.tex +37 -0
- evals/mjbench/latex_reults/alignment_number_10.tex +29 -0
- evals/mjbench/latex_reults/alignment_number_5.tex +35 -0
- evals/mjbench/latex_reults/artifact_narrative.tex +29 -0
- evals/mjbench/latex_reults/artifact_number_10.tex +38 -0
- evals/mjbench/latex_reults/artifact_number_5.tex +29 -0
- evals/mjbench/latex_reults/bias_acc.tex +39 -0
- evals/mjbench/latex_reults/bias_ges.tex +37 -0
- evals/mjbench/latex_reults/bias_nds.tex +39 -0
- evals/mjbench/latex_reults/bias_scale.tex +30 -0
- evals/mjbench/latex_reults/consitient_analysis.tex +26 -0
- evals/mjbench/latex_reults/dataset.text +69 -0
- evals/mjbench/latex_reults/human_eval.tex +22 -0
- evals/mjbench/latex_reults/main_result.tex +49 -0
- evals/mjbench/latex_reults/original_scale_study.tex +29 -0
- evals/mjbench/latex_reults/safety_narrative.tex +29 -0
- evals/mjbench/latex_reults/safety_number_10.tex +38 -0
- evals/mjbench/latex_reults/safety_number_5.tex +30 -0
- evals/mjbench/latex_reults/scale_study.tex +63 -0
- evals/mjbench/latex_reults/summary.tex +69 -0
- evals/mjbench/latex_reults/temp_table.tex +40 -0
- evals/mjbench/overall-results/AestheticsPredictor.json +12 -0
- evals/mjbench/overall-results/BLIP-v2.json +12 -0
- evals/mjbench/overall-results/CLIP-v2.json +12 -0
- evals/mjbench/overall-results/Claude 3 Opus.json +12 -0
- evals/mjbench/overall-results/GPT-4-vision.json +12 -0
- evals/mjbench/overall-results/GPT-4o.json +12 -0
- evals/mjbench/overall-results/Gemini Ultra.json +12 -0
app.py
CHANGED

@@ -1,8 +1,14 @@
-import …
+import os
+import json
 import gradio as gr
 import pandas as pd
+import numpy as np
+
+from pathlib import Path
 from apscheduler.schedulers.background import BackgroundScheduler
 from huggingface_hub import snapshot_download
+from datasets import load_dataset

 from src.about import (
     CITATION_BUTTON_LABEL,
@@ -11,6 +17,7 @@ from src.about import (
     INTRODUCTION_TEXT,
     LLM_BENCHMARKS_TEXT,
     TITLE,
+    ABOUT_TEXT
 )
 from src.display.css_html_js import custom_css
 from src.display.utils import (
@@ -27,319 +34,277 @@ from src.display.utils import (
     Precision
 )
 from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN

-try:
-    print(EVAL_REQUESTS_PATH)
-    snapshot_download(
-        repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
-    )
-except Exception:
-    restart_space()
-try:
-    print(EVAL_RESULTS_PATH)
-    snapshot_download(
-        repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
-    )
-except Exception:
-    restart_space()
-
-raw_data, original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
-leaderboard_df = original_df.copy()
-
-(
-    finished_eval_queue_df,
-    running_eval_queue_df,
-    pending_eval_queue_df,
-) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
-
-demo = gr.Blocks(css=custom_css)
-with demo:
-    gr.HTML(TITLE)
-    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
-
-    with gr.Tabs(elem_classes="tab-buttons") as tabs:
-        with gr.Row():
-                    for c in fields(AutoEvalColumn)
-                    if not c.hidden and not c.never_hidden
-                ],
-                value=[
-                    c.name
-                    for c in fields(AutoEvalColumn)
-                    if c.displayed_by_default and not c.hidden and not c.never_hidden
-                ],
-                label="Select columns to show",
-                elem_id="column-select",
-                interactive=True,
-            )
-            with gr.Row():
-                deleted_models_visibility = gr.Checkbox(
-                    value=False, label="Show gated/private/deleted models", interactive=True
-                )
-            with gr.Column(min_width=320):
-                #with gr.Box(elem_id="box-filter"):
-                filter_columns_type = gr.CheckboxGroup(
-                    label="Model types",
-                    choices=[t.to_str() for t in ModelType],
-                    value=[t.to_str() for t in ModelType],
-                    interactive=True,
-                    elem_id="filter-columns-type",
-                )
-                filter_columns_precision = gr.CheckboxGroup(
-                    label="Precision",
-                    choices=[i.value.name for i in Precision],
-                    value=[i.value.name for i in Precision],
-                    interactive=True,
-                    elem_id="filter-columns-precision",
-                )
-                filter_columns_size = gr.CheckboxGroup(
-                    label="Model sizes (in billions of parameters)",
-                    choices=list(NUMERIC_INTERVALS.keys()),
-                    value=list(NUMERIC_INTERVALS.keys()),
-                    interactive=True,
-                    elem_id="filter-columns-size",
-                )
-
-            leaderboard_table = gr.components.Dataframe(
-                value=leaderboard_df[
-                    [c.name for c in fields(AutoEvalColumn) if c.never_hidden]
-                    + shown_columns.value
-                ],
-                headers=[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value,
-                datatype=TYPES,
-                elem_id="leaderboard-table",
-                interactive=False,
-                visible=True,
-            )
-
-            # Dummy leaderboard for handling the case when the user uses backspace key
-            hidden_leaderboard_table_for_search = gr.components.Dataframe(
-                value=original_df[COLS],
-                headers=COLS,
-                datatype=TYPES,
-                visible=False,
-            )
-            search_bar.submit(
-                update_table,
-                [
-                    hidden_leaderboard_table_for_search,
-                    shown_columns,
-                    filter_columns_type,
-                    filter_columns_precision,
-                    filter_columns_size,
-                    deleted_models_visibility,
-                    search_bar,
-                ],
-                leaderboard_table,
-            )
-            for selector in [shown_columns, filter_columns_type, filter_columns_precision, filter_columns_size, deleted_models_visibility]:
-                selector.change(
-                    update_table,
-                    [
-                        hidden_leaderboard_table_for_search,
-                        shown_columns,
-                        filter_columns_type,
-                        filter_columns_precision,
-                        filter_columns_size,
-                        deleted_models_visibility,
-                        search_bar,
-                    ],
-                    leaderboard_table,
-                    queue=True,
-                )
-
-        with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
-            gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
-
-        with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
-            with gr.Column():
-                with gr.Row():
-                    gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
-
-                with gr.Column():
-                    with gr.Accordion(
-                        f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
-                        open=False,
-                    ):
-                        with gr.Row():
-                            finished_eval_table = gr.components.Dataframe(
-                                value=finished_eval_queue_df,
-                                headers=EVAL_COLS,
-                                datatype=EVAL_TYPES,
-                                row_count=5,
-                            )
-                    with gr.Accordion(
-                        f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
-                        open=False,
-                    ):
-                        with gr.Row():
-                            running_eval_table = gr.components.Dataframe(
-                                value=running_eval_queue_df,
-                                headers=EVAL_COLS,
-                                datatype=EVAL_TYPES,
-                                row_count=5,
-                            )
-
-                    with gr.Accordion(
-                        f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
-                        open=False,
-                    ):
-                        with gr.Row():
-                            pending_eval_table = gr.components.Dataframe(
-                                value=pending_eval_queue_df,
-                                headers=EVAL_COLS,
-                                datatype=EVAL_TYPES,
-                                row_count=5,
-                            )
-                with gr.Row():
-                    model_type = gr.Dropdown(
-                        choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
-                        label="Model type",
-                        multiselect=False,
-                        value=None,
-                        interactive=True,
-                    )
-
-                    with gr.Column():
-                        precision = gr.Dropdown(
-                            choices=[i.value.name for i in Precision if i != Precision.Unknown],
-                            label="Precision",
-                            multiselect=False,
-                            value="float16",
-                            interactive=True,
-                        )
-                        weight_type = gr.Dropdown(
-                            choices=[i.value.name for i in WeightType],
-                            label="Weights type",
-                            multiselect=False,
-                            value="Original",
-                            interactive=True,
-                        )
-                        base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
-
-            submit_button = gr.Button("Submit Eval")
-            submission_result = gr.Markdown()
-            submit_button.click(
-                add_new_eval,
-                [
-                    model_name_textbox,
-                    base_model_name_textbox,
-                    revision_name_textbox,
-                    precision,
-                    weight_type,
-                    model_type,
-                ],
-                submission_result,
-            )
-
-    with gr.Row():
-        with gr.Accordion("📙 Citation", open=False):
-scheduler.add_job(restart_space, "interval", seconds=
+
+SUBSET_COUNTS = {
+    "Alignment-Object": 250,
+    "Alignment-Attribute": 229,
+    "Alignment-Action": 115,
+    "Alignment-Count": 55,
+    "Alignment-Location": 75,
+    "Safety-Toxicity-Crime": 29,
+    "Safety-Toxicity-Shocking": 31,
+    "Safety-Toxicity-Disgust": 42,
+    "Safety-Nsfw-Evident": 197,
+    "Safety-Nsfw-Evasive": 177,
+    "Safety-Nsfw-Subtle": 98,
+    "Quality-Distortion-Human_face": 169,
+    "Quality-Distortion-Human_limb": 152,
+    "Quality-Distortion-Object": 100,
+    "Quality-Blurry-Defocused": 350,
+    "Quality-Blurry-Motion": 350,
+    "Bias-Age": 80,
+    "Bias-Gender": 140,
+    "Bias-Race": 140,
+    "Bias-Nationality": 120,
+    "Bias-Religion": 60,
+}
+
+PERSPECTIVE_COUNTS = {
+    "Alignment": 724,
+    "Safety": 574,
+    "Quality": 1121,
+    "Bias": 540
+}
+
+META_DATA = ['Model', 'Model Type', 'Input Type', 'Organization']
+
+def restart_space():
+    API.restart_space(repo_id=REPO_ID)
+
+color_map = {
+    "Score Model": "#7497db",
+    "Opensource VLM": "#E8ECF2",
+    "Closesource VLM": "#ffcd75",
+    "Others": "#75809c",
+    # #7497db #E8ECF2 #ffcd75 #75809c
+}
+
+def color_model_type_column(df, color_map):
+    """
+    Apply color to the 'Model Type' column of the DataFrame based on a given color mapping.
+
+    Parameters:
+    df (pd.DataFrame): The DataFrame containing the 'Model Type' column.
+    color_map (dict): A dictionary mapping model types to colors.
+
+    Returns:
+    pd.Styler: The styled DataFrame.
+    """
+    # Function to apply color based on the model type
+    def apply_color(val):
+        color = color_map.get(val, "default")  # Default color if not specified in color_map
+        return f'background-color: {color}'
+
+    # Format for different columns
+    format_dict = {col: "{:.1f}" for col in df.columns if col not in META_DATA}
+    format_dict['Overall Score'] = "{:.2f}"
+    format_dict[''] = "{:d}"
+
+    return df.style.applymap(apply_color, subset=['Model Type']).format(format_dict, na_rep='')
+
+def regex_table(dataframe, regex, filter_button, style=True):
+    """
+    Takes a model name as a regex, then returns only the rows that has that in it.
+    """
+    # Split regex statement by comma and trim whitespace around regexes
+    regex_list = [x.strip() for x in regex.split(",")]
+    # Join the list into a single regex pattern with '|' acting as OR
+    combined_regex = '|'.join(regex_list)
+
+    # if filter_button, remove all rows with "ai2" in the model name
+    update_scores = False
+    if isinstance(filter_button, list) or isinstance(filter_button, str):
+        if "Score Model" not in filter_button:
+            dataframe = dataframe[~dataframe["Model Type"].str.contains("Score Model", case=False, na=False)]
+        if "Opensource VLM" not in filter_button:
+            dataframe = dataframe[~dataframe["Model Type"].str.contains("Opensource VLM", case=False, na=False)]
+        if "Closesource VLM" not in filter_button:
+            dataframe = dataframe[~dataframe["Model Type"].str.contains("Closesource VLM", case=False, na=False)]
+        if "Others" not in filter_button:
+            dataframe = dataframe[~dataframe["Model Type"].str.contains("Others", case=False, na=False)]
+    # Filter the dataframe such that 'model' contains any of the regex patterns
+    data = dataframe[dataframe["Model"].str.contains(combined_regex, case=False, na=False)]
+
+    data.reset_index(drop=True, inplace=True)
+
+    # replace column '' with count/rank
+    data.insert(0, '', range(1, 1 + len(data)))
+
+    if style:
+        # apply color
+        data = color_model_type_column(data, color_map)
+
+    return data
+
+def get_leaderboard_results(results_path):
+    data_dir = Path(results_path)
+    files = [d for d in os.listdir(data_dir)]  # TODO check if "Path(data_dir) / d" is a dir
+
+    df = pd.DataFrame()
+    for file in files:
+        if not file.endswith(".json"):
+            continue
+        with open(results_path / file) as rf:
+            result = json.load(rf)
+            result = pd.DataFrame(result)
+            df = pd.concat([result, df])
+    df.reset_index(drop=True, inplace=True)
+    return df
+
+def avg_all_subset(orig_df: pd.DataFrame, columns_name: list, meta_data=META_DATA, subset_counts=SUBSET_COUNTS):
+    new_df = orig_df.copy()[meta_data + columns_name]
+
+    # Filter the dictionary to include only the counts relevant to the specified columns
+    new_subset_counts = {col: subset_counts[col] for col in columns_name}
+
+    # Calculate the weights for each subset
+    total_count = sum(new_subset_counts.values())
+    weights = {subset: count / total_count for subset, count in new_subset_counts.items()}
+
+    # Calculate the weight_avg value for each row
+    def calculate_weighted_avg(row):
+        weighted_sum = sum(row[col] * weights[col] for col in columns_name)
+        return weighted_sum
+
+    new_df["Overall Score"] = new_df.apply(calculate_weighted_avg, axis=1)
+
+    cols = meta_data + ["Overall Score"] + columns_name
+    new_df = new_df[cols].sort_values(by="Overall Score", ascending=False).reset_index(drop=True)
+    return new_df
+
+def avg_all_perspective(orig_df: pd.DataFrame, columns_name: list, meta_data=META_DATA, perspective_counts=PERSPECTIVE_COUNTS):
+    new_df = orig_df[meta_data + columns_name]
+    new_perspective_counts = {col: perspective_counts[col] for col in columns_name}
+    total_count = sum(perspective_counts.values())
+    weights = {perspective: count / total_count for perspective, count in perspective_counts.items()}
+    def calculate_weighted_avg(row):
+        weighted_sum = sum(row[col] * weights[col] for col in columns_name)
+        return weighted_sum
+    new_df["Overall Score"] = new_df.apply(calculate_weighted_avg, axis=1)
+
+    cols = meta_data + ["Overall Score"] + columns_name
+    new_df = new_df[cols].sort_values(by="Overall Score", ascending=False).reset_index(drop=True)
+    return new_df
+
+results_path = Path("./evals/mjbench/eval-results")
+orig_df = get_leaderboard_results(results_path)
+colmuns_name = list(SUBSET_COUNTS.keys())
+detailed_df = avg_all_subset(orig_df, colmuns_name).round(2)
+
+results_path = Path("./evals/mjbench/overall-results")
+orig_df = get_leaderboard_results(results_path)
+colmuns_name = list(PERSPECTIVE_COUNTS.keys())
+perspective_df = avg_all_perspective(orig_df, colmuns_name).round(2)
+
+total_models = len(detailed_df)
+with gr.Blocks(css=custom_css) as app:
+    with gr.Row():
+        with gr.Column(scale=6):
+            gr.Markdown(INTRODUCTION_TEXT.format(str(total_models)))
+        with gr.Column(scale=4):
+            gr.Markdown("")
+            # gr.HTML(BGB_LOGO, elem_classes="logo")
+
+    with gr.Tabs(elem_classes="tab-buttons") as tabs:
+        with gr.TabItem("🏆 MJ-Bench Leaderboard"):
+            with gr.Row():
+                search_overall = gr.Textbox(
+                    label="Model Search (delimit with , )",
+                    placeholder="🔍 Search model (separate multiple queries with ``) and press ENTER...",
+                    show_label=False
+                )
+                model_type_overall = gr.CheckboxGroup(
+                    choices=["Score Model", "Opensource VLM", "Closesource VLM", "Others"],
+                    value=["Score Model", "Opensource VLM", "Closesource VLM", "Others"],
+                    label="Model Types",
+                    show_label=False,
+                    interactive=True,
+                )
+            with gr.Row():
+                mjbench_table_overall_hidden = gr.Dataframe(
+                    perspective_df,
+                    headers=perspective_df.columns.tolist(),
+                    elem_id="mjbench_leadboard_overall_hidden",
+                    wrap=True,
+                    visible=False,
+                )
+                mjbench_table_overall = gr.Dataframe(
+                    regex_table(
+                        perspective_df.copy(),
+                        "",
+                        ["Score Model", "Opensource VLM", "Closesource VLM", "Others"]
+                    ),
+                    headers=perspective_df.columns.tolist(),
+                    elem_id="mjbench_leadboard_overall",
+                    wrap=True,
+                    height=1000,
+                )
+            # with gr.TabItem("🔍 MJ-Bench Detailed Results"):
+            #     with gr.Row():
+            #         search_detail = gr.Textbox(
+            #             label="Model Search (delimit with , )",
+            #             placeholder="🔍 Search model (separate multiple queries with ``) and press ENTER...",
+            #             show_label=False
+            #         )
+            #         model_type_detail = gr.CheckboxGroup(
+            #             choices=["Score Model", "Opensource VLM", "Closesource VLM", "Others"],
+            #             value=["Score Model", "Opensource VLM", "Closesource VLM", "Others"],
+            #             label="Model Types",
+            #             show_label=False,
+            #             interactive=True,
+            #         )
+            #     with gr.Row():
+            #         mjbench_table_detail_hidden = gr.Dataframe(
+            #             detailed_df,
+            #             headers=detailed_df.columns.tolist(),
+            #             elem_id="mjbench_detailed_hidden",
+            #             # column_widths = ["500px", "500px"],
+            #             wrap=True,
+            #             visible=False,
+            #         )
+            #         mjbench_table_detail = gr.Dataframe(
+            #             regex_table(
+            #                 detailed_df.copy(),
+            #                 "",
+            #                 ["Score Model", "Opensource VLM", "Closesource VLM", "Others"]
+            #             ),
+            #             headers=detailed_df.columns.tolist(),
+            #             elem_id="mjbench_detailed",
+            #             column_widths = ["40px", "200px", "180px", "130px", "150px"] + ["130px"]*50,
+            #             wrap=True,
+            #             height=1000,
+            #         )
+        with gr.TabItem("About"):
+            with gr.Row():
+                gr.Markdown(ABOUT_TEXT)
+
+    with gr.Accordion("📚 Citation", open=False):
+        citation_button = gr.Textbox(
+            value=r"""@misc{mjbench2024mjbench,
+      title={MJ-BENCH: Is Your Multimodal Reward Model Really a Good Judge?},
+      author={Chen*, Zhaorun and Du*, Yichao and Wen, Zichen and Zhou, Yiyang and Cui, Chenhang and Weng, Zhenzhen and Tu, Haoqin and Wang, Chaoqi and Tong, Zhengwei and HUANG, Leria and Chen, Canyu and Ye Qinghao and Zhu, Zhihong and Zhang, Yuqing and Zhou, Jiawei and Zhao, Zhuokai and Rafailov, Rafael and Finn, Chelsea and Yao, Huaxiu},
+      year={2024}
+}""",
+            lines=7,
+            label="Copy the following to cite these results.",
+            elem_id="citation-button",
+            show_copy_button=True,
+        )
+
+    search_overall.change(regex_table, inputs=[mjbench_table_overall_hidden, search_overall, model_type_overall], outputs=mjbench_table_overall)
+    model_type_overall.change(regex_table, inputs=[mjbench_table_overall_hidden, search_overall, model_type_overall], outputs=mjbench_table_overall)
+
+    # search_detail.change(regex_table, inputs=[mjbench_table_detail_hidden, search_detail, model_type_detail], outputs=mjbench_table_detail)
+    # model_type_detail.change(regex_table, inputs=[mjbench_table_detail_hidden, search_detail, model_type_detail], outputs=mjbench_table_detail)
+
+scheduler = BackgroundScheduler()
+scheduler.add_job(restart_space, "interval", seconds=18000)  # restarted every 3h
+scheduler.start()
+# app.queue(default_concurrency_limit=40).launch()
+app.launch(allowed_paths=['./', "./src", "./evals"])
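The new leaderboard's "Overall Score" is a dataset-size-weighted average rather than a plain mean of the per-column scores. A minimal standalone sketch of that weighting, using the PERSPECTIVE_COUNTS above and made-up per-perspective scores (illustration only, not real leaderboard values):

    # Dataset sizes from PERSPECTIVE_COUNTS in app.py; weights are the counts normalised to sum to 1.
    counts = {"Alignment": 724, "Safety": 574, "Quality": 1121, "Bias": 540}
    total = sum(counts.values())
    weights = {k: v / total for k, v in counts.items()}

    # Hypothetical per-perspective accuracies for one model (illustration only, not leaderboard data).
    scores = {"Alignment": 60.0, "Safety": 50.0, "Quality": 90.0, "Bias": 70.0}

    overall = sum(scores[k] * weights[k] for k in counts)
    print(round(overall, 2))  # -> 71.25: the larger Quality and Alignment subsets dominate the average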
evals/mjbench/detailed-results/AestheticsPredictor.json
ADDED
@@ -0,0 +1,47 @@
[
  {
    "Model": "AestheticsPredictor", "Model Type": "Score Model", "Input Type": "Single Image", "Organization": "LAION",
    "Alignment-Object": 35.9, "Alignment-Attribute": 38.4, "Alignment-Action": 43.6, "Alignment-Location": 31.6, "Alignment-Count": 35.7, "Alignment-Avg": 34.8,
    "Safety-Toxicity-Crime": 51.7, "Safety-Toxicity-Shocking": 58.6, "Safety-Toxicity-Disgust": 64.3, "Safety-Toxicity-Avg": 57.3,
    "Safety-Nsfw-Evident": 14.6, "Safety-Nsfw-Evasive": 55.2, "Safety-Nsfw-Subtle": 14.2, "Safety-Nsfw-Avg": 37.5,
    "Quality-Distortion-Human_face": 78.7, "Quality-Distortion-Human_limb": 57.1, "Quality-Distortion-Object": 51.3, "Quality-Distortion-Avg": 52.1,
    "Quality-Blurry-Defocused": 90.1, "Quality-Blurry-Motion": 93.4, "Quality-Blurry-Avg": 91.6,
    "Bias-Age": 59.4, "Bias-Gender": 62.0, "Bias-Race": 64.2, "Bias-Nationality": 62.4, "Bias-Religion": 61.0, "Bias-Avg": 62.0,
    "Bias-Age-NDS": 85.3, "Bias-Gender-NDS": 85.9, "Bias-Race-NDS": 86.3, "Bias-Nationality-NDS": 85.8, "Bias-Religion-NDS": 86.2, "Bias-Avg-NDS": 85.9,
    "Bias-Age-GES": 91.9, "Bias-Gender-GES": 92.1, "Bias-Race-GES": 92.4, "Bias-Nationality-GES": 92.1, "Bias-Religion-GES": 92.3, "Bias-Avg-GES": 92.1
  }
]
evals/mjbench/detailed-results/BLIP-v2.json
ADDED
@@ -0,0 +1,47 @@
[
  {
    "Model": "BLIP-v2", "Model Type": "Score Model", "Input Type": "Single Image", "Organization": "Salesforce",
    "Alignment-Object": 23.5, "Alignment-Attribute": 22.7, "Alignment-Action": 24.8, "Alignment-Location": 19.7, "Alignment-Count": 16.1, "Alignment-Avg": 21.5,
    "Safety-Toxicity-Crime": 6.9, "Safety-Toxicity-Shocking": 0.0, "Safety-Toxicity-Disgust": 4.8, "Safety-Toxicity-Avg": 4.5,
    "Safety-Nsfw-Evident": 58.4, "Safety-Nsfw-Evasive": 51.1, "Safety-Nsfw-Subtle": 35.7, "Safety-Nsfw-Avg": 49.1,
    "Quality-Distortion-Human_face": 3.6, "Quality-Distortion-Human_limb": 2.0, "Quality-Distortion-Object": 1.1, "Quality-Distortion-Avg": 1.9,
    "Quality-Blurry-Defocused": 8.3, "Quality-Blurry-Motion": 47.2, "Quality-Blurry-Avg": 15.0,
    "Bias-Age": 69.6, "Bias-Gender": 68.5, "Bias-Race": 65.9, "Bias-Nationality": 68.6, "Bias-Religion": 74.7, "Bias-Avg": 68.5,
    "Bias-Age-NDS": 85.3, "Bias-Gender-NDS": 83.6, "Bias-Race-NDS": 82.7, "Bias-Nationality-NDS": 81.8, "Bias-Religion-NDS": 87.5, "Bias-Avg-NDS": 83.6,
    "Bias-Age-GES": 92.2, "Bias-Gender-GES": 91.3, "Bias-Race-GES": 90.7, "Bias-Nationality-GES": 90.4, "Bias-Religion-GES": 93.1, "Bias-Avg-GES": 91.3
  }
]
evals/mjbench/detailed-results/CLIP-v2.json
ADDED
@@ -0,0 +1,47 @@
[
  {
    "Model": "CLIP-v2", "Model Type": "Score Model", "Input Type": "Single Image", "Organization": "LAION",
    "Alignment-Object": 42.2, "Alignment-Attribute": 45.9, "Alignment-Action": 45.3, "Alignment-Location": 43.4, "Alignment-Count": 55.4, "Alignment-Avg": 44.0,
    "Safety-Toxicity-Crime": 89.7, "Safety-Toxicity-Shocking": 96.6, "Safety-Toxicity-Disgust": 97.6, "Safety-Toxicity-Avg": 94.4,
    "Safety-Nsfw-Evident": 20.8, "Safety-Nsfw-Evasive": 4.5, "Safety-Nsfw-Subtle": 16.6, "Safety-Nsfw-Avg": 7.9,
    "Quality-Distortion-Human_face": 26.6, "Quality-Distortion-Human_limb": 17.2, "Quality-Distortion-Object": 34.0, "Quality-Distortion-Avg": 19.3,
    "Quality-Blurry-Defocused": 50.6, "Quality-Blurry-Motion": 63.7, "Quality-Blurry-Avg": 56.7,
    "Bias-Age": 57.2, "Bias-Gender": 57.8, "Bias-Race": 55.5, "Bias-Nationality": 59.5, "Bias-Religion": 60.8, "Bias-Avg": 57.7,
    "Bias-Age-NDS": 73.6, "Bias-Gender-NDS": 75.2, "Bias-Race-NDS": 73.1, "Bias-Nationality-NDS": 79.1, "Bias-Religion-NDS": 78.4, "Bias-Avg-NDS": 75.2,
    "Bias-Age-GES": 73.6, "Bias-Gender-GES": 75.2, "Bias-Race-GES": 73.1, "Bias-Nationality-GES": 79.1, "Bias-Religion-GES": 78.4, "Bias-Avg-GES": 75.2
  }
]
evals/mjbench/detailed-results/Claude 3 Opus.json
ADDED
@@ -0,0 +1,47 @@
[
  {
    "Model": "Claude 3 Opus", "Model Type": "Closesource VLM", "Input Type": "Multi Image", "Organization": "Anthropic",
    "Alignment-Object": 64.9, "Alignment-Attribute": 38.9, "Alignment-Action": 44.4, "Alignment-Location": 55.3, "Alignment-Count": 55.4, "Alignment-Avg": 57.1,
    "Safety-Toxicity-Crime": 62.1, "Safety-Toxicity-Shocking": 37.9, "Safety-Toxicity-Disgust": 50.0, "Safety-Toxicity-Avg": 50.6,
    "Safety-Nsfw-Evident": 10.5, "Safety-Nsfw-Evasive": 6.2, "Safety-Nsfw-Subtle": 3.6, "Safety-Nsfw-Avg": 8.3,
    "Quality-Distortion-Human_face": 26.6, "Quality-Distortion-Human_limb": 19.3, "Quality-Distortion-Object": 10.7, "Quality-Distortion-Avg": 17.6,
    "Quality-Blurry-Defocused": 89.6, "Quality-Blurry-Motion": 93.3, "Quality-Blurry-Avg": 92.7,
    "Bias-Age": 53.9, "Bias-Gender": 58.2, "Bias-Race": 62.1, "Bias-Nationality": 59.0, "Bias-Religion": 54.0, "Bias-Avg": 58.2,
    "Bias-Age-NDS": 63.3, "Bias-Gender-NDS": 66.1, "Bias-Race-NDS": 67.5, "Bias-Nationality-NDS": 66.9, "Bias-Religion-NDS": 66.8, "Bias-Avg-NDS": 66.1,
    "Bias-Age-GES": 83.2, "Bias-Gender-GES": 85.2, "Bias-Race-GES": 86.5, "Bias-Nationality-GES": 85.8, "Bias-Religion-GES": 84.8, "Bias-Avg-GES": 85.2
  }
]
evals/mjbench/detailed-results/GPT-4-vision.json
ADDED
@@ -0,0 +1,47 @@
[
  {
    "Model": "GPT-4-vision", "Model Type": "Closesource VLM", "Input Type": "Multi Image", "Organization": "OpenAI",
    "Alignment-Object": 68.1, "Alignment-Attribute": 62.9, "Alignment-Action": 64.1, "Alignment-Location": 67.1, "Alignment-Count": 73.2, "Alignment-Avg": 66.1,
    "Safety-Toxicity-Crime": 75.9, "Safety-Toxicity-Shocking": 69.0, "Safety-Toxicity-Disgust": 81.0, "Safety-Toxicity-Avg": 76.4,
    "Safety-Nsfw-Evident": 69.5, "Safety-Nsfw-Evasive": 43.2, "Safety-Nsfw-Subtle": 32.5, "Safety-Nsfw-Avg": 44.1,
    "Quality-Distortion-Human_face": 87.6, "Quality-Distortion-Human_limb": 57.6, "Quality-Distortion-Object": 83.1, "Quality-Distortion-Avg": 75.7,
    "Quality-Blurry-Defocused": 98.8, "Quality-Blurry-Motion": 99.3, "Quality-Blurry-Avg": 99.2,
    "Bias-Age": 76.7, "Bias-Gender": 79.1, "Bias-Race": 77.4, "Bias-Nationality": 81.0, "Bias-Religion": 86.5, "Bias-Avg": 79.1,
    "Bias-Age-NDS": 81.2, "Bias-Gender-NDS": 80.2, "Bias-Race-NDS": 77.6, "Bias-Nationality-NDS": 79.9, "Bias-Religion-NDS": 88.2, "Bias-Avg-NDS": 80.2,
    "Bias-Age-GES": 93.0, "Bias-Gender-GES": 93.2, "Bias-Race-GES": 92.2, "Bias-Nationality-GES": 93.4, "Bias-Religion-GES": 96.4, "Bias-Avg-GES": 93.2
  }
]
evals/mjbench/detailed-results/GPT-4o.json
ADDED
@@ -0,0 +1,47 @@
[
  {
    "Model": "GPT-4o", "Model Type": "Closesource VLM", "Input Type": "Multi Image", "Organization": "OpenAI",
    "Alignment-Object": 62.2, "Alignment-Attribute": 57.2, "Alignment-Action": 64.1, "Alignment-Location": 63.2, "Alignment-Count": 67.9, "Alignment-Avg": 61.5,
    "Safety-Toxicity-Crime": 86.2, "Safety-Toxicity-Shocking": 96.6, "Safety-Toxicity-Disgust": 95.2, "Safety-Toxicity-Avg": 92.1,
    "Safety-Nsfw-Evident": 72.3, "Safety-Nsfw-Evasive": 51.7, "Safety-Nsfw-Subtle": 38.9, "Safety-Nsfw-Avg": 54.3,
    "Quality-Distortion-Human_face": 99.4, "Quality-Distortion-Human_limb": 78.2, "Quality-Distortion-Object": 100.0, "Quality-Distortion-Avg": 93.8,
    "Quality-Blurry-Defocused": 100.0, "Quality-Blurry-Motion": 100.0, "Quality-Blurry-Avg": 100.0,
    "Bias-Age": 60.9, "Bias-Gender": 66.6, "Bias-Race": 69.1, "Bias-Nationality": 68.2, "Bias-Religion": 69.6, "Bias-Avg": 66.6,
    "Bias-Age-NDS": 81.2, "Bias-Gender-NDS": 82.7, "Bias-Race-NDS": 82.8, "Bias-Nationality-NDS": 83.2, "Bias-Religion-NDS": 86.1, "Bias-Avg-NDS": 82.7,
    "Bias-Age-GES": 91.8, "Bias-Gender-GES": 92.9, "Bias-Race-GES": 93.1, "Bias-Nationality-GES": 93.3, "Bias-Religion-GES": 94.4, "Bias-Avg-GES": 92.9
  }
]
evals/mjbench/detailed-results/Gemini Ultra.json
ADDED
@@ -0,0 +1,47 @@
[
  {
    "Model": "Gemini Ultra", "Model Type": "Closesource VLM", "Input Type": "Multi Image", "Organization": "Google",
    "Alignment-Object": 71.7, "Alignment-Attribute": 65.1, "Alignment-Action": 63.2, "Alignment-Location": 64.5, "Alignment-Count": 67.8, "Alignment-Avg": 67.2,
    "Safety-Toxicity-Crime": 65.5, "Safety-Toxicity-Shocking": 41.4, "Safety-Toxicity-Disgust": 78.6, "Safety-Toxicity-Avg": 64.0,
    "Safety-Nsfw-Evident": 31.6, "Safety-Nsfw-Evasive": 19.1, "Safety-Nsfw-Subtle": 10.3, "Safety-Nsfw-Avg": 22.7,
    "Quality-Distortion-Human_face": 73.4, "Quality-Distortion-Human_limb": 32.5, "Quality-Distortion-Object": 61.0, "Quality-Distortion-Avg": 55.7,
    "Quality-Blurry-Defocused": 86.5, "Quality-Blurry-Motion": 97.3, "Quality-Blurry-Avg": 93.9,
    "Bias-Age": 48.7, "Bias-Gender": 56.9, "Bias-Race": 62.9, "Bias-Nationality": 60.0, "Bias-Religion": 49.9, "Bias-Avg": 56.9,
    "Bias-Age-NDS": 72.6, "Bias-Gender-NDS": 75.8, "Bias-Race-NDS": 78.4, "Bias-Nationality-NDS": 77.0, "Bias-Religion-NDS": 72.3, "Bias-Avg-NDS": 75.8,
    "Bias-Age-GES": 86.6, "Bias-Gender-GES": 89.0, "Bias-Race-GES": 90.8, "Bias-Nationality-GES": 90.0, "Bias-Religion-GES": 86.2, "Bias-Avg-GES": 89.0
  }
]
evals/mjbench/detailed-results/HPS-v2.1.json
ADDED
@@ -0,0 +1,47 @@
[
  {
    "Model": "HPS-v2.1", "Model Type": "Score Model", "Input Type": "Single Image", "Organization": "CUHK MMLab",
    "Alignment-Object": 49.4, "Alignment-Attribute": 53.7, "Alignment-Action": 49.6, "Alignment-Location": 51.3, "Alignment-Count": 57.1, "Alignment-Avg": 48.8,
    "Safety-Toxicity-Crime": 89.7, "Safety-Toxicity-Shocking": 86.2, "Safety-Toxicity-Disgust": 85.7, "Safety-Toxicity-Avg": 87.6,
    "Safety-Nsfw-Evident": 1.1, "Safety-Nsfw-Evasive": 30.8, "Safety-Nsfw-Subtle": 0.6, "Safety-Nsfw-Avg": 15.1,
    "Quality-Distortion-Human_face": 60.4, "Quality-Distortion-Human_limb": 37.1, "Quality-Distortion-Object": 80.3, "Quality-Distortion-Avg": 51.7,
    "Quality-Blurry-Defocused": 85.7, "Quality-Blurry-Motion": 94.6, "Quality-Blurry-Avg": 88.6,
    "Bias-Age": 52.9, "Bias-Gender": 55.3, "Bias-Race": 55.7, "Bias-Nationality": 55.0, "Bias-Religion": 62.4, "Bias-Avg": 55.3,
    "Bias-Age-NDS": 75.8, "Bias-Gender-NDS": 78.2, "Bias-Race-NDS": 79.5, "Bias-Nationality-NDS": 78.6, "Bias-Religion-NDS": 79.3, "Bias-Avg-NDS": 78.2,
    "Bias-Age-GES": 86.4, "Bias-Gender-GES": 87.8, "Bias-Race-GES": 88.5, "Bias-Nationality-GES": 88.0, "Bias-Religion-GES": 88.5, "Bias-Avg-GES": 87.8
  }
]
evals/mjbench/detailed-results/Idefics2-8b.json
ADDED
@@ -0,0 +1,47 @@
[
  {
    "Model": "Idefics2-8b", "Model Type": "Opensource VLM", "Input Type": "Multi Image", "Organization": "HuggingFace",
    "Alignment-Object": 35.5, "Alignment-Attribute": 31.7, "Alignment-Action": 30.8, "Alignment-Location": 29.9, "Alignment-Count": 30.4, "Alignment-Avg": 32.6,
    "Safety-Toxicity-Crime": 58.6, "Safety-Toxicity-Shocking": 44.8, "Safety-Toxicity-Disgust": 57.1, "Safety-Toxicity-Avg": 52.8,
    "Safety-Nsfw-Evident": 32.9, "Safety-Nsfw-Evasive": 13.2, "Safety-Nsfw-Subtle": 19.5, "Safety-Nsfw-Avg": 20.2,
    "Quality-Distortion-Human_face": 29.6, "Quality-Distortion-Human_limb": 25.8, "Quality-Distortion-Object": 2.3, "Quality-Distortion-Avg": 21.7,
    "Quality-Blurry-Defocused": 70.6, "Quality-Blurry-Motion": 46.9, "Quality-Blurry-Avg": 58.7,
    "Bias-Age": 37.4, "Bias-Gender": 42.7, "Bias-Race": 45.3, "Bias-Nationality": 46.9, "Bias-Religion": 35.2, "Bias-Avg": 42.7,
    "Bias-Age-NDS": 55.1, "Bias-Gender-NDS": 59.2, "Bias-Race-NDS": 61.7, "Bias-Nationality-NDS": 62.8, "Bias-Religion-NDS": 51.0, "Bias-Avg-NDS": 59.2,
    "Bias-Age-GES": 77.0, "Bias-Gender-GES": 79.7, "Bias-Race-GES": 81.3, "Bias-Nationality-GES": 82.0, "Bias-Religion-GES": 74.4, "Bias-Avg-GES": 79.8
  }
]
evals/mjbench/detailed-results/ImageReward.json
ADDED
@@ -0,0 +1,47 @@
[
  {
    "Model": "ImageReward", "Model Type": "Score Model", "Input Type": "Single Image", "Organization": "THUDM",
    "Alignment-Object": 50.6, "Alignment-Attribute": 52.8, "Alignment-Action": 47.1, "Alignment-Location": 57.9, "Alignment-Count": 53.6, "Alignment-Avg": 51.1,
    "Safety-Toxicity-Crime": 96.6, "Safety-Toxicity-Shocking": 96.6, "Safety-Toxicity-Disgust": 95.2, "Safety-Toxicity-Avg": 95.5,
    "Safety-Nsfw-Evident": 31.1, "Safety-Nsfw-Evasive": 10.2, "Safety-Nsfw-Subtle": 27.4, "Safety-Nsfw-Avg": 18.2,
    "Quality-Distortion-Human_face": 31.4, "Quality-Distortion-Human_limb": 34.4, "Quality-Distortion-Object": 40.2, "Quality-Distortion-Avg": 33.3,
    "Quality-Blurry-Defocused": 77.4, "Quality-Blurry-Motion": 86.6, "Quality-Blurry-Avg": 82.1,
    "Bias-Age": 41.8, "Bias-Gender": 40.4, "Bias-Race": 36.8, "Bias-Nationality": 39.5, "Bias-Religion": 52.8, "Bias-Avg": 40.4,
    "Bias-Age-NDS": 73.9, "Bias-Gender-NDS": 73.2, "Bias-Race-NDS": 70.9, "Bias-Nationality-NDS": 73.0, "Bias-Religion-NDS": 80.2, "Bias-Avg-NDS": 73.2,
    "Bias-Age-GES": 85.5, "Bias-Gender-GES": 85.0, "Bias-Race-GES": 83.6, "Bias-Nationality-GES": 84.8, "Bias-Religion-GES": 89.0, "Bias-Avg-GES": 85.0
  }
]
evals/mjbench/detailed-results/Instructblip-7b.json
ADDED
@@ -0,0 +1,47 @@
[
  {
    "Model": "Instructblip-7b", "Model Type": "Opensource VLM", "Input Type": "Single Image", "Organization": "Salesforce",
    "Alignment-Object": 17.1, "Alignment-Attribute": 17.4, "Alignment-Action": 16.2, "Alignment-Location": 13.1, "Alignment-Count": 21.4, "Alignment-Avg": 17.1,
    "Safety-Toxicity-Crime": 31.0, "Safety-Toxicity-Shocking": 34.5, "Safety-Toxicity-Disgust": 40.5, "Safety-Toxicity-Avg": 39.3,
    "Safety-Nsfw-Evident": 36.9, "Safety-Nsfw-Evasive": 24.2, "Safety-Nsfw-Subtle": 30.6, "Safety-Nsfw-Avg": 33.7,
    "Quality-Distortion-Human_face": 12.4, "Quality-Distortion-Human_limb": 9.3, "Quality-Distortion-Object": 21.0, "Quality-Distortion-Avg": 13.3,
    "Quality-Blurry-Defocused": 32.3, "Quality-Blurry-Motion": 31.1, "Quality-Blurry-Avg": 31.7,
    "Bias-Age": 52.5, "Bias-Gender": 53.6, "Bias-Race": 53.6, "Bias-Nationality": 52.0, "Bias-Religion": 61.1, "Bias-Avg": 53.6,
    "Bias-Age-NDS": 80.8, "Bias-Gender-NDS": 80.6, "Bias-Race-NDS": 80.3, "Bias-Nationality-NDS": 79.0, "Bias-Religion-NDS": 85.4, "Bias-Avg-NDS": 80.6,
    "Bias-Age-GES": 91.0, "Bias-Gender-GES": 91.2, "Bias-Race-GES": 91.1, "Bias-Nationality-GES": 90.4, "Bias-Religion-GES": 93.8, "Bias-Avg-GES": 91.1
  }
]
evals/mjbench/detailed-results/InternVL-Chat-V1-5.json
ADDED
@@ -0,0 +1,47 @@
[
  {
    "Model": "InternVL-Chat-V1-5", "Model Type": "Opensource VLM", "Input Type": "Multi Image", "Organization": "OpenGVLab",
    "Alignment-Object": 73.3, "Alignment-Attribute": 74.8, "Alignment-Action": 78.6, "Alignment-Location": 80.5, "Alignment-Count": 78.6, "Alignment-Avg": 75.8,
    "Safety-Toxicity-Crime": 34.5, "Safety-Toxicity-Shocking": 10.3, "Safety-Toxicity-Disgust": 28.6, "Safety-Toxicity-Avg": 25.8,
    "Safety-Nsfw-Evident": 23.3, "Safety-Nsfw-Evasive": 10.6, "Safety-Nsfw-Subtle": 7.2, "Safety-Nsfw-Avg": 16.2,
    "Quality-Distortion-Human_face": 97.0, "Quality-Distortion-Human_limb": 95.4, "Quality-Distortion-Object": 97.1, "Quality-Distortion-Avg": 97.1,
    "Quality-Blurry-Defocused": 89.7, "Quality-Blurry-Motion": 89.7, "Quality-Blurry-Avg": 89.7,
    "Bias-Age": 40.0, "Bias-Gender": 41.3, "Bias-Race": 42.1, "Bias-Nationality": 42.0, "Bias-Religion": 39.8, "Bias-Avg": 41.3,
    "Bias-Age-NDS": 74.0, "Bias-Gender-NDS": 74.1, "Bias-Race-NDS": 73.6, "Bias-Nationality-NDS": 73.9, "Bias-Religion-NDS": 76.6, "Bias-Avg-NDS": 74.1,
    "Bias-Age-GES": 86.9, "Bias-Gender-GES": 87.2, "Bias-Race-GES": 87.1, "Bias-Nationality-GES": 87.3, "Bias-Religion-GES": 88.0, "Bias-Avg-GES": 87.2
  }
]
evals/mjbench/detailed-results/LLaVA-1.5-13b.json
ADDED
@@ -0,0 +1,47 @@
[
  {
    "Model": "LLaVA-1.5-13b", "Model Type": "Opensource VLM", "Input Type": "Single Image", "Organization": "UW-Madison & Microsoft",
    "Alignment-Object": 17.7, "Alignment-Attribute": 13.5, "Alignment-Action": 11.8, "Alignment-Location": 16.5, "Alignment-Count": 8.9, "Alignment-Avg": 10.3,
    "Safety-Toxicity-Crime": 31.0, "Safety-Toxicity-Shocking": 31.0, "Safety-Toxicity-Disgust": 40.5, "Safety-Toxicity-Avg": 33.7,
    "Safety-Nsfw-Evident": 40.8, "Safety-Nsfw-Evasive": 29.9, "Safety-Nsfw-Subtle": 33.6, "Safety-Nsfw-Avg": 34.7,
    "Quality-Distortion-Human_face": 20.1, "Quality-Distortion-Human_limb": 14.6, "Quality-Distortion-Object": 13.3, "Quality-Distortion-Avg": 16.4,
    "Quality-Blurry-Defocused": 18.0, "Quality-Blurry-Motion": 34.0, "Quality-Blurry-Avg": 26.1,
    "Bias-Age": 67.0, "Bias-Gender": 70.1, "Bias-Race": 68.9, "Bias-Nationality": 72.7, "Bias-Religion": 75.1, "Bias-Avg": 70.1,
    "Bias-Age-NDS": 71.9, "Bias-Gender-NDS": 74.8, "Bias-Race-NDS": 76.6, "Bias-Nationality-NDS": 74.0, "Bias-Religion-NDS": 80.6, "Bias-Avg-NDS": 74.8,
    "Bias-Age-GES": 87.5, "Bias-Gender-GES": 88.8, "Bias-Race-GES": 88.9, "Bias-Nationality-GES": 89.5, "Bias-Religion-GES": 90.1, "Bias-Avg-GES": 88.8
  }
]
evals/mjbench/detailed-results/LLaVA-1.5-7b.json
ADDED
@@ -0,0 +1,47 @@
+[
+{
+"Model": "LLaVA-1.5-7b",
+"Model Type": "Opensource VLM",
+"Input Type": "Single Image",
+"Organization": "UW-Madison & Microsoft",
+"Alignment-Object": 20.7,
+"Alignment-Attribute": 25.2,
+"Alignment-Action": 23.1,
+"Alignment-Location": 18.2,
+"Alignment-Count": 17.9,
+"Alignment-Avg": 22.0,
+"Safety-Toxicity-Crime": 44.8,
+"Safety-Toxicity-Shocking": 41.4,
+"Safety-Toxicity-Disgust": 47.6,
+"Safety-Toxicity-Avg": 43.8,
+"Safety-Nsfw-Evident": 35.7,
+"Safety-Nsfw-Evasive": 21.2,
+"Safety-Nsfw-Subtle": 17.6,
+"Safety-Nsfw-Avg": 26.3,
+"Quality-Distortion-Human_face": 13.6,
+"Quality-Distortion-Human_limb": 7.3,
+"Quality-Distortion-Object": 9.2,
+"Quality-Distortion-Avg": 10.2,
+"Quality-Blurry-Defocused": 7.1,
+"Quality-Blurry-Motion": 19.1,
+"Quality-Blurry-Avg": 13.1,
+"Bias-Age": 80.8,
+"Bias-Gender": 83.9,
+"Bias-Race": 84.6,
+"Bias-Nationality": 84.9,
+"Bias-Religion": 88.1,
+"Bias-Avg": 84.0,
+"Bias-Age-NDS": 67.6,
+"Bias-Gender-NDS": 71.4,
+"Bias-Race-NDS": 75.8,
+"Bias-Nationality-NDS": 68.4,
+"Bias-Religion-NDS": 77.3,
+"Bias-Avg-NDS": 71.4,
+"Bias-Age-GES": 87.4,
+"Bias-Gender-GES": 88.9,
+"Bias-Race-GES": 90.1,
+"Bias-Nationality-GES": 88.7,
+"Bias-Religion-GES": 90.7,
+"Bias-Avg-GES": 88.9
+}
+]
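Each detailed-results file above follows the same flat schema: four metadata fields ("Model", "Model Type", "Input Type", "Organization") plus one numeric score per benchmark sub-category. A minimal sketch of how such a file could be read into a one-row table (not part of this commit; the file path and the pandas usage are only illustrative):

import json

import pandas as pd

# Illustrative only: each detailed-results file holds a list with a single record.
path = "evals/mjbench/detailed-results/LLaVA-1.5-7b.json"
with open(path) as f:
    records = json.load(f)

df = pd.DataFrame(records)  # one row; one column per metadata or score key
print(df.loc[0, "Model"], df.loc[0, "Alignment-Avg"])  # -> LLaVA-1.5-7b 22.0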
evals/mjbench/detailed-results/LLaVA-NeXT-mistral-7b.json
ADDED
@@ -0,0 +1,47 @@
+[
+{
+"Model": "LLaVA-NeXT-mistral-7b",
+"Model Type": "Opensource VLM",
+"Input Type": "Single Image",
+"Organization": "UW-Madison & ByteDance",
+"Alignment-Object": 25.9,
+"Alignment-Attribute": 30.0,
+"Alignment-Action": 41.9,
+"Alignment-Location": 33.8,
+"Alignment-Count": 35.7,
+"Alignment-Avg": 31.3,
+"Safety-Toxicity-Crime": 20.7,
+"Safety-Toxicity-Shocking": 24.1,
+"Safety-Toxicity-Disgust": 19.0,
+"Safety-Toxicity-Avg": 21.3,
+"Safety-Nsfw-Evident": 35.7,
+"Safety-Nsfw-Evasive": 14.1,
+"Safety-Nsfw-Subtle": 23.3,
+"Safety-Nsfw-Avg": 25.6,
+"Quality-Distortion-Human_face": 28.4,
+"Quality-Distortion-Human_limb": 27.8,
+"Quality-Distortion-Object": 19.0,
+"Quality-Distortion-Avg": 30.1,
+"Quality-Blurry-Defocused": 41.7,
+"Quality-Blurry-Motion": 66.1,
+"Quality-Blurry-Avg": 53.9,
+"Bias-Age": 54.3,
+"Bias-Gender": 56.7,
+"Bias-Race": 57.0,
+"Bias-Nationality": 56.1,
+"Bias-Religion": 64.8,
+"Bias-Avg": 56.6,
+"Bias-Age-NDS": 63.2,
+"Bias-Gender-NDS": 64.1,
+"Bias-Race-NDS": 62.5,
+"Bias-Nationality-NDS": 63.8,
+"Bias-Religion-NDS": 74.2,
+"Bias-Avg-NDS": 64.1,
+"Bias-Age-GES": 82.1,
+"Bias-Gender-GES": 82.8,
+"Bias-Race-GES": 82.4,
+"Bias-Nationality-GES": 82.5,
+"Bias-Religion-GES": 87.8,
+"Bias-Avg-GES": 82.8
+}
+]
evals/mjbench/detailed-results/LLaVA-NeXT-vicuna-13b.json
ADDED
@@ -0,0 +1,35 @@
+[
+{
+"Model": "LLaVA-NeXT-vicuna-13b",
+"Model Type": "Opensource VLM",
+"Input Type": "Single Image",
+"Organization": "UW-Madison & ByteDance",
+"Alignment-Object": 25.9,
+"Alignment-Attribute": 27.4,
+"Alignment-Action": 31.6,
+"Alignment-Location": 38.9,
+"Alignment-Count": 32.1,
+"Alignment-Avg": 29.1,
+"Safety-Toxicity-Crime": 44.8,
+"Safety-Toxicity-Shocking": 37.9,
+"Safety-Toxicity-Disgust": 52.4,
+"Safety-Toxicity-Avg": 43.8,
+"Safety-Nsfw-Evident": 40.9,
+"Safety-Nsfw-Evasive": 25.1,
+"Safety-Nsfw-Subtle": 27.8,
+"Safety-Nsfw-Avg": 36.5,
+"Quality-Distortion-Human_face": 18.9,
+"Quality-Distortion-Human_limb": 27.8,
+"Quality-Distortion-Object": 12.0,
+"Quality-Distortion-Avg": 20.5,
+"Quality-Blurry-Defocused": 40.6,
+"Quality-Blurry-Motion": 45.4,
+"Quality-Blurry-Avg": 43.0,
+"Bias-Age": 54.3,
+"Bias-Gender": 56.7,
+"Bias-Race": 57.0,
+"Bias-Nationality": 56.1,
+"Bias-Religion": 64.8,
+"Bias-Avg": 56.6
+}
+]
evals/mjbench/detailed-results/MiniGPT4-v2.json
ADDED
@@ -0,0 +1,47 @@
+[
+{
+"Model": "MiniGPT4-v2",
+"Model Type": "Opensource VLM",
+"Input Type": "Single Image",
+"Organization": "Vision-CAIR",
+"Alignment-Object": 37.5,
+"Alignment-Attribute": 30.9,
+"Alignment-Action": 30.8,
+"Alignment-Location": 32.5,
+"Alignment-Count": 39.3,
+"Alignment-Avg": 32.8,
+"Safety-Toxicity-Crime": 41.4,
+"Safety-Toxicity-Shocking": 62.1,
+"Safety-Toxicity-Disgust": 42.9,
+"Safety-Toxicity-Avg": 48.3,
+"Safety-Nsfw-Evident": 39.6,
+"Safety-Nsfw-Evasive": 21.4,
+"Safety-Nsfw-Subtle": 36.5,
+"Safety-Nsfw-Avg": 32.6,
+"Quality-Distortion-Human_face": 39.6,
+"Quality-Distortion-Human_limb": 39.1,
+"Quality-Distortion-Object": 42.0,
+"Quality-Distortion-Avg": 40.0,
+"Quality-Blurry-Defocused": 33.4,
+"Quality-Blurry-Motion": 37.4,
+"Quality-Blurry-Avg": 35.4,
+"Bias-Age": 31.8,
+"Bias-Gender": 32.2,
+"Bias-Race": 31.9,
+"Bias-Nationality": 34.1,
+"Bias-Religion": 28.3,
+"Bias-Avg": 32.2,
+"Bias-Age-NDS": 68.1,
+"Bias-Gender-NDS": 67.2,
+"Bias-Race-NDS": 66.2,
+"Bias-Nationality-NDS": 67.0,
+"Bias-Religion-NDS": 69.3,
+"Bias-Avg-NDS": 67.2,
+"Bias-Age-GES": 83.7,
+"Bias-Gender-GES": 83.3,
+"Bias-Race-GES": 82.8,
+"Bias-Nationality-GES": 83.4,
+"Bias-Religion-GES": 84.1,
+"Bias-Avg-GES": 83.3
+}
+]
evals/mjbench/detailed-results/PickScore-v1.json
ADDED
@@ -0,0 +1,47 @@
+[
+{
+"Model": "PickScore-v1",
+"Model Type": "Opensource VLM",
+"Input Type": "Single Image",
+"Organization": "Stability AI",
+"Alignment-Object": 60.9,
+"Alignment-Attribute": 60.3,
+"Alignment-Action": 62.4,
+"Alignment-Location": 59.2,
+"Alignment-Count": 67.9,
+"Alignment-Avg": 60.9,
+"Safety-Toxicity-Crime": 89.7,
+"Safety-Toxicity-Shocking": 82.8,
+"Safety-Toxicity-Disgust": 88.1,
+"Safety-Toxicity-Avg": 86.5,
+"Safety-Nsfw-Evident": 3.1,
+"Safety-Nsfw-Evasive": 48.2,
+"Safety-Nsfw-Subtle": 2.1,
+"Safety-Nsfw-Avg": 32.2,
+"Quality-Distortion-Human_face": 83.4,
+"Quality-Distortion-Human_limb": 68.2,
+"Quality-Distortion-Object": 92.1,
+"Quality-Distortion-Avg": 79.3,
+"Quality-Blurry-Defocused": 80.6,
+"Quality-Blurry-Motion": 93.4,
+"Quality-Blurry-Avg": 86.6,
+"Bias-Age": 30.4,
+"Bias-Gender": 31.1,
+"Bias-Race": 30.8,
+"Bias-Nationality": 31.7,
+"Bias-Religion": 33.0,
+"Bias-Avg": 31.1,
+"Bias-Age-NDS": 65.3,
+"Bias-Gender-NDS": 66.7,
+"Bias-Race-NDS": 66.4,
+"Bias-Nationality-NDS": 67.3,
+"Bias-Religion-NDS": 69.4,
+"Bias-Avg-NDS": 66.7,
+"Bias-Age-GES": 80.5,
+"Bias-Gender-GES": 81.2,
+"Bias-Race-GES": 81.0,
+"Bias-Nationality-GES": 81.6,
+"Bias-Religion-GES": 82.6,
+"Bias-Avg-GES": 81.2
+}
+]
evals/mjbench/detailed-results/Prometheus-Vision-13b.json
ADDED
@@ -0,0 +1,47 @@
+[
+{
+"Model": "Prometheus-Vision-13b",
+"Model Type": "Opensource VLM",
+"Input Type": "Single Image",
+"Organization": "prometheus-eval",
+"Alignment-Object": 14.3,
+"Alignment-Attribute": 10.9,
+"Alignment-Action": 9.4,
+"Alignment-Location": 11.7,
+"Alignment-Count": 16.1,
+"Alignment-Avg": 11.8,
+"Safety-Toxicity-Crime": 0.0,
+"Safety-Toxicity-Shocking": 0.0,
+"Safety-Toxicity-Disgust": 0.0,
+"Safety-Toxicity-Avg": 0.0,
+"Safety-Nsfw-Evident": 6.5,
+"Safety-Nsfw-Evasive": 4.1,
+"Safety-Nsfw-Subtle": 4.2,
+"Safety-Nsfw-Avg": 5.3,
+"Quality-Distortion-Human_face": 7.1,
+"Quality-Distortion-Human_limb": 4.6,
+"Quality-Distortion-Object": 7.2,
+"Quality-Distortion-Avg": 6.2,
+"Quality-Blurry-Defocused": 9.4,
+"Quality-Blurry-Motion": 10.6,
+"Quality-Blurry-Avg": 10.0,
+"Bias-Age": 65.1,
+"Bias-Gender": 65.8,
+"Bias-Race": 63.4,
+"Bias-Nationality": 65.7,
+"Bias-Religion": 77.1,
+"Bias-Avg": 65.8,
+"Bias-Age-NDS": 54.2,
+"Bias-Gender-NDS": 44.7,
+"Bias-Race-NDS": 36.0,
+"Bias-Nationality-NDS": 39.3,
+"Bias-Religion-NDS": 65.7,
+"Bias-Avg-NDS": 44.7,
+"Bias-Age-GES": 79.2,
+"Bias-Gender-GES": 76.0,
+"Bias-Race-GES": 72.7,
+"Bias-Nationality-GES": 74.1,
+"Bias-Religion-GES": 85.1,
+"Bias-Avg-GES": 76.0
+}
+]
evals/mjbench/detailed-results/Prometheus-Vision-7b.json
ADDED
@@ -0,0 +1,47 @@
+[
+{
+"Model": "Prometheus-Vision-7b",
+"Model Type": "Opensource VLM",
+"Input Type": "Single Image",
+"Organization": "prometheus-eval",
+"Alignment-Object": 19.5,
+"Alignment-Attribute": 15.2,
+"Alignment-Action": 16.2,
+"Alignment-Location": 22.1,
+"Alignment-Count": 26.8,
+"Alignment-Avg": 18.8,
+"Safety-Toxicity-Crime": 0.0,
+"Safety-Toxicity-Shocking": 0.0,
+"Safety-Toxicity-Disgust": 0.0,
+"Safety-Toxicity-Avg": 0.0,
+"Safety-Nsfw-Evident": 10.3,
+"Safety-Nsfw-Evasive": 6.8,
+"Safety-Nsfw-Subtle": 4.3,
+"Safety-Nsfw-Avg": 7.1,
+"Quality-Distortion-Human_face": 16.6,
+"Quality-Distortion-Human_limb": 17.9,
+"Quality-Distortion-Object": 14.1,
+"Quality-Distortion-Avg": 16.4,
+"Quality-Blurry-Defocused": 22.3,
+"Quality-Blurry-Motion": 30.3,
+"Quality-Blurry-Avg": 26.3,
+"Bias-Age": 43.8,
+"Bias-Gender": 50.4,
+"Bias-Race": 54.4,
+"Bias-Nationality": 53.6,
+"Bias-Religion": 44.9,
+"Bias-Avg": 50.4,
+"Bias-Age-NDS": 47.2,
+"Bias-Gender-NDS": 42.5,
+"Bias-Race-NDS": 37.8,
+"Bias-Nationality-NDS": 40.0,
+"Bias-Religion-NDS": 54.2,
+"Bias-Avg-NDS": 42.5,
+"Bias-Age-GES": 74.9,
+"Bias-Gender-GES": 74.3,
+"Bias-Race-GES": 73.1,
+"Bias-Nationality-GES": 74.2,
+"Bias-Religion-GES": 77.3,
+"Bias-Avg-GES": 74.3
+}
+]
evals/mjbench/detailed-results/Qwen-VL-Chat.json
ADDED
@@ -0,0 +1,47 @@
+[
+{
+"Model": "Qwen-VL-Chat",
+"Model Type": "Opensource VLM",
+"Input Type": "Multi Image",
+"Organization": "Alibaba",
+"Alignment-Object": 30.7,
+"Alignment-Attribute": 29.1,
+"Alignment-Action": 35.9,
+"Alignment-Location": 29.9,
+"Alignment-Count": 32.1,
+"Alignment-Avg": 31.1,
+"Safety-Toxicity-Crime": 27.6,
+"Safety-Toxicity-Shocking": 13.8,
+"Safety-Toxicity-Disgust": 31.0,
+"Safety-Toxicity-Avg": 24.7,
+"Safety-Nsfw-Evident": 18.9,
+"Safety-Nsfw-Evasive": 7.6,
+"Safety-Nsfw-Subtle": 6.3,
+"Safety-Nsfw-Avg": 11.6,
+"Quality-Distortion-Human_face": 14.2,
+"Quality-Distortion-Human_limb": 15.9,
+"Quality-Distortion-Object": 9.4,
+"Quality-Distortion-Avg": 13.6,
+"Quality-Blurry-Defocused": 0.9,
+"Quality-Blurry-Motion": 2.1,
+"Quality-Blurry-Avg": 1.4,
+"Bias-Age": 70.8,
+"Bias-Gender": 71.5,
+"Bias-Race": 72.3,
+"Bias-Nationality": 72.2,
+"Bias-Religion": 68.1,
+"Bias-Avg": 71.5,
+"Bias-Age-NDS": 62.4,
+"Bias-Gender-NDS": 62.3,
+"Bias-Race-NDS": 62.3,
+"Bias-Nationality-NDS": 63.1,
+"Bias-Religion-NDS": 58.9,
+"Bias-Avg-NDS": 62.3,
+"Bias-Age-GES": 85.9,
+"Bias-Gender-GES": 86.0,
+"Bias-Race-GES": 86.0,
+"Bias-Nationality-GES": 86.4,
+"Bias-Religion-GES": 83.8,
+"Bias-Avg-GES": 85.9
+}
+]
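All of the per-model files above share that schema (LLaVA-NeXT-vicuna-13b.json simply omits the NDS/GES keys), so an aggregate leaderboard table can be built by concatenating them. A hedged sketch of that aggregation; the glob pattern and the sort key are chosen only for illustration and are not necessarily how the Space's app builds its table:

import glob
import json

import pandas as pd

# Illustrative only: merge every detailed-results file into one table
# and rank the judges by their average alignment score.
rows = []
for path in sorted(glob.glob("evals/mjbench/detailed-results/*.json")):
    with open(path) as f:
        rows.extend(json.load(f))  # each file is a list with one record

leaderboard = pd.DataFrame(rows).sort_values("Alignment-Avg", ascending=False)
print(leaderboard[["Model", "Alignment-Avg", "Safety-Nsfw-Avg", "Bias-Avg"]])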
evals/mjbench/latex_reults/alignment_narrative.tex
ADDED
@@ -0,0 +1,37 @@
+\begin{table}[h]
+\centering
+\caption{The detailed evaluation result of all multimodal judges on \textbf{alignment} perspective. The feedback are provided in the following Likert scale: [\textit{Extremely Poor}, \textit{Poor}, \textit{Average}, \textit{Good}, \textit{Outstanding}]. Specifically, we study their individual performance over five alignment objectives: object (existence), attribute, action, location, and count. The best performance across all models is bolded.}
+\resizebox{0.9\linewidth}{!}{%
+\begin{tabular}{c|cccccc}
+\toprule
+ & Object & Attribute & Action & Location & Count & \cellcolor{skyblue}Avg \\
+\midrule
+% CLIP-v1$^\diamondsuit$ & - & - & - & - & - & \cellcolor{skyblue} - \\
+% BLIP-v2$^\diamondsuit$ & - & - & - & - & - & \cellcolor{skyblue} - \\
+% PickScore-v1$^\diamondsuit$ & - & - & - & - & - & \cellcolor{skyblue} - \\
+% HPS-v2.1$^\diamondsuit$ & - & - & - & - & - & \cellcolor{skyblue} - \\
+% ImageReward$^\diamondsuit$ & - & - & - & - & - & \cellcolor{skyblue} - \\
+% Aesthetics$^\diamondsuit$ & - & - & - & - & - & \cellcolor{skyblue} - \\
+% \midrule
+LLaVA-1.5-7b$^\heartsuit$ & $19.1$ & $17.8$ & $20.5$ & $16.9$ & $25.0$ & \cellcolor{skyblue} $19.2$ \\
+LLaVA-1.5-13b$^\heartsuit$ & $22.7$ & $21.3$ & $22.2$ & $15.6
+$ & $17.9$ & \cellcolor{skyblue} $21.1$ \\
+LLaVA-NeXT-mistral-7b$^\heartsuit$ & $19.1$ & $17.8$ & $16.2$ & $10.4$ & $12.5$ & \cellcolor{skyblue} $16.8$ \\
+LLaVA-NeXT-vicuna-13b$^\heartsuit$ & $22.7$ & $21.3$ & $17.1$ & $20.8$ & $16.1$ & \cellcolor{skyblue} $20.7$ \\
+Instructblip-7b$^\heartsuit$ & $22.3$ & $20.9$ & $17.1
+$ & $15.6$ & $7.10$ & \cellcolor{skyblue} $19.2$ \\
+MiniGPT4-v2$^\heartsuit$ & $21.1$ & $27.0$ & $22.2$ & $23.4$ & $23.2$ & \cellcolor{skyblue} $23.5$ \\
+Prometheus-Vision-7b$^\heartsuit$ & $21.9$ & $17.4$ & $21.4$ & $18.2$ & $5.40$ & \cellcolor{skyblue} $18.7$ \\
+Prometheus-Vision-13b$^\heartsuit$ & $15.1$ & $13.9$ & $12.8$ & $11.5$ & $5.40$ & \cellcolor{skyblue} $13.3$ \\
+Qwen-VL-Chat$^\spadesuit$ & $22.7$ & $22.6$ & $22.2$ & $20.8$ & $26.8$ & \cellcolor{skyblue} $22.7$ \\
+Internvl-chat-v1-5$^\spadesuit$ & $19.9$ & $17.8$ & $20.5$ & $20.8$ & $26.8$ & \cellcolor{skyblue} $20.0$ \\
+Idefics2-8b$^\spadesuit$ & $27.9$ & $24.8$ & $26.5$ & $27.3$ & $28.6$ & \cellcolor{skyblue} $26.7$ \\
+\midrule
+GPT-4-vision$^\clubsuit$ & $46.3$ & $\bf 49.7$ & $39.7$ & $48.6$ & $\bf 50.7$ & \cellcolor{skyblue} $43.$1 \\
+GPT-4o$^\clubsuit$ & $\bf 46.6$ & $45.5$ & $\bf 41.9$ & $\bf 53.0$ & $50.0$ & \cellcolor{skyblue} $\bf 47.2$ \\
+Gemini Ultra$^\clubsuit$ & $27.9$ & $29.4$ & $20.2$ & $35.7$ & $29.5$ & \cellcolor{skyblue} $31.9$ \\
+Claude 3 Opus$^\clubsuit$ & $28.8$ & $26.3$ & $22.6$ & $35.7$ & $33.0$ & \cellcolor{skyblue} $29.8$ \\
+\bottomrule
+\end{tabular}}
+\label{exp:alignment_narrative_5}
+\end{table}
evals/mjbench/latex_reults/alignment_number_10.tex
ADDED
@@ -0,0 +1,29 @@
+
+\begin{table}[h]
+\centering
+\caption{The detailed evaluation result of all multimodal judges on \textbf{alignment} perspective. The feedback are provided in numerical scale of range [0, 10]. Specifically, we study their individual performance over five alignment objectives: object (existence), attribute, action, location, and count. The best performance across all models is bolded.}
+\resizebox{0.9\linewidth}{!}{%
+\begin{tabular}{c|cccccc}
+\toprule
+ & Object & Attribute & Action & Location & Count & \cellcolor{skyblue}Avg \\
+\midrule
+LLaVA-1.5-7b$^\heartsuit$ & $20.7$ & $25.2$ & $23.1$ & $18.2$ & $17.9$ & \cellcolor{skyblue} $22.0$ \\
+LLaVA-1.5-13b$^\heartsuit$ & $17.7$ & $13.5$ & $11.8$ & $16.5$ & $8.9$ & \cellcolor{skyblue} $10.3$ \\
+LLaVA-NeXT-mistral-7b$^\heartsuit$ & $25.9$ & $30.0$ & $41.9$ & $33.8$ & $35.7$ & \cellcolor{skyblue} $31.3$ \\
+LLaVA-NeXT-vicuna-13b$^\heartsuit$ & $25.9$ & $27.4$ & $31.6$ & $38.9$ & $32.1$ & \cellcolor{skyblue} $29.1$ \\
+Instructblip-7b$^\heartsuit$ & $17.1$ & $17.4$ & $16.2$ & $13.1$ & $21.4$ & \cellcolor{skyblue} $17.1$ \\
+MiniGPT4-v2$^\heartsuit$ & $37.5$ & $30.9$ & $30.8$ & $32.5$ & $39.3$ & \cellcolor{skyblue} $32.8$ \\
+Prometheus-Vision-7b$^\heartsuit$ & $19.5$ & $15.2$ & $16.2$ & $22.1$ & $26.8$ & \cellcolor{skyblue} $18.8$ \\
+Prometheus-Vision-13b$^\heartsuit$ & $14.3$ & $10.9$ & $9.4$ & $11.7$ & $16.1$ & \cellcolor{skyblue} $11.8$ \\
+Qwen-VL-Chat$^\spadesuit$ & $30.7$ & $29.1$ & $35.9$ & $29.9$ & $32.1$ & \cellcolor{skyblue} $31.1$ \\
+Internvl-chat-v1-5$^\spadesuit$ & $\bf 73.3$ & $\bf 74.8$ & $\bf 78.6$ & $\bf 80.5$ & $\bf 78.6$ & \cellcolor{skyblue} $\bf 75.8$ \\
+Idefics2-8b$^\spadesuit$ & $35.5$ & $31.7$ & $30.8$ & $29.9$ & $30.4$ & \cellcolor{skyblue} $32.6$ \\
+\midrule
+GPT-4-vision$^\clubsuit$ & $68.1$ & $62.9$ & $64.1$ & $67.1$ & $73.2$ & \cellcolor{skyblue} $66.1$ \\
+GPT-4o$^\clubsuit$ & $62.2$ & $57.2$ & $64.1$ & $63.2$ & $67.9$ & \cellcolor{skyblue} $61.5$ \\
+Gemini Ultra$^\clubsuit$ & $71.7$ & $65.1$ & $63.2$ & $64.5$ & $67.8$ & \cellcolor{skyblue} $67.2$ \\
+Claude 3 Opus$^\clubsuit$ & $64.9$ & $38.9$ & $44.4$ & $55.3$ & $55.4$ & \cellcolor{skyblue} $57.1$ \\
+\bottomrule
+\end{tabular}}
+\label{exp:alignment_number_10}
+\end{table}
evals/mjbench/latex_reults/alignment_number_5.tex
ADDED
@@ -0,0 +1,35 @@
+\begin{table}[h]
+\centering
+\caption{The detailed evaluation result of all multimodal judges on \textbf{alignment} perspective. The feedback is provided in the numerical scale of range [0, 5]. Specifically, we study their individual performance over five alignment objectives: object (existence), attribute, action, location, and count. The best performance across all models is bolded.}
+\resizebox{0.9\linewidth}{!}{%
+\begin{tabular}{c|cccccc}
+\toprule
+ & Object & Attribute & Action & Location & Count & \cellcolor{skyblue}Avg \\
+\midrule
+% CLIP-v1$^\diamondsuit$ & - & - & - & - & - & \cellcolor{skyblue} - \\
+% BLIP-v2$^\diamondsuit$ & - & - & - & - & - & \cellcolor{skyblue} - \\
+% PickScore-v1$^\diamondsuit$ & - & - & - & - & - & \cellcolor{skyblue} - \\
+% HPS-v2.1$^\diamondsuit$ & - & - & - & - & - & \cellcolor{skyblue} - \\
+% ImageReward$^\diamondsuit$ & - & - & - & - & - & \cellcolor{skyblue} - \\
+% Aesthetics$^\diamondsuit$ & - & - & - & - & - & \cellcolor{skyblue} - \\
+% \midrule
+LLaVA-1.5-7b$^\heartsuit$ & 27.1 & 25.7 & 28.2 & 26.0 & 26.8 & \cellcolor{skyblue} 26.8 \\
+LLaVA-1.5-13b$^\heartsuit$ & 11.2 & 14.5 & 12.8 & 7.80 & 14.3 & \cellcolor{skyblue} 12.1 \\
+LLaVA-NeXT-mistral-7b$^\heartsuit$ & 27.9 & 28.3 & 29.1 & 24.7 & 25.0 & \cellcolor{skyblue} 27.0 \\
+LLaVA-NeXT-vicuna-13b$^\heartsuit$ & 28.7 & 21.3 & 31.6 & 28.6 & 26.8 & \cellcolor{skyblue} 27.4 \\
+Instructblip-7b$^\heartsuit$ & 19.9 & 20.9 & 25.6 & 18.2 & 19.6 & \cellcolor{skyblue} 20.8 \\
+MiniGPT4-v2$^\heartsuit$ & 27.5 & 26.1 & 32.5 & 37.7 & 26.8 & \cellcolor{skyblue} 30.1 \\
+Prometheus-Vision-7b$^\heartsuit$ & 18.7 & 13.5 & 14.5 & 19.5 & 25.0 & \cellcolor{skyblue} 18.2 \\
+Prometheus-Vision-13b$^\heartsuit$ & 12.4 & 11.3 & 9.4 & 11.7 & 12.5 & \cellcolor{skyblue} 11.5 \\
+Qwen-VL-Chat$^\spadesuit$ & 30.3 & 34.8 & 39.3 & 40.3 & 35.7 & \cellcolor{skyblue} 36.1 \\
+Internvl-chat-v1-5$^\spadesuit$ & 24.7 & 28.7 & 25.6 & 29.9 & 37.5 & \cellcolor{skyblue} 29.3 \\
+Idefics2-8b$^\spadesuit$ & 17.1 & 17.0 & 13.5 & 14.3 & 19.6 & \cellcolor{skyblue} 16.3 \\
+\midrule
+GPT-4-vision$^\clubsuit$ & \bf 45.3 & \bf 46.3 & 41.3 & 48.3 & 48.3 & \cellcolor{skyblue} 45.9 \\
+GPT-4o$^\clubsuit$ & 44.2 & 45.3 & \bf 43.3 & \bf 53.4 & \bf 51.3 & \cellcolor{skyblue} \bf 48.6 \\
+Gemini Ultra$^\clubsuit$ & 31.7 & 29.7 & 23.7 & 39.7 & 32.7 & \cellcolor{skyblue} 29.9 \\
+Claude 3 Opus$^\clubsuit$ & 24.9 & 28.9 & 25.9 & 31.2 & 29.2 & \cellcolor{skyblue} 26.3 \\
+\bottomrule
+\end{tabular}}
+\label{exp:alignment_number_5}
+\end{table}
evals/mjbench/latex_reults/artifact_narrative.tex
ADDED
@@ -0,0 +1,29 @@
+\begin{table}[h]
+\centering
+\caption{The detailed evaluation result of all multimodal judges on \textbf{quality} perspective. The feedback is provided in the following Likert scale: [\textit{Extremely Poor}, \textit{Poor}, \textit{Average}, \textit{Good}, \textit{Outstanding}]. Specifically, we study their individual performance over two alignment objectives: distortion (including human face, human limb, and object), and blurry (including defocused and motion). The best performance across all models is bolded.}
+\resizebox{1.0\linewidth}{!}{%
+\begin{tabular}{c|cccc|ccc}
+\toprule
+ & \multicolumn{4}{c}{\bf Distortion} & \multicolumn{3}{c}{\bf Blurry} \\
+ & Human Face & Human Limb & Object & \cellcolor{skyblue}Avg & Defocused & Motion & \cellcolor{skyblue}Avg \\
+\midrule
+LLaVA-1.5-7b$^\heartsuit$ & 0.00 & 0.00 & 0.00 & \cellcolor{skyblue} 0.00 & 1.80 & 10.6 & \cellcolor{skyblue} 6.50 \\
+LLaVA-1.5-13b$^\heartsuit$ & 0.00 & 0.00 & 0.00 & \cellcolor{skyblue} 0.00 & 18.7 & 29.7 & \cellcolor{skyblue} 24.9 \\
+LLaVA-NeXT-mistral-7b$^\heartsuit$ & 10.8 & 14.2 & 1.30 & \cellcolor{skyblue} 9.10 & 56.7 & 73.0 & \cellcolor{skyblue} 61.3 \\
+LLaVA-NeXT-vicuna-13b$^\heartsuit$ & 19.6 & 14.3 & 13.9 & \cellcolor{skyblue} 16.8 & 25.8 & 27.3 & \cellcolor{skyblue} 26.6 \\
+Instructblip-7b$^\heartsuit$ & 9.80 & 3.00 & 18.7 & \cellcolor{skyblue} 10.9 & 9.80 & 9.90 & \cellcolor{skyblue} 9.50 \\
+Prometheus-Vision-7b$^\heartsuit$ & 19.8 & 15.6 & 12.2 & \cellcolor{skyblue} 16.0 & 26.0 & 29.2 & \cellcolor{skyblue} 27.2 \\
+Prometheus-Vision-13b$^\heartsuit$ & 7.40 & 5.10 & 7.30 & \cellcolor{skyblue} 6.80 & 9.40 & 11.7 & \cellcolor{skyblue} 11.1 \\
+Qwen-VL-Chat$^\spadesuit$ & 25.2 & 21.6 & 6.70 & \cellcolor{skyblue} 17.4 & 18.8 & 20.1 & \cellcolor{skyblue} 19.3 \\
+Internvl-chat-v1-5$^\spadesuit$ & 22.1 & 24.2 & 1.20 &\cellcolor{skyblue} 16.0 & \bf 94.2 & 96.1 & \cellcolor{skyblue} \bf 95.3 \\
+Idefics2-8b$^\spadesuit$ & 40.9 & 29.6 & 10.1 & \cellcolor{skyblue} 27.0 & 90.2 & 67.5 & \cellcolor{skyblue} 79.2 \\
+\midrule
+GPT-4-vision$^\clubsuit$ & 86.9 & 54.4 & 78.7 & \cellcolor{skyblue} 71.5 & 90.6 & \bf 93.5 & \cellcolor{skyblue} 93.6 \\
+GPT-4o$^\clubsuit$ & \bf 98.2 & \bf 71.1 & \bf 89.9 & \cellcolor{skyblue} \bf 83.6 & 91.8 & 96.1 & \cellcolor{skyblue} 91.6 \\
+Gemini Ultra$^\clubsuit$ & 71.3 & 30.5 & 59.2 & \cellcolor{skyblue} 48.8 & 80.6 & 90.9 & \cellcolor{skyblue} 79.5 \\
+Claude 3 Opus$^\clubsuit$ & 21.3 & 17.2 & 9.50 & \cellcolor{skyblue} 14.0 & 85.9 & 93.1 & \cellcolor{skyblue} 83.7 \\
+\bottomrule
+\end{tabular}%
+}
+\label{exp:artifact_result_narrative_5}
+\end{table}
evals/mjbench/latex_reults/artifact_number_10.tex
ADDED
@@ -0,0 +1,38 @@
+
+\begin{table}[h]
+\centering
+\caption{The detailed evaluation result of all multimodal judges on \textbf{quality} perspective. The feedback are provided in numerical scale of range [0, 10]. Specifically, we study their individual performance over two alignment objectives: distortion (including human face, human limb, and object), and blurry (including defocused and motion). The best performance across all models is bolded.}
+\resizebox{1.0\linewidth}{!}{%
+\begin{tabular}{c|cccc|ccc}
+\toprule
+ & \multicolumn{4}{c}{\bf Distortion} & \multicolumn{3}{c}{\bf Blurry} \\
+ & Human Face & Human Limb & Object & \cellcolor{skyblue}Avg & Defocused & Motion & \cellcolor{skyblue}Avg \\
+\midrule
+CLIP-v1$^\diamondsuit$ & $26.6$ & $17.2$ & $34.0$ & \cellcolor{skyblue} $19.3$ & $50.6$ & $63.7$ & \cellcolor{skyblue} $56.7$ \\
+BLIP-v2$^\diamondsuit$ & $3.60$ & $2.00$ & $1.10$ & \cellcolor{skyblue} $1.90$ & $8.30$ & $47.2$ & \cellcolor{skyblue} $15.0$ \\
+PickScore-v1$^\diamondsuit$ & $83.4$ & $68.2$ & $92.1$ & \cellcolor{skyblue} $79.3$ & $80.6$ & $93.4$ & \cellcolor{skyblue} $86.6$ \\
+HPS-v2.1$^\diamondsuit$ & $60.4$ & $37.1$ & $80.3$ & \cellcolor{skyblue} $51.7$ & $85.7$ & $94.6$ & \cellcolor{skyblue} $88.6$ \\
+ImageReward$^\diamondsuit$ & $31.4$ & $34.4$ & $40.2$ & \cellcolor{skyblue} $33.3$ & $77.4$ & $86.6$ & \cellcolor{skyblue} $82.1$ \\
+Aesthetics$^\diamondsuit$ & $78.7$ & $57.1$ & $51.3$ & \cellcolor{skyblue} $52.1$ & $90.1$ & $93.4$ & \cellcolor{skyblue} $91.6$ \\
+\midrule
+LLaVA-1.5-7b$^\heartsuit$ & $13.6$ & $7.30$ & $9.20$ & \cellcolor{skyblue} $10.2$ & $7.10$ & $19.1$ & \cellcolor{skyblue} $13.1$ \\
+LLaVA-1.5-13b$^\heartsuit$ & $20.1$ & $14.6$ & $13.3$ & \cellcolor{skyblue} $16.4$ & $18.0$ & $34.0$ & \cellcolor{skyblue} $26.1$ \\
+LLaVA-NeXT-7b$^\heartsuit$ & $28.4$ & $27.8$ & $19.0$ & \cellcolor{skyblue} $30.1$ & $41.7$ & $66.1$ & \cellcolor{skyblue} $53.9$ \\
+LLaVA-NeXT-13b$^\heartsuit$ & $18.9$ & $27.8$ & $12.0$ & \cellcolor{skyblue} $20.5$ & $40.6$ & $45.4$ & \cellcolor{skyblue} $43.0$ \\
+Instructblip-7b$^\heartsuit$ & $12.4$ & $9.30$ & $21.0$ & \cellcolor{skyblue} $13.3$ & $32.3$ & $31.1$ & \cellcolor{skyblue} $31.7$ \\
+MiniGPT4-v2$^\heartsuit$ & $39.6$ & $39.1$ & $42.0$ & \cellcolor{skyblue} $40.0$ & $33.4$ & $37.4$ & \cellcolor{skyblue} $35.4$ \\
+Prometheus-Vision-7b$^\heartsuit$ & $16.6$ & $17.9$ & $14.1$ & \cellcolor{skyblue} $16.4$ & $22.3$ & $30.3$ & \cellcolor{skyblue} $26.3$ \\
+Prometheus-Vision-13b$^\heartsuit$ & $7.10$ & $4.60$ & $7.20$ & \cellcolor{skyblue} $6.20$ & $9.40$ &$10.6$ & \cellcolor{skyblue} $10.0$ \\
+Qwen-VL-Chat$^\spadesuit$ & $14.2$ & $15.9$ & $9.40$ & \cellcolor{skyblue} $13.6$ & $0.90$ & $2.10$ & \cellcolor{skyblue} $1.40$ \\
+Internvl-chat-v1-5$^\spadesuit$ & $97.0$ & $\bf 95.4$ & $97.1$ & \cellcolor{skyblue} $\bf 97.1$ & $89.7$ & $89.7$ & \cellcolor{skyblue} $89.7$ \\
+Idefics2-8b$^\spadesuit$ & $29.6$ & $25.8$ & $2.30$ & \cellcolor{skyblue} $21.7$ & $70.6$ & $46.9$ & \cellcolor{skyblue} $58.7$ \\
+\midrule
+GPT-4-vision$^\clubsuit$ & $87.6$ & $57.6$ & $83.1$ & \cellcolor{skyblue} $75.7$ & $98.8$ & $99.3$ & \cellcolor{skyblue} $99.2$ \\
+GPT-4o$^\clubsuit$ & $\bf 99.4$ & $78.2$ & $\bf 100$ & \cellcolor{skyblue} $93.8$ & $\bf 100$ & $\bf 100$ & \cellcolor{skyblue} $\bf 100$ \\
+Gemini Ultra$^\clubsuit$ & $73.4$ & $32.5$ & $61.0$ & \cellcolor{skyblue} $55.7$ & $86.5$ & $97.3$ & \cellcolor{skyblue} $93.9$ \\
+Claude 3 Opus$^\clubsuit$ & $26.6$ & $19.3$ & $10.7$ & \cellcolor{skyblue} $17.6$ & $89.6$ & $93.3$ & \cellcolor{skyblue} $92.7$ \\
+\bottomrule
+\end{tabular}%
+}
+\label{exp:artifact_result_number_10}
+\end{table}
evals/mjbench/latex_reults/artifact_number_5.tex
ADDED
@@ -0,0 +1,29 @@
+\begin{table}[h]
+\centering
+\caption{The detailed evaluation result of all multimodal judges on \textbf{quality} perspective. The feedback are provided in numerical scale of range [0, 5]. Specifically, we study their individual performance over two alignment objectives: distortion (including human face, human limb, and object), and blurry (including defocused and motion). The best performance across all models is bolded.}
+\resizebox{1.0\linewidth}{!}{%
+\begin{tabular}{c|cccc|ccc}
+\toprule
+ & \multicolumn{4}{c}{\bf Distortion} & \multicolumn{3}{c}{\bf Blurry} \\
+ & Human Face & Human Limb & Object & \cellcolor{skyblue}Avg & Defocused & Motion & \cellcolor{skyblue}Avg \\
+\midrule
+LLaVA-1.5-7b$^\heartsuit$ & 0.00 & 0.00 & 0.00 & \cellcolor{skyblue} 0.00 & 2.90 & 11.3 & \cellcolor{skyblue} 7.80 \\
+LLaVA-1.5-13b$^\heartsuit$ & 0.00 & 0.00 & 0.00 & \cellcolor{skyblue} 0.00 & 24.9 & 36.9 & \cellcolor{skyblue} 32.9 \\
+LLaVA-NeXT-mistral-7b$^\heartsuit$ & 11.2 & 13.9 & 1.00 & \cellcolor{skyblue} 8.70 & 56.3 & 73.2 & \cellcolor{skyblue} 61.1 \\
+LLaVA-NeXT-vicuna-13b$^\heartsuit$ & 18.3 & 17.9 & 17.0 & \cellcolor{skyblue} 17.7 & 27.7 & 34.3 & \cellcolor{skyblue} 28.8 \\
+Instructblip-7b$^\heartsuit$ & 9.50 & 3.30 & 19.0 & \cellcolor{skyblue} 10.6 & 10.0 & 10.2 & \cellcolor{skyblue} 9.60 \\
+Prometheus-Vision-7b$^\heartsuit$ & 20.1 & 15.2 & 12.0 & \cellcolor{skyblue} 15.8 & 26.3 & 29.5 & \cellcolor{skyblue} 27.5 \\
+Prometheus-Vision-13b$^\heartsuit$ & 7.10 & 5.30 & 7.00 & \cellcolor{skyblue} 6.50 & 9.70 & 11.5 & \cellcolor{skyblue} 10.9 \\
+Qwen-VL-Chat$^\spadesuit$ & 24.9 & 21.2 & 7.00 & \cellcolor{skyblue} 17.7 & 18.3 & 19.6 & \cellcolor{skyblue} 18.9 \\
+Internvl-chat-v1-5$^\spadesuit$ & 21.9 & 24.5 & 1.00 &\cellcolor{skyblue} 15.8 & \bf 93.7 & 96.6 & \cellcolor{skyblue} \bf 95.7 \\
+Idefics2-8b$^\spadesuit$ & 44.4 & 33.1 & 9.0 & \cellcolor{skyblue} 28.8 & 88.3 & 68.6 & \cellcolor{skyblue} 75.9 \\
+\midrule
+GPT-4-vision$^\clubsuit$ & 86.3 & 54.1 & 79.2 & \cellcolor{skyblue} 72.4 & 90.8 & 93.3 & \cellcolor{skyblue} 91.2 \\
+GPT-4o$^\clubsuit$ & \bf 98.6 & \bf 73.5 & \bf 100 & \cellcolor{skyblue} \bf 90.4 & 91.6 & \bf 96.7 & \cellcolor{skyblue} 93.0 \\
+Gemini Ultra$^\clubsuit$ & 71.6 & 29.9 & 59.8 & \cellcolor{skyblue} 50.7 & 80.7 & 90.8 & \cellcolor{skyblue} 83.9 \\
+Claude 3 Opus$^\clubsuit$ & 21.6 & 16.9 & 9.30 & \cellcolor{skyblue} 16.6 & 85.3 & 93.3 & \cellcolor{skyblue} 87.7 \\
+\bottomrule
+\end{tabular}%
+}
+\label{exp:artifact_result_number_5}
+\end{table}
evals/mjbench/latex_reults/bias_acc.tex
ADDED
@@ -0,0 +1,39 @@
+
+\begin{table}[t]
+\centering
+\caption{The detailed evaluation result in terms of ACC (accuracy) for all multimodal judges on \textbf{bias} perspective. The feedback is provided in numerical scale with range [0, 10]. Specifically, we separately report the bias w.r.t. different demographic identifications, i.e. age, gender, race, nationality, and religion. The best performance across all models is bolded.}
+\resizebox{1.0\linewidth}{!}{%
+\begin{tabular}{c|cccccc}
+\toprule
+% & \multicolumn{6}{c}{\bf Occupation} & \multicolumn{4}{c}{\bf Education} \\
+ & Age & Gender & Race & Nationality & Religion & \cellcolor{skyblue}Avg \\
+\midrule
+CLIP-v1$^\diamondsuit$ & 57.2 & 57.8 & 55.5 & 59.5 & 60.8 & \cellcolor{skyblue} 57.7 \\
+BLIP-v2$^\diamondsuit$ & 69.6 & 68.5 & 65.9 & 68.6 & 74.7 & \cellcolor{skyblue} 68.5 \\
+PickScore-v1$^\diamondsuit$ & 30.4 & 31.1 & 30.8 & 31.7 & 33.0 & \cellcolor{skyblue} 31.1 \\
+HPS-v2.1$^\diamondsuit$ & 52.9 & 55.3 & 55.7 & 55.0 & 62.4 & \cellcolor{skyblue} 55.3 \\
+ImageReward$^\diamondsuit$ & 41.8 & 40.4 & 36.8 & 39.5 & 52.8 & \cellcolor{skyblue} 40.4 \\
+Aesthetics$^\diamondsuit$ & 59.4 & 62.0 & 64.2 & 62.4 & 61.0 & \cellcolor{skyblue} 62.0 \\
+\midrule
+LLaVA-1.5-7b$^\heartsuit$ & \bf 80.8 & \bf 83.9 & \bf 84.6 & \bf 84.9 & \bf 88.1 & \cellcolor{skyblue} \bf 84.0 \\
+LLaVA-1.5-13b$^\heartsuit$ & 67.0 & 70.1 & 68.9 & 72.7 & 75.1 & \cellcolor{skyblue} 70.1 \\
+LLaVA-NeXT-mistral-7b$^\heartsuit$ & 71.8 & 70.8 & 70.8 & 67.8 & 78.3 & \cellcolor{skyblue} 70.8 \\
+LLaVA-NeXT-vicuna-7b$^\heartsuit$ & 54.3 & 56.7 & 57.0 & 56.1 & 64.8 & \cellcolor{skyblue} 56.6 \\
+Instructblip-7b$^\heartsuit$ & 52.5 & 53.6 & 53.6 & 52.0 & 61.1 & \cellcolor{skyblue} 53.6 \\
+MiniGPT4-v2$^\heartsuit$ & 31.8 & 32.2 & 31.9 & 34.1 & 28.3 & \cellcolor{skyblue} 32.2 \\
+Prometheus-Vision-7b$^\heartsuit$ & 43.8 & 50.4 & 54.4 & 53.6 & 44.9 & \cellcolor{skyblue} 50.4 \\
+Prometheus-Vision-13b$^\heartsuit$ & 65.1 & 65.8 & 63.4 & 65.7 & 77.1 & \cellcolor{skyblue} 65.8 \\
+Qwen-VL-Chat$^\spadesuit$ & 70.8 & 71.5 & 72.3 & 72.2 & 68.1 & \cellcolor{skyblue} 71.5 \\
+Internvl-chat-v1-5$^\spadesuit$ & 40.0 & 41.3 & 42.1 & 42.0 & 39.8 & \cellcolor{skyblue} 41.3 \\
+Idefics2-8b$^\spadesuit$ & 37.4 & 42.7 & 45.3 & 46.9 & 35.2 & \cellcolor{skyblue} 42.7 \\
+\midrule
+GPT-4-vision$^\clubsuit$ & 76.7 & 79.1 & 77.4 & 81.0 & 86.5 & \cellcolor{skyblue} 79.1 \\
+GPT-4o$^\clubsuit$ & 60.9 & 66.6 & 69.1 & 68.2 & 69.6 & \cellcolor{skyblue} 66.6 \\
+Gemini Ultra$^\clubsuit$ & 48.7 & 56.9 & 62.9 & 60.0 & 49.9 & \cellcolor{skyblue} 56.9 \\
+Claude 3 Opus$^\clubsuit$ & 53.9 & 58.2 & 62.1 & 59.0 & 54.0 & \cellcolor{skyblue} 58.2 \\
+\bottomrule
+\end{tabular}%
+}
+\label{exp:bias_acc}
+\end{table}
+
evals/mjbench/latex_reults/bias_ges.tex
ADDED
@@ -0,0 +1,37 @@
+\begin{table}[t]
+\centering
+\caption{The detailed evaluation result in terms of Gini-based Equality Score (GES) for all multimodal judges on \textbf{bias} perspective. The feedback is provided in numerical scale with range [0, 10]. Specifically, we separately report the bias w.r.t. different demographic identifications, i.e. age, gender, race, nationality, and religion. The best performance across all models is bolded.}
+\resizebox{1.0\linewidth}{!}{%
+\begin{tabular}{c|cccccc}
+\toprule
+% & \multicolumn{6}{c}{\bf Occupation} & \multicolumn{4}{c}{\bf Education} \\
+ & Age & Gender & Race & Nationality & Religion & \cellcolor{skyblue}Avg \\
+\midrule
+CLIP-v1$^\diamondsuit$ & 73.6 & 75.2 & 73.1 & 79.1 & 78.4 & \cellcolor{skyblue} 75.2 \\
+BLIP-v2$^\diamondsuit$ & 92.2 & 91.3 & 90.7 & 90.4 & 93.1 & \cellcolor{skyblue} 91.3 \\
+PickScore-v1$^\diamondsuit$ & 80.5 & 81.2 & 81.0 & 81.6 & 82.6 & \cellcolor{skyblue} 81.2 \\
+HPS-v2.1$^\diamondsuit$ & 86.4 & 87.8 & 88.5 & 88.0 & 88.5 & \cellcolor{skyblue} 87.8 \\
+ImageReward$^\diamondsuit$ & 85.5 & 85.0 & 83.6 & 84.8 & 89.0 & \cellcolor{skyblue} 85.0 \\
+Aesthetics$^\diamondsuit$ & 91.9 & 92.1 & 92.4 & 92.1 & 92.3 & \cellcolor{skyblue} 92.1 \\
+\midrule
+LLaVA-1.5-7b$^\heartsuit$ & 87.4 & 88.9 & 90.1 & 88.7 & 90.7 & \cellcolor{skyblue} 88.9 \\
+LLaVA-1.5-13b$^\heartsuit$ & 87.5 & 88.8 & 88.9 & 89.5 & 90.1 & \cellcolor{skyblue} 88.8 \\
+LLaVA-NeXT-mistral-7b$^\heartsuit$ & 86.4 & 85.8 & 85.8 & 84.1 & 90.2 & \cellcolor{skyblue} 85.8 \\
+LLaVA-NeXT-vicuna-7b$^\heartsuit$ & 82.1 & 82.8 & 82.4 & 82.5 & 87.8 & \cellcolor{skyblue} 82.8\\
+Instructblip-7b$^\heartsuit$ & 91.0 & 91.2 & 91.1 & 90.4 & 93.8 & \cellcolor{skyblue} 91.1 \\
+MiniGPT4-v2$^\heartsuit$ & 83.7 & 83.3 & 82.8 & 83.4 & 84.1 & \cellcolor{skyblue} 83.3 \\
+Prometheus-Vision-7b$^\heartsuit$ & 74.9 & 74.3 & 73.1 & 74.2 & 77.3 & \cellcolor{skyblue} 74.3 \\
+Prometheus-Vision-13b$^\heartsuit$ & 79.2 & 76.0 & 72.7 & 74.1 & 85.1 & \cellcolor{skyblue} 76.0 \\
+Qwen-VL-Chat$^\spadesuit$ & 85.9 & 86.0 & 86.0 & 86.4 & 83.8 & \cellcolor{skyblue} 85.9 \\
+Internvl-chat-v1-5$^\spadesuit$ & 86.9 & 87.2 & 87.1 & 87.3 & 88.0 & \cellcolor{skyblue} 87.2 \\
+Idefics2-8b$^\spadesuit$ & 77.0 & 79.7 & 81.3 & 82.0 & 74.4 & \cellcolor{skyblue} 79.8 \\
+\midrule
+GPT-4-vision$^\clubsuit$ & \bf 93.0 & \bf 93.2 & 92.2 & \bf 93.4 & \bf 96.4 & \cellcolor{skyblue} \bf 93.2 \\
+GPT-4o$^\clubsuit$ & 91.8 & 92.9 & \bf 93.1 & 93.3 & 94.4 & \cellcolor{skyblue} 92.9 \\
+Gemini Ultra$^\clubsuit$ & 86.6 & 89.0 & 90.8 & 90.0 & 86.2 & \cellcolor{skyblue} 89.0 \\
+Claude 3 Opus$^\clubsuit$ & 83.2 & 85.2 & 86.5 & 85.8 & 84.8 & \cellcolor{skyblue} 85.2 \\
+\bottomrule
+\end{tabular}%
+}
+\label{exp:bias_ges}
+\end{table}
evals/mjbench/latex_reults/bias_nds.tex
ADDED
@@ -0,0 +1,39 @@
+\begin{table}[t]
+\centering
+\caption{The detailed evaluation result in terms of Normalized Dispersion Score (NDS) for all multimodal judges on \textbf{bias} perspective. The feedback is provided in numerical scale with range [0, 10]. Specifically, we separately report the bias w.r.t. different demographic identifications, i.e. age, gender, race, nationality, and religion. The best performance across all models is bolded.}
+\resizebox{1.0\linewidth}{!}{%
+\begin{tabular}{c|cccccc}
+\toprule
+% & \multicolumn{6}{c}{\bf Occupation} & \multicolumn{4}{c}{\bf Education} \\
+ & Age & Gender & Race & Nationality & Religion & \cellcolor{skyblue}Avg \\
+\midrule
+CLIP-v1$^\diamondsuit$ & 73.6 & 75.2 & 73.1 & 79.1 & 78.4 & \cellcolor{skyblue} 75.2 \\
+BLIP-v2$^\diamondsuit$ & 85.3 & 83.6 & 82.7 & 81.8 & 87.5 & \cellcolor{skyblue} 83.6 \\
+PickScore-v1$^\diamondsuit$ & 65.3 & 66.7 & 66.4 & 67.3 & 69.4 & \cellcolor{skyblue} 66.7 \\
+HPS-v2.1$^\diamondsuit$ & 75.8 & 78.2 & 79.5 & 78.6 & 79.3 & \cellcolor{skyblue} 78.2 \\
+ImageReward$^\diamondsuit$ & 73.9 & 73.2 & 70.9 & 73.0 & 80.2 & \cellcolor{skyblue} 73.2 \\
+Aesthetics$^\diamondsuit$ & \bf 85.3 & \bf 85.9 & \bf 86.3 & \bf 85.8 & 86.2 & \cellcolor{skyblue} \bf 85.9 \\
+\midrule
+LLaVA-1.5-7b$^\heartsuit$ & 67.6 & 71.4 & 75.8 & 68.4 & 77.3 & \cellcolor{skyblue} 71.4 \\
+LLaVA-1.5-13b$^\heartsuit$ & 71.9 & 74.8 & 76.6 & 74.0 & 80.6 & \cellcolor{skyblue} 74.8 \\
+LLaVA-NeXT-mistral-7b$^\heartsuit$ & 68.4 & 64.6 & 62.4 & 59.7 & 78.1 & \cellcolor{skyblue} 64.6 \\
+LLaVA-NeXT-vicuna-7b$^\heartsuit$ & 63.2 & 64.1 & 62.5 & 63.8 & 74.2 & \cellcolor{skyblue} 64.1\\
+Instructblip-7b$^\heartsuit$ & 80.8 & 80.6 & 80.3 & 79.0 & 85.4 & \cellcolor{skyblue} 80.6 \\
+MiniGPT4-v2$^\heartsuit$ & 68.1 & 67.2 & 66.2 & 67.0 & 69.3 & \cellcolor{skyblue} 67.2 \\
+Prometheus-Vision-7b$^\heartsuit$ & 47.2 & 42.5 & 37.8 & 40.0 & 54.2 & \cellcolor{skyblue} 42.5 \\
+Prometheus-Vision-13b$^\heartsuit$ & 54.2 & 44.7 & 36.0 & 39.3 & 65.7 & \cellcolor{skyblue} 44.7 \\
+Qwen-VL-Chat$^\spadesuit$ & 62.4 & 62.3 & 62.3 & 63.1 & 58.9 & \cellcolor{skyblue} 62.3 \\
+Internvl-chat-v1-5$^\spadesuit$ & 74.0 & 74.1 & 73.6 & 73.9 & 76.6 & \cellcolor{skyblue} 74.1 \\
+Idefics2-8b$^\spadesuit$ & 55.1 & 59.2 & 61.7 & 62.8 & 51.0 & \cellcolor{skyblue} 59.2 \\
+\midrule
+GPT-4-vision$^\clubsuit$ & 81.2 & 80.2 & 77.6 & 79.9 & \bf 88.2 & \cellcolor{skyblue} 80.2 \\
+GPT-4o$^\clubsuit$ & 81.2 & 82.7 & 82.8 & 83.2 & 86.1 & \cellcolor{skyblue} 82.7 \\
+Gemini Ultra$^\clubsuit$ & 72.6 & 75.8 & 78.4 & 77.0 & 72.3 & \cellcolor{skyblue} 75.8 \\
+Claude 3 Opus$^\clubsuit$ & 63.3 & 66.1 & 67.5 & 66.9 & 66.8 & \cellcolor{skyblue} 66.1 \\
+\bottomrule
+\end{tabular}%
+}
+\label{exp:bias_nds}
+\end{table}
+
+
evals/mjbench/latex_reults/bias_scale.tex
ADDED
@@ -0,0 +1,30 @@
+\begin{table}[t]
+\centering
+\caption{The detailed evaluation result of all multimodal judges on \textbf{bias} perspective. The feedback are provided in different scales including numerical scales ([0-5], and [0-10]) and Likert scale: [\textit{Extremely Poor}, \textit{Poor}, \textit{Average}, \textit{Good}, \textit{Outstanding}]. We study the average ACC, NDS, and GES score for each model across all occupations/educations. The best performance across all models is bolded.}
+\resizebox{1.0\linewidth}{!}{%
+\begin{tabular}{c|ccc|ccc|ccc}
+\toprule
+ & \multicolumn{3}{c}{\bf Numerical [0-5]} & \multicolumn{3}{c}{\bf Numerical [0-10]} & \multicolumn{3}{c}{\bf Likert scale}\\
+ & ACC & NDS & GES & ACC & NDS & GES & ACC & NDS & GES \\
+\midrule
+LLaVA-1.5-7b$^\heartsuit$ & \bf 80.8 & 64.6 & 87.7 & 47.1 & 77.3 & 90.1 & \bf 81.5 & 82.4 & \bf 94.2 \\
+LLaVA-1.5-13b$^\heartsuit$ & 55.5 & 77.5 & 90.0 & 37.8 & 78.7 & 89.4 & 61.2 & 78.4 & 91.0 \\
+LLaVA-NeXT-mistral-7b$^\heartsuit$ & 72.1 & 71.2 & 88.3 & 58.6 & 65.4 & 84.1 & 59.1 & 68.3 & 86.1 \\
+LLaVA-NeXT-vicuna-13b$^\heartsuit$ & 49.3 & 68.1 & 85.2 & 42.6 & 69.6 & 84.9 & 53.5 & 73.1 & 87.6\\
+Instructblip-7b$^\heartsuit$ & 58.7 & \bf 85.3 & 91.5 & 53.6 & 80.6 & 91.1 & 71.5 & 84.5 & 94.3 \\
+MiniGPT4-v2$^\heartsuit$ & 35.6 & 69.2 & 79.5 & 32.6 & 67.0 & 83.3 & 38.5 & 39.3 & 68.9 \\
+Prometheus-Vision-7b$^\heartsuit$ & 49.5 & 43.4 & 74.4 & 52.1 & 37.9 & 73.0 & 47.4 & 25.3 & 64.6 \\
+Prometheus-Vision-13b$^\heartsuit$ & 66.3 & 46.3 & 76.8 & \bf 68.2 & 23.3 & 69.4 & 67.6 & 47.4 & 77.6 \\
+Qwen-VL-Chat$^\spadesuit$ & 71.8 & 76.3 & 91.3 & 30.1 & 70.6 & 85.7 & 45.9 & 74.9 & 88.0 \\
+Internvl-chat-v1-5$^\spadesuit$ & 41.0 & 74.1 & 87.2 & 25.4 & 69.6 & 84.3 & 59.2 & 83.6 & 92.6\\
+Idefics2-8b$^\spadesuit$ & 41.9 & 68.7 & 84.4 & 42.1 & 66.7 & 83.4 & 61.6 & \bf 86.5 & 93.9 \\
+\midrule
+GPT-4-vision$^\clubsuit$ & 79.1 & 80.2 & \bf 93.2 & 41.5 & \bf 86.4 & \bf 93.7 & 58.7 & 69.8 & 87.1 \\
+GPT-4o$^\clubsuit$ & 66.6 & 82.7 & 92.9 & 26.2 & 74.2 & 86.5 & 74.3 & 79.2 & 92.2 \\
+Gemini Ultra$^\clubsuit$ & 56.9 & 75.8 & 89.0 & 36.2 & 72.4 & 85.6 & 74.5 & 78.4 & 91.6 \\
+Claude 3 Opus$^\clubsuit$ & 58.2 & 66.1 & 85.2 & 52.1 & 59.5 & 82.1 & 57.4 & 83.6 & 92.5 \\
+\bottomrule
+\end{tabular}%
+}
+\label{exp:bias_scale}
+\end{table}
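The bias tables above report three aggregates per demographic axis: accuracy (ACC), a Normalized Dispersion Score (NDS), and a Gini-based Equality Score (GES). Their exact formulas are not spelled out in these files; the sketch below only illustrates the general idea, assuming NDS is one minus a normalized dispersion of the per-group scores and GES is one minus the Gini coefficient, which may differ from the definitions the benchmark actually uses:

import numpy as np

def nds(scores):
    # Illustrative Normalized Dispersion Score: 1 - std/mean, clipped to [0, 1].
    scores = np.asarray(scores, dtype=float)
    if scores.mean() == 0:
        return 0.0
    return float(np.clip(1.0 - scores.std() / scores.mean(), 0.0, 1.0))

def ges(scores):
    # Illustrative Gini-based Equality Score: 1 - Gini coefficient of the scores.
    scores = np.sort(np.asarray(scores, dtype=float))
    n = scores.size
    if scores.sum() == 0:
        return 1.0
    # Standard Gini formula for sorted, non-negative values.
    gini = (2 * np.arange(1, n + 1) - n - 1).dot(scores) / (n * scores.sum())
    return float(1.0 - gini)

# Hypothetical per-group reward scores for prompts that differ only in a
# demographic attribute; higher NDS/GES means more even treatment.
group_scores = [7.9, 8.1, 7.5, 8.3, 7.8]
print(round(nds(group_scores), 3), round(ges(group_scores), 3))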
evals/mjbench/latex_reults/consitient_analysis.tex
ADDED
@@ -0,0 +1,26 @@
+\begin{table}[htb]
+\vspace{-5pt}
+\centering
+\small
+\caption{Comparison of open-source judges w.r.t. different input modes. Specifically, we study VLMs with single image input, pairwise image input (pair-f), and pairwise image input in reverse order (pair-r). The best performance is in bold.}
+
+\resizebox{0.92\linewidth}{!}{%
+\begin{tabular}{l|ccc|ccc|cccccc}
+\toprule
+ & \multicolumn{3}{c}{\bf Alignment} & \multicolumn{3}{c}{\bf Safety} & \multicolumn{3}{c}{\bf Artifact} \\
+ & single & pair-f & pair-r & single & pair-f & pair-r & single & pair-f & pair-r \\
+\midrule
+Qwen-VL-Chat$^\spadesuit$ & $29.1$ & $31.1$ & $\textbf{73.0}$ & $\textbf{33.5}$ & $6.8$ & $\textbf{60.1}$ & $19.8$ & $5.7$ & $41.5$ \\
+Internvl-chat-v1-5$^\spadesuit$ & $\textbf{32.8}$ & $\textbf{75.8}$ & $34.8$ & $20.1$ & $5.9$ & $4.6$ & $38.8$ & $\textbf{91.8}$ & $40.7$ \\
+Idefics2-8b$^\spadesuit$ & $30.2$ & $32.6$ & $32.6$ & $27.3$ & $\textbf{13.7}$ & $32.6$ & $\textbf{40.2}$ & $49.0$ & $\textbf{43.2}$ \\
+% \midrule
+% GPT-4-vision$^\clubsuit$ & - & - & - & - & - & - & 80.4 & 93.2 \\
+% GPT-4o$^\clubsuit$ & - & - & - & - & - & - & 82.5 & 92.8 \\
+% Gemini Ultra$^\clubsuit$ & - & - & - & - & - & - & 75.3 & 88.6 \\
+% Claude 3 Opus$^\clubsuit$ & - & - & - & - & - & - & 65.6 & 85.0 \\
+\bottomrule
+\end{tabular}%
+}
+
+\label{exp:judge_consitiency}
+\end{table}
evals/mjbench/latex_reults/dataset.text
ADDED
@@ -0,0 +1,69 @@
+\begin{table}[h!]
+\centering
+\caption{Summary of the dataset proposed in \algname.}
+\resizebox{1.0\linewidth}{!}{%
+\begin{tabular}{lllrl}
+\toprule
+\textbf{Category} & \textbf{Scenario} & \textbf{Subset} & \textbf{N} & \textbf{Description} \\
+\midrule
+\multirow{5}{*}{\textbf{Alignment}}
+ & \multirow{1}{*}{Object} & - & 250 & Ensures the correct objects are present in the image as specified by the text \\
+\cmidrule{2-5}
+ & \multirow{1}{*}{Attributes} & - & 229 & Verifies correct association of attributes such as color, shape, size, and texture \\
+\cmidrule{2-5}
+ & \multirow{1}{*}{Actions} & - & 115 & Ensures actions specified in the text are accurately depicted in the image \\
+\cmidrule{2-5}
+ & \multirow{1}{*}{Counting} & - & 55 & Verifies the correct number of objects as specified by the text \\
+\cmidrule{2-5}
+ & \multirow{1}{*}{Spatial} & - & 75 & Ensures correct spatial relationships and positions of objects in the image \\
+
+\midrule
+\multirow{8}{*}{\textbf{Safety}}
+ & \multirow{3}{*}{Toxicity} & Crime & 29 & Evaluates the presence of crime-related content in images \\
+\cmidrule{3-5}
+ & & Shocking & 31 & Evaluates the presence of shocking or disturbing content in images \\
+\cmidrule{3-5}
+ & & Disgust & 42 & Evaluates the presence of disgusting or offensive content in images \\
+\cmidrule{2-5}
+ & \multirow{4}{*}{NSFW} & Evident & 197 & Images with clear and obvious NSFW content \\
+\cmidrule{3-5}
+ & & Evasive & 177 & Images with attempts to subtly include NSFW content \\
+\cmidrule{3-5}
+ & & Subtle & 98 & Images with low-level, hard-to-detect NSFW content \\
+
+\midrule
+\multirow{8}{*}{\textbf{Quality}}
+ & \multirow{3}{*}{Distortion} & Human Face & 169 & Prefers images without distortions in human faces \\
+\cmidrule{3-5}
+ & & Human Limbs & 152 & Prefers images without distortions in human limbs \\
+\cmidrule{3-5}
+ & & Object & 100 & Prefers images without distortions in objects \\
+\cmidrule{2-5}
+ & \multirow{2}{*}{Blurry} & Defocused blur & 350 & Evaluates resistance to defocused blur in images \\
+\cmidrule{3-5}
+ & & Motion blur & 350 & Evaluates resistance to motion blur in images \\
+
+\midrule
+\multirow{10}{*}{\textbf{Bias}}
+ & \multirow{5}{*}{Occupation} & Age & 80 & Evaluates bias across different age groups (young, adult, old) \\
+\cmidrule{3-5}
+ & & Gender & 80 & Evaluates bias across different genders (male, female, non-binary) \\
+\cmidrule{3-5}
+ & & Race & 80 & Evaluates bias across different races (Asian, Black, Latino, Middle Eastern, Indian, White) \\
+\cmidrule{3-5}
+ & & Nationality & 60 & Evaluates bias across different nationalities \\
+\cmidrule{3-5}
+ & & Nationality (continued) & 60 & (American, Mexican, European, Spanish, British, Russian, Chinese, Japanese, Korean) \\
+\cmidrule{3-5}
+ & & Religion & 60 & Evaluates bias across different religions (Christian, Muslim, Jewish, Hindu) \\
+\cmidrule{2-5}
+ & \multirow{3}{*}{Education} & Gender & 60 & Evaluates bias in educational contexts across different genders \\
+\cmidrule{3-5}
+ & & Race & 60 & Evaluates bias in educational contexts across different races \\
+\cmidrule{3-5}
+ & & Nationality & 60 & Evaluates bias in educational contexts across different nationalities \\
+\bottomrule
+\end{tabular}
+}
+\label{tab:dataset_detail}
+\end{table}
evals/mjbench/latex_reults/human_eval.tex
ADDED
@@ -0,0 +1,22 @@
+\begin{table}[t]
+\centering
+\caption{Human evaluation result on the generated images from six fine-tuned SD-v1.5 model using the feedback from six multimodal judges, i.e. GPT-4o, GPT-4-vision, Gemini Ultra, Claude 3 Opus, Internvl-chat-v1-5, and HPS-v2.1. Specifically, we consider the following four metrics: ranking over fixed seed (\textbf{FR}), ranking over random seed (\textbf{RR}), average ranking (\textbf{AR}), and average voting (\textbf{AV}). The best performance across all models are bolded.}
+\setlength{\tabcolsep}{2pt}
+\renewcommand{\arraystretch}{0.9}
+\resizebox{1.0\linewidth}{!}{%
+\begin{tabular}{l|cccc|cccc|cccc}
+\toprule
+ & \multicolumn{4}{c}{\bf Alignment} & \multicolumn{4}{c}{\bf Safety} & \multicolumn{4}{c}{\bf Bias} \\
+ & FR $\downarrow$ & RR $\downarrow$ & \cellcolor{skyblue}{AR $\downarrow$} & \cellcolor{skyblue}{AV $\uparrow$} & FR $\downarrow$ & RR $\downarrow$ & \cellcolor{skyblue}{AR $\downarrow$} & \cellcolor{skyblue}{AV $\uparrow$} & FR $\downarrow$ & RR $\downarrow$ & \cellcolor{skyblue}{AR $\downarrow$} & \cellcolor{skyblue}{AV $\uparrow$} \\
+\midrule
+GPT-4o$^\clubsuit$ & \bf 2.16 & \bf 2.66 & \cellcolor{skyblue}{\bf 2.50} & \cellcolor{skyblue}{\bf 17.21\%} & 1.91 & \bf 1.88 & \cellcolor{skyblue}{\bf 1.89} & \cellcolor{skyblue}{\bf 17.37\%} & \bf 1.72 & \bf 2.48 & \cellcolor{skyblue}{\bf 2.10} & \cellcolor{skyblue}{\bf 21.58\%} \\
+GPT-4-vision$^\clubsuit$ & 2.43 & 2.81 & \cellcolor{skyblue}{2.68} & \cellcolor{skyblue}{15.96\%} & \bf 1.84 & 1.98 & \cellcolor{skyblue}{1.94} & \cellcolor{skyblue}{16.81\%} & 1.99 & 3.14 & \cellcolor{skyblue}{2.57} & \cellcolor{skyblue}{16.80\%} \\
+Gemini Ultra$^\clubsuit$ & \bf 2.15 & 2.72 & \cellcolor{skyblue}{2.54} & \cellcolor{skyblue}{14.87\%} & \bf 1.55 & \bf 1.69 & \cellcolor{skyblue}{\bf 1.64} & \cellcolor{skyblue}{\bf 18.98\%} & 2.23 & \bf 2.65 & \cellcolor{skyblue}{2.44} & \cellcolor{skyblue}{16.18\%} \\
+Claude 3 Opus$^\clubsuit$ & 2.25 & 2.80 & \cellcolor{skyblue}{2.62} & \cellcolor{skyblue}{15.34\%} & 2.07 & 2.12 & \cellcolor{skyblue}{2.10} & \cellcolor{skyblue}{16.15\%} & 2.29 & 3.43 & \cellcolor{skyblue}{2.86} & \cellcolor{skyblue}{11.62\%} \\
+Internvl-chat-v1-5$^\spadesuit$ & 3.16 & 2.99 & \cellcolor{skyblue}{3.05} & \cellcolor{skyblue}{16.90\%} & 2.49 & 2.28 & \cellcolor{skyblue}{2.35} & \cellcolor{skyblue}{15.30\%} & 1.97 & 3.43 & \cellcolor{skyblue}{2.70} & \cellcolor{skyblue}{14.52\%} \\
+HPS-v2.1$^\diamondsuit$ & 2.21 & \bf 2.42 & \cellcolor{skyblue}{\bf 2.35} & \cellcolor{skyblue}{\bf 19.72\%} & 2.42 & 2.37 & \cellcolor{skyblue}{2.39} & \cellcolor{skyblue}{15.39\%} & \bf 1.78 & \bf 2.65 & \cellcolor{skyblue}{\bf 2.21} & \cellcolor{skyblue}{\bf 19.29\%} \\
+\bottomrule
+\end{tabular}%
+}
+\label{exp:human_eval}
+\end{table}
evals/mjbench/latex_reults/main_result.tex
ADDED
@@ -0,0 +1,49 @@
+
+\begin{table}[t]
+\centering
+\caption{Evaluation of three types of multimodal judges across four perspectives on the \algname dataset. The average accuracy (\%) with and without ties is provided for alignment, safety, and artifact. We evaluate preference biases over three metrics, i.e. accuracy (ACC), normalized dispersion score (NDS), and Gini-based equality score (GES). The best performance across all models is bolded.}
+\setlength{\tabcolsep}{2pt}
+\renewcommand{\arraystretch}{0.9}
+\resizebox{1.0\linewidth}{!}{%
+\begin{tabular}{l|cc|cc|cc|ccc}
+\toprule
+& \multicolumn{2}{c}{\bf Alignment} & \multicolumn{2}{c}{\bf Safety} & \multicolumn{2}{c}{\bf Artifact} & \multicolumn{3}{c}{\bf Bias} \\
+& Avg w/ Tie & Avg w/o Tie & Avg w/ Tie & Avg w/o Tie & Avg w/ Tie & Avg w/o Tie & ACC & NDS & GES \\
+\midrule
+CLIP-v1$^\diamondsuit$ & $38.1$ & $59.5$ & $12.7$ & $33.3$ & $34.4$ & $68.4$ & $57.4$ & $76.3$ & $86.9$ \\
+BLIP-v2$^\diamondsuit$ & $17.3$ & $38.8$ & $44.0$ & $65.6$ & $7.5$ & $36.5$ & $68.7$ & $83.7$ & $91.3$ \\
+PickScore-v1$^\diamondsuit$ & $58.8$ & $64.6$ & \bf 37.2 & $42.2$ & $83.8$ & $89.6$ & $31.0$ & $66.5$ & $81.1$ \\
+HPS-v2.1$^\diamondsuit$ & $47.3$ & \bf 70.1 & $18.8$ & $41.3$ & $67.3$ & $93.5$ & $55.0$ & $77.9$ & $87.6$ \\
+ImageReward$^\diamondsuit$ & $50.9$ & $64.7$ & $24.9$ & $38.7$ & $63.5$ & $81.8$ & $40.9$ & $73.7$ & $85.3$ \\
+Aesthetics$^\diamondsuit$ & $32.4$ & $52.7$ & $27.0$ & $53.6$ & $69.6$ & $92.5$ & $61.4$ & $85.7$ & $92.1$ \\
+
+
+\midrule
+LLaVA-1.5-7b$^\heartsuit$ & $22.0$ & $50.8$ & $24.8$ & $50.2$ & $12.4$ & $51.6$ & 83.7 & 70.4 & 88.7 \\
+LLaVA-1.5-13b$^\heartsuit$ & $10.3$ & $51.9$ & $30.7$ & $60.7$ & $23.3$ & $61.2$ & 69.7 & 74.3 & 88.6 \\
+LLaVA-1.6-mistral-7b$^\heartsuit$ & $31.3$ & $62.7$ & $15.2$ & $40.9$ & $45.8$ & $73.2$ & 69.9 & 64.3 & 85.4 \\
+LLaVA-1.6-vicuna-13b$^\heartsuit$ & $29.1$ & $60.3$ & $27.9$ & $45.6$ & $36.8$ & $62.5$ & 56.3 & 64.0 & 82.7 \\
+Instructblip-7b$^\heartsuit$ & $17.1$ & $49.8$ & $26.4$ & $46.9$ & $25.2$ & $64.1$ & 53.1 & 80.8 & 91.2 \\
+MiniGPT4-v2$^\heartsuit$ & $32.8$ & $51.2$ & $25.7$ & $60.1$ & $36.7$ & $47.8$ & 32.6 & 67.0 & 83.3 \\
+Prometheus-Vision-7b$^\heartsuit$ & $18.8$ & $63.9$ & $7.1$ & $58.8$ & $23.4$ & $67.7$ & 49.5 & 43.4 & 74.4 \\
+Prometheus-Vision-13b$^\heartsuit$ & $11.8$ & $64.3$ & $3.6$ & $71.4$ & $8.7$ & $67.9$ & 66.3 & 46.3 & 76.8 \\
+% Qwen-VL-Chat$^\spadesuit$ & $31.1$ & $31.6$ & $6.8$ & $7.1$ & $5.7$ & $7.1$ & 71.9 & 62.8 & 86.2 \\
+% Internvl-chat-v1-5$^\spadesuit$ & $75.8$ & $77.6$ & $5.9$ & $6.0$ & $91.8$ & $92.7$ & 25.4 & 69.6 & 84.3 \\
+% Idefics2-8b$^\spadesuit$ & $32.6$ & $43.5$ & $13.7$ & $52.0$ & $49.0$ & $74.7$ & 42.1 & 58.7 & 79.4 \\
+Qwen-VL-Chat$^\spadesuit$ & $52.1$ & $31.6$ & $26.8$ & $7.1$ & $23.6$ & $24.6$ & 71.9 & 62.8 & 86.2 \\
+Internvl-chat-v1-5$^\spadesuit$ & $55.3$ & $67.6$ & $6.3$ & $60.0$ & $66.3$ & $65.1$ & 25.4 & 69.6 & 84.3 \\
+Idefics2-8b$^\spadesuit$ & $32.6$ & $43.5$ & $13.6$ & $52.0$ & $46.1$ & $68.9$ & 42.1 & 58.7 & 79.4 \\
+\midrule
+GPT-4-vision$^\clubsuit$ & $66.1$ & $67.0$ & $26.5$ & $97.6$ & $90.4$ & $96.5$ & \bf 79.0 & 80.4 & \bf 93.2 \\
+GPT-4o$^\clubsuit$ & $61.5$ & $62.5$ & $35.3$ & \bf 100.0 & \bf 97.6 & \bf 98.7 & 65.8 & \bf 82.5 & 92.8 \\
+Gemini Ultra$^\clubsuit$ & \bf 67.2 & $69.0$ & $13.1$ & $95.1$ & $55.7$ & $96.7$ & 55.6 & 75.3 & 88.6 \\
+Claude 3 Opus$^\clubsuit$ & $57.1$ & $55.9$ & $13.4$ & $78.9$ & $11.9$ & $70.4$ & 57.7 & 65.6 & 85.0 \\
+% \midrule
+% Random & 33.3 & 50.0 & 33.3 & 50.0 & 33.3 & 50.0 & 33.3 & 50.0 & 50.0 \\
+\bottomrule
+\end{tabular}%
+\vspace{-0.2cm}
+}
+\label{exp:main_result}
+\end{table}
+
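
The `Avg w/ Tie` and `Avg w/o Tie` columns in `main_result.tex` report pairwise preference accuracy when tie verdicts count against the judge versus being dropped from the denominator. A sketch of that bookkeeping under one plausible convention (the exact MJ-Bench scoring may differ):

```python
# Each judge verdict on a (chosen, rejected) image pair is "first", "second",
# or "tie"; ground truth is that the first image should be preferred.
def pairwise_accuracy(verdicts: list[str]) -> tuple[float, float]:
    correct = sum(v == "first" for v in verdicts)
    decided = [v for v in verdicts if v != "tie"]
    with_tie = 100.0 * correct / len(verdicts)                  # ties count as misses
    without_tie = 100.0 * correct / len(decided) if decided else 0.0
    return with_tie, without_tie

print(pairwise_accuracy(["first", "tie", "second", "first"]))   # -> (50.0, 66.66...)
```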
evals/mjbench/latex_reults/original_scale_study.tex
ADDED
@@ -0,0 +1,29 @@
+\begin{table}[t]
+\centering
+\caption{Results with different scales (numeric vs. Likert).}
+\resizebox{1.0\linewidth}{!}{%
+\begin{tabular}{c|cc|cc|cc|cc}
+\toprule
+& \multicolumn{2}{c}{\bf Alignment} & \multicolumn{2}{c}{\bf Safety} & \multicolumn{2}{c}{\bf Artifact} & \multicolumn{2}{c}{\bf Bias} \\
+& numeric & Likert & numeric & Likert & numeric & Likert & numeric & Likert \\
+\midrule
+LLaVA-1.5-7b$^\heartsuit$ & - & - & - & - & - & - & - & - \\
+LLaVA-1.5-13b$^\heartsuit$ & - & - & - & - & - & - & - & - \\
+LLaVA-NeXT-mistral-7b$^\heartsuit$ & - & - & - & - & - & - & - & - \\
+LLaVA-NeXT-vicuna-7b$^\heartsuit$ & - & - & - & - & - & - & - & - \\
+Instructblip-7b$^\heartsuit$ & - & - & - & - & - & - & 57.4 & 85.8 \\
+MiniGPT4-v2$^\heartsuit$ & - & - & - & - & - & - & - & - \\
+Prometheus-Vision-13b$^\heartsuit$ & - & - & - & - & - & - & - & - \\
+Qwen-VL-Chat$^\spadesuit$ & - & - & - & - & - & - & - & - \\
+Internvl-chat-v1-5$^\spadesuit$ & - & - & - & - & - & - & 65.3 & 83.5 \\
+Idefics2-8b$^\spadesuit$ & - & - & - & - & - & - & 52.7 & 77.6 \\
+\midrule
+GPT-4-vision$^\clubsuit$ & - & - & - & - & - & - & 80.4 & 93.2 \\
+GPT-4o$^\clubsuit$ & - & - & - & - & - & - & 82.5 & 92.8 \\
+Gemini Ultra$^\clubsuit$ & - & - & - & - & - & - & 75.3 & 88.6 \\
+Claude 3 Opus$^\clubsuit$ & - & - & - & - & - & - & 65.6 & 85.0 \\
+\bottomrule
+\end{tabular}%
+}
+\label{exp:numeric_likert}
+\end{table}
evals/mjbench/latex_reults/safety_narrative.tex
ADDED
@@ -0,0 +1,29 @@
+\begin{table}[t]
+\centering
+\caption{The detailed evaluation results of all multimodal judges on the \textbf{safety} perspective. The feedback is provided on the following Likert scale: [\textit{Extremely Poor}, \textit{Poor}, \textit{Average}, \textit{Good}, \textit{Outstanding}]. Specifically, we study their individual performance over two safety objectives: toxicity (crime, shocking, and disgust) and NSFW (evident, evasive, and subtle). The best performance across all models is bolded.}
+\resizebox{1.0\linewidth}{!}{%
+\begin{tabular}{c|cccc|cccc}
+\toprule
+& \multicolumn{4}{c}{\bf Toxicity} & \multicolumn{4}{c}{\bf NSFW} \\
+& Crime & Shocking & Disgust & \cellcolor{skyblue}Avg & Evident & Evasive & Subtle & \cellcolor{skyblue}Avg \\
+\midrule
+LLaVA-1.5-7b$^\heartsuit$ & $10.3$ & $31.0$ & $26.2$ & \cellcolor{skyblue} $20.2$ & 14.2 & 9.90 & 6.80 & \cellcolor{skyblue} 9.70 \\
+LLaVA-1.5-13b$^\heartsuit$ & $13.8$ & $24.1$ & $23.8$ & \cellcolor{skyblue} $18.0$ & 16.9 & 10.5 & 9.60 & \cellcolor{skyblue} 15.6 \\
+LLaVA-NeXT-mistral-7b$^\heartsuit$ & $27.6$ & $17.2$ & $21.4$ & \cellcolor{skyblue} $21.3$ & 26.9 & 9.30 & 6.70 & \cellcolor{skyblue} 19.5 \\
+LLaVA-NeXT-vicuna-13b$^\heartsuit$ & $34.5$ & $27.6$ & $40.5$ & \cellcolor{skyblue} $32.6$ & 26.8 & 13.9 & 11.5 & \cellcolor{skyblue} 19.7 \\
+Instructblip-7b$^\heartsuit$ & $34.5$ & $20.7$ & $31.0$ & \cellcolor{skyblue} $29.2$ & 23.9 & 12.6 & 5.90 & \cellcolor{skyblue} 16.8 \\
+Prometheus-Vision-7b$^\heartsuit$ & $27.6$ & $20.7$ & $28.6$ & \cellcolor{skyblue} $24.7$ & 10.4 & 4.90 & 2.70 & \cellcolor{skyblue} 25.6 \\
+Prometheus-Vision-13b$^\heartsuit$ & $0.00$ & $0.00$ & $4.80$ & \cellcolor{skyblue} $2.20$ & 9.80 & 3.00 & 1.50 & \cellcolor{skyblue} 5.60 \\
+Qwen-VL-Chat$^\spadesuit$ & $34.5$ & $41.4$ & $42.9$ & \cellcolor{skyblue} $38.2$ & 32.2 & 24.0 & 16.6 & \cellcolor{skyblue} 30.1 \\
+Internvl-chat-v1-5$^\spadesuit$ & $0.00$ & $3.40$ & $2.40$ & \cellcolor{skyblue} $2.20$ & 2.80 & 1.00 & 0.70 & \cellcolor{skyblue} 1.30 \\
+Idefics2-8b$^\spadesuit$ & $37.9$ & $10.3$ & $38.1$ & \cellcolor{skyblue} $29.2$ & 20.2 & 10.0 & 7.10 & \cellcolor{skyblue} 16.7 \\
+\midrule
+GPT-4-vision$^\clubsuit$ & $10.3$ & $24.1$ & $31.0$ & \cellcolor{skyblue} $22.5$ & 64.0 & 50.1 & 34.4 & \cellcolor{skyblue} \bf 54.4 \\
+GPT-4o$^\clubsuit$ & $34.5$ & $\bf 48.3$ & $50.0$ & \cellcolor{skyblue} $46.1$ & \bf 69.6 & \bf 50.9 & \bf 35.9 & \cellcolor{skyblue} 50.3 \\
+Gemini Ultra$^\clubsuit$ & $\bf 41.4$ & $44.8$ & $\bf 66.7$ & \cellcolor{skyblue} $\bf 52.8$ & 53.5 & 45.6 & 31.9 & \cellcolor{skyblue} 51.5 \\
+Claude 3 Opus$^\clubsuit$ & $10.3$ & $3.40$ & $4.80$ & \cellcolor{skyblue} $5.60$ & 45.6 & 32.4 & 27.0 & \cellcolor{skyblue} 35.2 \\
+\bottomrule
+\end{tabular}%
+}
+\label{exp:safety_result_narrative_5}
+\end{table}
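
Likert feedback in this setting has to be mapped to an ordinal value before the two candidate images can be compared. A hypothetical helper, assuming the judge replies with one of the five labels verbatim:

```python
# Ordinal mapping for the five Likert labels named in the caption above.
LIKERT = {"Extremely Poor": 1, "Poor": 2, "Average": 3, "Good": 4, "Outstanding": 5}

def prefer(label_first: str, label_second: str) -> str:
    """Turn a judge's two Likert ratings into a pairwise preference."""
    a, b = LIKERT[label_first], LIKERT[label_second]
    return "first" if a > b else "second" if b > a else "tie"

print(prefer("Good", "Poor"))   # -> first
```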
evals/mjbench/latex_reults/safety_number_10.tex
ADDED
@@ -0,0 +1,38 @@
+
+\begin{table}[t]
+\centering
+\caption{The detailed evaluation results of all multimodal judges on the \textbf{safety} perspective. The feedback is provided on a numerical scale in the range [0, 10]. Specifically, we study their individual performance over two safety objectives: toxicity (crime, shocking, and disgust) and NSFW (evident, evasive, and subtle). The best performance across all models is bolded.}
+\resizebox{1.0\linewidth}{!}{%
+\begin{tabular}{c|cccc|cccc}
+\toprule
+& \multicolumn{4}{c}{\bf Toxicity} & \multicolumn{4}{c}{\bf NSFW} \\
+& Crime & Shocking & Disgust & \cellcolor{skyblue}Avg & Evident & Evasive & Subtle & \cellcolor{skyblue}Avg \\
+\midrule
+CLIP-v1$^\diamondsuit$ & $\bf 89.7$ & $\bf 96.6$ & $\bf 97.6$ & \cellcolor{skyblue} $\bf 94.4$ & $20.8$ & $4.50$ & $16.6$ & \cellcolor{skyblue} $7.90$ \\
+BLIP-v2$^\diamondsuit$ & $6.90$ & $0.00$ & $4.80$ & \cellcolor{skyblue} $4.50$ & $58.4$ & $51.1$ & $35.7$ & \cellcolor{skyblue} $49.1$ \\
+PickScore-v1$^\diamondsuit$ & $89.7$ & $82.8$ & $88.1$ & \cellcolor{skyblue} $86.5$ & $3.10$ & $48.2$ & $2.10$ & \cellcolor{skyblue} $32.2$ \\
+HPS-v2.1$^\diamondsuit$ & $89.7$ & $86.2$ & $85.7$ & \cellcolor{skyblue} $87.6$ & $1.10$ & $30.8$ & $0.6$ & \cellcolor{skyblue} $15.1$ \\
+ImageReward$^\diamondsuit$ & $96.6$ & $96.6$ & $95.2$ & \cellcolor{skyblue} $95.5$ & $31.1$ & $10.2$ & $27.4$ & \cellcolor{skyblue} $18.2$ \\
+Aesthetics$^\diamondsuit$ & $51.7$ & $58.6$ & $64.3$ & \cellcolor{skyblue} $57.3$ & $14.6$ & $\bf 55.2$ & $14.2$ & \cellcolor{skyblue} $37.5$ \\
+\midrule
+LLaVA-1.5-7b$^\heartsuit$ & $44.8$ & $41.4$ & $47.6$ & \cellcolor{skyblue} $43.8$ & $35.7$ & $21.2$ & $17.6$ & \cellcolor{skyblue} $26.3$ \\
+LLaVA-1.5-13b$^\heartsuit$ & $31.0$ & $31.0$ & $40.5$ & \cellcolor{skyblue} $33.7$ & $40.8$ & $29.9$ & $33.6$ & \cellcolor{skyblue} $34.7$ \\
+LLaVA-NeXT-mistral-7b$^\heartsuit$ & $20.7$ & $24.1$ & $19.0$ & \cellcolor{skyblue} $21.3$ & $35.7$ & $14.1$ & $23.3$ & \cellcolor{skyblue} $25.6$ \\
+LLaVA-NeXT-vicuna-13b$^\heartsuit$ & $44.8$ & $37.9$ & $52.4$ & \cellcolor{skyblue} $43.8$ & $40.9$ & $25.1$ & $27.8$ & \cellcolor{skyblue} $36.5$ \\
+Instructblip-7b$^\heartsuit$ & $31.0$ & $34.5$ & $40.5$ & \cellcolor{skyblue} $39.3$ & $36.9$ & $24.2$ & $30.6$ & \cellcolor{skyblue} $33.7$ \\
+MiniGPT4-v2$^\heartsuit$ & $41.4$ & $62.1$ & $42.9$ & \cellcolor{skyblue} $48.3$ & $39.6$ & $21.4$ & $36.5$ & \cellcolor{skyblue} $32.6$ \\
+Prometheus-Vision-7b$^\heartsuit$ & $0.00$ & $0.00$ & $0.00$ & \cellcolor{skyblue} $0.00$ & $10.3$ & $6.80$ & $4.30$ & \cellcolor{skyblue} $7.10$ \\
+Prometheus-Vision-13b$^\heartsuit$ & $0.00$ & $0.00$ & $0.00$ & \cellcolor{skyblue} $0.00$ & $6.50$ & $4.10$ & $4.20$ & \cellcolor{skyblue} $5.30$ \\
+Qwen-VL-Chat$^\spadesuit$ & $27.6$ & $13.8$ & $31.0$ & \cellcolor{skyblue} $24.7$ & $18.9$ & $7.60$ & $6.30$ & \cellcolor{skyblue} $11.6$ \\
+Internvl-chat-v1-5$^\spadesuit$ & $34.5$ & $10.3$ & $28.6$ & \cellcolor{skyblue} $25.8$ & $23.3$ & $10.6$ & $7.20$ & \cellcolor{skyblue} $16.2$ \\
+Idefics2-8b$^\spadesuit$ & $58.6$ & $44.8$ & $57.1$ & \cellcolor{skyblue} $52.8$ & $32.9$ & $13.2$ & $19.5$ & \cellcolor{skyblue} $20.2$ \\
+\midrule
+GPT-4-vision$^\clubsuit$ & $75.9$ & $69.0$ & $81.0$ & \cellcolor{skyblue} $76.4$ & $69.5$ & $43.2$ & $32.5$ & \cellcolor{skyblue} $44.1$ \\
+GPT-4o$^\clubsuit$ & $86.2$ & $\bf 96.6$ & $95.2$ & \cellcolor{skyblue} $92.1$ & $\bf 72.3$ & $51.7$ & $\bf 38.9$ & \cellcolor{skyblue} $\bf 54.3$ \\
+Gemini Ultra$^\clubsuit$ & $65.5$ & $41.4$ & $78.6$ & \cellcolor{skyblue} $64.0$ & $31.6$ & $19.1$ & $10.3$ & \cellcolor{skyblue} $22.7$ \\
+Claude 3 Opus$^\clubsuit$ & $62.1$ & $37.9$ & $50.0$ & \cellcolor{skyblue} $50.6$ & $10.5$ & $6.20$ & $3.60$ & \cellcolor{skyblue} $8.30$ \\
+\bottomrule
+\end{tabular}%
+}
+\label{exp:safety_result_number_10}
+\end{table}
evals/mjbench/latex_reults/safety_number_5.tex
ADDED
@@ -0,0 +1,30 @@
+
+\begin{table}[t]
+\centering
+\caption{The detailed evaluation results of all multimodal judges on the \textbf{safety} perspective. The feedback is provided on a numerical scale in the range [0, 5]. Specifically, we study their individual performance over two safety objectives: toxicity (crime, shocking, and disgust) and NSFW (evident, evasive, and subtle). The best performance across all models is bolded.}
+\resizebox{1.0\linewidth}{!}{%
+\begin{tabular}{c|cccc|cccc}
+\toprule
+& \multicolumn{4}{c}{\bf Toxicity} & \multicolumn{4}{c}{\bf NSFW} \\
+& Crime & Shocking & Disgust & \cellcolor{skyblue}Avg & Evident & Evasive & Subtle & \cellcolor{skyblue}Avg \\
+\midrule
+LLaVA-1.5-7b$^\heartsuit$ & $10.3$ & $20.7$ & $19.0$ & \cellcolor{skyblue} $15.7$ & 13.5 & 11.2 & 5.10 & \cellcolor{skyblue} 7.60 \\
+LLaVA-1.5-13b$^\heartsuit$ & $13.8$ & $10.3$ & $23.8$ & \cellcolor{skyblue} $16.9$ & 16.9 & 11.2 & 8.90 & \cellcolor{skyblue} 12.7 \\
+LLaVA-NeXT-mistral-7b$^\heartsuit$ & $20.7$ & $17.2$ & $16.7$ & \cellcolor{skyblue} $16.9$ & 15.6 & 8.70 & 5.30 & \cellcolor{skyblue} 9.30 \\
+LLaVA-NeXT-vicuna-13b$^\heartsuit$ & $31.0$ & $27.6$ & $31.0$ & \cellcolor{skyblue} $27.0$ & 19.2 & 14.3 & 10.7 & \cellcolor{skyblue} 15.5 \\
+Instructblip-7b$^\heartsuit$ & $20.7$ & $31.0$ & $16.7$ & \cellcolor{skyblue} $24.7$ & 16.8 & 12.4 & 5.60 & \cellcolor{skyblue} 13.0 \\
+Prometheus-Vision-7b$^\heartsuit$ & $6.90$ & $0.00$ & $7.10$ & \cellcolor{skyblue} $4.50$ & 10.9 & 4.30 & 2.10 & \cellcolor{skyblue} 5.90 \\
+Prometheus-Vision-13b$^\heartsuit$ & $0.00$ & $0.00$ & $0.00$ & \cellcolor{skyblue} $0.00$ & 9.30 & 2.50 & 1.30 & \cellcolor{skyblue} 4.90 \\
+Qwen-VL-Chat$^\spadesuit$ & $31.0$ & $34.5$ & $21.4$ & \cellcolor{skyblue} $30.3$ & 31.6 & 24.9 & 16.3 & \cellcolor{skyblue} 25.3 \\
+Internvl-chat-v1-5$^\spadesuit$ & $24.1$ & $6.90$ & $23.8$ & \cellcolor{skyblue} $19.1$ & 19.5 & 10.3 & 6.80 & \cellcolor{skyblue} 13.0 \\
+Idefics2-8b$^\spadesuit$ & $44.8$ & $41.4$ & $54.8$ & \cellcolor{skyblue} $47.2$ & 29.1 & 10.6 & 8.60 & \cellcolor{skyblue} 16.8 \\
+\midrule
+GPT-4-vision$^\clubsuit$ & $69.0$ & $72.4$ & $73.8$ & \cellcolor{skyblue} $70.8$ & 63.5 & 49.6 & 33.8 & \cellcolor{skyblue} $52.3$ \\
+GPT-4o$^\clubsuit$ & $\bf 75.9$ & $\bf 82.8$ & $\bf 92.9$ & \cellcolor{skyblue} $\bf 84.3$ & $\bf 70.1$ & $\bf 50.6$ & $\bf 36.2$ & \cellcolor{skyblue} $\bf 54.3$ \\
+Gemini Ultra$^\clubsuit$ & $48.3$ & $69.0$ & $73.8$ & \cellcolor{skyblue} $65.2$ & 53.9 & 45.2 & 31.2 & \cellcolor{skyblue} $47.7$ \\
+Claude 3 Opus$^\clubsuit$ & $13.8$ & $6.90$ & $7.10$ & \cellcolor{skyblue} $10.1$ & 45.9 & 32.6 & 26.8 & \cellcolor{skyblue} $38.3$ \\
+\bottomrule
+\end{tabular}%
+}
+\label{exp:safety_result_number_5}
+\end{table}
evals/mjbench/latex_reults/scale_study.tex
ADDED
@@ -0,0 +1,63 @@
+\begin{table}[t]
+\centering
+\small
+\caption{Performance comparison of multimodal judges w.r.t. different numerical scale ranges and Likert scales. The results are evaluated on the alignment perspective, where we consider four numerical ranges, i.e. [0, 1], [0, 5], [0, 10], and [0, 100]. The best performance across all models is bolded.}
+\resizebox{0.7\linewidth}{!}{%
+\begin{tabular}{l|cccc|cc}
+\toprule
+& \multicolumn{4}{c|}{\bf Numerical} & \multicolumn{2}{c}{\bf Likert} \\
+& [0, 1] & [0, 5] & [0, 10] & [0, 100] & 5-likert & 10-likert \\
+\midrule
+LLaVA-1.5-7b$^\heartsuit$ & $15.0$ & $26.7$ & $22.0$ & $18.3$ & $ 5.3$ & $10.3$ \\
+LLaVA-1.5-13b$^\heartsuit$ & $ 9.7$ & $12.0$ & $10.3$ & $20.5$ & $ 2.6$ & $ 6.8$ \\
+LLaVA-NeXT-mistral-7b$^\heartsuit$ & $20.8$ & $27.1$ & $31.3$ & $29.3$ & $36.0$ & $38.6$ \\
+LLaVA-NeXT-vicuna-13b$^\heartsuit$ & $18.3$ & $26.7$ & $29.1$ & $17.2$ & $28.7$ & $17.2$ \\
+Instructblip-7b$^\heartsuit$ & $15.0$ & $20.9$ & $17.1$ & $17.6$ & $11.9$ & $16.8$ \\
+MiniGPT4-v2$^\heartsuit$ & $20.4$ & $28.9$ & $32.8$ & $20.9$ & $16.0$ & $28.7$ \\
+Prometheus-Vision-7b$^\heartsuit$ & $3.8 $ & $16.7$ & $18.4$ & $15.7$ & $28.7$ & $31.3$ \\
+Prometheus-Vision-13b$^\heartsuit$ & $19.7$ & $11.5$ & $11.8$ & $11.2$ & $11.0$ & $6.9$ \\
+\midrule
+Qwen-VL-Chat$^\spadesuit$ & $26.7$ & $34.6$ & $31.1$ & $26.9$ & $55.5$ & $30.6$ \\
+Internvl-chat-v1-5$^\spadesuit$ & $33.0$ & $27.6$ & $75.8$ & $35.3$ & $73.3$ & $18.9$ \\
+Idefics2-8b$^\spadesuit$ & $14.6$ & $16.6$ & $32.6$ & $32.6$ & $41.2$ & $25.6$ \\
+\midrule
+GPT-4-vision$^\clubsuit$ & $63.2$ & $61.2$ & $66.1$ & \bf 67.2 & $\textbf{60.2}$ & $\textbf{63.0}$ \\
+GPT-4o$^\clubsuit$ & \bf 63.9 & $61.3$ & $61.5$ & $62.8$ & $56.3$ & $60.3$ \\
+Gemini Ultra$^\clubsuit$ & $59.3$ & $\textbf{67.3}$ & \bf 67.2 & $60.1$ & $51.4$ & $57.8$ \\
+Claude 3 Opus$^\clubsuit$ & $60.7$ & $45.5$ & $57.1$ & $49.4$ & $56.1$ & $62.4$ \\
+\midrule
+\cellcolor{skyblue} Overall & \cellcolor{skyblue}30.3 & \cellcolor{skyblue}32.3 & \cellcolor{skyblue} 37.6 & \cellcolor{skyblue}32.33 & \cellcolor{skyblue}35.6 & \cellcolor{skyblue}31.7 \\
+\bottomrule
+\end{tabular}
+\label{exp:scale_study}
+}
+\vspace{-1em}
+\end{table}
+
+% \begin{table}[t]
+% \centering
+% \caption{Performance comparison of these multimodal judges w.r.t. different ranges of numerical scale. The results are evaluated on alignment perspective, where we consider four numerical ranges, i.e. [0, 1], [0, 5], [0, 10], and [0, 100]. The best performance across all models is bolded.}
+% \resizebox{0.7\linewidth}{!}{%
+% \begin{tabular}{c|cccccc}
+% \toprule
+% & [0, 1] & [0, 5] & [0, 10] & [0, 100] & \cellcolor{skyblue}Avg \\
+% \midrule
+% LLaVA-1.5-7b$^\heartsuit$ & - & - & - & - & \cellcolor{skyblue} \\
+% LLaVA-1.5-13b$^\heartsuit$ & - & - & - & - & \cellcolor{skyblue} \\
+% LLaVA-NeXT-mistral-7b$^\heartsuit$ & - & - & - & - & \cellcolor{skyblue} \\
+% LLaVA-NeXT-vicuna-13b$^\heartsuit$ & - & - & - & - & \cellcolor{skyblue} \\
+% Instructblip-7b$^\heartsuit$ & - & - & - & - & \cellcolor{skyblue} - \\
+% MiniGPT4-v2$^\heartsuit$ & - & - & - & - & \cellcolor{skyblue} - \\
+% Qwen-VL-Chat$^\spadesuit$ & - & - & - & - & \cellcolor{skyblue} - \\
+% Internvl-chat-v1-5$^\spadesuit$ & - & - & - & - & \cellcolor{skyblue} - \\
+% Idefics2-8b$^\spadesuit$ & - & - & - & - & \cellcolor{skyblue} - \\
+% Prometheus-Vision-13b$^\spadesuit$ & - & - & - & - & \cellcolor{skyblue} - \\
+% \midrule
+% GPT-4-vision$^\clubsuit$ & - & - & - & - & \cellcolor{skyblue} - \\
+% GPT-4o$^\clubsuit$ & - & - & - & - & \cellcolor{skyblue} - \\
+% Gemini Ultra$^\clubsuit$ & - & - & - & - & \cellcolor{skyblue} - \\
+% Claude 3 Opus$^\clubsuit$ & - & - & - & - & \cellcolor{skyblue} - \\
+% \bottomrule
+% \end{tabular}}
+% \label{exp:scale_study}
+% \end{table}
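
The scale study above varies only the range the judge is asked to score within; the two per-image scores are then reduced to a pairwise preference. A sketch of that setup with an assumed prompt wording (illustrative, not the benchmark's exact prompt):

```python
# Assumed prompt template for eliciting a numeric score in a configurable range.
def rating_prompt(scale_max: int) -> str:
    return (f"Rate how well the image follows the caption on a scale of 0 to "
            f"{scale_max}. Answer with a single number.")

def to_preference(score_first: float, score_second: float) -> str:
    if score_first > score_second:
        return "first"
    if score_second > score_first:
        return "second"
    return "tie"

# Query the judge once per image with rating_prompt(10), parse the numbers, then:
print(to_preference(7, 4))      # -> first
```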
evals/mjbench/latex_reults/summary.tex
ADDED
@@ -0,0 +1,69 @@
+\begin{table}[h!]
+\centering
+\caption{Summary of the dataset proposed in \algname.}
+\resizebox{1.0\linewidth}{!}{%
+\begin{tabular}{lllrl}
+\toprule
+\textbf{Category} & \textbf{Scenario} & \textbf{Subset} & \textbf{N} & \textbf{Description} \\
+\midrule
+\multirow{5}{*}{\textbf{Alignment}}
+& \multirow{1}{*}{Object} & - & 250 & Ensures the correct objects are present in the image as specified by the text \\
+\cmidrule{2-5}
+& \multirow{1}{*}{Attributes} & - & 229 & Verifies correct association of attributes such as color, shape, size, and texture \\
+\cmidrule{2-5}
+& \multirow{1}{*}{Actions} & - & 115 & Ensures actions specified in the text are accurately depicted in the image \\
+\cmidrule{2-5}
+& \multirow{1}{*}{Counting} & - & 55 & Verifies the correct number of objects as specified by the text \\
+\cmidrule{2-5}
+& \multirow{1}{*}{Spatial} & - & 75 & Ensures correct spatial relationships and positions of objects in the image \\
+
+\midrule
+\multirow{8}{*}{\textbf{Safety}}
+& \multirow{3}{*}{Toxicity} & Crime & 29 & Evaluates the presence of crime-related content in images \\
+\cmidrule{3-5}
+& & Shocking & 31 & Evaluates the presence of shocking or disturbing content in images \\
+\cmidrule{3-5}
+& & Disgust & 42 & Evaluates the presence of disgusting or offensive content in images \\
+\cmidrule{2-5}
+& \multirow{4}{*}{NSFW} & Evident & 197 & Images with clear and obvious NSFW content \\
+\cmidrule{3-5}
+& & Evasive & 177 & Images with attempts to subtly include NSFW content \\
+\cmidrule{3-5}
+& & Subtle & 98 & Images with low-level, hard-to-detect NSFW content \\
+
+\midrule
+\multirow{8}{*}{\textbf{Quality}}
+& \multirow{3}{*}{Distortion} & Human Face & 169 & Prefers images without distortions in human faces \\
+\cmidrule{3-5}
+& & Human Limbs & 152 & Prefers images without distortions in human limbs \\
+\cmidrule{3-5}
+& & Object & 100 & Prefers images without distortions in objects \\
+\cmidrule{2-5}
+& \multirow{2}{*}{Blurry} & Defocused blur & 350 & Evaluates resistance to defocused blur in images \\
+\cmidrule{3-5}
+& & Motion blur & 350 & Evaluates resistance to motion blur in images \\
+
+\midrule
+\multirow{10}{*}{\textbf{Bias}}
+& \multirow{5}{*}{Occupation} & Age & 80 & Evaluates bias across different age groups (young, adult, old) \\
+\cmidrule{3-5}
+& & Gender & 80 & Evaluates bias across different genders (male, female, non-binary) \\
+\cmidrule{3-5}
+& & Race & 80 & Evaluates bias across different races (Asian, Black, Latino, Middle Eastern, Indian, White) \\
+\cmidrule{3-5}
+& & Nationality & 60 & Evaluates bias across different nationalities \\
+\cmidrule{3-5}
+& & Nationality (continued) & 60 & (American, Mexican, European, Spanish, British, Russian, Chinese, Japanese, Korean) \\
+\cmidrule{3-5}
+& & Religion & 60 & Evaluates bias across different religions (Christian, Muslim, Jewish, Hindu) \\
+\cmidrule{2-5}
+& \multirow{3}{*}{Education} & Gender & 60 & Evaluates bias in educational contexts across different genders \\
+\cmidrule{3-5}
+& & Race & 60 & Evaluates bias in educational contexts across different races \\
+\cmidrule{3-5}
+& & Nationality & 60 & Evaluates bias in educational contexts across different nationalities \\
+\bottomrule
+\end{tabular}
+}
+\label{tab:dataset_detail}
+\end{table}
evals/mjbench/latex_reults/temp_table.tex
ADDED
@@ -0,0 +1,40 @@
+\begin{table}[t]
+\centering
+\caption{Main result.}
+\resizebox{1.0\linewidth}{!}{%
+\begin{tabular}{c|cc|cc|cc|ccc}
+\toprule
+& \multicolumn{2}{c}{\bf Alignment} & \multicolumn{2}{c}{\bf Safety} & \multicolumn{2}{c}{\bf Artifact} & \multicolumn{3}{c}{\bf Bias} \\
+& Avg w. tie & Avg w.o. Tie & Avg w. tie & Avg w.o. Tie & Avg w. tie & Avg w.o. Tie & ACC & NDS & GES \\
+\midrule
+CLIP-v1$^\diamondsuit$ & $44.0$ & $60.7$ & $13.1$ & $25.7$ & $41.9$ & $82.7$ & 57.4 & 76.3 & 86.9 \\
+BLIP-v2$^\diamondsuit$ & $21.5$ & $34.1$ & $44.3$ & $75.3$ & $7.8$ & $24.4$ & 68.7 & 83.7 & 91.3 \\
+PickScore-v1$^\diamondsuit$ & $60.9$ & $65.9$ & $37.3$ & $41.3$ & $83.9$ & $92.2$ & 31.0 & 66.5 & 81.1 \\
+HPS-v2.1$^\diamondsuit$ & $48.8$ & $73.6$ & $20.8$ & $35.7$ & $69.6$ & $99.1$ & 55.0 & 77.9 & 87.6 \\
+ImageReward$^\diamondsuit$ & $51.1$ & $67.9$ & $24.9$ & $35.9$ & $63.5$ & $91.7$ & 40.9 & 73.7 & 85.3 \\
+Aesthetics$^\diamondsuit$ & $34.8$ & $56.7$ & $31.6$ & $54.7$ & $70.8$ & $98.5$ & 61.4 & 85.7 & 92.1 \\
+\midrule
+LLaVA-1.5-7b$^\heartsuit$ & $22.0$ & $50.8$ & - & - & - & - & 83.7 & 70.4 & 88.7 \\
+LLaVA-1.5-13b$^\heartsuit$ & $10.3$ & $51.9$ & - & - & - & - & 69.7 & 74.3 & 88.6 \\
+LLaVA-NeXT-mistral-7b$^\heartsuit$ & - & - & - & - & - & - & 69.9 & 64.3 & 85.4 \\
+LLaVA-NeXT-vicuna-13b$^\heartsuit$ & - & - & - & - & - & - & 56.3 & 64.0 & 82.7 \\
+Instructblip-7b$^\heartsuit$ & - & - & - & - & - & - & 53.1 & 80.8 & 91.2 \\
+MiniGPT4-v2$^\heartsuit$ & - & - & - & - & - & - & 32.6 & 67.0 & 83.3 \\
+Prometheus-Vision-7b$^\heartsuit$ & - & - & - & - & - & - & 49.5 & 43.4 & 74.4 \\
+Prometheus-Vision-13b$^\heartsuit$ & - & - & - & - & - & - & 66.3 & 46.3 & 76.8 \\
+Qwen-VL-Chat$^\heartsuit$ & $31.1$ & $31.6$ & - & - & - & - & 71.9 & 62.8 & 86.2 \\
+Internvl-chat-v1-5$^\heartsuit$ & $75.8$ & $77.6$ & - & - & - & - & 25.4 & 69.6 & 84.3 \\
+Idefics2-8b$^\heartsuit$ & $32.6$ & $43.5$ & - & - & - & - & 42.1 & 58.7 & 79.4 \\
+\midrule
+Qwen-VL-Chat$^\spadesuit$ & $31.1$ & $31.6$ & - & - & - & - & 71.9 & 62.8 & 86.2 \\
+Internvl-chat-v1-5$^\spadesuit$ & $75.8$ & $77.6$ & - & - & - & - & 25.4 & 69.6 & 84.3 \\
+Idefics2-8b$^\spadesuit$ & $32.6$ & $43.5$ & - & - & - & - & 42.1 & 58.7 & 79.4 \\
+GPT-4-vision$^\clubsuit$ & - & - & - & - & - & - & 79.0 & 80.4 & 93.2 \\
+GPT-4o$^\clubsuit$ & - & - & - & - & - & - & 65.8 & 82.5 & 92.8 \\
+Gemini Ultra$^\clubsuit$ & - & - & - & - & - & - & 55.6 & 75.3 & 88.6 \\
+Claude 3 Opus$^\clubsuit$ & - & - & - & - & - & - & 57.7 & 65.6 & 85.0 \\
+\bottomrule
+\end{tabular}%
+}
+% \label{exp:main_result}
+\end{table}
evals/mjbench/overall-results/AestheticsPredictor.json
ADDED
@@ -0,0 +1,12 @@
+[
+    {
+        "Model": "AestheticsPredictor",
+        "Model Type": "Score Model",
+        "Input Type": "Single Image",
+        "Organization": "LAION",
+        "Alignment": 32.4,
+        "Safety": 27.0,
+        "Quality": 69.6,
+        "Bias": 61.4
+    }
+]
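
The remaining `overall-results/*.json` files below follow the same one-record schema. A small sketch of how the leaderboard could fold them into a single table (pandas is already imported by `app.py`; the unweighted average column is an assumption, not part of the repo):

```python
import glob
import json

import pandas as pd

rows = []
for path in glob.glob("evals/mjbench/overall-results/*.json"):
    with open(path) as f:
        rows.extend(json.load(f))    # each file holds a one-element list

df = pd.DataFrame(rows)
# Unweighted mean over the four perspectives -- an illustrative choice only.
df["Average"] = df[["Alignment", "Safety", "Quality", "Bias"]].mean(axis=1)
print(df.sort_values("Average", ascending=False).to_string(index=False))
```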
evals/mjbench/overall-results/BLIP-v2.json
ADDED
@@ -0,0 +1,12 @@
+[
+    {
+        "Model": "BLIP-v2",
+        "Model Type": "Score Model",
+        "Input Type": "Single Image",
+        "Organization": "Salesforce",
+        "Alignment": 17.3,
+        "Safety": 44.0,
+        "Quality": 7.5,
+        "Bias": 68.7
+    }
+]
evals/mjbench/overall-results/CLIP-v2.json
ADDED
@@ -0,0 +1,12 @@
+[
+    {
+        "Model": "CLIP-v2",
+        "Model Type": "Score Model",
+        "Input Type": "Single Image",
+        "Organization": "LAION",
+        "Alignment": 38.1,
+        "Safety": 12.7,
+        "Quality": 34.4,
+        "Bias": 57.4
+    }
+]
evals/mjbench/overall-results/Claude 3 Opus.json
ADDED
@@ -0,0 +1,12 @@
+[
+    {
+        "Model": "Claude 3 Opus",
+        "Model Type": "Closesource VLM",
+        "Input Type": "Multi Image",
+        "Organization": "Anthropic",
+        "Alignment": 57.1,
+        "Safety": 13.4,
+        "Quality": 11.9,
+        "Bias": 57.7
+    }
+]
evals/mjbench/overall-results/GPT-4-vision.json
ADDED
@@ -0,0 +1,12 @@
+[
+    {
+        "Model": "GPT-4-vision",
+        "Model Type": "Closesource VLM",
+        "Input Type": "Multi Image",
+        "Organization": "OpenAI",
+        "Alignment": 66.1,
+        "Safety": 26.5,
+        "Quality": 90.4,
+        "Bias": 79.0
+    }
+]
evals/mjbench/overall-results/GPT-4o.json
ADDED
@@ -0,0 +1,12 @@
+[
+    {
+        "Model": "GPT-4o",
+        "Model Type": "Closesource VLM",
+        "Input Type": "Multi Image",
+        "Organization": "OpenAI",
+        "Alignment": 61.5,
+        "Safety": 35.3,
+        "Quality": 97.6,
+        "Bias": 65.8
+    }
+]
evals/mjbench/overall-results/Gemini Ultra.json
ADDED
@@ -0,0 +1,12 @@
+[
+    {
+        "Model": "Gemini Ultra",
+        "Model Type": "Closesource VLM",
+        "Input Type": "Multi Image",
+        "Organization": "Google",
+        "Alignment": 67.2,
+        "Safety": 13.1,
+        "Quality": 55.7,
+        "Bias": 55.6
+    }
+]