commit
Browse files
data/250206/textonly_decoder_report.csv
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
family,model,tag,all_tasks-add_relation_XS,all_tasks-add_relation_S,all_tasks-add_relation_M,all_tasks-add_relation_L,all_tasks-add_attribute_XS,all_tasks-add_attribute_S,all_tasks-add_attribute_M,all_tasks-add_attribute_L,all_tasks-replace_relation_XS,all_tasks-replace_relation_S,all_tasks-replace_relation_M,all_tasks-replace_relation_L,all_tasks-replace_attribute_XS,all_tasks-replace_attribute_S,all_tasks-replace_attribute_M,all_tasks-replace_attribute_L,all_tasks-replace_entity_XS,all_tasks-replace_entity_S,all_tasks-replace_entity_M,all_tasks-replace_entity_L,all_tasks-swap_relations,all_tasks-swap_attributes,overall-add_relation,overall-add_attribute,overall-replace_relation,overall-replace_attribute,overall-replace_entity,overall-swap_relations,overall-swap_attributes,overall-macro_avg,summary-add,summary-replace,summary-swap,summary-avg_ops,summary-relation,summary-attribute,summary-entity,summary-avg_neg_types,neg_start_positions-add_XS,neg_start_positions-add_S,neg_start_positions-add_M,neg_start_positions-add_L,neg_start_positions-replace_XS,neg_start_positions-replace_S,neg_start_positions-replace_M,neg_start_positions-replace_L,neg_start_positions-Avg_XS,neg_start_positions-Avg_S,neg_start_positions-Avg_M,neg_start_positions-Avg_L
|
| 2 |
+
vqascore,instructblip-flant5-xl,none,56.71641791044776,55.494505494505496,50.0,50.0,50.54945054945055,53.6,50.0,50.0,50.3448275862069,51.42857142857143,50.0,50.0,41.566265060240966,55.14018691588785,50.0,50.0,45.87155963302752,45.774647887323944,50.0,50.0,49.57446808510638,48.17927170868347,53.052730851238316,51.037362637362634,50.443349753694584,49.176612994032205,47.91155188008787,49.57446808510638,48.17927170868347,49.91076398717221,52.04504674430048,49.17717154260489,48.87686989689493,50.0330293946001,51.023516230013094,49.46441578002611,47.91155188008787,49.466494630042355,53.63293422994916,54.54725274725275,50.0,50.0,45.927550759825124,50.78113541059441,50.0,50.0,49.78024249488714,52.66419407892358,50.0,50.0
|
| 3 |
+
vqascore,clip-flant5-xl,none,26.11940298507463,29.94505494505495,31.34328358208955,22.22222222222222,50.0,50.8,58.333333333333336,51.724137931034484,56.206896551724135,54.64285714285714,58.267716535433074,59.67741935483871,61.445783132530124,55.14018691588785,59.16666666666667,61.53846153846155,62.38532110091743,57.04225352112677,56.074766355140184,64.28571428571428,58.08510638297872,58.96358543417367,27.40749093361034,52.71436781609195,57.19872239621326,59.322774563386545,59.94701381572467,58.08510638297872,58.96358543417367,53.37700876316846,40.06092937485114,58.822836925108156,58.524345908576194,52.4693707361785,47.56377323760078,57.00024260455072,59.94701381572467,54.837009885958715,38.059701492537314,40.37252747252747,44.83830845771144,36.97318007662835,60.01266692839056,55.608432526623915,57.83638318574665,61.83386505967152,49.03618421046394,47.9904799995757,51.337345821729045,49.40352256814994
|
| 4 |
+
vqascore,llava-v1.5-7b,none,40.298507462686565,40.93406593406594,42.91044776119403,37.03703703703704,54.120879120879124,54.4,59.55882352941177,48.27586206896552,54.48275862068965,49.28571428571428,43.7007874015748,41.935483870967744,51.204819277108435,51.86915887850468,50.83333333333333,42.30769230769231,47.247706422018354,51.056338028169016,51.4018691588785,69.64285714285714,48.29787234042553,52.10084033613445,40.2950145487459,54.088891179814105,47.35118604473662,49.05375094915969,54.83719268798075,48.29787234042553,52.10084033613445,49.432106869571,47.19195286428,50.41404322729235,50.19935633827999,49.268450809950785,45.31469097796935,51.74782748836941,54.83719268798075,50.633237051439835,47.20969329178284,47.667032967032966,51.2346356453029,42.65644955300128,50.978428106605485,50.73707039746265,48.64532996459554,51.29534444050572,49.09406069919416,49.20205168224781,49.939982804949224,46.975896996753505
|
| 5 |
+
vqascore,sharegpt4v-7b,none,24.626865671641788,33.24175824175824,43.47014925373135,43.05555555555556,44.78021978021978,53.60000000000001,54.41176470588235,63.793103448275865,48.27586206896551,47.142857142857146,51.181102362204726,59.677419354838705,55.12048192771084,51.4018691588785,50.833333333333336,50.0,64.22018348623854,61.971830985915496,51.4018691588785,73.21428571428572,55.31914893617021,53.22128851540616,36.098582180671734,54.1462719835945,51.56931023221652,51.83892110498067,62.70204233632956,55.31914893617021,53.22128851540616,52.127937898481335,45.122427082133115,55.37009122450892,54.27021872578818,51.58757901081007,47.66234711635283,53.06882720132711,62.70204233632956,54.47773888466983,34.70354272593079,43.42087912087912,48.94095697980685,53.42432950191571,55.872175827638294,53.505519095883706,51.138768284805515,60.963901689708145,45.287859276784545,48.46319910838142,50.03986263230618,57.19411559581192
|
| 6 |
+
visualgptscore,instructblip-flant5-xl,none,26.865671641791046,20.87912087912088,33.3955223880597,50.0,23.626373626373624,24.8,37.254901960784316,50.0,65.51724137931035,70.0,63.38582677165354,50.0,62.65060240963856,61.6822429906542,53.33333333333333,50.0,58.71559633027523,68.30985915492957,60.74766355140187,50.0,77.23404255319149,74.50980392156862,32.785078727242905,33.92031889678948,62.22576703774097,56.91654468340653,59.443279759151665,77.23404255319149,74.50980392156862,56.71926222558452,33.35269881201619,59.52853049343306,75.87192323738006,56.25105084760977,57.41496277272512,55.11555583392154,59.443279759151665,57.32459945526611,25.246022634082337,22.83956043956044,35.325212174422006,50.0,62.29448003974138,66.66403404852792,59.155607885462906,50.0,43.770251336911855,44.751797244044184,47.24041002994245,50.0
|
| 7 |
+
visualgptscore,clip-flant5-xl,none,25.37313432835821,15.934065934065933,19.029850746268657,20.37037037037037,18.13186813186813,24.8,31.862745098039213,24.137931034482758,67.58620689655173,66.42857142857143,62.20472440944882,87.09677419354838,62.65060240963856,72.89719626168224,61.25,57.692307692307686,60.550458715596335,68.30985915492957,62.616822429906534,85.71428571428571,76.59574468085107,76.47058823529412,20.17685534476579,24.733136066097526,70.8290692320301,63.62252659090712,69.29785650367954,76.59574468085107,76.47058823529412,57.3893966648036,22.454995705431656,67.91648410887224,76.5331664580726,55.63488209079217,55.86722308588232,54.942083630766255,69.29785650367954,60.03572107344271,21.752501230113168,20.367032967032966,25.446297922153935,22.254150702426564,63.5957560072622,69.21187561506107,62.023848946451785,76.83445586671392,42.67412861868769,44.78945429104702,43.73507343430286,49.54430328457025
|
| 8 |
+
visualgptscore,llava-v1.5-7b,none,10.44776119402985,10.43956043956044,7.835820895522389,5.555555555555555,17.582417582417584,16.8,22.058823529411764,20.689655172413794,73.79310344827587,70.0,74.01574803149606,83.87096774193549,74.69879518072288,71.02803738317756,68.75,65.38461538461539,75.22935779816514,78.16901408450704,71.02803738317756,89.28571428571429,85.1063829787234,83.75350140056022,8.569674521167059,19.282724071060787,75.41995480542685,69.96536198712896,78.428030887891,85.1063829787234,83.75350140056022,60.0750900931369,13.926199296113923,74.6044492268156,84.42994218964182,57.653530237523775,56.36533743510577,57.66719581958332,78.428030887891,64.15352138086003,14.015089388223718,13.619780219780221,14.947322212467077,13.122605363984675,74.57375214238796,73.06568382256152,71.26459513822454,79.5137658040884,44.29442076530584,43.342732021170875,43.1059586753458,46.318185584036534
|
| 9 |
+
visualgptscore,sharegpt4v-7b,none,5.970149253731343,8.241758241758241,7.835820895522389,9.25925925925926,20.32967032967033,13.600000000000001,20.588235294117645,20.689655172413794,71.72413793103448,77.14285714285715,71.65354330708661,87.09677419354838,76.50602409638554,72.89719626168224,64.58333333333334,65.38461538461539,77.06422018348624,79.5774647887324,79.43925233644859,82.14285714285714,83.40425531914893,82.35294117647058,7.826746912567808,18.80189019905044,76.90432814363166,69.84279226900412,79.55594861288108,83.40425531914893,82.35294117647058,59.812700376107806,13.314318555809123,75.43435634183895,82.87859824780975,57.20909104848594,56.04511012511614,56.999207881508376,79.55594861288108,64.20008887316853,13.149909791700836,10.920879120879121,14.212028094820017,14.974457215836527,75.09812740363542,76.5391727310906,71.89204299228952,78.2080822403403,44.124018597668126,43.73002592598486,43.052035543554766,46.59126972808841
|
pages/report_0206.py
CHANGED
|
@@ -96,6 +96,7 @@ def main():
|
|
| 96 |
df_all = pd.concat((df_all, df_decoder_all), axis=0).reset_index(drop=True)
|
| 97 |
cols_all = [col for col in df.columns if col not in ["family", "model", "tag"]]
|
| 98 |
df[cols_all] = df[cols_all] - df_all[cols_all]
|
|
|
|
| 99 |
|
| 100 |
columns = list(df.columns)
|
| 101 |
col_dict = defaultdict(list)
|
|
@@ -172,6 +173,16 @@ def main():
|
|
| 172 |
"- InstructBLIP은 input context length 128, output context length는 256으로 제한. description을 input에 태우면 (VQAScore) context length 초과로 evaluation 불가, output에는 태우기 가능 (VisualGPTScore)"
|
| 173 |
)
|
| 174 |
print_table_overall(df, model_names, metric_group, metric_columns)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 175 |
st.markdown("---")
|
| 176 |
|
| 177 |
|
|
|
|
| 96 |
df_all = pd.concat((df_all, df_decoder_all), axis=0).reset_index(drop=True)
|
| 97 |
cols_all = [col for col in df.columns if col not in ["family", "model", "tag"]]
|
| 98 |
df[cols_all] = df[cols_all] - df_all[cols_all]
|
| 99 |
+
textonly_df = pd.read_csv("./data/250206/textonly_decoder_report.csv")
|
| 100 |
|
| 101 |
columns = list(df.columns)
|
| 102 |
col_dict = defaultdict(list)
|
|
|
|
| 173 |
"- InstructBLIP은 input context length 128, output context length는 256으로 제한. description을 input에 태우면 (VQAScore) context length 초과로 evaluation 불가, output에는 태우기 가능 (VisualGPTScore)"
|
| 174 |
)
|
| 175 |
print_table_overall(df, model_names, metric_group, metric_columns)
|
| 176 |
+
if selected == "after filtering" and group == "decoder-based":
|
| 177 |
+
st.write("### decoder-based models (zero-tensor images)")
|
| 178 |
+
if metric_group == "summary":
|
| 179 |
+
st.markdown(
|
| 180 |
+
"- Image 정보가 없을 때 전반적으로 accuracy 하락. VQAScore는 random chance accuracy (50%) 근처인 반면 VisualGPTScore는 여전히 text input만으로 `replace`, `swap` split에서 outperforming 가능"
|
| 181 |
+
)
|
| 182 |
+
st.markdown(
|
| 183 |
+
"- From VisualGPTScore, Output token probability is critical to identify negative descriptions, even one token change."
|
| 184 |
+
)
|
| 185 |
+
print_table_overall(textonly_df, model_names, metric_group, metric_columns)
|
| 186 |
st.markdown("---")
|
| 187 |
|
| 188 |
|