|
|
import ast
import html

import pandas as pd
import streamlit as st
|
|
|
|
|
|
# Use the wide page layout; this is the first Streamlit call made in this file.
st.set_page_config(layout="wide")
|
|
|
# Model keys shown under the "short_captions" group.
# Each key is "<family>:<model>:<tag>", matching the three "Model" columns
# joined with ":" when rows are selected from the summary table.
SHORT_CAPTIONS = [
    'ALIGN:align-base:coyo700m',
    'OpenCLIP:ViT-B-32:openai',
    'OpenCLIP:ViT-B-16:openai',
    'OpenCLIP:ViT-L-14:openai',
    'OpenCLIP:ViT-L-14-336:openai',
    'OpenCLIP:ViT-B-32:laion2b_s34b_b79k',
    'OpenCLIP:ViT-B-16:laion2b_s34b_b88k',
    'OpenCLIP:ViT-L-14:laion2b_s32b_b82k',
    'OpenCLIP:ViT-g-14:laion2b_s34b_b88k',
    'OpenCLIP:ViT-H-14:laion2b_s32b_b79k',
    'OpenCLIP:roberta-ViT-B-32:laion2b_s12b_b32k',
    'OpenCLIP:ViT-B-16-SigLIP:webli',
    'OpenCLIP:ViT-B-16-SigLIP-384:webli',
    'OpenCLIP:ViT-L-16-SigLIP-256:webli',
    'OpenCLIP:ViT-L-16-SigLIP-384:webli',
    'OpenCLIP:ViT-SO400M-14-SigLIP:webli',
    'OpenCLIP:coca_ViT-B-32:laion2b_s13b_b90k',
    'OpenCLIP:coca_ViT-L-14:laion2b_s13b_b90k',
]
|
|
|
# Model keys shown under the "long_captions" group, in display order.
# Each key is "<family>:<model>:<tag>".
LONG_CAPTIONS = [
    'DreamLIP:dreamlip-vitb16:cc3m-long',
    'DreamLIP:dreamlip-vitb16:cc12m-long',
    'DreamLIP:dreamlip-vitb16:yfcc15m-long',
    'DreamLIP:dreamlip-vitb16:cc30m-long',
    'CLIPS:CLIPS-Large-14-224:recap-datacomp1b',
    'CLIPS:CLIPS-Large-14-336:recap-datacomp1b',
    'CLIPS:CLIPS-Huge-14-224:recap-datacomp1b',
    'LoTLIP:LoTLIP-ViT-B-32:lotlip100m',
    'LoTLIP:LoTLIP-ViT-B-16:lotlip100m',
    'Recap-CLIP:ViT-L-16-HTxt-Recap-CLIP:recap-datacomp1b',
    'LongCLIP:longclip-vitb32:sharegpt4v-1m',
    'LongCLIP:longclip-vitb16:sharegpt4v-1m',
    'LongCLIP:longclip-vitl14:sharegpt4v-1m',
    'LongCLIP:longclip-vitl14_336px:sharegpt4v-1m',
    'Jina-CLIP:jina-clip-v1:jinaai',
    'Jina-CLIP:jina-clip-v2:jinaai',
]
|
|
|
# Model keys shown under the "compositionality" group, in display order.
# Each key is "<family>:<model>:<tag>".
COMPOSITIONALITY = [
    'OpenCLIP:ViT-B-32:openai',
    'StructuredCLIP:NegCLIP-ViT-B-32:coco-ft',
    'StructuredCLIP:CE-CLIP-ViT-B-32:coco-ft',
    'StructuredCLIP:DAC-LLM-ViT-B-32:cc3m-ft',
    'StructuredCLIP:DAC-SAM-ViT-B-32:cc3m-ft',
    'FSC-CLIP:fsc-clip-ViT-B-32:laioncoco-ft',
    'FSC-CLIP:fsc-clip-ViT-B-16:laioncoco-ft',
    'FSC-CLIP:fsc-clip-ViT-L-14:laioncoco-ft',
]
|
|
|
|
|
|
# Group name -> model keys rendered under that group's heading.
# Insertion order is the order in which the groups are iterated.
MODEL_GROUPS = dict(
    short_captions=SHORT_CAPTIONS,
    long_captions=LONG_CAPTIONS,
    compositionality=COMPOSITIONALITY,
)
|
|
|
|
|
|
|
|
|
def render_mi_table(df, level0_cols):
    """Build an HTML table with a two-row grouped header from ``df``.

    The first header row has one cell per entry of ``level0_cols``, spanning
    as many columns as ``df`` holds under that level-0 label. The second
    header row has one cell per column, showing its level-1 label. Each data
    cell that ``float()`` accepts is shown with one decimal place; any other
    value is shown as HTML-escaped text.

    Args:
        df: DataFrame with two-level (``MultiIndex``) columns.
        level0_cols: Level-0 labels in display order. A falsy label (such as
            an empty string) yields one blank, single-column header cell.

    Returns:
        str: A ``<style>`` block followed by the ``<table>`` markup.
    """
    # Markup is emitted flush-left, one element per line.
    table_style = (
        "<style>\n"
        "table {\n"
        "    width: 100%;\n"
        "    border-collapse: collapse;\n"
        "}\n"
        "th, td {\n"
        "    border: 1px solid black;\n"
        "    text-align: center;\n"
        "    padding: 8px;\n"
        "}\n"
        "th {\n"
        "    background-color: #262730;\n"
        "}\n"
        "</style>"
    )

    def format_cell(value):
        """Return ``value`` as cell text: one-decimal number or escaped text."""
        try:
            return f"{float(value):.1f}"
        except (TypeError, ValueError, OverflowError):
            # Not numeric: show as-is, escaped so "<" / "&" cannot break markup.
            return html.escape(str(value))

    # Top header row: one spanning cell per level-0 group.
    group_cells = []
    for group in level0_cols:
        span = len(df.xs(group, axis=1, level=0).columns) if group else 1
        label = html.escape(str(group)) if group else ""
        group_cells.append(
            f'<th colspan="{span}" style="text-align: center;">{label}</th>'
        )
    header_html = "<tr>" + "".join(group_cells) + "</tr>"

    # Second header row: the level-1 label of every column (or the only
    # label when a column key has a single element).
    sub_cells = []
    for col in df.columns:
        label = col[1] if len(col) > 1 else col[0]
        sub_cells.append(
            f"<th style='text-align: center;'>{html.escape(str(label))}</th>"
        )
    sub_header_html = "<tr>" + "".join(sub_cells) + "</tr>"

    # Body rows, built with join instead of repeated string concatenation.
    rows_html = "".join(
        "<tr>" + "".join(f"<td>{format_cell(value)}</td>" for value in row) + "</tr>"
        for row in df.itertuples(index=False, name=None)
    )

    return "\n".join(
        (table_style, "<table>", header_html, sub_header_html, rows_html, "</table>")
    )
|
|
|
|
|
|
|
|
|
def format_df(df):
|
|
|
cols = []
|
|
|
for col in df.columns:
|
|
|
if col in [("Model", "family"), ("Model", "model"), ("Model", "tag")]:
|
|
|
continue
|
|
|
cols.append(col)
|
|
|
formatted_df = df.style.format({col: "{:.1f}" for col in cols})
|
|
|
return formatted_df
|
|
|
|
|
|
|
|
|
def print_table(df):
    """Render ``df`` on the Streamlit page as a grouped-header HTML table.

    The level-0 label of each column (``col[0]``) is collected once, in
    first-seen order, and passed to ``render_mi_table`` together with ``df``.
    """
    # dict.fromkeys keeps first-seen order while dropping repeated labels.
    level0_cols = list(dict.fromkeys(col[0] for col in df.columns))
    table_html = render_mi_table(df, level0_cols)
    st.markdown(table_html, unsafe_allow_html=True)
|
|
|
|
|
|
|
|
|
def get_model_key_from_df(df, model_names):
    """Select the rows for ``model_names`` from ``df`` and display them.

    A row's key is its ``("Model", "family")``, ``("Model", "model")`` and
    ``("Model", "tag")`` values joined with ``":"``. Rows are gathered in the
    order of ``model_names`` and handed to ``print_table``; nothing is
    returned.
    """
    key_columns = [("Model", "family"), ("Model", "model"), ("Model", "tag")]

    def join_key(row):
        return ":".join(row)

    row_keys = df[key_columns].apply(join_key, axis=1)

    # One slice per requested name keeps the output in model_names order.
    selected = pd.concat([df[row_keys == name] for name in model_names], axis=0)
    selected.columns = pd.MultiIndex.from_tuples(selected.columns)
    print_table(selected)
|
|
|
|
|
|
|
|
|
|
|
|
def main():
    """Render the page: a title, then notes and a result table per group.

    Reads ``data/250116/summary.csv``, parses each header cell with
    ``ast.literal_eval``, and for every entry of ``MODEL_GROUPS`` writes a
    heading, that group's bullet notes, and the table for its models.
    """
    st.title("Interface")

    summary = pd.read_csv("data/250116/summary.csv")
    # Header cells are stored as Python literals (presumably tuple reprs such
    # as "('Model', 'family')"); turn them back into objects.
    summary.columns = [ast.literal_eval(header) for header in summary.columns]

    # Bullet notes shown under each group's heading, in display order.
    notes_by_group = {
        "short_captions": (
            "- **Length group**: 이미 short group부터, 80<(Num_tokens)<120. 중간에 문장 더해졌으면 60-70%정도 맞추고, 끝에 문장 더해졌으면 애초에 added sentence encoding 불가 -> accuracy 0%",
            "- **neg_target**: description의 끝 (=background)에 sentence 더해진 경우 accuracy 0%",
            "- **neg_type**: contradictory sentence가 모델 입장에서 맞추기 더 어려움",
        ),
        "long_captions": (
            "- **Length group**: 모델의 context length에 성능 심하게 dependent함. DreamLIP: 77, CLIPS: 80, LoTLIP: 128, Recap-CLIP: 128, LongCLIP: 248, Jina-CLIP: 512",
            "- **neg_target**: 여전히 background level에서 sentence 더해진게 전반적으로 어려움",
            "- **neg_type**: contradictory sentence가 모델 입장에서 맞추기 더 어려움",
        ),
        "compositionality": (
            "- context length 77의 한계. Hard Negative Caption으로 Fine-tuning 하면 일부 좋아짐",
        ),
    }

    for group, model_names in MODEL_GROUPS.items():
        st.markdown(f"## {group} models")
        for note in notes_by_group.get(group, ()):
            st.markdown(note)
        get_model_key_from_df(summary, model_names)
|
|
|
|
|
|
|
|
|
# Build the page only when this file is run as a script, not when imported.
if __name__ == "__main__":
    main()
|
|
|
|