import ast
import pandas as pd
import streamlit as st
st.set_page_config(layout="wide")
SHORT_CAPTIONS = [
'ALIGN:align-base:coyo700m', 'OpenCLIP:ViT-B-32:openai', 'OpenCLIP:ViT-B-16:openai',
'OpenCLIP:ViT-L-14:openai', 'OpenCLIP:ViT-L-14-336:openai',
'OpenCLIP:ViT-B-32:laion2b_s34b_b79k', 'OpenCLIP:ViT-B-16:laion2b_s34b_b88k',
'OpenCLIP:ViT-L-14:laion2b_s32b_b82k', 'OpenCLIP:ViT-g-14:laion2b_s34b_b88k',
'OpenCLIP:ViT-H-14:laion2b_s32b_b79k', 'OpenCLIP:roberta-ViT-B-32:laion2b_s12b_b32k',
'OpenCLIP:ViT-B-16-SigLIP:webli', 'OpenCLIP:ViT-B-16-SigLIP-384:webli',
'OpenCLIP:ViT-L-16-SigLIP-256:webli', 'OpenCLIP:ViT-L-16-SigLIP-384:webli',
'OpenCLIP:ViT-SO400M-14-SigLIP:webli', 'OpenCLIP:coca_ViT-B-32:laion2b_s13b_b90k',
'OpenCLIP:coca_ViT-L-14:laion2b_s13b_b90k'
]
LONG_CAPTIONS = [
'DreamLIP:dreamlip-vitb16:cc3m-long', 'DreamLIP:dreamlip-vitb16:cc12m-long',
'DreamLIP:dreamlip-vitb16:yfcc15m-long', 'DreamLIP:dreamlip-vitb16:cc30m-long',
"FLAIR:flair-vitb16:cc3m-recap", "FLAIR:flair-vitb16:cc12m-recap",
"FLAIR:flair-vitb16:yfcc15m-recap", "FLAIR:flair-vitb16:cc30m-recap",
'CLIPS:CLIPS-Large-14-224:recap-datacomp1b', 'CLIPS:CLIPS-Large-14-336:recap-datacomp1b',
'CLIPS:CLIPS-Huge-14-224:recap-datacomp1b', 'LoTLIP:LoTLIP-ViT-B-32:lotlip100m',
'LoTLIP:LoTLIP-ViT-B-16:lotlip100m', 'Recap-CLIP:ViT-L-16-HTxt-Recap-CLIP:recap-datacomp1b',
'LongCLIP:longclip-vitb32:sharegpt4v-1m', 'LongCLIP:longclip-vitb16:sharegpt4v-1m',
'LongCLIP:longclip-vitl14:sharegpt4v-1m', 'LongCLIP:longclip-vitl14_336px:sharegpt4v-1m',
'Jina-CLIP:jina-clip-v1:jinaai', 'Jina-CLIP:jina-clip-v2:jinaai'
]
COMPOSITIONALITY = [
'OpenCLIP:ViT-B-32:openai', 'StructuredCLIP:NegCLIP-ViT-B-32:coco-ft',
'StructuredCLIP:CE-CLIP-ViT-B-32:coco-ft', 'StructuredCLIP:DAC-LLM-ViT-B-32:cc3m-ft',
'StructuredCLIP:DAC-SAM-ViT-B-32:cc3m-ft', 'FSC-CLIP:fsc-clip-ViT-B-32:laioncoco-ft',
'FSC-CLIP:fsc-clip-ViT-B-16:laioncoco-ft', 'FSC-CLIP:fsc-clip-ViT-L-14:laioncoco-ft'
]
DECODERS = [
'vqascore:instructblip-flant5-xl:none', 'vqascore:clip-flant5-xl:none',
'vqascore:llava-v1.5-7b:none', 'vqascore:sharegpt4v-7b:none',
'visualgptscore:instructblip-flant5-xl:none', 'visualgptscore:clip-flant5-xl:none',
'visualgptscore:llava-v1.5-7b:none', 'visualgptscore:sharegpt4v-7b:none'
]
MODEL_GROUPS = {
"short_captions": SHORT_CAPTIONS,
"long_captions": LONG_CAPTIONS,
"compositionality": COMPOSITIONALITY
}
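

# Each entry in the lists above encodes a row key as "family:model:tag" — the
# same three Model columns joined in get_model_key_from_df below. A minimal
# illustrative helper (not used by the app) showing the decomposition:
def split_model_key(key):
    family, model, tag = key.split(":")
    return {"family": family, "model": model, "tag": tag}


# e.g. split_model_key("OpenCLIP:ViT-B-32:openai")
# -> {"family": "OpenCLIP", "model": "ViT-B-32", "tag": "openai"}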
def render_mi_table(df, level0_cols):
    # Define the HTML table styles
table_style = """
<style>
table {
width: 100%;
border-collapse: collapse;
}
th, td {
border: 1px solid black;
text-align: center;
padding: 8px;
}
th {
background-color: #262730;
}
</style>
"""
    # Top header row (level 0)
header_html = "<tr>"
for col in level0_cols:
colspan = len(df.xs(col, axis=1, level=0).columns) if col else 1
header_html += f'<th colspan="{colspan}" style="text-align: center;">{col if col else ""}</th>'
header_html += "</tr>"
    # Sub-header row (level 1)
sub_header_html = "<tr>"
for col in df.columns:
sub_header_html += f"<th style='text-align: center;'>{col[1] if len(col) > 1 else col[0]}</th>"
sub_header_html += "</tr>"
    # Build the HTML for the data rows
    def map_val(value):
        # Format numeric cells to one decimal place; leave non-numeric cells as-is.
        try:
            return f"{float(value):.1f}"
        except (TypeError, ValueError):
            return value
rows_html = ""
for _, row in df.iterrows():
rows_html += "<tr>" + "".join(f"<td>{map_val(value)}</td>" for value in row) + "</tr>"
    # Assemble the final HTML
table_html = f"""
{table_style}
<table>
{header_html}
{sub_header_html}
{rows_html}
</table>
"""
return table_html
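

# Usage sketch (illustrative only): render_mi_table expects two-level column
# tuples plus the ordered list of unique level-0 labels, e.g.
#   demo = pd.DataFrame(
#       [["OpenCLIP", 61.2]],
#       columns=pd.MultiIndex.from_tuples([("Model", "family"), ("Acc", "mid")]),
#   )
#   html = render_mi_table(demo, ["Model", "Acc"])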
def format_df(df):
cols = []
for col in df.columns:
if col in [("Model", "family"), ("Model", "model"), ("Model", "tag")]:
continue
cols.append(col)
formatted_df = df.style.format({col: "{:.1f}" for col in cols})
return formatted_df
def print_table(df):
level0_cols = []
for col in df.columns:
if col[0] not in level0_cols:
level0_cols.append(col[0])
st.markdown(render_mi_table(df, level0_cols), unsafe_allow_html=True)
def get_model_key_from_df(df, model_names):
columns = [("Model", "family"), ("Model", "model"), ("Model", "tag")]
named_rows = df[columns].apply(lambda row: ":".join(row), axis=1)
new_rows = []
for name in model_names:
new_rows.append(df[named_rows == name])
new_rows = pd.concat(new_rows, axis=0)
new_rows.columns = pd.MultiIndex.from_tuples(new_rows.columns)
print_table(new_rows)
# Streamlit app
def main():
st.title("Interface")
st.markdown("### Summarized Evaluation Results on Sentence Addition Tasks")
st.markdown("- random chance 50% 반영")
st.markdown("- decoder-based model 결과 추가")
st.markdown("- FLAIR model 결과 추가 (context length 77)")
df = pd.read_csv("data/250117/summary.csv")
df.columns = [ast.literal_eval(col) for col in df.columns]
for group, model_names in MODEL_GROUPS.items():
st.markdown(f"## {group} models")
if group == "short_captions":
st.markdown(
"- **Length group**: 이미 short group부터, 80<(Num_tokens)<120. 중간에 문장 더해졌으면 60-70%정도 맞추고, 끝에 문장 더해졌으면 애초에 added sentence encoding 불가 -> accuracy 는 random chance, 50%."
)
st.markdown(
"- **neg_target**: description의 끝 (=background)에 sentence 더해진 경우 accuracy 50%"
)
st.markdown("- **neg_type**: contradictory sentence가 모델 입장에서 맞추기 더 어려움")
if group == "long_captions":
st.markdown(
"- **Length group**: 모델의 context length에 성능 심하게 dependent함. DreamLIP: 77, CLIPS: 80, LoTLIP: 128, Recap-CLIP: 128, LongCLIP: 248, Jina-CLIP: 512"
)
st.markdown("- **neg_target**: 여전히 background level에서 sentence 더해진게 전반적으로 어려움")
st.markdown("- **neg_type**: contradictory sentence가 모델 입장에서 맞추기 더 어려움")
if group == "compositionality":
st.markdown("- context length 77의 한계. Hard Negative Caption으로 Fine-tuning 하면 일부 좋아짐")
get_model_key_from_df(df, model_names)
df = pd.read_csv("data/250117/decoder_summary.csv")
df.columns = [ast.literal_eval(col) for col in df.columns]
st.markdown("## Decoder-based models")
    st.markdown(
        "- InstructBLIP has a text input context length of 128 -> it starts to get confused "
        "from the medium length group onward (for both vqascore and visualgptscore)"
    )
    st.markdown(
        "- The other three models have a total vision+language context length of 2048 "
        "(sufficient): high performance with VQAScore, while VisualGPTScore is close to "
        "random chance"
    )
    st.markdown(
        "- visualgptscore is computed as the average auto-regressive cross-entropy loss over "
        "every token position of the given caption (like image captioning)"
    )
    st.markdown(
        "- vqascore embeds the given caption in a yes/no question -> computed from the "
        "cross-entropy loss at the answer token position"
    )
    st.markdown(
        "- In short, these models are weak at long-text generation but strong at QA, which "
        "plausibly explains why visualgptscore is low while vqascore comes out higher"
    )
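
    # Minimal sketch (not executed) of the two scoring rules described above.
    # `caption_token_logprobs` and `yes_logprob` stand in for per-token
    # log-probabilities from a hypothetical VLM conditioned on the image.
    def _decoder_scoring_sketch(caption_token_logprobs, yes_logprob):
        import math

        # visualgptscore: mean auto-regressive log-likelihood across every
        # token position of the caption (image-captioning-style scoring).
        visualgptscore = sum(caption_token_logprobs) / len(caption_token_logprobs)
        # vqascore: likelihood of the "Yes" answer token when the caption is
        # embedded in a yes/no question about the image.
        vqascore = math.exp(yes_logprob)
        return visualgptscore, vqascore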
get_model_key_from_df(df, DECODERS)
if __name__ == "__main__":
main()