Spaces:

zhiyuanpeng
/

ELOQ-Data-Viewer

Sleeping

App Files Files Community

ELOQ-Data-Viewer / app.py

zhiyuanpeng

Create app.py

96322c0 verified 9 months ago

raw

history blame contribute delete

5.91 kB

	import streamlit as st
	import os
	import json
	from os.path import join
	import pandas as pd
	from huggingface_hub import snapshot_download
	import collections

	@st.cache_data
	def download_folder():
	    """Download the ``ELOQ/`` folder of the ``zhiyuanpeng/ELOQ`` dataset repo.

	    Ensures the local ``view`` directory exists, shows an info banner in the
	    Streamlit page, and fetches only the files matching ``ELOQ/*``.

	    Returns:
	        The name of the local directory the snapshot was written into.
	    """
	    target_dir = "view"
	    os.makedirs(target_dir, exist_ok=True)
	    st.info("Downloading folder from Hugging Face...It may take a while.")
	    # Keyword arguments are collected first so the request reads as data.
	    request = {
	        "repo_id": "zhiyuanpeng/ELOQ",
	        "repo_type": "dataset",
	        "local_dir": target_dir,  # Or a folder of your choice
	        "use_auth_token": False,
	        "allow_patterns": ["ELOQ/*"],
	    }
	    snapshot_download(**request)
	    return target_dir

	def _read_json(path):
	    """Parse and return the JSON document stored at *path* (read as UTF-8)."""
	    # An explicit encoding keeps decoding independent of the host locale;
	    # without it, non-ASCII text can fail to decode on some platforms.
	    with open(path, "r", encoding="utf-8") as f:
	        return json.load(f)


	@st.cache_data
	def load_data():
	    """Load every data file the viewer needs from ``view/ELOQ``.

	    Reads ``news.json``, ``questions.json``, ``silver_responses.json`` and
	    ``gold_responses.json`` as JSON, and ``silver.csv`` / ``gold.csv`` as
	    DataFrames that are then re-indexed with ``process_data``.

	    Returns:
	        A 6-tuple ``(news_data, question_data, silver_data, gold_data,
	        silver_response_data, gold_response_data)``; ``silver_data`` and
	        ``gold_data`` are the nested dicts produced by ``process_data``.

	    Raises:
	        FileNotFoundError: If any of the expected files is missing.
	    """
	    data_dir = "view/ELOQ"

	    news_data = _read_json(join(data_dir, "news.json"))
	    question_data = _read_json(join(data_dir, "questions.json"))

	    silver_data = pd.read_csv(join(data_dir, "silver.csv"))
	    gold_data = pd.read_csv(join(data_dir, "gold.csv"))

	    silver_response_data = _read_json(join(data_dir, "silver_responses.json"))
	    gold_response_data = _read_json(join(data_dir, "gold_responses.json"))

	    # Convert the flat label tables into {doc_id: {q_id: labels}} lookups.
	    silver_data = process_data(silver_data, "silver")
	    gold_data = process_data(gold_data, "gold")

	    return (
	        news_data,
	        question_data,
	        silver_data,
	        gold_data,
	        silver_response_data,
	        gold_response_data,
	    )

	def process_data(data, file_name):
	    """Index label rows by document id, then by question id.

	    Args:
	        data: Table iterated with ``iterrows()``; each row must provide
	            ``doc_id``, ``q_id`` and the label columns for the chosen file.
	        file_name: ``"gold"`` keeps the four LLM/human confusion and
	            defusion labels; any other value keeps only
	            ``llm_confusion_label``.

	    Returns:
	        A ``defaultdict(dict)`` shaped ``{doc_id: {q_id: {label: value}}}``.
	        A later row with the same ``(doc_id, q_id)`` replaces an earlier one.
	    """
	    # The set of label columns depends only on the file, not on the row.
	    if file_name == "gold":
	        label_columns = (
	            "llm_confusion_label",
	            "human_confusion_label",
	            "llm_defusion_label",
	            "human_defusion_label",
	        )
	    else:
	        label_columns = ("llm_confusion_label",)

	    labels_by_doc = collections.defaultdict(dict)
	    for _, record in data.iterrows():
	        doc_key, question_key = record["doc_id"], record["q_id"]
	        labels_by_doc[doc_key][question_key] = {
	            column: record[column] for column in label_columns
	        }
	    return labels_by_doc

	def sidebar_logic(gold_doc_ids, silver_doc_ids):
	    """Render the sidebar selectors and return the user's choice.

	    The sidebar offers three dependent pickers: the data file (gold or
	    silver), a topic (the part of a doc id before its first underscore),
	    and a doc id within that topic.

	    Args:
	        gold_doc_ids: Iterable of doc-id strings available in the gold file.
	        silver_doc_ids: Iterable of doc-id strings available in the silver
	            file.

	    Returns:
	        ``(file_name, doc_id)``. ``doc_id`` is whatever the last selectbox
	        returns, which may be ``None`` when the chosen file has no docs.
	    """
	    st.sidebar.header("Which data file to view?")
	    file_name = st.sidebar.selectbox("Choose Data Name:", ["gold", "silver"])
	    doc_ids = gold_doc_ids if file_name == "gold" else silver_doc_ids

	    # Group doc ids under their topic prefix, preserving input order.
	    docs_by_topic = collections.defaultdict(list)
	    for doc_id in doc_ids:
	        docs_by_topic[doc_id.split("_")[0]].append(doc_id)

	    # This picker selects a topic, so it gets its own label instead of
	    # reusing the doc-id label; options are passed as a concrete list.
	    topic = st.sidebar.selectbox("Choose Topic:", list(docs_by_topic))
	    # .get avoids inserting a spurious key when no topic could be selected.
	    doc_id = st.sidebar.selectbox("Choose Doc id:", docs_by_topic.get(topic, []))
	    return file_name, doc_id

	def init():
	    """Configure the page, seed session state, and inject the layout CSS.

	    Sets the wide layout and page title, makes sure the
	    ``document_content`` and ``question_content`` session entries exist
	    (empty string when absent), and adds a style rule that keeps the first
	    column sticky.

	    Returns:
	        The current working directory as reported by ``os.getcwd()``.
	    """
	    st.set_page_config(layout="wide")
	    working_dir = os.getcwd()
	    st.title("Dataset Viewer")

	    # Only create the entries that are missing so reruns keep their values.
	    for state_key in ("document_content", "question_content"):
	        if state_key not in st.session_state:
	            st.session_state[state_key] = ""

	    sticky_column_css = """
	<style>
	/* Make the left column sticky */
	div[data-testid="column"]:nth-child(1) {
	position: sticky;
	top: 0;
	height: 100vh;
	overflow-y: auto;
	padding-right: 10px;
	padding-bottom: 30px;
	border-right: 2px solid #ccc;
	}
	</style>
	"""
	    st.markdown(sticky_column_css, unsafe_allow_html=True)
	    return working_dir

	def show_doc_contents(doc_data, doc_id):
	    """Store and display the document text for *doc_id*.

	    Selects the rows of ``doc_data`` whose ``doc_id`` column equals
	    *doc_id*, saves the first ``document`` value in
	    ``st.session_state.document_content``, and writes a heading plus the
	    text when that value is truthy.

	    Args:
	        doc_data: Table with ``doc_id`` and ``document`` columns
	            (presumably a pandas DataFrame -- confirm against callers).
	        doc_id: Identifier of the document to show.

	    Raises:
	        IndexError: If no row matches *doc_id*.
	    """
	    matching_rows = doc_data[doc_data["doc_id"] == doc_id]
	    st.session_state.document_content = matching_rows["document"].values[0]
	    if not st.session_state.document_content:
	        return
	    st.write(f"### Document: {doc_id}")
	    st.write(st.session_state.document_content)

	def show_question_contents_and_labels(q_labels, question_data, gold_response_data):
	    """Render each question with its optional gold response and labels.

	    Args:
	        q_labels: Mapping ``{q_id: {label_name: value}}`` for one document.
	        question_data: Mapping from ``q_id`` to the question text shown in
	            the "Question:" box.
	        gold_response_data: Mapping from ``q_id`` to a response; a
	            "Gold Response:" box is shown only for ids present in it.

	    Raises:
	        KeyError: If a ``q_id`` from *q_labels* is missing in
	            *question_data*.
	    """
	    # Fixed display order for whichever labels a question actually has.
	    label_order = (
	        "llm_confusion_label",
	        "human_confusion_label",
	        "llm_defusion_label",
	        "human_defusion_label",
	    )
	    for q_id, labels in q_labels.items():
	        st.write(f"### Question: {q_id}")
	        st.text_area("Question:", value=question_data[q_id], key=q_id)
	        if q_id in gold_response_data:
	            st.text_area(
	                "Gold Response:",
	                value=gold_response_data[q_id],
	                key=f"gold_response_{q_id}",
	            )

	        label_lines = [
	            f"{name}: {labels[name]}" for name in label_order if name in labels
	        ]
	        if not label_lines:
	            continue
	        # All labels go into a single box, one "name: value" pair per line.
	        st.text_area("Labels:", value="\n".join(label_lines), key=f"labels_{q_id}")


	if __name__ == "__main__":
	    # Script entry point: set up the page, make sure the dataset is on disk,
	    # then render the selected document next to its questions.
	    cwd = init()

	    # download_folder() creates "view" before fetching anything, so the mere
	    # existence of that directory does not prove the download finished.
	    # Check for the files load_data() actually reads instead, so an
	    # interrupted download is retried rather than crashing on every run.
	    data_dir = join("view", "ELOQ")
	    required_files = (
	        "news.json",
	        "questions.json",
	        "silver.csv",
	        "gold.csv",
	        "silver_responses.json",
	        "gold_responses.json",
	    )
	    if not all(os.path.isfile(join(data_dir, name)) for name in required_files):
	        folder_path = download_folder()
	        st.success(f"Downloaded to {folder_path}")

	    # Load data
	    (
	        news_data,
	        question_data,
	        silver_data,
	        gold_data,
	        silver_response_data,
	        gold_response_data,
	    ) = load_data()

	    # Sidebar
	    file_name, doc_id = sidebar_logic(gold_data.keys(), silver_data.keys())

	    # Two columns; the numbers are relative width proportions.
	    left, right = st.columns([2, 1.5])

	    # Display document (title on the first line, then the body).
	    with left:
	        st.session_state.document_content = (
	            news_data[doc_id]["title"] + "\n" + news_data[doc_id]["content"]
	        )
	        st.write(f"### Document: {doc_id}")
	        st.write(st.session_state.document_content)

	    # Display the questions and labels of the selected file.
	    with right:
	        q_labels = gold_data[doc_id] if file_name == "gold" else silver_data[doc_id]
	        # NOTE(review): silver_response_data is loaded but never displayed;
	        # only gold responses are passed here -- confirm this is intended.
	        show_question_contents_and_labels(q_labels, question_data, gold_response_data)