Spaces:
Sleeping
Sleeping
File size: 5,910 Bytes
96322c0 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 |
import streamlit as st
import os
import json
from os.path import join
import pandas as pd
from huggingface_hub import snapshot_download
import collections
@st.cache_data
def download_folder():
    """Download the ``ELOQ/`` folder of the ``zhiyuanpeng/ELOQ`` dataset repo.

    Creates the local ``view`` directory if it does not exist, shows an info
    message in the Streamlit page, and downloads every repo file matching
    ``ELOQ/*`` into that directory. The function is wrapped in
    ``st.cache_data``.

    Returns:
        str: The local directory the snapshot was written to (``"view"``).
    """
    local_dir = "view"
    os.makedirs(local_dir, exist_ok=True)
    st.info("Downloading folder from Hugging Face...It may take a while.")
    snapshot_download(
        repo_id="zhiyuanpeng/ELOQ",
        repo_type="dataset",
        local_dir=local_dir,  # Or a folder of your choice
        # `token` is the current name of the deprecated `use_auth_token`
        # argument; False keeps the request anonymous, as before.
        token=False,
        allow_patterns=["ELOQ/*"],
    )
    return local_dir
def _read_json(path):
    """Parse the JSON file at *path*, decoding it explicitly as UTF-8."""
    # Without an explicit encoding, open() falls back to the platform locale,
    # which can fail or mis-decode non-ASCII text on some systems.
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)


@st.cache_data
def load_data():
    """Load every data file of the viewer from ``view/ELOQ``.

    Reads ``news.json``, ``questions.json``, ``silver_responses.json`` and
    ``gold_responses.json`` as JSON, and ``silver.csv`` / ``gold.csv`` with
    pandas. The two CSV tables are converted by ``process_data`` into nested
    ``{doc_id: {q_id: labels}}`` mappings. The function is wrapped in
    ``st.cache_data``.

    Returns:
        tuple: ``(news_data, question_data, silver_data, gold_data,
        silver_response_data, gold_response_data)``.

    Raises:
        FileNotFoundError: If one of the expected files is missing.
    """
    data_dir = "view/ELOQ"

    news_data = _read_json(join(data_dir, "news.json"))
    question_data = _read_json(join(data_dir, "questions.json"))

    silver_data = pd.read_csv(join(data_dir, "silver.csv"))
    gold_data = pd.read_csv(join(data_dir, "gold.csv"))

    silver_response_data = _read_json(join(data_dir, "silver_responses.json"))
    gold_response_data = _read_json(join(data_dir, "gold_responses.json"))

    # Re-index the flat label tables by document id and question id.
    silver_data = process_data(silver_data, "silver")
    gold_data = process_data(gold_data, "gold")
    return news_data, question_data, silver_data, gold_data, silver_response_data, gold_response_data
def process_data(data, file_name):
    """Index label rows by document id, then by question id.

    Args:
        data: Table exposing ``iterrows()`` whose rows have ``doc_id``,
            ``q_id`` and the label columns read below (a pandas DataFrame
            in ``load_data``).
        file_name: ``"gold"`` keeps the four LLM/human confusion and defusion
            labels; any other value keeps only ``llm_confusion_label``.

    Returns:
        collections.defaultdict: ``{doc_id: {q_id: {label_name: value}}}``.
    """
    gold_fields = (
        "llm_confusion_label",
        "human_confusion_label",
        "llm_defusion_label",
        "human_defusion_label",
    )
    silver_fields = ("llm_confusion_label",)
    wanted_fields = gold_fields if file_name == "gold" else silver_fields

    by_doc = collections.defaultdict(dict)
    for _, record in data.iterrows():
        doc_key = record["doc_id"]
        question_key = record["q_id"]
        by_doc[doc_key][question_key] = {
            field: record[field] for field in wanted_fields
        }
    return by_doc
def sidebar_logic(gold_doc_ids, silver_doc_ids):
    """Render the sidebar selectors and return the user's choice.

    The sidebar holds three select boxes: the data file (``gold`` or
    ``silver``), a topic, and a document id within that topic. The topic of a
    document is the part of its id before the first underscore.

    Args:
        gold_doc_ids: Iterable of document ids (strings) of the gold file.
        silver_doc_ids: Iterable of document ids (strings) of the silver file.

    Returns:
        tuple: ``(file_name, doc_id)``. ``doc_id`` is whatever the last
        select box returns; with no documents to choose from that is expected
        to be ``None``.
    """
    st.sidebar.header("Which data file to view?")
    file_name = st.sidebar.selectbox("Choose Data Name:", ["gold", "silver"])
    doc_ids = gold_doc_ids if file_name == "gold" else silver_doc_ids

    # Group the document ids by their topic prefix.
    topic_doc_ids = collections.defaultdict(list)
    for doc_id in doc_ids:
        topic_doc_ids[doc_id.split("_")[0]].append(doc_id)

    # The topic selector gets its own label: sharing "Choose Doc id:" with the
    # selector below was misleading and left the two widgets distinguishable
    # only by their option lists.
    topic = st.sidebar.selectbox("Choose Topic:", list(topic_doc_ids))
    # .get() avoids inserting a spurious key when no topic is selected.
    doc_id = st.sidebar.selectbox("Choose Doc id:", topic_doc_ids.get(topic, []))
    return file_name, doc_id
def init():
    """Configure the page, seed session defaults and inject the column CSS.

    Sets the wide page layout before any other Streamlit call made here,
    writes the page title, makes sure ``document_content`` and
    ``question_content`` exist in ``st.session_state`` (empty string when
    absent), and emits a ``<style>`` block targeting the first column.

    Returns:
        str: The current working directory at the time of the call.
    """
    st.set_page_config(layout="wide")
    working_dir = os.getcwd()
    st.title("Dataset Viewer")

    # Only fill in keys that are missing so existing values are kept.
    for state_key in ("document_content", "question_content"):
        if state_key not in st.session_state:
            st.session_state[state_key] = ""

    # The literal is kept flush-left so its text is exactly what is emitted.
    sticky_column_css = """
<style>
/* Make the left column sticky */
div[data-testid="column"]:nth-child(1) {
position: sticky;
top: 0;
height: 100vh;
overflow-y: auto;
padding-right: 10px;
padding-bottom: 30px;
border-right: 2px solid #ccc;
}
</style>
"""
    st.markdown(sticky_column_css, unsafe_allow_html=True)
    return working_dir
def show_doc_contents(doc_data, doc_id):
    """Store and display the document whose ``doc_id`` column equals *doc_id*.

    Args:
        doc_data: Table with ``doc_id`` and ``document`` columns that supports
            boolean-mask indexing and ``.values`` (presumably a pandas
            DataFrame; confirm against the caller).
        doc_id: Identifier of the document to show.

    The text of the first matching row is saved in
    ``st.session_state.document_content`` and written to the page when it is
    non-empty. When no row matches, the stored content is reset to an empty
    string and a warning is shown instead of raising ``IndexError``.
    """
    matches = doc_data[doc_data["doc_id"] == doc_id]["document"].values
    # len() rather than truthiness: the truth value of an array is ambiguous.
    if len(matches) == 0:
        st.session_state.document_content = ""
        st.warning(f"No document found for doc id: {doc_id}")
        return

    st.session_state.document_content = matches[0]
    if st.session_state.document_content:
        st.write(f"### Document: {doc_id}")
        st.write(st.session_state.document_content)
def show_question_contents_and_labels(
    q_labels, question_data, gold_response_data, response_label="Gold Response:"
):
    """Render every question of a document with its response and labels.

    Args:
        q_labels: Mapping ``{q_id: labels}`` where ``labels`` maps label names
            to values.
        question_data: Mapping ``{q_id: question text}``; must contain every
            key of *q_labels*.
        gold_response_data: Mapping ``{q_id: response}``. A response box is
            shown only for questions present in it. Despite the name, any
            response set may be passed.
        response_label: Caption of the response box. Defaults to the
            original ``"Gold Response:"``.

    Raises:
        KeyError: If a question id is missing from *question_data*.
    """
    label_keys = (
        "llm_confusion_label",
        "human_confusion_label",
        "llm_defusion_label",
        "human_defusion_label",
    )
    for q_id, labels in q_labels.items():
        st.write(f"### Question: {q_id}")
        st.text_area("Question:", value=question_data[q_id], key=q_id)
        if q_id in gold_response_data:
            st.text_area(
                response_label,
                value=gold_response_data[q_id],
                key=f"gold_response_{q_id}",
            )
        # Gather the labels that exist for this question, in a fixed order.
        label_lines = [
            f"{label_key}: {labels[label_key]}"
            for label_key in label_keys
            if label_key in labels
        ]
        # Display all labels in one box
        if label_lines:
            st.text_area(
                "Labels:",
                value="\n".join(label_lines),
                key=f"labels_{q_id}",
            )


if __name__ == "__main__":
    cwd = init()
    # Download the data unless the dataset folder is already present. Testing
    # the ELOQ sub-folder rather than "view" matters because download_folder()
    # creates "view" before downloading, so a failed download would otherwise
    # never be retried.
    if not os.path.isdir(join("view", "ELOQ")):
        folder_path = download_folder()
        st.success(f"Downloaded to {folder_path}")
    # Load data
    news_data, question_data, silver_data, gold_data, silver_response_data, gold_response_data = load_data()
    # Sidebar
    file_name, doc_id = sidebar_logic(gold_data.keys(), silver_data.keys())
    if doc_id is None:
        # Nothing selectable: stop before indexing the data with None.
        st.warning("No documents available for the selected data file.")
        st.stop()
    left, right = st.columns([2, 1.5])  # these numbers represent proportions
    # Display document
    with left:
        document = news_data[doc_id]
        st.session_state.document_content = document["title"] + "\n" + document["content"]
        st.write(f"### Document: {doc_id}")
        st.write(st.session_state.document_content)
    # Display questions, pairing each file's labels with its own responses.
    with right:
        if file_name == "gold":
            q_labels = gold_data[doc_id]
            response_data = gold_response_data
            response_label = "Gold Response:"
        else:
            q_labels = silver_data[doc_id]
            response_data = silver_response_data
            response_label = "Silver Response:"
        show_question_contents_and_labels(
            q_labels, question_data, response_data, response_label=response_label
        )