File size: 5,910 Bytes
96322c0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
import streamlit as st
import os
import json
from os.path import join
import pandas as pd
from huggingface_hub import snapshot_download
import collections

@st.cache_data
def download_folder():
    """Download the ``ELOQ/`` files of the ``zhiyuanpeng/ELOQ`` dataset repo.

    Creates the local ``view`` directory if it is missing, shows an info
    banner in the app, and then calls ``snapshot_download`` restricted to
    paths matching ``ELOQ/*``.

    Returns:
        The name of the local directory (``"view"``) passed to the download.
    """
    target_dir = "view"
    os.makedirs(target_dir, exist_ok=True)
    st.info("Downloading folder from Hugging Face...It may take a while.")

    # Collected in one place so the download call itself stays a one-liner.
    download_options = {
        "repo_id": "zhiyuanpeng/ELOQ",
        "repo_type": "dataset",
        "local_dir": target_dir,  # Or a folder of your choice
        "use_auth_token": False,
        "allow_patterns": ["ELOQ/*"],
    }
    snapshot_download(**download_options)
    return target_dir

@st.cache_data
def load_data():
    """Load every ELOQ data file from ``view/ELOQ``.

    The JSON files are opened explicitly as UTF-8 so that decoding does not
    depend on the platform's default locale encoding. The two CSV label
    tables are read with pandas and then indexed with ``process_data``.

    Returns:
        A 6-tuple ``(news_data, question_data, silver_data, gold_data,
        silver_response_data, gold_response_data)``. The four JSON payloads
        are returned exactly as parsed; ``silver_data`` and ``gold_data`` are
        the nested dictionaries produced by ``process_data``.

    Raises:
        FileNotFoundError: If any of the expected files is missing.
    """
    data_dir = "view/ELOQ"

    def read_json(file_name):
        # Explicit UTF-8: the default text encoding of open() is
        # locale-dependent and can fail on non-ASCII content.
        with open(join(data_dir, file_name), "r", encoding="utf-8") as f:
            return json.load(f)

    news_data = read_json("news.json")
    question_data = read_json("questions.json")
    silver_table = pd.read_csv(join(data_dir, "silver.csv"))
    gold_table = pd.read_csv(join(data_dir, "gold.csv"))
    silver_response_data = read_json("silver_responses.json")
    gold_response_data = read_json("gold_responses.json")

    silver_data = process_data(silver_table, "silver")
    gold_data = process_data(gold_table, "gold")
    return (
        news_data,
        question_data,
        silver_data,
        gold_data,
        silver_response_data,
        gold_response_data,
    )

def process_data(data, file_name):
    """Index a label table by document id and then by question id.

    Args:
        data: A pandas DataFrame with ``doc_id`` and ``q_id`` columns plus the
            label columns required for ``file_name``.
        file_name: ``"gold"`` keeps the four LLM/human confusion and defusion
            label columns; any other value keeps only ``llm_confusion_label``.

    Returns:
        A ``defaultdict(dict)`` shaped like
        ``{doc_id: {q_id: {label_name: value, ...}, ...}, ...}``. If the same
        ``(doc_id, q_id)`` pair occurs more than once, the last row wins.
    """
    # The set of label columns depends only on the file, not on the row.
    if file_name == "gold":
        label_columns = (
            "llm_confusion_label",
            "human_confusion_label",
            "llm_defusion_label",
            "human_defusion_label",
        )
    else:
        label_columns = ("llm_confusion_label",)

    labels_by_doc = collections.defaultdict(dict)
    for _, record in data.iterrows():
        questions = labels_by_doc[record["doc_id"]]
        questions[record["q_id"]] = {
            column: record[column] for column in label_columns
        }
    return labels_by_doc

def sidebar_logic(gold_doc_ids, silver_doc_ids):
    """Render the sidebar selectors and return the user's selection.

    Three select boxes are shown in the sidebar: the data file ("gold" or
    "silver"), a topic, and a document id within that topic. A document's
    topic is the part of its id before the first underscore.

    Args:
        gold_doc_ids: Iterable of document ids available in the gold file.
        silver_doc_ids: Iterable of document ids available in the silver file.

    Returns:
        A ``(file_name, doc_id)`` tuple with the chosen data file name and the
        chosen document id.
    """
    st.sidebar.header("Which data file to view?")
    file_name = st.sidebar.selectbox("Choose Data Name:", ["gold", "silver"])
    doc_ids = gold_doc_ids if file_name == "gold" else silver_doc_ids

    # Group the document ids under their topic prefix.
    docs_by_topic = collections.defaultdict(list)
    for doc_id in doc_ids:
        docs_by_topic[doc_id.split("_")[0]].append(doc_id)

    # The topic selector gets its own label: it previously reused
    # "Choose Doc id:", which was misleading and left two selectboxes with
    # identical labels (a potential duplicate-widget clash if their options
    # ever coincided).
    topic = st.sidebar.selectbox("Choose Topic:", list(docs_by_topic))
    doc_id = st.sidebar.selectbox("Choose Doc id:", docs_by_topic[topic])
    return file_name, doc_id

def init():
    """Set up the page and return the current working directory.

    Switches the page to the wide layout, writes the title, makes sure the
    ``document_content`` and ``question_content`` session-state entries exist
    (defaulting to an empty string), and injects CSS that makes the first
    column sticky and scrollable.

    Returns:
        The current working directory as reported by ``os.getcwd()``.
    """
    st.set_page_config(layout="wide")
    working_dir = os.getcwd()
    st.title("Dataset Viewer")

    # Seed session-state slots only when they are not already present.
    for state_key in ("document_content", "question_content"):
        if state_key not in st.session_state:
            st.session_state[state_key] = ""

    sticky_column_css = """
        <style>
        /* Make the left column sticky */
        div[data-testid="column"]:nth-child(1) {
            position: sticky;
            top: 0;
            height: 100vh;
            overflow-y: auto;
            padding-right: 10px;
            padding-bottom: 30px;
            border-right: 2px solid #ccc;
        }
        </style>
        """
    st.markdown(sticky_column_css, unsafe_allow_html=True)
    return working_dir

def show_doc_contents(doc_data, doc_id):
    """Display the document whose ``doc_id`` matches, if there is one.

    The matching text is stored in ``st.session_state.document_content`` and
    written to the page under a heading. When no row matches, the stored
    content is cleared and a warning is shown instead of raising
    ``IndexError``.

    Args:
        doc_data: Table indexed with a boolean mask on its ``doc_id`` column
            and read from its ``document`` column (presumably a pandas
            DataFrame; confirm against the caller).
        doc_id: Identifier of the document to show.
    """
    matches = doc_data[doc_data["doc_id"] == doc_id]["document"].values
    if len(matches) == 0:
        # Unknown doc_id: avoid indexing into an empty result.
        st.session_state.document_content = ""
        st.warning(f"Document {doc_id} not found.")
        return

    # If several rows share the id, the first one is shown.
    st.session_state.document_content = matches[0]
    if st.session_state.document_content:
        st.write(f"### Document: {doc_id}")
        st.write(st.session_state.document_content)

def show_question_contents_and_labels(q_labels, question_data, gold_response_data):
    """Render each question with its optional gold response and its labels.

    For every question id in ``q_labels`` this writes a heading, a text area
    with the question text, a text area with the gold response when one
    exists for that id, and a single text area listing whichever of the four
    known labels are present.

    Args:
        q_labels: Mapping of question id to a mapping of label name to value.
        question_data: Mapping of question id to question text.
        gold_response_data: Mapping of question id to gold response text.
    """
    # Fixed display order for the labels.
    known_labels = (
        "llm_confusion_label",
        "human_confusion_label",
        "llm_defusion_label",
        "human_defusion_label",
    )
    for q_id, labels in q_labels.items():
        st.write(f"### Question: {q_id}")
        st.text_area("Question:", value=question_data[q_id], key=q_id)

        has_gold_response = q_id in gold_response_data
        if has_gold_response:
            st.text_area(
                "Gold Response:",
                value=gold_response_data[q_id],
                key=f"gold_response_{q_id}",
            )

        # One "name: value" line per label that this question actually has.
        label_lines = [
            f"{name}: {labels[name]}" for name in known_labels if name in labels
        ]
        if not label_lines:
            continue

        # Display all labels in one box
        st.text_area("Labels:", value="\n".join(label_lines), key=f"labels_{q_id}")
    

if __name__ == "__main__":
    # Page setup: layout, title, session-state defaults and CSS.
    cwd = init()

    # Download the dataset when the data directory itself is missing.
    # Checking only "view" is not enough: download_folder() creates "view"
    # before downloading, so a failed download would leave it behind and
    # every later run would skip the download and fail in load_data().
    data_dir = join("view", "ELOQ")
    if not os.path.isdir(data_dir):
        folder_path = download_folder()
        st.success(f"Downloaded to {folder_path}")

    # Load data
    (
        news_data,
        question_data,
        silver_data,
        gold_data,
        silver_response_data,
        gold_response_data,
    ) = load_data()

    # Sidebar: which file and which document to show.
    file_name, doc_id = sidebar_logic(gold_data.keys(), silver_data.keys())

    # Two columns; the numbers are relative widths.
    left, right = st.columns([2, 1.5])

    # Display document (title on the first line, then the content).
    with left:
        st.session_state.document_content = (
            news_data[doc_id]["title"] + "\n" + news_data[doc_id]["content"]
        )
        st.write(f"### Document: {doc_id}")
        st.write(st.session_state.document_content)

    # Display the questions and labels for the selected document.
    with right:
        q_labels = gold_data[doc_id] if file_name == "gold" else silver_data[doc_id]
        # NOTE(review): silver_response_data is loaded but never displayed;
        # gold responses are passed for both files. Confirm whether the silver
        # view should show silver responses instead.
        show_question_contents_and_labels(q_labels, question_data, gold_response_data)