Spaces:

zhiyuanpeng
/

ELOQ-Data-Viewer

Sleeping

File size: 5,910 Bytes

96322c0

import streamlit as st
import os
import json
from os.path import join
import pandas as pd
from huggingface_hub import snapshot_download
import collections

@st.cache_data
def download_folder():
    """Download the ELOQ dataset files from the Hugging Face Hub.

    Creates the local ``view`` directory if it does not exist and fetches
    every file matching ``ELOQ/*`` from the ``zhiyuanpeng/ELOQ`` dataset
    repository into it. The call is memoized by ``st.cache_data``.

    Returns:
        The local directory (``"view"``) the snapshot was written into.
    """
    local_dir = "view"
    os.makedirs(local_dir, exist_ok=True)
    st.info("Downloading folder from Hugging Face...It may take a while.")
    snapshot_download(
        repo_id="zhiyuanpeng/ELOQ",
        repo_type="dataset",
        local_dir=local_dir,  # Or a folder of your choice
        # `token` is the current name of the deprecated `use_auth_token`
        # argument; False means no stored credential is sent.
        token=False,
        allow_patterns=["ELOQ/*"],
    )
    return local_dir

def _read_json(path):
    """Parse the JSON file at *path*, decoding it as UTF-8."""
    # JSON is UTF-8 by specification; do not depend on the platform locale.
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)


@st.cache_data
def load_data():
    """Load every dataset file from ``view/ELOQ``.

    The JSON files are read as UTF-8. The two CSV label tables are read
    with pandas and then converted by ``process_data`` into nested
    dictionaries. The result is memoized by ``st.cache_data``.

    Returns:
        A 6-tuple ``(news_data, question_data, silver_data, gold_data,
        silver_response_data, gold_response_data)`` where the news,
        question and response entries are the parsed JSON contents and the
        silver/gold entries are the outputs of ``process_data``.

    Raises:
        FileNotFoundError: If any of the expected files is missing.
    """
    data_dir = "view/ELOQ"
    news_data = _read_json(join(data_dir, "news.json"))
    question_data = _read_json(join(data_dir, "questions.json"))
    silver_table = pd.read_csv(join(data_dir, "silver.csv"))
    gold_table = pd.read_csv(join(data_dir, "gold.csv"))
    silver_response_data = _read_json(join(data_dir, "silver_responses.json"))
    gold_response_data = _read_json(join(data_dir, "gold_responses.json"))
    silver_data = process_data(silver_table, "silver")
    gold_data = process_data(gold_table, "gold")
    return news_data, question_data, silver_data, gold_data, silver_response_data, gold_response_data

def process_data(data, file_name):
    """Index a label table by document id and question id.

    Args:
        data: DataFrame with ``doc_id``, ``q_id`` and label columns.
        file_name: ``"gold"`` keeps the LLM and human confusion/defusion
            labels (four columns); any other value keeps only
            ``llm_confusion_label``.

    Returns:
        A ``defaultdict(dict)`` shaped as
        ``{doc_id: {q_id: {label_name: label_value}}}``.
    """
    if file_name == "gold":
        label_columns = (
            "llm_confusion_label",
            "human_confusion_label",
            "llm_defusion_label",
            "human_defusion_label",
        )
    else:
        label_columns = ("llm_confusion_label",)

    labels_by_doc = collections.defaultdict(dict)
    for _, record in data.iterrows():
        labels_by_doc[record["doc_id"]][record["q_id"]] = {
            column: record[column] for column in label_columns
        }
    return labels_by_doc

def sidebar_logic(gold_doc_ids, silver_doc_ids):
    """Render the sidebar selectors and return the user's choice.

    The sidebar offers three select boxes: the data file (gold or silver),
    a topic, and a document id within that topic. A document's topic is
    the part of its id before the first underscore.

    Args:
        gold_doc_ids: Iterable of document ids in the gold data.
        silver_doc_ids: Iterable of document ids in the silver data.

    Returns:
        A ``(file_name, doc_id)`` tuple with the selected data file name
        (``"gold"`` or ``"silver"``) and the selected document id.
    """
    st.sidebar.header("Which data file to view?")
    file_name = st.sidebar.selectbox("Choose Data Name:", ["gold", "silver"])
    doc_ids = gold_doc_ids if file_name == "gold" else silver_doc_ids

    # Group document ids by topic, preserving first-seen order.
    topic_doc_ids = collections.defaultdict(list)
    for doc_id in doc_ids:
        topic_doc_ids[doc_id.split("_")[0]].append(doc_id)

    # The topic selector gets its own label: sharing "Choose Doc id:" with
    # the selector below mislabels it and lets the two widgets collide on
    # Streamlit's auto-generated ID when their option lists coincide.
    topic = st.sidebar.selectbox("Choose Topic:", list(topic_doc_ids))
    doc_id = st.sidebar.selectbox("Choose Doc id:", topic_doc_ids[topic])
    return file_name, doc_id

def init():
    """Set up the page and return the current working directory.

    Switches the page to the wide layout, writes the title, seeds the
    ``document_content`` and ``question_content`` session-state entries
    with empty strings when they are absent, and injects a CSS rule meant
    to keep the first column sticky while scrolling.

    Returns:
        The process's current working directory.
    """
    st.set_page_config(layout="wide")
    working_dir = os.getcwd()
    st.title("Dataset Viewer")

    # Only seed entries that are missing so existing values are kept.
    for slot in ("document_content", "question_content"):
        if slot not in st.session_state:
            st.session_state[slot] = ""

    sticky_left_column_css = """
        <style>
        /* Make the left column sticky */
        div[data-testid="column"]:nth-child(1) {
            position: sticky;
            top: 0;
            height: 100vh;
            overflow-y: auto;
            padding-right: 10px;
            padding-bottom: 30px;
            border-right: 2px solid #ccc;
        }
        </style>
        """
    st.markdown(sticky_left_column_css, unsafe_allow_html=True)
    return working_dir

def show_doc_contents(doc_data, doc_id):
    """Store and render the document whose ``doc_id`` column equals *doc_id*.

    The ``document`` value of the first matching row is written to
    ``st.session_state.document_content``; when it is truthy, a heading
    and the document text are rendered.

    Args:
        doc_data: Table with ``doc_id`` and ``document`` columns
            (presumably a pandas DataFrame; confirm with callers).
        doc_id: Identifier of the document to show.
    """
    matching_rows = doc_data[doc_data["doc_id"] == doc_id]
    first_document = matching_rows["document"].values[0]
    st.session_state.document_content = first_document
    if not st.session_state.document_content:
        return
    st.write(f"### Document: {doc_id}")
    st.write(st.session_state.document_content)

def show_question_contents_and_labels(q_labels, question_data, gold_response_data):
    """Render each question with its optional gold response and labels.

    Args:
        q_labels: Mapping of question id to a mapping of label name to
            label value.
        question_data: Mapping of question id to the question text.
        gold_response_data: Mapping of question id to a gold response;
            questions without an entry get no response box.
    """
    # Fixed display order; label names missing from a question are skipped.
    label_order = (
        "llm_confusion_label",
        "human_confusion_label",
        "llm_defusion_label",
        "human_defusion_label",
    )
    for q_id, labels in q_labels.items():
        st.write(f"### Question: {q_id}")
        st.text_area("Question:", value=question_data[q_id], key=q_id)
        if q_id in gold_response_data:
            st.text_area("Gold Response:", value=gold_response_data[q_id], key=f"gold_response_{q_id}")

        label_lines = [
            f"{name}: {labels[name]}" for name in label_order if name in labels
        ]
        if not label_lines:
            continue
        # All labels for the question share a single box, one per line.
        st.text_area("Labels:", value="\n".join(label_lines), key=f"labels_{q_id}")
    

if __name__ == "__main__":

    cwd = init()
    # Download unless every file that load_data() reads is already present.
    # Checking only for the "view" directory is not enough: download_folder()
    # creates that directory before downloading, so an interrupted download
    # would otherwise be skipped on every later run and load_data() would
    # fail on the missing files.
    data_dir = join("view", "ELOQ")
    required_files = (
        "news.json",
        "questions.json",
        "silver.csv",
        "gold.csv",
        "silver_responses.json",
        "gold_responses.json",
    )
    if not all(os.path.exists(join(data_dir, name)) for name in required_files):
        folder_path = download_folder()
        st.success(f"Downloaded to {folder_path}")
    # Load data
    news_data, question_data, silver_data, gold_data, silver_response_data, gold_response_data = load_data()
    # Sidebar: pick the data file and the document to display
    file_name, doc_id = sidebar_logic(gold_data.keys(), silver_data.keys())
    # Two-column layout; the numbers are relative column widths
    left, right = st.columns([2, 1.5])
    # Display document (title on the first line, content below)
    with left:
        st.session_state.document_content = news_data[doc_id]["title"] + "\n" + news_data[doc_id]["content"]
        st.write(f"### Document: {doc_id}")
        st.write(st.session_state.document_content)
    # Display the questions and labels of the selected document
    with right:
        q_labels = gold_data[doc_id] if file_name == "gold" else silver_data[doc_id]
        # NOTE(review): silver_response_data is loaded but never displayed;
        # gold responses are passed for both files. Confirm this is intended.
        show_question_contents_and_labels(q_labels, question_data, gold_response_data)