import streamlit as st
import pandas as pd
import random
import time
import string
import gspread
import os
import json
import datetime
import re
from oauth2client.service_account import ServiceAccountCredentials
# Set page config at the very beginning
# (st.set_page_config must be the first Streamlit command executed in the script.)
st.set_page_config(page_title="LLM Output Evaluation", layout="wide")
# Define the primary highlight color (keeping it consistent with previous apps)
HIGHLIGHT_COLOR = "#2c7be5"

def highlight_keyword(sentence, keyword, color=HIGHLIGHT_COLOR):
    """Return *sentence* with whole-word matches of *keyword* wrapped in a colored span.

    Args:
        sentence: Text to display (rendered as HTML by st.markdown downstream).
        keyword: Word to highlight; matched case-insensitively on word boundaries.
        color: CSS color used for the highlight (defaults to HIGHLIGHT_COLOR).

    Returns:
        The sentence with each whole-word occurrence wrapped in an HTML <span>.
    """
    # Bug fix: the previous replacement r"\g<0>" reproduced the match verbatim,
    # so no highlight markup was emitted and `color` was never used.
    pattern = r'\b' + re.escape(keyword) + r'\b'
    return re.sub(
        pattern,
        lambda m: f"<span style='color:{color}; font-weight:bold;'>{m.group(0)}</span>",
        sentence,
        flags=re.IGNORECASE,
    )
def generate_passcode(worker_id):
    """Build a completion code of the form EXP2-pilot-W<id>-<6 random chars>."""
    alphabet = string.ascii_uppercase + string.digits
    random_tail = ''.join(random.choice(alphabet) for _ in range(6))
    return f"EXP2-pilot-W{worker_id:02d}-{random_tail}"
def get_google_creds():
    """Build an authorized gspread client from the SERVICE_ACCOUNT_JSON env var.

    Returns the authorized client, or None (after showing a Streamlit error)
    when the variable is missing, malformed, or authorization fails.
    """
    raw_json = os.getenv("SERVICE_ACCOUNT_JSON")
    if not raw_json:
        st.error("Google service account credentials (SERVICE_ACCOUNT_JSON) not found in environment variables. Please configure your Streamlit app secrets or local environment.")
        return None
    try:
        creds_dict = json.loads(raw_json)
    except json.JSONDecodeError:
        st.error("Invalid JSON format in SERVICE_ACCOUNT_JSON environment variable. Please ensure it's a single, valid JSON string.")
        return None
    try:
        scopes = ["https://spreadsheets.google.com/feeds", "https://www.googleapis.com/auth/drive"]
        credentials = ServiceAccountCredentials.from_json_keyfile_dict(creds_dict, scopes)
        return gspread.authorize(credentials)
    except Exception as e:
        st.error(f"Error loading Google credentials: {e}")
        return None
def upload_to_google_drive(response_df):
    """Append the collected responses to the 'EXP2-pilot' Google Sheet.

    Creates the spreadsheet if it does not exist, writes the header row when
    the sheet is empty, appends all data rows in a single batched call, and
    clears st.session_state.responses afterwards so a Streamlit rerun does
    not re-upload the same rows.

    Args:
        response_df: pandas DataFrame with one row per recorded metric.
    """
    if response_df.empty:
        st.warning("No responses to upload.")
        return
    try:
        client = get_google_creds()
        if client is None:
            st.error("β Google credentials not loaded. Cannot upload results.")
            return
        sheet_name = "EXP2-pilot"  # Sheet name for Experiment 2
        try:
            sheet = client.open(sheet_name).sheet1
        except gspread.exceptions.SpreadsheetNotFound:
            st.info(f"Creating new Google Sheet: {sheet_name}")
            sheet = client.create(sheet_name).sheet1
        # Write headers only when the sheet has no header row yet; an existing
        # row that mismatches the DataFrame columns is left untouched and data
        # is appended as-is (same net behavior as before, minus dead code).
        current_sheet_headers = sheet.row_values(1) if sheet.row_count > 0 else []
        expected_headers = list(response_df.columns)
        if not current_sheet_headers:
            sheet.append_row(expected_headers)
        # Sanitize values the Sheets API cannot serialize (NaN / +-inf).
        response_df_clean = response_df.replace([float("inf"), float("-inf")], None).fillna("")
        data_to_upload = response_df_clean.values.tolist()
        if data_to_upload:
            # Append all rows at once for efficiency.
            sheet.append_rows(data_to_upload)
            # Bug fix: this string literal was previously split across two
            # source lines (unterminated string -> SyntaxError).
            st.success("β Your responses have been recorded successfully.")
            # Clear responses after successful upload to prevent re-uploading on rerun.
            st.session_state.responses = []
        else:
            st.warning("No new responses to upload.")
    except Exception as e:
        st.error("β Error uploading to Google Drive:")
        st.error(f"Details: {e}")
def record_section_responses(idx, sec_idx, current_sample_data, current_section_title, acc_score, comp_score, interp_score):
    """Record three rows (Accuracy / Completeness / Interpretability) for one section.

    Each row shares the same timestamp, worker identity, sample/section indices,
    stimulus text, keyword, and elapsed response time; only the metric name and
    score differ.
    """
    now_iso = datetime.datetime.now().isoformat()
    # Snapshot the elapsed time once, before appending, so every metric row
    # carries the identical response_time_sec value.
    section_start = st.session_state.get("response_start_time", time.time())
    elapsed = time.time() - section_start
    shared_fields = {
        "timestamp": now_iso,
        "worker_id": st.session_state.get("worker_id", "N/A"),
        "passcode": st.session_state.get("passcode", "N/A"),
        "sample_index": idx,
        "section_index_within_sample": sec_idx,
        "section_title": current_section_title,
        "original_text": current_sample_data["text"],
        "keyword": current_sample_data["keyword"],
        "response_time_sec": elapsed,
    }
    # One row per metric, in a fixed order.
    metric_scores = (
        ("Accuracy", acc_score),
        ("Completeness", comp_score),
        ("Interpretability", interp_score),
    )
    for metric_name, metric_score in metric_scores:
        st.session_state.responses.append({**shared_fields, "metric": metric_name, "score": metric_score})
def generate_rating_prompt(section_title: str) -> str:
    """Map a section title to the rating question shown above its radio buttons.

    Strips a leading "N. " numbering prefix and anything after the first colon,
    then dispatches on the normalized section name; unknown sections get a
    generic prompt built from the name itself.
    """
    # Drop the leading "N. " numbering prefix, if any.
    title = section_title.split(". ", 1)[1] if ". " in section_title else section_title
    # The section name is everything before the first colon (if present).
    name = (title.split(":", 1)[0] if ":" in title else title).strip().lower()
    # Substring keys tolerate singular/plural ("propert" covers both forms).
    dispatch = (
        ("engaged event", "How well does this capture the events involving the keyword in this situation? More specifically: "),
        ("generalizable propert", "How well does this reflect the relevant properties of the keyword in this situation? More specifically: "),
        ("evoked emotion", "How well does this capture the emotions evoked by the keyword in this situation? More specifically: "),
    )
    for needle, prompt in dispatch:
        if needle in name:
            return prompt
    return f"How well does this describe the {name}? More specifically: "
# --- Data Definition for Samples (Moved to after utility functions) ---
# Each stimulus is a dict with:
#   "text":         the source sentence shown to the participant
#   "keyword":      the word whose scene abstraction is being evaluated
#   "scene_output": LLM output keyed by numbered section title
#                   (Engaged Events / Generalizable Properties / Evoked Emotions),
#                   each mapping to a list of bullet strings to be rated.
stimuli_list = [
    {
        "text": "The mournful cry of a pair of crows and a single lost lamb added an eeriness to the scene.",
        "keyword": "crow",
        "scene_output": {
            "1. Engaged Events: What is happening in the situation?": [
                "They emit a mournful cry",
                "Their presence adds eeriness to the scene"
            ],
            "2. Generalizable Properties: What are the relevant properties of crow in the situation?": [
                "They are often associated with foreboding or ominous situations",
                "Their vocalizations can enhance the emotional tone of a setting"
            ],
            "3. Evoked Emotions: Which emotions do you observe in the situation?": [
                "Eerie: Their cries contribute to a haunting atmosphere.",
                "Mourning: Their sound suggests themes of loss and sorrow."
            ]
        }
    },
    {
        "text": "Not knowing what else to do, I got up. Tea, I told myself. Chamomile. Or white. White tea is soothing, and there's nothing in it that sets me off.",
        "keyword": "tea",
        "scene_output": {
            "1. Engaged Events: What is happening in the situation?": [  # Corrected
                "PersonX considers chamomile tea",
                "PersonX considers white tea",
                "PersonX plans to prepare tea"
            ],
            "2. Generalizable Properties: What are the relevant properties of tea in the situation?": [  # Corrected
                "It is associated with comfort and relaxation",
                "It has various types that can cater to different needs"
            ],
            "3. Evoked Emotions: Which emotions do you observe in the situation?": [  # Corrected
                "Comfort: The choice of tea is aimed at providing solace.",
                "Uncertainty: The initial indecision reflects a search for clarity."
            ]
        }
    },
    {
        "text": "One morning when Tessie lifted the lid of the crate, she found a beautiful monarch butterfly clinging upside down from the broken cocoon.",
        "keyword": "butterfly",
        "scene_output": {
            "1. Engaged Events: What is happening in the situation?": [
                "AnimalX clings to ObjectY",
                "AnimalX emerges from ObjectY"
            ],
            "2. Generalizable Properties: What are the relevant properties of butterfly in the situation?": [
                "It symbolizes transformation and beauty",
                "It represents new beginnings after a period of change"
            ],
            "3. Evoked Emotions: Which emotions do you observe in the situation?": [
                "Wonder: The discovery of a butterfly can evoke feelings of awe and appreciation for nature."
            ]
        }
    }
]
# --- Page Functions ---
def instructions_1():
    """Render instructions page 1/2; advance to page 2 when the button is clicked."""
    st.title("Experiment 2: LLM Scene Abstraction Evaluation")
    st.header("π Instructions (1/2)")
    # unsafe_allow_html so any embedded markup in the instructions renders.
    st.write(f"""
Welcome to Experiment 2! Hereβs how it works:
- You will read a sentence that contains a specific **keyword**.
- You will then see **scene-level information about the keyword** in the given situation, generated by a large language model (LLM).
- The information is organized into three sections:
1. **Engaged Events** β What is happening to the keyword in this situation?
2. **Generalizable Properties** β What context-relevant properties of the keyword are revealed through this situation?
3. **Evoked Emotions** β What emotions are associated with the keyword in this scene, and why?
Your task is to **evaluate each section** based on how well it reflects the information conveyed in the original sentence.
- For each section, please rate the following dimensions on a 1β5 scale:
- **Accuracy** β How accurate is it? Is the content factually consistent with the sentence?
- **Completeness** β How complete and rich is it? Does it fully capture the relevant aspects of the keyword?
- **Interpretability** β How interpretable is it? Is it easy to understand?
If you have questions or feedback, please feel free to let us know via email.
""", unsafe_allow_html=True)
    if st.button("Next β‘οΈ"):
        st.session_state.step = "instructions_2"
        st.rerun()  # re-run the script immediately so the next page renders
    st.stop()  # keep the rest of the script from executing while on this page
def instructions_2():
    """Render instructions page 2/2 (placeholder-notation guide); start training on click."""
    st.title("Experiment 2: LLM Scene Abstraction Evaluation")
    st.header("π Instructions (2/2)")
    st.write(f"""
Placeholder notation guide
In the scene descriptions, you will encounter placeholder labels like PersonX and AnimalX. These can be interpreted as follows:
- PersonX: someone in the scene
- PersonY: another individual in the scene
- AnimalX: some animal in the scene
- ObjectX: some non-living object in the scene
- PersonGroupX: a group of people
- AnimalGroupX: a group of animals (e.g., a flock of birds, a pack of wolves)
These labels are used instead of specific names to help you focus on the roles and actions of each entity in the scene, rather than their exact names or identities.
When you're ready, click below to begin!
""", unsafe_allow_html=True)
    if st.button("Start practicing βΆοΈ"):
        st.session_state.step = "training"
        # The response_start_time will be set inside the training() function
        # when the first section is actually displayed.
        st.rerun()
    st.stop()  # halt execution below this page until the button is pressed
def training():
st.title("Experiment 2: LLM Scene Abstraction Evaluation")
stimuli = stimuli_list # Using the predefined stimuli_list for training
idx = st.session_state.training_index
# --- Handle Training Completion ---
if idx >= len(stimuli):
st.session_state.training_complete = True
st.header("π Practice Complete!")
st.markdown("""
Sentence {idx + 1} of {total_samples}
", unsafe_allow_html=True) # Keyword display st.markdown( f"Keyword: {current_sample_data.get('keyword', 'N/A')}
", unsafe_allow_html=True ) # Text box st.markdown("Text:") text = current_sample_data['text'] keyword = current_sample_data['keyword'] pattern = re.compile(re.escape(keyword), re.IGNORECASE) text_with_bold = pattern.sub(r"\g<0>", text, count=1) st.markdown( f"""{prompt_text}
", unsafe_allow_html=True ) # Rating Keys (using session state to retrieve prior selections) acc_key = f"rating_acc_{idx}_{sec_idx}" comp_key = f"rating_comp_{idx}_{sec_idx}" interp_key = f"rating_interp_{idx}_{sec_idx}" # Retrieve current selected values from session state to pre-fill radio buttons current_acc_val = st.session_state.get(acc_key) current_comp_val = st.session_state.get(comp_key) current_interp_val = st.session_state.get(interp_key) # Accuracy st.markdown("How accurate is it? Is the content factually consistent with the sentence?
", unsafe_allow_html=True) acc = st.radio( label="Accuracy", options=[1,2,3,4,5], index=current_acc_val - 1 if current_acc_val else None, # Convert value (1-5) to index (0-4) key=acc_key, horizontal=True, label_visibility="collapsed" ) st.markdown("""How complete and rich is it? Does it fully capture the relevant aspects of the keyword?
", unsafe_allow_html=True) comp = st.radio( label="Completeness", options=[1,2,3,4,5], index=current_comp_val - 1 if current_comp_val else None, key=comp_key, horizontal=True, label_visibility="collapsed" ) st.markdown("""How interpretable is it? Is it easy to understand?
", unsafe_allow_html=True) interp = st.radio( label="Interpretability", options=[1,2,3,4,5], index=current_interp_val - 1 if current_interp_val else None, key=interp_key, horizontal=True, label_visibility="collapsed" ) st.markdown("""