File size: 29,179 Bytes
6a5434e
 
 
 
 
 
 
 
c95447d
428030b
becbb53
6a5434e
 
becbb53
 
 
 
c95447d
 
428030b
 
 
 
 
 
 
 
becbb53
 
 
 
 
c95447d
 
 
 
 
 
 
 
 
428030b
c95447d
 
 
 
 
 
 
becbb53
c95447d
 
 
 
 
 
 
 
 
 
 
becbb53
 
c95447d
 
 
 
 
 
 
 
 
 
 
 
b568a13
 
becbb53
c95447d
b568a13
 
 
c95447d
becbb53
c95447d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
becbb53
 
 
 
c95447d
becbb53
428030b
becbb53
 
c95447d
becbb53
 
c95447d
 
 
 
becbb53
 
 
 
c95447d
 
 
becbb53
 
 
 
 
428030b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
becbb53
428030b
becbb53
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
428030b
becbb53
 
 
 
428030b
becbb53
 
 
428030b
becbb53
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c95447d
 
 
428030b
d609ac9
 
 
becbb53
d609ac9
6a5434e
becbb53
 
6a5434e
d609ac9
 
 
 
 
 
6a5434e
d609ac9
 
 
 
6a5434e
d609ac9
 
 
6a5434e
d609ac9
 
 
 
428030b
6a5434e
becbb53
d609ac9
 
 
becbb53
d609ac9
6a5434e
becbb53
6a5434e
becbb53
 
 
 
6a5434e
becbb53
 
6a5434e
d609ac9
6a5434e
d609ac9
 
 
 
6a5434e
d609ac9
 
428030b
 
d609ac9
428030b
6a5434e
d609ac9
 
6a5434e
c95447d
d609ac9
6a5434e
428030b
d609ac9
 
 
 
b568a13
d609ac9
 
 
 
 
6a5434e
428030b
d609ac9
428030b
c95447d
becbb53
d609ac9
c95447d
428030b
becbb53
 
 
 
 
c95447d
becbb53
 
 
 
d609ac9
6a5434e
d609ac9
 
428030b
6a5434e
c95447d
becbb53
c95447d
6a5434e
becbb53
 
 
c95447d
428030b
 
6a5434e
becbb53
c95447d
 
6a5434e
428030b
becbb53
428030b
 
 
c95447d
 
 
428030b
c95447d
 
 
 
428030b
c95447d
becbb53
c95447d
 
6a5434e
428030b
c95447d
becbb53
 
c95447d
 
 
 
 
 
 
 
 
 
 
 
 
6a5434e
428030b
c95447d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6a5434e
428030b
becbb53
c95447d
 
 
 
 
6a5434e
428030b
c95447d
 
 
 
 
 
 
6a5434e
c95447d
 
 
d609ac9
 
428030b
c95447d
 
 
 
428030b
 
 
 
becbb53
428030b
c95447d
 
 
 
428030b
c95447d
 
 
 
d609ac9
c95447d
becbb53
c95447d
 
 
 
 
 
 
 
 
 
428030b
c95447d
 
 
 
428030b
c95447d
 
 
 
d609ac9
c95447d
becbb53
c95447d
 
 
 
 
 
 
 
 
 
428030b
c95447d
 
 
 
428030b
c95447d
 
 
 
d609ac9
c95447d
becbb53
c95447d
 
 
 
 
 
 
 
428030b
c95447d
 
 
428030b
c95447d
becbb53
c95447d
 
428030b
c95447d
becbb53
c95447d
 
0f21577
 
428030b
c95447d
becbb53
 
c95447d
 
becbb53
 
 
 
 
c95447d
 
 
 
 
 
 
 
 
 
becbb53
c95447d
428030b
becbb53
 
428030b
becbb53
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
428030b
becbb53
 
 
428030b
 
becbb53
 
428030b
becbb53
 
 
428030b
becbb53
 
 
 
 
 
 
d609ac9
428030b
becbb53
 
428030b
becbb53
 
 
 
 
 
 
 
 
a1f0616
becbb53
 
428030b
becbb53
 
 
 
428030b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
becbb53
 
 
 
 
 
 
 
 
428030b
becbb53
 
428030b
becbb53
428030b
becbb53
 
428030b
becbb53
428030b
 
becbb53
428030b
becbb53
428030b
becbb53
 
428030b
becbb53
428030b
becbb53
 
 
428030b
becbb53
 
d609ac9
becbb53
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
import streamlit as st
import pandas as pd
import random
import time
import string
import gspread
import os
import json
import datetime
import re

from oauth2client.service_account import ServiceAccountCredentials

# Set page config at the very beginning
st.set_page_config(page_title="LLM Output Evaluation", layout="wide")

# Define the primary highlight color (keeping it consistent with previous apps)
HIGHLIGHT_COLOR = "#2c7be5"

# --- ALL UTILITY FUNCTIONS DEFINED AT THE TOP (Solving NameError) ---

def highlight_keyword(sentence, keyword, color=HIGHLIGHT_COLOR):
    """Wrap whole-word, case-insensitive matches of ``keyword`` in a colored <strong> tag.

    Args:
        sentence: Text to search.
        keyword: Word (or phrase) to highlight. Regex metacharacters are escaped.
        color: CSS color inserted into the inline ``style`` attribute.

    Returns:
        ``sentence`` with every whole-word match wrapped in
        ``<strong style='color:...;'>...</strong>``. The matched text keeps
        its original casing. If ``keyword`` is empty, ``sentence`` is returned
        unchanged.
    """
    # An empty keyword would turn the pattern into r"\b\b", which matches at
    # every word boundary and sprinkles empty <strong> tags through the text.
    if not keyword:
        return sentence

    # A callable replacement keeps `color` out of the regex replacement
    # template, so backslashes in it are never interpreted as escapes.
    def _wrap(match):
        return f"<strong style='color:{color};'>{match.group(0)}</strong>"

    return re.sub(r'\b' + re.escape(keyword) + r'\b', _wrap, sentence, flags=re.IGNORECASE)

def generate_passcode(worker_id):
    """Build a completion code of the form ``EXP2-pilot-W<id>-<6 random chars>``.

    Args:
        worker_id: Participant number. ``None`` is treated as 0, and values
            convertible with ``int()`` (e.g. ``"4"``) are accepted.

    Returns:
        The passcode string. The id is zero-padded to at least two digits and
        the suffix is drawn from uppercase ASCII letters and digits.

    Raises:
        ValueError: If ``worker_id`` cannot be converted to an integer.
    """
    # Session state initialises worker_id to None, so a caller that skipped the
    # ID form would otherwise crash on the ":02d" format spec.
    worker_number = 0 if worker_id is None else int(worker_id)
    suffix = ''.join(random.choices(string.ascii_uppercase + string.digits, k=6))
    return f"EXP2-pilot-W{worker_number:02d}-{suffix}"

def get_google_creds():
    """Authorize gspread from the SERVICE_ACCOUNT_JSON environment variable.

    Returns:
        The result of ``gspread.authorize`` on success, or ``None`` after
        reporting the problem through ``st.error`` when the variable is
        missing/empty, is not valid JSON, or any other error occurs.
    """
    raw_credentials = os.getenv("SERVICE_ACCOUNT_JSON")
    if not raw_credentials:
        st.error("Google service account credentials (SERVICE_ACCOUNT_JSON) not found in environment variables. Please configure your Streamlit app secrets or local environment.")
        return None

    try:
        key_info = json.loads(raw_credentials)
        api_scopes = [
            "https://spreadsheets.google.com/feeds",
            "https://www.googleapis.com/auth/drive",
        ]
        credentials = ServiceAccountCredentials.from_json_keyfile_dict(key_info, api_scopes)
        return gspread.authorize(credentials)
    except json.JSONDecodeError:
        st.error("Invalid JSON format in SERVICE_ACCOUNT_JSON environment variable. Please ensure it's a single, valid JSON string.")
    except Exception as e:
        st.error(f"Error loading Google credentials: {e}")
    return None
        
def upload_to_google_drive(response_df):
    """Append the collected responses to the first worksheet of "EXP2-pilot".

    The spreadsheet is created when it does not exist yet, and a header row
    (the DataFrame's column names) is written only when the first row of the
    sheet is empty. Existing headers are never rewritten, even if they differ
    from the DataFrame's columns. On success the in-session response buffer
    (``st.session_state.responses``) is cleared so a rerun does not upload the
    same rows again.

    Args:
        response_df: pandas DataFrame with one row per recorded rating.

    Returns:
        None. Progress and failures are reported through Streamlit messages;
        no exception propagates to the caller.
    """
    if response_df.empty:
        st.warning("No responses to upload.")
        return

    try:
        client = get_google_creds()
        if client is None:
            st.error("❌ Google credentials not loaded. Cannot upload results.")
            return

        sheet_name = "EXP2-pilot"  # Sheet name for Experiment 2

        try:
            sheet = client.open(sheet_name).sheet1
        except gspread.exceptions.SpreadsheetNotFound:
            st.info(f"Creating new Google Sheet: {sheet_name}")
            sheet = client.create(sheet_name).sheet1

        # Write the header row only when the sheet has none.
        current_sheet_headers = sheet.row_values(1) if sheet.row_count > 0 else []
        if not current_sheet_headers:
            sheet.append_row(list(response_df.columns))

        # Map +/-inf to NaN explicitly, then blank out every missing value.
        # Passing value=None to replace() with a list is version-dependent:
        # older pandas releases treat it as "no value given" and pad-fill from
        # the previous row instead of clearing the cell.
        response_df_clean = response_df.replace(
            [float("inf"), float("-inf")], float("nan")
        ).fillna("")
        data_to_upload = response_df_clean.values.tolist()

        # Append all rows in a single call for efficiency.
        sheet.append_rows(data_to_upload)
        st.success("✅ Your responses have been recorded successfully.")
        # Clear responses after successful upload to prevent re-uploading on rerun.
        st.session_state.responses = []

    except Exception as e:
        # Top-level boundary for the upload: surface the failure to the user
        # instead of crashing the page.
        st.error("❌ Error uploading to Google Drive:")
        st.error(f"Details: {e}")

# Function to record responses for the current section
def record_section_responses(idx, sec_idx, current_sample_data, current_section_title, acc_score, comp_score, interp_score):
    """Append one response record per metric for the section just rated.

    Three dicts (Accuracy, Completeness, Interpretability) are appended to
    ``st.session_state.responses``. They share the same timestamp, worker id,
    passcode, sample/section identifiers and response time, and differ only
    in their "metric" and "score" entries.

    Args:
        idx: Index of the current sample.
        sec_idx: Index of the section within the sample.
        current_sample_data: Mapping providing the "text" and "keyword" entries.
        current_section_title: Title of the section that was rated.
        acc_score: Accuracy rating.
        comp_score: Completeness rating.
        interp_score: Interpretability rating.
    """
    state = st.session_state
    participant = state.get("worker_id", "N/A")
    completion_code = state.get("passcode", "N/A")
    recorded_at = datetime.datetime.now().isoformat()

    # Elapsed time is measured once, up front, so all three rows carry the
    # same value; a missing start time falls back to "now".
    section_started = state.get("response_start_time", time.time())
    elapsed = time.time() - section_started

    # Fields common to every metric row of this section.
    shared_fields = {
        "timestamp": recorded_at,
        "worker_id": participant,
        "passcode": completion_code,
        "sample_index": idx,
        "section_index_within_sample": sec_idx,
        "section_title": current_section_title,
        "original_text": current_sample_data["text"],
        "keyword": current_sample_data["keyword"],
        "response_time_sec": elapsed,
    }

    metric_scores = (
        ("Accuracy", acc_score),
        ("Completeness", comp_score),
        ("Interpretability", interp_score),
    )
    for metric_name, metric_value in metric_scores:
        state.responses.append({**shared_fields, "metric": metric_name, "score": metric_value})

def generate_rating_prompt(section_title: str) -> str:
    """Return the lead-in question shown above the rating scales for a section.

    The section name is the part of ``section_title`` before the first colon,
    with any leading enumerator such as ``"1. "`` removed, compared
    case-insensitively.

    Args:
        section_title: Full section title, e.g.
            ``"1. Engaged Events: What is happening in the situation?"``.

    Returns:
        A dedicated prompt for the "Engaged Events", "Generalizable
        Properties" and "Evoked Emotions" sections, otherwise a generic
        prompt that embeds the lower-cased section name.
    """
    # Strip only a genuine leading number ("1. ", "12."). Splitting on the
    # first ". " anywhere would discard the section name of an unnumbered
    # title that merely contains a sentence break.
    title = re.sub(r"^\s*\d+\.\s*", "", section_title)

    # Everything before the first colon is the section name.
    section_name = title.split(":", 1)[0].strip().lower()

    if "engaged event" in section_name:
        return "How well does this capture the events involving the keyword in this situation? More specifically: "
    elif "generalizable propert" in section_name:  # 'propert' for 'property' or 'properties'
        return "How well does this reflect the relevant properties of the keyword in this situation? More specifically: "
    elif "evoked emotion" in section_name:
        return "How well does this capture the emotions evoked by the keyword in this situation? More specifically: "
    else:
        return f"How well does this describe the {section_name}? More specifically: "


# --- Data Definition for Samples (Moved to after utility functions) ---
def _make_stimulus(text, keyword, events, properties, emotions):
    """Assemble one practice stimulus in the structure the pages expect.

    The three section titles are fixed apart from the keyword, which is
    embedded in the "Generalizable Properties" title.
    """
    return {
        "text": text,
        "keyword": keyword,
        "scene_output": {
            "1. Engaged Events: What is happening in the situation?": list(events),
            f"2. Generalizable Properties: What are the relevant properties of {keyword} in the situation?": list(properties),
            "3. Evoked Emotions: Which emotions do you observe in the situation?": list(emotions),
        },
    }


# Practice stimuli: each entry pairs a sentence and its keyword with the
# scene-level bullet points to be rated, grouped by section.
stimuli_list = [
    _make_stimulus(
        text="The mournful cry of a pair of crows and a single lost lamb added an eeriness to the scene.",
        keyword="crow",
        events=[
            "They emit a mournful cry",
            "Their presence adds eeriness to the scene",
        ],
        properties=[
            "They are often associated with foreboding or ominous situations",
            "Their vocalizations can enhance the emotional tone of a setting",
        ],
        emotions=[
            "Eerie: Their cries contribute to a haunting atmosphere.",
            "Mourning: Their sound suggests themes of loss and sorrow.",
        ],
    ),
    _make_stimulus(
        text="Not knowing what else to do, I got up. Tea, I told myself. Chamomile. Or white. White tea is soothing, and there's nothing in it that sets me off.",
        keyword="tea",
        events=[
            "PersonX considers chamomile tea",
            "PersonX considers white tea",
            "PersonX plans to prepare tea",
        ],
        properties=[
            "It is associated with comfort and relaxation",
            "It has various types that can cater to different needs",
        ],
        emotions=[
            "Comfort: The choice of tea is aimed at providing solace.",
            "Uncertainty: The initial indecision reflects a search for clarity.",
        ],
    ),
    _make_stimulus(
        text="One morning when Tessie lifted the lid of the crate, she found a beautiful monarch butterfly clinging upside down from the broken cocoon.",
        keyword="butterfly",
        events=[
            "AnimalX clings to ObjectY",
            "AnimalX emerges from ObjectY",
        ],
        properties=[
            "It symbolizes transformation and beauty",
            "It represents new beginnings after a period of change",
        ],
        emotions=[
            "Wonder: The discovery of a butterfly can evoke feelings of awe and appreciation for nature.",
        ],
    ),
]


# --- Page Functions ---

def instructions_1():
    """Render the first instructions page (task overview and rating dimensions).

    Clicking "Next" sets ``st.session_state.step`` to ``"instructions_2"`` and
    reruns the script; otherwise execution is halted with ``st.stop()`` once
    the page has been drawn.
    """
    st.title("Experiment 2: LLM Scene Abstraction Evaluation")
    st.header("📖 Instructions (1/2)")
    # The indentation inside this string drives the Markdown list nesting;
    # keep it intact when editing the text.
    st.write(f"""
    Welcome to Experiment 2! Here’s how it works:

    - You will read a sentence that contains a specific <span style='color:{HIGHLIGHT_COLOR}; font-weight:500;'>**keyword**</span>.
    - You will then see <span style='color:{HIGHLIGHT_COLOR}; font-weight:500;'>**scene-level information about the keyword** in the given situation</span>, generated by a large language model (LLM).

        - The information is organized into three sections:
          1. **Engaged Events** — What is happening to the keyword in this situation?
          2. **Generalizable Properties** — What context-relevant properties of the keyword are revealed through this situation?
          3. **Evoked Emotions** — What emotions are associated with the keyword in this scene, and why?
    <br>
    Your task is to **evaluate each section** based on how well it reflects the information conveyed in the original sentence.

    - For each section, please rate the following dimensions on a 1–5 scale:
      - **Accuracy** — How accurate is it? Is the content factually consistent with the sentence?
      - **Completeness** — How complete and rich is it? Does it fully capture the relevant aspects of the keyword?
      - **Interpretability** — How interpretable is it? Is it easy to understand?

    <br>
    If you have questions or feedback, please feel free to let us know via email.
    <br><br>

    """, unsafe_allow_html=True)
    if st.button("Next ➡️"):
        st.session_state.step = "instructions_2"
        st.rerun()
    st.stop()


def instructions_2():
    """Render the second instructions page (placeholder notation guide).

    Clicking the start button sets ``st.session_state.step`` to ``"training"``
    and reruns the script; otherwise execution is halted with ``st.stop()``
    once the page has been drawn.
    """
    st.title("Experiment 2: LLM Scene Abstraction Evaluation")
    st.header("📖 Instructions (2/2)")
    st.write(f"""
    <b>Placeholder notation guide</b><br>

    In the scene descriptions, you will encounter placeholder labels like <span style='color:{HIGHLIGHT_COLOR}; font-weight:600;'>PersonX</span> and <span style='color:{HIGHLIGHT_COLOR}; font-weight:600;'>AnimalX</span>. These can be interpreted as follows:

    - <span style='color:{HIGHLIGHT_COLOR}; font-weight:600;'>PersonX</span>: someone in the scene  
    - <span style='color:{HIGHLIGHT_COLOR}; font-weight:600;'>PersonY</span>: another individual in the scene  
    - <span style='color:{HIGHLIGHT_COLOR}; font-weight:600;'>AnimalX</span>: some animal in the scene  
    - <span style='color:{HIGHLIGHT_COLOR}; font-weight:600;'>ObjectX</span>: some non-living object in the scene  

    - <span style='color:{HIGHLIGHT_COLOR}; font-weight:600;'>PersonGroupX</span>: a group of people
    - <span style='color:{HIGHLIGHT_COLOR}; font-weight:600;'>AnimalGroupX</span>: a group of animals (e.g., a flock of birds, a pack of wolves)  

    These labels are used instead of specific names to help you focus on the roles and actions of each entity in the scene, rather than their exact names or identities.

    When you're ready, click below to begin!
    <br>
    <br>
    """, unsafe_allow_html=True)

    if st.button("Start practicing ▶️"):
        st.session_state.step = "training"
        # The response timer is started inside training() when the first
        # section is actually displayed, not here.
        st.rerun()
    st.stop()

def _render_rating_scale(question, label, scale_labels, key):
    """Draw one 1-5 rating question with its legend and return the selected value (or None)."""
    st.markdown(f"<p style='font-size:16px;'>{question}</p>", unsafe_allow_html=True)
    # Pre-select whatever this widget already holds in session state.
    previous_value = st.session_state.get(key)
    choice = st.radio(
        label=label,
        options=[1, 2, 3, 4, 5],
        index=previous_value - 1 if previous_value else None,  # value (1-5) -> index (0-4)
        key=key,
        horizontal=True,
        label_visibility="collapsed",
    )
    legend = ", ".join(
        f"{value} = {text}" for value, text in enumerate(scale_labels, start=1)
    )
    st.markdown(f"<div class='radio-description'>{legend}</div>", unsafe_allow_html=True)
    return choice


def _show_training_complete():
    """Show the practice-complete page, upload the responses and halt the script."""
    st.session_state.training_complete = True
    st.header("🎉 Practice Complete!")
    st.markdown("""
    <div style='font-size:18px; line-height:1.6;'>
        You've successfully completed the training phase of the experiment. Great work! 🎯<br><br>
        If you have any questions, suggestions, or feedback about the task, please let us know.<br>
        If everything is clear, just let us know that you're ready to proceed to the main experiment.
    </div>
    """, unsafe_allow_html=True)

    # Generate a passcode if none exists yet (e.g. training started directly).
    # worker_id is initialised to None in session state, so fall back to 0
    # explicitly rather than relying on the .get() default.
    if st.session_state.passcode is None:
        st.session_state.passcode = generate_passcode(st.session_state.get("worker_id") or 0)

    response_df = pd.DataFrame(st.session_state.responses)

    # Fixed column order so rows line up with the Google Sheet headers.
    column_order = [
        "timestamp", "worker_id", "passcode", "sample_index", "section_index_within_sample",
        "section_title", "original_text", "keyword", "metric", "score", "response_time_sec"
    ]
    response_df = response_df[[col for col in column_order if col in response_df.columns]]

    upload_to_google_drive(response_df)

    st.markdown("#### 🔑 Your Unique Completion Code")
    st.code(st.session_state.passcode)
    st.stop()


def training():
    """Run the practice phase: one form per (sample, section) with three ratings.

    Progress is tracked in ``st.session_state.training_index`` (sample) and
    ``st.session_state.section_index`` (section within the sample). A submit
    is accepted only when all three ratings are chosen and at least five
    seconds have passed since the section was first shown. Accepted ratings
    are stored through ``record_section_responses``. After the last sample the
    completion page is shown and the responses are uploaded.

    The function always ends the current script run, via ``st.rerun()`` or
    ``st.stop()``.
    """
    st.title("Experiment 2: LLM Scene Abstraction Evaluation")

    stimuli = stimuli_list  # Using the predefined stimuli_list for training
    idx = st.session_state.training_index

    # --- Handle Training Completion ---
    if idx >= len(stimuli):
        _show_training_complete()
        return  # not reached: the helper halts the script with st.stop()

    # --- Display Current Sample and Section ---
    current_sample_data = stimuli[idx]
    total_samples = len(stimuli)
    section_keys = list(current_sample_data['scene_output'].keys())

    # Initialise section_index, or reset it if it is out of range for this sample.
    if "section_index" not in st.session_state or \
       st.session_state.section_index >= len(section_keys):
        st.session_state.section_index = 0
        st.session_state.response_start_time = time.time()

    sec_idx = st.session_state.section_index
    current_section_title = section_keys[sec_idx]

    # Start the timer on the very first display (the entry point seeds it with 0).
    if not st.session_state.get("response_start_time"):
        st.session_state.response_start_time = time.time()

    with st.form(key=f"form_{idx}_{sec_idx}"):
        left_col, right_col = st.columns([1, 1])

        # --- Left Column: Sentence and Section Summary ---
        with left_col:
            st.markdown(f"<p style='color: gray; font-size: 15px;'>Sentence {idx + 1} of {total_samples}</p>", unsafe_allow_html=True)
            # Keyword display
            st.markdown(
                f"<p style='font-size:18px; font-weight: bold; color: {HIGHLIGHT_COLOR};'>Keyword: {current_sample_data.get('keyword', 'N/A')}</p>",
                unsafe_allow_html=True
            )

            # Text box with the first (case-insensitive) keyword occurrence in bold.
            st.markdown("Text:")
            text = current_sample_data['text']
            keyword = current_sample_data['keyword']
            pattern = re.compile(re.escape(keyword), re.IGNORECASE)
            text_with_bold = pattern.sub(r"<b>\g<0></b>", text, count=1)
            st.markdown(
                f"""
                <div style='border: 1px solid #ccc; border-radius: 6px;
                            padding: 12px; background-color: #ffffff;
                            color: #000000; font-size: 16px; line-height: 1.6;
                            margin-bottom: 1.2rem;'>
                    {text_with_bold}
                </div>
                """,
                unsafe_allow_html=True
            )

            # Section title and description box. Split on the first colon only,
            # so a subtitle that itself contains a colon is kept intact.
            st.markdown("Scene information:")
            bold_title, _, subtitle = current_section_title.partition(":")
            bold_title = bold_title.strip()
            subtitle = subtitle.strip()
            # Skip the subtitle line entirely rather than rendering "( )".
            subtitle_html = (
                f"<div style='font-weight: normal; font-size: 15px; margin-top: 4px;'>( {subtitle} )</div>"
                if subtitle else ""
            )

            st.markdown(
                f"""
                <div style='border-left: 4px solid {HIGHLIGHT_COLOR}; background-color: #ffffff;
                            color: #000000; padding: 12px 16px; border-radius: 6px;
                            margin: 1rem 0 0.8rem 0; font-size: 16px; font-weight: 600;'>
                    <div><b>{bold_title}</b></div>
                    {subtitle_html}
                </div>
                """,
                unsafe_allow_html=True
            )

            # Scene output bullets
            bullets = current_sample_data['scene_output'][current_section_title]
            st.markdown(
                "<ul style='margin-bottom: 0.2rem; padding-left: 1.2rem;'>" +
                "".join(f"<li style='margin-bottom: 0.2rem; font-size:17px;'>{b}</li>" for b in bullets) +
                "</ul>", unsafe_allow_html=True
            )

        # --- Right Column: Evaluation ---
        with right_col:
            highlight = "the keyword"
            prompt_text = generate_rating_prompt(current_section_title).replace(
                highlight,
                f"<b style='color:{HIGHLIGHT_COLOR};'>{highlight}</b>"
            )
            st.markdown(
                f"<p style='font-size:16px; font-weight:normal; margin-bottom: 1.2rem;'>{prompt_text}</p>",
                unsafe_allow_html=True
            )

            acc = _render_rating_scale(
                "How accurate is it? Is the content factually consistent with the sentence?",
                "Accuracy",
                ["Very inaccurate", "Inaccurate", "Neutral", "Accurate", "Very accurate"],
                f"rating_acc_{idx}_{sec_idx}",
            )

            st.markdown("<hr style='margin: 1rem 0;'/>", unsafe_allow_html=True)

            comp = _render_rating_scale(
                "How complete and rich is it? Does it fully capture the relevant aspects of the keyword?",
                "Completeness",
                ["Very incomplete", "Incomplete", "Moderate", "Mostly complete", "Very complete"],
                f"rating_comp_{idx}_{sec_idx}",
            )

            st.markdown("<hr style='margin: 1rem 0;'/>", unsafe_allow_html=True)

            interp = _render_rating_scale(
                "How interpretable is it? Is it easy to understand?",
                "Interpretability",
                ["Very difficult to interpret", "Difficult", "Moderate", "Easy", "Very easy to interpret"],
                f"rating_interp_{idx}_{sec_idx}",
            )

            # Navigation button within the form
            st.markdown("<div style='margin-top: 2rem;'></div>", unsafe_allow_html=True)
            submit_button = st.form_submit_button("Next ➡️")

    # --- Form submission handler ---
    if submit_button:
        # Validate all ratings are selected
        if acc is None or comp is None or interp is None:
            st.warning("⚠️ Please complete all ratings before proceeding.")
            st.stop()

        # Require a minimum of 5 seconds per section for careful evaluation.
        response_time = time.time() - st.session_state.response_start_time
        if response_time < 5:
            st.warning("⚠️ Please take enough time to read and evaluate carefully before proceeding.")
            st.stop()

        # If all validations pass, record responses for this section
        record_section_responses(
            idx=idx,
            sec_idx=sec_idx,
            current_sample_data=current_sample_data,
            current_section_title=current_section_title,
            acc_score=acc,
            comp_score=comp,
            interp_score=interp
        )

        # Advance to the next section, or to the first section of the next sample.
        if st.session_state.section_index < len(section_keys) - 1:
            st.session_state.section_index += 1
        else:
            st.session_state.section_index = 0
            st.session_state.training_index += 1
        st.session_state.response_start_time = time.time()  # Restart timer for what is shown next
        st.rerun()
    st.stop()


# --- Main App Flow Manager ---
def instructions_page_manager():
    """Dispatch to the page matching ``st.session_state.step``.

    Steps handled: ``"worker_id_input"`` (participant ID form),
    ``"instructions_1"``, ``"instructions_2"``, ``"training"`` and
    ``"training_complete"`` (upload results and show the completion code).
    Any other step value renders nothing.
    """
    # Page 0: Worker ID Input (first logical step)
    if st.session_state.step == "worker_id_input":
        st.title("Welcome to Experiment 2")
        st.write("Please enter your participant ID to begin the pilot run:")

        with st.form(key='worker_id_form'):
            participant_input = st.text_input("Participant ID (e.g., 4)")
            submit_btn = st.form_submit_button("Submit")

        if submit_btn:
            try:
                worker_id = int(participant_input)
                # A negative ID would yield a malformed passcode such as "W-4".
                if worker_id < 0:
                    raise ValueError("participant ID must be non-negative")
            except ValueError:
                st.error("Please enter a valid numeric ID.")
            else:
                st.session_state.worker_id = worker_id
                st.session_state.passcode = generate_passcode(worker_id)
                st.session_state.step = "instructions_1"  # Move to instructions page 1
                st.rerun()
        st.stop()

    # Page 1: Instructions (1/2)
    elif st.session_state.step == "instructions_1":
        instructions_1()

    # Page 2: Instructions (2/2)
    elif st.session_state.step == "instructions_2":
        instructions_2()

    # Training Phase
    elif st.session_state.step == "training":
        training()

    # Training Complete Page
    elif st.session_state.step == "training_complete":
        st.header("🎉 Practice Complete!")
        st.markdown("""
            You have completed the practice phase! Please let us know if you had any questions or comments on the task/experiment. If everything is clear, we will provide you the link for the main experiment.
        """, unsafe_allow_html=True)

        # Prepare DataFrame for upload
        response_df = pd.DataFrame(st.session_state.responses)

        # Fixed column order so rows line up with the Google Sheet headers.
        expected_upload_cols = [
            "timestamp", "worker_id", "passcode", "sample_index", "section_index_within_sample",
            "section_title", "original_text", "keyword", "metric", "score", "response_time_sec"
        ]

        # Keep only the expected columns that are actually present, in order.
        final_response_df = response_df[[col for col in expected_upload_cols if col in response_df.columns]]

        upload_to_google_drive(final_response_df)

        st.markdown("#### 🔑 Your Unique Completion Code")
        st.code(st.session_state.passcode)
        st.stop()


# --- Main App Entry Point ---
if __name__ == "__main__":
    # Seed every session-state entry the pages rely on, leaving values that
    # already exist from an earlier run of the script untouched.
    session_defaults = {
        "step": "worker_id_input",   # first page shown
        "worker_id": None,
        "passcode": None,
        "training_index": 0,         # current practice sample
        "section_index": 0,          # current section within the sample
        "responses": [],             # individual rating records collected so far
        "response_start_time": 0,    # 0 = timer not started; set when a section is displayed
    }
    for state_key, initial_value in session_defaults.items():
        if state_key not in st.session_state:
            st.session_state[state_key] = initial_value

    # Global CSS shared by all pages.
    global_css = """
        <style>
        .stApp {
            font-size: 17px !important;
            font-family: 'Verdana', sans-serif !important;
        }
        /* Standardize radio button label size */
        div[data-testid="stRadio"] label,
        div[data-testid="stRadio"] label > div > span,
        div[data-testid="stRadio"] p { /* Target the paragraph inside label for consistent sizing */
            font-family: 'Verdana', sans-serif !important;
            font-size: 17px !important; /* Main content font size */
        }
        div[data-testid="stMarkdownContainer"] p,
        div[data-testid="stVerticalBlock"] p { /* General paragraph text */
            font-size: 17px !important;
            font-family: 'Verdana', sans-serif !important;
        }
        /* Specific highlight for instructions */
        .highlight-blue {
            color: #2c7be5; /* Using direct color for this specific style */
            font-weight: 500;
        }
        /* General highlight red (if used for warnings/important text) */
        .highlight-red {
            color: #D9534F;
            font-weight: bold;
        }
        /* For smaller descriptive text near radio buttons */
        .radio-description {
            font-size: 14px !important; /* Smaller font for descriptions */
            color: gray !important;
            line-height: 1.5 !important;
        }
        </style>
    """
    st.markdown(global_css, unsafe_allow_html=True)

    # Hand control to the page dispatcher.
    instructions_page_manager()