# csv_to_bimisc.py
# One-pass converter: dataset CSV -> rolling-history BiMISC-style JSONL
# Usage:
#   python csv_to_bimisc.py --in dataset/test.csv --out dataset/converted_conversations/bimisc_pretest.jsonl --history 6
#
# Notes:
# - Works with your current train/valid/test schema (conv_id/utterance_idx/speaker_idx/utterance/...).
# - If the CSV lacks conv_id, everything becomes a single conversation.
# - Strips leading speaker prefixes: "User:", "Bot:", "Client:", "Therapist:",
#   numeric "1:"/"2:", and bracketed/parenthesized variants such as "[user]" or "(bot)".
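#
# Illustrative example (hypothetical rows; real files carry more columns):
#   conv_id,utterance_idx,speaker_idx,utterance
#   hit:0,1,0,i was promoted_comma_ finally
#   hit:0,2,1,congrats! how do you feel?
# produces JSONL items such as:
#   {"history": [], "utterance_role": "Client",
#    "utterance_text": "i was promoted, finally"}
#   {"history": [{"role": "Client", "text": "i was promoted, finally"}],
#    "utterance_role": "Therapist", "utterance_text": "congrats! how do you feel?"}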

from __future__ import annotations
import argparse
import json
import re
from pathlib import Path
from typing import Dict, Any, List, Tuple, Iterable

import pandas as pd

REPO_ROOT = Path(__file__).resolve().parents[1]
# Default paths, used when --in/--out are not given on the command line.
IN_PATH = REPO_ROOT / "data" / "psychologist" / "test.csv"
OUT_PATH = REPO_ROOT / "data" / "psychologist" / "pre_annotate.jsonl"

# ----------------------------
# I/O args
# ----------------------------
def parse_args():
    ap = argparse.ArgumentParser()
    ap.add_argument("--in", dest="in_path", type=str,
                    default=str(IN_PATH), help="Input CSV path")
    ap.add_argument("--out", dest="out_path", type=str,
                    default=str(OUT_PATH), help="Output JSONL path")
    ap.add_argument("--history", dest="history_window", type=int,
                    default=6, help="Rolling history window size")
    return ap.parse_args()


# ----------------------------
# Loaders (from dataset_to_jsonl.py semantics)
# ----------------------------
def load_train_valid(path: Path) -> pd.DataFrame:
    # Standard CSV loader with tolerant parsing
    return pd.read_csv(path, engine="python", on_bad_lines="skip", encoding="utf-8")

def load_test_like(path: Path) -> pd.DataFrame:
    # Quirky loader for test.csv with messy commas (same heuristic from your script)
    lines = Path(path).read_text(encoding="utf-8", errors="replace").splitlines()
    if not lines:
        return pd.DataFrame()
    header = lines[0].split(",")
    rows, buf = [], ""
    for line in lines[1:]:
        buf = line if not buf else f"{buf} {line}"
        parts = buf.split(",")
        if len(parts) >= 8:
            fixed = parts[:7] + [",".join(parts[7:])]
            rows.append(fixed)
            buf = ""
    cols = header[:8] if len(header) >= 8 else [f"c{i}" for i in range(8)]
    return pd.DataFrame(rows, columns=cols)
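
# Illustrative repair (hypothetical line): rows are buffered until at least 8
# comma-separated fields appear, and everything past the 7th field is re-joined
# so commas inside the final free-text column survive, e.g.
#   "hit:0,1,0,5,sad,a,b,i was sad, then relieved"
#   -> [..., "b", "i was sad, then relieved"]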

def smart_load_csv(path: Path) -> pd.DataFrame:
    # If file name contains "test", use the special loader; else use standard
    name = path.name.lower()
    if "test" in name:
        return load_test_like(path)
    return load_train_valid(path)

# ----------------------------
# Cleaning (from dataset_to_jsonl.py)
# ----------------------------
def clean_text(df: pd.DataFrame) -> pd.DataFrame:
    if df.empty:
        return df
    for col in ["prompt","utterance","tags","context"]:
        if col in df.columns:
            df[col] = (df[col].astype(str)
                               .str.replace("_comma_", ",", regex=False)
                               .str.replace("\r"," ", regex=False)
                               .str.replace("\n"," ", regex=False)
                               .str.strip())
    for col in ["utterance_idx","speaker_idx"]:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors="coerce").astype("Int64")
    return df
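
# Example: a raw cell like "i got promoted_comma_ i'm thrilled\r\n" becomes
# "i got promoted, i'm thrilled" (illustrative value; "_comma_" is how the
# source CSVs escape literal commas inside fields).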

# ----------------------------
# Conversation assembler (from dataset_to_jsonl.py)
# ----------------------------
def _ensure_conv_id(df: pd.DataFrame) -> pd.DataFrame:
    cand_cols = ["conv_id","conversation_id","dialogue_id","episode_id","episode_idx"]
    found = next((c for c in cand_cols if c in df.columns), None)
    if found:
        return df.rename(columns={found: "conv_id"})
    df = df.copy()
    df["conv_id"] = 0
    return df

def transcript_from_conv(df_conv: pd.DataFrame) -> str:
    parts = []
    has_speaker = "speaker_idx" in df_conv.columns
    if "utterance_idx" in df_conv.columns:
        df_conv = df_conv.sort_values("utterance_idx", na_position="first")
    for _, r in df_conv.iterrows():
        # speaker_idx == 0 is the user; anything else (or a missing column) is the bot
        who = "User" if (has_speaker and r.get("speaker_idx", 0) == 0) else "Bot"
        utt = str(r.get("utterance", "")).strip()
        parts.append(f"{who}: {utt}")
    return "\n".join(parts)

def build_conversation_only(df: pd.DataFrame) -> pd.DataFrame:
    df = _ensure_conv_id(df)
    keep_cols = ["conv_id","utterance_idx","speaker_idx","utterance","context","prompt"]
    df2 = df[[c for c in keep_cols if c in df.columns]].copy()
    # Sort by whichever ordering columns exist (utterance_idx may be absent).
    sort_cols = [c for c in ("conv_id", "utterance_idx") if c in df2.columns]
    df2 = df2.sort_values(sort_cols)
    out_rows = []
    for conv_id, g in df2.groupby("conv_id"):
        conv_text = transcript_from_conv(g)
        out = {
            "conv_id": conv_id,
            "conversation": conv_text,
            "context": g["context"].iloc[0] if "context" in g.columns else None,
            "prompt":  g["prompt"].iloc[0]  if "prompt"  in g.columns else None,
        }
        out_rows.append(out)
    return pd.DataFrame(out_rows)
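
# Each output row collapses one conversation (hypothetical values):
#   {"conv_id": "hit:0", "conversation": "User: ...\nBot: ...",
#    "context": "proud", "prompt": "i finally got promoted"}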

# ----------------------------
# Prefix stripping + turn parsing (from jsonl_to_proper.py)
# ----------------------------
# Leading speaker prefixes. A separator (":", ")", "]", or "-") is required
# after bare role names and numeric ids so that ordinary sentences starting
# with a role word or a number ("User testing went well", "12 sessions")
# are left untouched.
PREFIX_RE = re.compile(
    r"""^\s*
        (?:
          (?:user|bot|client|therapist|[12])\s*[:\)\]\-]   # bare roles / numeric ids: separator required
          |\[(?:user|bot|client|therapist)\]\s*:?          # bracketed roles
          |\((?:user|bot|client|therapist)\)\s*:?          # parenthesized roles
        )
        \s*
    """,
    re.IGNORECASE | re.VERBOSE,
)

def _strip_prefix(text: str) -> str:
    return PREFIX_RE.sub("", text).strip()
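
# Stripping examples (hypothetical inputs):
#   "User: i feel stuck"  -> "i feel stuck"
#   "[therapist] go on"   -> "go on"
#   "2) it helped a lot"  -> "it helped a lot"
#   "12 sessions so far"  -> "12 sessions so far"  (no separator, left intact)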

def _split_lines(conv_text: str) -> List[str]:
    return [ln.strip() for ln in re.split(r"\r?\n+", conv_text.strip()) if ln.strip()]

def parse_turns(conv_text: str) -> List[Tuple[str, str]]:
    # Roles are assigned by strict alternation (even lines -> Client, odd
    # lines -> Therapist); this assumes each conversation alternates
    # speakers and opens with the client.
    lines = _split_lines(conv_text)
    turns: List[Tuple[str, str]] = []
    for i, ln in enumerate(lines):
        clean = _strip_prefix(ln)
        if not clean:
            continue
        role = "Client" if i % 2 == 0 else "Therapist"
        turns.append((role, clean))
    return turns

def yield_items(turns: List[Tuple[str, str]], history_window: int = 6) -> Iterable[Dict[str, Any]]:
    for i, (role, text) in enumerate(turns):
        hist = turns[max(0, i - history_window):i]
        yield {
            "history": [{"role": r, "text": t} for r, t in hist],
            "utterance_role": role,      # "Client" or "Therapist"
            "utterance_text": text,
        }
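
# With history_window=2, the third turn (index 2) would yield, e.g.:
#   {"history": [{"role": "Client", "text": "..."},
#                {"role": "Therapist", "text": "..."}],
#    "utterance_role": "Client", "utterance_text": "..."}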

# ----------------------------
# End-to-end
# ----------------------------
def main():
    # Honor the CLI arguments (previously parse_args was defined but never
    # called, so --in/--out/--history were silently ignored).
    args = parse_args()
    in_path = Path(args.in_path)
    out_path = Path(args.out_path)
    out_path.parent.mkdir(parents=True, exist_ok=True)

    df = smart_load_csv(in_path)
    df = clean_text(df)
    conv_df = build_conversation_only(df)

    written = 0
    with out_path.open("w", encoding="utf-8") as fout:
        for _, row in conv_df.iterrows():
            conv_text = (row.get("conversation") or "").strip()
            if not conv_text:
                continue
            turns = parse_turns(conv_text)
            for item in yield_items(turns, history_window=args.history_window):
                fout.write(json.dumps(item, ensure_ascii=False) + "\n")
                written += 1

    print(f"{in_path} -> {out_path} | wrote {written} items")

if __name__ == "__main__":
    main()