Spaces:
Sleeping
Sleeping
| import json | |
| import numpy as np | |
| import random | |
| import uuid | |
def load_from_jsonl(filename, n=np.inf):
    """Load up to ``n`` JSON records from a JSON Lines file.

    Args:
        filename: Path of the file to read; one JSON document per line.
        n: Maximum number of records to return. The default (``np.inf``)
            and ``None`` both mean "no limit".

    Returns:
        A list with the decoded records, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    limit = np.inf if n is None else n
    data = []
    # JSON text is UTF-8 by specification; do not depend on the locale default.
    with open(filename, 'r', encoding='utf-8') as file:
        for line in file:
            if len(data) >= limit:  # stop once n records have been collected
                break
            if not line.strip():
                # Blank lines (e.g. a trailing newline at end of file) carry
                # no record and would make json.loads raise.
                continue
            data.append(json.loads(line))
    return data
def append_id(conversations_no_id):
    """Return new conversation records, each tagged with a fresh random id.

    Every output record holds exactly two keys: ``'conv_id'`` (the hex form
    of a newly generated UUID4) and ``'transcript'`` (the same transcript
    object as in the corresponding input record). Other input keys are not
    carried over.
    """
    return [
        {'conv_id': uuid.uuid4().hex, 'transcript': entry['transcript']}
        for entry in conversations_no_id
    ]
def save_to_jsonl(data, filename):
    """Write each item of ``data`` to ``filename`` as one JSON document per line.

    The file is opened in write mode, so any existing content is replaced.
    """
    with open(filename, 'w') as out:
        # Serialise lazily so items are encoded one at a time as they are written.
        out.writelines(json.dumps(record) + '\n' for record in data)
def get_conversation(data, min_length=0):
    """Pick a random conversation and a random position inside its transcript.

    Args:
        data: Sequence of records, each with ``'conv_id'`` and
            ``'transcript'`` keys.
        min_length: Smallest index that may be chosen. Only conversations
            whose transcript has more than ``min_length`` entries can be
            selected.

    Returns:
        A dict with ``'conv_id'``, ``'slice_idx'`` (the chosen index, between
        ``min_length`` and ``len(transcript) - 1`` inclusive) and
        ``'transcript'`` (the transcript element at that index).

    Raises:
        IndexError: If ``data`` is empty.
        ValueError: If no conversation has more than ``min_length`` entries.
    """
    conv = random.choice(data)
    if len(conv['transcript']) <= min_length:
        # The first draw is too short for randint's range. Re-draw among the
        # conversations that are long enough instead of failing at random;
        # the overall pick stays uniform over the eligible conversations.
        eligible = [c for c in data if len(c['transcript']) > min_length]
        if not eligible:
            raise ValueError(
                'no conversation has more than %d transcript entries' % min_length
            )
        conv = random.choice(eligible)
    transcript = conv['transcript']
    slice_index = random.randint(min_length, len(transcript) - 1)
    # NOTE(review): this returns the single element at slice_index, not a
    # prefix/slice of the transcript as the names suggest - confirm against
    # the data layout and callers before changing.
    conv_slice = transcript[slice_index]
    return {
        'conv_id': conv['conv_id'],
        'slice_idx': slice_index,
        'transcript': conv_slice,
    }
| # def pad_transcript(transcript, max_length): | |
| # padding_count = max_length - len(transcript) | |
| # if padding_count > 0: | |
| # for _ in range(padding_count): | |
| # transcript.append({'speaker': '', 'response': ''}) | |
| # return transcript | |
def get_last_response(transcript):
    """Return the response of the latest turn with a speaker and a response.

    Turns are scanned from the end of ``transcript``; turns whose
    ``'speaker'`` or ``'response'`` is falsy (e.g. empty strings) are
    skipped. Returns ``None`` when no turn qualifies.
    """
    populated = (
        entry['response']
        for entry in reversed(transcript)
        if entry['speaker'] and entry['response']
    )
    return next(populated, None)