Spaces:
Running
on
Zero
Running
on
Zero
# Task name -> task special token. Spelled out entry by entry because the
# token text does not always mirror the key ("caption" -> "<|task_cap|>",
# "speech_edit" -> "<|task_edit|>").
TASK_TOKEN_MAP = dict(
    vc="<|task_vc|>",
    tts="<|task_tts|>",
    asr="<|task_asr|>",
    s2s="<|task_s2s|>",
    t2s="<|task_t2s|>",
    understand="<|task_understand|>",
    caption="<|task_cap|>",
    controllable_tts="<|task_controllable_tts|>",
    prompt_tts="<|task_prompt_tts|>",
    speech_edit="<|task_edit|>",
)

# Coarse level names, lowest first. A name's position in this tuple is the
# id it gets in LEVELS_MAP, so new names may only be appended.
_LEVEL_NAMES = ("very_low", "low", "moderate", "high", "very_high")

# Level name -> 0-based level id.
LEVELS_MAP = {name: idx for idx, name in enumerate(_LEVEL_NAMES)}

# 1-based number -> level name (presumably the positions of a UI control;
# confirm against the caller).
LEVELS_MAP_UI = dict(enumerate(_LEVEL_NAMES, start=1))

# Gender label -> gender id.
GENDER_MAP = dict(female=0, male=1)

# Age-group label -> age id; ids follow the order of the tuple.
AGE_MAP = {
    name: idx
    for idx, name in enumerate(
        ("Child", "Teenager", "Youth-Adult", "Middle-aged", "Elderly")
    )
}

# Emotion labels in id order (0..24). The position is the emotion id, so
# labels must never be reordered or inserted in the middle.
_EMOTION_NAMES = (
    "UNKNOWN",
    "NEUTRAL",
    "ANGRY",
    "HAPPY",
    "SAD",
    "FEARFUL",
    "DISGUSTED",
    "SURPRISED",
    "SARCASTIC",
    "EXCITED",
    "SLEEPY",
    "CONFUSED",
    "EMPHASIS",
    "LAUGHING",
    "SINGING",
    "WORRIED",
    "WHISPER",
    "ANXIOUS",
    "NO-AGREEMENT",
    "APOLOGETIC",
    "CONCERNED",
    "ENUNCIATED",
    "ASSERTIVE",
    "ENCOURAGING",
    "CONTEMPT",
)

# Emotion label -> emotion id.
EMO_MAP = {name: idx for idx, name in enumerate(_EMOTION_NAMES)}
class TokenParser:
    """Turn labels and numeric attribute values into special-token strings.

    Every method is a static method, so it can be called on the class
    (``TokenParser.age("Child")``) or on an instance. Label-based methods
    look the label up in the module-level maps and raise ``KeyError`` for a
    label that is not present there.
    """

    @staticmethod
    def age(age: str) -> str:
        """Return the age token for an ``AGE_MAP`` label."""
        age_id = AGE_MAP[age]
        return f"<|age_{age_id}|>"

    @staticmethod
    def gender(gender: str) -> str:
        """Return the gender token for a ``GENDER_MAP`` label."""
        gender_id = GENDER_MAP[gender]
        return f"<|gender_{gender_id}|>"

    @staticmethod
    def mel_value(mel: int) -> str:
        """Return the pitch-value token; ``mel`` is truncated to an int and
        clamped to [0, 1000]."""
        mel = min(1000, max(0, int(mel)))
        return f"<|pitch_value_{mel}|>"

    @staticmethod
    def mel_level(level: str) -> str:
        """Return the pitch-label token for a ``LEVELS_MAP`` level name."""
        level_tag = LEVELS_MAP[level]
        return f"<|pitch_label_{level_tag}|>"

    @staticmethod
    def pitch_var_value(pitch_std: int) -> str:
        """Return the pitch-variation value token; clamped to [0, 10].

        Raises:
            AssertionError: if ``pitch_std`` is not an ``int`` (the check is
                skipped when Python runs with ``-O``).
        """
        assert isinstance(pitch_std, int), "pitch_std must be an int"
        pitch_std = min(10, max(0, int(pitch_std)))
        return f"<|pitch_var_value_{pitch_std}|>"

    @staticmethod
    def pitch_var_level(level: str) -> str:
        """Return the pitch-variation label token for a ``LEVELS_MAP`` level name."""
        level_tag = LEVELS_MAP[level]
        return f"<|pitch_var_label_{level_tag}|>"

    @staticmethod
    def loudness_value(loudness: int) -> str:
        """Return the loudness value token; clamped to [0, 30].

        Raises:
            AssertionError: if ``loudness`` is negative (the check is skipped
                when Python runs with ``-O``, in which case the value is
                clamped to 0 instead).
        """
        assert loudness >= 0, "loudness must be non-negative"
        loudness = min(30, max(0, int(loudness)))
        return f"<|loudness_value_{loudness}|>"

    @staticmethod
    def loudness_level(level: str) -> str:
        """Return the loudness label token for a ``LEVELS_MAP`` level name."""
        level_tag = LEVELS_MAP[level]
        return f"<|loudness_label_{level_tag}|>"

    @staticmethod
    def speed_value(speed: int) -> str:
        """Return the speed value token; ``speed`` is truncated to an int and
        clamped to [0, 10]."""
        speed = min(10, max(0, int(speed)))
        return f"<|speed_value_{speed}|>"

    @staticmethod
    def speed_level(level: str) -> str:
        """Return the speed label token for a ``LEVELS_MAP`` level name."""
        level_tag = LEVELS_MAP[level]
        return f"<|speed_label_{level_tag}|>"

    @staticmethod
    def task(task: str) -> str:
        """Return the task token for a ``TASK_TOKEN_MAP`` task name.

        Raises:
            AssertionError: if ``task`` is not a known task name (under
                ``-O`` the lookup raises ``KeyError`` instead).
        """
        assert task in TASK_TOKEN_MAP, f"unknown task: {task!r}"
        return TASK_TOKEN_MAP[task]

    @staticmethod
    def emotion(emotion: str) -> str:
        """Return the emotion token for an ``EMO_MAP`` label."""
        emo_id = EMO_MAP[emotion]
        return f"<|emotion_{emo_id}|>"
# test
if __name__ == "__main__":
    import sys

    from transformers import AutoTokenizer

    # Smoke test: build one token sequence per sample, encode it with the
    # tokenizer and print the ids together with their decoded text.
    # The tokenizer location can be passed as the first command-line
    # argument; without one, the original hard-coded path is used.
    default_tokenizer_path = "/aifs4su/xinshengwang/code/StyleCraft/tokenizer/stylecraft-bicodec-pitch-loudness-speed-emotion-tokenizer"
    tokenizer_path = sys.argv[1] if len(sys.argv) > 1 else default_tokenizer_path
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)

    # Parallel sample columns: element k of every list belongs to sample k.
    tasks = ["tts", "tts", "understand", "controllable_tts", "prompt_tts"]
    ages = ["Child", "Teenager", "Youth-Adult", "Middle-aged", "Elderly"]
    genders = ["female", "female", "female", "male", "male"]
    mels = [100, 200, 300, 400, 500]
    mel_levels = ["very_low", "low", "moderate", "high", "very_high"]
    loudnesses = [1, 10, 23, 19, 30]
    loudness_levels = ["very_low", "low", "moderate", "high", "very_high"]
    emotions = ["UNKNOWN", "NEUTRAL", "ANGRY", "HAPPY", "SAD"]

    samples = zip(
        tasks,
        ages,
        genders,
        mels,
        mel_levels,
        loudnesses,
        loudness_levels,
        emotions,
    )
    for (
        task_name,
        age_label,
        gender_label,
        mel_hz,
        mel_level_name,
        loudness_val,
        loudness_level_name,
        emotion_label,
    ) in samples:
        tokens = [
            TokenParser.task(task_name),
            TokenParser.age(age_label),
            TokenParser.gender(gender_label),
            TokenParser.mel_value(mel_hz),
            TokenParser.mel_level(mel_level_name),
            TokenParser.loudness_value(loudness_val),
            TokenParser.loudness_level(loudness_level_name),
            TokenParser.emotion(emotion_label),
        ]
        inputs = "".join(tokens)
        # add_special_tokens=False is passed exactly as in the original call.
        ids = tokenizer.encode(inputs, add_special_tokens=False)
        print(ids)
        print("decode", tokenizer.decode(ids))