from transformers.configuration_utils import PretrainedConfig
from transformers.models.auto import CONFIG_MAPPING


class HiggsAudioEncoderConfig(PretrainedConfig):
| """Configuration of the Audio encoder in Higgs-Audio.""" | |

    model_type = "higgs_audio_encoder"

    def __init__(
        self,
        num_mel_bins=128,
        encoder_layers=32,
        encoder_attention_heads=20,
        encoder_ffn_dim=5120,
        encoder_layerdrop=0.0,
        d_model=1280,
        dropout=0.0,
        attention_dropout=0.0,
        activation_function="gelu",
        activation_dropout=0.0,
        scale_embedding=False,
        init_std=0.02,
        max_source_positions=1500,
        pad_token_id=128001,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.num_mel_bins = num_mel_bins
        self.d_model = d_model
        self.encoder_layers = encoder_layers
        self.encoder_attention_heads = encoder_attention_heads
        self.encoder_ffn_dim = encoder_ffn_dim
        self.dropout = dropout
        self.attention_dropout = attention_dropout
        self.activation_function = activation_function
        self.activation_dropout = activation_dropout
        self.encoder_layerdrop = encoder_layerdrop
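        # Expose the layer count under the generic name as well, since much of the
        # transformers ecosystem reads `num_hidden_layers` from configs.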
        self.num_hidden_layers = encoder_layers
        self.init_std = init_std
        self.scale_embedding = scale_embedding  # scale factor will be sqrt(d_model) if True
        self.max_source_positions = max_source_positions
        self.pad_token_id = pad_token_id


class HiggsAudioConfig(PretrainedConfig):
    r"""
    This is the configuration class for the HiggsAudioModel.

    Args:
        text_config (`Union[AutoConfig, dict]`):
            The config object or dictionary of the text backbone.
        audio_encoder_config (`Union[AutoConfig, dict]`):
            The config object or dictionary of the Whisper encoder.
            The audio encoder is bidirectional and is used only for audio understanding.
        audio_tokenizer_config (`Union[AutoConfig, dict]`, *optional*):
            The config object or dictionary of the audio tokenizer.
        audio_adapter_type (`str`, *optional*, defaults to `"stack"`):
            The type of audio adapter to use. We support three types of adapter (see the example below):
            - stack:
                We stack additional Transformer layers after the main LLM backbone for audio generation.
            - dual_ffn:
                For selected layers of the LLM backbone, we replace the text FFN with a dual FFN architecture
                that contains an additional audio FFN. The audio FFN is triggered when the location is marked
                for audio tokens.
            - dual_ffn_fast_forward:
                We pick a few layers in the LLM backbone to plug in the audio FFN. For the remaining layers,
                the audio hidden states are directly fast-forwarded to the next layer.
                This reduces the computational cost of audio generation.
        audio_embed_avg (`bool`, *optional*, defaults to `False`):
            Whether to average the audio embeddings before sending them to the text attention layer.
        audio_ffn_hidden_size (`int`, *optional*, defaults to 4096):
            The hidden size of the audio feedforward network in the dual-path FFN.
        audio_ffn_intermediate_size (`int`, *optional*, defaults to 14336):
            The intermediate size of the audio feedforward network in the dual-path FFN.
        audio_dual_ffn_layers (`List[int]`, *optional*):
            The indices of the layers in the LLM backbone where the dual FFN layer
            (mixture of audio FFN and text FFN) is plugged in.
        audio_decoder_proj_num_layers (`int`, *optional*, defaults to 0):
            The number of layers in the audio decoder projection.
        use_delay_pattern (`bool`, *optional*, defaults to `False`):
            Whether to use the delay pattern in the audio decoder.
        skip_audio_tower (`bool`, *optional*, defaults to `False`):
            Whether to skip the audio tower in the audio encoder.
        use_audio_out_embed_projector (`bool`, *optional*, defaults to `False`):
            Whether to use an embedding projector to map audio-out embeddings.
        use_audio_out_self_attention (`bool`, *optional*, defaults to `False`):
            Whether to use self-attention to aggregate information from audio tokens before sending them to
            the text attention layer.
        audio_num_codebooks (`int`, *optional*, defaults to 12):
            The number of codebooks in RVQGAN.
        audio_codebook_size (`int`, *optional*, defaults to 1024):
            The size of each codebook in RVQGAN.
        audio_stream_bos_id (`int`, *optional*, defaults to 1024):
            The id of the bos token in the audio stream.
        audio_stream_eos_id (`int`, *optional*, defaults to 1025):
            The id of the eos token in the audio stream.
        audio_bos_token (`str`, *optional*, defaults to `"<|audio_bos|>"`):
            The special `<|audio_bos|>` token. In Higgs-Audio, it is mapped to 128011,
            which is the index of `<|reserved_special_token_3|>` in Llama-3.1-8B-Instruct's tokenizer.
        audio_eos_token (`str`, *optional*, defaults to `"<|audio_eos|>"`):
            The special `<|audio_eos|>` token. We use 128012 as the default value,
            which is the index of `<|reserved_special_token_4|>` in Llama-3.1-8B-Instruct's tokenizer.
        audio_out_bos_token (`str`, *optional*, defaults to `"<|audio_out_bos|>"`):
            The special `<|audio_out_bos|>` token. We use 128013 as the default value,
            which is the index of `<|reserved_special_token_5|>` in Llama-3.1-8B-Instruct's tokenizer.
        audio_in_token (`str`, *optional*, defaults to `"<|AUDIO|>"`):
            The special `<|AUDIO|>` token. We use 128015 as the default value,
            which is the index of `<|reserved_special_token_7|>` in Llama-3.1-8B-Instruct's tokenizer.
            This token indicates that the location should be filled in with Whisper features.
        audio_out_token (`str`, *optional*, defaults to `"<|AUDIO_OUT|>"`):
            The special `<|AUDIO_OUT|>` token. We use 128016 as the default value,
            which is the index of `<|reserved_special_token_8|>` in Llama-3.1-8B-Instruct's tokenizer.
            This token indicates that the location should be filled in with audio tokens extracted via the
            audio tokenizer.
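
    Example:

    ```python
    >>> # A minimal sketch: defaults give a Llama text backbone with the "stack" adapter
    >>> configuration = HiggsAudioConfig()

    >>> # Dual-FFN adapter; the layer indices here are illustrative, not canonical
    >>> configuration = HiggsAudioConfig(
    ...     audio_adapter_type="dual_ffn",
    ...     audio_dual_ffn_layers=[0, 1, 2, 3],
    ... )
    ```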
| """ | |
| model_type = "higgs_audio" | |
| is_composition = True | |

    def __init__(
        self,
        text_config=None,
        audio_encoder_config=None,
        audio_tokenizer_config=None,
        audio_adapter_type="stack",
        audio_embed_avg=False,
        audio_ffn_hidden_size=4096,
        audio_ffn_intermediate_size=14336,
        audio_dual_ffn_layers=None,
        audio_decoder_proj_num_layers=0,
        encode_whisper_embed=True,
        encode_audio_in_tokens=False,
        use_delay_pattern=False,
        skip_audio_tower=False,
        use_audio_out_embed_projector=False,
        use_audio_out_self_attention=False,
        use_rq_transformer=False,
        rq_transformer_hidden_size=None,
        rq_transformer_intermediate_size=None,
        rq_transformer_num_attention_heads=None,
        rq_transformer_num_key_value_heads=None,
        rq_transformer_num_hidden_layers=3,
        audio_num_codebooks=12,
        audio_codebook_size=1024,
        audio_stream_bos_id=1024,
        audio_stream_eos_id=1025,
        audio_bos_token="<|audio_bos|>",
        audio_eos_token="<|audio_eos|>",
        audio_out_bos_token="<|audio_out_bos|>",
        audio_in_token="<|AUDIO|>",
        audio_out_token="<|AUDIO_OUT|>",
        audio_in_token_idx=128015,
        audio_out_token_idx=128016,
        pad_token_id=128001,
        audio_out_bos_token_id=128013,
        audio_eos_token_id=128012,
        **kwargs,
    ):
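        # Sub-configs may arrive as plain dicts (e.g. when loading a serialized
        # config); resolve them to config objects through CONFIG_MAPPING.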
        if isinstance(audio_encoder_config, dict):
            audio_encoder_config["model_type"] = audio_encoder_config.get("model_type", "higgs_audio_encoder")
            audio_encoder_config = CONFIG_MAPPING[audio_encoder_config["model_type"]](**audio_encoder_config)
        elif audio_encoder_config is None:
            audio_encoder_config = HiggsAudioEncoderConfig()

        if isinstance(text_config, dict):
            text_config["model_type"] = text_config.get("model_type", "llama")
            text_config = CONFIG_MAPPING[text_config["model_type"]](**text_config)
        elif text_config is None:
            text_config = CONFIG_MAPPING["llama"]()

        assert audio_adapter_type in [
            "stack",
            "dual_ffn",
            "dual_ffn_fast_forward",
        ], f"Invalid audio adapter type: {audio_adapter_type}"
        if audio_adapter_type.startswith("dual_ffn"):
            assert audio_dual_ffn_layers is not None, (
                "audio_dual_ffn_layers must be specified when using a dual_ffn-based adapter."
            )

        self.text_config = text_config
        self.audio_encoder_config = audio_encoder_config
        self.audio_tokenizer_config = audio_tokenizer_config
        self.audio_adapter_type = audio_adapter_type
        self.audio_embed_avg = audio_embed_avg
        self.audio_ffn_hidden_size = audio_ffn_hidden_size
        self.audio_ffn_intermediate_size = audio_ffn_intermediate_size
        self.audio_dual_ffn_layers = audio_dual_ffn_layers
        self.audio_decoder_proj_num_layers = audio_decoder_proj_num_layers
        self.encode_whisper_embed = encode_whisper_embed
        self.encode_audio_in_tokens = encode_audio_in_tokens
        self.use_delay_pattern = use_delay_pattern
        self.skip_audio_tower = skip_audio_tower
        self.use_audio_out_embed_projector = use_audio_out_embed_projector
        self.use_audio_out_self_attention = use_audio_out_self_attention
        self.use_rq_transformer = use_rq_transformer
        if self.use_rq_transformer:
            assert not self.use_delay_pattern, "Delay pattern is not supported if you turned on RQ-Transformer!"
        self.rq_transformer_hidden_size = rq_transformer_hidden_size
        self.rq_transformer_intermediate_size = rq_transformer_intermediate_size
        self.rq_transformer_num_attention_heads = rq_transformer_num_attention_heads
        self.rq_transformer_num_key_value_heads = rq_transformer_num_key_value_heads
        self.rq_transformer_num_hidden_layers = rq_transformer_num_hidden_layers
        if use_rq_transformer:
            # For RQ-Transformer, we set the hidden_size to the same as the text model's
            # hidden size if it is not specified.
            if self.rq_transformer_hidden_size is None:
                self.rq_transformer_hidden_size = text_config.hidden_size
                assert self.rq_transformer_hidden_size % 128 == 0
            if self.rq_transformer_intermediate_size is None:
                self.rq_transformer_intermediate_size = text_config.intermediate_size
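            # Assumption behind the defaults below: 128-dim attention heads with a
            # 4x grouped-query ratio (one KV head per four query heads).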
            if self.rq_transformer_num_attention_heads is None:
                self.rq_transformer_num_attention_heads = self.rq_transformer_hidden_size // 128
            if self.rq_transformer_num_key_value_heads is None:
                self.rq_transformer_num_key_value_heads = self.rq_transformer_hidden_size // 128 // 4
            assert self.rq_transformer_hidden_size % self.rq_transformer_num_attention_heads == 0
            assert self.rq_transformer_hidden_size % self.rq_transformer_num_key_value_heads == 0

        self.audio_num_codebooks = audio_num_codebooks
        self.audio_codebook_size = audio_codebook_size
        self.audio_bos_token = audio_bos_token
        self.audio_eos_token = audio_eos_token
        self.audio_out_bos_token = audio_out_bos_token
        self.audio_in_token = audio_in_token
        self.audio_out_token = audio_out_token
        self.audio_in_token_idx = audio_in_token_idx
        self.audio_out_token_idx = audio_out_token_idx
        self.audio_stream_bos_id = audio_stream_bos_id
        self.audio_stream_eos_id = audio_stream_eos_id
        self.audio_out_bos_token_id = audio_out_bos_token_id
        self.audio_eos_token_id = audio_eos_token_id
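        # `super().__init__` runs last; assigning `pad_token_id` afterwards lets the
        # explicit argument take precedence over any value carried in `kwargs`.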
        super().__init__(**kwargs)
        self.pad_token_id = pad_token_id
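

if __name__ == "__main__":
    # Minimal usage sketch. The `audio_dual_ffn_layers` indices are illustrative
    # placeholders, not canonical values from a released checkpoint.
    encoder_config = HiggsAudioEncoderConfig()
    print(encoder_config.model_type, encoder_config.d_model)  # higgs_audio_encoder 1280

    config = HiggsAudioConfig(
        audio_adapter_type="dual_ffn",
        audio_dual_ffn_layers=[0, 1, 2, 3],  # hypothetical layer indices
    )
    print(config.model_type, config.audio_adapter_type)  # higgs_audio dual_ffn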