Update chatNT.py #3
by Yanisadel · opened

chatNT.py CHANGED
@@ -28,7 +28,6 @@ class RotaryEmbeddingConfig:
 class PerceiverResamplerConfig:
     """
     Parameters to initialize an PerceiverResampler model.
-
     Args:
         emb_layer_norm_before: Whether to use layer norm before the first attention
             layer.
@@ -93,9 +92,7 @@ class PerceiverResamplerConfig:
 class GptConfig:
     """
     Parameters to initialize a Gpt model.
-
     NOTE: the pad token is not defined
-
     Args:
         vocab_size: Token vocabulary.
         eos_token_id: used to stop sentence generation
@@ -191,7 +188,6 @@ class GptConfig:
 class NucleotideTransformerConfig:
     """
     Parameters to initialize an NT model.
-
     Args:
         alphabet_size: Token vocabulary.
         pad_token_id: ID of pad token.
@@ -364,21 +360,20 @@ class ChatNTConfig(PretrainedConfig):
         return output


-class ChatNTDecoder(nn.Module):
+class TorchBioBrainDecoder(nn.Module):
     def __init__(
         self,
         gpt_config: GptConfig,
         seq_token_id: int,
     ):
         """
-        Initializes the [...]
+        Initializes the BioBrain decoder, using a GPT model for text generation with
         bio embeddings.
-
         Args:
             gpt_config: Configuration for the GPT model
             seq_token_id: Index of the SEQ token
         """
-        super(ChatNTDecoder, self).__init__()
+        super(TorchBioBrainDecoder, self).__init__()
         self.gpt_config = gpt_config
         self.seq_token_id = seq_token_id

@@ -390,13 +385,11 @@ class ChatNTDecoder(nn.Module):
     ) -> torch.Tensor:
         """
         Forward pass through the model.
-
         Args:
             english_token_ids: Tensor of English token IDs with shape
                 (batch_size, num_english_tokens).
             projected_bio_embeddings: Optional tensor of bio embeddings with shape
                 (batch_size, num_bio_sequences, ?, embed_dim).
-
         Returns:
             torch.Tensor: The logits from the GPT model,
                 shaped (batch_size, num_english_tokens, vocab_size).
@@ -452,13 +445,11 @@ class ChatNTDecoder(nn.Module):
     ) -> Tuple[torch.Tensor, torch.Tensor]:
         """
         Inserts resampled embeddings in input_embeddings, starting at the SEQ token
-
         Args:
             tokens (torch.Tensor): Shape (batch_size, num_tokens)
             input_embeddings (torch.Tensor): Shape (batch_size, num_tokens, embed_dim)
             resampled_embeddings (torch.Tensor):
                 Shape (batch_size, num_bio_sequences, bio_sequence_length, embed_dim)
-
         Returns:
             Tuple[torch.Tensor, torch.Tensor]:
                 - input_embeddings with resampled_embeddings inserted at the SEQ token
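For intuition about this insertion step, here is a minimal standalone sketch, assuming exactly one SEQ token per row and a single bio sequence; it is a hypothetical helper for reading purposes, not the code from this PR:

# Hypothetical sketch: splice resampled embeddings in at the SEQ token position.
import torch

def insert_at_seq_token(
    tokens: torch.Tensor,                # (batch_size, num_tokens)
    input_embeddings: torch.Tensor,      # (batch_size, num_tokens, embed_dim)
    resampled_embeddings: torch.Tensor,  # (batch_size, resampled_length, embed_dim)
    seq_token_id: int,
) -> torch.Tensor:
    rows = []
    for b in range(tokens.shape[0]):
        pos = int((tokens[b] == seq_token_id).nonzero()[0])  # first SEQ position
        rows.append(
            torch.cat(
                (
                    input_embeddings[b, :pos],
                    resampled_embeddings[b],
                    input_embeddings[b, pos + 1 :],
                ),
                dim=0,
            )
        )
    # (batch_size, num_tokens - 1 + resampled_length, embed_dim)
    return torch.stack(rows, dim=0)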
@@ -521,11 +512,9 @@ class ChatNTDecoder(nn.Module):
     ) -> Tuple[torch.Tensor, torch.Tensor]:
         """
         Removes the logits corresponding to the unused embeddings.
-
         Args:
             tokens: Input english tokens.
             logits: Input logits.
-
         Returns:
             Cleaned logits, last values will be equal to 0.
         """
@@ -582,7 +571,7 @@ class ChatNTDecoder(nn.Module):
         return logits_acc, tokens_acc


-class ChatNT(PreTrainedModel):
+class TorchMultiOmicsModel(PreTrainedModel):
     config_class = ChatNTConfig

     def __init__(self, config: ChatNTConfig) -> None:
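Because the renamed class still subclasses PreTrainedModel and keeps config_class = ChatNTConfig, loading through transformers with remote code should be unaffected. A hedged usage sketch; the repo id is an assumption, so check the model card for the exact id and input preparation:

# Usage sketch: load the custom architecture defined in chatNT.py.
from transformers import AutoModel

model = AutoModel.from_pretrained(
    "InstaDeepAI/ChatNT",    # assumed repo id
    trust_remote_code=True,  # needed so transformers imports chatNT.py from the repo
)
print(type(model).__name__)  # expected to be the class registered for ChatNTConfig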
@@ -625,11 +614,11 @@ class ChatNT(PreTrainedModel):
         # Correct seq_token_id
         self.seq_token_id -= 1

-        self.[...]
-        self.[...]
+        self.biobrain_encoder = TorchBioBrainEncoder(nt_config=self.nt_config)
+        self.biobrain_decoder = TorchBioBrainDecoder(
             gpt_config=self.gpt_config, seq_token_id=self.seq_token_id
         )
-        self.projection_model = MultiModalPerceiverResamplerProjection(
+        self.projection_model = TorchMultiModalPerceiverResamplerProjection(
             perceiver_resampler_config=self.perceiver_resampler_config,
             input_embed_dim=self.nt_config.embed_dim,
             embed_dim=self.gpt_config.embed_dim,
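These three submodules define the model's overall dataflow: encode bio tokens, project and resample them into the GPT embedding space, then decode them together with the English prompt. A schematic sketch of that composition with simplified duck-typed arguments; a reading aid, not the PR code:

# Schematic of the encoder -> projection -> decoder composition (hypothetical).
import torch

def chatnt_dataflow_sketch(encoder, projection, decoder,
                           english_tokens: torch.Tensor,
                           bio_tokens: torch.Tensor) -> torch.Tensor:
    num_bio_sequences = bio_tokens.shape[1]
    bio_embeddings = [
        encoder(bio_token_ids=bio_tokens[:, i]) for i in range(num_bio_sequences)
    ]
    projected = torch.stack(
        [
            projection(
                bio_token_ids=bio_tokens[:, i],
                bio_embeddings=bio_embeddings[i],
                english_token_ids=english_tokens,
            )
            for i in range(num_bio_sequences)
        ],
        dim=1,
    )
    return decoder(english_token_ids=english_tokens,
                   projected_bio_embeddings=projected)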
@@ -645,27 +634,21 @@ class ChatNT(PreTrainedModel):
         projected_bio_embeddings: torch.Tensor = None,
     ) -> dict[str, torch.Tensor]:
         """
-
         Args:
             multi_omics_tokens_ids (Tuple[torch.Tensor, torch.Tensor]):
                 english_tokens_ids: Represents the prompt tokens (english tokens)
                     Shape (batch_size, num_english_tokens)
-
                 bio_tokens_ids: Represents the bio sequences tokens
                     Shape (batch_size, num_bio_sequences, num_bio_tokens)
-
             projection_english_tokens_ids (torch.Tensor):
                 Shape (batch_size, num_english_tokens)
-
             projected_bio_embeddings (projected_bio_embeddings, optional):
                 Shape (batch_size, num_bio_sequencse, ?, embed_dim).
                 Defaults to None.
-
         Returns:
             dict[str, torch.Tensor] containing:
                 - logits:
                     Shape (batch_size, num_tokens, vocab_size)
-
                 - projected_bio_embeddings:
                     Shape (batch_size, num_bio_sequences, ?, embed_dim)
         """
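For reference, the documented shapes can be exercised with dummy tensors; all sizes below are arbitrary placeholders and the commented-out call is only indicative:

# Dummy inputs matching the documented shapes (arbitrary sizes, illustration only).
import torch

batch_size, num_english_tokens = 2, 16
num_bio_sequences, num_bio_tokens = 1, 128

english_tokens_ids = torch.randint(0, 100, (batch_size, num_english_tokens))
bio_tokens_ids = torch.randint(0, 10, (batch_size, num_bio_sequences, num_bio_tokens))
multi_omics_tokens_ids = (english_tokens_ids, bio_tokens_ids)
projection_english_tokens_ids = english_tokens_ids.clone()

# outs = model(multi_omics_tokens_ids, projection_english_tokens_ids)
# outs["logits"]: (batch_size, num_tokens, vocab_size)
# outs["projected_bio_embeddings"]: (batch_size, num_bio_sequences, ?, embed_dim)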
@@ -702,7 +685,7 @@ class ChatNT(PreTrainedModel):
         if projected_bio_embeddings is None:
             # Compute bio sequences embeddings
             bio_embeddings_list = [
-                self.[...]
+                self.biobrain_encoder(bio_token_ids=bio_token_ids[:, bio_seq_num])
                 for bio_seq_num in range(num_bio_sequences)
             ]

@@ -718,7 +701,7 @@ class ChatNT(PreTrainedModel):
         projected_bio_embeddings = torch.stack(projected_bio_embeddings, dim=1)

         # decode
-        logits = self.[...]
+        logits = self.biobrain_decoder(
             english_token_ids=english_token_ids,
             projected_bio_embeddings=projected_bio_embeddings,
         )
@@ -741,7 +724,6 @@ class TorchRotaryEmbedding(torch.nn.Module):
     def _create_sinusoidal_positions(self, device: torch.device) -> torch.Tensor:
         """
         Create the sines and cosines for the RoPE.
-
         Returns:
             Sinusoidal positions of shape (self.max_seq_len, self.dim).
         """
@@ -774,11 +756,9 @@ class TorchRotaryEmbedding(torch.nn.Module):
     def _rotate_every_two(self, x: torch.Tensor) -> torch.Tensor:
         """
         Prepare a tensor to apply the RoPE mechanism.
-
         Args:
             x: Tensor of shape (batch_size, seq_len, num_heads, head_dim),
                 typically this is the key or query tensor.
-
         Returns:
             The even indices in the last dimension have their sign flipped.
             Tensor of shape (batch_size, seq_len, num_heads, head_dim).
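For readers unfamiliar with RoPE, this is the usual rotate-every-two operation that such a docstring describes; a generic illustrative helper, not necessarily byte-for-byte what this file implements:

# Generic rotate-every-two helper used by RoPE implementations (illustrative).
import torch

def rotate_every_two(x: torch.Tensor) -> torch.Tensor:
    # x: (batch_size, seq_len, num_heads, head_dim) with an even head_dim
    x1 = x[..., ::2]   # even positions of the last dimension
    x2 = x[..., 1::2]  # odd positions of the last dimension
    rotated = torch.stack((-x2, x1), dim=-1)  # each pair (a, b) becomes (-b, a)
    return rotated.flatten(-2)                # back to (..., head_dim)

x = torch.randn(1, 4, 2, 8)
assert rotate_every_two(x).shape == x.shape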
@@ -795,12 +775,10 @@ class TorchRotaryEmbedding(torch.nn.Module):
     ) -> torch.Tensor:
         """
         Applies rotary embeddings to x.
-
         Args:
             x: Tensor of shape (batch_size, seq_len, num_heads, head_dim),
                 typically this is the key or query tensor.
             sincos: Tuple of sine and cosine tensors for position encoding.
-
         Returns:
             RoPE embeddings tensor.
         """
@@ -818,12 +796,10 @@ class TorchRotaryEmbedding(torch.nn.Module):
     ) -> tuple[torch.Tensor, torch.Tensor]:
         """
         Applies rotary embeddings to k and q.
-
         Args:
             k: key tensor of shape (batch_size, seq_len, num_heads, head_dim),
             q: value tensor of shape (batch_size, seq_len, num_heads, head_dim),
             positions: optional positions offset useful when caching,
-
         Returns:
             RoPE embeddings for the keys and values.
         """
@@ -1141,11 +1117,9 @@ def build_causal_attention_mask(
     """
     Builds a batch of causal masks of shape (batch_size, 1, seq_len, seq_len) to feed
     to an attention layer.
-
     Args:
         batch_size: Batch size.
         seq_len: Length of the sequences.
-
     Returns:
         Batch of causal masks.
     """
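A minimal equivalent of such a causal-mask builder, assuming 1.0 marks positions that may be attended to; illustrative only, not the PR's exact code:

# Illustrative causal mask builder: (batch_size, 1, seq_len, seq_len), 1.0 = attend.
import torch

def causal_mask(batch_size: int, seq_len: int) -> torch.Tensor:
    mask = torch.tril(torch.ones((seq_len, seq_len)))  # lower-triangular matrix
    return mask[None, None, :, :].repeat(batch_size, 1, 1, 1)

print(causal_mask(2, 4).shape)  # torch.Size([2, 1, 4, 4])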
@@ -1498,12 +1472,12 @@ class RobertaLMHead(nn.Module):
         return {"embeddings": embeddings, "logits": logits}


-class NucleotideTransformer(nn.Module):
+class TorchNucleotideTransformer(nn.Module):
     def __init__(
         self,
         nt_config: NucleotideTransformerConfig,
     ):
-        super(NucleotideTransformer, self).__init__()
+        super(TorchNucleotideTransformer, self).__init__()
         self.nt_config = nt_config

         # Other cases are not implemented
@@ -1551,13 +1525,11 @@ class NucleotideTransformer(nn.Module):
     ) -> torch.Tensor:
         """
         Computes the embeddings based on the input tokens.
-
         Args:
             tokens: Input tokens out of the tokenizer of shape (batch_size, seq_len).
             attention_mask: Attention mask of shape (batch_size, 1, seq_len, seq_len).
                 If no mask is provided, a mask by default which equals 1 over all non
                 pad tokens and 0 over pad tokens is computed.
-
         Returns:
             Dictionary containing the final embeddings and logits.
         """
@@ -1585,11 +1557,9 @@ def build_padding_attention_mask(
 ) -> torch.Tensor:
     """
     Builds a padding mask from a sequence of tokens by masking <pad> in the attention.
-
     Args:
         tokens: Batch of sequences of shape (batch_size, seq_len).
         pad_token_id: Int corresponding to the <pad> token to mask.
-
     Returns:
         Batch of attention masks, masking out <pad> tokens.
     """
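Likewise, a padding mask of the documented shape can be sketched as follows; the broadcasting mirrors build_perceiver_padding_attention_mask at the end of this file, although the real build_padding_attention_mask may differ in detail:

# Illustrative padding-mask builder: (batch_size, 1, seq_len, seq_len), True = attend.
import torch

def padding_attention_mask(tokens: torch.Tensor, pad_token_id: int) -> torch.Tensor:
    keep = tokens != pad_token_id                 # (batch_size, seq_len)
    mask = keep[:, None, None, :]                 # (batch_size, 1, 1, seq_len)
    return mask.repeat(1, 1, tokens.shape[1], 1)  # (batch_size, 1, seq_len, seq_len)

tokens = torch.tensor([[5, 7, 0, 0]])  # 0 stands in for the <pad> id here
print(padding_attention_mask(tokens, pad_token_id=0).int())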
@@ -1599,14 +1569,14 @@ def build_padding_attention_mask(
     return padding_mask


-class ChatNTEncoder(nn.Module):
+class TorchBioBrainEncoder(nn.Module):
     def __init__(
         self,
         nt_config: NucleotideTransformerConfig,
     ):
-        super(ChatNTEncoder, self).__init__()
+        super(TorchBioBrainEncoder, self).__init__()
         self.nt_config = nt_config
-        self.nt_model = NucleotideTransformer(self.nt_config)
+        self.nt_model = TorchNucleotideTransformer(self.nt_config)

     def forward(
         self,
@@ -1616,7 +1586,6 @@ class ChatNTEncoder(nn.Module):
         Args:
             bio_token_ids (torch.Tensor):
                 Shape (batch_size, num_bio_tokens)
-
         Returns:
             torch.Tensor:
                 Shape (batch_size, num_bio_tokens, embed_dim)
@@ -1626,7 +1595,7 @@ class ChatNTEncoder(nn.Module):
         return bio_embeddings


-class MultiModalPerceiverResamplerBlock(nn.Module):
+class TorchMultiModalPerceiverResamplerBlock(nn.Module):
     def __init__(
         self,
         num_heads: int,
@@ -1714,7 +1683,7 @@ class MultiModalPerceiverResamplerBlock(nn.Module):
         return {"embeddings": x}


-class MultiModalPerceiverResampler(nn.Module):
+class TorchMultiModalPerceiverResampler(nn.Module):
     """
     Perceiver Resampler model, made of successive PerceiverResamplerBlocks.
     """
@@ -1726,7 +1695,6 @@ class MultiModalPerceiverResampler(nn.Module):
     ):
         """
         Initialize a Perceiver Resampler model.
-
         Args:
             config: Dataclass containing model hyperparameters.
             name: Name for module (custom will break weight loading).
@@ -1736,7 +1704,7 @@ class MultiModalPerceiverResampler(nn.Module):
         self.name = name
         self.layers = nn.ModuleList(
             [
-                MultiModalPerceiverResamplerBlock(
+                TorchMultiModalPerceiverResamplerBlock(
                     num_heads=self.config.attention_heads,
                     embed_dim=self.config.embed_dim,
                     key_size=self.config.key_size,
@@ -1823,7 +1791,7 @@ class MultiModalPerceiverResampler(nn.Module):
         return outs


-class MultiModalPerceiverResamplerProjection(nn.Module):
+class TorchMultiModalPerceiverResamplerProjection(nn.Module):
     def __init__(
         self,
         perceiver_resampler_config: PerceiverResamplerConfig,
@@ -1843,7 +1811,7 @@ class MultiModalPerceiverResamplerProjection(nn.Module):

         self.bio_projection = nn.Linear(input_embed_dim, embed_dim)
         self.token_embedding = nn.Embedding(english_vocab_size, embed_dim)
-        self.perceiver_resampler = MultiModalPerceiverResampler(config=self.config)
+        self.perceiver_resampler = TorchMultiModalPerceiverResampler(config=self.config)

     def forward(
         self,
@@ -1855,10 +1823,8 @@ class MultiModalPerceiverResamplerProjection(nn.Module):
         Args:
             bio_token_ids (torch.Tensor):
                 Shape (batch_size, num_bio_tokens)
-
             bio_embeddings (torch.Tensor):
                 Shape (batch_size, num_bio_tokens, embed_dim)
-
             english_token_ids (torch.Tensor):
                 Shape (batch_size, num_english_tokens)
         """
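Based on the attributes initialized above (bio_projection, token_embedding, perceiver_resampler), the forward plausibly composes them roughly as sketched below; the resampler's exact call signature and masking are assumptions, so treat this only as a reading aid:

# Hypothetical composition of the projection module's forward pass.
import torch
import torch.nn as nn

def projection_forward_sketch(
    bio_projection: nn.Linear,
    token_embedding: nn.Embedding,
    perceiver_resampler: nn.Module,
    bio_token_ids: torch.Tensor,      # (batch_size, num_bio_tokens)
    bio_embeddings: torch.Tensor,     # (batch_size, num_bio_tokens, input_embed_dim)
    english_token_ids: torch.Tensor,  # (batch_size, num_english_tokens)
) -> torch.Tensor:
    projected_bio = bio_projection(bio_embeddings)           # map to GPT embed_dim
    english_embeddings = token_embedding(english_token_ids)  # embed the prompt
    # The resampler cross-attends between bio and English representations; its real
    # argument names are defined elsewhere in this file.
    return perceiver_resampler(projected_bio, english_embeddings)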
@@ -1901,3 +1867,4 @@ def build_perceiver_padding_attention_mask(
     padding_mask = padding_mask[:, None, None, :]
     padding_mask = padding_mask.repeat(1, 1, resampled_length, 1)  # noqa
     return padding_mask
+