Spaces:

KangLiao
/

Puffin

Running on Zero

App Files Files Community

Puffin / src /models /connector /modeling_connector.py

KangLiao

init

ace9173 about 2 months ago

raw

history blame

21.5 kB

	# coding=utf-8
	# Copyright 2024 Google AI and The HuggingFace Team. All rights reserved.
	#
	# Licensed under the Apache License, Version 2.0 (the "License");
	# you may not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.
	"""PyTorch Connector model."""

	import math
	import warnings
	from typing import Any, Optional, Tuple, Union

	import torch
	import torch.utils.checkpoint
	from torch import nn
	from torch.nn.init import _calculate_fan_in_and_fan_out

	from transformers.activations import ACT2FN
	from transformers.modeling_attn_mask_utils import _prepare_4d_attention_mask
	from transformers.modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling, ImageClassifierOutput
	from transformers.modeling_utils import PreTrainedModel
	from transformers.utils import (
	ModelOutput,
	is_flash_attn_2_available,
	is_flash_attn_greater_or_equal_2_10,
	logging,
	replace_return_docstrings,
	torch_int,
	)
	from .configuration_connector import ConnectorConfig


	if is_flash_attn_2_available():
	from transformers.modeling_flash_attention_utils import _flash_attention_forward


	logger = logging.get_logger(__name__)


	def init_weights(module):
	    """Initialise the parameters of a single module according to its type.

	    The function dispatches on the module class:

	    * ``nn.Embedding``: ``default_flax_embed_init`` on the weight.
	    * ``ConnectorAttention``: Xavier-uniform projection weights, zero biases.
	    * ``ConnectorMLP``: Xavier-uniform weights, biases drawn from N(0, 1e-6^2).
	    * ``nn.Linear`` / ``nn.Conv2d``: ``lecun_normal_`` weight, zero bias (if any).
	    * ``nn.LayerNorm``: weight set to one, bias set to zero.

	    Modules of any other type are left untouched.
	    """
	    if isinstance(module, nn.Embedding):
	        default_flax_embed_init(module.weight)
	        return

	    if isinstance(module, ConnectorAttention):
	        projections = (module.q_proj, module.k_proj, module.v_proj, module.out_proj)
	        # All weights are drawn first, then all biases are cleared.
	        for projection in projections:
	            nn.init.xavier_uniform_(projection.weight)
	        for projection in projections:
	            nn.init.zeros_(projection.bias)
	        return

	    if isinstance(module, ConnectorMLP):
	        dense_layers = (module.fc1, module.fc2)
	        # Weights first, then biases, so random draws happen in a fixed order.
	        for dense in dense_layers:
	            nn.init.xavier_uniform_(dense.weight)
	        for dense in dense_layers:
	            nn.init.normal_(dense.bias, std=1e-6)
	        return

	    if isinstance(module, (nn.Linear, nn.Conv2d)):
	        lecun_normal_(module.weight)
	        if module.bias is not None:
	            nn.init.zeros_(module.bias)
	        return

	    if isinstance(module, nn.LayerNorm):
	        module.weight.data.fill_(1.0)
	        module.bias.data.zero_()


	def _trunc_normal_(tensor, mean, std, a, b):
	# Cut & paste from PyTorch official master until it's in a few official releases - RW
	# Method based on https://people.sc.fsu.edu/~jburkardt/presentations/truncated_normal.pdf
	def norm_cdf(x):
	# Computes standard normal cumulative distribution function
	return (1.0 + math.erf(x / math.sqrt(2.0))) / 2.0

	if (mean < a - 2 * std) or (mean > b + 2 * std):
	warnings.warn(
	"mean is more than 2 std from [a, b] in nn.init.trunc_normal_. "
	"The distribution of values may be incorrect.",
	stacklevel=2,
	)

	# Values are generated by using a truncated uniform distribution and
	# then using the inverse CDF for the normal distribution.
	# Get upper and lower cdf values
	l = norm_cdf((a - mean) / std)
	u = norm_cdf((b - mean) / std)

	# Uniformly fill tensor with values from [l, u], then translate to
	# [2l-1, 2u-1].
	tensor.uniform_(2 * l - 1, 2 * u - 1)

	# Use inverse cdf transform for normal distribution to get truncated
	# standard normal
	tensor.erfinv_()

	# Transform to proper mean, std
	tensor.mul_(std * math.sqrt(2.0))
	tensor.add_(mean)

	# Clamp to ensure it's in the proper range
	tensor.clamp_(min=a, max=b)


	def trunc_normal_tf_(
	    tensor: torch.Tensor, mean: float = 0.0, std: float = 1.0, a: float = -2.0, b: float = 2.0
	) -> torch.Tensor:
	    r"""Fill the input tensor in place with values drawn from a truncated normal distribution.

	    The values are effectively drawn from the normal distribution
	    :math:`\mathcal{N}(\text{mean}, \text{std}^2)` with values outside
	    :math:`[a, b]` redrawn until they are within the bounds. The method used
	    for generating the random values works best when
	    :math:`a \leq \text{mean} \leq b`.

	    NOTE: this 'tf' variant behaves closer to the Tensorflow / JAX impl, where the
	    bounds [a, b] are applied when sampling the normal distribution with mean=0,
	    std=1.0 and the result is subsequently scaled and shifted by the mean and std
	    args.

	    Args:
	        tensor: an n-dimensional `torch.Tensor`
	        mean: the mean of the normal distribution
	        std: the standard deviation of the normal distribution
	        a: the minimum cutoff value (applied before scaling/shifting)
	        b: the maximum cutoff value (applied before scaling/shifting)

	    Returns:
	        The same ``tensor`` object, modified in place.
	    """
	    with torch.no_grad():
	        # Sample the *standard* truncated normal first ...
	        _trunc_normal_(tensor, 0, 1.0, a, b)
	        # ... then scale and shift to the requested std / mean.
	        tensor.mul_(std).add_(mean)
	    # The signature promises a tensor; return it like torch.nn.init.* does.
	    return tensor


	def variance_scaling_(tensor, scale=1.0, mode="fan_in", distribution="normal"):
	    """Initialise ``tensor`` in place with a variance-scaling scheme.

	    The target variance is ``scale / denom`` where ``denom`` is chosen by ``mode``
	    from the fan-in / fan-out of ``tensor`` (as reported by
	    ``torch.nn.init._calculate_fan_in_and_fan_out``).

	    Args:
	        tensor: tensor to fill.
	        scale: numerator of the variance.
	        mode: ``"fan_in"``, ``"fan_out"`` or ``"fan_avg"`` (mean of both).
	        distribution: ``"truncated_normal"``, ``"normal"`` or ``"uniform"``.

	    Raises:
	        ValueError: if ``mode`` or ``distribution`` is not one of the values above.
	    """
	    fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor)
	    if mode == "fan_in":
	        denom = fan_in
	    elif mode == "fan_out":
	        denom = fan_out
	    elif mode == "fan_avg":
	        denom = (fan_in + fan_out) / 2
	    else:
	        # Without this branch an unknown mode left `denom` unbound and surfaced
	        # as an UnboundLocalError on the next line.
	        raise ValueError(f"invalid mode {mode}")

	    variance = scale / denom

	    if distribution == "truncated_normal":
	        # constant is stddev of standard normal truncated to (-2, 2)
	        trunc_normal_tf_(tensor, std=math.sqrt(variance) / 0.87962566103423978)
	    elif distribution == "normal":
	        with torch.no_grad():
	            tensor.normal_(std=math.sqrt(variance))
	    elif distribution == "uniform":
	        # A uniform distribution on [-bound, bound] has variance bound^2 / 3.
	        bound = math.sqrt(3 * variance)
	        with torch.no_grad():
	            tensor.uniform_(-bound, bound)
	    else:
	        raise ValueError(f"invalid distribution {distribution}")


	def lecun_normal_(tensor):
	    """Fill ``tensor`` in place via ``variance_scaling_`` (fan-in, truncated normal, scale 1)."""
	    variance_scaling_(tensor, scale=1.0, mode="fan_in", distribution="truncated_normal")


	def default_flax_embed_init(tensor):
	    """Fill ``tensor`` in place via ``variance_scaling_`` (fan-in, plain normal, scale 1)."""
	    variance_scaling_(tensor, scale=1.0, mode="fan_in", distribution="normal")


	class ConnectorAttention(nn.Module):
	    """Eager multi-head self-attention ('Attention Is All You Need').

	    Queries, keys and values are all projected from the same ``hidden_states``.
	    """

	    # Copied from transformers.models.clip.modeling_clip.CLIPAttention.__init__
	    def __init__(self, config):
	        super().__init__()
	        self.config = config
	        self.embed_dim = config.hidden_size
	        self.num_heads = config.num_attention_heads
	        self.head_dim = self.embed_dim // self.num_heads
	        heads_cover_embedding = self.head_dim * self.num_heads == self.embed_dim
	        if not heads_cover_embedding:
	            raise ValueError(
	                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
	                f" {self.num_heads})."
	            )
	        self.scale = self.head_dim**-0.5
	        self.dropout = config.attention_dropout

	        width = self.embed_dim
	        self.k_proj = nn.Linear(width, width)
	        self.v_proj = nn.Linear(width, width)
	        self.q_proj = nn.Linear(width, width)
	        self.out_proj = nn.Linear(width, width)

	    def _split_heads(self, projected: torch.Tensor, bsz: int, seq_len: int) -> torch.Tensor:
	        """Reshape ``(batch, seq, embed)`` into ``(batch, heads, seq, head_dim)``."""
	        return projected.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2)

	    def forward(
	        self,
	        hidden_states: torch.Tensor,
	        attention_mask: Optional[torch.Tensor] = None,
	        output_attentions: Optional[bool] = False,
	    ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
	        """Input shape: Batch x Time x Channel

	        Returns the projected attention output together with the (post-dropout)
	        attention probabilities. ``output_attentions`` is accepted for interface
	        compatibility; the probabilities are returned regardless of its value.

	        Raises:
	            ValueError: if an intermediate tensor or the supplied mask does not
	                have the expected shape.
	        """
	        bsz, seq_len, _ = hidden_states.size()

	        query_states = self._split_heads(self.q_proj(hidden_states), bsz, seq_len)
	        key_states = self._split_heads(self.k_proj(hidden_states), bsz, seq_len)
	        value_states = self._split_heads(self.v_proj(hidden_states), bsz, seq_len)

	        kv_len = key_states.shape[-2]
	        scores_shape = (bsz, self.num_heads, seq_len, kv_len)
	        attn_weights = torch.matmul(query_states, key_states.transpose(-1, -2)) * self.scale

	        if attn_weights.size() != scores_shape:
	            raise ValueError(
	                f"Attention weights should be of size {scores_shape}, but is"
	                f" {attn_weights.size()}"
	            )

	        if attention_mask is not None:
	            mask_shape = (bsz, 1, seq_len, kv_len)
	            if attention_mask.size() != mask_shape:
	                raise ValueError(
	                    f"Attention mask should be of size {mask_shape}, but is {attention_mask.size()}"
	                )
	            # Additive mask: broadcast over the head dimension.
	            attn_weights = attn_weights + attention_mask

	        # Softmax is computed in fp32 and cast back to the working dtype.
	        attn_probs = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32)
	        attn_weights = attn_probs.to(query_states.dtype)
	        attn_weights = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)
	        attn_output = torch.matmul(attn_weights, value_states)

	        context_shape = (bsz, self.num_heads, seq_len, self.head_dim)
	        if attn_output.size() != context_shape:
	            raise ValueError(
	                f"`attn_output` should be of size {context_shape}, but is"
	                f" {attn_output.size()}"
	            )

	        # (batch, heads, seq, head_dim) -> (batch, seq, embed)
	        merged = attn_output.transpose(1, 2).contiguous().reshape(bsz, seq_len, self.embed_dim)
	        attn_output = self.out_proj(merged)

	        return attn_output, attn_weights


	class ConnectorFlashAttention2(ConnectorAttention):
	    """
	    ConnectorAttention flash attention module. This module inherits from `ConnectorAttention` as the weights of the
	    module stay untouched. The only required change is on the forward pass, where it needs to correctly call the
	    public API of flash attention and deal with padding tokens in case the input contains any of them.
	    """

	    is_causal = False

	    # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2.__init__
	    def __init__(self, *args, **kwargs):
	        # Forward positional AND keyword arguments: the layer is constructed with
	        # `config=config`, which a signature without `**kwargs` rejects.
	        super().__init__(*args, **kwargs)

	        # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1.
	        # flash_attn<2.1 generates a top-left aligned causal mask, while what is needed here is bottom-right
	        # alignment, which was made the default for flash_attn>=2.1. This attribute is used to handle this
	        # difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0.
	        # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1)
	        # produces a wrong mask (top-left).
	        self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10()

	    # Adapted from transformers.models.llama.modeling_llama.LlamaFlashAttention2.forward
	    def forward(
	        self,
	        hidden_states: torch.Tensor,
	        attention_mask: Optional[torch.LongTensor] = None,
	        output_attentions: bool = False,
	    ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
	        """Run self-attention through `_flash_attention_forward`.

	        Args:
	            hidden_states: input of shape `(batch, seq_len, embed_dim)`.
	            attention_mask: passed through unchanged to `_flash_attention_forward`.
	            output_attentions: accepted for interface compatibility and ignored;
	                this implementation never returns attention weights.

	        Returns:
	            `(attn_output, None)`; the second element is always `None`.
	        """
	        batch_size, q_len, _ = hidden_states.size()

	        query_states = self.q_proj(hidden_states)
	        key_states = self.k_proj(hidden_states)
	        value_states = self.v_proj(hidden_states)

	        # Flash Attention requires the layout [batch_size, sequence_length, num_heads, head_dim],
	        # which is exactly what the view below produces, so no transposes are needed.
	        query_states = query_states.view(batch_size, q_len, self.num_heads, self.head_dim)
	        key_states = key_states.view(batch_size, q_len, self.num_heads, self.head_dim)
	        value_states = value_states.view(batch_size, q_len, self.num_heads, self.head_dim)

	        dropout_rate = self.dropout if self.training else 0.0

	        # In PEFT, usually we cast the layer norms in float32 for training stability reasons
	        # therefore the input hidden states gets silently casted in float32. Hence, we need
	        # cast them back in the correct dtype just to be sure everything works as expected.
	        # This might slowdown training & inference so it is recommended to not cast the LayerNorms
	        # in fp32.
	        input_dtype = query_states.dtype
	        if input_dtype == torch.float32:
	            if torch.is_autocast_enabled():
	                target_dtype = torch.get_autocast_gpu_dtype()
	            # Handle the case where the model is quantized
	            elif hasattr(self.config, "_pre_quantization_dtype"):
	                target_dtype = self.config._pre_quantization_dtype
	            else:
	                target_dtype = self.q_proj.weight.dtype

	            logger.warning_once(
	                f"The input hidden states seems to be silently casted in float32, this might be related to"
	                f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in"
	                f" {target_dtype}."
	            )

	            query_states = query_states.to(target_dtype)
	            key_states = key_states.to(target_dtype)
	            value_states = value_states.to(target_dtype)

	        attn_output = _flash_attention_forward(
	            query_states,
	            key_states,
	            value_states,
	            attention_mask,
	            q_len,
	            dropout=dropout_rate,
	            is_causal=self.is_causal,
	            use_top_left_mask=self._flash_attn_uses_top_left_mask,
	        )

	        attn_output = attn_output.reshape(batch_size, q_len, self.embed_dim).contiguous()
	        attn_output = self.out_proj(attn_output)

	        # Attention probabilities are never materialised on this path.
	        return attn_output, None


	class ConnectorSdpaAttention(ConnectorAttention):
	    """
	    Attention variant built on `torch.nn.functional.scaled_dot_product_attention`. It reuses the parameters of
	    `ConnectorAttention` unchanged; only the forward pass differs, to match the SDPA API.
	    """

	    is_causal = False

	    # Adapted from ConnectorAttention.forward and transformers.models.llama.modeling_llama.LlamaSdpaAttention.forward
	    def forward(
	        self,
	        hidden_states: torch.Tensor,
	        attention_mask: Optional[torch.Tensor] = None,
	        output_attentions: Optional[bool] = False,
	    ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
	        """Compute self-attention with SDPA.

	        When ``output_attentions`` is truthy the call is delegated to the eager
	        parent implementation (after a one-time warning); otherwise the second
	        element of the returned tuple is always ``None``.
	        """
	        if output_attentions:
	            # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented.
	            logger.warning_once(
	                "ConnectorModel is using ConnectorSdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, "
	                'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
	            )
	            return super().forward(
	                hidden_states=hidden_states,
	                attention_mask=attention_mask,
	                output_attentions=output_attentions,
	            )

	        bsz, seq_len, _ = hidden_states.size()
	        head_layout = (bsz, seq_len, self.num_heads, self.head_dim)

	        query_states = self.q_proj(hidden_states)
	        key_states = self.k_proj(hidden_states)
	        value_states = self.v_proj(hidden_states)

	        # (batch, seq, embed) -> (batch, heads, seq, head_dim)
	        query_states = query_states.view(*head_layout).transpose(1, 2)
	        key_states = key_states.view(*head_layout).transpose(1, 2)
	        value_states = value_states.view(*head_layout).transpose(1, 2)

	        # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask,
	        # Reference: https://github.com/pytorch/pytorch/issues/112577.
	        if attention_mask is not None and query_states.device.type == "cuda":
	            query_states, key_states, value_states = (
	                query_states.contiguous(),
	                key_states.contiguous(),
	                value_states.contiguous(),
	            )

	        # The causal flag is resolved to a plain bool before the SDPA call rather than inline in its
	        # argument list, which keeps torch.compile's dynamic-shape and full-graph modes working.
	        is_causal = bool(self.is_causal and seq_len > 1)
	        dropout_p = self.dropout if self.training else 0.0

	        attn_output = torch.nn.functional.scaled_dot_product_attention(
	            query_states,
	            key_states,
	            value_states,
	            attn_mask=attention_mask,
	            dropout_p=dropout_p,
	            is_causal=is_causal,
	        )

	        # (batch, heads, seq, head_dim) -> (batch, seq, embed)
	        attn_output = attn_output.transpose(1, 2).contiguous().view(bsz, seq_len, self.embed_dim)

	        return self.out_proj(attn_output), None


	# Maps the value of `config._attn_implementation` to the attention class to instantiate.
	CONNECTOR_ATTENTION_CLASSES = dict(
	    eager=ConnectorAttention,
	    flash_attention_2=ConnectorFlashAttention2,
	    sdpa=ConnectorSdpaAttention,
	)


	# Copied from transformers.models.clip.modeling_clip.CLIPMLP with CLIP->Connector
	class ConnectorMLP(nn.Module):
	    """Feed-forward block: ``fc1`` -> activation -> ``fc2``.

	    ``fc1`` maps ``config.hidden_size`` to ``config.intermediate_size`` and
	    ``fc2`` maps back; the activation is looked up in ``ACT2FN`` by
	    ``config.hidden_act``.
	    """

	    def __init__(self, config):
	        super().__init__()
	        self.config = config
	        self.activation_fn = ACT2FN[config.hidden_act]
	        model_width = config.hidden_size
	        inner_width = config.intermediate_size
	        self.fc1 = nn.Linear(model_width, inner_width)
	        self.fc2 = nn.Linear(inner_width, model_width)

	    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
	        """Apply the two linear layers with the activation in between."""
	        expanded = self.activation_fn(self.fc1(hidden_states))
	        return self.fc2(expanded)


	class ConnectorEncoderLayer(nn.Module):
	    """Pre-LayerNorm transformer layer: self-attention and MLP, each wrapped in a residual connection."""

	    def __init__(self, config: ConnectorConfig):
	        super().__init__()
	        self.embed_dim = config.hidden_size
	        # The attention class is selected by the configured implementation name.
	        attention_cls = CONNECTOR_ATTENTION_CLASSES[config._attn_implementation]
	        self.self_attn = attention_cls(config=config)
	        self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
	        self.mlp = ConnectorMLP(config)
	        self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)

	    # Ignore copy
	    def forward(
	        self,
	        hidden_states: torch.Tensor,
	        attention_mask: torch.Tensor,
	        output_attentions: Optional[bool] = False,
	    ) -> Tuple[torch.FloatTensor]:
	        """
	        Args:
	            hidden_states (`torch.FloatTensor`):
	                Input to the layer of shape `(batch, seq_len, embed_dim)`.
	            attention_mask (`torch.FloatTensor`):
	                Attention mask of shape `(batch, 1, q_len, k_v_seq_len)` where padding elements are indicated by
	                very large negative values.
	            output_attentions (`bool`, optional, defaults to `False`):
	                Whether or not to also return the attention tensor of this layer.

	        Returns:
	            `(hidden_states,)`, or `(hidden_states, attn_weights)` when `output_attentions` is truthy.
	        """
	        # Attention sub-block: norm -> attention -> residual add.
	        attn_out, attn_weights = self.self_attn(
	            hidden_states=self.layer_norm1(hidden_states),
	            attention_mask=attention_mask,
	            output_attentions=output_attentions,
	        )
	        hidden_states = hidden_states + attn_out

	        # Feed-forward sub-block: norm -> MLP -> residual add.
	        mlp_out = self.mlp(self.layer_norm2(hidden_states))
	        hidden_states = hidden_states + mlp_out

	        if output_attentions:
	            return (hidden_states, attn_weights)
	        return (hidden_states,)


	# Copied from transformers.models.altclip.modeling_altclip.AltCLIPEncoder with AltCLIP->Connector
	class ConnectorEncoder(nn.Module):
	    """Stack of `config.num_hidden_layers` `ConnectorEncoderLayer`s applied in sequence.

	    All sub-modules are initialised with `init_weights` at construction time.
	    """

	    def __init__(self, config: ConnectorConfig):
	        super().__init__()
	        self.config = config
	        stack = nn.ModuleList()
	        for _ in range(config.num_hidden_layers):
	            stack.append(ConnectorEncoderLayer(config))
	        self.layers = stack
	        self.gradient_checkpointing = False
	        self.apply(init_weights)

	    def forward(self, inputs_embeds):
	        """Run `inputs_embeds` through every layer (no attention mask) and return the final hidden states."""
	        # Activation checkpointing is only used while training and when enabled.
	        use_checkpointing = self.gradient_checkpointing and self.training

	        hidden_states = inputs_embeds
	        for layer in self.layers:
	            if use_checkpointing:
	                # Positional arguments: hidden_states, attention_mask=None, output_attentions=False.
	                layer_outputs = torch.utils.checkpoint.checkpoint(
	                    layer.__call__,
	                    hidden_states,
	                    None,
	                    False,
	                    use_reentrant=False,
	                )
	            else:
	                layer_outputs = layer(hidden_states, None, output_attentions=False)
	            hidden_states = layer_outputs[0]

	        return hidden_states