Upload folder using huggingface_hub
- config.json +0 -1
- configuration_baichuan.py +0 -2
- modeling_baichuan.py +15 -19
    	
config.json CHANGED

@@ -6,7 +6,6 @@
     "AutoConfig": "configuration_baichuan.BaichuanConfig",
     "AutoModelForCausalLM": "modeling_baichuan.BaichuanForCausalLM"
   },
-  "tokenizer_class": "BaichuanTokenizer",
   "bos_token_id": 1,
   "eos_token_id": 2,
   "hidden_act": "silu",
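config.json loses its top-level "tokenizer_class" entry; AutoTokenizer should resolve BaichuanTokenizer from tokenizer_config.json and the repo's auto_map instead, so loading is unchanged for callers. A minimal sketch of the usual flow, with an illustrative repo id:

    from transformers import AutoModelForCausalLM, AutoTokenizer

    # The custom configuration/modeling/tokenization code ships inside the
    # model repo itself, so trust_remote_code=True is still required.
    repo_id = "baichuan-inc/Baichuan2-7B-Chat"  # illustrative repo id
    tokenizer = AutoTokenizer.from_pretrained(repo_id, trust_remote_code=True)
    model = AutoModelForCausalLM.from_pretrained(repo_id, trust_remote_code=True)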
    	
configuration_baichuan.py CHANGED

@@ -46,7 +46,6 @@ class BaichuanConfig(PretrainedConfig):
         bos_token_id=1,
         eos_token_id=2,
         tie_word_embeddings=False,
-        z_loss_weight=0,
         **kwargs,
     ):
         self.vocab_size = vocab_size
@@ -59,7 +58,6 @@ class BaichuanConfig(PretrainedConfig):
         self.initializer_range = initializer_range
         self.rms_norm_eps = rms_norm_eps
         self.use_cache = use_cache
-        self.z_loss_weight = z_loss_weight
         super().__init__(
             pad_token_id=pad_token_id,
             bos_token_id=bos_token_id,
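Dropping z_loss_weight from BaichuanConfig retires the auxiliary z-loss term that the training loss added (see the final modeling_baichuan.py hunk below, which deletes its computation). A sketch of the retired term, reconstructed from those deleted lines; the standalone helper name is mine:

    import torch

    def z_loss(shift_logits: torch.Tensor, z_loss_weight: float) -> torch.Tensor:
        # Penalize large logits, using the squared max logit per position as
        # a cheap stand-in for log(Z)**2 of the softmax normalizer.
        softmax_normalizer = shift_logits.max(-1).values ** 2
        return z_loss_weight * softmax_normalizer.mean()

Since z_loss_weight defaulted to 0, the term was inert unless a config opted in, so removing it leaves default training behavior unchanged.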
    	
modeling_baichuan.py CHANGED

@@ -502,7 +502,6 @@ class NormHead(nn.Module):
     def forward(self, hidden_states):
         if self.training:
             norm_weight = nn.functional.normalize(self.weight)
-            self.first_flag = True
         elif self.first_flag:
             self.first_flag = False
             self.weight = nn.Parameter(nn.functional.normalize(self.weight))
@@ -529,7 +528,7 @@ class BaichuanForCausalLM(BaichuanPreTrainedModel):
         self.model = BaichuanModel(config)

         self.lm_head = NormHead(config.hidden_size, config.vocab_size, bias=False)
-        if hasattr(config, "quantization_config") and …
+        if hasattr(config, "quantization_config") and config.quantization_config['load_in_4bit']:
             try:
                 from .quantizer import quantize_offline, init_model_weight_int4
             except ImportError:
@@ -609,23 +608,22 @@ class BaichuanForCausalLM(BaichuanPreTrainedModel):
             model_file = os.path.join(pretrained_model_name_or_path, 'pytorch_model.bin')
             state_dict = torch.load(model_file, map_location="cpu")
             model.is_quantized = True

             device_map = kwargs.pop("device_map", None)
             torch_dtype = kwargs.pop("torch_dtype", None)

-            …
+            kwargs = {"no_split_module_classes": model._no_split_modules}
+            target_dtype = CustomDtype.INT4
+            max_memory = get_balanced_memory(
+                model,
+                dtype=target_dtype,
+                low_zero=(device_map == "balanced_low_0"),
+                max_memory=None,
+                **kwargs,
+            )
+            kwargs["max_memory"] = max_memory
+
+            device_map = infer_auto_device_map(model, dtype=target_dtype, **kwargs)
             model = init_model_weight_int4(config, model, state_dict)

             # Set model in evaluation mode to deactivate DropOut modules by default
@@ -706,11 +704,9 @@ class BaichuanForCausalLM(BaichuanPreTrainedModel):
             loss_fct = CrossEntropyLoss()
             shift_logits = shift_logits.view(-1, self.config.vocab_size)
             shift_labels = shift_labels.view(-1)
-            softmax_normalizer = shift_logits.max(-1).values ** 2
-            z_loss = self.config.z_loss_weight * softmax_normalizer.mean()
             # Enable model parallelism
             shift_labels = shift_labels.to(shift_logits.device)
-            loss = loss_fct(shift_logits, shift_labels) + z_loss
+            loss = loss_fct(shift_logits, shift_labels)

         if not return_dict:
             output = (logits,) + outputs[1:]
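The NormHead change deletes the self.first_flag = True reset in the training branch, so the eval-time normalize-and-cache step now runs only once instead of after every train/eval switch. A condensed sketch of the head as it reads after the commit; the constructor, the else branch, and the final projection are reconstructed assumptions (only the hunk's lines are verbatim):

    import math
    import torch
    import torch.nn as nn

    class NormHead(nn.Module):
        # Output projection against L2-normalized weight rows.
        def __init__(self, hidden_size: int, vocab_size: int, bias: bool = False):
            super().__init__()
            self.weight = nn.Parameter(torch.empty((vocab_size, hidden_size)))
            nn.init.kaiming_uniform_(self.weight, a=math.sqrt(5))
            self.first_flag = True

        def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
            if self.training:
                # Weights still update during training: normalize every call.
                # (The deleted `self.first_flag = True` reset used to live here.)
                norm_weight = nn.functional.normalize(self.weight)
            elif self.first_flag:
                # First eval call: normalize once and cache the normalized weight.
                self.first_flag = False
                self.weight = nn.Parameter(nn.functional.normalize(self.weight))
                norm_weight = self.weight
            else:
                norm_weight = self.weight
            return nn.functional.linear(hidden_states, norm_weight)

With the reset gone, a checkpoint served purely for inference normalizes its head exactly once; the trade-off is that the cache is never invalidated if training resumes after an eval pass.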
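The bulk of the modeling change moves the 4-bit loading path onto accelerate's stock placement utilities: get_balanced_memory budgets per-device memory as if every weight were INT4, and infer_auto_device_map turns that budget into a module-to-device mapping. A self-contained sketch of the same two-step pattern, assuming only that accelerate is installed; the toy model and empty no-split list stand in for the real model and model._no_split_modules:

    import torch.nn as nn
    from accelerate import infer_auto_device_map
    from accelerate.utils import CustomDtype, get_balanced_memory

    # Toy stand-in for the quantized Baichuan model; any nn.Module works.
    model = nn.Sequential(nn.Linear(4096, 4096), nn.Linear(4096, 4096))
    kwargs = {"no_split_module_classes": []}

    # Budget memory per device for a 4-bit footprint (~0.5 byte per weight)...
    target_dtype = CustomDtype.INT4
    max_memory = get_balanced_memory(
        model,
        dtype=target_dtype,
        low_zero=False,  # the committed code ties this to device_map == "balanced_low_0"
        max_memory=None,
        **kwargs,
    )
    kwargs["max_memory"] = max_memory

    # ...then map each submodule to a device within that budget.
    device_map = infer_auto_device_map(model, dtype=target_dtype, **kwargs)
    print(device_map)  # e.g. {'0': 0, '1': 1} across two GPUs, or {'': 'cpu'}

Note that the committed code rebinds the name kwargs to this fresh dict, so any remaining from_pretrained keyword arguments are discarded on this path; device_map and torch_dtype are popped out beforehand.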

