# Source code for mindformers.models.llama.llama

# Copyright 2023 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""LLaMA models' APIs."""
import copy

import numpy as np

import mindspore.common.dtype as mstype
from mindspore import Tensor, nn
from mindspore import mint
from mindspore.context import ParallelMode
from mindspore.ops import operations as P
from mindspore.parallel._utils import _get_parallel_mode, _is_sharding_propagation

from mindformers.core.loss.loss import CrossEntropyLoss
from mindformers.mindformer_book import MindFormerBook
from mindformers.models.modeling_utils import PreTrainedModel
from mindformers.models.utils import LayerSetting, check_fine_grain_interleave_valid
from mindformers.modules.layers import Linear, FreqsMgr
from mindformers.modules.transformer import LowerTriangularMaskWithDynamic
from mindformers.modules.transformer.op_parallel_config import _check_config
from mindformers.tools.register.register import MindFormerModuleType, MindFormerRegister
from mindformers.tools.utils import get_disable_custom_fa, get_predict_run_mode, get_use_rope_self_define

from .llama_config import LlamaConfig
from .llama_layer import LlamaEmbedding, LlamaRMSNorm
from .llama_transformer import LLamaDecodeLayer
from .llama_interleave import LLamaDecodeLayerInterleave
from ..utils import lazy_inline
from ...tools.logger import logger

__all__ = ['LlamaModel', 'LlamaForCausalLM']


class LlamaPreTrainedModel(PreTrainedModel):
    """
    Shared abstract base of the LLaMA networks defined in this module.

    Weight initialization and the download/load interface for pretrained models are inherited from
    ``PreTrainedModel``; this class only declares the config class and the base model prefix.
    """

    base_model_prefix = "llama"
    config_class = LlamaConfig


class LlamaModel(LlamaPreTrainedModel):
    r"""
    Transformer decoder backbone consisting of *config.num_layers* decode layers.

    Each layer is a ``LLamaDecodeLayerInterleave`` when fine-grain interleave is valid for the given
    parallel config, otherwise a ``LLamaDecodeLayer``. The stack is preceded by a token embedding and
    followed by an RMSNorm (``norm_out``).

    Args:
        config(LlamaConfig): the config of network

    Returns:
            output: Tensor, the output of llama decoderlayer

    Examples:
        >>> from mindformers import LlamaModel
        >>> network = LlamaModel.from_pretrained('llama_7b')
        >>> type(network)
        <class 'mindformers.models.llama.llama.LlamaModel'>
    """
    _support_list = MindFormerBook.get_model_support_list()['llama']

    def __init__(self,
                 config: LlamaConfig = None):
        """
        Build the embedding, rotary-frequency manager, causal-mask generator, decode layers and final norm.

        Args:
            config (LlamaConfig): network configuration. Although the default is ``None``, the body
                dereferences ``config`` immediately, so a real config object is required.

        Raises:
            ValueError: if ``config.use_ring_attention`` is set while ``config.use_flash_attention`` is not.
        """
        super().__init__(config, auto_prefix=True)
        _check_config(config.parallel_config)
        self.dtype = config.compute_dtype
        self.hidden_size = config.hidden_size
        self.num_layers = config.num_layers
        self.n_head = config.num_heads
        self.head_dim = self.hidden_size // self.n_head
        self.pad_token_id = config.pad_token_id
        self.is_first_iteration = True
        self.use_past = config.use_past
        self.use_flash_attention = config.use_flash_attention
        self.use_ring_attention = config.use_ring_attention
        # Parallel decoding is considered enabled as soon as its parameter set is provided.
        self.parallel_decoding = config.parallel_decoding_params is not None
        self.concat = P.Concat(-1)
        self.cast = P.Cast()
        self.shape = P.Shape()
        self.reshape = P.Reshape()
        # default open internal kernel boost
        self.disable_custom_fa = get_disable_custom_fa()
        logger.info("Open prefill flatten and disable custom flash attention op:{}".format(self.disable_custom_fa))
        if self.disable_custom_fa:
            # Fixed 128x128 strictly-upper-triangular fp16 mask used by the prefill-flatten path in `construct`.
            self.prefill_flatten_mask = Tensor(np.triu(np.ones(shape=(128, 128), dtype=np.float16), 1))

        # Informational only: the FFN flavour itself is selected inside the decode layer via `moe_config`.
        if config.moe_config.expert_num > 1:
            logger.info("MoE config is provided, use MoE FFN")
        else:
            logger.info("MoE config is None, use normal FFN")
        # Ring attention is only accepted together with flash attention.
        if not self.use_flash_attention and self.use_ring_attention:
            raise ValueError(f"When the ring_attention = True, the flash_attention must be True ")
        self.use_rope_self_define = get_use_rope_self_define()

        self.freqs_mgr = FreqsMgr(head_dim=self.head_dim,
                                  seq_length=config.seq_length,
                                  max_position_embedding=config.max_position_embedding,
                                  rotary_dtype=config.rotary_dtype,
                                  theta=config.theta,
                                  scaling_factor=config.scaling_factor,
                                  extend_method=config.extend_method,
                                  parallel_config=config.parallel_config)
        self.casual_mask = LowerTriangularMaskWithDynamic(seq_length=config.seq_length,
                                                          compute_type=config.compute_dtype,
                                                          is_dynamic=config.is_dynamic,
                                                          pad_token_id=config.pad_token_id,
                                                          use_flash_attention=config.use_flash_attention,
                                                          use_attn_mask_compression=config.use_attn_mask_compression)
        self.tok_embeddings = LlamaEmbedding(vocab_table_size=config.vocab_size,
                                             embedding_size=config.hidden_size,
                                             param_init_type=config.embedding_init_type,
                                             parallel_optimizer=config.parallel_optimizer)
        self.fine_grain_interleave = check_fine_grain_interleave_valid(config.fine_grain_interleave,
                                                                       config.parallel_config)
        self.layers = nn.CellList()
        self.layer_setting = LayerSetting(config.num_layers,
                                          config.offset,
                                          config.parallel_config,
                                          config.pp_interleave_num)
        # Build the decoder stack; the layer class depends on whether fine-grain interleave is valid.
        for layer_id in range(config.num_layers):
            if self.fine_grain_interleave:
                layer = LLamaDecodeLayerInterleave(config.batch_size,
                                                   config.seq_length,
                                                   layer_id,
                                                   dim=config.hidden_size,
                                                   n_heads=config.num_heads,
                                                   num_layers=config.num_layers,
                                                   multiple_of=config.multiple_of,
                                                   n_kv_heads=config.n_kv_heads,
                                                   intermediate_size=config.intermediate_size,
                                                   ffn_dim_multiplier=config.ffn_dim_multiplier,
                                                   norm_eps=config.rms_norm_eps,
                                                   qkv_has_bias=config.qkv_has_bias,
                                                   qkv_concat=config.qkv_concat,
                                                   compute_dtype=config.compute_dtype,
                                                   layernorm_compute_dtype=config.layernorm_compute_type,
                                                   softmax_compute_dtype=config.softmax_compute_type,
                                                   rotary_dtype=config.rotary_dtype,
                                                   param_init_type=config.param_init_type,
                                                   use_flash_attention=config.use_flash_attention,
                                                   use_ring_attention=config.use_ring_attention,
                                                   use_attn_mask_compression=config.use_attn_mask_compression,
                                                   fine_grain_interleave=config.fine_grain_interleave,
                                                   parallel_config=config.parallel_config)
            else:
                layer = LLamaDecodeLayer(layer_id,
                                         dim=config.hidden_size,
                                         n_heads=config.num_heads,
                                         n_kv_heads=config.n_kv_heads,
                                         intermediate_size=config.intermediate_size,
                                         multiple_of=config.multiple_of,
                                         ffn_dim_multiplier=config.ffn_dim_multiplier,
                                         norm_eps=config.rms_norm_eps,
                                         qkv_has_bias=config.qkv_has_bias,
                                         qkv_concat=config.qkv_concat,
                                         compute_dtype=config.compute_dtype,
                                         layernorm_compute_dtype=config.layernorm_compute_type,
                                         softmax_compute_dtype=config.softmax_compute_type,
                                         rotary_dtype=config.rotary_dtype,
                                         param_init_type=config.param_init_type,
                                         use_past=config.use_past,
                                         use_flash_attention=config.use_flash_attention,
                                         use_ring_attention=config.use_ring_attention,
                                         use_attn_mask_compression=config.use_attn_mask_compression,
                                         block_size=config.block_size,
                                         num_blocks=config.num_blocks,
                                         use_rope_slice=config.use_rope_slice,
                                         moe_config=config.moe_config,
                                         parallel_config=config.parallel_config,
                                         parallel_decoding=self.parallel_decoding,
                                         fused_kernel=config.fused_rms_norm
                                         )
            # `LayerSetting` is applied to every layer with its index before the layer joins the stack.
            self.layer_setting(layer, layer_id)
            self.layers.append(layer)
        self.norm_out = LlamaRMSNorm(config.hidden_size, config.rms_norm_eps,
                                     compute_type=config.layernorm_compute_type,
                                     fused_kernel=config.fused_rms_norm)
        dp = config.parallel_config.data_parallel
        cp = config.parallel_config.context_parallel
        # Manual stage/fusion/shard settings are skipped only for AUTO_PARALLEL with sharding propagation.
        if not (_get_parallel_mode() in (ParallelMode.AUTO_PARALLEL,) and _is_sharding_propagation()):
            # Embedding is pinned to the first pipeline stage, the final norm to the last one.
            self.tok_embeddings.pipeline_stage = 0
            if config.parallel_config.pipeline_stage > 1:
                self.norm_out.pipeline_stage = config.parallel_config.pipeline_stage - 1
                self.tok_embeddings.set_comm_fusion(2)
                self.norm_out.set_comm_fusion(2)
            else:
                self.tok_embeddings.set_comm_fusion(config.parallel_config.gradient_aggregation_group)
                self.norm_out.set_comm_fusion(config.parallel_config.gradient_aggregation_group)

            self.tok_embeddings.shard(config.parallel_config)
            self.casual_mask.shard(config.parallel_config)
            self.concat.shard(((dp, 1, 1, 1), (dp, 1, 1, 1)))
            # The final-norm strategy has a different rank in the two modes (2 entries vs 3 entries).
            if self.fine_grain_interleave:
                self.norm_out.shard((dp * cp, 1))
            else:
                self.norm_out.shard((dp, cp, 1))

    # pylint: disable=W0613
    def construct(self, tokens: Tensor, input_embeds=None, batch_valid_length=None, batch_index=None,
                  zactivate_len=None, block_tables=None, slot_mapping=None, prefix_keys_values=None,
                  attention_mask=None, position_ids=None, q_seq_lens=None):
        """
        Forward of llama model.

        Args:
            tokens: the tokenized inputs with datatype int32. Its shape is unpacked as (bs, seq_len).
            input_embeds: the embedding Tensor of tokens, Tensor of shape (batch_size, seq_length, hidden_size).
                When given, it is used instead of looking `tokens` up in the embedding table. Default None.
            batch_valid_length(Tensor): the past calculated the index with datatype int32, used for incremental
                prediction. Tensor of shape :math:`(batch_size,)`. Default None.
            batch_index: not used in this method.
            zactivate_len: not used in this method.
            block_tables (Tensor[int64]): Store mapping tables for each sequence. Forwarded to every layer.
            slot_mapping (Tensor[int32]): Store token cache physical slot index. Forwarded to every layer.
            prefix_keys_values: optional per-layer prefix key/values; entry ``i`` is forwarded to layer ``i`` and
                ``prefix_keys_values[0].shape[2]`` is taken as the prefix length when the mask is extended.
            attention_mask: used as the mask only when parallel decoding is enabled.
            position_ids: used to look up rotary frequencies only when parallel decoding is enabled.
            q_seq_lens: forwarded to every layer.
        Returns:
            output: Tensor, the output of llama decoderlayer after the final RMSNorm (`norm_out`).
        """
        # preprocess
        bs, seq_len = self.shape(tokens)
        if self.parallel_decoding:
            # FA with TH layout, mask is 2D, FA with BSH layout, mask is 4D
            mask = attention_mask
            freqs_cis = self.freqs_mgr.increment_multi_ids(position_ids)
        else:
            mask = None
            if self.use_past:
                if self.is_first_iteration:
                    # First iteration with cache enabled: frequencies cover the whole input sequence.
                    if self.use_rope_self_define:
                        freqs_cis = self.freqs_mgr(seq_len)
                    else:
                        freqs_cis = self.freqs_mgr.prefill(bs, seq_len)

                    if self.use_flash_attention:
                        if self.disable_custom_fa:  # only support fp16
                            # Prefill-flatten path: fixed precomputed mask and flattened frequencies.
                            mask = self.prefill_flatten_mask
                            freqs_cis = self.freqs_mgr.prefill_flatten()
                    else:
                        mask = self.casual_mask(tokens)  # mask: [bs, seq, seq]

                    if prefix_keys_values is not None:
                        # The mask may still be None here (flash attention without prefill flatten).
                        if mask is None:
                            mask = self.casual_mask(tokens)
                        prefix_length = prefix_keys_values[0].shape[2]
                        # Zero block concatenated in front of the mask along the last axis for the prefix slots.
                        prefix_mask = Tensor(np.zeros((bs, 1, seq_len, prefix_length)), dtype=mask.dtype)
                        mask = self.concat((prefix_mask, mask))
                else:
                    # Later iterations: no mask is built, frequencies come from the valid lengths.
                    freqs_cis = self.freqs_mgr.increment(batch_valid_length)
            else:
                # No cache: the causal mask is skipped only for ring attention.
                if not self.use_ring_attention:
                    mask = self.casual_mask(tokens)
                freqs_cis = self.freqs_mgr(seq_len)
                if prefix_keys_values is not None:
                    # NOTE(review): unlike the use_past branch there is no `mask is None` guard here, so
                    # ring attention combined with prefix_keys_values would reach `mask.dtype` with
                    # mask == None -- confirm whether that combination is meant to be supported.
                    prefix_length = prefix_keys_values[0].shape[2]
                    prefix_mask = Tensor(np.zeros((bs, 1, seq_len, prefix_length)), dtype=mask.dtype)
                    mask = self.concat((prefix_mask, mask))

        # tokens: [bs, seq/1]
        if input_embeds is not None:
            h = self.cast(input_embeds, self.dtype)
        else:
            h = self.cast(self.tok_embeddings(tokens), self.dtype)
        h = self.reshape(h, (bs, seq_len, self.hidden_size))
        # h: [bs, seq/1, hidden_dim]
        for i in range(self.num_layers):
            # Each layer receives only its own slice of the prefix key/values (or None).
            prefix_kv = prefix_keys_values[i] if prefix_keys_values is not None else None
            h = self.layers[i](h, freqs_cis, mask, batch_valid_length=batch_valid_length, block_tables=block_tables,
                               slot_mapping=slot_mapping, prefix_keys_values=prefix_kv, q_seq_lens=q_seq_lens)
        output = self.norm_out(h)
        return output


@MindFormerRegister.register(MindFormerModuleType.MODELS)
class LlamaForCausalLM(LlamaPreTrainedModel):
    r"""
    Provide llama training loss or logits through network.

    Args:
        config (LlamaConfig): The config of llama model. Default: `None` .

    Inputs:
        - **input_ids** (Tensor) - the indices of input sequence tokens in the vocabulary with data type Int64/Int32,
          Tensor of shape :math:`(batch, seq\_length)`.
        - **labels** (Tensor, optional) - the labels of inputs with data type Int64/Int32, Tensor of
          shape :math:`(batch, seq\_length)` . Default: ``None`` .
        - **input_position** (Tensor, optional) - the position ids of inputs (at incremental reasoning mode) which is
          an increasing sequence with data type Int64/Int32, Tensor :math:`(batch, seq\_length)`.
          Default: ``None`` .
        - **position_ids** (Tensor, optional) - the position ids of inputs which is
          an increasing sequence with data type Int64/Int32, Tensor :math:`(batch, seq\_length)`.
          Default: ``None`` .
        - **attention_mask** (Tensor, optional) - input sentences padding mask, where 0 indicates padding position with
          data type Int64/Int32, Tensor of shape :math:`(batch, seq\_length)`. Default: ``None`` .
        - **input_embeds** (Tensor, optional) - the embedding of inputs with data type Float32/Float16, Tensor of
          shape :math:`(batch, seq\_length, hidden\_size)`. Default: ``None`` .
        - **init_reset** (Tensor, optional) - A Bool tensor with shape [1], used to clear the past key parameter and
          past value parameter used in the incremental prediction. Only valid when use_past is True.
          Tensor of shape :math:`(1)`. Default: ``Tensor([True])`` .
        - **batch_valid_length** (Tensor, optional) - Int32 tensor with shape [batch_size]
          the past calculated the index. Used for incremental prediction when the use_past is True.
          Default: ``None`` .
        - **block_tables** (Tensor, optional) - Int64 type Tensor, Store mapping tables for each sequence.
          Default: ``None`` .
        - **slot_mapping** (Tensor, optional) - Int32 type Tensor, token cache physical slot index.
          Default: ``None`` .

    Outputs:
        Tensor. If it is in training mode, the output Tensor contains loss;
        If it is in prediction mode, the output Tensor contains logits;
        If it is in evaluation mode, the output Tensor contains logits, tokens, and input masks.

    Examples:
        >>> from mindformers.models.llama import LlamaConfig, LlamaForCausalLM
        >>> import mindspore as ms
        >>> ms.set_context(mode=0)
        >>> config = LlamaConfig(batch_size=2)
        >>> network = LlamaForCausalLM(config=config)
        >>> type(network)
        <class 'mindformers.models.llama.llama.LlamaForCausalLM'>
        >>> from mindformers import LlamaForCausalLM
        >>> network = LlamaForCausalLM.from_pretrained('llama_7b')
        >>> type(network)
        <class 'mindformers.models.llama.llama.LlamaForCausalLM'>
    """
    _support_list = MindFormerBook.get_model_support_list()['llama']

    @lazy_inline
    def __init__(self, config: LlamaConfig = None):
        """
        Build the LLaMA backbone, the LM head, the loss and their parallel strategies.

        Args:
            config (LlamaConfig): network configuration. Although the default is ``None``, the body
                dereferences ``config`` immediately, so a real config object is required.
        """
        super(LlamaForCausalLM, self).__init__(config, auto_prefix=True)
        _check_config(config.parallel_config)
        self.config = config
        self.ignore_token_id = config.ignore_token_id
        self.pad_token_id = config.pad_token_id
        self.use_past = config.use_past
        self.vocab_size = config.vocab_size
        self.is_first_iteration = True
        self.disable_custom_fa = get_disable_custom_fa()

        self.shape = P.Shape()
        self.reshape = P.Reshape()
        self.cast = P.Cast()
        self.slice = P.StridedSlice()
        self.not_equal = P.NotEqual()
        self.mul = P.Mul()
        self.add = P.Add()
        self.ones = P.Ones()
        self.gather = P.Gather(1)
        self.prefill_gather_flatten = P.Gather()
        self.sub_batch_valid_len = P.Sub()
        self.model = LlamaModel(config=config)
        self.lm_head = Linear(in_channels=config.hidden_size,
                              out_channels=config.vocab_size,
                              has_bias=False,
                              compute_dtype=config.compute_dtype,
                              param_init_type=config.param_init_type,
                              weight_init="normal")  # meta default: xavier_normal
        if config.tie_word_embeddings:
            # Share the embedding table with the output projection.
            self.lm_head.weight = self.model.tok_embeddings.embedding_weight

        mp = config.parallel_config.model_parallel
        vocab_size = config.vocab_size
        # The loss gets its own copy of the parallel config so the tweaks below do not leak into the model.
        loss_parallel_config = copy.deepcopy(config.parallel_config)
        if vocab_size % mp != 0:
            logger.warning("The vocab size of Loss is: %s, it is not divide by model_parallel: %s",
                           vocab_size, mp)
            logger.warning("Now, the model_parallel num of Loss will be changed: mp = 1")
            loss_parallel_config.model_parallel = 1
        # Context parallelism is folded into data parallelism for the loss.
        loss_parallel_config.data_parallel *= loss_parallel_config.context_parallel
        self.loss = CrossEntropyLoss(parallel_config=loss_parallel_config)

        dp = config.parallel_config.data_parallel
        mp = config.parallel_config.model_parallel
        cp = config.parallel_config.context_parallel
        # Manual shard/stage settings are skipped only for AUTO_PARALLEL with sharding propagation.
        if not (_get_parallel_mode() in (ParallelMode.AUTO_PARALLEL,) and _is_sharding_propagation()):
            self.slice.shard(((dp, 1),))
            self.not_equal.shard(((dp, 1), ()))
            self.mul.shard(((dp, 1), (dp, 1)))
            self.add.shard(((dp, 1), ()))
            self.gather.shard(((dp, 1, 1), (dp,)))
            self.prefill_gather_flatten.shard(((dp, 1, 1), (dp,)))
            self.sub_batch_valid_len.shard(((1,), ()))
            # The LM head weight is not split when vocab embedding is data parallel or the vocab
            # size is not divisible by the model-parallel degree.
            if config.parallel_config.vocab_emb_dp or (vocab_size % mp != 0):
                self.lm_head.shard(strategy_matmul=((dp * cp, 1), (1, 1)))
            else:
                self.lm_head.shard(strategy_matmul=((dp * cp, 1), (mp, 1)))
            if config.parallel_config.pipeline_stage > 1:
                self.lm_head.pipeline_stage = config.parallel_config.pipeline_stage - 1

        self.load_checkpoint(config)
        self.predict_run_mode = get_predict_run_mode()

        logger.info("Predict run mode:%s", self.predict_run_mode)
        llm_boost_kwargs = {"config": config}
        if config.llm_backend:
            # Imported lazily: only needed when an LLM boost backend is configured.
            from mindspore.experimental.llm_boost.register import LlmBoostRegister
            self.llm_boost = LlmBoostRegister.get_instance(config.llm_backend, "Llama", **llm_boost_kwargs)
            self.llm_boost.init()
            self.is_set_kvcache = False
        self.parallel_decoding = config.parallel_decoding_params is not None

    def to_embeddings(self, tokens):
        """return embedding tokens"""
        return self.model.tok_embeddings(tokens)

    def prepare_inputs_for_prefill_flatten(self, input_ids, batch_valid_length, slot_mapping, model_inputs):
        """
        Prepare input ids for prefill flatten.

        When the batch holds more than one sequence, the valid prefix of every sequence is concatenated
        into a single row of shape ``(1, total_valid_tokens)`` and the ``-1`` entries are removed
        from ``slot_mapping``.

        Args:
            input_ids (numpy.ndarray): token ids, one row per sequence.
            batch_valid_length (numpy.ndarray): valid length of every sequence.
            slot_mapping (numpy.ndarray): slot indices; ``-1`` entries are dropped when flattening.
            model_inputs (dict): dict to update in place with ``input_ids`` and ``slot_mapping`` tensors.

        Returns:
            dict, the updated ``model_inputs``.
        """
        batch_valid_length_bs = batch_valid_length.shape[0]
        input_ids_bs = input_ids.shape[0]
        if batch_valid_length_bs == input_ids_bs and batch_valid_length_bs > 1:
            input_ids_list = []
            for i in range(batch_valid_length_bs):
                context_len = batch_valid_length[i]
                input_ids_list.append(input_ids[i][:context_len])
            input_ids = np.concatenate(input_ids_list, 0)
            input_ids = input_ids.reshape((1, -1))
            slot_mapping = np.delete(slot_mapping, np.where(slot_mapping == -1))
        model_inputs["input_ids"] = Tensor.from_numpy(input_ids.astype(np.int32))
        model_inputs["slot_mapping"] = Tensor.from_numpy(slot_mapping)
        return model_inputs

    def prepare_inputs_for_generation(self, input_ids, **kwargs):
        """
        Build the model input dict used by text generation.

        Args:
            input_ids (numpy.ndarray): token ids of the current step.
            **kwargs: optional entries read here are ``origin_inputs`` (used instead of ``input_ids``
                when the config is dynamic), ``valid_length_each_example``, ``block_tables``,
                ``slot_mapping`` and ``prefill``.

        Returns:
            dict, always containing ``input_ids``; ``llm_boost_inputs`` is added when an LLM boost backend
            exists, and ``input_ids``/``slot_mapping`` are replaced by their flattened form when custom
            flash attention is disabled and this is a prefill step.
        """
        model_inputs = {}
        if self.config.is_dynamic and "origin_inputs" in kwargs:
            input_ids = kwargs["origin_inputs"]
        model_inputs["input_ids"] = Tensor.from_numpy(
            input_ids.astype(np.int32))
        if hasattr(self, 'llm_boost'):
            batch_valid_length = kwargs.get("valid_length_each_example")
            block_tables = kwargs.get("block_tables")
            slot_mapping = kwargs.get("slot_mapping")
            prefill = kwargs.get("prefill")
            bs = batch_valid_length.shape[0]
            position_ids_list = [
                np.arange(context_len, dtype=np.int64) for context_len in batch_valid_length]
            if input_ids.shape[-1] == 1:
                input_ids = np.concatenate(input_ids, 0)
            else:
                input_ids_list = []
                for i in range(bs):
                    context_len = batch_valid_length[i]
                    if prefill:
                        # Prefill: keep the whole valid prefix of the sequence.
                        input_ids_list.append(input_ids[i][:context_len])
                    else:
                        # Otherwise: keep only the last valid token.
                        input_ids_list.append(
                            input_ids[i][context_len - 1:context_len])
                input_ids = np.concatenate(input_ids_list, 0)
            position_ids = np.concatenate(position_ids_list, 0)
            slot_mapping = np.delete(
                slot_mapping, np.where(slot_mapping == -1))
            # Index of the last valid token of every sequence in the flattened layout.
            lm_head_indices = np.cumsum(batch_valid_length, dtype=np.int64) - 1
            seq_lens = batch_valid_length.tolist()
            model_inputs["llm_boost_inputs"] = {
                "input_ids": Tensor.from_numpy(input_ids),
                "position_ids": Tensor.from_numpy(position_ids),
                "lm_head_indices": Tensor.from_numpy(lm_head_indices),
                "block_tables": Tensor.from_numpy(block_tables),
                "slot_mapping": Tensor.from_numpy(slot_mapping),
                "batch_valid_length": Tensor.from_numpy(batch_valid_length),
                "seq_lens": seq_lens
            }
        prefill = kwargs.get("prefill")
        if self.disable_custom_fa and prefill:
            batch_valid_length = kwargs.get("valid_length_each_example")
            slot_mapping = kwargs.get("slot_mapping")
            model_inputs = self.prepare_inputs_for_prefill_flatten(
                input_ids, batch_valid_length, slot_mapping, model_inputs)
        return model_inputs

    # pylint: disable=W0613
    def prepare_inputs_for_predict_layout(self, input_ids, **kwargs):
        """
        Get Llama model input tuple for transform ckpt.

        The returned tuple follows the positional order of ``construct`` up to ``prefix_keys_values``;
        every input that is not needed is ``None``.
        """
        input_ids = Tensor(input_ids, mstype.int32)
        labels = Tensor(kwargs["labels"]) if "labels" in kwargs else None
        bs, seq = input_ids.shape[0], input_ids.shape[1]
        slot_mapping = Tensor(np.ones(shape=tuple([bs * seq])), mstype.int32)
        prefix_keys_values = Tensor(kwargs["prefix_keys_values"]) if "prefix_keys_values" in kwargs else None
        return input_ids, labels, None, None, None, None, None, None, None, None, None, slot_mapping, prefix_keys_values

    def set_dynamic_inputs(self, **kwargs):
        """
        Register dynamic-shape placeholder inputs through ``set_inputs``.

        The placeholders are passed in the positional order of ``construct``.

        Args:
            **kwargs: ``have_prefix_keys_values`` (bool) additionally registers a dynamic
                ``prefix_keys_values`` input; ``pre_gather`` (bool) registers ``batch_valid_length``
                when ``use_past`` is off.
        """
        dynamic_input_ids = Tensor(shape=[None, None], dtype=mstype.int32)
        dynamic_batch_valid_length = Tensor(shape=[None, None], dtype=mstype.int32)
        dynamic_block_tables = Tensor(shape=[None, None], dtype=mstype.int32)
        dynamic_slot_mapping = Tensor(shape=[None], dtype=mstype.int32)
        # `kwargs` is a dict, so the flag must be read with `.get`; `getattr` on a dict never finds
        # the key and made this branch unreachable.
        have_prefix_keys_values = kwargs.get("have_prefix_keys_values", False)
        # These three inputs are only dynamic when parallel decoding is enabled.
        dynamic_position_ids = Tensor(shape=[None, None], dtype=mstype.int32) if self.parallel_decoding else None
        dynamic_mask = Tensor(shape=[None, None], dtype=mstype.float16) if self.parallel_decoding else None
        dynamic_q_seq_lens = Tensor(shape=[None], dtype=mstype.int32) if self.parallel_decoding else None
        if have_prefix_keys_values:
            dynamic_prefix_keys_values = Tensor(shape=[2, None, None, None, None], dtype=mstype.float16)
            self.set_inputs(dynamic_input_ids, None, None, dynamic_position_ids, dynamic_mask, None, None,
                            dynamic_batch_valid_length, None, None, dynamic_block_tables,
                            dynamic_slot_mapping, dynamic_prefix_keys_values, None, dynamic_q_seq_lens)
        elif self.use_past:
            self.set_inputs(dynamic_input_ids, None, None, dynamic_position_ids, dynamic_mask, None, None,
                            dynamic_batch_valid_length, None, None, dynamic_block_tables,
                            dynamic_slot_mapping, None, None, dynamic_q_seq_lens)
        elif kwargs.get("pre_gather", False):
            self.set_inputs(dynamic_input_ids, None, None, None, None, None, None,
                            dynamic_batch_valid_length, None, None, None, None, None)
        else:
            self.set_inputs(dynamic_input_ids, None, None, None, None, None, None,
                            None, None, None, None, None, None)
        logger.info("Set dynamic input for llama.")

    def add_flags_custom(self, is_first_iteration):
        """Add customized attributes for specific cells in the model."""
        self.add_flags(is_first_iteration=is_first_iteration)
        self.model.add_flags(is_first_iteration=is_first_iteration)
        for layer in self.model.layers:
            layer.add_flags(is_first_iteration=is_first_iteration)
            layer.attention.infer_attention.add_flags(is_first_iteration=is_first_iteration)

    # pylint: disable=W0613
    def construct(self, input_ids, labels=None, input_position=None, position_ids=None, attention_mask=None,
                  input_embeds=None, init_reset=None, batch_valid_length=None, batch_index=None, zactivate_len=None,
                  block_tables=None, slot_mapping=None, prefix_keys_values=None, llm_boost_inputs=None,
                  q_seq_lens=None):
        r"""
        LlamaForCausalLM forward.

        Args:
            input_ids(Tensor): the tokenized inputs with datatype int32, Tensor of shape :math:`(batch, seq\_length)`.
            labels(Tensor): the tokenized labels with datatype int32, Tensor of shape :math:`(batch, seq\_length)`.
            input_position(Tensor): current position, used by model.predict.
            position_ids(Tensor): forwarded to the backbone, which reads it only in parallel decoding.
            attention_mask(Tensor): forwarded to the backbone, which reads it only in parallel decoding.
            input_embeds(Tensor): the input embedding Tensor of shape :math:`(batch, seq\_length, hidden_size)`.
                Default None.
            init_reset(bool, optional): A bool tensor with shape [1], used to clear the past key parameter and
                past value parameter used in the incremental prediction. Default True.
            batch_valid_length(Tensor): the past calculated the index with datatype int32, used for incremental
                prediction. Tensor of shape :math:`(batch_size,)`. Default None.
            block_tables (Tensor[int64]): Store mapping tables for each sequence.
            slot_mapping (Tensor[int32]): Store token cache physical slot index.
            prefix_keys_values: optional prefix key/values forwarded to the backbone.
            llm_boost_inputs (dict): inputs for the LLM boost backend; only read when such a backend exists.
            q_seq_lens (Tensor[int32]): In parallel decoding, the query may be flattened.
                The Paged Attention operator need `q_seq_lens` to obtain the length information.
        Returns:
            Tensor, The loss or (logits, tokens, input_mask) of the network.
        """
        if hasattr(self, 'llm_boost'):
            # The boost backend takes over the whole forward pass.
            if not self.is_set_kvcache:
                self.llm_boost.set_kvcache()
                self.is_set_kvcache = True
            self.llm_boost.add_flags(is_first_iteration=self.is_first_iteration)
            llm_boost_inputs["cos_embed"] = self.model.freqs_mgr.freqs_cos
            llm_boost_inputs["sin_embed"] = self.model.freqs_mgr.freqs_sin
            return self.llm_boost.forward(llm_boost_inputs)

        bsz, seqlen = self.shape(input_ids)
        if self.use_past:
            if not isinstance(batch_valid_length, Tensor):
                batch_valid_length = self.ones((bsz,), mstype.int32)
        if self.training:
            # Training feeds tokens [0, seqlen - 1); labels are the inputs shifted by one position.
            tokens = self.slice(input_ids, (0, 0), (bsz, seqlen - 1), (1, 1))
        else:
            tokens = input_ids
        if batch_valid_length is not None:
            batch_valid_length = self.reshape(batch_valid_length, (-1,))
        output = self.model(tokens, input_embeds, batch_valid_length, batch_index, zactivate_len, block_tables,
                            slot_mapping, prefix_keys_values, attention_mask, position_ids, q_seq_lens)
        # Gather only the hidden state of the last valid token before the LM head when possible.
        pre_gather = (not self.use_past or self.is_first_iteration) and batch_valid_length is not None
        if self.parallel_decoding and self.is_first_iteration:
            output = output.reshape(-1, output.shape[-1])
            output = output[self.sub_batch_valid_len(batch_valid_length, 1)]
        elif pre_gather:
            if self.disable_custom_fa:
                # Flattened prefill: last-token indices are the cumulative valid lengths minus one.
                batch_valid_length = mint.cumsum(batch_valid_length, 0)
                output = self.prefill_gather_flatten(output, self.sub_batch_valid_len(batch_valid_length, 1), 1)
            else:
                output = self.gather(output, self.sub_batch_valid_len(batch_valid_length, 1), 1)
        logits = self.lm_head(output)
        input_mask = self.cast(self.not_equal(tokens, self.pad_token_id), mstype.float32)
        if labels is None:
            labels = self.slice(input_ids, (0, 1), (bsz, seqlen), (1, 1))
        else:
            if labels.ndim > 1:
                if self.training:
                    labels = self.slice(labels, (0, 1), (bsz, seqlen), (1, 1))
                # Positions whose label equals `ignore_token_id` are masked out as well.
                label_mask = self.cast(self.not_equal(labels, self.ignore_token_id), mstype.float32)
                input_mask = self.mul(input_mask, label_mask)

        if not self.training:
            logits = self.cast(logits, mstype.float32)
            if self.predict_run_mode:
                logits = self.reshape(logits, (-1, logits.shape[-1]))
                return logits
            return logits, tokens, input_mask

        if logits.ndim > 2:
            logits = self.reshape(logits, (-1, logits.shape[-1]))
        logits = self.cast(logits, mstype.float32)
        labels = self.reshape(labels, (-1,))
        input_mask = self.reshape(input_mask, (-1,))
        loss = self.loss(logits, labels, input_mask)
        return loss

    def kvcache(self, layer_idx):
        """Return the ``(key_cache, value_cache)`` pair of the paged attention manager of layer ``layer_idx``."""
        paged_attention_mgr = self.model.layers[layer_idx].attention.infer_attention.paged_attention_mgr
        key_cache = paged_attention_mgr.key_cache
        value_cache = paged_attention_mgr.value_cache
        return key_cache, value_cache

    def convert_name(self, weight_name):
        """convert HuggingFace weight name to MindFormers weight name"""
        # Order matters: the last rule relies on 'embed_tokens.' having been renamed by the first one.
        replacements = (
            ('embed_tokens.', 'tok_embeddings.'),
            ('.self_attn.q_proj.', '.attention.wq.'),
            ('.self_attn.k_proj.', '.attention.wk.'),
            ('.self_attn.v_proj.', '.attention.wv.'),
            ('.self_attn.o_proj.', '.attention.wo.'),
            ('.mlp.gate_proj.', '.feed_forward.w1.'),
            ('.mlp.down_proj.', '.feed_forward.w2.'),
            ('.mlp.up_proj.', '.feed_forward.w3.'),
            ('.input_layernorm.', '.attention_norm.'),
            ('.post_attention_layernorm.', '.ffn_norm.'),
            ('.norm.', '.norm_out.'),
            ('output.', 'lm_head.'),
            ('.tok_embeddings.weight', '.tok_embeddings.embedding_weight'),
        )
        for old, new in replacements:
            weight_name = weight_name.replace(old, new)
        return weight_name

    def convert_weight_dict(self, source_dict):
        """convert HuggingFace weight dict to MindFormers weight dict"""
        return {self.convert_name(k): v for k, v in source_dict.items()}

    def convert_map_dict(self, source_dict):
        """convert HuggingFace map dict to MindFormers map dict"""
        return {self.convert_name(k): v for k, v in source_dict.items()}