
vllm.model_executor.models.bart

PyTorch BART model.

logger module-attribute

logger = get_logger(__name__)

BartCrossAttention

Bases: Module

Source code in vllm/model_executor/models/bart.py
class BartCrossAttention(nn.Module):

    def __init__(
        self,
        embed_dim: int,
        num_heads: int,
        bias: bool = True,
        config: Optional[BartConfig] = None,
        cache_config: Optional[CacheConfig] = None,
        quant_config: Optional[QuantizationConfig] = None,
        prefix: str = "",
    ):
        super().__init__()
        self.d_model = config.d_model
        self.embed_dim = embed_dim
        self.total_num_heads = num_heads
        self.total_num_kv_heads = self.total_num_heads
        self.head_dim = embed_dim // num_heads
        self.config = config

        if (self.head_dim * num_heads) != self.embed_dim:
            raise ValueError(f"embed_dim must be divisible by num_heads "
                             f"(got `embed_dim`: {self.embed_dim}"
                             f" and `num_heads`: {num_heads}).")
        self.scaling = self.head_dim**-0.5

        # TP sharding sizes is accounted for within "*Parallel" layers.
        self.qkv_proj = QKVCrossParallelLinear(self.d_model,
                                               self.d_model //
                                               self.total_num_heads,
                                               self.total_num_heads,
                                               self.total_num_kv_heads,
                                               bias,
                                               quant_config=quant_config)

        self.out_proj = RowParallelLinear(
            embed_dim,
            embed_dim,
            bias=bias,
            quant_config=quant_config,
        )

        tp_world_size = get_tensor_model_parallel_world_size()
        assert self.total_num_heads % tp_world_size == 0
        self.num_heads = self.total_num_heads // tp_world_size

        if self.total_num_kv_heads >= tp_world_size:
            # Number of KV heads is greater than TP size, so we partition
            # the KV heads across multiple tensor parallel GPUs.
            assert self.total_num_kv_heads % tp_world_size == 0
        else:
            # Number of KV heads is less than TP size, so we replicate
            # the KV heads across multiple tensor parallel GPUs.
            assert tp_world_size % self.total_num_kv_heads == 0
        self.num_kv_heads = self.num_heads  # No GQA in bart
        self.attn = Attention(self.num_heads,
                              self.head_dim,
                              self.scaling,
                              num_kv_heads=self.num_kv_heads,
                              cache_config=cache_config,
                              quant_config=quant_config,
                              prefix=f"{prefix}.attn",
                              attn_type=AttentionType.ENCODER_DECODER)

    def forward(
        self,
        decoder_hidden_states: torch.Tensor,
        encoder_hidden_states: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        """Input shape: Batch x Time x Channel"""

        q, k, v = self.qkv_proj(decoder_hidden_states, encoder_hidden_states)

        attn_output = self.attn(q, k, v)

        output, _ = self.out_proj(attn_output)
        return output
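
The head bookkeeping above is plain integer arithmetic. A minimal worked example, assuming hypothetical BART-large-like sizes (d_model = 1024, 16 attention heads) and a tensor-parallel world size of 4:

# Hypothetical sizes, not read from a real BartConfig.
embed_dim = 1024          # config.d_model
total_num_heads = 16      # decoder_attention_heads
tp_world_size = 4         # get_tensor_model_parallel_world_size()

head_dim = embed_dim // total_num_heads          # 64
assert head_dim * total_num_heads == embed_dim   # mirrors the ValueError check
scaling = head_dim ** -0.5                       # 0.125, applied to Q @ K^T

assert total_num_heads % tp_world_size == 0
num_heads = total_num_heads // tp_world_size     # 4 heads per TP rank
num_kv_heads = num_heads                         # no GQA in BART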

attn instance-attribute

attn = Attention(
    num_heads,
    head_dim,
    scaling,
    num_kv_heads=num_kv_heads,
    cache_config=cache_config,
    quant_config=quant_config,
    prefix=f"{prefix}.attn",
    attn_type=ENCODER_DECODER,
)

config instance-attribute

config = config

d_model instance-attribute

d_model = d_model

embed_dim instance-attribute

embed_dim = embed_dim

head_dim instance-attribute

head_dim = embed_dim // num_heads

num_heads instance-attribute

num_heads = total_num_heads // tp_world_size

num_kv_heads instance-attribute

num_kv_heads = num_heads

out_proj instance-attribute

out_proj = RowParallelLinear(
    embed_dim,
    embed_dim,
    bias=bias,
    quant_config=quant_config,
)

qkv_proj instance-attribute

qkv_proj = QKVCrossParallelLinear(
    d_model,
    d_model // total_num_heads,
    total_num_heads,
    total_num_kv_heads,
    bias,
    quant_config=quant_config,
)

scaling instance-attribute

scaling = head_dim ** -0.5

total_num_heads instance-attribute

total_num_heads = num_heads

total_num_kv_heads instance-attribute

total_num_kv_heads = total_num_heads

__init__

__init__(
    embed_dim: int,
    num_heads: int,
    bias: bool = True,
    config: Optional[BartConfig] = None,
    cache_config: Optional[CacheConfig] = None,
    quant_config: Optional[QuantizationConfig] = None,
    prefix: str = "",
)
Source code in vllm/model_executor/models/bart.py
def __init__(
    self,
    embed_dim: int,
    num_heads: int,
    bias: bool = True,
    config: Optional[BartConfig] = None,
    cache_config: Optional[CacheConfig] = None,
    quant_config: Optional[QuantizationConfig] = None,
    prefix: str = "",
):
    super().__init__()
    self.d_model = config.d_model
    self.embed_dim = embed_dim
    self.total_num_heads = num_heads
    self.total_num_kv_heads = self.total_num_heads
    self.head_dim = embed_dim // num_heads
    self.config = config

    if (self.head_dim * num_heads) != self.embed_dim:
        raise ValueError(f"embed_dim must be divisible by num_heads "
                         f"(got `embed_dim`: {self.embed_dim}"
                         f" and `num_heads`: {num_heads}).")
    self.scaling = self.head_dim**-0.5

    # TP sharding sizes is accounted for within "*Parallel" layers.
    self.qkv_proj = QKVCrossParallelLinear(self.d_model,
                                           self.d_model //
                                           self.total_num_heads,
                                           self.total_num_heads,
                                           self.total_num_kv_heads,
                                           bias,
                                           quant_config=quant_config)

    self.out_proj = RowParallelLinear(
        embed_dim,
        embed_dim,
        bias=bias,
        quant_config=quant_config,
    )

    tp_world_size = get_tensor_model_parallel_world_size()
    assert self.total_num_heads % tp_world_size == 0
    self.num_heads = self.total_num_heads // tp_world_size

    if self.total_num_kv_heads >= tp_world_size:
        # Number of KV heads is greater than TP size, so we partition
        # the KV heads across multiple tensor parallel GPUs.
        assert self.total_num_kv_heads % tp_world_size == 0
    else:
        # Number of KV heads is less than TP size, so we replicate
        # the KV heads across multiple tensor parallel GPUs.
        assert tp_world_size % self.total_num_kv_heads == 0
    self.num_kv_heads = self.num_heads  # No GQA in bart
    self.attn = Attention(self.num_heads,
                          self.head_dim,
                          self.scaling,
                          num_kv_heads=self.num_kv_heads,
                          cache_config=cache_config,
                          quant_config=quant_config,
                          prefix=f"{prefix}.attn",
                          attn_type=AttentionType.ENCODER_DECODER)

forward

forward(
    decoder_hidden_states: Tensor,
    encoder_hidden_states: Optional[Tensor] = None,
) -> Tensor

Input shape: Batch x Time x Channel

Source code in vllm/model_executor/models/bart.py
def forward(
    self,
    decoder_hidden_states: torch.Tensor,
    encoder_hidden_states: Optional[torch.Tensor] = None,
) -> torch.Tensor:
    """Input shape: Batch x Time x Channel"""

    q, k, v = self.qkv_proj(decoder_hidden_states, encoder_hidden_states)

    attn_output = self.attn(q, k, v)

    output, _ = self.out_proj(attn_output)
    return output
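
In this forward pass, queries come from the decoder stream while keys and values come from the encoder output. A self-contained PyTorch sketch of that data flow, using separate nn.Linear projections and scaled_dot_product_attention in place of vLLM's fused QKVCrossParallelLinear and paged Attention layer (all sizes here are illustrative):

import torch
import torch.nn.functional as F

batch, dec_len, enc_len, embed_dim, num_heads = 2, 5, 7, 64, 4
head_dim = embed_dim // num_heads

# Stand-ins for the fused QKVCrossParallelLinear and RowParallelLinear projections.
q_proj = torch.nn.Linear(embed_dim, embed_dim)
kv_proj = torch.nn.Linear(embed_dim, 2 * embed_dim)
out_proj = torch.nn.Linear(embed_dim, embed_dim)

decoder_hidden_states = torch.randn(batch, dec_len, embed_dim)
encoder_hidden_states = torch.randn(batch, enc_len, embed_dim)

def split_heads(x: torch.Tensor) -> torch.Tensor:
    # (batch, seq, embed) -> (batch, heads, seq, head_dim)
    return x.view(x.shape[0], x.shape[1], num_heads, head_dim).transpose(1, 2)

# Q is projected from the decoder; K and V are projected from the encoder output.
q = q_proj(decoder_hidden_states)
k, v = kv_proj(encoder_hidden_states).chunk(2, dim=-1)

attn = F.scaled_dot_product_attention(split_heads(q), split_heads(k), split_heads(v))
output = out_proj(attn.transpose(1, 2).reshape(batch, dec_len, embed_dim))
# output.shape == (batch, dec_len, embed_dim)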

BartDecoder

Bases: Module

Transformer decoder consisting of config.decoder_layers layers. Each layer is a BartDecoderLayer.

Parameters:

    config: BartConfig
    embed_tokens (nn.Embedding): output embedding

Source code in vllm/model_executor/models/bart.py
class BartDecoder(nn.Module):
    """
    Transformer decoder consisting of *config.decoder_layers* layers.
    Each layer is a [`BartDecoderLayer`]
    Args:
        config: BartConfig
        embed_tokens (nn.Embedding): output embedding
    """

    def __init__(
        self,
        config: BartConfig,
        cache_config: Optional[CacheConfig] = None,
        quant_config: Optional[QuantizationConfig] = None,
        lora_config: Optional[LoRAConfig] = None,
        embed_tokens: Optional[nn.Embedding] = None,
        prefix: str = "",
    ):
        super().__init__()
        self.cache_config = cache_config
        self.quant_config = quant_config
        self.lora_config = lora_config
        self.max_target_positions = config.max_position_embeddings
        embed_scale = math.sqrt(
            config.d_model) if config.scale_embedding else 1.0

        self.embed_tokens = BartScaledWordEmbedding(config.vocab_size,
                                                    config.d_model,
                                                    embed_scale=embed_scale)

        if embed_tokens is not None:
            self.embed_tokens.weight = embed_tokens.weight

        self.embed_positions = BartLearnedPositionalEmbedding(
            config.max_position_embeddings,
            config.d_model,
        )

        self.layers = nn.ModuleList(
            [BartDecoderLayer(config,cache_config,quant_config,
            prefix=f"{prefix}.layers.{layer_idx}") \
             for layer_idx in range(config.decoder_layers)])

        self.layernorm_embedding = nn.LayerNorm(config.d_model)

    def forward(
        self,
        decoder_input_ids: torch.Tensor,
        decoder_positions: torch.Tensor,
        encoder_hidden_states: Optional[torch.Tensor],
        inputs_embeds: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        r"""
        Args:
            decoder_input_ids: Indices of *decoder* input sequence tokens 
                in the vocabulary.
                Padding will be ignored by default should you provide it.
            decoder_positions: Positions of *decoder* input sequence tokens.
            encoder_hidden_states: Tensor of encoder output embeddings.
        Returns:
            Decoder output torch.Tensor
        """
        if inputs_embeds is None:
            inputs_embeds = self.embed_tokens(decoder_input_ids)
        else:
            decoder_positions = inputs_embeds[:, -1]

        # embed positions
        embed_pos = self.embed_positions(decoder_positions)
        embed_pos = embed_pos.to(inputs_embeds.device)

        hidden_states = inputs_embeds + embed_pos
        hidden_states = self.layernorm_embedding(hidden_states)

        # decoder layers

        for decoder_layer in self.layers:
            hidden_states = decoder_layer(
                decoder_hidden_states=hidden_states,
                encoder_hidden_states=encoder_hidden_states,
            )

        return hidden_states

cache_config instance-attribute

cache_config = cache_config

embed_positions instance-attribute

embed_positions = BartLearnedPositionalEmbedding(
    max_position_embeddings, d_model
)

embed_tokens instance-attribute

embed_tokens = BartScaledWordEmbedding(
    vocab_size, d_model, embed_scale=embed_scale
)

layernorm_embedding instance-attribute

layernorm_embedding = LayerNorm(d_model)

layers instance-attribute

layers = ModuleList(
    [
        (
            BartDecoderLayer(
                config,
                cache_config,
                quant_config,
                prefix=f"{prefix}.layers.{layer_idx}",
            )
        )
        for layer_idx in (range(decoder_layers))
    ]
)

lora_config instance-attribute

lora_config = lora_config

max_target_positions instance-attribute

max_target_positions = max_position_embeddings

quant_config instance-attribute

quant_config = quant_config

__init__

__init__(
    config: BartConfig,
    cache_config: Optional[CacheConfig] = None,
    quant_config: Optional[QuantizationConfig] = None,
    lora_config: Optional[LoRAConfig] = None,
    embed_tokens: Optional[Embedding] = None,
    prefix: str = "",
)
Source code in vllm/model_executor/models/bart.py
def __init__(
    self,
    config: BartConfig,
    cache_config: Optional[CacheConfig] = None,
    quant_config: Optional[QuantizationConfig] = None,
    lora_config: Optional[LoRAConfig] = None,
    embed_tokens: Optional[nn.Embedding] = None,
    prefix: str = "",
):
    super().__init__()
    self.cache_config = cache_config
    self.quant_config = quant_config
    self.lora_config = lora_config
    self.max_target_positions = config.max_position_embeddings
    embed_scale = math.sqrt(
        config.d_model) if config.scale_embedding else 1.0

    self.embed_tokens = BartScaledWordEmbedding(config.vocab_size,
                                                config.d_model,
                                                embed_scale=embed_scale)

    if embed_tokens is not None:
        self.embed_tokens.weight = embed_tokens.weight

    self.embed_positions = BartLearnedPositionalEmbedding(
        config.max_position_embeddings,
        config.d_model,
    )

    self.layers = nn.ModuleList(
        [BartDecoderLayer(config,cache_config,quant_config,
        prefix=f"{prefix}.layers.{layer_idx}") \
         for layer_idx in range(config.decoder_layers)])

    self.layernorm_embedding = nn.LayerNorm(config.d_model)

forward

forward(
    decoder_input_ids: Tensor,
    decoder_positions: Tensor,
    encoder_hidden_states: Optional[Tensor],
    inputs_embeds: Optional[Tensor] = None,
) -> Tensor

Parameters:

    decoder_input_ids (Tensor, required):
        Indices of decoder input sequence tokens in the vocabulary.
        Padding will be ignored by default should you provide it.
    decoder_positions (Tensor, required):
        Positions of decoder input sequence tokens.
    encoder_hidden_states (Optional[Tensor], required):
        Tensor of encoder output embeddings.

Returns:

    Decoder output torch.Tensor

Source code in vllm/model_executor/models/bart.py
def forward(
    self,
    decoder_input_ids: torch.Tensor,
    decoder_positions: torch.Tensor,
    encoder_hidden_states: Optional[torch.Tensor],
    inputs_embeds: Optional[torch.Tensor] = None,
) -> torch.Tensor:
    r"""
    Args:
        decoder_input_ids: Indices of *decoder* input sequence tokens 
            in the vocabulary.
            Padding will be ignored by default should you provide it.
        decoder_positions: Positions of *decoder* input sequence tokens.
        encoder_hidden_states: Tensor of encoder output embeddings.
    Returns:
        Decoder output torch.Tensor
    """
    if inputs_embeds is None:
        inputs_embeds = self.embed_tokens(decoder_input_ids)
    else:
        decoder_positions = inputs_embeds[:, -1]

    # embed positions
    embed_pos = self.embed_positions(decoder_positions)
    embed_pos = embed_pos.to(inputs_embeds.device)

    hidden_states = inputs_embeds + embed_pos
    hidden_states = self.layernorm_embedding(hidden_states)

    # decoder layers

    for decoder_layer in self.layers:
        hidden_states = decoder_layer(
            decoder_hidden_states=hidden_states,
            encoder_hidden_states=encoder_hidden_states,
        )

    return hidden_states
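
Before the decoder layers run, token embeddings are scaled (by sqrt(d_model) when config.scale_embedding is set), added to learned positional embeddings, and normalized by layernorm_embedding. A rough standalone sketch of that preamble, with plain nn.Embedding standing in for the vLLM embedding classes and made-up sizes:

import math
import torch
import torch.nn as nn

vocab_size, d_model, max_positions = 100, 32, 64
scale_embedding = True
embed_scale = math.sqrt(d_model) if scale_embedding else 1.0

embed_tokens = nn.Embedding(vocab_size, d_model)        # BartScaledWordEmbedding stand-in
embed_positions = nn.Embedding(max_positions, d_model)  # BartLearnedPositionalEmbedding stand-in
layernorm_embedding = nn.LayerNorm(d_model)

decoder_input_ids = torch.tensor([[2, 14, 7, 9]])
decoder_positions = torch.arange(decoder_input_ids.shape[1]).unsqueeze(0)

inputs_embeds = embed_tokens(decoder_input_ids) * embed_scale
hidden_states = layernorm_embedding(inputs_embeds + embed_positions(decoder_positions))
# hidden_states then flows through each BartDecoderLayer in turn.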

BartDecoderLayer

Bases: Module

Source code in vllm/model_executor/models/bart.py
class BartDecoderLayer(nn.Module):

    def __init__(
        self,
        config: BartConfig,
        cache_config: Optional[CacheConfig] = None,
        quant_config: Optional[QuantizationConfig] = None,
        prefix: str = "",
    ):
        super().__init__()
        self.embed_dim = config.d_model

        self.self_attn = BartDecoderSelfAttention(
            embed_dim=self.embed_dim,
            num_heads=config.decoder_attention_heads,
            config=config,
            cache_config=cache_config,
            quant_config=quant_config,
            prefix=f"{prefix}.self_attn",
        )
        self.activation_fn = get_act_fn(config.activation_function)

        self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
        '''
        afeldman-nm: personally I would call this "cross-attention",
        however I left the name as "encoder_attn" to maintain consistency
        with the name of the pretrained weights.
        '''
        self.encoder_attn = BartCrossAttention(
            self.embed_dim,
            config.decoder_attention_heads,
            config=config,
            prefix=f"{prefix}.encoder_attn",
        )
        self.encoder_attn_layer_norm = nn.LayerNorm(self.embed_dim)

        ffn_hidden_size = self.embed_dim
        ffn_intermediate_size = config.encoder_ffn_dim
        ffn_has_bias = True
        self.fc1 = ColumnParallelLinear(
            ffn_hidden_size,
            ffn_intermediate_size,
            bias=ffn_has_bias,
            quant_config=quant_config,
        )
        self.fc2 = RowParallelLinear(
            ffn_intermediate_size,
            ffn_hidden_size,
            bias=ffn_has_bias,
            quant_config=quant_config,
        )

        self.final_layer_norm = nn.LayerNorm(self.embed_dim)

    def forward(
        self,
        decoder_hidden_states: torch.Tensor,
        encoder_hidden_states: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        r"""
        Args:
            decoder_hidden_states: torch.Tensor of *decoder* input embeddings.
            encoder_hidden_states: torch.Tensor of *encoder* input embeddings.
        Returns:
            Decoder layer output torch.Tensor
        """
        residual = decoder_hidden_states

        # Self Attention
        hidden_states = self.self_attn(hidden_states=decoder_hidden_states)

        hidden_states = residual + hidden_states
        hidden_states = self.self_attn_layer_norm(hidden_states)

        # Cross-Attention Block

        residual = hidden_states

        hidden_states = self.encoder_attn(
            decoder_hidden_states=hidden_states,
            encoder_hidden_states=encoder_hidden_states,
        )

        hidden_states = residual + hidden_states
        hidden_states = self.encoder_attn_layer_norm(hidden_states)

        # Fully Connected
        residual = hidden_states
        fc1_out, _ = self.fc1(hidden_states)
        hidden_states = self.activation_fn(fc1_out)

        hidden_states, _ = self.fc2(hidden_states)

        hidden_states = residual + hidden_states
        hidden_states = self.final_layer_norm(hidden_states)

        return hidden_states

activation_fn instance-attribute

activation_fn = get_act_fn(activation_function)

embed_dim instance-attribute

embed_dim = d_model

encoder_attn instance-attribute

encoder_attn = BartCrossAttention(
    embed_dim,
    decoder_attention_heads,
    config=config,
    prefix=f"{prefix}.encoder_attn",
)

encoder_attn_layer_norm instance-attribute

encoder_attn_layer_norm = LayerNorm(embed_dim)

fc1 instance-attribute

fc1 = ColumnParallelLinear(
    ffn_hidden_size,
    ffn_intermediate_size,
    bias=ffn_has_bias,
    quant_config=quant_config,
)

fc2 instance-attribute

fc2 = RowParallelLinear(
    ffn_intermediate_size,
    ffn_hidden_size,
    bias=ffn_has_bias,
    quant_config=quant_config,
)

final_layer_norm instance-attribute

final_layer_norm = LayerNorm(embed_dim)

self_attn instance-attribute

self_attn = BartDecoderSelfAttention(
    embed_dim=embed_dim,
    num_heads=decoder_attention_heads,
    config=config,
    cache_config=cache_config,
    quant_config=quant_config,
    prefix=f"{prefix}.self_attn",
)

self_attn_layer_norm instance-attribute

self_attn_layer_norm = LayerNorm(embed_dim)

afeldman-nm: personally I would call this "cross-attention", however I left the name as "encoder_attn" to maintain consistency with the name of the pretrained weights.

__init__

__init__(
    config: BartConfig,
    cache_config: Optional[CacheConfig] = None,
    quant_config: Optional[QuantizationConfig] = None,
    prefix: str = "",
)
Source code in vllm/model_executor/models/bart.py
def __init__(
    self,
    config: BartConfig,
    cache_config: Optional[CacheConfig] = None,
    quant_config: Optional[QuantizationConfig] = None,
    prefix: str = "",
):
    super().__init__()
    self.embed_dim = config.d_model

    self.self_attn = BartDecoderSelfAttention(
        embed_dim=self.embed_dim,
        num_heads=config.decoder_attention_heads,
        config=config,
        cache_config=cache_config,
        quant_config=quant_config,
        prefix=f"{prefix}.self_attn",
    )
    self.activation_fn = get_act_fn(config.activation_function)

    self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
    '''
    afeldman-nm: personally I would call this "cross-attention",
    however I left the name as "encoder_attn" to maintain consistency
    with the name of the pretrained weights.
    '''
    self.encoder_attn = BartCrossAttention(
        self.embed_dim,
        config.decoder_attention_heads,
        config=config,
        prefix=f"{prefix}.encoder_attn",
    )
    self.encoder_attn_layer_norm = nn.LayerNorm(self.embed_dim)

    ffn_hidden_size = self.embed_dim
    ffn_intermediate_size = config.encoder_ffn_dim
    ffn_has_bias = True
    self.fc1 = ColumnParallelLinear(
        ffn_hidden_size,
        ffn_intermediate_size,
        bias=ffn_has_bias,
        quant_config=quant_config,
    )
    self.fc2 = RowParallelLinear(
        ffn_intermediate_size,
        ffn_hidden_size,
        bias=ffn_has_bias,
        quant_config=quant_config,
    )

    self.final_layer_norm = nn.LayerNorm(self.embed_dim)

forward

forward(
    decoder_hidden_states: Tensor,
    encoder_hidden_states: Optional[Tensor] = None,
) -> Tensor

Parameters:

    decoder_hidden_states (Tensor, required):
        torch.Tensor of decoder input embeddings.
    encoder_hidden_states (Optional[Tensor], default None):
        torch.Tensor of encoder input embeddings.

Returns:

    Decoder layer output torch.Tensor

Source code in vllm/model_executor/models/bart.py
def forward(
    self,
    decoder_hidden_states: torch.Tensor,
    encoder_hidden_states: Optional[torch.Tensor] = None,
) -> torch.Tensor:
    r"""
    Args:
        decoder_hidden_states: torch.Tensor of *decoder* input embeddings.
        encoder_hidden_states: torch.Tensor of *encoder* input embeddings.
    Returns:
        Decoder layer output torch.Tensor
    """
    residual = decoder_hidden_states

    # Self Attention
    hidden_states = self.self_attn(hidden_states=decoder_hidden_states)

    hidden_states = residual + hidden_states
    hidden_states = self.self_attn_layer_norm(hidden_states)

    # Cross-Attention Block

    residual = hidden_states

    hidden_states = self.encoder_attn(
        decoder_hidden_states=hidden_states,
        encoder_hidden_states=encoder_hidden_states,
    )

    hidden_states = residual + hidden_states
    hidden_states = self.encoder_attn_layer_norm(hidden_states)

    # Fully Connected
    residual = hidden_states
    fc1_out, _ = self.fc1(hidden_states)
    hidden_states = self.activation_fn(fc1_out)

    hidden_states, _ = self.fc2(hidden_states)

    hidden_states = residual + hidden_states
    hidden_states = self.final_layer_norm(hidden_states)

    return hidden_states
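
Each decoder layer applies the post-layer-norm residual pattern three times: self-attention, cross-attention, then the feed-forward block. A compact sketch of that wiring with placeholder sub-modules (the real layer uses the attention and parallel linear classes documented above):

import torch
import torch.nn as nn

d_model, ffn_dim = 32, 64
self_attn = nn.Identity()      # placeholder for BartDecoderSelfAttention
cross_attn = lambda x, enc: x  # placeholder for BartCrossAttention
ffn = nn.Sequential(nn.Linear(d_model, ffn_dim), nn.GELU(), nn.Linear(ffn_dim, d_model))
norm1, norm2, norm3 = (nn.LayerNorm(d_model) for _ in range(3))

def decoder_layer(x: torch.Tensor, encoder_hidden_states: torch.Tensor) -> torch.Tensor:
    x = norm1(x + self_attn(x))                          # self-attention, then add & norm
    x = norm2(x + cross_attn(x, encoder_hidden_states))  # cross-attention, then add & norm
    x = norm3(x + ffn(x))                                # feed-forward, then add & norm
    return x

out = decoder_layer(torch.randn(2, 5, d_model), torch.randn(2, 7, d_model))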

BartDecoderSelfAttention

Bases: Module

Source code in vllm/model_executor/models/bart.py
class BartDecoderSelfAttention(nn.Module):

    def __init__(
        self,
        embed_dim: int,
        num_heads: int,
        bias: bool = True,
        config: Optional[BartConfig] = None,
        cache_config: Optional[CacheConfig] = None,
        quant_config: Optional[QuantizationConfig] = None,
        prefix: str = "",
    ):
        super().__init__()
        self.d_model = config.d_model
        self.embed_dim = embed_dim
        self.total_num_heads = num_heads
        self.total_num_kv_heads = self.total_num_heads
        self.head_dim = embed_dim // num_heads
        self.config = config

        if (self.head_dim * num_heads) != self.embed_dim:
            raise ValueError(f"embed_dim must be divisible by num_heads "
                             f"(got `embed_dim`: {self.embed_dim}"
                             f" and `num_heads`: {num_heads}).")
        self.scaling = self.head_dim**-0.5

        self.qkv_proj = QKVParallelLinear(
            self.d_model,
            self.d_model // self.total_num_heads,
            self.total_num_heads,
            self.total_num_kv_heads,
            bias=bias,
            quant_config=quant_config,
        )

        self.out_proj = RowParallelLinear(
            embed_dim,
            embed_dim,
            bias=bias,
            quant_config=quant_config,
        )

        tp_world_size = get_tensor_model_parallel_world_size()
        assert self.total_num_heads % tp_world_size == 0
        self.num_heads = self.total_num_heads // tp_world_size

        if self.total_num_kv_heads >= tp_world_size:
            # Number of KV heads is greater than TP size, so we partition
            # the KV heads across multiple tensor parallel GPUs.
            assert self.total_num_kv_heads % tp_world_size == 0
        else:
            # Number of KV heads is less than TP size, so we replicate
            # the KV heads across multiple tensor parallel GPUs.
            assert tp_world_size % self.total_num_kv_heads == 0
        self.num_kv_heads = self.num_heads
        self.q_size = self.num_heads * self.head_dim
        self.kv_size = self.num_kv_heads * self.head_dim

        self.attn = Attention(self.num_heads,
                              self.head_dim,
                              self.scaling,
                              num_kv_heads=self.num_kv_heads,
                              cache_config=cache_config,
                              quant_config=quant_config,
                              prefix=f"{prefix}.attn",
                              attn_type=AttentionType.DECODER)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        """Input shape: Batch x Time x Channel"""

        qkv, _ = self.qkv_proj(hidden_states)
        q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)

        attn_output = self.attn(q, k, v)

        output, _ = self.out_proj(attn_output)
        return output

attn instance-attribute

attn = Attention(
    num_heads,
    head_dim,
    scaling,
    num_kv_heads=num_kv_heads,
    cache_config=cache_config,
    quant_config=quant_config,
    prefix=f"{prefix}.attn",
    attn_type=DECODER,
)

config instance-attribute

config = config

d_model instance-attribute

d_model = d_model

embed_dim instance-attribute

embed_dim = embed_dim

head_dim instance-attribute

head_dim = embed_dim // num_heads

kv_size instance-attribute

kv_size = num_kv_heads * head_dim

num_heads instance-attribute

num_heads = total_num_heads // tp_world_size

num_kv_heads instance-attribute

num_kv_heads = num_heads

out_proj instance-attribute

out_proj = RowParallelLinear(
    embed_dim,
    embed_dim,
    bias=bias,
    quant_config=quant_config,
)

q_size instance-attribute

q_size = num_heads * head_dim

qkv_proj instance-attribute

qkv_proj = QKVParallelLinear(
    d_model,
    d_model // total_num_heads,
    total_num_heads,
    total_num_kv_heads,
    bias=bias,
    quant_config=quant_config,
)

scaling instance-attribute

scaling = head_dim ** -0.5

total_num_heads instance-attribute

total_num_heads = num_heads

total_num_kv_heads instance-attribute

total_num_kv_heads = total_num_heads

__init__

__init__(
    embed_dim: int,
    num_heads: int,
    bias: bool = True,
    config: Optional[BartConfig] = None,
    cache_config: Optional[CacheConfig] = None,
    quant_config: Optional[QuantizationConfig] = None,
    prefix: str = "",
)
Source code in vllm/model_executor/models/bart.py
def __init__(
    self,
    embed_dim: int,
    num_heads: int,
    bias: bool = True,
    config: Optional[BartConfig] = None,
    cache_config: Optional[CacheConfig] = None,
    quant_config: Optional[QuantizationConfig] = None,
    prefix: str = "",
):
    super().__init__()
    self.d_model = config.d_model
    self.embed_dim = embed_dim
    self.total_num_heads = num_heads
    self.total_num_kv_heads = self.total_num_heads
    self.head_dim = embed_dim // num_heads
    self.config = config

    if (self.head_dim * num_heads) != self.embed_dim:
        raise ValueError(f"embed_dim must be divisible by num_heads "
                         f"(got `embed_dim`: {self.embed_dim}"
                         f" and `num_heads`: {num_heads}).")
    self.scaling = self.head_dim**-0.5

    self.qkv_proj = QKVParallelLinear(
        self.d_model,
        self.d_model // self.total_num_heads,
        self.total_num_heads,
        self.total_num_kv_heads,
        bias=bias,
        quant_config=quant_config,
    )

    self.out_proj = RowParallelLinear(
        embed_dim,
        embed_dim,
        bias=bias,
        quant_config=quant_config,
    )

    tp_world_size = get_tensor_model_parallel_world_size()
    assert self.total_num_heads % tp_world_size == 0
    self.num_heads = self.total_num_heads // tp_world_size

    if self.total_num_kv_heads >= tp_world_size:
        # Number of KV heads is greater than TP size, so we partition
        # the KV heads across multiple tensor parallel GPUs.
        assert self.total_num_kv_heads % tp_world_size == 0
    else:
        # Number of KV heads is less than TP size, so we replicate
        # the KV heads across multiple tensor parallel GPUs.
        assert tp_world_size % self.total_num_kv_heads == 0
    self.num_kv_heads = self.num_heads
    self.q_size = self.num_heads * self.head_dim
    self.kv_size = self.num_kv_heads * self.head_dim

    self.attn = Attention(self.num_heads,
                          self.head_dim,
                          self.scaling,
                          num_kv_heads=self.num_kv_heads,
                          cache_config=cache_config,
                          quant_config=quant_config,
                          prefix=f"{prefix}.attn",
                          attn_type=AttentionType.DECODER)

forward

forward(hidden_states: Tensor) -> Tensor

Input shape: Batch x Time x Channel

Source code in vllm/model_executor/models/bart.py
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
    """Input shape: Batch x Time x Channel"""

    qkv, _ = self.qkv_proj(hidden_states)
    q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)

    attn_output = self.attn(q, k, v)

    output, _ = self.out_proj(attn_output)
    return output
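
The fused self-attention projection returns a single tensor holding Q, K, and V side by side along the last dimension, and torch.Tensor.split with [q_size, kv_size, kv_size] slices it back apart. A tiny illustration with made-up sizes:

import torch

num_heads, num_kv_heads, head_dim, seq_len = 4, 4, 16, 3
q_size = num_heads * head_dim      # 64
kv_size = num_kv_heads * head_dim  # 64

# Stand-in for the QKVParallelLinear output: last dim is q_size + 2 * kv_size.
qkv = torch.randn(seq_len, q_size + 2 * kv_size)

q, k, v = qkv.split([q_size, kv_size, kv_size], dim=-1)
assert q.shape == (seq_len, q_size) and k.shape == v.shape == (seq_len, kv_size)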

BartEncoder

Bases: Module

Transformer encoder consisting of config.encoder_layers self attention layers. Each layer is a BartEncoderLayer.

Parameters:

    config: BartConfig
    embed_tokens (nn.Embedding): output embedding

Source code in vllm/model_executor/models/bart.py
class BartEncoder(nn.Module):
    """
    Transformer encoder consisting of *config.encoder_layers*
    self attention layers. Each layer is a [`BartEncoderLayer`].
    Args:
        config: BartConfig
        embed_tokens (nn.Embedding): output embedding
    """

    def __init__(self,
                 config: BartConfig,
                 cache_config: Optional[CacheConfig] = None,
                 quant_config: Optional[QuantizationConfig] = None,
                 lora_config: Optional[LoRAConfig] = None,
                 embed_tokens: Optional[nn.Embedding] = None,
                 prefix: str = ""):
        super().__init__()

        self.cache_config = cache_config
        self.quant_config = quant_config
        self.lora_config = lora_config
        embed_dim = config.d_model
        self.max_source_positions = config.max_position_embeddings
        embed_scale = math.sqrt(embed_dim) if config.scale_embedding else 1.0

        self.embed_tokens = BartScaledWordEmbedding(config.vocab_size,
                                                    embed_dim,
                                                    embed_scale=embed_scale)

        if embed_tokens is not None:
            self.embed_tokens.weight = embed_tokens.weight

        self.embed_positions = BartLearnedPositionalEmbedding(
            config.max_position_embeddings,
            embed_dim,
        )
        self.layers = nn.ModuleList([
            BartEncoderLayer(config,
                             cache_config,
                             quant_config,
                             prefix=f"{prefix}.layers.{layer_idx}")
            for layer_idx in range(config.encoder_layers)
        ])

        self.layernorm_embedding = nn.LayerNorm(embed_dim)

    def forward(
        self,
        input_ids: torch.Tensor,
        positions: torch.Tensor,
        inputs_embeds: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        r"""
        Args:
            input_ids: Indices of *encoder* input sequence tokens in the 
                vocabulary.
                Padding will be ignored by default should you provide it.
            positions: Positions of *encoder* input sequence tokens.
        Returns:
            Decoder output torch.Tensor
        """
        # retrieve input_ids and inputs_embeds
        if inputs_embeds is None:
            inputs_embeds = self.embed_tokens(input_ids)

        embed_pos = self.embed_positions(positions)
        embed_pos = embed_pos.to(inputs_embeds.device)

        hidden_states = inputs_embeds + embed_pos
        hidden_states = self.layernorm_embedding(hidden_states)

        for encoder_layer in self.layers:
            hidden_states = encoder_layer(hidden_states=hidden_states)

        return hidden_states

cache_config instance-attribute

cache_config = cache_config

embed_positions instance-attribute

embed_positions = BartLearnedPositionalEmbedding(
    max_position_embeddings, embed_dim
)

embed_tokens instance-attribute

embed_tokens = BartScaledWordEmbedding(
    vocab_size, embed_dim, embed_scale=embed_scale
)

layernorm_embedding instance-attribute

layernorm_embedding = LayerNorm(embed_dim)

layers instance-attribute

layers = ModuleList(
    [
        (
            BartEncoderLayer(
                config,
                cache_config,
                quant_config,
                prefix=f"{prefix}.layers.{layer_idx}",
            )
        )
        for layer_idx in (range(encoder_layers))
    ]
)

lora_config instance-attribute

lora_config = lora_config

max_source_positions instance-attribute

max_source_positions = max_position_embeddings

quant_config instance-attribute

quant_config = quant_config

__init__

__init__(
    config: BartConfig,
    cache_config: Optional[CacheConfig] = None,
    quant_config: Optional[QuantizationConfig] = None,
    lora_config: Optional[LoRAConfig] = None,
    embed_tokens: Optional[Embedding] = None,
    prefix: str = "",
)
Source code in vllm/model_executor/models/bart.py
def __init__(self,
             config: BartConfig,
             cache_config: Optional[CacheConfig] = None,
             quant_config: Optional[QuantizationConfig] = None,
             lora_config: Optional[LoRAConfig] = None,
             embed_tokens: Optional[nn.Embedding] = None,
             prefix: str = ""):
    super().__init__()

    self.cache_config = cache_config
    self.quant_config = quant_config
    self.lora_config = lora_config
    embed_dim = config.d_model
    self.max_source_positions = config.max_position_embeddings
    embed_scale = math.sqrt(embed_dim) if config.scale_embedding else 1.0

    self.embed_tokens = BartScaledWordEmbedding(config.vocab_size,
                                                embed_dim,
                                                embed_scale=embed_scale)

    if embed_tokens is not None:
        self.embed_tokens.weight = embed_tokens.weight

    self.embed_positions = BartLearnedPositionalEmbedding(
        config.max_position_embeddings,
        embed_dim,
    )
    self.layers = nn.ModuleList([
        BartEncoderLayer(config,
                         cache_config,
                         quant_config,
                         prefix=f"{prefix}.layers.{layer_idx}")
        for layer_idx in range(config.encoder_layers)
    ])

    self.layernorm_embedding = nn.LayerNorm(embed_dim)

forward

forward(
    input_ids: Tensor,
    positions: Tensor,
    inputs_embeds: Optional[Tensor] = None,
) -> Tensor

Parameters:

    input_ids (Tensor, required):
        Indices of encoder input sequence tokens in the vocabulary.
        Padding will be ignored by default should you provide it.
    positions (Tensor, required):
        Positions of encoder input sequence tokens.

Returns:

    Encoder output torch.Tensor

Source code in vllm/model_executor/models/bart.py
def forward(
    self,
    input_ids: torch.Tensor,
    positions: torch.Tensor,
    inputs_embeds: Optional[torch.Tensor] = None,
) -> torch.Tensor:
    r"""
    Args:
        input_ids: Indices of *encoder* input sequence tokens in the 
            vocabulary.
            Padding will be ignored by default should you provide it.
        positions: Positions of *encoder* input sequence tokens.
    Returns:
        Decoder output torch.Tensor
    """
    # retrieve input_ids and inputs_embeds
    if inputs_embeds is None:
        inputs_embeds = self.embed_tokens(input_ids)

    embed_pos = self.embed_positions(positions)
    embed_pos = embed_pos.to(inputs_embeds.device)

    hidden_states = inputs_embeds + embed_pos
    hidden_states = self.layernorm_embedding(hidden_states)

    for encoder_layer in self.layers:
        hidden_states = encoder_layer(hidden_states=hidden_states)

    return hidden_states
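
The encoder runs once per request; its output is then fed as encoder_hidden_states into every decoder step's cross-attention. A toy sketch of that data dependency with placeholder callables (not the vLLM scheduler or real modules):

import torch

d_model = 32
encoder = lambda x: x                                                # placeholder for BartEncoder.forward
decoder_step = lambda dec, enc: dec + enc.mean(dim=1, keepdim=True)  # placeholder decoder

encoder_hidden_states = encoder(torch.randn(1, 7, d_model))  # computed once per request

decoder_states = torch.randn(1, 1, d_model)
for _ in range(3):  # autoregressive steps all reuse the same encoder output
    decoder_states = decoder_step(decoder_states, encoder_hidden_states)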

BartEncoderAttention

Bases: Module

Source code in vllm/model_executor/models/bart.py
class BartEncoderAttention(nn.Module):

    def __init__(
        self,
        embed_dim: int,
        num_heads: int,
        bias: bool = True,
        config: Optional[BartConfig] = None,
        cache_config: Optional[CacheConfig] = None,
        quant_config: Optional[QuantizationConfig] = None,
        prefix: str = "",
    ):
        super().__init__()
        self.d_model = config.d_model
        self.embed_dim = embed_dim
        self.total_num_heads = num_heads
        self.total_num_kv_heads = self.total_num_heads
        self.head_dim = embed_dim // num_heads
        self.config = config

        if (self.head_dim * num_heads) != self.embed_dim:
            raise ValueError(f"embed_dim must be divisible by num_heads "
                             f"(got `embed_dim`: {self.embed_dim}"
                             f" and `num_heads`: {num_heads}).")
        self.scaling = self.head_dim**-0.5

        self.qkv_proj = QKVParallelLinear(
            self.d_model,
            self.d_model // self.total_num_heads,
            self.total_num_heads,
            self.total_num_kv_heads,
            bias=bias,
            quant_config=quant_config,
        )

        self.out_proj = RowParallelLinear(
            embed_dim,
            embed_dim,
            bias=bias,
            quant_config=quant_config,
        )

        tp_world_size = get_tensor_model_parallel_world_size()
        assert self.total_num_heads % tp_world_size == 0
        self.num_heads = self.total_num_heads // tp_world_size

        if self.total_num_kv_heads >= tp_world_size:
            # Number of KV heads is greater than TP size, so we partition
            # the KV heads across multiple tensor parallel GPUs.
            assert self.total_num_kv_heads % tp_world_size == 0
        else:
            # Number of KV heads is less than TP size, so we replicate
            # the KV heads across multiple tensor parallel GPUs.
            assert tp_world_size % self.total_num_kv_heads == 0
        self.num_kv_heads = self.num_heads
        self.q_size = self.num_heads * self.head_dim
        self.kv_size = self.num_kv_heads * self.head_dim

        self.attn = Attention(self.num_heads,
                              self.head_dim,
                              self.scaling,
                              num_kv_heads=self.num_kv_heads,
                              cache_config=cache_config,
                              quant_config=quant_config,
                              prefix=f"{prefix}.attn",
                              attn_type=AttentionType.ENCODER)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        """Input shape: Batch x Time x Channel"""

        qkv, _ = self.qkv_proj(hidden_states)
        q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)

        attn_output = self.attn(q, k, v)

        output, _ = self.out_proj(attn_output)
        return output

attn instance-attribute

attn = Attention(
    num_heads,
    head_dim,
    scaling,
    num_kv_heads=num_kv_heads,
    cache_config=cache_config,
    quant_config=quant_config,
    prefix=f"{prefix}.attn",
    attn_type=ENCODER,
)

config instance-attribute

config = config

d_model instance-attribute

d_model = d_model

embed_dim instance-attribute

embed_dim = embed_dim

head_dim instance-attribute

head_dim = embed_dim // num_heads

kv_size instance-attribute

kv_size = num_kv_heads * head_dim

num_heads instance-attribute

num_heads = total_num_heads // tp_world_size

num_kv_heads instance-attribute

num_kv_heads = num_heads

out_proj instance-attribute

out_proj = RowParallelLinear(
    embed_dim,
    embed_dim,
    bias=bias,
    quant_config=quant_config,
)

q_size instance-attribute

q_size = num_heads * head_dim

qkv_proj instance-attribute

qkv_proj = QKVParallelLinear(
    d_model,
    d_model // total_num_heads,
    total_num_heads,
    total_num_kv_heads,
    bias=bias,
    quant_config=quant_config,
)

scaling instance-attribute

scaling = head_dim ** -0.5

total_num_heads instance-attribute

total_num_heads = num_heads

total_num_kv_heads instance-attribute

total_num_kv_heads = total_num_heads

__init__

__init__(
    embed_dim: int,
    num_heads: int,
    bias: bool = True,
    config: Optional[BartConfig] = None,
    cache_config: Optional[CacheConfig] = None,
    quant_config: Optional[QuantizationConfig] = None,
    prefix: str = "",
)
Source code in vllm/model_executor/models/bart.py
def __init__(
    self,
    embed_dim: int,
    num_heads: int,
    bias: bool = True,
    config: Optional[BartConfig] = None,
    cache_config: Optional[CacheConfig] = None,
    quant_config: Optional[QuantizationConfig] = None,
    prefix: str = "",
):
    super().__init__()
    self.d_model = config.d_model
    self.embed_dim = embed_dim
    self.total_num_heads = num_heads
    self.total_num_kv_heads = self.total_num_heads
    self.head_dim = embed_dim // num_heads
    self.config = config

    if (self.head_dim * num_heads) != self.embed_dim:
        raise ValueError(f"embed_dim must be divisible by num_heads "
                         f"(got `embed_dim`: {self.embed_dim}"
                         f" and `num_heads`: {num_heads}).")
    self.scaling = self.head_dim**-0.5

    self.qkv_proj = QKVParallelLinear(
        self.d_model,
        self.d_model // self.total_num_heads,
        self.total_num_heads,
        self.total_num_kv_heads,
        bias=bias,
        quant_config=quant_config,
    )

    self.out_proj = RowParallelLinear(
        embed_dim,
        embed_dim,
        bias=bias,
        quant_config=quant_config,
    )

    tp_world_size = get_tensor_model_parallel_world_size()
    assert self.total_num_heads % tp_world_size == 0
    self.num_heads = self.total_num_heads // tp_world_size

    if self.total_num_kv_heads >= tp_world_size:
        # Number of KV heads is greater than TP size, so we partition
        # the KV heads across multiple tensor parallel GPUs.
        assert self.total_num_kv_heads % tp_world_size == 0
    else:
        # Number of KV heads is less than TP size, so we replicate
        # the KV heads across multiple tensor parallel GPUs.
        assert tp_world_size % self.total_num_kv_heads == 0
    self.num_kv_heads = self.num_heads
    self.q_size = self.num_heads * self.head_dim
    self.kv_size = self.num_kv_heads * self.head_dim

    self.attn = Attention(self.num_heads,
                          self.head_dim,
                          self.scaling,
                          num_kv_heads=self.num_kv_heads,
                          cache_config=cache_config,
                          quant_config=quant_config,
                          prefix=f"{prefix}.attn",
                          attn_type=AttentionType.ENCODER)

forward

forward(hidden_states: Tensor) -> Tensor

Input shape: Batch x Time x Channel

Source code in vllm/model_executor/models/bart.py
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
    """Input shape: Batch x Time x Channel"""

    qkv, _ = self.qkv_proj(hidden_states)
    q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)

    attn_output = self.attn(q, k, v)

    output, _ = self.out_proj(attn_output)
    return output
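
The branch on total_num_kv_heads versus tp_world_size decides whether KV heads are partitioned across tensor-parallel ranks or replicated on each rank (for BART the two head counts are equal, so only the first branch applies). A small standalone sketch of that decision; the actual sharding is handled inside the *Parallel linear layers:

def kv_heads_per_rank(total_num_kv_heads: int, tp_world_size: int) -> int:
    """How many KV heads each tensor-parallel rank ends up holding."""
    if total_num_kv_heads >= tp_world_size:
        # Partition: each rank owns an equal slice of the KV heads.
        assert total_num_kv_heads % tp_world_size == 0
        return total_num_kv_heads // tp_world_size
    # Replicate: groups of ranks share a copy of each KV head.
    assert tp_world_size % total_num_kv_heads == 0
    return 1

print(kv_heads_per_rank(16, 4))  # 4 -> partitioned
print(kv_heads_per_rank(2, 8))   # 1 -> replicated across groups of 4 ranks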

BartEncoderLayer

Bases: Module

Source code in vllm/model_executor/models/bart.py
class BartEncoderLayer(nn.Module):

    def __init__(
        self,
        config: BartConfig,
        cache_config: Optional[CacheConfig] = None,
        quant_config: Optional[QuantizationConfig] = None,
        prefix: str = "",
    ):
        super().__init__()
        self.embed_dim = config.d_model

        self.self_attn = BartEncoderAttention(
            embed_dim=self.embed_dim,
            num_heads=config.encoder_attention_heads,
            config=config,
            cache_config=cache_config,
            quant_config=quant_config,
            prefix=f"{prefix}.self_attn",
        )
        self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
        self.activation_fn = get_act_fn(config.activation_function)

        ffn_hidden_size = self.embed_dim
        ffn_intermediate_size = config.encoder_ffn_dim
        ffn_has_bias = True
        self.fc1 = ColumnParallelLinear(
            ffn_hidden_size,
            ffn_intermediate_size,
            bias=ffn_has_bias,
            quant_config=quant_config,
        )
        self.act = get_act_fn("gelu")
        self.fc2 = RowParallelLinear(
            ffn_intermediate_size,
            ffn_hidden_size,
            bias=ffn_has_bias,
            quant_config=quant_config,
        )

        self.final_layer_norm = nn.LayerNorm(self.embed_dim)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        r"""
        Args:
            hidden_states: torch.Tensor of *encoder* input embeddings.
        Returns:
            Encoder layer output torch.Tensor
        """
        residual = hidden_states
        hidden_states = self.self_attn(hidden_states=hidden_states)

        hidden_states = residual + hidden_states
        hidden_states = self.self_attn_layer_norm(hidden_states)

        residual = hidden_states
        fc1_out, _ = self.fc1(hidden_states)
        hidden_states = self.activation_fn(fc1_out)

        hidden_states, _ = self.fc2(hidden_states)

        hidden_states = residual + hidden_states
        hidden_states = self.final_layer_norm(hidden_states)

        if hidden_states.dtype == torch.float16 and (
                torch.isinf(hidden_states).any()
                or torch.isnan(hidden_states).any()):
            hidden_states = cast_overflow_tensors(hidden_states)

        return hidden_states

act instance-attribute

act = get_act_fn('gelu')

activation_fn instance-attribute

activation_fn = get_act_fn(activation_function)

embed_dim instance-attribute

embed_dim = d_model

fc1 instance-attribute

fc1 = ColumnParallelLinear(
    ffn_hidden_size,
    ffn_intermediate_size,
    bias=ffn_has_bias,
    quant_config=quant_config,
)

fc2 instance-attribute

fc2 = RowParallelLinear(
    ffn_intermediate_size,
    ffn_hidden_size,
    bias=ffn_has_bias,
    quant_config=quant_config,
)

final_layer_norm instance-attribute

final_layer_norm = LayerNorm(embed_dim)

self_attn instance-attribute

self_attn = BartEncoderAttention(
    embed_dim=embed_dim,
    num_heads=encoder_attention_heads,
    config=config,
    cache_config=cache_config,
    quant_config=quant_config,
    prefix=f"{prefix}.self_attn",
)

self_attn_layer_norm instance-attribute

self_attn_layer_norm = LayerNorm(embed_dim)

__init__

__init__(
    config: BartConfig,
    cache_config: Optional[CacheConfig] = None,
    quant_config: Optional[QuantizationConfig] = None,
    prefix: str = "",
)
Source code in vllm/model_executor/models/bart.py
def __init__(
    self,
    config: BartConfig,
    cache_config: Optional[CacheConfig] = None,
    quant_config: Optional[QuantizationConfig] = None,
    prefix: str = "",
):
    super().__init__()
    self.embed_dim = config.d_model

    self.self_attn = BartEncoderAttention(
        embed_dim=self.embed_dim,
        num_heads=config.encoder_attention_heads,
        config=config,
        cache_config=cache_config,
        quant_config=quant_config,
        prefix=f"{prefix}.self_attn",
    )
    self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
    self.activation_fn = get_act_fn(config.activation_function)

    ffn_hidden_size = self.embed_dim
    ffn_intermediate_size = config.encoder_ffn_dim
    ffn_has_bias = True
    self.fc1 = ColumnParallelLinear(
        ffn_hidden_size,
        ffn_intermediate_size,
        bias=ffn_has_bias,
        quant_config=quant_config,
    )
    self.act = get_act_fn("gelu")
    self.fc2 = RowParallelLinear(
        ffn_intermediate_size,
        ffn_hidden_size,
        bias=ffn_has_bias,
        quant_config=quant_config,
    )

    self.final_layer_norm = nn.LayerNorm(self.embed_dim)

forward

forward(hidden_states: Tensor) -> Tensor

Parameters:

    hidden_states (Tensor, required):
        torch.Tensor of encoder input embeddings.

Returns:

    Encoder layer output torch.Tensor

Source code in vllm/model_executor/models/bart.py
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
    r"""
    Args:
        hidden_states: torch.Tensor of *encoder* input embeddings.
    Returns:
        Encoder layer output torch.Tensor
    """
    residual = hidden_states
    hidden_states = self.self_attn(hidden_states=hidden_states)

    hidden_states = residual + hidden_states
    hidden_states = self.self_attn_layer_norm(hidden_states)

    residual = hidden_states
    fc1_out, _ = self.fc1(hidden_states)
    hidden_states = self.activation_fn(fc1_out)

    hidden_states, _ = self.fc2(hidden_states)

    hidden_states = residual + hidden_states
    hidden_states = self.final_layer_norm(hidden_states)

    if hidden_states.dtype == torch.float16 and (
            torch.isinf(hidden_states).any()
            or torch.isnan(hidden_states).any()):
        hidden_states = cast_overflow_tensors(hidden_states)

    return hidden_states
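
The final check in the encoder layer guards against fp16 overflow after the residual additions: if any activation is inf or NaN, the tensor is cast back into finite range. A hedged approximation of what such a clamp can look like (vLLM's actual cast_overflow_tensors helper may behave differently):

import torch

def clamp_fp16_overflow(hidden_states: torch.Tensor) -> torch.Tensor:
    """Replace inf/NaN entries of an fp16 tensor with finite values (illustrative only)."""
    if hidden_states.dtype == torch.float16 and not torch.isfinite(hidden_states).all():
        max_val = float(torch.finfo(torch.float16).max)
        hidden_states = torch.nan_to_num(hidden_states, nan=0.0,
                                         posinf=max_val, neginf=-max_val)
    return hidden_states

x = torch.tensor([1.0, float("inf"), float("nan")], dtype=torch.float16)
print(clamp_fp16_overflow(x))  # inf -> 65504 (fp16 max), nan -> 0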

BartForConditionalGeneration

Bases: Module, SupportsV0Only, SupportsQuant

Source code in vllm/model_executor/models/bart.py
class BartForConditionalGeneration(nn.Module, SupportsV0Only, SupportsQuant):
    hf_to_vllm_mapper = WeightsMapper(
        orig_to_new_prefix={
            "decoder.": "model.decoder.",
            "encoder.": "model.encoder.",
            "shared.": "model.shared."
        },
        orig_to_new_substr={
            "beta": "bias",
            "gamma": "weight",
            "LayerNorm": "layernorm",
        },
    )

    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):

        super().__init__()
        config = vllm_config.model_config.hf_config
        lora_config = vllm_config.lora_config
        # currently all existing BART models have `tie_word_embeddings` enabled
        assert config.tie_word_embeddings
        self.config = config
        self.model = BartModel(vllm_config=vllm_config,
                               prefix=maybe_prefix(prefix, "model"))

        self.unpadded_vocab_size = config.vocab_size
        if lora_config:
            self.unpadded_vocab_size += lora_config.lora_extra_vocab_size

        embed_scale = math.sqrt(
            config.d_model) if config.scale_embedding else 1.0

        self.lm_head = BartParallelLMHead(config.vocab_size,
                                          config.d_model,
                                          embed_scale=embed_scale)
        self.logits_processor = LogitsProcessor(self.unpadded_vocab_size,
                                                config.vocab_size)

    def forward(
        self,
        input_ids: torch.Tensor,
        positions: torch.Tensor,
        intermediate_tensors: Optional[IntermediateTensors] = None,
        *,
        encoder_input_ids: torch.Tensor,
        encoder_positions: torch.Tensor,
        **kwargs,
    ) -> torch.Tensor:
        r"""
        Args:
            input_ids: torch.Tensor of *decoder* input token ids.
            positions: torch.Tensor of *decoder* position indices.
            encoder_input_ids: torch.Tensor of *encoder* input token ids.
            encoder_positions: torch.Tensor of *encoder* position indices.
        Returns:
            Output torch.Tensor
        """
        return self.model(input_ids, positions, encoder_input_ids,
                          encoder_positions)

    def compute_logits(
        self,
        hidden_states: torch.Tensor,
        sampling_metadata: SamplingMetadata,
    ) -> Optional[torch.Tensor]:
        logits = self.logits_processor(self.lm_head, hidden_states,
                                       sampling_metadata)
        return logits

    def load_weights(self, weights: Iterable[tuple[str,
                                                   torch.Tensor]]) -> set[str]:
        weights_tuple_list = list(weights)

        shared_embedding_weight = None
        for name, loaded_weight in weights_tuple_list:
            if ('shared.weight' in name
                    or 'encoder.embed_tokens.weight' in name
                    or 'decoder.embed_tokens.weight' in name
                    or 'lm_head.weight' in name):
                assert shared_embedding_weight is None, (
                    "Conflicting embedding weights.")
                shared_embedding_weight = loaded_weight

        loader = AutoWeightsLoader(
            self,
            skip_prefixes=(["cls.", "pooler."]),
        )
        loaded_params = loader.load_weights(weights_tuple_list,
                                            mapper=self.hf_to_vllm_mapper)

        if shared_embedding_weight is not None:
            weight_loader = getattr(self.lm_head.weight, "weight_loader",
                                    default_weight_loader)
            weight_loader(self.lm_head.weight, shared_embedding_weight)

            self.model.encoder.embed_tokens.weight = self.lm_head.weight
            self.model.decoder.embed_tokens.weight = self.lm_head.weight
            loaded_params.update({
                'model.encoder.embed_tokens.weight', 'lm_head.weight',
                'model.decoder.embed_tokens.weight'
            })

        return loaded_params
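
hf_to_vllm_mapper translates Hugging Face checkpoint parameter names into the vLLM module tree: top-level decoder./encoder./shared. prefixes gain a model. prefix, and legacy substrings (beta, gamma, LayerNorm) are renamed. A rough reimplementation of those two rules in plain Python, for illustration only (the real WeightsMapper lives in vLLM's model utilities):

ORIG_TO_NEW_PREFIX = {
    "decoder.": "model.decoder.",
    "encoder.": "model.encoder.",
    "shared.": "model.shared.",
}
ORIG_TO_NEW_SUBSTR = {"beta": "bias", "gamma": "weight", "LayerNorm": "layernorm"}

def map_hf_name(name: str) -> str:
    """Apply the prefix rule, then the substring renames (illustrative ordering)."""
    for old, new in ORIG_TO_NEW_PREFIX.items():
        if name.startswith(old):
            name = new + name[len(old):]
            break
    for old, new in ORIG_TO_NEW_SUBSTR.items():
        name = name.replace(old, new)
    return name

print(map_hf_name("encoder.layers.0.self_attn_layer_norm.gamma"))
# -> model.encoder.layers.0.self_attn_layer_norm.weight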

config instance-attribute

config = config

hf_to_vllm_mapper class-attribute instance-attribute

hf_to_vllm_mapper = WeightsMapper(
    orig_to_new_prefix={
        "decoder.": "model.decoder.",
        "encoder.": "model.encoder.",
        "shared.": "model.shared.",
    },
    orig_to_new_substr={
        "beta": "bias",
        "gamma": "weight",
        "LayerNorm": "layernorm",
    },
)

lm_head instance-attribute

lm_head = BartParallelLMHead(
    vocab_size, d_model, embed_scale=embed_scale
)

logits_processor instance-attribute

logits_processor = LogitsProcessor(
    unpadded_vocab_size, vocab_size
)

model instance-attribute

model = BartModel(
    vllm_config=vllm_config,
    prefix=maybe_prefix(prefix, "model"),
)

unpadded_vocab_size instance-attribute

unpadded_vocab_size = vocab_size

__init__

__init__(*, vllm_config: VllmConfig, prefix: str = '')
Source code in vllm/model_executor/models/bart.py
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):

    super().__init__()
    config = vllm_config.model_config.hf_config
    lora_config = vllm_config.lora_config
    # currently all existing BART models have `tie_word_embeddings` enabled
    assert config.tie_word_embeddings
    self.config = config
    self.model = BartModel(vllm_config=vllm_config,
                           prefix=maybe_prefix(prefix, "model"))

    self.unpadded_vocab_size = config.vocab_size
    if lora_config:
        self.unpadded_vocab_size += lora_config.lora_extra_vocab_size

    embed_scale = math.sqrt(
        config.d_model) if config.scale_embedding else 1.0

    self.lm_head = BartParallelLMHead(config.vocab_size,
                                      config.d_model,
                                      embed_scale=embed_scale)
    self.logits_processor = LogitsProcessor(self.unpadded_vocab_size,
                                            config.vocab_size)

compute_logits

compute_logits(
    hidden_states: Tensor,
    sampling_metadata: SamplingMetadata,
) -> Optional[Tensor]
Source code in vllm/model_executor/models/bart.py
def compute_logits(
    self,
    hidden_states: torch.Tensor,
    sampling_metadata: SamplingMetadata,
) -> Optional[torch.Tensor]:
    logits = self.logits_processor(self.lm_head, hidden_states,
                                   sampling_metadata)
    return logits

forward

forward(
    input_ids: Tensor,
    positions: Tensor,
    intermediate_tensors: Optional[
        IntermediateTensors
    ] = None,
    *,
    encoder_input_ids: Tensor,
    encoder_positions: Tensor,
    **kwargs,
) -> Tensor

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| input_ids | Tensor | torch.Tensor of decoder input token ids. | required |
| positions | Tensor | torch.Tensor of decoder position indices. | required |
| encoder_input_ids | Tensor | torch.Tensor of encoder input token ids. | required |
| encoder_positions | Tensor | torch.Tensor of encoder position indices. | required |

Returns: Output torch.Tensor

Source code in vllm/model_executor/models/bart.py
def forward(
    self,
    input_ids: torch.Tensor,
    positions: torch.Tensor,
    intermediate_tensors: Optional[IntermediateTensors] = None,
    *,
    encoder_input_ids: torch.Tensor,
    encoder_positions: torch.Tensor,
    **kwargs,
) -> torch.Tensor:
    r"""
    Args:
        input_ids: torch.Tensor of *decoder* input token ids.
        positions: torch.Tensor of *decoder* position indices.
        encoder_input_ids: torch.Tensor of *encoder* input token ids.
        encoder_positions: torch.Tensor of *encoder* position indices.
    Returns:
        Output torch.Tensor
    """
    return self.model(input_ids, positions, encoder_input_ids,
                      encoder_positions)

load_weights

load_weights(
    weights: Iterable[tuple[str, Tensor]],
) -> set[str]
Source code in vllm/model_executor/models/bart.py
def load_weights(self, weights: Iterable[tuple[str,
                                               torch.Tensor]]) -> set[str]:
    weights_tuple_list = list(weights)

    shared_embedding_weight = None
    for name, loaded_weight in weights_tuple_list:
        if ('shared.weight' in name
                or 'encoder.embed_tokens.weight' in name
                or 'decoder.embed_tokens.weight' in name
                or 'lm_head.weight' in name):
            assert shared_embedding_weight is None, (
                "Conflicting embedding weights.")
            shared_embedding_weight = loaded_weight

    loader = AutoWeightsLoader(
        self,
        skip_prefixes=(["cls.", "pooler."]),
    )
    loaded_params = loader.load_weights(weights_tuple_list,
                                        mapper=self.hf_to_vllm_mapper)

    if shared_embedding_weight is not None:
        weight_loader = getattr(self.lm_head.weight, "weight_loader",
                                default_weight_loader)
        weight_loader(self.lm_head.weight, shared_embedding_weight)

        self.model.encoder.embed_tokens.weight = self.lm_head.weight
        self.model.decoder.embed_tokens.weight = self.lm_head.weight
        loaded_params.update({
            'model.encoder.embed_tokens.weight', 'lm_head.weight',
            'model.decoder.embed_tokens.weight'
        })

    return loaded_params
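
Because `tie_word_embeddings` is asserted in `__init__`, the loader above funnels `shared.weight` (or either `embed_tokens.weight`, or `lm_head.weight`) into `lm_head` and then points both embedding tables at that same parameter. A toy sketch of the tying with plain PyTorch modules, outside vLLM's loader machinery (toy sizes, not BART's real config):

import torch
import torch.nn as nn

vocab_size, d_model = 16, 8  # toy sizes
shared = nn.Parameter(torch.randn(vocab_size, d_model))

lm_head = nn.Linear(d_model, vocab_size, bias=False)
encoder_embed = nn.Embedding(vocab_size, d_model)
decoder_embed = nn.Embedding(vocab_size, d_model)

# One tensor backs the output projection and both embedding lookups,
# mirroring the tying performed in load_weights above.
lm_head.weight = shared
encoder_embed.weight = shared
decoder_embed.weight = shared

assert encoder_embed.weight.data_ptr() == lm_head.weight.data_ptr()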

BartLearnedPositionalEmbedding

Bases: VocabParallelEmbedding

This module learns positional embeddings up to a fixed maximum size.

Source code in vllm/model_executor/models/bart.py
class BartLearnedPositionalEmbedding(VocabParallelEmbedding):
    """
    This module learns positional embeddings up to a fixed maximum size.
    """

    def __init__(self, num_embeddings: int, embedding_dim: int):
        # Bart is set up so that if padding_idx is
        # specified then offset the embedding ids by 2
        # and adjust num_embeddings appropriately.
        # Other models don't have this hack
        self.offset = 2
        super().__init__(num_embeddings + self.offset, embedding_dim)

    def forward(
        self,
        positions: torch.Tensor,
    ) -> torch.Tensor:
        """`input_ids' shape is expected to be [bsz x seqlen]."""
        return super().forward(positions + self.offset)

offset instance-attribute

offset = 2

__init__

__init__(num_embeddings: int, embedding_dim: int)
Source code in vllm/model_executor/models/bart.py
def __init__(self, num_embeddings: int, embedding_dim: int):
    # Bart is set up so that if padding_idx is
    # specified then offset the embedding ids by 2
    # and adjust num_embeddings appropriately.
    # Other models don't have this hack
    self.offset = 2
    super().__init__(num_embeddings + self.offset, embedding_dim)

forward

forward(positions: Tensor) -> Tensor

`positions` shape is expected to be [bsz x seqlen].

Source code in vllm/model_executor/models/bart.py
def forward(
    self,
    positions: torch.Tensor,
) -> torch.Tensor:
    """`input_ids' shape is expected to be [bsz x seqlen]."""
    return super().forward(positions + self.offset)
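
The offset of 2 reproduces the HF BART convention: the position table is allocated with two extra rows and every position index is shifted before the lookup. A quick sketch with a plain `nn.Embedding` (toy sizes; the vLLM class uses VocabParallelEmbedding instead):

import torch
import torch.nn as nn

max_positions, d_model, offset = 10, 4, 2     # toy sizes
table = nn.Embedding(max_positions + offset, d_model)  # two extra rows

positions = torch.arange(5)              # positions 0..4
pos_embeds = table(positions + offset)   # rows 2..6 of the table are used
print(pos_embeds.shape)                  # torch.Size([5, 4])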

BartModel

Bases: Module, SupportsQuant

Source code in vllm/model_executor/models/bart.py
class BartModel(nn.Module, SupportsQuant):
    _tied_weights_keys = [
        "encoder.embed_tokens.weight",
        "decoder.embed_tokens.weight",
    ]

    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
        super().__init__()

        config = vllm_config.model_config.hf_config
        cache_config = vllm_config.cache_config
        quant_config = vllm_config.quant_config
        lora_config = vllm_config.lora_config

        self.config = config

        lora_vocab = (lora_config.lora_extra_vocab_size *
                      (lora_config.max_loras or 1)) if lora_config else 0
        self.vocab_size = config.vocab_size + lora_vocab
        self.org_vocab_size = config.vocab_size

        self.encoder = BartEncoder(config,
                                   cache_config,
                                   quant_config=quant_config,
                                   prefix=f"{prefix}.encoder")
        self.decoder = BartDecoder(config,
                                   cache_config,
                                   quant_config=quant_config,
                                   prefix=f"{prefix}.decoder")

    def forward(self, input_ids: torch.Tensor, positions: torch.Tensor,
                encoder_input_ids: torch.Tensor,
                encoder_positions: torch.Tensor) -> torch.Tensor:
        r"""
        Args:
            input_ids: Indices of *decoder* input sequence tokens 
                in the vocabulary.
                Padding will be ignored by default should you provide it.
            positions: Positions of *decoder* input sequence tokens.
            encoder_input_ids: Indices of *encoder* input sequence tokens 
                in the vocabulary.
            encoder_positions: Positions of *encoder* input sequence tokens.
        Returns:
            Model output torch.Tensor
        """

        encoder_hidden_states = None

        if encoder_input_ids.numel() > 0:
            # Run encoder attention if a non-zero number of encoder tokens
            # are provided as input
            encoder_hidden_states = self.encoder(input_ids=encoder_input_ids,
                                                 positions=encoder_positions)

        # decoder outputs consist of
        # (dec_features, past_key_value, dec_hidden, dec_attn)
        decoder_outputs = self.decoder(
            decoder_input_ids=input_ids,
            decoder_positions=positions,
            encoder_hidden_states=encoder_hidden_states)

        return decoder_outputs

    def load_weights(self, weights: Iterable[tuple[str,
                                                   torch.Tensor]]) -> set[str]:
        stacked_params_mapping = [
            # (param_name, shard_name, shard_id)
            ("qkv_proj", "q_proj", "q"),
            ("qkv_proj", "k_proj", "k"),
            ("qkv_proj", "v_proj", "v"),
        ]

        other_weights = []
        loaded_stacked_params = []
        model_params_dict = dict(self.named_parameters())

        for name, loaded_weight in weights:
            for (param_name, weight_name, shard_id) in stacked_params_mapping:
                if weight_name not in name:
                    continue
                name = name.replace(weight_name, param_name)
                if name not in model_params_dict:
                    continue
                param = model_params_dict[name]
                weight_loader = param.weight_loader
                weight_loader(param, loaded_weight, shard_id)
                loaded_stacked_params.append(name)
                break
            else:
                if name in model_params_dict:
                    other_weights.append((name, loaded_weight))

        loader = AutoWeightsLoader(self)
        loaded_params = loader.load_weights(other_weights)
        loaded_params.update(loaded_stacked_params)
        return loaded_params

_tied_weights_keys class-attribute instance-attribute

_tied_weights_keys = [
    "encoder.embed_tokens.weight",
    "decoder.embed_tokens.weight",
]

config instance-attribute

config = config

decoder instance-attribute

decoder = BartDecoder(
    config,
    cache_config,
    quant_config=quant_config,
    prefix=f"{prefix}.decoder",
)

encoder instance-attribute

encoder = BartEncoder(
    config,
    cache_config,
    quant_config=quant_config,
    prefix=f"{prefix}.encoder",
)

org_vocab_size instance-attribute

org_vocab_size = vocab_size

vocab_size instance-attribute

vocab_size = vocab_size + lora_vocab

__init__

__init__(*, vllm_config: VllmConfig, prefix: str = '')
Source code in vllm/model_executor/models/bart.py
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
    super().__init__()

    config = vllm_config.model_config.hf_config
    cache_config = vllm_config.cache_config
    quant_config = vllm_config.quant_config
    lora_config = vllm_config.lora_config

    self.config = config

    lora_vocab = (lora_config.lora_extra_vocab_size *
                  (lora_config.max_loras or 1)) if lora_config else 0
    self.vocab_size = config.vocab_size + lora_vocab
    self.org_vocab_size = config.vocab_size

    self.encoder = BartEncoder(config,
                               cache_config,
                               quant_config=quant_config,
                               prefix=f"{prefix}.encoder")
    self.decoder = BartDecoder(config,
                               cache_config,
                               quant_config=quant_config,
                               prefix=f"{prefix}.decoder")

forward

forward(
    input_ids: Tensor,
    positions: Tensor,
    encoder_input_ids: Tensor,
    encoder_positions: Tensor,
) -> Tensor

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| input_ids | Tensor | Indices of decoder input sequence tokens in the vocabulary. Padding will be ignored by default should you provide it. | required |
| positions | Tensor | Positions of decoder input sequence tokens. | required |
| encoder_input_ids | Tensor | Indices of encoder input sequence tokens in the vocabulary. | required |
| encoder_positions | Tensor | Positions of encoder input sequence tokens. | required |

Returns: Model output torch.Tensor

Source code in vllm/model_executor/models/bart.py
def forward(self, input_ids: torch.Tensor, positions: torch.Tensor,
            encoder_input_ids: torch.Tensor,
            encoder_positions: torch.Tensor) -> torch.Tensor:
    r"""
    Args:
        input_ids: Indices of *decoder* input sequence tokens 
            in the vocabulary.
            Padding will be ignored by default should you provide it.
        positions: Positions of *decoder* input sequence tokens.
        encoder_input_ids: Indices of *encoder* input sequence tokens 
            in the vocabulary.
        encoder_positions: Positions of *encoder* input sequence tokens.
    Returns:
        Model output torch.Tensor
    """

    encoder_hidden_states = None

    if encoder_input_ids.numel() > 0:
        # Run encoder attention if a non-zero number of encoder tokens
        # are provided as input
        encoder_hidden_states = self.encoder(input_ids=encoder_input_ids,
                                             positions=encoder_positions)

    # decoder outputs consist of
    # (dec_features, past_key_value, dec_hidden, dec_attn)
    decoder_outputs = self.decoder(
        decoder_input_ids=input_ids,
        decoder_positions=positions,
        encoder_hidden_states=encoder_hidden_states)

    return decoder_outputs
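
The `encoder_input_ids.numel() > 0` branch above means the encoder is only executed on steps that actually carry encoder tokens; when none are scheduled, `encoder_hidden_states` stays `None` and cross-attention presumably relies on the encoder keys/values already held in the cache. A trivial sketch of that branch with dummy tensors:

import torch

encoder_input_ids = torch.empty(0, dtype=torch.long)  # e.g. a step with no encoder tokens

encoder_hidden_states = None
if encoder_input_ids.numel() > 0:
    # dummy stand-in for self.encoder(...)
    encoder_hidden_states = torch.randn(encoder_input_ids.numel(), 8)

print(encoder_hidden_states)  # None: the encoder was skipped on this step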

load_weights

load_weights(
    weights: Iterable[tuple[str, Tensor]],
) -> set[str]
Source code in vllm/model_executor/models/bart.py
def load_weights(self, weights: Iterable[tuple[str,
                                               torch.Tensor]]) -> set[str]:
    stacked_params_mapping = [
        # (param_name, shard_name, shard_id)
        ("qkv_proj", "q_proj", "q"),
        ("qkv_proj", "k_proj", "k"),
        ("qkv_proj", "v_proj", "v"),
    ]

    other_weights = []
    loaded_stacked_params = []
    model_params_dict = dict(self.named_parameters())

    for name, loaded_weight in weights:
        for (param_name, weight_name, shard_id) in stacked_params_mapping:
            if weight_name not in name:
                continue
            name = name.replace(weight_name, param_name)
            if name not in model_params_dict:
                continue
            param = model_params_dict[name]
            weight_loader = param.weight_loader
            weight_loader(param, loaded_weight, shard_id)
            loaded_stacked_params.append(name)
            break
        else:
            if name in model_params_dict:
                other_weights.append((name, loaded_weight))

    loader = AutoWeightsLoader(self)
    loaded_params = loader.load_weights(other_weights)
    loaded_params.update(loaded_stacked_params)
    return loaded_params
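
`stacked_params_mapping` folds the separate `q_proj`/`k_proj`/`v_proj` checkpoint tensors into the fused `qkv_proj` parameter, routing each one to its shard via the parameter's `weight_loader`. A name-only sketch of that routing, with illustrative checkpoint names and plain Python in place of vLLM's parallel layers:

stacked_params_mapping = [
    # (param_name, shard_name, shard_id)
    ("qkv_proj", "q_proj", "q"),
    ("qkv_proj", "k_proj", "k"),
    ("qkv_proj", "v_proj", "v"),
]

# Illustrative checkpoint names, not from a real model:
names = [
    "encoder.layers.0.self_attn.q_proj.weight",
    "encoder.layers.0.self_attn.k_proj.weight",
    "encoder.layers.0.self_attn.out_proj.weight",
]

for name in names:
    for param_name, weight_name, shard_id in stacked_params_mapping:
        if weight_name in name:
            print(f"{name} -> {name.replace(weight_name, param_name)} [shard {shard_id}]")
            break
    else:
        print(f"{name} -> loaded as-is")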

BartParallelLMHead

Bases: ParallelLMHead

This module overrides ParallelLMHead's forward by dividing by the embedding scale, effectively yielding the inverse of BartScaledWordEmbedding.

Source code in vllm/model_executor/models/bart.py
class BartParallelLMHead(ParallelLMHead):
    """
    This module overrides ParallelLMHead's
    forward by dividing by embeddings scale,
    yielding effectively the inverse of
    BartScaledWordEmbedding
    """

    def __init__(self,
                 num_embeddings: int,
                 embedding_dim: int,
                 embed_scale: float = 1.0):
        super().__init__(num_embeddings, embedding_dim)
        self.embed_scale = embed_scale

    def forward(self, input_ids: torch.Tensor) -> torch.Tensor:
        return super().forward(input_ids) / self.embed_scale

embed_scale instance-attribute

embed_scale = embed_scale

__init__

__init__(
    num_embeddings: int,
    embedding_dim: int,
    embed_scale: float = 1.0,
)
Source code in vllm/model_executor/models/bart.py
def __init__(self,
             num_embeddings: int,
             embedding_dim: int,
             embed_scale: float = 1.0):
    super().__init__(num_embeddings, embedding_dim)
    self.embed_scale = embed_scale

forward

forward(input_ids: Tensor) -> Tensor
Source code in vllm/model_executor/models/bart.py
def forward(self, input_ids: torch.Tensor) -> torch.Tensor:
    return super().forward(input_ids) / self.embed_scale

BartScaledWordEmbedding

Bases: VocabParallelEmbedding

This module overrides VocabParallelEmbedding's forward by multiplying by the embedding scale.

Source code in vllm/model_executor/models/bart.py
class BartScaledWordEmbedding(VocabParallelEmbedding):
    """
    This module overrides VocabParallelEmbedding's 
    forward by multiplying with embeddings scale.
    """

    def __init__(self,
                 num_embeddings: int,
                 embedding_dim: int,
                 embed_scale: float = 1.0):
        super().__init__(num_embeddings, embedding_dim)
        self.embed_scale = embed_scale

    def forward(self, input_ids: torch.Tensor) -> torch.Tensor:
        return super().forward(input_ids) * self.embed_scale

embed_scale instance-attribute

embed_scale = embed_scale

__init__

__init__(
    num_embeddings: int,
    embedding_dim: int,
    embed_scale: float = 1.0,
)
Source code in vllm/model_executor/models/bart.py
def __init__(self,
             num_embeddings: int,
             embedding_dim: int,
             embed_scale: float = 1.0):
    super().__init__(num_embeddings, embedding_dim)
    self.embed_scale = embed_scale

forward

forward(input_ids: Tensor) -> Tensor
Source code in vllm/model_executor/models/bart.py
def forward(self, input_ids: torch.Tensor) -> torch.Tensor:
    return super().forward(input_ids) * self.embed_scale
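
`BartScaledWordEmbedding` multiplies token embeddings by `embed_scale` (`sqrt(d_model)` when `config.scale_embedding` is set), and `BartParallelLMHead` divides by the same factor on the output side, so the tied weight matrix is not scaled twice. A toy sketch of that pairing with plain modules and a tied weight (the division here stands in for the LM head's unscaling, not vLLM's exact call path):

import math
import torch
import torch.nn as nn

vocab_size, d_model = 16, 4  # toy sizes
embed_scale = math.sqrt(d_model)

embed = nn.Embedding(vocab_size, d_model)
lm_head = nn.Linear(d_model, vocab_size, bias=False)
lm_head.weight = embed.weight  # tied, as in BART

token_ids = torch.tensor([3, 7])
hidden = embed(token_ids) * embed_scale   # scale on the way in
logits = lm_head(hidden) / embed_scale    # unscale on the way out
print(logits.shape)                       # torch.Size([2, 16])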

MBartDecoder

Bases: Module

Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`BartDecoderLayer`].

Args:
    config: BartConfig
    embed_tokens (nn.Embedding): output embedding

Source code in vllm/model_executor/models/bart.py
class MBartDecoder(nn.Module):
    """
    Transformer decoder consisting of *config.decoder_layers* layers.
    Each layer is a [`BartDecoderLayer`]
    Args:
        config: BartConfig
        embed_tokens (nn.Embedding): output embedding
    """

    def __init__(
        self,
        config: BartConfig,
        cache_config: Optional[CacheConfig] = None,
        quant_config: Optional[QuantizationConfig] = None,
        lora_config: Optional[LoRAConfig] = None,
        embed_tokens: Optional[nn.Embedding] = None,
        prefix: str = "",
    ):
        super().__init__()
        self.cache_config = cache_config
        self.quant_config = quant_config
        self.lora_config = lora_config
        self.max_target_positions = config.max_position_embeddings
        embed_scale = math.sqrt(
            config.d_model) if config.scale_embedding else 1.0

        self.embed_tokens = BartScaledWordEmbedding(config.vocab_size,
                                                    config.d_model,
                                                    embed_scale=embed_scale)

        if embed_tokens is not None:
            self.embed_tokens.weight = embed_tokens.weight

        self.embed_positions = BartLearnedPositionalEmbedding(
            config.max_position_embeddings,
            config.d_model,
        )

        self.layers = nn.ModuleList(
            [MBartDecoderLayer(config, cache_config, quant_config,
                               prefix=f"{prefix}.layers.{layer_idx}") \
             for layer_idx in range(config.decoder_layers)])

        self.layernorm_embedding = nn.LayerNorm(config.d_model)
        self.layer_norm = nn.LayerNorm(config.d_model)

    def forward(
        self,
        decoder_input_ids: torch.Tensor,
        decoder_positions: torch.Tensor,
        encoder_hidden_states: Optional[torch.Tensor],
        inputs_embeds: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        r"""
        Args:
            decoder_input_ids: Indices of *decoder* input sequence tokens 
                in the vocabulary.
                Padding will be ignored by default should you provide it.
            decoder_positions: Positions of *decoder* input sequence tokens.
            encoder_hidden_states: Tensor of encoder output embeddings.
        Returns:
            Decoder output torch.Tensor
        """
        if inputs_embeds is None:
            inputs_embeds = self.embed_tokens(decoder_input_ids)
        else:
            decoder_positions = inputs_embeds[:, -1]

        # embed positions
        embed_pos = self.embed_positions(decoder_positions)
        embed_pos = embed_pos.to(inputs_embeds.device)

        hidden_states = inputs_embeds + embed_pos
        hidden_states = self.layernorm_embedding(hidden_states)

        # decoder layers

        for decoder_layer in self.layers:
            hidden_states = decoder_layer(
                decoder_hidden_states=hidden_states,
                encoder_hidden_states=encoder_hidden_states,
            )

        hidden_states = self.layer_norm(hidden_states)
        return hidden_states

cache_config instance-attribute

cache_config = cache_config

embed_positions instance-attribute

embed_positions = BartLearnedPositionalEmbedding(
    max_position_embeddings, d_model
)

embed_tokens instance-attribute

embed_tokens = BartScaledWordEmbedding(
    vocab_size, d_model, embed_scale=embed_scale
)

layer_norm instance-attribute

layer_norm = LayerNorm(d_model)

layernorm_embedding instance-attribute

layernorm_embedding = LayerNorm(d_model)

layers instance-attribute

layers = ModuleList(
    [
        (
            MBartDecoderLayer(
                config,
                cache_config,
                quant_config,
                prefix=f"{prefix}.layers.{layer_idx}",
            )
        )
        for layer_idx in (range(decoder_layers))
    ]
)

lora_config instance-attribute

lora_config = lora_config

max_target_positions instance-attribute

max_target_positions = max_position_embeddings

quant_config instance-attribute

quant_config = quant_config

__init__

__init__(
    config: BartConfig,
    cache_config: Optional[CacheConfig] = None,
    quant_config: Optional[QuantizationConfig] = None,
    lora_config: Optional[LoRAConfig] = None,
    embed_tokens: Optional[Embedding] = None,
    prefix: str = "",
)
Source code in vllm/model_executor/models/bart.py
def __init__(
    self,
    config: BartConfig,
    cache_config: Optional[CacheConfig] = None,
    quant_config: Optional[QuantizationConfig] = None,
    lora_config: Optional[LoRAConfig] = None,
    embed_tokens: Optional[nn.Embedding] = None,
    prefix: str = "",
):
    super().__init__()
    self.cache_config = cache_config
    self.quant_config = quant_config
    self.lora_config = lora_config
    self.max_target_positions = config.max_position_embeddings
    embed_scale = math.sqrt(
        config.d_model) if config.scale_embedding else 1.0

    self.embed_tokens = BartScaledWordEmbedding(config.vocab_size,
                                                config.d_model,
                                                embed_scale=embed_scale)

    if embed_tokens is not None:
        self.embed_tokens.weight = embed_tokens.weight

    self.embed_positions = BartLearnedPositionalEmbedding(
        config.max_position_embeddings,
        config.d_model,
    )

    self.layers = nn.ModuleList(
        [MBartDecoderLayer(config, cache_config, quant_config,
                           prefix=f"{prefix}.layers.{layer_idx}") \
         for layer_idx in range(config.decoder_layers)])

    self.layernorm_embedding = nn.LayerNorm(config.d_model)
    self.layer_norm = nn.LayerNorm(config.d_model)

forward

forward(
    decoder_input_ids: Tensor,
    decoder_positions: Tensor,
    encoder_hidden_states: Optional[Tensor],
    inputs_embeds: Optional[Tensor] = None,
) -> Tensor

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| decoder_input_ids | Tensor | Indices of decoder input sequence tokens in the vocabulary. Padding will be ignored by default should you provide it. | required |
| decoder_positions | Tensor | Positions of decoder input sequence tokens. | required |
| encoder_hidden_states | Optional[Tensor] | Tensor of encoder output embeddings. | required |

Returns: Decoder output torch.Tensor

Source code in vllm/model_executor/models/bart.py
def forward(
    self,
    decoder_input_ids: torch.Tensor,
    decoder_positions: torch.Tensor,
    encoder_hidden_states: Optional[torch.Tensor],
    inputs_embeds: Optional[torch.Tensor] = None,
) -> torch.Tensor:
    r"""
    Args:
        decoder_input_ids: Indices of *decoder* input sequence tokens 
            in the vocabulary.
            Padding will be ignored by default should you provide it.
        decoder_positions: Positions of *decoder* input sequence tokens.
        encoder_hidden_states: Tensor of encoder output embeddings.
    Returns:
        Decoder output torch.Tensor
    """
    if inputs_embeds is None:
        inputs_embeds = self.embed_tokens(decoder_input_ids)
    else:
        decoder_positions = inputs_embeds[:, -1]

    # embed positions
    embed_pos = self.embed_positions(decoder_positions)
    embed_pos = embed_pos.to(inputs_embeds.device)

    hidden_states = inputs_embeds + embed_pos
    hidden_states = self.layernorm_embedding(hidden_states)

    # decoder layers

    for decoder_layer in self.layers:
        hidden_states = decoder_layer(
            decoder_hidden_states=hidden_states,
            encoder_hidden_states=encoder_hidden_states,
        )

    hidden_states = self.layer_norm(hidden_states)
    return hidden_states

MBartDecoderLayer

Bases: BartDecoderLayer

Source code in vllm/model_executor/models/bart.py
class MBartDecoderLayer(BartDecoderLayer):

    def forward(
        self,
        decoder_hidden_states: torch.Tensor,
        encoder_hidden_states: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        residual = decoder_hidden_states
        hidden_states = self.self_attn_layer_norm(decoder_hidden_states)

        # Self Attention
        hidden_states = self.self_attn(hidden_states=hidden_states)

        hidden_states = residual + hidden_states

        # Cross-Attention Block

        residual = hidden_states
        hidden_states = self.encoder_attn_layer_norm(hidden_states)

        hidden_states = self.encoder_attn(
            decoder_hidden_states=hidden_states,
            encoder_hidden_states=encoder_hidden_states,
        )

        hidden_states = residual + hidden_states

        # Fully Connected
        residual = hidden_states
        hidden_states = self.final_layer_norm(hidden_states)
        fc1_out, _ = self.fc1(hidden_states)
        hidden_states = self.activation_fn(fc1_out)

        hidden_states, _ = self.fc2(hidden_states)

        hidden_states = residual + hidden_states

        return hidden_states

forward

forward(
    decoder_hidden_states: Tensor,
    encoder_hidden_states: Optional[Tensor] = None,
) -> Tensor
Source code in vllm/model_executor/models/bart.py
def forward(
    self,
    decoder_hidden_states: torch.Tensor,
    encoder_hidden_states: Optional[torch.Tensor] = None,
) -> torch.Tensor:
    residual = decoder_hidden_states
    hidden_states = self.self_attn_layer_norm(decoder_hidden_states)

    # Self Attention
    hidden_states = self.self_attn(hidden_states=hidden_states)

    hidden_states = residual + hidden_states

    # Cross-Attention Block

    residual = hidden_states
    hidden_states = self.encoder_attn_layer_norm(hidden_states)

    hidden_states = self.encoder_attn(
        decoder_hidden_states=hidden_states,
        encoder_hidden_states=encoder_hidden_states,
    )

    hidden_states = residual + hidden_states

    # Fully Connected
    residual = hidden_states
    hidden_states = self.final_layer_norm(hidden_states)
    fc1_out, _ = self.fc1(hidden_states)
    hidden_states = self.activation_fn(fc1_out)

    hidden_states, _ = self.fc2(hidden_states)

    hidden_states = residual + hidden_states

    return hidden_states
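
The MBart override above applies each layer norm before its sub-block (a pre-norm arrangement), and MBartEncoder/MBartDecoder additionally apply a final `layer_norm` after the stack. A compact sketch of the pre-norm residual pattern (toy sizes, with a Linear standing in for the attention/FFN sub-blocks):

import torch
import torch.nn as nn

d_model = 8  # toy size
norm = nn.LayerNorm(d_model)
sublayer = nn.Linear(d_model, d_model)  # stand-in for self_attn / encoder_attn / FFN

x = torch.randn(2, d_model)
residual = x
x = norm(x)        # normalize *before* the sub-block (pre-norm)
x = sublayer(x)
x = residual + x   # then add back the residual
print(x.shape)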

MBartEncoder

Bases: Module

Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a [`BartEncoderLayer`].

Args:
    config: BartConfig
    embed_tokens (nn.Embedding): output embedding

Source code in vllm/model_executor/models/bart.py
class MBartEncoder(nn.Module):
    """
    Transformer encoder consisting of *config.encoder_layers*
    self attention layers. Each layer is a [`BartEncoderLayer`].
    Args:
        config: BartConfig
        embed_tokens (nn.Embedding): output embedding
    """

    def __init__(self,
                 config: BartConfig,
                 cache_config: Optional[CacheConfig] = None,
                 quant_config: Optional[QuantizationConfig] = None,
                 lora_config: Optional[LoRAConfig] = None,
                 embed_tokens: Optional[nn.Embedding] = None,
                 prefix: str = ""):
        super().__init__()

        self.cache_config = cache_config
        self.quant_config = quant_config
        self.lora_config = lora_config
        embed_dim = config.d_model
        self.max_source_positions = config.max_position_embeddings
        embed_scale = math.sqrt(embed_dim) if config.scale_embedding else 1.0

        self.embed_tokens = BartScaledWordEmbedding(config.vocab_size,
                                                    embed_dim,
                                                    embed_scale=embed_scale)

        if embed_tokens is not None:
            self.embed_tokens.weight = embed_tokens.weight

        self.embed_positions = BartLearnedPositionalEmbedding(
            config.max_position_embeddings,
            embed_dim,
        )
        self.layers = nn.ModuleList([
            MBartEncoderLayer(config,
                              cache_config,
                              quant_config,
                              prefix=f"{prefix}.layers.{layer_idx}")
            for layer_idx in range(config.encoder_layers)
        ])

        self.layernorm_embedding = nn.LayerNorm(embed_dim)
        self.layer_norm = nn.LayerNorm(config.d_model)  # changed

    def forward(
        self,
        input_ids: torch.Tensor,
        positions: torch.Tensor,
        inputs_embeds: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        r"""
        Args:
            input_ids: Indices of *encoder* input sequence tokens in the 
                vocabulary.
                Padding will be ignored by default should you provide it.
            positions: Positions of *encoder* input sequence tokens.
        Returns:
            Encoder output torch.Tensor
        """
        # retrieve input_ids and inputs_embeds
        if inputs_embeds is None:
            inputs_embeds = self.embed_tokens(input_ids)

        embed_pos = self.embed_positions(positions)
        embed_pos = embed_pos.to(inputs_embeds.device)

        hidden_states = inputs_embeds + embed_pos
        hidden_states = self.layernorm_embedding(hidden_states)

        for encoder_layer in self.layers:
            hidden_states = encoder_layer(hidden_states=hidden_states)

        hidden_states = self.layer_norm(hidden_states)
        return hidden_states

cache_config instance-attribute

cache_config = cache_config

embed_positions instance-attribute

embed_positions = BartLearnedPositionalEmbedding(
    max_position_embeddings, embed_dim
)

embed_tokens instance-attribute

embed_tokens = BartScaledWordEmbedding(
    vocab_size, embed_dim, embed_scale=embed_scale
)

layer_norm instance-attribute

layer_norm = LayerNorm(d_model)

layernorm_embedding instance-attribute

layernorm_embedding = LayerNorm(embed_dim)

layers instance-attribute

layers = ModuleList(
    [
        (
            MBartEncoderLayer(
                config,
                cache_config,
                quant_config,
                prefix=f"{prefix}.layers.{layer_idx}",
            )
        )
        for layer_idx in (range(encoder_layers))
    ]
)

lora_config instance-attribute

lora_config = lora_config

max_source_positions instance-attribute

max_source_positions = max_position_embeddings

quant_config instance-attribute

quant_config = quant_config

__init__

__init__(
    config: BartConfig,
    cache_config: Optional[CacheConfig] = None,
    quant_config: Optional[QuantizationConfig] = None,
    lora_config: Optional[LoRAConfig] = None,
    embed_tokens: Optional[Embedding] = None,
    prefix: str = "",
)
Source code in vllm/model_executor/models/bart.py
def __init__(self,
             config: BartConfig,
             cache_config: Optional[CacheConfig] = None,
             quant_config: Optional[QuantizationConfig] = None,
             lora_config: Optional[LoRAConfig] = None,
             embed_tokens: Optional[nn.Embedding] = None,
             prefix: str = ""):
    super().__init__()

    self.cache_config = cache_config
    self.quant_config = quant_config
    self.lora_config = lora_config
    embed_dim = config.d_model
    self.max_source_positions = config.max_position_embeddings
    embed_scale = math.sqrt(embed_dim) if config.scale_embedding else 1.0

    self.embed_tokens = BartScaledWordEmbedding(config.vocab_size,
                                                embed_dim,
                                                embed_scale=embed_scale)

    if embed_tokens is not None:
        self.embed_tokens.weight = embed_tokens.weight

    self.embed_positions = BartLearnedPositionalEmbedding(
        config.max_position_embeddings,
        embed_dim,
    )
    self.layers = nn.ModuleList([
        MBartEncoderLayer(config,
                          cache_config,
                          quant_config,
                          prefix=f"{prefix}.layers.{layer_idx}")
        for layer_idx in range(config.encoder_layers)
    ])

    self.layernorm_embedding = nn.LayerNorm(embed_dim)
    self.layer_norm = nn.LayerNorm(config.d_model)  # changed

forward

forward(
    input_ids: Tensor,
    positions: Tensor,
    inputs_embeds: Optional[Tensor] = None,
) -> Tensor

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| input_ids | Tensor | Indices of encoder input sequence tokens in the vocabulary. Padding will be ignored by default should you provide it. | required |
| positions | Tensor | Positions of encoder input sequence tokens. | required |

Returns: Encoder output torch.Tensor

Source code in vllm/model_executor/models/bart.py
def forward(
    self,
    input_ids: torch.Tensor,
    positions: torch.Tensor,
    inputs_embeds: Optional[torch.Tensor] = None,
) -> torch.Tensor:
    r"""
    Args:
        input_ids: Indices of *encoder* input sequence tokens in the 
            vocabulary.
            Padding will be ignored by default should you provide it.
        positions: Positions of *encoder* input sequence tokens.
    Returns:
        Encoder output torch.Tensor
    """
    # retrieve input_ids and inputs_embeds
    if inputs_embeds is None:
        inputs_embeds = self.embed_tokens(input_ids)

    embed_pos = self.embed_positions(positions)
    embed_pos = embed_pos.to(inputs_embeds.device)

    hidden_states = inputs_embeds + embed_pos
    hidden_states = self.layernorm_embedding(hidden_states)

    for encoder_layer in self.layers:
        hidden_states = encoder_layer(hidden_states=hidden_states)

    hidden_states = self.layer_norm(hidden_states)
    return hidden_states

MBartEncoderLayer

Bases: BartEncoderLayer

Source code in vllm/model_executor/models/bart.py
class MBartEncoderLayer(BartEncoderLayer):

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        r"""
        Args:
            hidden_states: torch.Tensor of *encoder* input embeddings.
        Returns:
            Encoder layer output torch.Tensor
        """
        residual = hidden_states
        hidden_states = self.self_attn_layer_norm(hidden_states)
        hidden_states = self.self_attn(hidden_states=hidden_states)

        hidden_states = residual + hidden_states

        residual = hidden_states
        hidden_states = self.final_layer_norm(hidden_states)
        fc1_out, _ = self.fc1(hidden_states)
        hidden_states = self.activation_fn(fc1_out)

        hidden_states, _ = self.fc2(hidden_states)

        hidden_states = residual + hidden_states

        if hidden_states.dtype == torch.float16 and (
                torch.isinf(hidden_states).any()
                or torch.isnan(hidden_states).any()):
            hidden_states = cast_overflow_tensors(hidden_states)

        return hidden_states

forward

forward(hidden_states: Tensor) -> Tensor

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| hidden_states | Tensor | torch.Tensor of encoder input embeddings. | required |

Returns: Encoder layer output torch.Tensor

Source code in vllm/model_executor/models/bart.py
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
    r"""
    Args:
        hidden_states: torch.Tensor of *encoder* input embeddings.
    Returns:
        Encoder layer output torch.Tensor
    """
    residual = hidden_states
    hidden_states = self.self_attn_layer_norm(hidden_states)
    hidden_states = self.self_attn(hidden_states=hidden_states)

    hidden_states = residual + hidden_states

    residual = hidden_states
    hidden_states = self.final_layer_norm(hidden_states)
    fc1_out, _ = self.fc1(hidden_states)
    hidden_states = self.activation_fn(fc1_out)

    hidden_states, _ = self.fc2(hidden_states)

    hidden_states = residual + hidden_states

    if hidden_states.dtype == torch.float16 and (
            torch.isinf(hidden_states).any()
            or torch.isnan(hidden_states).any()):
        hidden_states = cast_overflow_tensors(hidden_states)

    return hidden_states
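
The final check guards against fp16 overflow: if any activation became inf/NaN, the tensor is routed through `cast_overflow_tensors`. The stand-in below only illustrates the idea of clamping back into the finite fp16 range; it is not vLLM's implementation of that utility:

import torch

def clamp_overflow(x: torch.Tensor) -> torch.Tensor:
    # Hypothetical stand-in: replace NaN/inf with finite fp16 extremes.
    finfo = torch.finfo(x.dtype)
    return torch.nan_to_num(x, nan=0.0, posinf=finfo.max, neginf=finfo.min)

h = torch.tensor([1.0, float("inf"), float("-inf")], dtype=torch.float16)
if h.dtype == torch.float16 and (torch.isinf(h).any() or torch.isnan(h).any()):
    h = clamp_overflow(h)
print(h)  # tensor([ 1.0000e+00,  6.5504e+04, -6.5504e+04], dtype=torch.float16)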

MBartForConditionalGeneration

Bases: Module, SupportsV0Only, SupportsQuant

Source code in vllm/model_executor/models/bart.py
class MBartForConditionalGeneration(nn.Module, SupportsV0Only, SupportsQuant):
    base_model_prefix = "model"

    hf_to_vllm_mapper = WeightsMapper(
        orig_to_new_prefix={
            "decoder.": "model.decoder.",
            "encoder.": "model.encoder.",
            "shared.": "model.shared."
        },
        orig_to_new_substr={
            "beta": "bias",
            "gamma": "weight",
            "LayerNorm": "layernorm",
        },
    )

    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
        super().__init__()
        config = vllm_config.model_config.hf_config
        lora_config = vllm_config.lora_config
        assert config.tie_word_embeddings
        self.config = config
        self.model = MBartModel(vllm_config=vllm_config,
                                prefix=maybe_prefix(prefix, "model"))

        self.unpadded_vocab_size = config.vocab_size
        if lora_config:
            self.unpadded_vocab_size += lora_config.lora_extra_vocab_size

        embed_scale = math.sqrt(
            config.d_model) if config.scale_embedding else 1.0

        self.lm_head = BartParallelLMHead(config.vocab_size,
                                          config.d_model,
                                          embed_scale=embed_scale)

        self.logits_processor = LogitsProcessor(self.unpadded_vocab_size,
                                                config.vocab_size)

    def forward(
        self,
        input_ids: torch.Tensor,
        positions: torch.Tensor,
        intermediate_tensors: Optional[IntermediateTensors] = None,
        *,
        encoder_input_ids: torch.Tensor,
        encoder_positions: torch.Tensor,
        **kwargs,
    ) -> torch.Tensor:
        return self.model(input_ids, positions, encoder_input_ids,
                          encoder_positions)

    def compute_logits(
        self,
        hidden_states: torch.Tensor,
        sampling_metadata: SamplingMetadata,
    ) -> Optional[torch.Tensor]:
        logits = self.logits_processor(self.lm_head, hidden_states,
                                       sampling_metadata)
        return logits

    def load_weights(self, weights: Iterable[tuple[str,
                                                   torch.Tensor]]) -> set[str]:
        stacked_params_mapping = [
            ("qkv_proj", "q_proj", "q"),
            ("qkv_proj", "k_proj", "k"),
            ("qkv_proj", "v_proj", "v"),
        ]
        model_params_dict = dict(self.named_parameters())
        loaded_params = set()
        remaining_weights = []
        shared_embedding_weight = None

        for name, loaded_weight in weights:
            if any(skip in name
                   for skip in ["cls.", "pooler.", "final_logits_bias"]):
                continue
            if any(embed_name in name for embed_name in [
                    'shared.weight', 'encoder.embed_tokens.weight',
                    'decoder.embed_tokens.weight'
            ]):
                if shared_embedding_weight is None:
                    shared_embedding_weight = loaded_weight
                continue
            is_stacked = False
            for param_name, weight_name, shard_id in stacked_params_mapping:
                if weight_name not in name:
                    continue
                vllm_name = name
                for src, dst in self.hf_to_vllm_mapper.orig_to_new_substr.items(
                ):
                    vllm_name = vllm_name.replace(src, dst)
                for src, dst in self.hf_to_vllm_mapper.orig_to_new_prefix.items(
                ):
                    if vllm_name.startswith(src):
                        vllm_name = dst + vllm_name[len(src):]
                        break
                vllm_name = vllm_name.replace(weight_name, param_name)
                if vllm_name in model_params_dict:
                    param = model_params_dict[vllm_name]
                    weight_loader = getattr(param, "weight_loader",
                                            default_weight_loader)
                    weight_loader(param, loaded_weight, shard_id)
                    loaded_params.add(vllm_name)
                is_stacked = True
                break
            if not is_stacked:
                remaining_weights.append((name, loaded_weight))
        loader = AutoWeightsLoader(self, skip_prefixes=["cls.", "pooler."])
        auto_loaded_params = loader.load_weights(remaining_weights,
                                                 mapper=self.hf_to_vllm_mapper)
        loaded_params.update(auto_loaded_params)
        if shared_embedding_weight is not None:
            lm_head_param = self.lm_head.weight
            weight_loader = getattr(lm_head_param, "weight_loader",
                                    default_weight_loader)
            weight_loader(lm_head_param, shared_embedding_weight)
            self.model.encoder.embed_tokens.weight = self.lm_head.weight
            self.model.decoder.embed_tokens.weight = self.lm_head.weight
            loaded_params.update({
                'model.encoder.embed_tokens.weight', 'lm_head.weight',
                'model.decoder.embed_tokens.weight'
            })
        return loaded_params

base_model_prefix class-attribute instance-attribute

base_model_prefix = 'model'

config instance-attribute

config = config

hf_to_vllm_mapper class-attribute instance-attribute

hf_to_vllm_mapper = WeightsMapper(
    orig_to_new_prefix={
        "decoder.": "model.decoder.",
        "encoder.": "model.encoder.",
        "shared.": "model.shared.",
    },
    orig_to_new_substr={
        "beta": "bias",
        "gamma": "weight",
        "LayerNorm": "layernorm",
    },
)

lm_head instance-attribute

lm_head = BartParallelLMHead(
    vocab_size, d_model, embed_scale=embed_scale
)

logits_processor instance-attribute

logits_processor = LogitsProcessor(
    unpadded_vocab_size, vocab_size
)

model instance-attribute

model = MBartModel(
    vllm_config=vllm_config,
    prefix=maybe_prefix(prefix, "model"),
)

unpadded_vocab_size instance-attribute

unpadded_vocab_size = vocab_size

__init__

__init__(*, vllm_config: VllmConfig, prefix: str = '')
Source code in vllm/model_executor/models/bart.py
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
    super().__init__()
    config = vllm_config.model_config.hf_config
    lora_config = vllm_config.lora_config
    assert config.tie_word_embeddings
    self.config = config
    self.model = MBartModel(vllm_config=vllm_config,
                            prefix=maybe_prefix(prefix, "model"))

    self.unpadded_vocab_size = config.vocab_size
    if lora_config:
        self.unpadded_vocab_size += lora_config.lora_extra_vocab_size

    embed_scale = math.sqrt(
        config.d_model) if config.scale_embedding else 1.0

    self.lm_head = BartParallelLMHead(config.vocab_size,
                                      config.d_model,
                                      embed_scale=embed_scale)

    self.logits_processor = LogitsProcessor(self.unpadded_vocab_size,
                                            config.vocab_size)

compute_logits

compute_logits(
    hidden_states: Tensor,
    sampling_metadata: SamplingMetadata,
) -> Optional[Tensor]
Source code in vllm/model_executor/models/bart.py
def compute_logits(
    self,
    hidden_states: torch.Tensor,
    sampling_metadata: SamplingMetadata,
) -> Optional[torch.Tensor]:
    logits = self.logits_processor(self.lm_head, hidden_states,
                                   sampling_metadata)
    return logits

forward

forward(
    input_ids: Tensor,
    positions: Tensor,
    intermediate_tensors: Optional[
        IntermediateTensors
    ] = None,
    *,
    encoder_input_ids: Tensor,
    encoder_positions: Tensor,
    **kwargs,
) -> Tensor
Source code in vllm/model_executor/models/bart.py
def forward(
    self,
    input_ids: torch.Tensor,
    positions: torch.Tensor,
    intermediate_tensors: Optional[IntermediateTensors] = None,
    *,
    encoder_input_ids: torch.Tensor,
    encoder_positions: torch.Tensor,
    **kwargs,
) -> torch.Tensor:
    return self.model(input_ids, positions, encoder_input_ids,
                      encoder_positions)

load_weights

load_weights(
    weights: Iterable[tuple[str, Tensor]],
) -> set[str]
Source code in vllm/model_executor/models/bart.py
def load_weights(self, weights: Iterable[tuple[str,
                                               torch.Tensor]]) -> set[str]:
    stacked_params_mapping = [
        ("qkv_proj", "q_proj", "q"),
        ("qkv_proj", "k_proj", "k"),
        ("qkv_proj", "v_proj", "v"),
    ]
    model_params_dict = dict(self.named_parameters())
    loaded_params = set()
    remaining_weights = []
    shared_embedding_weight = None

    for name, loaded_weight in weights:
        if any(skip in name
               for skip in ["cls.", "pooler.", "final_logits_bias"]):
            continue
        if any(embed_name in name for embed_name in [
                'shared.weight', 'encoder.embed_tokens.weight',
                'decoder.embed_tokens.weight'
        ]):
            if shared_embedding_weight is None:
                shared_embedding_weight = loaded_weight
            continue
        is_stacked = False
        for param_name, weight_name, shard_id in stacked_params_mapping:
            if weight_name not in name:
                continue
            vllm_name = name
            for src, dst in self.hf_to_vllm_mapper.orig_to_new_substr.items(
            ):
                vllm_name = vllm_name.replace(src, dst)
            for src, dst in self.hf_to_vllm_mapper.orig_to_new_prefix.items(
            ):
                if vllm_name.startswith(src):
                    vllm_name = dst + vllm_name[len(src):]
                    break
            vllm_name = vllm_name.replace(weight_name, param_name)
            if vllm_name in model_params_dict:
                param = model_params_dict[vllm_name]
                weight_loader = getattr(param, "weight_loader",
                                        default_weight_loader)
                weight_loader(param, loaded_weight, shard_id)
                loaded_params.add(vllm_name)
            is_stacked = True
            break
        if not is_stacked:
            remaining_weights.append((name, loaded_weight))
    loader = AutoWeightsLoader(self, skip_prefixes=["cls.", "pooler."])
    auto_loaded_params = loader.load_weights(remaining_weights,
                                             mapper=self.hf_to_vllm_mapper)
    loaded_params.update(auto_loaded_params)
    if shared_embedding_weight is not None:
        lm_head_param = self.lm_head.weight
        weight_loader = getattr(lm_head_param, "weight_loader",
                                default_weight_loader)
        weight_loader(lm_head_param, shared_embedding_weight)
        self.model.encoder.embed_tokens.weight = self.lm_head.weight
        self.model.decoder.embed_tokens.weight = self.lm_head.weight
        loaded_params.update({
            'model.encoder.embed_tokens.weight', 'lm_head.weight',
            'model.decoder.embed_tokens.weight'
        })
    return loaded_params

MBartModel

Bases: Module, SupportsQuant

Source code in vllm/model_executor/models/bart.py
class MBartModel(nn.Module, SupportsQuant):
    _tied_weights_keys = [
        "encoder.embed_tokens.weight", "decoder.embed_tokens.weight"
    ]

    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
        super().__init__()

        config = vllm_config.model_config.hf_config
        cache_config = vllm_config.cache_config
        quant_config = vllm_config.quant_config
        lora_config = vllm_config.lora_config

        self.config = config

        lora_vocab = (lora_config.lora_extra_vocab_size *
                      (lora_config.max_loras or 1)) if lora_config else 0
        self.vocab_size = config.vocab_size + lora_vocab
        self.org_vocab_size = config.vocab_size

        self.encoder = MBartEncoder(config,
                                    cache_config,
                                    quant_config=quant_config,
                                    prefix=f"{prefix}.encoder")
        self.decoder = MBartDecoder(config,
                                    cache_config,
                                    quant_config=quant_config,
                                    prefix=f"{prefix}.decoder")

    def forward(self, input_ids: torch.Tensor, positions: torch.Tensor,
                encoder_input_ids: torch.Tensor,
                encoder_positions: torch.Tensor) -> torch.Tensor:
        r"""
        Args:
            input_ids: Indices of *decoder* input sequence tokens 
                in the vocabulary.
                Padding will be ignored by default should you provide it.
            positions: Positions of *decoder* input sequence tokens.
            encoder_input_ids: Indices of *encoder* input sequence tokens 
                in the vocabulary.
            encoder_positions: Positions of *encoder* input sequence tokens.
        Returns:
            Model output torch.Tensor
        """

        encoder_hidden_states = None

        if encoder_input_ids.numel() > 0:
            # Run encoder attention if a non-zero number of encoder tokens
            # are provided as input
            encoder_hidden_states = self.encoder(input_ids=encoder_input_ids,
                                                 positions=encoder_positions)

        # decoder outputs consist of
        # (dec_features, past_key_value, dec_hidden, dec_attn)
        decoder_outputs = self.decoder(
            decoder_input_ids=input_ids,
            decoder_positions=positions,
            encoder_hidden_states=encoder_hidden_states)

        return decoder_outputs

_tied_weights_keys class-attribute instance-attribute

_tied_weights_keys = [
    "encoder.embed_tokens.weight",
    "decoder.embed_tokens.weight",
]

config instance-attribute

config = config

decoder instance-attribute

decoder = MBartDecoder(
    config,
    cache_config,
    quant_config=quant_config,
    prefix=f"{prefix}.decoder",
)

encoder instance-attribute

encoder = MBartEncoder(
    config,
    cache_config,
    quant_config=quant_config,
    prefix=f"{prefix}.encoder",
)

org_vocab_size instance-attribute

org_vocab_size = vocab_size

vocab_size instance-attribute

vocab_size = vocab_size + lora_vocab

__init__

__init__(*, vllm_config: VllmConfig, prefix: str = '')
Source code in vllm/model_executor/models/bart.py
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
    super().__init__()

    config = vllm_config.model_config.hf_config
    cache_config = vllm_config.cache_config
    quant_config = vllm_config.quant_config
    lora_config = vllm_config.lora_config

    self.config = config

    lora_vocab = (lora_config.lora_extra_vocab_size *
                  (lora_config.max_loras or 1)) if lora_config else 0
    self.vocab_size = config.vocab_size + lora_vocab
    self.org_vocab_size = config.vocab_size

    self.encoder = MBartEncoder(config,
                                cache_config,
                                quant_config=quant_config,
                                prefix=f"{prefix}.encoder")
    self.decoder = MBartDecoder(config,
                                cache_config,
                                quant_config=quant_config,
                                prefix=f"{prefix}.decoder")

forward

forward(
    input_ids: Tensor,
    positions: Tensor,
    encoder_input_ids: Tensor,
    encoder_positions: Tensor,
) -> Tensor

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| input_ids | Tensor | Indices of decoder input sequence tokens in the vocabulary. Padding will be ignored by default should you provide it. | required |
| positions | Tensor | Positions of decoder input sequence tokens. | required |
| encoder_input_ids | Tensor | Indices of encoder input sequence tokens in the vocabulary. | required |
| encoder_positions | Tensor | Positions of encoder input sequence tokens. | required |

Returns: Model output torch.Tensor

Source code in vllm/model_executor/models/bart.py
def forward(self, input_ids: torch.Tensor, positions: torch.Tensor,
            encoder_input_ids: torch.Tensor,
            encoder_positions: torch.Tensor) -> torch.Tensor:
    r"""
    Args:
        input_ids: Indices of *decoder* input sequence tokens 
            in the vocabulary.
            Padding will be ignored by default should you provide it.
        positions: Positions of *decoder* input sequence tokens.
        encoder_input_ids: Indices of *encoder* input sequence tokens 
            in the vocabulary.
        encoder_positions: Positions of *encoder* input sequence tokens.
    Returns:
        Model output torch.Tensor
    """

    encoder_hidden_states = None

    if encoder_input_ids.numel() > 0:
        # Run encoder attention if a non-zero number of encoder tokens
        # are provided as input
        encoder_hidden_states = self.encoder(input_ids=encoder_input_ids,
                                             positions=encoder_positions)

    # decoder outputs consist of
    # (dec_features, past_key_value, dec_hidden, dec_attn)
    decoder_outputs = self.decoder(
        decoder_input_ids=input_ids,
        decoder_positions=positions,
        encoder_hidden_states=encoder_hidden_states)

    return decoder_outputs

get_bsz_seq_len

get_bsz_seq_len(input_ids)
Source code in vllm/model_executor/models/bart.py
def get_bsz_seq_len(input_ids):
    shp = input_ids.shape
    ndim = len(shp)
    if ndim == 1:
        return 1, input_ids.numel()
    else:
        return shp[:2]
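
The helper treats a 1-D tensor as a single flattened sequence (batch size 1) and otherwise returns the first two dimensions. A quick usage check, assuming the function above is in scope:

import torch

flat_ids = torch.arange(6)                         # 1-D: flattened tokens
batched_ids = torch.zeros(2, 5, dtype=torch.long)  # 2-D: [bsz, seqlen]

print(get_bsz_seq_len(flat_ids))     # (1, 6)
print(get_bsz_seq_len(batched_ids))  # torch.Size([2, 5])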