vllm.model_executor.models.florence2

ChannelAttention

Bases: Module

Source code in vllm/model_executor/models/florence2.py
class ChannelAttention(nn.Module):

    def __init__(self, dim, groups=8, qkv_bias=True):
        super().__init__()

        self.groups = groups
        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
        self.proj = nn.Linear(dim, dim)

    def forward(self, x, size):
        B, N, C = x.shape

        qkv = self.qkv(x).reshape(B, N, 3, self.groups,
                                  C // self.groups).permute(2, 0, 3, 1, 4)
        q, k, v = qkv[0], qkv[1], qkv[2]

        q = q * (float(N)**-0.5)
        attention = q.transpose(-1, -2) @ k
        attention = attention.softmax(dim=-1)
        x = (attention @ v.transpose(-1, -2)).transpose(-1, -2)
        x = x.transpose(1, 2).reshape(B, N, C)
        x = self.proj(x)
        return x, size

groups instance-attribute

groups = groups

proj instance-attribute

proj = Linear(dim, dim)

qkv instance-attribute

qkv = Linear(dim, dim * 3, bias=qkv_bias)

__init__

__init__(dim, groups=8, qkv_bias=True)
Source code in vllm/model_executor/models/florence2.py
def __init__(self, dim, groups=8, qkv_bias=True):
    super().__init__()

    self.groups = groups
    self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
    self.proj = nn.Linear(dim, dim)

forward

forward(x, size)
Source code in vllm/model_executor/models/florence2.py
def forward(self, x, size):
    B, N, C = x.shape

    qkv = self.qkv(x).reshape(B, N, 3, self.groups,
                              C // self.groups).permute(2, 0, 3, 1, 4)
    q, k, v = qkv[0], qkv[1], qkv[2]

    q = q * (float(N)**-0.5)
    attention = q.transpose(-1, -2) @ k
    attention = attention.softmax(dim=-1)
    x = (attention @ v.transpose(-1, -2)).transpose(-1, -2)
    x = x.transpose(1, 2).reshape(B, N, C)
    x = self.proj(x)
    return x, size
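
The qkv reshape/permute above is easier to follow with concrete shapes. Below is a minimal pure-torch sketch of the same math (the weight is an illustrative stand-in, not the module's parameters): the attention matrix ends up (C // groups) x (C // groups) per group, so attention mixes channels rather than the N tokens.

import torch

B, N, C, groups = 2, 49, 96, 8
x = torch.randn(B, N, C)
w_qkv = torch.randn(C, 3 * C)               # stand-in for the qkv Linear weight (illustrative)

qkv = (x @ w_qkv).reshape(B, N, 3, groups, C // groups).permute(2, 0, 3, 1, 4)
q, k, v = qkv[0], qkv[1], qkv[2]            # each: (B, groups, N, C // groups)

q = q * (float(N) ** -0.5)                  # scaled by token count, not head dim
attention = (q.transpose(-1, -2) @ k).softmax(dim=-1)     # (B, groups, C//groups, C//groups)
out = (attention @ v.transpose(-1, -2)).transpose(-1, -2)
out = out.transpose(1, 2).reshape(B, N, C)
print(attention.shape, out.shape)           # (2, 8, 12, 12) (2, 49, 96)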

ChannelBlock

Bases: Module

Source code in vllm/model_executor/models/florence2.py
class ChannelBlock(nn.Module):

    def __init__(self,
                 dim,
                 groups,
                 mlp_ratio=4.,
                 qkv_bias=True,
                 drop_path_rate=0.,
                 act_layer=nn.GELU,
                 norm_layer=nn.LayerNorm,
                 conv_at_attn=True,
                 conv_at_ffn=True):
        super().__init__()

        self.conv1 = PreNorm(None, DepthWiseConv2d(
            dim, 3, 1, 1)) if conv_at_attn else None
        self.channel_attn = PreNorm(
            norm_layer(dim),
            ChannelAttention(dim, groups=groups, qkv_bias=qkv_bias),
        )
        self.conv2 = PreNorm(None, DepthWiseConv2d(dim, 3, 1,
                                                   1)) if conv_at_ffn else None
        self.ffn = PreNorm(
            norm_layer(dim),
            Mlp(in_features=dim,
                hidden_features=int(dim * mlp_ratio),
                act_layer=act_layer),
        )

    def forward(self, x, size):
        if self.conv1:
            x, size = self.conv1(x, size)
        x, size = self.channel_attn(x, size)

        if self.conv2:
            x, size = self.conv2(x, size)
        x, size = self.ffn(x, size)

        return x, size

channel_attn instance-attribute

channel_attn = PreNorm(
    norm_layer(dim),
    ChannelAttention(dim, groups=groups, qkv_bias=qkv_bias),
)

conv1 instance-attribute

conv1 = (
    PreNorm(None, DepthWiseConv2d(dim, 3, 1, 1))
    if conv_at_attn
    else None
)

conv2 instance-attribute

conv2 = (
    PreNorm(None, DepthWiseConv2d(dim, 3, 1, 1))
    if conv_at_ffn
    else None
)

ffn instance-attribute

ffn = PreNorm(
    norm_layer(dim),
    Mlp(
        in_features=dim,
        hidden_features=int(dim * mlp_ratio),
        act_layer=act_layer,
    ),
)

__init__

__init__(
    dim,
    groups,
    mlp_ratio=4.0,
    qkv_bias=True,
    drop_path_rate=0.0,
    act_layer=GELU,
    norm_layer=LayerNorm,
    conv_at_attn=True,
    conv_at_ffn=True,
)
Source code in vllm/model_executor/models/florence2.py
def __init__(self,
             dim,
             groups,
             mlp_ratio=4.,
             qkv_bias=True,
             drop_path_rate=0.,
             act_layer=nn.GELU,
             norm_layer=nn.LayerNorm,
             conv_at_attn=True,
             conv_at_ffn=True):
    super().__init__()

    self.conv1 = PreNorm(None, DepthWiseConv2d(
        dim, 3, 1, 1)) if conv_at_attn else None
    self.channel_attn = PreNorm(
        norm_layer(dim),
        ChannelAttention(dim, groups=groups, qkv_bias=qkv_bias),
    )
    self.conv2 = PreNorm(None, DepthWiseConv2d(dim, 3, 1,
                                               1)) if conv_at_ffn else None
    self.ffn = PreNorm(
        norm_layer(dim),
        Mlp(in_features=dim,
            hidden_features=int(dim * mlp_ratio),
            act_layer=act_layer),
    )

forward

forward(x, size)
Source code in vllm/model_executor/models/florence2.py
def forward(self, x, size):
    if self.conv1:
        x, size = self.conv1(x, size)
    x, size = self.channel_attn(x, size)

    if self.conv2:
        x, size = self.conv2(x, size)
    x, size = self.ffn(x, size)

    return x, size
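
A hedged usage sketch for the block as a whole, assuming vLLM is installed so ChannelBlock (and the PreNorm, DepthWiseConv2d and Mlp helpers it uses from the same file) can be imported: every sub-module consumes and returns the (x, size) pair, and with kernel 3 / stride 1 / padding 1 the depthwise convs leave size unchanged.

import torch
from vllm.model_executor.models.florence2 import ChannelBlock

block = ChannelBlock(dim=64, groups=8)
H, W = 14, 14
x = torch.randn(2, H * W, 64)      # tokens flattened from a 14x14 feature map
x, size = block(x, (H, W))
print(x.shape, size)               # torch.Size([2, 196, 64]) (14, 14)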

ConvEmbed

Bases: Module

Image to Patch Embedding

Source code in vllm/model_executor/models/florence2.py
class ConvEmbed(nn.Module):
    """ Image to Patch Embedding
    """

    def __init__(self,
                 patch_size=7,
                 in_chans=3,
                 embed_dim=64,
                 stride=4,
                 padding=2,
                 norm_layer=None,
                 pre_norm=True):
        super().__init__()
        self.patch_size = patch_size

        self.proj = nn.Conv2d(in_chans,
                              embed_dim,
                              kernel_size=patch_size,
                              stride=stride,
                              padding=padding)

        dim_norm = in_chans if pre_norm else embed_dim
        self.norm = norm_layer(dim_norm) if norm_layer else None

        self.pre_norm = pre_norm

    def forward(self, x, size):
        H, W = size
        if len(x.size()) == 3:
            if self.norm and self.pre_norm:
                x = self.norm(x)
            x = rearrange(x, 'b (h w) c -> b c h w', h=H, w=W)

        x = self.proj(x)

        _, _, H, W = x.shape
        x = rearrange(x, 'b c h w -> b (h w) c')
        if self.norm and not self.pre_norm:
            x = self.norm(x)

        return x, (H, W)

norm instance-attribute

norm = norm_layer(dim_norm) if norm_layer else None

patch_size instance-attribute

patch_size = patch_size

pre_norm instance-attribute

pre_norm = pre_norm

proj instance-attribute

proj = Conv2d(
    in_chans,
    embed_dim,
    kernel_size=patch_size,
    stride=stride,
    padding=padding,
)

__init__

__init__(
    patch_size=7,
    in_chans=3,
    embed_dim=64,
    stride=4,
    padding=2,
    norm_layer=None,
    pre_norm=True,
)
Source code in vllm/model_executor/models/florence2.py
def __init__(self,
             patch_size=7,
             in_chans=3,
             embed_dim=64,
             stride=4,
             padding=2,
             norm_layer=None,
             pre_norm=True):
    super().__init__()
    self.patch_size = patch_size

    self.proj = nn.Conv2d(in_chans,
                          embed_dim,
                          kernel_size=patch_size,
                          stride=stride,
                          padding=padding)

    dim_norm = in_chans if pre_norm else embed_dim
    self.norm = norm_layer(dim_norm) if norm_layer else None

    self.pre_norm = pre_norm

forward

forward(x, size)
Source code in vllm/model_executor/models/florence2.py
def forward(self, x, size):
    H, W = size
    if len(x.size()) == 3:
        if self.norm and self.pre_norm:
            x = self.norm(x)
        x = rearrange(x, 'b (h w) c -> b c h w', h=H, w=W)

    x = self.proj(x)

    _, _, H, W = x.shape
    x = rearrange(x, 'b c h w -> b (h w) c')
    if self.norm and not self.pre_norm:
        x = self.norm(x)

    return x, (H, W)
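
A pure-torch sketch of the stage-0 patch-embedding geometry (kernel 7, stride 4, padding 3 are the first-stage settings used by DaViT below; the image size is illustrative): the convolution downsamples by the stride, and the feature map is then flattened into tokens exactly like the rearrange above.

import torch
import torch.nn as nn

proj = nn.Conv2d(3, 64, kernel_size=7, stride=4, padding=3)
img = torch.randn(1, 3, 224, 224)
feat = proj(img)                          # (1, 64, 56, 56): spatial dims shrink by the stride
tokens = feat.flatten(2).transpose(1, 2)  # 'b c h w -> b (h w) c' without einops
print(feat.shape, tokens.shape)           # (1, 64, 56, 56) (1, 3136, 64)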

DaViT

Bases: Module

Source code in vllm/model_executor/models/florence2.py
class DaViT(nn.Module):

    def __init__(
        self,
        in_chans=3,
        num_classes=1000,
        depths=(1, 1, 3, 1),
        patch_size=(7, 2, 2, 2),
        patch_stride=(4, 2, 2, 2),
        patch_padding=(3, 0, 0, 0),
        patch_prenorm=(False, False, False, False),
        embed_dims=(64, 128, 192, 256),
        num_heads=(3, 6, 12, 24),
        num_groups=(3, 6, 12, 24),
        window_size=7,
        mlp_ratio=4.,
        qkv_bias=True,
        drop_path_rate=0.1,
        norm_layer=nn.LayerNorm,
        enable_checkpoint=False,
        conv_at_attn=True,
        conv_at_ffn=True,
    ):
        super().__init__()

        self.num_classes = num_classes
        self.embed_dims = embed_dims
        self.num_heads = num_heads
        self.num_groups = num_groups
        self.num_stages = len(self.embed_dims)
        self.enable_checkpoint = enable_checkpoint
        assert self.num_stages == len(self.num_heads) == len(self.num_groups)

        num_stages = len(embed_dims)
        dpr = [
            x.item() for x in torch.linspace(0, drop_path_rate,
                                             sum(depths) * 2)
        ]

        depth_offset = 0
        convs = []
        blocks = []
        for i in range(num_stages):
            conv_embed = ConvEmbed(
                patch_size=patch_size[i],
                stride=patch_stride[i],
                padding=patch_padding[i],
                in_chans=in_chans if i == 0 else self.embed_dims[i - 1],
                embed_dim=self.embed_dims[i],
                norm_layer=norm_layer,
                pre_norm=patch_prenorm[i])
            convs.append(conv_embed)

            block = MySequential(*[
                MySequential(
                    OrderedDict([('spatial_block',
                                  SpatialBlock(
                                      embed_dims[i],
                                      num_heads[i],
                                      window_size,
                                      drop_path_rate=dpr[depth_offset + j * 2],
                                      qkv_bias=qkv_bias,
                                      mlp_ratio=mlp_ratio,
                                      conv_at_attn=conv_at_attn,
                                      conv_at_ffn=conv_at_ffn,
                                  )),
                                 ('channel_block',
                                  ChannelBlock(
                                      embed_dims[i],
                                      num_groups[i],
                                      drop_path_rate=dpr[depth_offset + j * 2 +
                                                         1],
                                      qkv_bias=qkv_bias,
                                      mlp_ratio=mlp_ratio,
                                      conv_at_attn=conv_at_attn,
                                      conv_at_ffn=conv_at_ffn,
                                  ))])) for j in range(depths[i])
            ])
            blocks.append(block)
            depth_offset += depths[i] * 2

        self.convs = nn.ModuleList(convs)
        self.blocks = nn.ModuleList(blocks)

        self.avgpool = nn.AdaptiveAvgPool1d(1)

    @property
    def dim_out(self):
        return self.embed_dims[-1]

    def forward_features_unpool(self, x):
        """
        forward until avg pooling 
        Args:
            x (_type_): input image tensor
        """
        input_size = (x.size(2), x.size(3))
        for conv, block in zip(self.convs, self.blocks):
            x, input_size = conv(x, input_size)
            x, input_size = block(x, input_size)
        return x

    def forward_features(self, x):
        x = self.forward_features_unpool(x)

        # (batch_size, num_tokens, token_dim)
        x = self.avgpool(x.transpose(1, 2))
        # (batch_size, 1, num_tokens)
        x = torch.flatten(x, 1)
        x = self.norms(x)

        return x

    def forward(self, x):
        x = self.forward_features(x)
        x = self.head(x)
        return x

    @classmethod
    def from_config(cls, config):
        return cls(
            depths=config.depths,
            embed_dims=config.dim_embed,
            num_heads=config.num_heads,
            num_groups=config.num_groups,
            patch_size=config.patch_size,
            patch_stride=config.patch_stride,
            patch_padding=config.patch_padding,
            patch_prenorm=config.patch_prenorm,
            drop_path_rate=config.drop_path_rate,
            window_size=config.window_size,
        )

avgpool instance-attribute

avgpool = AdaptiveAvgPool1d(1)

blocks instance-attribute

blocks = ModuleList(blocks)

convs instance-attribute

convs = ModuleList(convs)

dim_out property

dim_out

embed_dims instance-attribute

embed_dims = embed_dims

enable_checkpoint instance-attribute

enable_checkpoint = enable_checkpoint

num_classes instance-attribute

num_classes = num_classes

num_groups instance-attribute

num_groups = num_groups

num_heads instance-attribute

num_heads = num_heads

num_stages instance-attribute

num_stages = len(embed_dims)

__init__

__init__(
    in_chans=3,
    num_classes=1000,
    depths=(1, 1, 3, 1),
    patch_size=(7, 2, 2, 2),
    patch_stride=(4, 2, 2, 2),
    patch_padding=(3, 0, 0, 0),
    patch_prenorm=(False, False, False, False),
    embed_dims=(64, 128, 192, 256),
    num_heads=(3, 6, 12, 24),
    num_groups=(3, 6, 12, 24),
    window_size=7,
    mlp_ratio=4.0,
    qkv_bias=True,
    drop_path_rate=0.1,
    norm_layer=LayerNorm,
    enable_checkpoint=False,
    conv_at_attn=True,
    conv_at_ffn=True,
)
Source code in vllm/model_executor/models/florence2.py
def __init__(
    self,
    in_chans=3,
    num_classes=1000,
    depths=(1, 1, 3, 1),
    patch_size=(7, 2, 2, 2),
    patch_stride=(4, 2, 2, 2),
    patch_padding=(3, 0, 0, 0),
    patch_prenorm=(False, False, False, False),
    embed_dims=(64, 128, 192, 256),
    num_heads=(3, 6, 12, 24),
    num_groups=(3, 6, 12, 24),
    window_size=7,
    mlp_ratio=4.,
    qkv_bias=True,
    drop_path_rate=0.1,
    norm_layer=nn.LayerNorm,
    enable_checkpoint=False,
    conv_at_attn=True,
    conv_at_ffn=True,
):
    super().__init__()

    self.num_classes = num_classes
    self.embed_dims = embed_dims
    self.num_heads = num_heads
    self.num_groups = num_groups
    self.num_stages = len(self.embed_dims)
    self.enable_checkpoint = enable_checkpoint
    assert self.num_stages == len(self.num_heads) == len(self.num_groups)

    num_stages = len(embed_dims)
    dpr = [
        x.item() for x in torch.linspace(0, drop_path_rate,
                                         sum(depths) * 2)
    ]

    depth_offset = 0
    convs = []
    blocks = []
    for i in range(num_stages):
        conv_embed = ConvEmbed(
            patch_size=patch_size[i],
            stride=patch_stride[i],
            padding=patch_padding[i],
            in_chans=in_chans if i == 0 else self.embed_dims[i - 1],
            embed_dim=self.embed_dims[i],
            norm_layer=norm_layer,
            pre_norm=patch_prenorm[i])
        convs.append(conv_embed)

        block = MySequential(*[
            MySequential(
                OrderedDict([('spatial_block',
                              SpatialBlock(
                                  embed_dims[i],
                                  num_heads[i],
                                  window_size,
                                  drop_path_rate=dpr[depth_offset + j * 2],
                                  qkv_bias=qkv_bias,
                                  mlp_ratio=mlp_ratio,
                                  conv_at_attn=conv_at_attn,
                                  conv_at_ffn=conv_at_ffn,
                              )),
                             ('channel_block',
                              ChannelBlock(
                                  embed_dims[i],
                                  num_groups[i],
                                  drop_path_rate=dpr[depth_offset + j * 2 +
                                                     1],
                                  qkv_bias=qkv_bias,
                                  mlp_ratio=mlp_ratio,
                                  conv_at_attn=conv_at_attn,
                                  conv_at_ffn=conv_at_ffn,
                              ))])) for j in range(depths[i])
        ])
        blocks.append(block)
        depth_offset += depths[i] * 2

    self.convs = nn.ModuleList(convs)
    self.blocks = nn.ModuleList(blocks)

    self.avgpool = nn.AdaptiveAvgPool1d(1)

forward

forward(x)
Source code in vllm/model_executor/models/florence2.py
def forward(self, x):
    x = self.forward_features(x)
    x = self.head(x)
    return x

forward_features

forward_features(x)
Source code in vllm/model_executor/models/florence2.py
def forward_features(self, x):
    x = self.forward_features_unpool(x)

    # (batch_size, num_tokens, token_dim)
    x = self.avgpool(x.transpose(1, 2))
    # (batch_size, 1, num_tokens)
    x = torch.flatten(x, 1)
    x = self.norms(x)

    return x

forward_features_unpool

forward_features_unpool(x)

Run the network up to (but not including) the final average pooling.

Parameters:

  • x: input image tensor

Source code in vllm/model_executor/models/florence2.py
def forward_features_unpool(self, x):
    """
    forward until avg pooling 
    Args:
        x (_type_): input image tensor
    """
    input_size = (x.size(2), x.size(3))
    for conv, block in zip(self.convs, self.blocks):
        x, input_size = conv(x, input_size)
        x, input_size = block(x, input_size)
    return x
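
Note that forward_features and forward reference self.norms and self.head, which are never assigned in __init__ above; Florence-2 only calls forward_features_unpool. A hedged sketch of driving it directly, assuming vLLM is installed (the embed_dims / num_heads / num_groups below are illustrative values chosen so each stage's dim divides evenly; the real values come from config.vision_config via from_config):

import torch
from vllm.model_executor.models.florence2 import DaViT

davit = DaViT(embed_dims=(96, 192, 384, 768),
              num_heads=(3, 6, 12, 24),
              num_groups=(3, 6, 12, 24))
img = torch.randn(1, 3, 224, 224)
feats = davit.forward_features_unpool(img)
# stem stride 4, then stride 2 per stage: 224 -> 56 -> 28 -> 14 -> 7
print(feats.shape)                         # torch.Size([1, 49, 768])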

from_config classmethod

from_config(config)
Source code in vllm/model_executor/models/florence2.py
@classmethod
def from_config(cls, config):
    return cls(
        depths=config.depths,
        embed_dims=config.dim_embed,
        num_heads=config.num_heads,
        num_groups=config.num_groups,
        patch_size=config.patch_size,
        patch_stride=config.patch_stride,
        patch_padding=config.patch_padding,
        patch_prenorm=config.patch_prenorm,
        drop_path_rate=config.drop_path_rate,
        window_size=config.window_size,
    )

DepthWiseConv2d

Bases: Module

Source code in vllm/model_executor/models/florence2.py
class DepthWiseConv2d(nn.Module):

    def __init__(
        self,
        dim_in,
        kernel_size,
        padding,
        stride,
        bias=True,
    ):
        super().__init__()
        self.dw = nn.Conv2d(dim_in,
                            dim_in,
                            kernel_size=kernel_size,
                            padding=padding,
                            groups=dim_in,
                            stride=stride,
                            bias=bias)

    def forward(self, x, size):
        B, N, C = x.shape
        H, W = size
        assert N == H * W

        x = self.dw(x.transpose(1, 2).view(B, C, H, W))
        size = (x.size(-2), x.size(-1))
        x = x.flatten(2).transpose(1, 2)
        return x, size

dw instance-attribute

dw = Conv2d(
    dim_in,
    dim_in,
    kernel_size=kernel_size,
    padding=padding,
    groups=dim_in,
    stride=stride,
    bias=bias,
)

__init__

__init__(dim_in, kernel_size, padding, stride, bias=True)
Source code in vllm/model_executor/models/florence2.py
def __init__(
    self,
    dim_in,
    kernel_size,
    padding,
    stride,
    bias=True,
):
    super().__init__()
    self.dw = nn.Conv2d(dim_in,
                        dim_in,
                        kernel_size=kernel_size,
                        padding=padding,
                        groups=dim_in,
                        stride=stride,
                        bias=bias)

forward

forward(x, size)
Source code in vllm/model_executor/models/florence2.py
def forward(self, x, size):
    B, N, C = x.shape
    H, W = size
    assert N == H * W

    x = self.dw(x.transpose(1, 2).view(B, C, H, W))
    size = (x.size(-2), x.size(-1))
    x = x.flatten(2).transpose(1, 2)
    return x, size
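
A pure-torch sketch of the round trip performed above: the flattened tokens are reshaped back into a (B, C, H, W) map so the convolution can run, and groups=dim_in makes it depthwise (one 3x3 filter per channel).

import torch
import torch.nn as nn

B, H, W, C = 2, 14, 14, 64
dw = nn.Conv2d(C, C, kernel_size=3, padding=1, stride=1, groups=C)
x = torch.randn(B, H * W, C)                 # flattened tokens
y = dw(x.transpose(1, 2).view(B, C, H, W))   # unflatten, convolve
y = y.flatten(2).transpose(1, 2)             # back to (B, N, C)
print(y.shape)                               # torch.Size([2, 196, 64])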

Florence2DummyInputsBuilder

Bases: BaseDummyInputsBuilder[Florence2ProcessingInfo]

Source code in vllm/model_executor/models/florence2.py
class Florence2DummyInputsBuilder(
        BaseDummyInputsBuilder[Florence2ProcessingInfo]):

    def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
        return ""

    def get_dummy_mm_data(
        self,
        seq_len: int,
        mm_counts: Mapping[str, int],
    ) -> MultiModalDataDict:
        num_images = mm_counts.get("image", 0)

        target_width = target_height = self.info.get_hf_config().projection_dim

        return {
            "image":
            self._get_dummy_images(width=target_width,
                                   height=target_height,
                                   num_images=num_images)
        }

get_dummy_mm_data

get_dummy_mm_data(
    seq_len: int, mm_counts: Mapping[str, int]
) -> MultiModalDataDict
Source code in vllm/model_executor/models/florence2.py
def get_dummy_mm_data(
    self,
    seq_len: int,
    mm_counts: Mapping[str, int],
) -> MultiModalDataDict:
    num_images = mm_counts.get("image", 0)

    target_width = target_height = self.info.get_hf_config().projection_dim

    return {
        "image":
        self._get_dummy_images(width=target_width,
                               height=target_height,
                               num_images=num_images)
    }

get_dummy_text

get_dummy_text(mm_counts: Mapping[str, int]) -> str
Source code in vllm/model_executor/models/florence2.py
def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
    return ""

Florence2ForConditionalGeneration

Bases: Module, SupportsMultiModal, SupportsV0Only

Source code in vllm/model_executor/models/florence2.py
@MULTIMODAL_REGISTRY.register_processor(
    Florence2MultiModalProcessor,
    info=Florence2ProcessingInfo,
    dummy_inputs=Florence2DummyInputsBuilder)
class Florence2ForConditionalGeneration(nn.Module, SupportsMultiModal,
                                        SupportsV0Only):

    @classmethod
    def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
        if modality.startswith("image"):
            return None

        raise ValueError("Only image modality is supported")

    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
        super().__init__()
        config = vllm_config.model_config.hf_config
        processor_config = vllm_config.model_config.hf_image_processor_config

        self.config = config
        self.vision_config = config.vision_config
        self.processor_config = processor_config
        assert config.vision_config.model_type == 'davit', (
            'only DaViT is supported for now')
        self.vision_tower = DaViT.from_config(config=config.vision_config)
        self._build_image_projection_layers(config)
        self.language_model = Florence2LanguageForConditionalGeneration(
            vllm_config=vllm_config.with_hf_config(config.text_config),
            prefix=f"{prefix}.language_model",
        )
        self.pad_token_id = config.pad_token_id

    def _build_image_projection_layers(self, config: PretrainedConfig):
        image_dim_out = config.vision_config.dim_embed[-1]
        dim_projection = config.vision_config.projection_dim
        self.image_projection = nn.Parameter(
            torch.empty(image_dim_out, dim_projection))
        self.image_proj_norm = nn.LayerNorm(dim_projection)
        image_pos_embed_config = config.vision_config.image_pos_embed
        if image_pos_embed_config['type'] == 'learned_abs_2d':
            self.image_pos_embed = LearnedAbsolutePositionEmbedding2D(
                embedding_dim=image_dim_out,
                num_pos=image_pos_embed_config['max_pos_embeddings'])
        else:
            raise NotImplementedError("Florence2 only supports learned_abs_2d "
                                      "as image position embedding.")

        self.image_feature_source = config.vision_config.image_feature_source

        # temporal embedding
        visual_temporal_embedding_config = (
            self.vision_config.visual_temporal_embedding)
        if visual_temporal_embedding_config['type'] == 'COSINE':
            self.visual_temporal_embed = PositionalEmbeddingCosine1D(
                embed_dim=image_dim_out,
                max_seq_len=visual_temporal_embedding_config[
                    'max_temporal_embeddings'])
        else:
            raise NotImplementedError(
                'Florence2 only supports COSINE as temporal embedding.')

    def _parse_and_validate_image_input(self, **kwargs: object):
        pixel_values: Optional[Union[list[list[torch.Tensor]],
                                     list[torch.Tensor],
                                     torch.Tensor]] = kwargs.pop(
                                         "pixel_values", None)
        image_embeds: Optional[Union[list[list[torch.Tensor]],
                                     list[torch.Tensor],
                                     torch.Tensor]] = kwargs.pop(
                                         "image_embeds", None)

        if pixel_values is None and image_embeds is None:
            return None

        if pixel_values is not None and image_embeds is not None:
            raise ValueError(
                "Both pixel values and image embeds are provided.")

        if pixel_values is not None:
            size = self.processor_config["size"]
            expected_h, expected_w = size["height"], size["width"]

            return Florence2ImagePixelInputs(
                type="pixel_values",
                data=flatten_bn(pixel_values, concat=True),
                resolve_bindings={
                    "h": expected_h,
                    "w": expected_w
                },
            )

        if image_embeds is not None:
            raise NotImplementedError

        raise AssertionError("This line should be unreachable.")

    def _encode_image(self, pixel_values: torch.Tensor) -> torch.Tensor:
        dtype = next(self.vision_tower.parameters()).dtype
        pixel_values = pixel_values.to(dtype)

        batch_size, T = pixel_values.size(0), 1
        x = self.vision_tower.forward_features_unpool(pixel_values)
        if self.image_pos_embed is not None:
            x = x.view(batch_size * T, -1, x.shape[-1])
            num_tokens = x.shape[-2]
            h, w = int(num_tokens**0.5), int(num_tokens**0.5)
            assert h * w == num_tokens, (
                'only support square feature maps for now')
            x = x.view(batch_size * T, h, w, x.shape[-1])
            pos_embed = self.image_pos_embed(x)
            x = x + pos_embed
            x = x.view(batch_size, T * h * w, x.shape[-1])

        if self.visual_temporal_embed is not None:
            visual_temporal_embed = self.visual_temporal_embed(
                x.view(batch_size, T, -1, x.shape[-1])[:, :, 0])
            x = x.view(batch_size, T, -1,
                       x.shape[-1]) + visual_temporal_embed.view(
                           1, T, 1, x.shape[-1])

        x_feat_dict = {}

        spatial_avg_pool_x = x.view(batch_size, T, -1, x.shape[-1]).mean(dim=2)
        x_feat_dict['spatial_avg_pool'] = spatial_avg_pool_x

        temporal_avg_pool_x = x.view(batch_size, T, -1,
                                     x.shape[-1]).mean(dim=1)
        x_feat_dict['temporal_avg_pool'] = temporal_avg_pool_x

        x = x.view(batch_size, T, -1, x.shape[-1])[:, -1]
        x_feat_dict['last_frame'] = x

        new_x = []
        for _image_feature_source in self.image_feature_source:
            if _image_feature_source not in x_feat_dict:
                raise ValueError('invalid image feature source: {}'.format(
                    _image_feature_source))
            new_x.append(x_feat_dict[_image_feature_source])

        x = torch.cat(new_x, dim=1)

        x = x @ self.image_projection
        x = self.image_proj_norm(x)

        return x

    def _process_image_input(
            self, image_input: Florence2ImagePixelInputs) -> torch.Tensor:
        assert image_input["type"] == "pixel_values"
        pixel_values = image_input["data"]
        return self._encode_image(pixel_values)

    def get_language_model(self) -> torch.nn.Module:
        return self.language_model

    def get_multimodal_embeddings(self,
                                  **kwargs: object) -> MultiModalEmbeddings:
        image_input = self._parse_and_validate_image_input(**kwargs)
        if image_input is None:
            return []
        vision_embeddings = self._process_image_input(image_input)
        return vision_embeddings

    def get_input_embeddings(
        self,
        input_ids: torch.Tensor,
        multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
    ) -> torch.Tensor:
        inputs_embeds = self.language_model.get_input_embeddings(input_ids)
        if multimodal_embeddings is not None \
            and len(multimodal_embeddings) != 0:
            inputs_embeds = merge_multimodal_embeddings(
                input_ids, inputs_embeds, multimodal_embeddings,
                self.pad_token_id)
        return inputs_embeds

    def forward(
        self,
        input_ids: torch.Tensor,
        positions: torch.Tensor,
        intermediate_tensors: Optional[IntermediateTensors] = None,
        *,
        encoder_input_ids: torch.Tensor,
        encoder_positions: torch.Tensor,
        **kwargs,
    ) -> torch.Tensor:
        r"""
        Args:
            input_ids: torch.Tensor of *decoder* input token ids.
            positions: torch.Tensor of *decoder* position indices.
            encoder_input_ids: torch.Tensor of *encoder* input token ids.
            encoder_positions: torch.Tensor of *encoder* position indices
        Returns:
            Output torch.Tensor
        """
        vision_embeddings = self.get_multimodal_embeddings(**kwargs)
        if encoder_input_ids.numel() > 0 or vision_embeddings is not None:
            inputs_embeds = self.get_input_embeddings(encoder_input_ids,
                                                      vision_embeddings)
        else:
            inputs_embeds = None

        hidden_states = self.language_model(input_ids,
                                            positions,
                                            encoder_input_ids,
                                            encoder_positions,
                                            inputs_embeds=inputs_embeds)
        return hidden_states

    def compute_logits(
        self,
        hidden_states: torch.Tensor,
        sampling_metadata: SamplingMetadata,
    ) -> Optional[torch.Tensor]:
        return self.language_model.compute_logits(hidden_states,
                                                  sampling_metadata)

    def load_weights(self, weights: Iterable[tuple[str,
                                                   torch.Tensor]]) -> set[str]:
        loader = AutoWeightsLoader(self)
        return loader.load_weights(weights)

config instance-attribute

config = config

language_model instance-attribute

language_model = Florence2LanguageForConditionalGeneration(
    vllm_config=with_hf_config(text_config),
    prefix=f"{prefix}.language_model",
)

pad_token_id instance-attribute

pad_token_id = pad_token_id

processor_config instance-attribute

processor_config = processor_config

vision_config instance-attribute

vision_config = vision_config

vision_tower instance-attribute

vision_tower = from_config(config=vision_config)

__init__

__init__(*, vllm_config: VllmConfig, prefix: str = '')
Source code in vllm/model_executor/models/florence2.py
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
    super().__init__()
    config = vllm_config.model_config.hf_config
    processor_config = vllm_config.model_config.hf_image_processor_config

    self.config = config
    self.vision_config = config.vision_config
    self.processor_config = processor_config
    assert config.vision_config.model_type == 'davit', (
        'only DaViT is supported for now')
    self.vision_tower = DaViT.from_config(config=config.vision_config)
    self._build_image_projection_layers(config)
    self.language_model = Florence2LanguageForConditionalGeneration(
        vllm_config=vllm_config.with_hf_config(config.text_config),
        prefix=f"{prefix}.language_model",
    )
    self.pad_token_id = config.pad_token_id

_build_image_projection_layers

_build_image_projection_layers(config: PretrainedConfig)
Source code in vllm/model_executor/models/florence2.py
def _build_image_projection_layers(self, config: PretrainedConfig):
    image_dim_out = config.vision_config.dim_embed[-1]
    dim_projection = config.vision_config.projection_dim
    self.image_projection = nn.Parameter(
        torch.empty(image_dim_out, dim_projection))
    self.image_proj_norm = nn.LayerNorm(dim_projection)
    image_pos_embed_config = config.vision_config.image_pos_embed
    if image_pos_embed_config['type'] == 'learned_abs_2d':
        self.image_pos_embed = LearnedAbsolutePositionEmbedding2D(
            embedding_dim=image_dim_out,
            num_pos=image_pos_embed_config['max_pos_embeddings'])
    else:
        raise NotImplementedError("Florence2 only supports learned_abs_2d "
                                  "as image position embedding.")

    self.image_feature_source = config.vision_config.image_feature_source

    # temporal embedding
    visual_temporal_embedding_config = (
        self.vision_config.visual_temporal_embedding)
    if visual_temporal_embedding_config['type'] == 'COSINE':
        self.visual_temporal_embed = PositionalEmbeddingCosine1D(
            embed_dim=image_dim_out,
            max_seq_len=visual_temporal_embedding_config[
                'max_temporal_embeddings'])
    else:
        raise NotImplementedError(
            'Florence2 only supports COSINE as temporal embedding.')

_encode_image

_encode_image(pixel_values: Tensor) -> Tensor
Source code in vllm/model_executor/models/florence2.py
def _encode_image(self, pixel_values: torch.Tensor) -> torch.Tensor:
    dtype = next(self.vision_tower.parameters()).dtype
    pixel_values = pixel_values.to(dtype)

    batch_size, T = pixel_values.size(0), 1
    x = self.vision_tower.forward_features_unpool(pixel_values)
    if self.image_pos_embed is not None:
        x = x.view(batch_size * T, -1, x.shape[-1])
        num_tokens = x.shape[-2]
        h, w = int(num_tokens**0.5), int(num_tokens**0.5)
        assert h * w == num_tokens, (
            'only support square feature maps for now')
        x = x.view(batch_size * T, h, w, x.shape[-1])
        pos_embed = self.image_pos_embed(x)
        x = x + pos_embed
        x = x.view(batch_size, T * h * w, x.shape[-1])

    if self.visual_temporal_embed is not None:
        visual_temporal_embed = self.visual_temporal_embed(
            x.view(batch_size, T, -1, x.shape[-1])[:, :, 0])
        x = x.view(batch_size, T, -1,
                   x.shape[-1]) + visual_temporal_embed.view(
                       1, T, 1, x.shape[-1])

    x_feat_dict = {}

    spatial_avg_pool_x = x.view(batch_size, T, -1, x.shape[-1]).mean(dim=2)
    x_feat_dict['spatial_avg_pool'] = spatial_avg_pool_x

    temporal_avg_pool_x = x.view(batch_size, T, -1,
                                 x.shape[-1]).mean(dim=1)
    x_feat_dict['temporal_avg_pool'] = temporal_avg_pool_x

    x = x.view(batch_size, T, -1, x.shape[-1])[:, -1]
    x_feat_dict['last_frame'] = x

    new_x = []
    for _image_feature_source in self.image_feature_source:
        if _image_feature_source not in x_feat_dict:
            raise ValueError('invalid image feature source: {}'.format(
                _image_feature_source))
        new_x.append(x_feat_dict[_image_feature_source])

    x = torch.cat(new_x, dim=1)

    x = x @ self.image_projection
    x = self.image_proj_norm(x)

    return x
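
The three pooling branches at the end are easiest to read with concrete shapes. A pure-torch sketch (the dimensions are illustrative; T is the temporal axis and equals 1 for a single image): image_feature_source selects some of these tensors and concatenates them along the token axis before the projection and LayerNorm.

import torch

batch_size, T, hw, dim = 2, 1, 49, 1024
x = torch.randn(batch_size, T, hw, dim)

spatial_avg_pool = x.mean(dim=2)     # (B, T, dim): one pooled token per frame
temporal_avg_pool = x.mean(dim=1)    # (B, hw, dim): average over frames
last_frame = x[:, -1]                # (B, hw, dim): tokens of the last frame

# e.g. image_feature_source = ['spatial_avg_pool', 'temporal_avg_pool']
tokens = torch.cat([spatial_avg_pool, temporal_avg_pool], dim=1)
print(tokens.shape)                  # torch.Size([2, 50, 1024])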

_parse_and_validate_image_input

_parse_and_validate_image_input(**kwargs: object)
Source code in vllm/model_executor/models/florence2.py
def _parse_and_validate_image_input(self, **kwargs: object):
    pixel_values: Optional[Union[list[list[torch.Tensor]],
                                 list[torch.Tensor],
                                 torch.Tensor]] = kwargs.pop(
                                     "pixel_values", None)
    image_embeds: Optional[Union[list[list[torch.Tensor]],
                                 list[torch.Tensor],
                                 torch.Tensor]] = kwargs.pop(
                                     "image_embeds", None)

    if pixel_values is None and image_embeds is None:
        return None

    if pixel_values is not None and image_embeds is not None:
        raise ValueError(
            "Both pixel values and image embeds are provided.")

    if pixel_values is not None:
        size = self.processor_config["size"]
        expected_h, expected_w = size["height"], size["width"]

        return Florence2ImagePixelInputs(
            type="pixel_values",
            data=flatten_bn(pixel_values, concat=True),
            resolve_bindings={
                "h": expected_h,
                "w": expected_w
            },
        )

    if image_embeds is not None:
        raise NotImplementedError

    raise AssertionError("This line should be unreachable.")

_process_image_input

_process_image_input(
    image_input: Florence2ImagePixelInputs,
) -> Tensor
Source code in vllm/model_executor/models/florence2.py
def _process_image_input(
        self, image_input: Florence2ImagePixelInputs) -> torch.Tensor:
    assert image_input["type"] == "pixel_values"
    pixel_values = image_input["data"]
    return self._encode_image(pixel_values)

compute_logits

compute_logits(
    hidden_states: Tensor,
    sampling_metadata: SamplingMetadata,
) -> Optional[Tensor]
Source code in vllm/model_executor/models/florence2.py
def compute_logits(
    self,
    hidden_states: torch.Tensor,
    sampling_metadata: SamplingMetadata,
) -> Optional[torch.Tensor]:
    return self.language_model.compute_logits(hidden_states,
                                              sampling_metadata)

forward

forward(
    input_ids: Tensor,
    positions: Tensor,
    intermediate_tensors: Optional[
        IntermediateTensors
    ] = None,
    *,
    encoder_input_ids: Tensor,
    encoder_positions: Tensor,
    **kwargs,
) -> Tensor

Parameters:

  • input_ids (Tensor, required): torch.Tensor of decoder input token ids.
  • positions (Tensor, required): torch.Tensor of decoder position indices.
  • encoder_input_ids (Tensor, required): torch.Tensor of encoder input token ids.
  • encoder_positions (Tensor, required): torch.Tensor of encoder position indices.

Returns: Output torch.Tensor

Source code in vllm/model_executor/models/florence2.py
def forward(
    self,
    input_ids: torch.Tensor,
    positions: torch.Tensor,
    intermediate_tensors: Optional[IntermediateTensors] = None,
    *,
    encoder_input_ids: torch.Tensor,
    encoder_positions: torch.Tensor,
    **kwargs,
) -> torch.Tensor:
    r"""
    Args:
        input_ids: torch.Tensor of *decoder* input token ids.
        positions: torch.Tensor of *decoder* position indices.
        encoder_input_ids: torch.Tensor of *encoder* input token ids.
        encoder_positions: torch.Tensor of *encoder* position indices
    Returns:
        Output torch.Tensor
    """
    vision_embeddings = self.get_multimodal_embeddings(**kwargs)
    if encoder_input_ids.numel() > 0 or vision_embeddings is not None:
        inputs_embeds = self.get_input_embeddings(encoder_input_ids,
                                                  vision_embeddings)
    else:
        inputs_embeds = None

    hidden_states = self.language_model(input_ids,
                                        positions,
                                        encoder_input_ids,
                                        encoder_positions,
                                        inputs_embeds=inputs_embeds)
    return hidden_states

get_input_embeddings

get_input_embeddings(
    input_ids: Tensor,
    multimodal_embeddings: Optional[
        MultiModalEmbeddings
    ] = None,
) -> Tensor
Source code in vllm/model_executor/models/florence2.py
def get_input_embeddings(
    self,
    input_ids: torch.Tensor,
    multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
) -> torch.Tensor:
    inputs_embeds = self.language_model.get_input_embeddings(input_ids)
    if multimodal_embeddings is not None \
        and len(multimodal_embeddings) != 0:
        inputs_embeds = merge_multimodal_embeddings(
            input_ids, inputs_embeds, multimodal_embeddings,
            self.pad_token_id)
    return inputs_embeds
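
A simplified illustration of what merge_multimodal_embeddings does in this call (toy numbers, not the real vocabulary or hidden size): the image features produced by the vision tower are written into the encoder positions whose token id equals pad_token_id.

import torch

pad_token_id = 1
input_ids = torch.tensor([0, 1, 1, 1, 250, 2])   # three pad slots reserved for the image
inputs_embeds = torch.zeros(6, 4)                # text embeddings (toy hidden size 4)
image_embeds = torch.ones(3, 4)                  # one row per reserved slot

inputs_embeds[input_ids == pad_token_id] = image_embeds
print(inputs_embeds)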

get_language_model

get_language_model() -> Module
Source code in vllm/model_executor/models/florence2.py
def get_language_model(self) -> torch.nn.Module:
    return self.language_model

get_multimodal_embeddings

get_multimodal_embeddings(
    **kwargs: object,
) -> MultiModalEmbeddings
Source code in vllm/model_executor/models/florence2.py
def get_multimodal_embeddings(self,
                              **kwargs: object) -> MultiModalEmbeddings:
    image_input = self._parse_and_validate_image_input(**kwargs)
    if image_input is None:
        return []
    vision_embeddings = self._process_image_input(image_input)
    return vision_embeddings

get_placeholder_str classmethod

get_placeholder_str(modality: str, i: int) -> Optional[str]
Source code in vllm/model_executor/models/florence2.py
@classmethod
def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
    if modality.startswith("image"):
        return None

    raise ValueError("Only image modality is supported")

load_weights

load_weights(
    weights: Iterable[tuple[str, Tensor]],
) -> set[str]
Source code in vllm/model_executor/models/florence2.py
def load_weights(self, weights: Iterable[tuple[str,
                                               torch.Tensor]]) -> set[str]:
    loader = AutoWeightsLoader(self)
    return loader.load_weights(weights)
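
For reference, a hedged end-to-end sketch of serving this model through the high-level vLLM API (not copied from vLLM's examples: the model id, task prompt and options are assumptions, and since the class is marked SupportsV0Only a recent vLLM may need the V0 engine):

from PIL import Image
from vllm import LLM, SamplingParams

llm = LLM(model="microsoft/Florence-2-large", trust_remote_code=True)
image = Image.open("example.jpg")

outputs = llm.generate(
    {"prompt": "<CAPTION>", "multi_modal_data": {"image": image}},
    SamplingParams(temperature=0.0, max_tokens=128),
)
print(outputs[0].outputs[0].text)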

Florence2ImagePixelInputs

Bases: TensorSchema

Dimensions
  • b: Batch size
  • c: Number of channels (3)
  • h: Height of the image
  • w: Width of the image
Source code in vllm/model_executor/models/florence2.py
class Florence2ImagePixelInputs(TensorSchema):
    """
    Dimensions:
        - b: Batch size
        - c: Number of channels (3)
        - h: Height of the image
        - w: Width of the image
    """

    type: Literal["pixel_values"]

    data: Annotated[
        torch.Tensor,
        TensorShape("b", 3, "h", "w"),
    ]

data instance-attribute

data: Annotated[Tensor, TensorShape(b, 3, h, w)]

type instance-attribute

type: Literal['pixel_values']

Florence2LanguageForConditionalGeneration

Bases: Module, SupportsV0Only

Source code in vllm/model_executor/models/florence2.py
class Florence2LanguageForConditionalGeneration(nn.Module, SupportsV0Only):

    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
        super().__init__()

        config = vllm_config.model_config.hf_config

        self.config = config
        self.model = Florence2LanguageModel(vllm_config=vllm_config,
                                            prefix=f"{prefix}.model")
        embed_scale = math.sqrt(
            config.d_model) if config.scale_embedding else 1.0

        self.vocab_size = config.vocab_size
        self.lm_head = BartParallelLMHead(self.vocab_size,
                                          config.d_model,
                                          embed_scale=embed_scale)
        if self.config.tie_word_embeddings:
            self.lm_head.tie_weights(self.model.shared)

        self.logits_processor = LogitsProcessor(self.vocab_size,
                                                config.vocab_size)

    def forward(
        self,
        input_ids: torch.Tensor,
        positions: torch.Tensor,
        encoder_input_ids: torch.Tensor,
        encoder_positions: torch.Tensor,
        inputs_embeds: Optional[torch.Tensor] = None,
        **kwargs,
    ) -> torch.Tensor:
        r"""
        Args:
            input_ids: torch.Tensor of *decoder* input token ids.
            positions: torch.Tensor of *decoder* position indices.
            encoder_input_ids: torch.Tensor of *encoder* input token ids.
            encoder_positions: torch.Tensor of *encoder* position indices
        Returns:
            Output torch.Tensor
        """

        return self.model(input_ids,
                          positions,
                          encoder_input_ids,
                          encoder_positions,
                          inputs_embeds=inputs_embeds)

    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
        return self.model.encoder.embed_tokens(input_ids)

    def compute_logits(
        self,
        hidden_states: torch.Tensor,
        sampling_metadata: SamplingMetadata,
    ) -> Optional[torch.Tensor]:
        logits = self.logits_processor(self.lm_head, hidden_states,
                                       sampling_metadata)
        return logits

    def load_weights(self, weights: Iterable[tuple[str,
                                                   torch.Tensor]]) -> set[str]:
        stacked_params_mapping = [
            # (param_name, shard_name, shard_id)
            ("qkv_proj", "q_proj", "q"),
            ("qkv_proj", "k_proj", "k"),
            ("qkv_proj", "v_proj", "v"),
        ]

        params_dict = dict(self.named_parameters())
        loaded_params: set[str] = set()
        for name, loaded_weight in weights:
            for (param_name, weight_name, shard_id) in stacked_params_mapping:
                if weight_name not in name:
                    continue
                name = name.replace(weight_name, param_name)
                param = params_dict[name]
                weight_loader = param.weight_loader
                weight_loader(param, loaded_weight, shard_id)
                break
            else:
                if "final_logits_bias" in name:
                    continue
                if self.config.tie_word_embeddings and ("embed_tokens" in name
                                                        or "lm_head" in name):
                    continue
                param = params_dict[name]
                weight_loader = getattr(param, "weight_loader",
                                        default_weight_loader)
                weight_loader(param, loaded_weight)
            loaded_params.add(name)
        return loaded_params

config instance-attribute

config = config

lm_head instance-attribute

lm_head = BartParallelLMHead(
    vocab_size, d_model, embed_scale=embed_scale
)

logits_processor instance-attribute

logits_processor = LogitsProcessor(vocab_size, vocab_size)

model instance-attribute

model = Florence2LanguageModel(
    vllm_config=vllm_config, prefix=f"{prefix}.model"
)

vocab_size instance-attribute

vocab_size = vocab_size

__init__

__init__(*, vllm_config: VllmConfig, prefix: str = '')
Source code in vllm/model_executor/models/florence2.py
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
    super().__init__()

    config = vllm_config.model_config.hf_config

    self.config = config
    self.model = Florence2LanguageModel(vllm_config=vllm_config,
                                        prefix=f"{prefix}.model")
    embed_scale = math.sqrt(
        config.d_model) if config.scale_embedding else 1.0

    self.vocab_size = config.vocab_size
    self.lm_head = BartParallelLMHead(self.vocab_size,
                                      config.d_model,
                                      embed_scale=embed_scale)
    if self.config.tie_word_embeddings:
        self.lm_head.tie_weights(self.model.shared)

    self.logits_processor = LogitsProcessor(self.vocab_size,
                                            config.vocab_size)

compute_logits

compute_logits(
    hidden_states: Tensor,
    sampling_metadata: SamplingMetadata,
) -> Optional[Tensor]
Source code in vllm/model_executor/models/florence2.py
def compute_logits(
    self,
    hidden_states: torch.Tensor,
    sampling_metadata: SamplingMetadata,
) -> Optional[torch.Tensor]:
    logits = self.logits_processor(self.lm_head, hidden_states,
                                   sampling_metadata)
    return logits

forward

forward(
    input_ids: Tensor,
    positions: Tensor,
    encoder_input_ids: Tensor,
    encoder_positions: Tensor,
    inputs_embeds: Optional[Tensor] = None,
    **kwargs,
) -> Tensor

Parameters:

  • input_ids (Tensor, required): torch.Tensor of decoder input token ids.
  • positions (Tensor, required): torch.Tensor of decoder position indices.
  • encoder_input_ids (Tensor, required): torch.Tensor of encoder input token ids.
  • encoder_positions (Tensor, required): torch.Tensor of encoder position indices.

Returns: Output torch.Tensor

Source code in vllm/model_executor/models/florence2.py
def forward(
    self,
    input_ids: torch.Tensor,
    positions: torch.Tensor,
    encoder_input_ids: torch.Tensor,
    encoder_positions: torch.Tensor,
    inputs_embeds: Optional[torch.Tensor] = None,
    **kwargs,
) -> torch.Tensor:
    r"""
    Args:
        input_ids: torch.Tensor of *decoder* input token ids.
        positions: torch.Tensor of *decoder* position indices.
        encoder_input_ids: torch.Tensor of *encoder* input token ids.
        encoder_positions: torch.Tensor of *encoder* position indices
    Returns:
        Output torch.Tensor
    """

    return self.model(input_ids,
                      positions,
                      encoder_input_ids,
                      encoder_positions,
                      inputs_embeds=inputs_embeds)

get_input_embeddings

get_input_embeddings(input_ids: Tensor) -> Tensor
Source code in vllm/model_executor/models/florence2.py
def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
    return self.model.encoder.embed_tokens(input_ids)

load_weights

load_weights(
    weights: Iterable[tuple[str, Tensor]],
) -> set[str]
Source code in vllm/model_executor/models/florence2.py
def load_weights(self, weights: Iterable[tuple[str,
                                               torch.Tensor]]) -> set[str]:
    stacked_params_mapping = [
        # (param_name, shard_name, shard_id)
        ("qkv_proj", "q_proj", "q"),
        ("qkv_proj", "k_proj", "k"),
        ("qkv_proj", "v_proj", "v"),
    ]

    params_dict = dict(self.named_parameters())
    loaded_params: set[str] = set()
    for name, loaded_weight in weights:
        for (param_name, weight_name, shard_id) in stacked_params_mapping:
            if weight_name not in name:
                continue
            name = name.replace(weight_name, param_name)
            param = params_dict[name]
            weight_loader = param.weight_loader
            weight_loader(param, loaded_weight, shard_id)
            break
        else:
            if "final_logits_bias" in name:
                continue
            if self.config.tie_word_embeddings and ("embed_tokens" in name
                                                    or "lm_head" in name):
                continue
            param = params_dict[name]
            weight_loader = getattr(param, "weight_loader",
                                    default_weight_loader)
            weight_loader(param, loaded_weight)
        loaded_params.add(name)
    return loaded_params
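
Illustrative only: how the stacked_params_mapping above rewrites a checkpoint weight name so the separate q/k/v shards land in the fused qkv_proj parameter (the name below is hypothetical).

stacked_params_mapping = [
    ("qkv_proj", "q_proj", "q"),
    ("qkv_proj", "k_proj", "k"),
    ("qkv_proj", "v_proj", "v"),
]
name = "model.encoder.layers.0.self_attn.k_proj.weight"
for param_name, weight_name, shard_id in stacked_params_mapping:
    if weight_name in name:
        print(name.replace(weight_name, param_name), shard_id)
        break
# -> model.encoder.layers.0.self_attn.qkv_proj.weight k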

Florence2LanguageModel

Bases: Module

Source code in vllm/model_executor/models/florence2.py
class Florence2LanguageModel(nn.Module):

    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
        super().__init__()

        config = vllm_config.model_config.hf_config
        cache_config = vllm_config.cache_config
        quant_config = vllm_config.quant_config

        self.config = config

        self.vocab_size = config.vocab_size

        self.shared = BartScaledWordEmbedding(self.vocab_size, config.d_model)
        self.encoder = BartEncoder(config,
                                   cache_config=cache_config,
                                   quant_config=quant_config,
                                   prefix=f"{prefix}.encoder")
        self.decoder = BartDecoder(config,
                                   cache_config=cache_config,
                                   quant_config=quant_config,
                                   prefix=f"{prefix}.decoder")

        if self.config.tie_word_embeddings:
            self.encoder.embed_tokens.weight = self.shared.weight
            self.decoder.embed_tokens.weight = self.shared.weight

    def forward(
        self,
        input_ids: torch.Tensor,
        positions: torch.Tensor,
        encoder_input_ids: torch.Tensor,
        encoder_positions: torch.Tensor,
        inputs_embeds: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        r"""
        Args:
            input_ids: Indices of *decoder* input sequence tokens 
                in the vocabulary.
                Padding will be ignored by default should you
                provide it.
            positions: Positions of *decoder* input sequence tokens.
            encoder_input_ids: Indices of *encoder* input sequence tokens 
                in the vocabulary.
            encoder_positions: Positions of *encoder* input sequence tokens.
        Returns:
            Model output torch.Tensor
        """

        encoder_hidden_states = None

        if ((inputs_embeds is not None and inputs_embeds.numel() > 0)
                or encoder_input_ids.numel() > 0):
            # Run encoder attention if a non-zero number of encoder tokens
            # are provided as input
            encoder_hidden_states = self.encoder(input_ids=encoder_input_ids,
                                                 positions=encoder_positions,
                                                 inputs_embeds=inputs_embeds)

        # decoder outputs consists of
        # (dec_features, past_key_value, dec_hidden, dec_attn)
        decoder_outputs = self.decoder(
            decoder_input_ids=input_ids,
            decoder_positions=positions,
            encoder_hidden_states=encoder_hidden_states)

        return decoder_outputs

config instance-attribute

config = config

decoder instance-attribute

decoder = BartDecoder(
    config,
    cache_config=cache_config,
    quant_config=quant_config,
    prefix=f"{prefix}.decoder",
)

encoder instance-attribute

encoder = BartEncoder(
    config,
    cache_config=cache_config,
    quant_config=quant_config,
    prefix=f"{prefix}.encoder",
)

shared instance-attribute

shared = BartScaledWordEmbedding(vocab_size, d_model)

vocab_size instance-attribute

vocab_size = vocab_size

__init__

__init__(*, vllm_config: VllmConfig, prefix: str = '')
Source code in vllm/model_executor/models/florence2.py
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
    super().__init__()

    config = vllm_config.model_config.hf_config
    cache_config = vllm_config.cache_config
    quant_config = vllm_config.quant_config

    self.config = config

    self.vocab_size = config.vocab_size

    self.shared = BartScaledWordEmbedding(self.vocab_size, config.d_model)
    self.encoder = BartEncoder(config,
                               cache_config=cache_config,
                               quant_config=quant_config,
                               prefix=f"{prefix}.encoder")
    self.decoder = BartDecoder(config,
                               cache_config=cache_config,
                               quant_config=quant_config,
                               prefix=f"{prefix}.decoder")

    if self.config.tie_word_embeddings:
        self.encoder.embed_tokens.weight = self.shared.weight
        self.decoder.embed_tokens.weight = self.shared.weight

forward

forward(
    input_ids: Tensor,
    positions: Tensor,
    encoder_input_ids: Tensor,
    encoder_positions: Tensor,
    inputs_embeds: Optional[Tensor] = None,
) -> Tensor

Parameters:

  • input_ids (Tensor, required): Indices of decoder input sequence tokens in the vocabulary. Padding will be ignored by default should you provide it.
  • positions (Tensor, required): Positions of decoder input sequence tokens.
  • encoder_input_ids (Tensor, required): Indices of encoder input sequence tokens in the vocabulary.
  • encoder_positions (Tensor, required): Positions of encoder input sequence tokens.

Returns: Model output torch.Tensor

Source code in vllm/model_executor/models/florence2.py
def forward(
    self,
    input_ids: torch.Tensor,
    positions: torch.Tensor,
    encoder_input_ids: torch.Tensor,
    encoder_positions: torch.Tensor,
    inputs_embeds: Optional[torch.Tensor] = None,
) -> torch.Tensor:
    r"""
    Args:
        input_ids: Indices of *decoder* input sequence tokens 
            in the vocabulary.
            Padding will be ignored by default should you
            provide it.
        positions: Positions of *decoder* input sequence tokens.
        encoder_input_ids: Indices of *encoder* input sequence tokens 
            in the vocabulary.
        encoder_positions: Positions of *encoder* input sequence tokens.
    Returns:
        Model output torch.Tensor
    """

    encoder_hidden_states = None

    if ((inputs_embeds is not None and inputs_embeds.numel() > 0)
            or encoder_input_ids.numel() > 0):
        # Run encoder attention if a non-zero number of encoder tokens
        # are provided as input
        encoder_hidden_states = self.encoder(input_ids=encoder_input_ids,
                                             positions=encoder_positions,
                                             inputs_embeds=inputs_embeds)

    # decoder outputs consists of
    # (dec_features, past_key_value, dec_hidden, dec_attn)
    decoder_outputs = self.decoder(
        decoder_input_ids=input_ids,
        decoder_positions=positions,
        encoder_hidden_states=encoder_hidden_states)

    return decoder_outputs
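
A minimal illustration (with hypothetical token values) of the guard in forward above: the encoder branch only runs when the encoder token tensor, or a non-empty inputs_embeds, is provided; an empty encoder tensor skips straight to the decoder.

import torch

# Encoder tokens present: numel() > 0, so the encoder is invoked.
encoder_ids_present = torch.tensor([0, 4, 17, 2])
assert encoder_ids_present.numel() > 0

# No new encoder tokens: numel() == 0, the encoder branch is skipped and
# only the decoder runs over the decoder input tokens.
encoder_ids_empty = torch.empty(0, dtype=torch.long)
assert encoder_ids_empty.numel() == 0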

Florence2MultiModalProcessor

Bases: EncDecMultiModalProcessor[Florence2ProcessingInfo]

Source code in vllm/model_executor/models/florence2.py
class Florence2MultiModalProcessor(
        EncDecMultiModalProcessor[Florence2ProcessingInfo]):

    def _hf_processor_applies_updates(
        self,
        prompt_text: str,
        mm_items: MultiModalDataItems,
        hf_processor_mm_kwargs: Mapping[str, object],
        tokenization_kwargs: Mapping[str, object],
    ) -> bool:
        return False

    def create_encoder_prompt(
        self,
        prompt: Union[str, list[int]],
        mm_data: MultiModalDataDict,
    ) -> Union[str, list[int]]:
        return prompt

    def create_decoder_prompt(
        self,
        prompt: Union[str, list[int]],
        mm_data: MultiModalDataDict,
    ) -> Union[str, list[int]]:
        return [self.info.get_hf_config().eos_token_id]

    def _apply_hf_processor_tokens_only(
        self,
        prompt_tokens: list[int],
    ) -> list[int]:
        hf_processor = self.info.get_hf_processor()
        tokenizer: BartTokenizer = hf_processor.tokenizer
        prompt_text = tokenizer.decode(prompt_tokens)
        # convert task tokens to prompt
        prompt_text = hf_processor._construct_prompts([prompt_text])[0]
        prompt_tokens = tokenizer.encode(prompt_text, add_special_tokens=False)
        return prompt_tokens

    def _call_hf_processor(
        self,
        prompt: str,
        mm_data: Mapping[str, object],
        mm_kwargs: Mapping[str, object],
        tok_kwargs: Mapping[str, object],
    ) -> BatchFeature:
        if mm_data:
            processed_outputs = super()._call_hf_processor(
                prompt, mm_data, mm_kwargs, tok_kwargs)
        else:
            hf_processor = self.info.get_hf_processor()
            tokenizer = hf_processor.tokenizer
            prompt = hf_processor._construct_prompts([prompt])[0]
            processed_outputs = tokenizer(prompt,
                                          add_special_tokens=True,
                                          return_tensors="pt")
        return processed_outputs

    def _get_mm_fields_config(
        self,
        hf_inputs: BatchFeature,
        hf_processor_mm_kwargs: Mapping[str, object],
    ) -> Mapping[str, MultiModalFieldConfig]:
        return dict(pixel_values=MultiModalFieldConfig.batched("image"))

    def _get_prompt_updates(
        self,
        mm_items: MultiModalDataItems,
        hf_processor_mm_kwargs: Mapping[str, object],
        out_mm_kwargs: MultiModalKwargsItems,
    ) -> Sequence[PromptUpdate]:
        hf_config = self.info.get_hf_config()
        pad_token_id = hf_config.pad_token_id
        num_image_tokens = self.info.get_num_image_tokens()
        image_tokens = [pad_token_id] * num_image_tokens

        return [
            PromptInsertion(
                modality="image",
                target=PromptIndexTargets.start(),
                insertion=image_tokens,
            )
        ]

_apply_hf_processor_tokens_only

_apply_hf_processor_tokens_only(
    prompt_tokens: list[int],
) -> list[int]
Source code in vllm/model_executor/models/florence2.py
def _apply_hf_processor_tokens_only(
    self,
    prompt_tokens: list[int],
) -> list[int]:
    hf_processor = self.info.get_hf_processor()
    tokenizer: BartTokenizer = hf_processor.tokenizer
    prompt_text = tokenizer.decode(prompt_tokens)
    # convert task tokens to prompt
    prompt_text = hf_processor._construct_prompts([prompt_text])[0]
    prompt_tokens = tokenizer.encode(prompt_text, add_special_tokens=False)
    return prompt_tokens

_call_hf_processor

_call_hf_processor(
    prompt: str,
    mm_data: Mapping[str, object],
    mm_kwargs: Mapping[str, object],
    tok_kwargs: Mapping[str, object],
) -> BatchFeature
Source code in vllm/model_executor/models/florence2.py
def _call_hf_processor(
    self,
    prompt: str,
    mm_data: Mapping[str, object],
    mm_kwargs: Mapping[str, object],
    tok_kwargs: Mapping[str, object],
) -> BatchFeature:
    if mm_data:
        processed_outputs = super()._call_hf_processor(
            prompt, mm_data, mm_kwargs, tok_kwargs)
    else:
        hf_processor = self.info.get_hf_processor()
        tokenizer = hf_processor.tokenizer
        prompt = hf_processor._construct_prompts([prompt])[0]
        processed_outputs = tokenizer(prompt,
                                      add_special_tokens=True,
                                      return_tensors="pt")
    return processed_outputs

_get_mm_fields_config

_get_mm_fields_config(
    hf_inputs: BatchFeature,
    hf_processor_mm_kwargs: Mapping[str, object],
) -> Mapping[str, MultiModalFieldConfig]
Source code in vllm/model_executor/models/florence2.py
def _get_mm_fields_config(
    self,
    hf_inputs: BatchFeature,
    hf_processor_mm_kwargs: Mapping[str, object],
) -> Mapping[str, MultiModalFieldConfig]:
    return dict(pixel_values=MultiModalFieldConfig.batched("image"))

_get_prompt_updates

_get_prompt_updates(
    mm_items: MultiModalDataItems,
    hf_processor_mm_kwargs: Mapping[str, object],
    out_mm_kwargs: MultiModalKwargsItems,
) -> Sequence[PromptUpdate]
Source code in vllm/model_executor/models/florence2.py
def _get_prompt_updates(
    self,
    mm_items: MultiModalDataItems,
    hf_processor_mm_kwargs: Mapping[str, object],
    out_mm_kwargs: MultiModalKwargsItems,
) -> Sequence[PromptUpdate]:
    hf_config = self.info.get_hf_config()
    pad_token_id = hf_config.pad_token_id
    num_image_tokens = self.info.get_num_image_tokens()
    image_tokens = [pad_token_id] * num_image_tokens

    return [
        PromptInsertion(
            modality="image",
            target=PromptIndexTargets.start(),
            insertion=image_tokens,
        )
    ]

_hf_processor_applies_updates

_hf_processor_applies_updates(
    prompt_text: str,
    mm_items: MultiModalDataItems,
    hf_processor_mm_kwargs: Mapping[str, object],
    tokenization_kwargs: Mapping[str, object],
) -> bool
Source code in vllm/model_executor/models/florence2.py
def _hf_processor_applies_updates(
    self,
    prompt_text: str,
    mm_items: MultiModalDataItems,
    hf_processor_mm_kwargs: Mapping[str, object],
    tokenization_kwargs: Mapping[str, object],
) -> bool:
    return False

create_decoder_prompt

create_decoder_prompt(
    prompt: Union[str, list[int]],
    mm_data: MultiModalDataDict,
) -> Union[str, list[int]]
Source code in vllm/model_executor/models/florence2.py
def create_decoder_prompt(
    self,
    prompt: Union[str, list[int]],
    mm_data: MultiModalDataDict,
) -> Union[str, list[int]]:
    return [self.info.get_hf_config().eos_token_id]

create_encoder_prompt

create_encoder_prompt(
    prompt: Union[str, list[int]],
    mm_data: MultiModalDataDict,
) -> Union[str, list[int]]
Source code in vllm/model_executor/models/florence2.py
def create_encoder_prompt(
    self,
    prompt: Union[str, list[int]],
    mm_data: MultiModalDataDict,
) -> Union[str, list[int]]:
    return prompt
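
The net effect of Florence2MultiModalProcessor can be sketched without the vLLM machinery: create_decoder_prompt seeds the decoder prompt with only the EOS token, while _get_prompt_updates effectively prepends get_num_image_tokens() copies of the pad token to the start of the encoder-side prompt as image placeholders. A plain-Python sketch with hypothetical token ids (not the vLLM API):

# Hypothetical values chosen for illustration only; the real ids come from
# the HF config (pad_token_id, eos_token_id) and the image processor config.
pad_token_id = 1
eos_token_id = 2
num_image_tokens = 577          # stands in for image_seq_length
text_tokens = [100, 101, 102]   # tokenized task prompt

encoder_prompt_tokens = [pad_token_id] * num_image_tokens + text_tokens
decoder_prompt_tokens = [eos_token_id]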

Florence2ProcessingInfo

Bases: BaseProcessingInfo

Source code in vllm/model_executor/models/florence2.py
class Florence2ProcessingInfo(BaseProcessingInfo):

    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
        return {"image": 1}

    def get_num_image_tokens(self) -> int:
        processor_config = self.ctx.get_hf_image_processor_config()
        return processor_config["image_seq_length"]

get_num_image_tokens

get_num_image_tokens() -> int
Source code in vllm/model_executor/models/florence2.py
def get_num_image_tokens(self) -> int:
    processor_config = self.ctx.get_hf_image_processor_config()
    return processor_config["image_seq_length"]

get_supported_mm_limits

get_supported_mm_limits() -> Mapping[str, Optional[int]]
Source code in vllm/model_executor/models/florence2.py
def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
    return {"image": 1}

LearnedAbsolutePositionEmbedding2D

Bases: Module

This module learns positional embeddings up to a fixed maximum size.

Source code in vllm/model_executor/models/florence2.py
class LearnedAbsolutePositionEmbedding2D(nn.Module):
    """
    This module learns positional embeddings up to a fixed maximum size.
    """

    def __init__(self, embedding_dim=256, num_pos=50):
        super().__init__()
        self.row_embeddings = nn.Embedding(num_pos, embedding_dim // 2)
        self.column_embeddings = nn.Embedding(
            num_pos, embedding_dim - (embedding_dim // 2))

    def forward(self, pixel_values):
        """
        pixel_values: (batch_size, height, width, num_channels) 
        returns: (batch_size, height, width, embedding_dim * 2)
        """
        if len(pixel_values.shape) != 4:
            raise ValueError('pixel_values must be a 4D tensor')
        height, width = pixel_values.shape[1:3]
        width_values = torch.arange(width, device=pixel_values.device)
        height_values = torch.arange(height, device=pixel_values.device)
        x_emb = self.column_embeddings(width_values)
        y_emb = self.row_embeddings(height_values)
        # (height, width, embedding_dim * 2)
        pos = torch.cat([
            x_emb.unsqueeze(0).repeat(height, 1, 1),
            y_emb.unsqueeze(1).repeat(1, width, 1)
        ],
                        dim=-1)
        # (embedding_dim * 2, height, width)
        pos = pos.permute(2, 0, 1)
        pos = pos.unsqueeze(0)
        # (batch_size, embedding_dim * 2, height, width)
        pos = pos.repeat(pixel_values.shape[0], 1, 1, 1)
        # (batch_size, height, width, embedding_dim * 2)
        pos = pos.permute(0, 2, 3, 1)
        return pos

column_embeddings instance-attribute

column_embeddings = Embedding(
    num_pos, embedding_dim - embedding_dim // 2
)

row_embeddings instance-attribute

row_embeddings = Embedding(num_pos, embedding_dim // 2)

__init__

__init__(embedding_dim=256, num_pos=50)
Source code in vllm/model_executor/models/florence2.py
def __init__(self, embedding_dim=256, num_pos=50):
    super().__init__()
    self.row_embeddings = nn.Embedding(num_pos, embedding_dim // 2)
    self.column_embeddings = nn.Embedding(
        num_pos, embedding_dim - (embedding_dim // 2))

forward

forward(pixel_values)

pixel_values: (batch_size, height, width, num_channels)
returns: (batch_size, height, width, embedding_dim * 2)

Source code in vllm/model_executor/models/florence2.py
def forward(self, pixel_values):
    """
    pixel_values: (batch_size, height, width, num_channels) 
    returns: (batch_size, height, width, embedding_dim * 2)
    """
    if len(pixel_values.shape) != 4:
        raise ValueError('pixel_values must be a 4D tensor')
    height, width = pixel_values.shape[1:3]
    width_values = torch.arange(width, device=pixel_values.device)
    height_values = torch.arange(height, device=pixel_values.device)
    x_emb = self.column_embeddings(width_values)
    y_emb = self.row_embeddings(height_values)
    # (height, width, embedding_dim * 2)
    pos = torch.cat([
        x_emb.unsqueeze(0).repeat(height, 1, 1),
        y_emb.unsqueeze(1).repeat(1, width, 1)
    ],
                    dim=-1)
    # (embedding_dim * 2, height, width)
    pos = pos.permute(2, 0, 1)
    pos = pos.unsqueeze(0)
    # (batch_size, embedding_dim * 2, height, width)
    pos = pos.repeat(pixel_values.shape[0], 1, 1, 1)
    # (batch_size, height, width, embedding_dim * 2)
    pos = pos.permute(0, 2, 3, 1)
    return pos
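
Usage sketch, assuming the class above is importable from this module: the column and row halves are concatenated, so the last dimension of the returned grid equals the embedding_dim passed to the constructor.

import torch

pos_embed = LearnedAbsolutePositionEmbedding2D(embedding_dim=256, num_pos=50)
features = torch.randn(2, 12, 16, 96)   # (batch_size, height, width, num_channels)
pos = pos_embed(features)               # one row/column embedding pair per pixel
assert pos.shape == (2, 12, 16, 256)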

Mlp

Bases: Module

Source code in vllm/model_executor/models/florence2.py
class Mlp(nn.Module):

    def __init__(
        self,
        in_features,
        hidden_features=None,
        out_features=None,
        act_layer=nn.GELU,
    ):
        super().__init__()
        out_features = out_features or in_features
        hidden_features = hidden_features or in_features
        self.net = nn.Sequential(
            OrderedDict([("fc1", nn.Linear(in_features, hidden_features)),
                         ("act", act_layer()),
                         ("fc2", nn.Linear(hidden_features, out_features))]))

    def forward(self, x, size):
        return self.net(x), size

net instance-attribute

net = Sequential(
    OrderedDict(
        [
            ("fc1", Linear(in_features, hidden_features)),
            ("act", act_layer()),
            ("fc2", Linear(hidden_features, out_features)),
        ]
    )
)

__init__

__init__(
    in_features,
    hidden_features=None,
    out_features=None,
    act_layer=GELU,
)
Source code in vllm/model_executor/models/florence2.py
def __init__(
    self,
    in_features,
    hidden_features=None,
    out_features=None,
    act_layer=nn.GELU,
):
    super().__init__()
    out_features = out_features or in_features
    hidden_features = hidden_features or in_features
    self.net = nn.Sequential(
        OrderedDict([("fc1", nn.Linear(in_features, hidden_features)),
                     ("act", act_layer()),
                     ("fc2", nn.Linear(hidden_features, out_features))]))

forward

forward(x, size)
Source code in vllm/model_executor/models/florence2.py
def forward(self, x, size):
    return self.net(x), size

MySequential

Bases: Sequential

Source code in vllm/model_executor/models/florence2.py
class MySequential(nn.Sequential):

    def forward(self, *inputs):
        for module in self._modules.values():
            if isinstance(inputs, tuple):
                inputs = module(*inputs)
            else:
                inputs = module(inputs)
        return inputs

forward

forward(*inputs)
Source code in vllm/model_executor/models/florence2.py
def forward(self, *inputs):
    for module in self._modules.values():
        if isinstance(inputs, tuple):
            inputs = module(*inputs)
        else:
            inputs = module(inputs)
    return inputs
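
MySequential exists because the DaViT sub-modules in this file pass a (x, size) tuple between stages; unlike nn.Sequential, it re-expands that tuple into positional arguments for the next module. A toy sketch with a stand-in block:

import torch
import torch.nn as nn

class AddOne(nn.Module):
    """Toy stand-in for a (x, size)-style block."""

    def forward(self, x, size):
        return x + 1, size

seq = MySequential(AddOne(), AddOne())
x, size = seq(torch.zeros(2, 4, 8), (2, 2))
assert bool(x.eq(2).all()) and size == (2, 2)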

PositionalEmbeddingCosine1D

Bases: Module

This class implements a very simple positional encoding. It follows closely the encoder from the link below:
https://pytorch.org/tutorials/beginner/translation_transformer.html

Args:
    embed_dim: The dimension of the embeddings.
    dropout_prob: The dropout probability.
    max_seq_len: The maximum length to precompute the positional encodings.

Source code in vllm/model_executor/models/florence2.py
class PositionalEmbeddingCosine1D(nn.Module):
    """
    This class implements a very simple positional encoding. It follows closely
    the encoder from the link below:
    https://pytorch.org/tutorials/beginner/translation_transformer.html
    Args:
        embed_dim: The dimension of the embeddings.
        dropout_prob: The dropout probability.
        max_seq_len: The maximum length to precompute the positional encodings.
    """

    def __init__(self, embed_dim: int = 512, max_seq_len: int = 1024) -> None:
        super().__init__()
        self.embed_dim = embed_dim
        self.max_seq_len = max_seq_len
        # Generate the sinusoidal arrays.
        factor = math.log(10000)
        denominator = torch.exp(-factor * torch.arange(0, self.embed_dim, 2) /
                                self.embed_dim)
        # Matrix where rows correspond to a positional embedding as a function
        # of the position index (i.e., the row index).
        frequencies = \
            torch.arange(0, self.max_seq_len) \
            .reshape(self.max_seq_len, 1) * denominator
        pos_idx_to_embed = torch.zeros((self.max_seq_len, self.embed_dim))
        # Populate uneven entries.
        pos_idx_to_embed[:, 0::2] = torch.sin(frequencies)
        pos_idx_to_embed[:, 1::2] = torch.cos(frequencies)
        # Save the positional embeddings in a constant buffer.
        # self.register_buffer("pos_idx_to_embed", pos_idx_to_embed)
        self.pos_idx_to_embed = nn.Parameter(pos_idx_to_embed,
                                             requires_grad=False)

    def forward(self, seq_embeds: torch.Tensor) -> torch.Tensor:
        """
        Args:
            seq_embeds: The sequence embeddings in order. Allowed size:
                1. [T, D], where T is the length of the sequence, and D is the
                frame embedding dimension.
                2. [B, T, D], where B is the batch size and T and D are the
                same as above.
        Returns a tensor of with the same dimensions as the input: i.e.,
        [1, T, D] or [T, D].
        """
        shape_len = len(seq_embeds.shape)
        assert 2 <= shape_len <= 3
        len_seq = seq_embeds.size(-2)
        assert len_seq <= self.max_seq_len
        pos_embeds = self.pos_idx_to_embed[0:seq_embeds.size(-2), :]
        # Adapt pre-computed positional embeddings to the input.
        if shape_len == 3:
            pos_embeds = pos_embeds.view(
                (1, pos_embeds.size(0), pos_embeds.size(1)))
        return pos_embeds

embed_dim instance-attribute

embed_dim = embed_dim

max_seq_len instance-attribute

max_seq_len = max_seq_len

pos_idx_to_embed instance-attribute

pos_idx_to_embed = Parameter(
    pos_idx_to_embed, requires_grad=False
)

__init__

__init__(
    embed_dim: int = 512, max_seq_len: int = 1024
) -> None
Source code in vllm/model_executor/models/florence2.py
def __init__(self, embed_dim: int = 512, max_seq_len: int = 1024) -> None:
    super().__init__()
    self.embed_dim = embed_dim
    self.max_seq_len = max_seq_len
    # Generate the sinusoidal arrays.
    factor = math.log(10000)
    denominator = torch.exp(-factor * torch.arange(0, self.embed_dim, 2) /
                            self.embed_dim)
    # Matrix where rows correspond to a positional embedding as a function
    # of the position index (i.e., the row index).
    frequencies = \
        torch.arange(0, self.max_seq_len) \
        .reshape(self.max_seq_len, 1) * denominator
    pos_idx_to_embed = torch.zeros((self.max_seq_len, self.embed_dim))
    # Populate uneven entries.
    pos_idx_to_embed[:, 0::2] = torch.sin(frequencies)
    pos_idx_to_embed[:, 1::2] = torch.cos(frequencies)
    # Save the positional embeddings in a constant buffer.
    # self.register_buffer("pos_idx_to_embed", pos_idx_to_embed)
    self.pos_idx_to_embed = nn.Parameter(pos_idx_to_embed,
                                         requires_grad=False)

forward

forward(seq_embeds: Tensor) -> Tensor

Parameters:

    seq_embeds (Tensor, required): The sequence embeddings in order. Allowed sizes:
        1. [T, D], where T is the length of the sequence and D is the frame embedding dimension.
        2. [B, T, D], where B is the batch size and T and D are the same as above.

Returns a tensor with the same dimensions as the input: i.e., [1, T, D] or [T, D].

Source code in vllm/model_executor/models/florence2.py
def forward(self, seq_embeds: torch.Tensor) -> torch.Tensor:
    """
    Args:
        seq_embeds: The sequence embeddings in order. Allowed size:
            1. [T, D], where T is the length of the sequence, and D is the
            frame embedding dimension.
            2. [B, T, D], where B is the batch size and T and D are the
            same as above.
    Returns a tensor of with the same dimensions as the input: i.e.,
    [1, T, D] or [T, D].
    """
    shape_len = len(seq_embeds.shape)
    assert 2 <= shape_len <= 3
    len_seq = seq_embeds.size(-2)
    assert len_seq <= self.max_seq_len
    pos_embeds = self.pos_idx_to_embed[0:seq_embeds.size(-2), :]
    # Adapt pre-computed positional embeddings to the input.
    if shape_len == 3:
        pos_embeds = pos_embeds.view(
            (1, pos_embeds.size(0), pos_embeds.size(1)))
    return pos_embeds
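
Usage sketch: forward returns only the (sliced) positional encodings rather than adding them to the input, so the caller is responsible for the addition.

import torch

pe = PositionalEmbeddingCosine1D(embed_dim=512, max_seq_len=1024)
tokens = torch.randn(4, 77, 512)   # (B, T, D)
pos = pe(tokens)                   # (1, T, D), broadcastable over the batch
assert pos.shape == (1, 77, 512)
tokens_with_pos = tokens + pos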

PreNorm

Bases: Module

Source code in vllm/model_executor/models/florence2.py
class PreNorm(nn.Module):

    def __init__(self, norm, fn):
        super().__init__()
        self.norm = norm
        self.fn = fn

    def forward(self, x, *args, **kwargs):
        shortcut = x
        if self.norm is not None:
            x, size = self.fn(self.norm(x), *args, **kwargs)
        else:
            x, size = self.fn(x, *args, **kwargs)

        x = shortcut + x

        return x, size

fn instance-attribute

fn = fn

norm instance-attribute

norm = norm

__init__

__init__(norm, fn)
Source code in vllm/model_executor/models/florence2.py
def __init__(self, norm, fn):
    super().__init__()
    self.norm = norm
    self.fn = fn

forward

forward(x, *args, **kwargs)
Source code in vllm/model_executor/models/florence2.py
def forward(self, x, *args, **kwargs):
    shortcut = x
    if self.norm is not None:
        x, size = self.fn(self.norm(x), *args, **kwargs)
    else:
        x, size = self.fn(x, *args, **kwargs)

    x = shortcut + x

    return x, size
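
PreNorm wraps any (x, size)-style callable with an optional normalization and a residual connection, which is how ChannelBlock and SpatialBlock build their attention and FFN stages. A small composition sketch, assuming PreNorm and Mlp above are importable:

import torch
import torch.nn as nn

dim = 64
ffn = PreNorm(nn.LayerNorm(dim), Mlp(in_features=dim, hidden_features=4 * dim))
x = torch.randn(2, 49, dim)        # flattened 7x7 feature map
out, size = ffn(x, (7, 7))         # LayerNorm -> Mlp -> residual add
assert out.shape == x.shape and size == (7, 7)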

SpatialBlock

Bases: Module

Source code in vllm/model_executor/models/florence2.py
class SpatialBlock(nn.Module):

    def __init__(self,
                 dim,
                 num_heads,
                 window_size,
                 mlp_ratio=4.,
                 qkv_bias=True,
                 drop_path_rate=0.,
                 act_layer=nn.GELU,
                 norm_layer=nn.LayerNorm,
                 conv_at_attn=True,
                 conv_at_ffn=True):
        super().__init__()

        self.conv1 = PreNorm(None, DepthWiseConv2d(
            dim, 3, 1, 1)) if conv_at_attn else None
        self.window_attn = PreNorm(
            norm_layer(dim),
            WindowAttention(dim, num_heads, window_size, qkv_bias=qkv_bias),
        )
        self.conv2 = PreNorm(None, DepthWiseConv2d(dim, 3, 1,
                                                   1)) if conv_at_ffn else None
        self.ffn = PreNorm(
            norm_layer(dim),
            Mlp(in_features=dim,
                hidden_features=int(dim * mlp_ratio),
                act_layer=act_layer),
        )

    def forward(self, x, size):
        if self.conv1:
            x, size = self.conv1(x, size)
        x, size = self.window_attn(x, size)

        if self.conv2:
            x, size = self.conv2(x, size)
        x, size = self.ffn(x, size)
        return x, size

conv1 instance-attribute

conv1 = (
    PreNorm(None, DepthWiseConv2d(dim, 3, 1, 1))
    if conv_at_attn
    else None
)

conv2 instance-attribute

conv2 = (
    PreNorm(None, DepthWiseConv2d(dim, 3, 1, 1))
    if conv_at_ffn
    else None
)

ffn instance-attribute

ffn = PreNorm(
    norm_layer(dim),
    Mlp(
        in_features=dim,
        hidden_features=int(dim * mlp_ratio),
        act_layer=act_layer,
    ),
)

window_attn instance-attribute

window_attn = PreNorm(
    norm_layer(dim),
    WindowAttention(
        dim, num_heads, window_size, qkv_bias=qkv_bias
    ),
)

__init__

__init__(
    dim,
    num_heads,
    window_size,
    mlp_ratio=4.0,
    qkv_bias=True,
    drop_path_rate=0.0,
    act_layer=GELU,
    norm_layer=LayerNorm,
    conv_at_attn=True,
    conv_at_ffn=True,
)
Source code in vllm/model_executor/models/florence2.py
def __init__(self,
             dim,
             num_heads,
             window_size,
             mlp_ratio=4.,
             qkv_bias=True,
             drop_path_rate=0.,
             act_layer=nn.GELU,
             norm_layer=nn.LayerNorm,
             conv_at_attn=True,
             conv_at_ffn=True):
    super().__init__()

    self.conv1 = PreNorm(None, DepthWiseConv2d(
        dim, 3, 1, 1)) if conv_at_attn else None
    self.window_attn = PreNorm(
        norm_layer(dim),
        WindowAttention(dim, num_heads, window_size, qkv_bias=qkv_bias),
    )
    self.conv2 = PreNorm(None, DepthWiseConv2d(dim, 3, 1,
                                               1)) if conv_at_ffn else None
    self.ffn = PreNorm(
        norm_layer(dim),
        Mlp(in_features=dim,
            hidden_features=int(dim * mlp_ratio),
            act_layer=act_layer),
    )

forward

forward(x, size)
Source code in vllm/model_executor/models/florence2.py
def forward(self, x, size):
    if self.conv1:
        x, size = self.conv1(x, size)
    x, size = self.window_attn(x, size)

    if self.conv2:
        x, size = self.conv2(x, size)
    x, size = self.ffn(x, size)
    return x, size
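
Usage sketch, assuming the classes above are importable from this module: with conv_at_attn and conv_at_ffn disabled, a SpatialBlock reduces to its windowed-attention and FFN stages and maps a flattened (B, H*W, C) feature map back to the same shapes.

import torch

block = SpatialBlock(dim=96, num_heads=3, window_size=7,
                     conv_at_attn=False, conv_at_ffn=False)
x = torch.randn(2, 20 * 24, 96)    # flattened feature map, H=20, W=24
y, size = block(x, (20, 24))
assert y.shape == x.shape and size == (20, 24)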

WindowAttention

Bases: Module

Source code in vllm/model_executor/models/florence2.py
class WindowAttention(nn.Module):

    def __init__(self, dim, num_heads, window_size, qkv_bias=True):

        super().__init__()
        self.dim = dim
        self.window_size = window_size
        self.num_heads = num_heads
        head_dim = dim // num_heads
        self.scale = float(head_dim)**-0.5

        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
        self.proj = nn.Linear(dim, dim)

        self.softmax = nn.Softmax(dim=-1)

    def forward(self, x, size):

        H, W = size
        B, L, C = x.shape
        assert L == H * W, "input feature has wrong size"

        x = x.view(B, H, W, C)

        pad_l = pad_t = 0
        pad_r = (self.window_size - W % self.window_size) % self.window_size
        pad_b = (self.window_size - H % self.window_size) % self.window_size
        x = F.pad(x, (0, 0, pad_l, pad_r, pad_t, pad_b))
        _, Hp, Wp, _ = x.shape

        x = window_partition(x, self.window_size)
        x = x.view(-1, self.window_size * self.window_size, C)

        # W-MSA/SW-MSA
        # attn_windows = self.attn(x_windows)

        B_, N, C = x.shape
        qkv = self.qkv(x).reshape(B_, N, 3, self.num_heads,
                                  C // self.num_heads).permute(2, 0, 3, 1, 4)
        q, k, v = qkv[0], qkv[1], qkv[2]

        q = q * self.scale
        attn = (q @ k.transpose(-2, -1))
        attn = self.softmax(attn)

        x = (attn @ v).transpose(1, 2).reshape(B_, N, C)
        x = self.proj(x)

        # merge windows
        x = x.view(-1, self.window_size, self.window_size, C)
        x = window_reverse(x, B, self.window_size, Hp, Wp)

        if pad_r > 0 or pad_b > 0:
            x = x[:, :H, :W, :].contiguous()

        x = x.view(B, H * W, C)

        return x, size

dim instance-attribute

dim = dim

num_heads instance-attribute

num_heads = num_heads

proj instance-attribute

proj = Linear(dim, dim)

qkv instance-attribute

qkv = Linear(dim, dim * 3, bias=qkv_bias)

scale instance-attribute

scale = float(head_dim) ** -0.5

softmax instance-attribute

softmax = Softmax(dim=-1)

window_size instance-attribute

window_size = window_size

__init__

__init__(dim, num_heads, window_size, qkv_bias=True)
Source code in vllm/model_executor/models/florence2.py
def __init__(self, dim, num_heads, window_size, qkv_bias=True):

    super().__init__()
    self.dim = dim
    self.window_size = window_size
    self.num_heads = num_heads
    head_dim = dim // num_heads
    self.scale = float(head_dim)**-0.5

    self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
    self.proj = nn.Linear(dim, dim)

    self.softmax = nn.Softmax(dim=-1)

forward

forward(x, size)
Source code in vllm/model_executor/models/florence2.py
def forward(self, x, size):

    H, W = size
    B, L, C = x.shape
    assert L == H * W, "input feature has wrong size"

    x = x.view(B, H, W, C)

    pad_l = pad_t = 0
    pad_r = (self.window_size - W % self.window_size) % self.window_size
    pad_b = (self.window_size - H % self.window_size) % self.window_size
    x = F.pad(x, (0, 0, pad_l, pad_r, pad_t, pad_b))
    _, Hp, Wp, _ = x.shape

    x = window_partition(x, self.window_size)
    x = x.view(-1, self.window_size * self.window_size, C)

    # W-MSA/SW-MSA
    # attn_windows = self.attn(x_windows)

    B_, N, C = x.shape
    qkv = self.qkv(x).reshape(B_, N, 3, self.num_heads,
                              C // self.num_heads).permute(2, 0, 3, 1, 4)
    q, k, v = qkv[0], qkv[1], qkv[2]

    q = q * self.scale
    attn = (q @ k.transpose(-2, -1))
    attn = self.softmax(attn)

    x = (attn @ v).transpose(1, 2).reshape(B_, N, C)
    x = self.proj(x)

    # merge windows
    x = x.view(-1, self.window_size, self.window_size, C)
    x = window_reverse(x, B, self.window_size, Hp, Wp)

    if pad_r > 0 or pad_b > 0:
        x = x[:, :H, :W, :].contiguous()

    x = x.view(B, H * W, C)

    return x, size
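
Usage sketch: H and W do not need to be multiples of window_size; forward pads the feature map up to (Hp, Wp), attends within each window, then crops back, so the output shape matches the input.

import torch

attn = WindowAttention(dim=96, num_heads=3, window_size=7)
x = torch.randn(2, 20 * 24, 96)    # H=20, W=24 -> padded to Hp=21, Wp=28 internally
y, size = attn(x, (20, 24))
assert y.shape == (2, 20 * 24, 96) and size == (20, 24)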

window_partition

window_partition(x, window_size: int)
Source code in vllm/model_executor/models/florence2.py
def window_partition(x, window_size: int):
    B, H, W, C = x.shape
    x = x.view(B, H // window_size, window_size, W // window_size, window_size,
               C)
    windows = x.permute(0, 1, 3, 2, 4,
                        5).contiguous().view(-1, window_size, window_size, C)
    return windows

window_reverse

window_reverse(
    windows,
    batch_size: int,
    window_size: int,
    H: int,
    W: int,
)
Source code in vllm/model_executor/models/florence2.py
def window_reverse(windows, batch_size: int, window_size: int, H: int, W: int):
    B = batch_size

    x = windows.view(B, H // window_size, W // window_size, window_size,
                     window_size, -1)
    x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, H, W, -1)
    return x
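
Round-trip sketch: when H and W are multiples of window_size, window_reverse inverts window_partition exactly.

import torch

x = torch.randn(2, 8, 8, 32)                       # (B, H, W, C)
windows = window_partition(x, window_size=4)       # (B * 2 * 2, 4, 4, 32)
restored = window_reverse(windows, batch_size=2, window_size=4, H=8, W=8)
assert torch.equal(restored, x)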