support vllm (#2)

- support vllm (c7716cef25ba1eef5bec2a539453cbef9b88ee3d)
- Update config.json (ba8f23ad1c359ca7ac149bae3123bf38d2989e43)
- Rename configuration_ernie_45t_vl.py to configuration_ernie4_5_vl.py (dc38b3c0c3f64c1c32ce7e8b1dedf83d47efd115)
- Update configuration_ernie4_5_vl.py (af466974158d43c4070c4243ae7ad82f2e8380b5)
- Rename modeling_ernie_45t_vl.py to modeling_ernie4_5_vl.py (753232921ad32debb593699b568e9c70331857a5)
- Rename processing_ernie_45t_vl.py to processing_ernie4_5_vl.py (cdeb408adaf7208fd9cc65b5efc220142d1c77c5)
- Update tokenizer_config.json (facc0daa460b975c0172a2bcbf53850c87b71b29)

Files changed (6) hide show

chat_template.json +1 -1
config.json +16 -5
configuration_ernie_45t_vl.py → configuration_ernie4_5_vl.py +14 -12
modeling_ernie_45t_vl.py → modeling_ernie4_5_vl.py +142 -65
processing_ernie_45t_vl.py → processing_ernie4_5_vl.py +396 -392
tokenizer_config.json +3 -3

chat_template.json CHANGED Viewed

@@ -1,3 +1,3 @@
 {
-        "chat_template": "\n{%- set image_count = namespace(value=0) -%}\n{%- set video_count = namespace(value=0) -%}\n{{- '<|begin_of_sentence|>' }}\n{%- for message in messages -%}\n    {%- if message.role in ['system', 'user'] -%}\n        {%- if message.role == 'user' -%}\n            {{- 'User: ' -}}\n        {%- endif -%}\n        {%- if message.content is string -%}\n            {{- message.content -}}\n        {%- else -%}\n            {%- for content_item in message.content -%}\n                {%- if content_item.type == 'text' -%}\n                    {{- content_item.text -}}\n                {%- elif content_item.type == 'image_url' -%}\n                    {%- set image_count.value = image_count.value + 1 -%}\n                    Picture {{ image_count.value }}:<|IMAGE_START|><|image@placeholder|><|IMAGE_END|>\n                {%- elif content_item.type == 'video_url' -%}\n                    {%- set video_count.value = video_count.value + 1 -%}\n                    Video {{ video_count.value }}:<|VIDEO_START|><|video@placeholder|><|VIDEO_END|>\n                {%- endif -%}\n            {%- endfor -%}\n        {%- endif -%}\n        {%- if message.role == 'system' -%}\n            {{- '\n' -}}\n        {%- endif -%}\n    {%- elif message.role == 'assistant' -%}\n        {%- macro extract_text_content(content_field) -%}\n            {%- if content_field is string -%}\n                {{- content_field -}}\n            {%- elif content_field is iterable and content_field is not string -%}\n                {%- set ns = namespace(text_parts=[]) -%}\n                {%- set text_parts = [] -%}\n                {%- for item in content_field -%}\n                    {%- if item.type == 'text' -%}\n                        {%- set ns.text_parts = ns.text_parts + [item.text] -%}\n                    {%- endif -%}\n                {%- endfor -%}\n                {{- ns.text_parts | join('') -}}\n            {%- else -%}\n                {{- '' -}}\n           
 {%- endif -%}\n        {%- endmacro -%}\n        {%- set reasoning_content = extract_text_content(message.reasoning_content) -%}\n        {%- set content = extract_text_content(message.content) -%}\n        {%- if '</think>' in content %}\n            {%- set reasoning_content = content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}\n            {%- set content = content.split('</think>')[-1].lstrip('\n') %}\n        {%- endif %}\n        {%- if reasoning_content %}\n            {{- '\n' + 'Assistant: ' + '<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}\n        {%- else %}\n            {{- '\n' + 'Assistant: ' + content }}\n        {%- endif %}\n        {{- '<|end_of_sentence|>' }}\n    {%- endif -%}\n{%- endfor -%}\n{%- if add_generation_prompt is not defined or add_generation_prompt is true %}\n    {{- '\nAssistant: ' -}}\n    {%- if enable_thinking is defined and enable_thinking is false %}\n        {{- '<think>\n\n</think>\n\n' }}\n    {%- endif %}\n    {%- if enable_thinking is not defined or enable_thinking is true %}\n        {{- '<think>' }}\n    {%- endif %}\n{%- endif %}\n"
 }

 {
+    "chat_template": "\n{%- set image_count = namespace(value=0) -%}\n{%- set video_count = namespace(value=0) -%}\n{{- '<|begin_of_sentence|>' }}\n{%- for message in messages -%}\n    {%- if message.role in ['system', 'user'] -%}\n        {%- if message.role == 'user' -%}\n            {{- 'User: ' -}}\n        {%- endif -%}\n        {%- if message.content is string -%}\n            {{- message.content -}}\n        {%- else -%}\n            {%- for content_item in message.content -%}\n                {%- if content_item.type == 'text' -%}\n                    {{- content_item.text -}}\n                {%- elif content_item.type in ['image_url', 'image'] -%}\n                    {%- set image_count.value = image_count.value + 1 -%}\n                    Picture {{ image_count.value }}:<|IMAGE_START|><|image@placeholder|><|IMAGE_END|>\n                {%- elif content_item.type in ['video_url', 'video'] -%}\n                    {%- set video_count.value = video_count.value + 1 -%}\n                    Video {{ video_count.value }}:<|VIDEO_START|><|video@placeholder|><|VIDEO_END|>\n                {%- endif -%}\n            {%- endfor -%}\n        {%- endif -%}\n        {%- if message.role == 'system' -%}\n            {{- '\n' -}}\n        {%- endif -%}\n    {%- elif message.role == 'assistant' -%}\n        {%- macro extract_text_content(content_field) -%}\n            {%- if content_field is string -%}\n                {{- content_field -}}\n            {%- elif content_field is iterable and content_field is not string -%}\n                {%- set ns = namespace(text_parts=[]) -%}\n                {%- set text_parts = [] -%}\n                {%- for item in content_field -%}\n                    {%- if item.type == 'text' -%}\n                        {%- set ns.text_parts = ns.text_parts + [item.text] -%}\n                    {%- endif -%}\n                {%- endfor -%}\n                {{- ns.text_parts | join('') -}}\n            {%- else -%}\n                {{- 
'' -}}\n            {%- endif -%}\n        {%- endmacro -%}\n        {%- set reasoning_content = extract_text_content(message.reasoning_content) -%}\n        {%- set content = extract_text_content(message.content) -%}\n        {%- if '</think>' in content %}\n            {%- set reasoning_content = content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}\n            {%- set content = content.split('</think>')[-1].lstrip('\n') %}\n        {%- endif %}\n        {%- if reasoning_content %}\n            {{- '\n' + 'Assistant: ' + '<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}\n        {%- else %}\n            {{- '\n' + 'Assistant: ' + content }}\n        {%- endif %}\n        {{- '<|end_of_sentence|>' }}\n    {%- endif -%}\n{%- endfor -%}\n{%- if add_generation_prompt is not defined or add_generation_prompt is true %}\n    {{- '\nAssistant: ' -}}\n    {%- if enable_thinking is defined and enable_thinking is false %}\n        {{- '<think>\n\n</think>\n\n' }}\n    {%- endif %}\n    {%- if enable_thinking is not defined or enable_thinking is true %}\n        {{- '<think>' }}\n    {%- endif %}\n{%- endif %}\n"
 }

config.json CHANGED Viewed

@@ -3,11 +3,11 @@
     "Ernie4_5_VLMoeForConditionalGeneration"
   ],
   "auto_map": {
-    "AutoConfig": "configuration_ernie_45t_vl.Ernie4_5_VLMoEConfig",
-    "AutoModel": "modeling_ernie_45t_vl.Ernie4_5_VLMoeForConditionalGeneration",
-    "AutoModelForCausalLM": "modeling_ernie_45t_vl.Ernie4_5_VLMoeForConditionalGeneration",
-    "AutoProcessor": "processing_ernie_45t_vl.Ernie_45T_VLProcessor",
-    "AutoImageProcessor": "processing_ernie_45t_vl.Ernie_45T_VLImageProcessor"
   },
   "pad_token_id": 0,
   "bos_token_id": 1,
@@ -17,6 +17,9 @@
   "hidden_size": 2560,
   "intermediate_size": 12288,
   "im_patch_id": 100295,
   "model_type": "ernie4_5_moe_vl",
   "moe_capacity": [128, 128, 128],
   "moe_gate": "topk",
@@ -43,6 +46,14 @@
   "use_cache": true,
   "use_rmsnorm": true,
   "use_bias": false,
   "vision_config": {
     "attn_implementation": "eager",
     "depth": 32,

     "Ernie4_5_VLMoeForConditionalGeneration"
   ],
   "auto_map": {
+    "AutoConfig": "configuration_ernie4_5_vl.Ernie4_5_VLMoEConfig",
+    "AutoModel": "modeling_ernie4_5_vl.Ernie4_5_VLMoeForConditionalGeneration",
+    "AutoModelForCausalLM": "modeling_ernie4_5_vl.Ernie4_5_VLMoeForConditionalGeneration",
+    "AutoProcessor": "processing_ernie4_5_vl.Ernie4_5_VLProcessor",
+    "AutoImageProcessor": "processing_ernie4_5_vl.Ernie4_5_VLImageProcessor"
   },
   "pad_token_id": 0,
   "bos_token_id": 1,
   "hidden_size": 2560,
   "intermediate_size": 12288,
   "im_patch_id": 100295,
+  "video_start_token_id": 101306,
+  "video_end_token_id": 101307,
+  "max_position_embeddings": 131072,
   "model_type": "ernie4_5_moe_vl",
   "moe_capacity": [128, 128, 128],
   "moe_gate": "topk",
   "use_cache": true,
   "use_rmsnorm": true,
   "use_bias": false,
+  "rope_scaling": {
+    "type": "default",
+    "mrope_section": [
+      22,
+      22,
+      20
+    ]
+  },
   "vision_config": {
     "attn_implementation": "eager",
     "depth": 32,

configuration_ernie_45t_vl.py → configuration_ernie4_5_vl.py RENAMED Viewed

@@ -171,7 +171,7 @@ class Ernie4_5_Config(PretrainedConfig):
         use_fast_ln=False,
         weight_share_add_bias=True,
         fuse_linear=False,
-        max_sequence_length=1024,
         ignored_index=-100,
         add_tail_layers=False,
         use_recompute_lm_head=False,
@@ -539,17 +539,19 @@ class Ernie4_5_VLMoEConfig(Ernie4_5_MoEConfig):
         "activation_function": "hidden_act",
     }
     base_model_tp_plan = {
-        "ernie.layers.*.self_attn.qkv_proj": "colwise",
-        "ernie.layers.*.self_attn.o_proj": "rowwise",
-        "ernie.layers.*.mlp_text.experts.*.up_gate_proj": "colwise",
-        "ernie.layers.*.mlp_text.experts.*.down_proj": "rowwise",
-        "ernie.layers.*.mlp_text.gate": "colwise_rep",
-        "ernie.layers.*.mlp.experts.*.up_gate_proj": "colwise",
-        "ernie.layers.*.mlp.experts.*.down_proj": "rowwise",
-        "ernie.layers.*.mlp.gate": "colwise_rep",
-        "ernie.layers.*.mlp.up_gate_proj": "colwise",
-        "ernie.layers.*.mlp.down_proj": "rowwise",
-        "lm_head": "colwise_rep",
     }
     def __init__(

         use_fast_ln=False,
         weight_share_add_bias=True,
         fuse_linear=False,
+        max_sequence_length=None,
         ignored_index=-100,
         add_tail_layers=False,
         use_recompute_lm_head=False,
         "activation_function": "hidden_act",
     }
     base_model_tp_plan = {
+        "model.layers.*.self_attn.q_proj": "colwise_rep",
+        "model.layers.*.self_attn.k_proj": "colwise_rep",
+        "model.layers.*.self_attn.v_proj": "colwise_rep",
+        "model.layers.*.self_attn.o_proj": "rowwise_rep",
+        "model.layers.*.mlp.experts.*.gate_proj": "colwise",
+        "model.layers.*.mlp.experts.*.up_proj": "colwise",
+        "model.layers.*.mlp.experts.*.down_proj": "rowwise",
+        "model.layers.*.mlp_text.experts.*.gate_proj": "colwise",
+        "model.layers.*.mlp_text.experts.*.up_proj": "colwise",
+        "model.layers.*.mlp_text.experts.*.down_proj": "rowwise",
+        "model.layers.*.mlp.gate_proj": "colwise",
+        "model.layers.*.mlp.up_proj": "colwise",
+        "model.layers.*.mlp.down_proj": "rowwise"
     }
     def __init__(

modeling_ernie_45t_vl.py → modeling_ernie4_5_vl.py RENAMED Viewed

@@ -27,13 +27,14 @@ import numpy as np
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
 from transformers.activations import ACT2FN
 from transformers.generation import GenerationMixin
 from transformers.modeling_outputs import ModelOutput
 from transformers.modeling_utils import PreTrainedModel
 from transformers.utils import logging
-from .configuration_ernie_45t_vl import (
     DFNRopeVisionTransformerConfig,
     Ernie4_5_MoEConfig,
     Ernie4_5_VLMoEConfig,
@@ -321,6 +322,7 @@ class Ernie4_5_Attention(nn.Module):
         self.hidden_size = config.hidden_size
         self.num_heads = config.num_attention_heads
         self.num_key_value_heads = config.num_key_value_heads
         self.head_dim = self.hidden_size // self.num_heads
         self.is_gqa = (
             self.num_key_value_heads is not None
@@ -373,7 +375,10 @@ class Ernie4_5_Attention(nn.Module):
             freq_allocation=self.freq_allocation,
         )
         self.config = config
-        self.attn_func = self.core_attn
     def forward(
         self,
@@ -446,6 +451,47 @@ class Ernie4_5_Attention(nn.Module):
         )
         return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
     def core_attn(
         self,
         q,
@@ -493,19 +539,13 @@ class Ernie4_5_Attention(nn.Module):
         if getattr(self.config, "scale_qk_coeff", 1.0) != 1.0:
             product = product * getattr(self.config, "scale_qk_coeff", 1.0)
-        if attention_mask is not None:
-            attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)
-            attention_mask = attention_mask.to(torch.float32)
-            product = product + attention_mask
-            weights = F.softmax(product, dim=-1)
-        else:
-            seq_len = product.size(-1)
-            mask = torch.triu(
-                torch.ones((seq_len, seq_len), dtype=torch.bool, device=product.device),
-                diagonal=1,
-            )
-            product = product.masked_fill(mask, float("-inf"))
-            weights = F.softmax(product, dim=-1)
         weights = weights.to(origin_dtype)
@@ -1508,16 +1548,8 @@ class MOELayer(nn.Module):
             )
             assert self.gate.config.moe_use_aux_free
-        try:
-            self.world_size = torch.distributed.get_world_size()
-            self.rank = torch.distributed.get_rank()
-        except:
-            self.world_size = 1
-            self.rank = 0
-        if self.world_size < 1:
-            self.world_size = 1
-        if self.rank < 0:
-            self.rank = 0
         self.multimodal_experts = (
             isinstance(moe_num_experts, (tuple, list)) and len(moe_num_experts) > 1
@@ -1602,7 +1634,11 @@ class MOELayer(nn.Module):
         S, H = x.shape
         E = gate_logits.shape[1]
         device = x.device
-        topk_prob, topk_idx = torch.topk(gate_logits, k, dim=-1)  # [S, k]
         combine_weights = topk_prob  # [S, k]
         expert_id = topk_idx  # [S, k]
         y = x.new_zeros((E, capacity, H))  # [E, C, H]
@@ -1803,7 +1839,7 @@ class MOEAllGatherLayerV2(MOELayer):
         enable_reverse_token_drop=False,
         all_to_all_dropout=0,
         group_experts=False,
-        use_expert_out_alltoall=True,  #
         use_expert_alltoall_overlap=False,
         use_padding=True,
         dense_token_type=3,  # considerd as dense tokens (no moe)
@@ -2729,7 +2765,6 @@ class Ernie4_5_PretrainedModel(PreTrainedModel):
     config_class = Ernie4_5_MoEConfig
     base_model_prefix = "ernie"
     _no_split_modules = ["Ernie4_5_DecoderLayer"]
-    # _keep_in_fp32_modules = ["mlp.gate", "e_score_correction_bias"]
 class Ernie4_5_Model(Ernie4_5_PretrainedModel):
@@ -2876,7 +2911,6 @@ class Ernie4_5_Model(Ernie4_5_PretrainedModel):
             past_key_value = (
                 past_key_values[idx] if past_key_values is not None else None
             )
             layer_outputs = decoder_layer(
                 hidden_states,
                 attention_mask,
@@ -3224,15 +3258,61 @@ class Ernie4_5_MoeForCausalLM(Ernie4_5_PretrainedModel, GenerationMixin):
         """
         return self.model
-    def prepare_attention_mask_for_generation(
-        self, input_ids, pad_token_id, eos_token_id
-    ):
-        """Avoid using attention_mask with flash_attn on generation."""
-        if self.config.use_flash_attention:
-            return None
-        return super().prepare_attention_mask_for_generation(
-            input_ids, pad_token_id, eos_token_id
-        )
 class VisionMlp(nn.Module):
@@ -3381,33 +3461,27 @@ class VisionAttention(nn.Module):
         k = apply_rotary_pos_emb_vision(k.unsqueeze(dim=0), rotary_pos_emb).squeeze(
             dim=0
         )
-        max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max().item()
-        attention_mask = torch.full(
-            [1, seq_length, seq_length],
-            torch.finfo(q.dtype).min,
-            device=q.device,
-            dtype=q.dtype,
-        )
-        for i in range(1, len(cu_seqlens)):
-            attention_mask[
-                ...,
-                cu_seqlens[i - 1] : cu_seqlens[i],
-                cu_seqlens[i - 1] : cu_seqlens[i],
-            ] = 0
         q = q.transpose(0, 1)
         k = k.transpose(0, 1)
         v = v.transpose(0, 1)
-        attn_weights = torch.matmul(q, k.transpose(1, 2)) / math.sqrt(self.head_dim)
-        attn_weights = attn_weights + attention_mask
-        attn_weights = nn.functional.softmax(
-            attn_weights, dim=-1, dtype=torch.float32
-        ).to(q.dtype)
-        attn_output = torch.matmul(attn_weights, v)
-        attn_output = attn_output.transpose(0, 1)
-        attn_output = attn_output.reshape(seq_length, -1)
         attn_output = self.proj(attn_output)
         return attn_output
@@ -3943,7 +4017,10 @@ class Ernie4_5_VLMoeForConditionalGeneration(Ernie4_5_MoeForCausalLM):
                 image_type_ids[:, -1:] if image_type_ids is not None else None
             )
-        attention_mask = kwargs.get("attention_mask", None)
         # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
         if inputs_embeds is not None and past_key_values is None:
@@ -4077,7 +4154,7 @@ class Ernie4_5_VLMoeForConditionalGeneration(Ernie4_5_MoeForCausalLM):
         if images is not None and image_features is not None:
             inputs_embeds = self.vision_mapping_forward(
-                token_type_ids,
                 token_type_ids_w_video,
                 input_ids,
                 mm_input_ids,
@@ -4091,7 +4168,7 @@ class Ernie4_5_VLMoeForConditionalGeneration(Ernie4_5_MoeForCausalLM):
         outputs = self.model(
             position_ids=position_ids,
-            attention_mask=None,
             token_type_ids=token_type_ids,
             inputs_embeds=inputs_embeds,
             use_cache=use_cache,

 import torch
 import torch.nn as nn
 import torch.nn.functional as F
+from torch.nn.attention import SDPBackend, sdpa_kernel
 from transformers.activations import ACT2FN
 from transformers.generation import GenerationMixin
 from transformers.modeling_outputs import ModelOutput
 from transformers.modeling_utils import PreTrainedModel
 from transformers.utils import logging
+from .configuration_ernie4_5_vl import (
     DFNRopeVisionTransformerConfig,
     Ernie4_5_MoEConfig,
     Ernie4_5_VLMoEConfig,
         self.hidden_size = config.hidden_size
         self.num_heads = config.num_attention_heads
         self.num_key_value_heads = config.num_key_value_heads
+        self.num_key_value_groups = config.num_attention_heads // config.num_key_value_heads
         self.head_dim = self.hidden_size // self.num_heads
         self.is_gqa = (
             self.num_key_value_heads is not None
             freq_allocation=self.freq_allocation,
         )
         self.config = config
+        if self.config.use_flash_attention:
+            self.attn_func = self._flash_attention_wrapper
+        else:
+            self.attn_func = self.core_attn
     def forward(
         self,
         )
         return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
+    def _flash_attention_wrapper(
+        self,
+        q,
+        k,
+        v,
+        attention_mask=None,
+        attn_mask_start_row_indices=None,
+        seq_length=None,
+    ):
+        """Wrapper for flash attention implementation.
+        Args:
+            q (torch.Tensor): Query tensor
+            k (torch.Tensor): Key tensor
+            v (torch.Tensor): Value tensor
+            attention_mask (Optional[torch.Tensor]): Attention mask
+            attn_mask_start_row_indices (Optional[torch.Tensor]): Variable length indices
+            seq_length (Optional[int]): Sequence length
+        Returns:
+            Tuple[torch.Tensor, torch.Tensor]: Attention output and weights
+        """
+        q = q.transpose(1, 2)
+        k = k.transpose(1, 2)
+        v = v.transpose(1, 2)
+        with sdpa_kernel(SDPBackend.FLASH_ATTENTION):
+            out = F.scaled_dot_product_attention(
+                q,
+                k,
+                v,
+                attn_mask=None,
+                dropout_p=self.config.attention_probs_dropout_prob,
+                is_causal=q.shape[-2] == k.shape[-2],
+                scale=1
+                / (getattr(self.config, "scale_qk_coeff", 1.0) * self.head_dim**0.5),
+                enable_gqa=self.is_gqa,
+            )
+        out = out.transpose(1, 2)
+        out = out.contiguous().view(out.size(0), out.size(1), -1)
+        return out, None
     def core_attn(
         self,
         q,
         if getattr(self.config, "scale_qk_coeff", 1.0) != 1.0:
             product = product * getattr(self.config, "scale_qk_coeff", 1.0)
+        seq_len = product.size(-1)
+        mask = torch.triu(
+            torch.ones((seq_len, seq_len), dtype=torch.bool, device=product.device),
+            diagonal=1,
+        )
+        product = product.masked_fill(mask, float("-inf"))
+        weights = F.softmax(product, dim=-1)
         weights = weights.to(origin_dtype)
             )
             assert self.gate.config.moe_use_aux_free
+        self.world_size = 1
+        self.rank = 0
         self.multimodal_experts = (
             isinstance(moe_num_experts, (tuple, list)) and len(moe_num_experts) > 1
         S, H = x.shape
         E = gate_logits.shape[1]
         device = x.device
+        if self.use_correction_bias:
+            _, topk_idx = torch.topk(gate_logits + self.moe_statics.e_score_correction_bias[0].detach().to(gate_logits.device), k, dim=-1)
+            topk_prob = torch.gather(gate_logits, dim=1, index=topk_idx) #  [Seq, k]
+        else:
+            topk_prob, topk_idx = torch.topk(gate_logits, k, dim=-1)  # [S, k]
         combine_weights = topk_prob  # [S, k]
         expert_id = topk_idx  # [S, k]
         y = x.new_zeros((E, capacity, H))  # [E, C, H]
         enable_reverse_token_drop=False,
         all_to_all_dropout=0,
         group_experts=False,
+        use_expert_out_alltoall=True,
         use_expert_alltoall_overlap=False,
         use_padding=True,
         dense_token_type=3,  # considerd as dense tokens (no moe)
     config_class = Ernie4_5_MoEConfig
     base_model_prefix = "ernie"
     _no_split_modules = ["Ernie4_5_DecoderLayer"]
 class Ernie4_5_Model(Ernie4_5_PretrainedModel):
             past_key_value = (
                 past_key_values[idx] if past_key_values is not None else None
             )
             layer_outputs = decoder_layer(
                 hidden_states,
                 attention_mask,
         """
         return self.model
+    # @staticmethod
+    def _update_model_kwargs_for_generation(self, outputs, model_kwargs, is_encoder_decoder=False):
+        """
+        Updates model kwargs for generation.
+        Args:
+            outputs (Any): Model outputs.
+            model_kwargs (dict): Current model kwargs.
+            is_encoder_decoder (bool): Whether using encoder-decoder architecture.
+        Returns:
+            dict: Updated model kwargs.
+        """
+        # update cache
+        if isinstance(outputs, tuple) and len(outputs) > 1 and not isinstance(outputs[1], torch.Tensor):
+            model_kwargs["past_key_values"] = outputs[1]
+        if isinstance(outputs, CausalLMOutputWithCrossAttentions) and "past_key_values" in outputs:
+            model_kwargs["past_key_values"] = outputs.past_key_values
+        # update token_type_ids with last value
+        if "token_type_ids" in model_kwargs and model_kwargs["token_type_ids"] is not None:
+            token_type_ids = model_kwargs["token_type_ids"]
+            model_kwargs["token_type_ids"] = torch.cat([token_type_ids, token_type_ids[:, -1:]], dim=-1)
+        if not is_encoder_decoder and model_kwargs.get("attention_mask", None) is not None:
+            # update attention mask
+            attention_mask = model_kwargs["attention_mask"]
+            model_kwargs["attention_mask"] = torch.cat(
+                [
+                    attention_mask,
+                    torch.ones((attention_mask.shape[0], 1), dtype=torch.int64, device=attention_mask.device),
+                ],
+                dim=-1,
+            )
+        # update role_ids
+        if "role_ids" in model_kwargs and model_kwargs["role_ids"] is not None:
+            role_ids = model_kwargs["role_ids"]
+            model_kwargs["role_ids"] = torch.cat([role_ids, role_ids[:, -1:]], dim=-1)
+        if self.config.get('rope_3d', False):
+            assert "position_ids" in model_kwargs, "position_ids must be provided if rope_3d is on"
+            position_ids = model_kwargs["position_ids"]
+            bsz = position_ids.shape[0]
+            max_position = position_ids.max(dim=1, keepdim=True)[0]  # [batch_size, 1, hidden_dim]
+            new_positions = max_position + 1
+            model_kwargs["position_ids"] = torch.cat(
+                [position_ids, new_positions],
+                dim=1
+            )
+        return model_kwargs
 class VisionMlp(nn.Module):
         k = apply_rotary_pos_emb_vision(k.unsqueeze(dim=0), rotary_pos_emb).squeeze(
             dim=0
         )
         q = q.transpose(0, 1)
         k = k.transpose(0, 1)
         v = v.transpose(0, 1)
+        lengths = cu_seqlens[1:] - cu_seqlens[:-1]
+        splits = [
+            torch.split(tensor, lengths.tolist(), dim=1) for tensor in (q, k, v)
+        ]
+        attn_output = []
+        for q, k, v in zip(*splits):
+            attn_weights = torch.matmul(q, k.transpose(1, 2)) / math.sqrt(self.head_dim)
+            attn_weights = nn.functional.softmax(
+                attn_weights, dim=-1, dtype=torch.float32
+            ).to(q.dtype)
+            attn_output_splited = torch.matmul(attn_weights, v)
+            attn_output_splited = attn_output_splited.transpose(0, 1)
+            attn_output.append(attn_output_splited)
+        attn_output = torch.cat(attn_output, dim=0)
+        attn_output = attn_output.reshape(seq_length, -1).contiguous()
         attn_output = self.proj(attn_output)
         return attn_output
                 image_type_ids[:, -1:] if image_type_ids is not None else None
             )
+        if self.config.use_flash_attention:
+            attention_mask = None
+        else:
+            attention_mask = kwargs.get("attention_mask", None)
         # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
         if inputs_embeds is not None and past_key_values is None:
         if images is not None and image_features is not None:
             inputs_embeds = self.vision_mapping_forward(
+                token_type_ids[..., :-1],
                 token_type_ids_w_video,
                 input_ids,
                 mm_input_ids,
         outputs = self.model(
             position_ids=position_ids,
+            attention_mask=attention_mask,
             token_type_ids=token_type_ids,
             inputs_embeds=inputs_embeds,
             use_cache=use_cache,

processing_ernie_45t_vl.py → processing_ernie4_5_vl.py RENAMED Viewed

@@ -17,7 +17,6 @@
 import copy
 import io
 import os
-import re
 import math
 import random
 import requests
@@ -28,14 +27,13 @@ import threading
 import uuid
 import decord
 from shutil import copyfile
-from typing import Dict, List, Optional, Tuple, Union
 import numpy as np
 import torch
 from PIL import Image, ImageDraw, ImageFont
 from PIL.ExifTags import TAGS
 from collections import defaultdict
-from typing import Any, Dict, List, Union
 from pathlib import Path
 from tempfile import NamedTemporaryFile as ntf
@@ -52,7 +50,6 @@ from transformers.tokenization_utils_base import (
     PaddingStrategy,
     TextInput,
 )
-from transformers.utils import logging
 from transformers.utils import TensorType, logging
 from transformers.video_utils import VideoInput
 from transformers.processing_utils import ProcessorMixin
@@ -82,132 +79,419 @@ from transformers.image_utils import (
 logger = logging.get_logger(__name__)
-def round_by_factor(number: int, factor: int) -> int:
-    """Returns the closest integer to 'number' that is divisible by 'factor'."""
-    return round(number / factor) * factor
-def ceil_by_factor(number: int, factor: int) -> int:
-    """Returns the smallest integer greater than or equal to 'number' that is divisible by 'factor'."""
-    return math.ceil(number / factor) * factor
-def floor_by_factor(number: int, factor: int) -> int:
-    """Returns the largest integer less than or equal to 'number' that is divisible by 'factor'."""
-    return math.floor(number / factor) * factor
-def smart_resize(
-    height: int,
-    width: int,
-    factor: int = 28,
-    min_pixels: int = 4 * 28 * 28,
-    max_pixels: int = 16384 * 28 * 28,
-):
-    """
-    Rescales the image so that the following conditions are met:
-    1. Both dimensions (height and width) are divisible by 'factor'.
-    2. The total number of pixels is within the range ['min_pixels', 'max_pixels'].
-    3. The aspect ratio of the image is maintained as closely as possible.
-    """
-    MAX_RATIO = 200
-    if max(height, width) / min(height, width) > MAX_RATIO:
-        if height > width:
-            new_width = max(factor, round_by_factor(width, factor))
-            new_height = floor_by_factor(new_width * MAX_RATIO, factor)
-        else:
-            new_height = max(factor, round_by_factor(height, factor))
-            new_width = floor_by_factor(new_height * MAX_RATIO, factor)
-        logger.info(
-            f"absolute aspect ratio must be smaller than {MAX_RATIO}, got {max(height, width) / min(height, width)},\
-              resize to {max(new_height, new_width) / min(new_height, new_width)}"
-        )
-        height = new_height
-        width = new_width
-    h_bar = max(factor, round_by_factor(height, factor))
-    w_bar = max(factor, round_by_factor(width, factor))
-    if h_bar * w_bar > max_pixels:
-        beta = math.sqrt((height * width) / max_pixels)
-        h_bar = floor_by_factor(height / beta, factor)
-        w_bar = floor_by_factor(width / beta, factor)
-    elif h_bar * w_bar < min_pixels:
-        beta = math.sqrt(min_pixels / (height * width))
-        h_bar = ceil_by_factor(height * beta, factor)
-        w_bar = ceil_by_factor(width * beta, factor)
-    if min_pixels > h_bar * w_bar or h_bar * w_bar > max_pixels:
-        raise ValueError(f"encounter invalid h_bar: {h_bar}, w_bar: {w_bar}")
-    return h_bar, w_bar
-def is_scaled_image(image: np.ndarray) -> bool:
-    """
-    Checks to see whether the pixel values have already been rescaled to [0, 1].
-    """
-    if image.dtype == np.uint8:
-        return False
-    # It's possible the image has pixel values in [0, 255] but is of floating type
-    return np.min(image) >= 0 and np.max(image) <= 1
-def make_batched_images(images) -> List[List[ImageInput]]:
-    """
-    Accepts images in list or nested list format, and makes a list of images for preprocessing.
-    Args:
-        images (`Union[List[List[ImageInput]], List[ImageInput], ImageInput]`):
-            The input image.
-    Returns:
-        list: A list of images.
-    """
-    if (
-        isinstance(images, (list, tuple))
-        and isinstance(images[0], (list, tuple))
-        and is_valid_image(images[0][0])
-    ):
-        return [img for img_list in images for img in img_list]
-    elif isinstance(images, (list, tuple)) and is_valid_image(images[0]):
-        return images
-    elif is_valid_image(images):
-        return [images]
-    raise ValueError(f"Could not make batched images from {images}")
-# Copied from transformers.models.llava_next_video.image_processing_llava_next_video.make_batched_videos
-def make_batched_videos(videos) -> List[VideoInput]:
-    """dummy"""
-    if (
-        isinstance(videos, (list, tuple))
-        and isinstance(videos[0], (list, tuple))
-        and is_valid_image(videos[0][0])
-    ):
-        return videos
-    elif isinstance(videos, (list, tuple)) and is_valid_image(videos[0]):
-        if isinstance(videos[0], Image.Image):
-            return [videos]
-        elif len(videos[0].shape) == 4:
-            return [list(video) for video in videos]
-    elif is_valid_image(videos) and len(videos.shape) == 4:
-        return [list(videos)]
     raise ValueError(f"Could not make batched video from {videos}")
-class Ernie_45T_VLImageProcessor(BaseImageProcessor):
     r"""
     Constructs an adaptive image processor that dynamically resizes images based on the original images.
@@ -289,7 +573,7 @@ class Ernie_45T_VLImageProcessor(BaseImageProcessor):
                 isinstance(min_pixels, int) and min_pixels >= 0
             ), "min_pixels must be positive int"
             logger.info(
-                f"{msg} Ernie_45T_VLImageProcessor set min_pixels = {min_pixels}"
             )
             self.min_pixels = min_pixels
             self.size["min_pixels"] = int(min_pixels)
@@ -298,7 +582,7 @@ class Ernie_45T_VLImageProcessor(BaseImageProcessor):
                 isinstance(max_pixels, int) and max_pixels > 0
             ), "max_pixels must be positive int"
             logger.info(
-                f"{msg} Ernie_45T_VLImageProcessor set max_pixels = {max_pixels}"
             )
             self.max_pixels = max_pixels
             self.size["max_pixels"] = int(max_pixels)
@@ -618,298 +902,15 @@ class Ernie_45T_VLImageProcessor(BaseImageProcessor):
         return BatchFeature(data=data, tensor_type=return_tensors)
-class Ernie4_5_VLTokenizer(PreTrainedTokenizer):
-    """
-    Ernie4_5_VLTokenizer
-    """
-    vocab_files_names = {
-        "vocab_file": "tokenizer.model",
-    }
-    # Model input names expected by the tokenizer
-    model_input_names = ["input_ids", "position_ids", "attention_mask", "labels"]
-    # Padding side (where to add padding tokens)
-    padding_side = "right"
-    def __init__(
-        self,
-        vocab_file,
-        bos_token="<s>",
-        cls_token="<cls>",
-        eos_token="</s>",
-        mask_token="<mask:0>",
-        pad_token="<pad>",
-        sep_token="<sep>",
-        unk_token="<unk>",
-        additional_special_tokens=None,
-        **kwargs,
-    ):
-        """
-        Initialize the Ernie4_5_VLTokenizer
-        Args:
-            vocab_file (str): Path to the tokenizer vocabulary model.
-            bos_token (str, optional): The beginning of sequence token. Defaults to `"<s>"`.
-            cls_token (str, optional): The classifier token. Defaults to `"<cls>"`.
-            eos_token (str, optional): The end of sequence token. Defaults to `"</s>"`.
-            mask_token (str, optional): The masking token. Defaults to `"<mask:0>"`.
-            pad_token (str, optional): The padding token. Defaults to `"<pad>"`.
-            sep_token (str, optional): The separation token. Defaults to `"<sep>"`.
-            unk_token (str, optional): The unknown tokens symbol. Defaults to `"<unk>"`.
-            additional_special_tokens (List[str], optional): Additional special tokens to use.
-                Defaults to `["<mask:1>", "<mask:7>"]`.
-            **kwargs (dict): Additional keyword arguments passed along to the superclass.
-        """
-        # Store vocabulary file path
-        self.vocab_file = vocab_file
-        # Initialize SentencePiece processor
-        self.sp_model = spm.SentencePieceProcessor()
-        # Load the vocabulary model
-        self.sp_model.Load(vocab_file)
-        # Set default additional special tokens if none provided
-        if additional_special_tokens is None:
-            additional_special_tokens = ["<mask:1>", "<mask:7>"]
-        super().__init__(
-            bos_token=bos_token,
-            cls_token=cls_token,
-            eos_token=eos_token,
-            mask_token=mask_token,
-            pad_token=pad_token,
-            sep_token=sep_token,
-            unk_token=unk_token,
-            additional_special_tokens=additional_special_tokens,
-            **kwargs,
-        )
-    @property
-    def space_token(self):
-        """Return the space token"""
-        return "<mask:1>"
-    @property
-    def space_token_id(self):
-        """Return the ID of the space token"""
-        return self.sp_model.piece_to_id("<mask:1>")
-    @property
-    def gend_token(self):
-        """Return the gender token"""
-        return "<mask:7>"
-    @property
-    def gend_token_id(self):
-        """Return the ID of the gender token"""
-        return self.sp_model.piece_to_id("<mask:7>")
-    @property
-    def im_start_id(self):
-        """Return the ID of the image start token"""
-        return self.sp_model.piece_to_id("<|im_start|>")
-    @property
-    def im_end_id(self):
-        """Return the ID of the image end token"""
-        return self.sp_model.piece_to_id("<|im_end|>")
-    @property
-    def vocab_size(self):
-        """Return the size of the vocabulary"""
-        return self.sp_model.vocab_size()
-    def get_vocab(self):
-        """Return the vocabulary as a dictionary mapping tokens to IDs"""
-        vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
-        vocab.update(self.added_tokens_encoder)
-        return vocab
-    def _tokenize(self, text):
-        """Tokenize the input text into pieces"""
-        return self.sp_model.encode_as_pieces(text)
-    def _convert_token_to_id(self, token):
-        """Convert a token to its corresponding ID"""
-        return self.sp_model.piece_to_id(token)
-    def _convert_id_to_token(self, id):
-        """Convert an ID to its corresponding token"""
-        return self.sp_model.id_to_piece(id)
-    def convert_tokens_to_string(self, tokens):
-        """Convert a sequence of tokens back to a string"""
-        current_sub_tokens = []
-        out_string = ""
-        for token in tokens:
-            # Handle special tokens differently
-            if token in self.all_special_tokens:
-                out_string += self.sp_model.decode(current_sub_tokens) + token
-                current_sub_tokens = []
-            else:
-                current_sub_tokens.append(token)
-        # Add any remaining sub-tokens
-        out_string += self.sp_model.decode(current_sub_tokens)
-        return out_string
-    def prepare_for_model(self, *args, **kwargs):
-        """Prepare the tokenized inputs for the model"""
-        # Remove add_special_tokens if present (not supported)
-        if "add_special_tokens" in kwargs:
-            kwargs.pop("add_special_tokens")
-        return super().prepare_for_model(*args, **kwargs)
-    def save_vocabulary(
-        self, save_directory, filename_prefix: Optional[str] = None
-    ) -> Tuple[str]:
-        """
-        Save the vocabulary and special tokens file to a directory.
-        Args:
-            save_directory (`str`): The directory to save the vocabulary to
-            filename_prefix (`str`, optional): Prefix to add to the filename
-        Returns:
-            `Tuple(str)`: Paths to the saved files
-        """
-        if not os.path.isdir(save_directory):
-            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
-            return
-        # Construct output vocabulary file path
-        out_vocab_file = os.path.join(
-            save_directory,
-            (filename_prefix + "-" if filename_prefix else "")
-            + self.vocab_files_names["vocab_file"],
-        )
-        # Copy or create vocabulary file
-        if os.path.abspath(self.vocab_file) != os.path.abspath(
-            out_vocab_file
-        ) and os.path.isfile(self.vocab_file):
-            copyfile(self.vocab_file, out_vocab_file)
-        elif not os.path.isfile(self.vocab_file):
-            with open(out_vocab_file, "wb") as fi:
-                content_spiece_model = self.sp_model.serialized_model_proto()
-                fi.write(content_spiece_model)
-        return (out_vocab_file,)
-    def _decode(self, *args, **kwargs):
-        """Decode token_id back to text"""
-        # Remove some parameters that aren't used
-        kwargs.pop("clean_up_tokenization_spaces", None)
-        kwargs.pop("spaces_between_special_tokens", None)
-        # Call parent decode method with specific parameters
-        return super()._decode(
-            *args,
-            **kwargs,
-            clean_up_tokenization_spaces=False,
-            spaces_between_special_tokens=False,
-        )
-    def _pad(
-        self,
-        encoded_inputs: Dict,
-        max_length: Optional[int] = None,
-        padding_strategy=PaddingStrategy.DO_NOT_PAD,
-        pad_to_multiple_of: Optional[int] = None,
-        return_attention_mask: Optional[bool] = None,
-    ) -> dict:
-        """Pad the encoded inputs to the specified length"""
-        if return_attention_mask is None:
-            return_attention_mask = "attention_mask" in self.model_input_names
-        if return_attention_mask:
-            required_input = encoded_inputs[self.model_input_names[0]]
-            if padding_strategy == PaddingStrategy.LONGEST:
-                max_length = len(required_input)
-            # Adjust max_length if needed for multiple of padding
-            if (
-                max_length is not None
-                and pad_to_multiple_of is not None
-                and (max_length % pad_to_multiple_of != 0)
-            ):
-                max_length = (
-                    (max_length // pad_to_multiple_of) + 1
-                ) * pad_to_multiple_of
-            # Check if padding is needed
-            needs_to_be_padded = (
-                padding_strategy != PaddingStrategy.DO_NOT_PAD
-                and len(required_input) != max_length
-            )
-            # Handle attention mask if present
-            if (
-                "attention_mask" in encoded_inputs
-                and encoded_inputs["attention_mask"] is not None
-            ):
-                attention_mask = encoded_inputs.pop("attention_mask")
-                if isinstance(attention_mask, torch.Tensor):
-                    attention_mask = attention_mask.numpy()
-                elif isinstance(attention_mask, list):
-                    attention_mask = np.array(attention_mask)
-                elif not isinstance(attention_mask, np.ndarray):
-                    raise ValueError(
-                        f"Unexpected type {type(attention_mask)} of attention_mask, "
-                    )
-            else:
-                # Create default attention mask if none provided
-                attention_mask = np.tril(
-                    np.ones((len(required_input), len(required_input)), dtype=np.int64)
-                )
-                attention_mask = np.expand_dims(attention_mask, axis=0)
-            # Perform padding if needed
-            if needs_to_be_padded:
-                difference = max_length - len(required_input)
-                if self.padding_side == "right":
-                    if attention_mask.ndim == 1:
-                        pad_width = [(0, difference)]
-                    else:
-                        pad_width = [(0, 0), (0, difference), (0, difference)]
-                elif self.padding_side == "left":
-                    if attention_mask.ndim == 1:
-                        pad_width = [(difference, 0)]
-                    else:
-                        pad_width = [(0, 0), (difference, 0), (difference, 0)]
-                else:
-                    raise ValueError(
-                        "Invalid padding strategy:" + str(self.padding_side)
-                    )
-                attention_mask = np.pad(
-                    attention_mask,
-                    pad_width=pad_width,
-                    mode="constant",
-                    constant_values=0,
-                )
-        # Call parent padding method
-        encoded_inputs = super()._pad(
-            encoded_inputs,
-            max_length,
-            padding_strategy=padding_strategy,
-            pad_to_multiple_of=pad_to_multiple_of,
-            return_attention_mask=False,
-        )
-        # Add attention mask back if needed
-        if return_attention_mask:
-            encoded_inputs["attention_mask"] = attention_mask.tolist()
-        return encoded_inputs
 RAW_VIDEO_DIR = "./download_tmp/raw_video/"
 RAW_IMAGE_DIR = "./download_tmp/raw_images/"
 EXTRACTED_FRAME_DIR = "./download_tmp/extracted_frames/"
 TMP_DIR = "./download_tmp/upload_tmp/"
 FONT_PATH = os.path.join(Path(__file__).parent.absolute(), "Roboto-Regular.ttf")
 def is_gif(data: bytes) -> bool:
@@ -1380,7 +1381,7 @@ def render_frame_timestamp(frame, timestamp, font_rate=0.1):
 IDS_TYPE_FLAG = {"text": 0, "image": 1, "video": 2, "audio": 3}
-class Ernie_45T_VLProcessor(ProcessorMixin):
     """
     Processes multimodal chat messages into model-ready inputs,
     handling text, images, and videos with 3D positional embeddings.
@@ -1527,11 +1528,11 @@ class Ernie_45T_VLProcessor(ProcessorMixin):
     def __call__(
         self,
-        text: List[str],
-        images: List[Image.Image],
-        videos: List[List[Image.Image]],
         **kwargs,
-    ) -> Dict[str, Union[np.ndarray, List[np.ndarray], None]]:
         """
         Convert chat messages into model inputs.
         Returns a dict with input_ids, token_type_ids, position_ids, images, grid_thw, image_type_ids, labels.
@@ -1547,6 +1548,9 @@ class Ernie_45T_VLProcessor(ProcessorMixin):
             "pic_cnt": 0,
             "video_cnt": 0,
         }
         texts = text[0]
         new_video_seg = True
@@ -1811,4 +1815,4 @@ class Ernie_45T_VLProcessor(ProcessorMixin):
         return list(tokenizer_input_names) + list(image_processor_input_names)
-__all__ = ["Ernie_45T_VLImageProcessor", "Ernie4_5_VLTokenizer", "Ernie_45T_VLProcessor"]

 import copy
 import io
 import os
 import math
 import random
 import requests
 import uuid
 import decord
 from shutil import copyfile
+from typing import Any, Dict, List, Optional, Tuple, Union
 import numpy as np
 import torch
 from PIL import Image, ImageDraw, ImageFont
 from PIL.ExifTags import TAGS
 from collections import defaultdict
 from pathlib import Path
 from tempfile import NamedTemporaryFile as ntf
     PaddingStrategy,
     TextInput,
 )
 from transformers.utils import TensorType, logging
 from transformers.video_utils import VideoInput
 from transformers.processing_utils import ProcessorMixin
 logger = logging.get_logger(__name__)
+class Ernie4_5_VLTokenizer(PreTrainedTokenizer):
+    """
+    SentencePiece-backed tokenizer for Ernie4_5_VL.
+    Pieces and ids are resolved through the `SentencePieceProcessor` loaded from
+    `vocab_file`. Padding is customised in `_pad` so that 1-D and 3-D attention
+    masks are padded by this class rather than by the parent implementation.
+    """
+    vocab_files_names = {
+        "vocab_file": "tokenizer.model",
+    }
+    # Model input names expected by the tokenizer
+    model_input_names = ["input_ids", "position_ids", "attention_mask", "labels"]
+    # Padding side (where to add padding tokens)
+    padding_side = "right"
+    def __init__(
+        self,
+        vocab_file,
+        bos_token="<s>",
+        cls_token="<cls>",
+        eos_token="</s>",
+        mask_token="<mask:0>",
+        pad_token="<pad>",
+        sep_token="<sep>",
+        unk_token="<unk>",
+        additional_special_tokens=None,
+        **kwargs,
+    ):
+        """
+        Initialize the Ernie4_5_VLTokenizer
+        Args:
+            vocab_file (str): Path to the tokenizer vocabulary model.
+            bos_token (str, optional): The beginning of sequence token. Defaults to `"<s>"`.
+            cls_token (str, optional): The classifier token. Defaults to `"<cls>"`.
+            eos_token (str, optional): The end of sequence token. Defaults to `"</s>"`.
+            mask_token (str, optional): The masking token. Defaults to `"<mask:0>"`.
+            pad_token (str, optional): The padding token. Defaults to `"<pad>"`.
+            sep_token (str, optional): The separation token. Defaults to `"<sep>"`.
+            unk_token (str, optional): The unknown tokens symbol. Defaults to `"<unk>"`.
+            additional_special_tokens (List[str], optional): Additional special tokens to use.
+                Defaults to `["<mask:1>", "<mask:7>"]`.
+            **kwargs (dict): Additional keyword arguments passed along to the superclass.
+        """
+        # Store vocabulary file path (re-used by `save_vocabulary`)
+        self.vocab_file = vocab_file
+        # The SentencePiece model is loaded before the parent constructor runs;
+        # presumably the parent already performs vocabulary look-ups - keep this order.
+        self.sp_model = spm.SentencePieceProcessor()
+        self.sp_model.Load(vocab_file)
+        # Set default additional special tokens if none provided
+        if additional_special_tokens is None:
+            additional_special_tokens = ["<mask:1>", "<mask:7>"]
+        super().__init__(
+            bos_token=bos_token,
+            cls_token=cls_token,
+            eos_token=eos_token,
+            mask_token=mask_token,
+            pad_token=pad_token,
+            sep_token=sep_token,
+            unk_token=unk_token,
+            additional_special_tokens=additional_special_tokens,
+            **kwargs,
+        )
+    @property
+    def space_token(self):
+        """Return the token used as the space token (`<mask:1>`)"""
+        return "<mask:1>"
+    @property
+    def space_token_id(self):
+        """Return the SentencePiece ID of the space token (`<mask:1>`)"""
+        return self.sp_model.piece_to_id("<mask:1>")
+    @property
+    def gend_token(self):
+        """Return the `<mask:7>` token exposed as `gend_token`"""
+        return "<mask:7>"
+    @property
+    def gend_token_id(self):
+        """Return the SentencePiece ID of `<mask:7>`"""
+        return self.sp_model.piece_to_id("<mask:7>")
+    @property
+    def im_start_id(self):
+        """Return the SentencePiece ID of the `<|im_start|>` token"""
+        return self.sp_model.piece_to_id("<|im_start|>")
+    @property
+    def im_end_id(self):
+        """Return the SentencePiece ID of the `<|im_end|>` token"""
+        return self.sp_model.piece_to_id("<|im_end|>")
+    @property
+    def vocab_size(self):
+        """Return the size of the SentencePiece vocabulary (added tokens excluded)"""
+        return self.sp_model.vocab_size()
+    def get_vocab(self):
+        """Return the vocabulary as a dictionary mapping tokens to IDs, added tokens included"""
+        vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
+        vocab.update(self.added_tokens_encoder)
+        return vocab
+    def _tokenize(self, text):
+        """Tokenize the input text into SentencePiece pieces"""
+        return self.sp_model.encode_as_pieces(text)
+    def _convert_token_to_id(self, token):
+        """Convert a token to its corresponding ID"""
+        return self.sp_model.piece_to_id(token)
+    def _convert_id_to_token(self, id):
+        """Convert an ID to its corresponding token"""
+        return self.sp_model.id_to_piece(id)
+    def convert_tokens_to_string(self, tokens):
+        """
+        Convert a sequence of tokens back to a string.
+        Runs of ordinary pieces are decoded by SentencePiece; special tokens are
+        copied into the output verbatim.
+        """
+        # Resolve the special tokens once instead of once per token
+        special_tokens = set(self.all_special_tokens)
+        parts = []
+        current_sub_tokens = []
+        for token in tokens:
+            if token in special_tokens:
+                # Flush the pending pieces, then emit the special token as-is
+                parts.append(self.sp_model.decode(current_sub_tokens))
+                parts.append(token)
+                current_sub_tokens = []
+            else:
+                current_sub_tokens.append(token)
+        # Flush any remaining sub-tokens
+        parts.append(self.sp_model.decode(current_sub_tokens))
+        return "".join(parts)
+    def prepare_for_model(self, *args, **kwargs):
+        """Prepare the tokenized inputs for the model, ignoring `add_special_tokens`"""
+        # `add_special_tokens` is not supported, so drop it before delegating
+        kwargs.pop("add_special_tokens", None)
+        return super().prepare_for_model(*args, **kwargs)
+    def save_vocabulary(
+        self, save_directory, filename_prefix: Optional[str] = None
+    ) -> Tuple[str]:
+        """
+        Save the SentencePiece vocabulary file to a directory.
+        Args:
+            save_directory (`str`): The directory to save the vocabulary to
+            filename_prefix (`str`, optional): Prefix to add to the filename
+        Returns:
+            `Tuple(str)`: Paths to the saved files, or `None` (after logging an
+            error) when `save_directory` is not an existing directory.
+        """
+        if not os.path.isdir(save_directory):
+            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
+            return
+        # Construct output vocabulary file path
+        out_vocab_file = os.path.join(
+            save_directory,
+            (filename_prefix + "-" if filename_prefix else "")
+            + self.vocab_files_names["vocab_file"],
+        )
+        # Copy the original file when it still exists (and is not already the
+        # target); otherwise re-serialise the in-memory SentencePiece model.
+        if os.path.abspath(self.vocab_file) != os.path.abspath(
+            out_vocab_file
+        ) and os.path.isfile(self.vocab_file):
+            copyfile(self.vocab_file, out_vocab_file)
+        elif not os.path.isfile(self.vocab_file):
+            with open(out_vocab_file, "wb") as fi:
+                content_spiece_model = self.sp_model.serialized_model_proto()
+                fi.write(content_spiece_model)
+        return (out_vocab_file,)
+    def _decode(self, *args, **kwargs):
+        """Decode token ids back to text with both space clean-up options forced off"""
+        # Discard caller-supplied values; they are always overridden below
+        kwargs.pop("clean_up_tokenization_spaces", None)
+        kwargs.pop("spaces_between_special_tokens", None)
+        return super()._decode(
+            *args,
+            **kwargs,
+            clean_up_tokenization_spaces=False,
+            spaces_between_special_tokens=False,
+        )
+    def _pad(
+        self,
+        encoded_inputs: Dict,
+        max_length: Optional[int] = None,
+        padding_strategy=PaddingStrategy.DO_NOT_PAD,
+        pad_to_multiple_of: Optional[int] = None,
+        return_attention_mask: Optional[bool] = None,
+        **kwargs
+    ) -> dict:
+        """
+        Pad the encoded inputs to the specified length.
+        The attention mask is padded here (1-D masks along their only axis, 3-D
+        masks along their last two axes); every other field is padded by the
+        parent `_pad`.
+        Args:
+            encoded_inputs (Dict): Un-batched encoded inputs; the first name in
+                `model_input_names` is used to measure the sequence length.
+            max_length (int, optional): Target length.
+            padding_strategy: `PaddingStrategy` controlling whether to pad.
+            pad_to_multiple_of (int, optional): Round `max_length` up to a multiple of this value.
+            return_attention_mask (bool, optional): Whether to return the mask. Defaults to
+                whether `"attention_mask"` is listed in `model_input_names`.
+            **kwargs: Extra arguments sent by the caller. `padding_side`, when given and
+                not `None`, overrides the `padding_side` attribute for this call; any
+                other extra argument is ignored.
+        Returns:
+            dict: The padded inputs, with `attention_mask` as a nested list when requested.
+        Raises:
+            ValueError: If an existing attention mask has an unsupported type or the
+                padding side is neither `"right"` nor `"left"`.
+        """
+        # Honour a per-call padding side so the mask and the parent-padded
+        # fields end up padded on the same, requested side.
+        requested_side = kwargs.get("padding_side")
+        padding_side = requested_side or self.padding_side
+        # Only forward the argument when the caller actually sent it, so parents
+        # whose `_pad` does not know `padding_side` keep working.
+        parent_kwargs = {"padding_side": padding_side} if requested_side else {}
+        if return_attention_mask is None:
+            return_attention_mask = "attention_mask" in self.model_input_names
+        if return_attention_mask:
+            required_input = encoded_inputs[self.model_input_names[0]]
+            if padding_strategy == PaddingStrategy.LONGEST:
+                max_length = len(required_input)
+            # Adjust max_length if needed for multiple of padding
+            if (
+                max_length is not None
+                and pad_to_multiple_of is not None
+                and (max_length % pad_to_multiple_of != 0)
+            ):
+                max_length = (
+                    (max_length // pad_to_multiple_of) + 1
+                ) * pad_to_multiple_of
+            # Check if padding is needed
+            needs_to_be_padded = (
+                padding_strategy != PaddingStrategy.DO_NOT_PAD
+                and len(required_input) != max_length
+            )
+            # Normalise an existing attention mask to a numpy array
+            if (
+                "attention_mask" in encoded_inputs
+                and encoded_inputs["attention_mask"] is not None
+            ):
+                attention_mask = encoded_inputs.pop("attention_mask")
+                if isinstance(attention_mask, torch.Tensor):
+                    attention_mask = attention_mask.numpy()
+                elif isinstance(attention_mask, list):
+                    attention_mask = np.array(attention_mask)
+                elif not isinstance(attention_mask, np.ndarray):
+                    raise ValueError(
+                        f"Unexpected type {type(attention_mask)} of attention_mask, "
+                    )
+            else:
+                # No mask provided: build a lower-triangular mask of shape (1, L, L)
+                attention_mask = np.tril(
+                    np.ones((len(required_input), len(required_input)), dtype=np.int64)
+                )
+                attention_mask = np.expand_dims(attention_mask, axis=0)
+            # Perform padding if needed
+            if needs_to_be_padded:
+                difference = max_length - len(required_input)
+                if padding_side == "right":
+                    if attention_mask.ndim == 1:
+                        pad_width = [(0, difference)]
+                    else:
+                        pad_width = [(0, 0), (0, difference), (0, difference)]
+                elif padding_side == "left":
+                    if attention_mask.ndim == 1:
+                        pad_width = [(difference, 0)]
+                    else:
+                        pad_width = [(0, 0), (difference, 0), (difference, 0)]
+                else:
+                    raise ValueError(
+                        "Invalid padding side: " + str(padding_side)
+                    )
+                attention_mask = np.pad(
+                    attention_mask,
+                    pad_width=pad_width,
+                    mode="constant",
+                    constant_values=0,
+                )
+        # The parent pads every other field; the mask is handled above, so the
+        # parent is told not to produce one.
+        encoded_inputs = super()._pad(
+            encoded_inputs,
+            max_length,
+            padding_strategy=padding_strategy,
+            pad_to_multiple_of=pad_to_multiple_of,
+            return_attention_mask=False,
+            **parent_kwargs,
+        )
+        # Add attention mask back if needed
+        if return_attention_mask:
+            encoded_inputs["attention_mask"] = attention_mask.tolist()
+        return encoded_inputs
+def round_by_factor(number: int, factor: int) -> int:
+    """Return the multiple of 'factor' nearest to 'number' (ties follow the built-in `round`)."""
+    multiples = round(number / factor)
+    return multiples * factor
+def ceil_by_factor(number: int, factor: int) -> int:
+    """Return the smallest multiple of 'factor' that is greater than or equal to 'number'."""
+    multiples = math.ceil(number / factor)
+    return multiples * factor
+def floor_by_factor(number: int, factor: int) -> int:
+    """Return the largest multiple of 'factor' that is less than or equal to 'number'."""
+    multiples = math.floor(number / factor)
+    return multiples * factor
+def smart_resize(
+    height: int,
+    width: int,
+    factor: int = 28,
+    min_pixels: int = 4 * 28 * 28,
+    max_pixels: int = 16384 * 28 * 28,
+):
+    """
+    Compute a target (height, width) for an image such that:
+    1. Both dimensions are multiples of 'factor'.
+    2. The pixel count lies within ['min_pixels', 'max_pixels'].
+    3. The aspect ratio stays as close to the input's as possible.
+    Inputs whose long/short side ratio exceeds 200 are first clamped to that
+    ratio (this is logged). A ValueError is raised when the final size still
+    violates the pixel bounds.
+    Returns:
+        tuple: (h_bar, w_bar), the resized height and width.
+    """
+    MAX_RATIO = 200
+    if max(height, width) / min(height, width) > MAX_RATIO:
+        # Snap the short side to the factor grid, then derive the long side from it
+        short_side = max(factor, round_by_factor(min(height, width), factor))
+        long_side = floor_by_factor(short_side * MAX_RATIO, factor)
+        if height > width:
+            new_height, new_width = long_side, short_side
+        else:
+            new_height, new_width = short_side, long_side
+        logger.info(
+            f"absolute aspect ratio must be smaller than {MAX_RATIO}, got {max(height, width) / min(height, width)},\
+              resize to {max(new_height, new_width) / min(new_height, new_width)}"
+        )
+        height, width = new_height, new_width
+    # Start from each side snapped to the factor grid (never below one factor)
+    h_bar = max(factor, round_by_factor(height, factor))
+    w_bar = max(factor, round_by_factor(width, factor))
+    area = h_bar * w_bar
+    if area > max_pixels:
+        # Too large: scale both sides down uniformly and round down
+        shrink = math.sqrt((height * width) / max_pixels)
+        h_bar = floor_by_factor(height / shrink, factor)
+        w_bar = floor_by_factor(width / shrink, factor)
+    elif area < min_pixels:
+        # Too small: scale both sides up uniformly and round up
+        grow = math.sqrt(min_pixels / (height * width))
+        h_bar = ceil_by_factor(height * grow, factor)
+        w_bar = ceil_by_factor(width * grow, factor)
+    area = h_bar * w_bar
+    if area < min_pixels or area > max_pixels:
+        raise ValueError(f"encounter invalid h_bar: {h_bar}, w_bar: {w_bar}")
+    return h_bar, w_bar
+def is_scaled_image(image: np.ndarray) -> bool:
+    """
+    Tell whether the pixel values of `image` have already been rescaled to [0, 1].
+    `uint8` arrays are reported as unscaled without looking at their values; for
+    any other dtype the minimum and maximum values decide.
+    """
+    if image.dtype != np.uint8:
+        # A floating-point image may still hold values in [0, 255], so check the range
+        return np.min(image) >= 0 and np.max(image) <= 1
+    return False
+def make_batched_images(images) -> List[ImageInput]:
+    """
+    Accepts a single image, a list of images, or a nested list of images, and
+    returns one flat list of images for preprocessing.
+    Args:
+        images (`Union[List[List[ImageInput]], List[ImageInput], ImageInput]`):
+            The input image(s).
+    Returns:
+        list: A flat list of images. A flat list/tuple input is returned as-is (not copied).
+    Raises:
+        ValueError: If `images` is empty or is not one of the supported layouts.
+    """
+    if isinstance(images, (list, tuple)):
+        # Guard the `[0]` look-ups so that empty input is reported through the
+        # ValueError below instead of leaking an IndexError.
+        first = images[0] if images else None
+        if isinstance(first, (list, tuple)):
+            if first and is_valid_image(first[0]):
+                # Nested layout: concatenate the inner lists in order
+                return [img for img_list in images for img in img_list]
+        elif first is not None and is_valid_image(first):
+            return images
+    elif is_valid_image(images):
+        return [images]
+    raise ValueError(f"Could not make batched images from {images}")
+# Copied from transformers.models.llava_next_video.image_processing_llava_next_video.make_batched_videos
+def make_batched_videos(videos) -> List[VideoInput]:
+    """
+    Normalise the accepted video layouts into a list of videos.
+    Handled layouts:
+    - a list/tuple of lists/tuples of images: returned unchanged;
+    - a list/tuple of PIL images: wrapped as a single video;
+    - a list/tuple of 4-D arrays: each array becomes a list of its frames;
+    - a single 4-D array: wrapped as one video made of its frames.
+    Raises:
+        ValueError: If `videos` matches none of the layouts above.
+    """
+    is_sequence = isinstance(videos, (list, tuple))
+    if (
+        is_sequence
+        and isinstance(videos[0], (list, tuple))
+        and is_valid_image(videos[0][0])
+    ):
+        return videos
+    if is_sequence and is_valid_image(videos[0]):
+        first = videos[0]
+        if isinstance(first, Image.Image):
+            return [videos]
+        if len(first.shape) == 4:
+            return [list(video) for video in videos]
+        # Any other rank falls through to the error below
+    elif is_valid_image(videos) and len(videos.shape) == 4:
+        return [list(videos)]
     raise ValueError(f"Could not make batched video from {videos}")
+class Ernie4_5_VLImageProcessor(BaseImageProcessor):
     r"""
     Constructs a adaptive image processor that dynamically resizes images based on the original images.
                 isinstance(min_pixels, int) and min_pixels >= 0
             ), "min_pixels must be positive int"
             logger.info(
+                f"{msg} Ernie4_5_VLImageProcessor set min_pixels = {min_pixels}"
             )
             self.min_pixels = min_pixels
             self.size["min_pixels"] = int(min_pixels)
                 isinstance(max_pixels, int) and max_pixels > 0
             ), "max_pixels must be positive int"
             logger.info(
+                f"{msg} Ernie4_5_VLImageProcessor set max_pixels = {max_pixels}"
             )
             self.max_pixels = max_pixels
             self.size["max_pixels"] = int(max_pixels)
         return BatchFeature(data=data, tensor_type=return_tensors)
 RAW_VIDEO_DIR = "./download_tmp/raw_video/"
 RAW_IMAGE_DIR = "./download_tmp/raw_images/"
 EXTRACTED_FRAME_DIR = "./download_tmp/extracted_frames/"
 TMP_DIR = "./download_tmp/upload_tmp/"
 FONT_PATH = os.path.join(Path(__file__).parent.absolute(), "Roboto-Regular.ttf")
+# Fetch the font next to this module the first time it is imported.
+if not os.path.exists(FONT_PATH):
+    # A timeout keeps an unreachable host from blocking the import indefinitely
+    ttf = requests.get(
+        "https://paddlenlp.bj.bcebos.com/vision-language-models/materials/Roboto-Regular.ttf",
+        timeout=30,
+    )
+    if ttf.ok:
+        # Close the handle deterministically so the font is fully flushed to disk
+        with open(FONT_PATH, "wb") as font_file:
+            font_file.write(ttf.content)
+    else:
+        # Never store an HTTP error body as if it were a font file
+        logger.warning(
+            f"Could not download Roboto-Regular.ttf (HTTP {ttf.status_code}); {FONT_PATH} was not created"
+        )
 def is_gif(data: bytes) -> bool:
 IDS_TYPE_FLAG = {"text": 0, "image": 1, "video": 2, "audio": 3}
+class Ernie4_5_VLProcessor(ProcessorMixin):
     """
     Processes multimodal chat messages into model-ready inputs,
     handling text, images, and videos with 3D positional embeddings.
     def __call__(
         self,
+        text: Union[str, List[str]],
+        images: List[Image.Image] = [],
+        videos: List[List[Image.Image]] = [],
         **kwargs,
+    ) -> BatchFeature:
         """
         Convert chat messages into model inputs.
         Returns a dict with input_ids, token_type_ids, position_ids, images, grid_thw, image_type_ids, labels.
             "pic_cnt": 0,
             "video_cnt": 0,
         }
+        if not isinstance(text, list):
+            text = [text]
         texts = text[0]
         new_video_seg = True
         return list(tokenizer_input_names) + list(image_processor_input_names)
+__all__ = ["Ernie4_5_VLTokenizer", "Ernie4_5_VLImageProcessor", "Ernie4_5_VLProcessor"]

tokenizer_config.json CHANGED Viewed

@@ -14,9 +14,9 @@
     "tokenizer_class": "Ernie4_5_VLTokenizer",
     "auto_map": {
         "AutoTokenizer": [
-            "processing_ernie_45t_vl.Ernie4_5_VLTokenizer",
             null
         ]
     },
-    "chat_template": "\n{%- set image_count = namespace(value=0) -%}\n{%- set video_count = namespace(value=0) -%}\n{{- '<|begin_of_sentence|>' }}\n{%- for message in messages -%}\n    {%- if message.role in ['system', 'user'] -%}\n        {%- if message.role == 'user' -%}\n            {{- 'User: ' -}}\n        {%- endif -%}\n        {%- if message.content is string -%}\n            {{- message.content -}}\n        {%- else -%}\n            {%- for content_item in message.content -%}\n                {%- if content_item.type == 'text' -%}\n                    {{- content_item.text -}}\n                {%- elif content_item.type == 'image_url' -%}\n                    {%- set image_count.value = image_count.value + 1 -%}\n                    Picture {{ image_count.value }}:<|IMAGE_START|><|image@placeholder|><|IMAGE_END|>\n                {%- elif content_item.type == 'video_url' -%}\n                    {%- set video_count.value = video_count.value + 1 -%}\n                    Video {{ video_count.value }}:<|VIDEO_START|><|video@placeholder|><|VIDEO_END|>\n                {%- endif -%}\n            {%- endfor -%}\n        {%- endif -%}\n        {%- if message.role == 'system' -%}\n            {{- '\n' -}}\n        {%- endif -%}\n    {%- elif message.role == 'assistant' -%}\n        {%- macro extract_text_content(content_field) -%}\n            {%- if content_field is string -%}\n                {{- content_field -}}\n            {%- elif content_field is iterable and content_field is not string -%}\n                {%- set ns = namespace(text_parts=[]) -%}\n                {%- set text_parts = [] -%}\n                {%- for item in content_field -%}\n                    {%- if item.type == 'text' -%}\n                        {%- set ns.text_parts = ns.text_parts + [item.text] -%}\n                    {%- endif -%}\n                {%- endfor -%}\n                {{- ns.text_parts | join('') -}}\n            {%- else -%}\n                {{- '' -}}\n            
{%- endif -%}\n        {%- endmacro -%}\n        {%- set reasoning_content = extract_text_content(message.reasoning_content) -%}\n        {%- set content = extract_text_content(message.content) -%}\n        {%- if '</think>' in content %}\n            {%- set reasoning_content = content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}\n            {%- set content = content.split('</think>')[-1].lstrip('\n') %}\n        {%- endif %}\n        {%- if reasoning_content %}\n            {{- '\n' + 'Assistant: ' + '<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}\n        {%- else %}\n            {{- '\n' + 'Assistant: ' + content }}\n        {%- endif %}\n        {{- '<|end_of_sentence|>' }}\n    {%- endif -%}\n{%- endfor -%}\n{%- if add_generation_prompt is not defined or add_generation_prompt is true %}\n    {{- '\nAssistant: ' -}}\n    {%- if enable_thinking is defined and enable_thinking is false %}\n        {{- '<think>\n\n</think>\n\n' }}\n    {%- endif %}\n    {%- if enable_thinking is not defined or enable_thinking is true %}\n        {{- '<think>' }}\n    {%- endif %}\n{%- endif %}\n"
-}

     "tokenizer_class": "Ernie4_5_VLTokenizer",
     "auto_map": {
         "AutoTokenizer": [
+            "processing_ernie4_5_vl.Ernie4_5_VLTokenizer",
             null
         ]
     },
+    "chat_template": "\n{%- set image_count = namespace(value=0) -%}\n{%- set video_count = namespace(value=0) -%}\n{{- '<|begin_of_sentence|>' }}\n{%- for message in messages -%}\n    {%- if message.role in ['system', 'user'] -%}\n        {%- if message.role == 'user' -%}\n            {{- 'User: ' -}}\n        {%- endif -%}\n        {%- if message.content is string -%}\n            {{- message.content -}}\n        {%- else -%}\n            {%- for content_item in message.content -%}\n                {%- if content_item.type == 'text' -%}\n                    {{- content_item.text -}}\n                {%- elif content_item.type in ['image_url', 'image'] -%}\n                    {%- set image_count.value = image_count.value + 1 -%}\n                    Picture {{ image_count.value }}:<|IMAGE_START|><|image@placeholder|><|IMAGE_END|>\n                {%- elif content_item.type in ['video_url', 'video'] -%}\n                    {%- set video_count.value = video_count.value + 1 -%}\n                    Video {{ video_count.value }}:<|VIDEO_START|><|video@placeholder|><|VIDEO_END|>\n                {%- endif -%}\n            {%- endfor -%}\n        {%- endif -%}\n        {%- if message.role == 'system' -%}\n            {{- '\n' -}}\n        {%- endif -%}\n    {%- elif message.role == 'assistant' -%}\n        {%- macro extract_text_content(content_field) -%}\n            {%- if content_field is string -%}\n                {{- content_field -}}\n            {%- elif content_field is iterable and content_field is not string -%}\n                {%- set ns = namespace(text_parts=[]) -%}\n                {%- set text_parts = [] -%}\n                {%- for item in content_field -%}\n                    {%- if item.type == 'text' -%}\n                        {%- set ns.text_parts = ns.text_parts + [item.text] -%}\n                    {%- endif -%}\n                {%- endfor -%}\n                {{- ns.text_parts | join('') -}}\n            {%- else -%}\n                {{- 
'' -}}\n            {%- endif -%}\n        {%- endmacro -%}\n        {%- set reasoning_content = extract_text_content(message.reasoning_content) -%}\n        {%- set content = extract_text_content(message.content) -%}\n        {%- if '</think>' in content %}\n            {%- set reasoning_content = content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}\n            {%- set content = content.split('</think>')[-1].lstrip('\n') %}\n        {%- endif %}\n        {%- if reasoning_content %}\n            {{- '\n' + 'Assistant: ' + '<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}\n        {%- else %}\n            {{- '\n' + 'Assistant: ' + content }}\n        {%- endif %}\n        {{- '<|end_of_sentence|>' }}\n    {%- endif -%}\n{%- endfor -%}\n{%- if add_generation_prompt is not defined or add_generation_prompt is true %}\n    {{- '\nAssistant: ' -}}\n    {%- if enable_thinking is defined and enable_thinking is false %}\n        {{- '<think>\n\n</think>\n\n' }}\n    {%- endif %}\n    {%- if enable_thinking is not defined or enable_thinking is true %}\n        {{- '<think>' }}\n    {%- endif %}\n{%- endif %}\n"
+}