Upload 15 files

Browse files

Files changed (15) hide show

CLIP.png +0 -0
EVA_CLIP_8B_psz14_plus_s0.6B.pt +3 -0
config.json +182 -0
configuration_evaclip.py +411 -0
convert_evaclip_8b_448_pytorch_to_hf.py +160 -0
merges.txt +0 -0
modeling_evaclip.py +1059 -0
pytorch_model-00001-of-00004.bin +3 -0
pytorch_model-00002-of-00004.bin +3 -0
pytorch_model-00003-of-00004.bin +3 -0
pytorch_model-00004-of-00004.bin +3 -0
pytorch_model.bin.index.json +882 -0
special_tokens_map.json +24 -0
tokenizer_config.json +31 -0
vocab.json +0 -0

CLIP.png ADDED Viewed

EVA_CLIP_8B_psz14_plus_s0.6B.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c664cd7cc88722d6a93b061a4b306276b638db1693a08c569cc570b0d8c1022f
+size 32895889318

config.json ADDED Viewed

	@@ -0,0 +1,182 @@

+{
+  "_commit_hash": null,
+  "_name_or_path": "EVA-CLIP-8B",
+  "architectures": [
+    "EvaCLIPModel"
+  ],
+  "auto_map": {
+    "AutoConfig": "configuration_evaclip.EvaCLIPConfig",
+    "AutoModel": "modeling_evaclip.EvaCLIPModel"
+  },
+  "initializer_factor": 1.0,
+  "logit_scale_init_value": 2.659260036932778,
+  "model_type": "clip",
+  "projection_dim": 1280,
+  "text_config": {
+    "_name_or_path": "",
+    "add_cross_attention": false,
+    "architectures": null,
+    "attention_dropout": 0.0,
+    "bad_words_ids": null,
+    "begin_suppress_tokens": null,
+    "bos_token_id": 0,
+    "chunk_size_feed_forward": 0,
+    "cross_attention_hidden_size": null,
+    "decoder_start_token_id": null,
+    "diversity_penalty": 0.0,
+    "do_sample": false,
+    "dropout": 0.0,
+    "early_stopping": false,
+    "encoder_no_repeat_ngram_size": 0,
+    "eos_token_id": 2,
+    "exponential_decay_length_penalty": null,
+    "finetuning_task": null,
+    "forced_bos_token_id": null,
+    "forced_eos_token_id": null,
+    "hidden_act": "gelu",
+    "hidden_size": 1280,
+    "id2label": {
+      "0": "LABEL_0",
+      "1": "LABEL_1"
+    },
+    "initializer_factor": 1.0,
+    "initializer_range": 0.02,
+    "intermediate_size": 5120,
+    "is_decoder": false,
+    "is_encoder_decoder": false,
+    "k_bias": true,
+    "label2id": {
+      "LABEL_0": 0,
+      "LABEL_1": 1
+    },
+    "layer_norm_eps": 1e-05,
+    "length_penalty": 1.0,
+    "max_length": 20,
+    "max_position_embeddings": 77,
+    "min_length": 0,
+    "model_type": "clip_text_model",
+    "no_repeat_ngram_size": 0,
+    "num_attention_heads": 20,
+    "num_beam_groups": 1,
+    "num_beams": 1,
+    "num_hidden_layers": 32,
+    "num_return_sequences": 1,
+    "output_attentions": false,
+    "output_hidden_states": false,
+    "output_scores": false,
+    "pad_token_id": 1,
+    "post_layernorm": false,
+    "prefix": null,
+    "problem_type": null,
+    "projection_dim": 512,
+    "pruned_heads": {},
+    "q_bias": true,
+    "remove_invalid_values": false,
+    "repetition_penalty": 1.0,
+    "return_dict": true,
+    "return_dict_in_generate": false,
+    "sep_token_id": null,
+    "suppress_tokens": null,
+    "task_specific_params": null,
+    "temperature": 1.0,
+    "tf_legacy_loss": false,
+    "tie_encoder_decoder": false,
+    "tie_word_embeddings": true,
+    "tokenizer_class": null,
+    "top_k": 50,
+    "top_p": 1.0,
+    "torch_dtype": null,
+    "torchscript": false,
+    "transformers_version": "4.28.1",
+    "typical_p": 1.0,
+    "use_bfloat16": true,
+    "use_rms_norm": false,
+    "v_bias": true,
+    "vocab_size": 49408
+  },
+  "torch_dtype": "float32",
+  "transformers_version": "4.28.1",
+  "vision_config": {
+    "_name_or_path": "",
+    "add_cross_attention": false,
+    "architectures": null,
+    "attention_dropout": 0.0,
+    "bad_words_ids": null,
+    "begin_suppress_tokens": null,
+    "bos_token_id": null,
+    "chunk_size_feed_forward": 0,
+    "cross_attention_hidden_size": null,
+    "decoder_start_token_id": null,
+    "diversity_penalty": 0.0,
+    "do_sample": false,
+    "dropout": 0.0,
+    "early_stopping": false,
+    "encoder_no_repeat_ngram_size": 0,
+    "eos_token_id": null,
+    "exponential_decay_length_penalty": null,
+    "finetuning_task": null,
+    "forced_bos_token_id": null,
+    "forced_eos_token_id": null,
+    "hidden_act": "gelu",
+    "hidden_size": 4096,
+    "id2label": {
+      "0": "LABEL_0",
+      "1": "LABEL_1"
+    },
+    "image_size": 448,
+    "initializer_factor": 1.0,
+    "initializer_range": 0.02,
+    "intermediate_size": 20480,
+    "is_decoder": false,
+    "is_encoder_decoder": false,
+    "k_bias": false,
+    "label2id": {
+      "LABEL_0": 0,
+      "LABEL_1": 1
+    },
+    "layer_norm_eps": 1e-06,
+    "length_penalty": 1.0,
+    "max_length": 20,
+    "min_length": 0,
+    "model_type": "clip_vision_model",
+    "no_repeat_ngram_size": 0,
+    "num_attention_heads": 32,
+    "num_beam_groups": 1,
+    "num_beams": 1,
+    "num_channels": 3,
+    "num_hidden_layers": 32,
+    "num_return_sequences": 1,
+    "output_attentions": false,
+    "output_hidden_states": false,
+    "output_scores": false,
+    "pad_token_id": null,
+    "patch_size": 14,
+    "post_layernorm": false,
+    "prefix": null,
+    "problem_type": null,
+    "projection_dim": 512,
+    "pruned_heads": {},
+    "q_bias": false,
+    "remove_invalid_values": false,
+    "repetition_penalty": 1.0,
+    "return_dict": true,
+    "return_dict_in_generate": false,
+    "sep_token_id": null,
+    "suppress_tokens": null,
+    "task_specific_params": null,
+    "temperature": 1.0,
+    "tf_legacy_loss": false,
+    "tie_encoder_decoder": false,
+    "tie_word_embeddings": true,
+    "tokenizer_class": null,
+    "top_k": 50,
+    "top_p": 1.0,
+    "torch_dtype": null,
+    "torchscript": false,
+    "transformers_version": "4.28.1",
+    "typical_p": 1.0,
+    "use_bfloat16": true,
+    "use_rms_norm": true,
+    "v_bias": false
+  }
+}

configuration_evaclip.py ADDED Viewed

	@@ -0,0 +1,411 @@

+# coding=utf-8
+""" EvaCLIP model configuration"""
+# Code mainly copied from: https://github.com/huggingface/transformers/blob/main/src/transformers/models/clip/configuration_clip.py
+# and adjusted for EvaCLIP
+import copy
+import os
+from collections import OrderedDict
+from typing import TYPE_CHECKING, Any, Mapping, Optional, Union
+if TYPE_CHECKING:
+    from transformers.processing_utils import ProcessorMixin
+    from transformers.utils import TensorType
+from transformers.configuration_utils import PretrainedConfig
+from transformers.utils import logging
+logger = logging.get_logger(__name__)
+class EvaCLIPTextConfig(PretrainedConfig):
+    r"""
+    Configuration class for the text encoder of an EvaCLIP model. It stores the arguments that define the text
+    encoder architecture. The class is adapted from the `transformers` [`CLIPTextConfig`] and adds the `q_bias`,
+    `k_bias`, `v_bias`, `post_layernorm` and `use_rms_norm` options.
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+    Args:
+        vocab_size (`int`, *optional*, defaults to 49408):
+            Vocabulary size of the text model. Defines the number of different tokens that can be represented by
+            the `inputs_ids` passed to the model.
+        hidden_size (`int`, *optional*, defaults to 512):
+            Dimensionality of the encoder layers and the pooler layer.
+        intermediate_size (`int`, *optional*, defaults to 2048):
+            Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
+        projection_dim (`int`, *optional*, defaults to 512):
+            Stored as `projection_dim`. Presumably the dimensionality of the projection layer -- confirm against the
+            modeling code.
+        num_hidden_layers (`int`, *optional*, defaults to 12):
+            Number of hidden layers in the Transformer encoder.
+        num_attention_heads (`int`, *optional*, defaults to 8):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        max_position_embeddings (`int`, *optional*, defaults to 77):
+            The maximum sequence length that this model might ever be used with.
+        hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
+            The non-linear activation function (function or string) in the encoder and pooler.
+        layer_norm_eps (`float`, *optional*, defaults to 1e-5):
+            The epsilon used by the layer normalization layers.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        initializer_factor (`float`, *optional*, defaults to 1):
+            A factor for initializing all weight matrices (should be kept to 1, used internally for initialization
+            testing).
+        q_bias (`bool`, *optional*, defaults to `True`):
+            Stored as `q_bias`. Presumably whether the query projection has a bias term -- confirm against the
+            modeling code.
+        k_bias (`bool`, *optional*, defaults to `True`):
+            Stored as `k_bias`. Presumably whether the key projection has a bias term -- confirm against the
+            modeling code.
+        v_bias (`bool`, *optional*, defaults to `True`):
+            Stored as `v_bias`. Presumably whether the value projection has a bias term -- confirm against the
+            modeling code.
+        post_layernorm (`bool`, *optional*, defaults to `False`):
+            Stored as `post_layernorm`. Its effect is defined by the modeling code, which is not part of this file.
+        pad_token_id (`int`, *optional*, defaults to 1):
+            Forwarded to `PretrainedConfig.__init__`.
+        bos_token_id (`int`, *optional*, defaults to 0):
+            Forwarded to `PretrainedConfig.__init__`.
+        eos_token_id (`int`, *optional*, defaults to 2):
+            Forwarded to `PretrainedConfig.__init__`.
+        use_rms_norm (`bool`, *optional*, defaults to `False`):
+            Stored as `use_rms_norm`. Presumably selects RMS normalization instead of layer normalization -- confirm
+            against the modeling code.
+        kwargs (*optional*):
+            Remaining keyword arguments, forwarded to `PretrainedConfig.__init__`.
+    Example:
+    ```python
+    >>> # Initializing an EvaCLIPTextConfig with the default values
+    >>> configuration = EvaCLIPTextConfig()
+    >>> # Accessing a configuration value
+    >>> configuration.hidden_act
+    'gelu'
+    ```"""
+    model_type = "clip_text_model"
+    def __init__(
+        self,
+        vocab_size=49408,
+        hidden_size=512,
+        intermediate_size=2048,
+        projection_dim=512,
+        num_hidden_layers=12,
+        num_attention_heads=8,
+        max_position_embeddings=77,
+        hidden_act="gelu",
+        layer_norm_eps=1e-5,
+        attention_dropout=0.0,
+        initializer_range=0.02,
+        initializer_factor=1.0,
+        q_bias=True,
+        k_bias=True,
+        v_bias=True,
+        post_layernorm=False,
+        pad_token_id=1,
+        bos_token_id=0,
+        eos_token_id=2,
+        use_rms_norm=False,
+        **kwargs,
+    ):
+        # The special token ids are handled by the base class; everything else is stored verbatim below.
+        super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
+        self.vocab_size = vocab_size
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.projection_dim = projection_dim
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.max_position_embeddings = max_position_embeddings
+        self.layer_norm_eps = layer_norm_eps
+        self.hidden_act = hidden_act
+        self.initializer_range = initializer_range
+        self.initializer_factor = initializer_factor
+        self.q_bias=q_bias
+        self.k_bias=k_bias
+        self.v_bias=v_bias
+        self.post_layernorm = post_layernorm
+        self.attention_dropout = attention_dropout
+        self.use_rms_norm = use_rms_norm
+    @classmethod
+    def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
+        """
+        Build a text configuration from a model name or path.
+        When the loaded configuration dict has `model_type == "clip"` (a full model configuration), only its
+        `"text_config"` entry is used. A warning is logged if the resulting `model_type` differs from
+        `cls.model_type`.
+        """
+        config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
+        # get the text config dict if we are loading from a full (composite) model config
+        if config_dict.get("model_type") == "clip":
+            config_dict = config_dict["text_config"]
+        if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
+            logger.warning(
+                f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
+                f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
+            )
+        return cls.from_dict(config_dict, **kwargs)
+class EvaCLIPVisionConfig(PretrainedConfig):
+    r"""
+    Configuration class for the vision encoder of an EvaCLIP model. It stores the arguments that define the vision
+    encoder architecture. The class is adapted from the `transformers` [`CLIPVisionConfig`] and adds the `q_bias`,
+    `k_bias`, `v_bias`, `post_layernorm` and `use_rms_norm` options.
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+    Args:
+        hidden_size (`int`, *optional*, defaults to 768):
+            Dimensionality of the encoder layers and the pooler layer.
+        intermediate_size (`int`, *optional*, defaults to 3072):
+            Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
+        projection_dim (`int`, *optional*, defaults to 512):
+            Stored as `projection_dim`. Presumably the dimensionality of the projection layer -- confirm against the
+            modeling code.
+        num_hidden_layers (`int`, *optional*, defaults to 12):
+            Number of hidden layers in the Transformer encoder.
+        num_attention_heads (`int`, *optional*, defaults to 12):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        num_channels (`int`, *optional*, defaults to 3):
+            Stored as `num_channels`. Presumably the number of input image channels -- confirm against the modeling
+            code.
+        image_size (`int`, *optional*, defaults to 224):
+            The size (resolution) of each image.
+        patch_size (`int`, *optional*, defaults to 32):
+            The size (resolution) of each patch.
+        hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
+            The non-linear activation function (function or string) in the encoder and pooler.
+        layer_norm_eps (`float`, *optional*, defaults to 1e-5):
+            The epsilon used by the layer normalization layers.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        initializer_factor (`float`, *optional*, defaults to 1):
+            A factor for initializing all weight matrices (should be kept to 1, used internally for initialization
+            testing).
+        q_bias (`bool`, *optional*, defaults to `True`):
+            Stored as `q_bias`. Presumably whether the query projection has a bias term -- confirm against the
+            modeling code.
+        k_bias (`bool`, *optional*, defaults to `True`):
+            Stored as `k_bias`. Presumably whether the key projection has a bias term -- confirm against the
+            modeling code.
+        v_bias (`bool`, *optional*, defaults to `True`):
+            Stored as `v_bias`. Presumably whether the value projection has a bias term -- confirm against the
+            modeling code.
+        post_layernorm (`bool`, *optional*, defaults to `False`):
+            Stored as `post_layernorm`. Its effect is defined by the modeling code, which is not part of this file.
+        use_rms_norm (`bool`, *optional*, defaults to `True`):
+            Stored as `use_rms_norm`. Presumably selects RMS normalization instead of layer normalization -- confirm
+            against the modeling code. Note that the default differs from the text configuration.
+        kwargs (*optional*):
+            Remaining keyword arguments, forwarded to `PretrainedConfig.__init__`.
+    Example:
+    ```python
+    >>> # Initializing an EvaCLIPVisionConfig with the default values
+    >>> configuration = EvaCLIPVisionConfig()
+    >>> # Accessing a configuration value
+    >>> configuration.patch_size
+    32
+    ```"""
+    model_type = "clip_vision_model"
+    def __init__(
+        self,
+        hidden_size=768,
+        intermediate_size=3072,
+        projection_dim=512,
+        num_hidden_layers=12,
+        num_attention_heads=12,
+        num_channels=3,
+        image_size=224,
+        patch_size=32,
+        hidden_act="gelu",
+        layer_norm_eps=1e-5,
+        attention_dropout=0.0,
+        initializer_range=0.02,
+        initializer_factor=1.0,
+        q_bias=True,
+        k_bias=True,
+        v_bias=True,
+        post_layernorm=False,
+        use_rms_norm=True,
+        **kwargs,
+    ):
+        # Unknown keyword arguments are handled by the base class; the named ones are stored verbatim below.
+        super().__init__(**kwargs)
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.projection_dim = projection_dim
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.num_channels = num_channels
+        self.patch_size = patch_size
+        self.image_size = image_size
+        self.initializer_range = initializer_range
+        self.initializer_factor = initializer_factor
+        self.q_bias=q_bias
+        self.k_bias=k_bias
+        self.v_bias=v_bias
+        self.post_layernorm = post_layernorm
+        self.attention_dropout = attention_dropout
+        self.layer_norm_eps = layer_norm_eps
+        self.hidden_act = hidden_act
+        self.use_rms_norm = use_rms_norm
+    @classmethod
+    def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
+        """
+        Build a vision configuration from a model name or path.
+        When the loaded configuration dict has `model_type == "clip"` (a full model configuration), only its
+        `"vision_config"` entry is used. A warning is logged if the resulting `model_type` differs from
+        `cls.model_type`.
+        """
+        config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
+        # get the vision config dict if we are loading from a full (composite) model config
+        if config_dict.get("model_type") == "clip":
+            config_dict = config_dict["vision_config"]
+        if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
+            logger.warning(
+                f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
+                f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
+            )
+        return cls.from_dict(config_dict, **kwargs)
+class EvaCLIPConfig(PretrainedConfig):
+    r"""
+    [`EvaCLIPConfig`] is the configuration class of a full EvaCLIP model. It holds one [`EvaCLIPTextConfig`] and one
+    [`EvaCLIPVisionConfig`] plus the model-level options below. The class is adapted from the `transformers`
+    [`CLIPConfig`].
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+    Args:
+        text_config (`dict`, *optional*):
+            Dictionary of configuration options used to initialize [`EvaCLIPTextConfig`]. When `None`, the text
+            configuration is built from its default values.
+        vision_config (`dict`, *optional*):
+            Dictionary of configuration options used to initialize [`EvaCLIPVisionConfig`]. When `None`, the vision
+            configuration is built from its default values.
+        projection_dim (`int`, *optional*, defaults to 512):
+            Dimensionality of text and vision projection layers.
+        logit_scale_init_value (`float`, *optional*, defaults to 2.6592):
+            The initial value of the *logit_scale* parameter. Default is used as per the original CLIP
+            implementation.
+        kwargs (*optional*):
+            Dictionary of keyword arguments. The legacy keys `text_config_dict` and `vision_config_dict` are popped
+            and merged into `text_config` / `vision_config` (their values take precedence); the rest is forwarded
+            to `PretrainedConfig.__init__`.
+    Example:
+    ```python
+    >>> # Initializing an EvaCLIPConfig with the default values
+    >>> configuration = EvaCLIPConfig()
+    >>> # We can also initialize an EvaCLIPConfig from an EvaCLIPTextConfig and an EvaCLIPVisionConfig
+    >>> config_text = EvaCLIPTextConfig()
+    >>> config_vision = EvaCLIPVisionConfig()
+    >>> config = EvaCLIPConfig.from_text_vision_configs(config_text, config_vision)
+    ```"""
+    model_type = "clip"
+    is_composition = True
+    def __init__(
+        self, text_config=None, vision_config=None, projection_dim=512, logit_scale_init_value=2.6592, **kwargs
+    ):
+        # If `_config_dict` exist, we use them for the backward compatibility.
+        # We pop out these 2 attributes before calling `super().__init__` to avoid them being saved (which causes a lot
+        # of confusion!).
+        text_config_dict = kwargs.pop("text_config_dict", None)
+        vision_config_dict = kwargs.pop("vision_config_dict", None)
+        super().__init__(**kwargs)
+        # Instead of simply assigning `[text|vision]_config_dict` to `[text|vision]_config`, we use the values in
+        # `[text|vision]_config_dict` to update the values in `[text|vision]_config`. The values should be same in most
+        # cases, but we don't want to break anything regarding `_config_dict` that existed before commit `8827e1b2`.
+        if text_config_dict is not None:
+            if text_config is None:
+                text_config = {}
+            # This is the complete result when using `text_config_dict`.
+            _text_config_dict = EvaCLIPTextConfig(**text_config_dict).to_dict()
+            # Give a warning if the values exist in both `_text_config_dict` and `text_config` but being different.
+            for key, value in _text_config_dict.items():
+                if key in text_config and value != text_config[key] and key not in ["transformers_version"]:
+                    # If specified in `text_config_dict`
+                    if key in text_config_dict:
+                        message = (
+                            f"`{key}` is found in both `text_config_dict` and `text_config` but with different values. "
+                            f'The value `text_config_dict["{key}"]` will be used instead.'
+                        )
+                    # If inferred from default argument values (just to be super careful)
+                    else:
+                        message = (
+                            f"`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The "
+                            f'value `text_config["{key}"]` will be overriden.'
+                        )
+                    logger.warning(message)
+            # Update all values in `text_config` with the ones in `_text_config_dict`.
+            # NOTE: this updates the `text_config` dict in place, i.e. a dict passed by the caller is modified.
+            text_config.update(_text_config_dict)
+        if vision_config_dict is not None:
+            if vision_config is None:
+                vision_config = {}
+            # This is the complete result when using `vision_config_dict`.
+            _vision_config_dict = EvaCLIPVisionConfig(**vision_config_dict).to_dict()
+            # convert keys to string instead of integer
+            if "id2label" in _vision_config_dict:
+                _vision_config_dict["id2label"] = {
+                    str(key): value for key, value in _vision_config_dict["id2label"].items()
+                }
+            # Give a warning if the values exist in both `_vision_config_dict` and `vision_config` but being different.
+            for key, value in _vision_config_dict.items():
+                if key in vision_config and value != vision_config[key] and key not in ["transformers_version"]:
+                    # If specified in `vision_config_dict`
+                    if key in vision_config_dict:
+                        message = (
+                            f"`{key}` is found in both `vision_config_dict` and `vision_config` but with different "
+                            f'values. The value `vision_config_dict["{key}"]` will be used instead.'
+                        )
+                    # If inferred from default argument values (just to be super careful)
+                    else:
+                        message = (
+                            f"`vision_config_dict` is provided which will be used to initialize `CLIPVisionConfig`. "
+                            f'The value `vision_config["{key}"]` will be overriden.'
+                        )
+                    logger.warning(message)
+            # Update all values in `vision_config` with the ones in `_vision_config_dict`.
+            # NOTE: this updates the `vision_config` dict in place, i.e. a dict passed by the caller is modified.
+            vision_config.update(_vision_config_dict)
+        # Fall back to the sub-configuration defaults when nothing was supplied.
+        if text_config is None:
+            text_config = {}
+            logger.info("`text_config` is `None`. Initializing the `CLIPTextConfig` with default values.")
+        if vision_config is None:
+            vision_config = {}
+            logger.info("`vision_config` is `None`. initializing the `CLIPVisionConfig` with default values.")
+        self.text_config = EvaCLIPTextConfig(**text_config)
+        self.vision_config = EvaCLIPVisionConfig(**vision_config)
+        self.projection_dim = projection_dim
+        self.logit_scale_init_value = logit_scale_init_value
+        # Fixed at 1.0 here; this constructor has no `initializer_factor` parameter of its own.
+        self.initializer_factor = 1.0
+    @classmethod
+    def from_text_vision_configs(cls, text_config: EvaCLIPTextConfig, vision_config: EvaCLIPVisionConfig, **kwargs):
+        r"""
+        Instantiate an [`EvaCLIPConfig`] (or a derived class) from a text model configuration and a vision model
+        configuration. Both are converted with `to_dict()` and passed to the constructor together with `kwargs`.
+        Returns:
+            [`EvaCLIPConfig`]: An instance of a configuration object
+        """
+        return cls(text_config=text_config.to_dict(), vision_config=vision_config.to_dict(), **kwargs)
+    def to_dict(self):
+        """
+        Serializes this instance to a Python dictionary. Override the default [`~PretrainedConfig.to_dict`].
+        The instance attributes are deep-copied, the two sub-configurations are replaced by their own `to_dict()`
+        output, and `model_type` is set from the class attribute.
+        Returns:
+            `Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance,
+        """
+        output = copy.deepcopy(self.__dict__)
+        output["text_config"] = self.text_config.to_dict()
+        output["vision_config"] = self.vision_config.to_dict()
+        output["model_type"] = self.__class__.model_type
+        return output

convert_evaclip_8b_448_pytorch_to_hf.py ADDED Viewed

	@@ -0,0 +1,160 @@

+# coding=utf-8
+# Copyright 2023 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# Part of the code was taken from:
+# https://github.com/huggingface/transformers/blob/main/src/transformers/models/clap/convert_clap_original_pytorch_to_hf.py
+import argparse
+import os, sys
+sys.path.append(os.getcwd())
+import torch
+from PIL import Image
+from transformers import AutoModel, AutoConfig
+from transformers import  CLIPImageProcessor, pipeline, CLIPTokenizer
+from EVA_CLIP_8B_448.configuration_evaclip import EvaCLIPConfig
+from EVA_CLIP_8B_448.modeling_evaclip import EvaCLIPModel
+# Substring replacements that `rename_state_dict` applies to every parameter name of the original checkpoint.
+# All entries are tried for each name, in the insertion order of this dict, and each replacement works on the
+# result of the previous ones. The order is therefore significant, e.g.:
+#   * "visual.blocks" / "visual.head" must come before the generic "visual." prefix rule,
+#   * "text.text_projection" / "text.transformer.resblocks" must come before the generic "text." prefix rule,
+#   * "norm." must come before "ln_final" is renamed to "final_layer_norm" (a name that, followed by a dot, would
+#     itself match "norm.").
+# Do not reorder entries without re-checking every rule that follows.
+KEYS_TO_MODIFY_MAPPING = {
+    "cls_token":"embeddings.class_embedding",
+    "pos_embed":"embeddings.position_embedding.weight",
+    "patch_embed.proj":"embeddings.patch_embedding",
+    ".positional_embedding":".embeddings.position_embedding.weight",
+    ".token_embedding":".embeddings.token_embedding",
+    "text.text_projection":"text_projection.weight",
+    "mlp.c_fc":"mlp.fc1",
+    "mlp.c_proj":"mlp.fc2",
+    ".proj.":".out_proj.",
+    "q_bias":"q_proj.bias",
+    "v_bias":"v_proj.bias",
+    "out.":"out_proj.",
+    "norm1":"layer_norm1",
+    "norm2":"layer_norm2",
+    "ln_1":"layer_norm1",
+    "ln_2":"layer_norm2",
+    "attn":"self_attn",
+    "norm.":"post_layernorm.",
+    "ln_final":"final_layer_norm",
+    "visual.blocks":"vision_model.encoder.layers",
+    "text.transformer.resblocks":"text_model.encoder.layers",
+    "visual.head":"visual_projection",
+    "visual.":"vision_model.",
+    "text.":"text_model.",
+}
+def rename_state_dict(state_dict):
+    model_state_dict = {}
+    for key, value in state_dict.items():
+        # check if any key needs to be modified
+        for key_to_modify, new_key in KEYS_TO_MODIFY_MAPPING.items():
+            if key_to_modify in key:
+                key = key.replace(key_to_modify, new_key)
+        if "text_projection" in key:
+            model_state_dict[key] = value.T
+        elif "attn.qkv" in key:
+            # split qkv into query key and value
+            mixed_qkv = value
+            qkv_dim = mixed_qkv.size(0) // 3
+            query_layer = mixed_qkv[:qkv_dim]
+            key_layer = mixed_qkv[qkv_dim : qkv_dim * 2]
+            value_layer = mixed_qkv[qkv_dim * 2 :]
+            model_state_dict[key.replace("qkv", "q_proj")] = query_layer
+            model_state_dict[key.replace("qkv", "k_proj")] = key_layer
+            model_state_dict[key.replace("qkv", "v_proj")] = value_layer
+        elif "attn.in_proj" in key:
+            # split qkv into query key and value
+            mixed_qkv = value
+            qkv_dim = mixed_qkv.size(0) // 3
+            query_layer = mixed_qkv[:qkv_dim]
+            key_layer = mixed_qkv[qkv_dim : qkv_dim * 2]
+            value_layer = mixed_qkv[qkv_dim * 2 :]
+            model_state_dict[key.replace("in_proj_", "q_proj.")] = query_layer
+            model_state_dict[key.replace("in_proj_", "k_proj.")] = key_layer
+            model_state_dict[key.replace("in_proj_", "v_proj.")] = value_layer
+        elif "class_embedding" in key:
+            model_state_dict[key] = value[0,0,:]
+        elif "vision_model.embeddings.position_embedding" in key:
+            model_state_dict[key] = value[0,:,:]
+        else:
+            model_state_dict[key] = value
+    return model_state_dict
+def save_model_and_config(pytorch_dump_folder_path, hf_model, transformers_config):
+    hf_model.save_pretrained(pytorch_dump_folder_path)
+    transformers_config.save_pretrained(pytorch_dump_folder_path)
+def check_loaded_model(pytorch_dump_folder_path, tokenizer, processor, image, captions):
+    hf_config = AutoConfig.from_pretrained(pytorch_dump_folder_path, trust_remote_code=True)
+    hf_model = AutoModel.from_pretrained(pytorch_dump_folder_path, config=hf_config, trust_remote_code=True)
+    detector = pipeline(model=hf_model, task="zero-shot-image-classification", tokenizer = tokenizer, image_processor=processor)
+    detector_probs = detector(image, candidate_labels=captions)
+    print(f"text_probs loaded hf_model using pipeline: {detector_probs}")
+def convert_evaclip_checkpoint(checkpoint_path, pytorch_dump_folder_path, config_path, image_path, save=False):
+    processor = CLIPImageProcessor(size={"shortest_edge":448}, do_center_crop=True, crop_size=448)
+    print(f"processor={str(processor)}")
+    image = Image.open(image_path)
+    captions = ["a diagram", "a dog", "a cat"]
+    tokenizer = CLIPTokenizer.from_pretrained(pytorch_dump_folder_path)
+    input_ids = tokenizer(captions,  return_tensors="pt", padding=True).input_ids
+    input_pixels = processor(images=image, size=448, return_tensors="pt", padding=True).pixel_values
+    print("input_pixels.shape", input_pixels.shape)
+    transformers_config = EvaCLIPConfig.from_pretrained(config_path)
+    hf_model = EvaCLIPModel(transformers_config)
+    pt_model_state_dict = torch.load(checkpoint_path, map_location="cpu")
+    state_dict = rename_state_dict(pt_model_state_dict)
+    hf_model.load_state_dict(state_dict, strict=True)
+    with torch.no_grad():
+        image_features = hf_model.encode_image(input_pixels)
+        text_features = hf_model.encode_text(input_ids)
+        image_features /= image_features.norm(dim=-1, keepdim=True)
+        text_features /= text_features.norm(dim=-1, keepdim=True)
+    label_probs = (100.0 * image_features @ text_features.T).softmax(dim=-1)
+    print(f"hf_model label probs: {label_probs}")
+    if save:
+        save_model_and_config(pytorch_dump_folder_path, hf_model, transformers_config)
+    check_loaded_model(pytorch_dump_folder_path, tokenizer, processor, image, captions)
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--pytorch_dump_folder_path", default="EVA_CLIP_8B_448" ,type=str, help="Path to the output PyTorch model.")
+    parser.add_argument("--checkpoint_path", default="EVA_CLIP_8B_psz14_plus_s0.6B.pt", type=str, help="Path to fairseq checkpoint" )
+    parser.add_argument("--config_path", default='EVA_CLIP_8B_448', type=str, help="Path to hf config.json of model to convert")
+    parser.add_argument("--image_path", default='EVA_CLIP_8B_448/CLIP.png', type=str, help="Path to image")
+    parser.add_argument("--save", default=False, action="store_true", help="Save the model and config to the pytorch_dump_folder_path. Default is True.")
+    args = parser.parse_args()
+    convert_evaclip_checkpoint(args.checkpoint_path, args.pytorch_dump_folder_path, args.config_path, args.image_path, args.save)

merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

modeling_evaclip.py ADDED Viewed

	@@ -0,0 +1,1059 @@

+# coding=utf-8
+""" PyTorch EvaCLIP model."""
+# Code mainly copied from here: https://github.com/huggingface/transformers/blob/main/src/transformers/models/clip/modeling_clip.py
+# and adjusted for evaclip
+from dataclasses import dataclass
+from typing import Any, Optional, Tuple, Union
+import torch
+import torch.utils.checkpoint
+from torch import nn
+from transformers.activations import ACT2FN
+from transformers.modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling
+from transformers.modeling_utils import PreTrainedModel
+from transformers.utils import (
+    ModelOutput,
+    logging,
+)
+from .configuration_evaclip import EvaCLIPConfig, EvaCLIPTextConfig, EvaCLIPVisionConfig
+# try:
+#     from xformers import ops as xops
+# except ImportError:
+#     xops = None
+logger = logging.get_logger(__name__)
+class RMSNorm(nn.Module):
+    """
+    adepted from transformers T5LayerNorm
+    """
+    def __init__(self, hidden_size, eps=1e-6):
+        """
+        Construct a layernorm module in the T5 style. No bias and no subtraction of mean.
+        """
+        super().__init__()
+        self.weight = nn.Parameter(torch.ones(hidden_size))
+        self.variance_epsilon = eps
+    def forward(self, hidden_states):
+        # T5 uses a layer_norm which only scales and doesn't shift, which is also known as Root Mean
+        # Square Layer Normalization https://arxiv.org/abs/1910.07467 thus varience is calculated
+        # w/o mean and there is no bias. Additionally we want to make sure that the accumulation for
+        # half-precision inputs is done in fp32
+        variance = hidden_states.to(torch.float32).pow(2).mean(-1, keepdim=True)
+        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
+        # convert into half-precision if necessary
+        if self.weight.dtype in [torch.float16, torch.bfloat16]:
+            hidden_states = hidden_states.to(self.weight.dtype)
+        return self.weight * hidden_states
+# Copied from transformers.models.bart.modeling_bart._expand_mask
+def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None):
+    """
+    Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.
+    """
+    bsz, src_len = mask.size()
+    tgt_len = tgt_len if tgt_len is not None else src_len
+    expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype)
+    inverted_mask = 1.0 - expanded_mask
+    return inverted_mask.masked_fill(inverted_mask.to(torch.bool), torch.finfo(dtype).min)
+# contrastive loss function, adapted from
+# https://sachinruk.github.io/blog/pytorch/pytorch%20lightning/loss%20function/gpu/2021/03/07/CLIP.html
+def contrastive_loss(logits: torch.Tensor) -> torch.Tensor:
+    return nn.functional.cross_entropy(logits, torch.arange(len(logits), device=logits.device))
+def clip_loss(similarity: torch.Tensor) -> torch.Tensor:
+    caption_loss = contrastive_loss(similarity)
+    image_loss = contrastive_loss(similarity.t())
+    return (caption_loss + image_loss) / 2.0
+@dataclass
+class EvaCLIPVisionModelOutput(ModelOutput):
+    """
+    Output container for a vision model that also returns image embeddings.
+    NOTE(review): the field descriptions below are inferred from the field names; the producing model is not
+    visible from here — confirm before relying on them.
+    """
+    # presumably the projected image embeddings
+    image_embeds: Optional[torch.FloatTensor] = None
+    # presumably the hidden states of the last encoder layer
+    last_hidden_state: torch.FloatTensor = None
+    # presumably per-layer hidden states, populated on request
+    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
+    # presumably per-layer attention weights, populated on request
+    attentions: Optional[Tuple[torch.FloatTensor]] = None
+@dataclass
+class EvaCLIPTextModelOutput(ModelOutput):
+    """
+    Output container for a text model that also returns text embeddings.
+    NOTE(review): the field descriptions below are inferred from the field names; the producing model is not
+    visible from here — confirm before relying on them.
+    """
+    # presumably the projected text embeddings
+    text_embeds: Optional[torch.FloatTensor] = None
+    # presumably the hidden states of the last encoder layer
+    last_hidden_state: torch.FloatTensor = None
+    # presumably per-layer hidden states, populated on request
+    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
+    # presumably per-layer attention weights, populated on request
+    attentions: Optional[Tuple[torch.FloatTensor]] = None
+@dataclass
+class EvaCLIPOutput(ModelOutput):
+    """
+    Output container for the joint image/text model.
+    NOTE(review): the field descriptions below are inferred from the field names; the producing model is not
+    visible from here — confirm before relying on them.
+    """
+    # presumably the contrastive loss, when one was computed
+    loss: Optional[torch.FloatTensor] = None
+    # presumably image-to-text similarity scores
+    logits_per_image: torch.FloatTensor = None
+    # presumably text-to-image similarity scores
+    logits_per_text: torch.FloatTensor = None
+    # presumably the projected text embeddings
+    text_embeds: torch.FloatTensor = None
+    # presumably the projected image embeddings
+    image_embeds: torch.FloatTensor = None
+    # nested output of the text tower
+    text_model_output: BaseModelOutputWithPooling = None
+    # nested output of the vision tower
+    vision_model_output: BaseModelOutputWithPooling = None
+    def to_tuple(self) -> Tuple[Any]:
+        """
+        Return the populated fields as a tuple, in key order. The nested `text_model_output` and
+        `vision_model_output` entries are themselves converted with their own `to_tuple()`.
+        """
+        return tuple(
+            self[k] if k not in ["text_model_output", "vision_model_output"] else getattr(self, k).to_tuple()
+            for k in self.keys()
+        )
+class EvaCLIPVisionEmbeddings(nn.Module):
+    def __init__(self, config: EvaCLIPVisionConfig):
+        super().__init__()
+        self.config = config
+        self.embed_dim = config.hidden_size
+        self.image_size = config.image_size
+        self.patch_size = config.patch_size
+        self.class_embedding = nn.Parameter(torch.randn(self.embed_dim))
+        self.patch_embedding = nn.Conv2d(
+            in_channels=config.num_channels,
+            out_channels=self.embed_dim,
+            kernel_size=self.patch_size,
+            stride=self.patch_size,
+            bias=True,
+        )
+        self.num_patches = (self.image_size // self.patch_size) ** 2
+        self.num_positions = self.num_patches + 1
+        self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim)
+        self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1)), persistent = False)
+    def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor:
+        batch_size = pixel_values.shape[0]
+        patch_embeds = self.patch_embedding(pixel_values)  # shape = [*, width, grid, grid]
+        patch_embeds = patch_embeds.flatten(2).transpose(1, 2)
+        class_embeds = self.class_embedding.expand(batch_size, 1, -1)
+        embeddings = torch.cat([class_embeds, patch_embeds], dim=1)
+        embeddings = embeddings + self.position_embedding(self.position_ids)
+        return embeddings
+class EvaCLIPTextEmbeddings(nn.Module):
+    def __init__(self, config: EvaCLIPTextConfig):
+        super().__init__()
+        embed_dim = config.hidden_size
+        self.token_embedding = nn.Embedding(config.vocab_size, embed_dim)
+        self.position_embedding = nn.Embedding(config.max_position_embeddings, embed_dim)
+        # position_ids (1, len position emb) is contiguous in memory and exported when serialized
+        self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False)
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+    ) -> torch.Tensor:
+        seq_length = input_ids.shape[-1] if input_ids is not None else inputs_embeds.shape[-2]
+        if position_ids is None:
+            position_ids = self.position_ids[:, :seq_length]
+        if inputs_embeds is None:
+            inputs_embeds = self.token_embedding(input_ids)
+        position_embeddings = self.position_embedding(position_ids)
+        embeddings = inputs_embeds + position_embeddings
+        return embeddings
+class EvaCLIPAttention(nn.Module):
+    """Multi-headed attention from 'Attention Is All You Need' paper"""
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+        self.embed_dim = config.hidden_size
+        self.num_heads = config.num_attention_heads
+        self.head_dim = self.embed_dim // self.num_heads
+        if self.head_dim * self.num_heads != self.embed_dim:
+            raise ValueError(
+                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
+                f" {self.num_heads})."
+            )
+        self.scale = self.head_dim**-0.5
+        self.dropout = config.attention_dropout
+        self.k_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.k_bias)
+        self.v_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.v_bias)
+        self.q_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.q_bias)
+        self.out_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=True)
+    def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
+        return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        causal_attention_mask: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = False,
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+        """Input shape: Batch x Time x Channel"""
+        bsz, tgt_len, embed_dim = hidden_states.size()
+        query_states = self.q_proj(hidden_states) * self.scale
+        key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
+        value_states = self._shape(self.v_proj(hidden_states), -1, bsz)
+        proj_shape = (bsz * self.num_heads, -1, self.head_dim)
+        query_states = self._shape(query_states, tgt_len, bsz).view(*proj_shape)
+        key_states = key_states.view(*proj_shape)
+        value_states = value_states.view(*proj_shape)
+        src_len = key_states.size(1)
+        attn_weights = torch.bmm(query_states, key_states.transpose(1, 2))
+        if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len):
+            raise ValueError(
+                f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is"
+                f" {attn_weights.size()}"
+            )
+        # apply the causal_attention_mask first
+        if causal_attention_mask is not None:
+            if causal_attention_mask.size() != (bsz, 1, tgt_len, src_len):
+                raise ValueError(
+                    f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is"
+                    f" {causal_attention_mask.size()}"
+                )
+            attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + causal_attention_mask
+            attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
+        if attention_mask is not None:
+            if attention_mask.size() != (bsz, 1, tgt_len, src_len):
+                raise ValueError(
+                    f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}"
+                )
+            attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask
+            attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
+        attn_weights = nn.functional.softmax(attn_weights, dim=-1)
+        if output_attentions:
+            # this operation is a bit akward, but it's required to
+            # make sure that attn_weights keeps its gradient.
+            # In order to do so, attn_weights have to reshaped
+            # twice and have to be reused in the following
+            attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
+            attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len)
+        else:
+            attn_weights_reshaped = None
+        attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)
+        attn_output = torch.bmm(attn_probs, value_states)
+        if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim):
+            raise ValueError(
+                f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is"
+                f" {attn_output.size()}"
+            )
+        attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim)
+        attn_output = attn_output.transpose(1, 2)
+        attn_output = attn_output.reshape(bsz, tgt_len, embed_dim)
+        attn_output = self.out_proj(attn_output)
+        return attn_output, attn_weights_reshaped
+class EvaCLIPTextAttention(nn.Module):
+    """Multi-headed attention from 'Attention Is All You Need' paper"""
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+        self.embed_dim = config.hidden_size
+        self.num_heads = config.num_attention_heads
+        self.head_dim = self.embed_dim // self.num_heads
+        if self.head_dim * self.num_heads != self.embed_dim:
+            raise ValueError(
+                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
+                f" {self.num_heads})."
+            )
+        self.scale = self.head_dim**-0.5
+        self.dropout = config.attention_dropout
+        self.k_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.k_bias)
+        self.v_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.v_bias)
+        self.q_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.q_bias)
+        self.out_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=True)
+    def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
+        return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        causal_attention_mask: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = False,
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+        """Input shape: Batch x Time x Channel"""
+        bsz, tgt_len, embed_dim = hidden_states.size()
+        # get query proj
+        query_states = self.q_proj(hidden_states)
+        key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
+        value_states = self._shape(self.v_proj(hidden_states), -1, bsz)
+        proj_shape = (bsz * self.num_heads, -1, self.head_dim)
+        query_states = self._shape(query_states, tgt_len, bsz).view(*proj_shape)
+        key_states = key_states.view(*proj_shape)
+        value_states = value_states.view(*proj_shape)
+        src_len = key_states.size(1)
+        attn_weights = torch.bmm(query_states, key_states.transpose(1, 2))
+        if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len):
+            raise ValueError(
+                f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is"
+                f" {attn_weights.size()}"
+            )
+        # apply the causal_attention_mask first
+        if causal_attention_mask is not None:
+            if causal_attention_mask.size() != (bsz, 1, tgt_len, src_len):
+                raise ValueError(
+                    f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is"
+                    f" {causal_attention_mask.size()}"
+                )
+            attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + causal_attention_mask
+            attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
+        if attention_mask is not None:
+            if attention_mask.size() != (bsz, 1, tgt_len, src_len):
+                raise ValueError(
+                    f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}"
+                )
+            attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask
+            attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
+        attn_weights = nn.functional.softmax(attn_weights, dim=-1)
+        if output_attentions:
+            # this operation is a bit akward, but it's required to
+            # make sure that attn_weights keeps its gradient.
+            # In order to do so, attn_weights have to reshaped
+            # twice and have to be reused in the following
+            attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
+            attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len)
+        else:
+            attn_weights_reshaped = None
+        attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)
+        attn_output = torch.bmm(attn_probs, value_states)
+        if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim):
+            raise ValueError(
+                f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is"
+                f" {attn_output.size()}"
+            )
+        attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim)
+        attn_output = attn_output.transpose(1, 2)
+        attn_output = attn_output.reshape(bsz, tgt_len, embed_dim)
+        attn_output = self.out_proj(attn_output)
+        return attn_output, attn_weights_reshaped
+class EvaCLIPMLP(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+        self.activation_fn = ACT2FN[config.hidden_act]
+        self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size)
+        self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size)
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        hidden_states = self.fc1(hidden_states)
+        hidden_states = self.activation_fn(hidden_states)
+        hidden_states = self.fc2(hidden_states)
+        return hidden_states
+class EvaCLIPEncoderLayer(nn.Module):
+    def __init__(self, config: EvaCLIPConfig):
+        super().__init__()
+        self.config = config
+        norm_layer = RMSNorm if config.use_rms_norm else nn.LayerNorm
+        self.embed_dim = config.hidden_size
+        self.post_layernorm = config.post_layernorm if config.post_layernorm is not None else False
+        self.self_attn = EvaCLIPAttention(config)
+        self.layer_norm1 = norm_layer(self.embed_dim, eps=config.layer_norm_eps)
+        self.mlp = EvaCLIPMLP(config)
+        self.layer_norm2 = norm_layer(self.embed_dim, eps=config.layer_norm_eps)
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: torch.Tensor,
+        causal_attention_mask: torch.Tensor,
+        output_attentions: Optional[bool] = False,
+    ) -> Tuple[torch.FloatTensor]:
+        residual = hidden_states
+        if not self.post_layernorm:
+            hidden_states = self.layer_norm1(hidden_states)
+        hidden_states, attn_weights = self.self_attn(
+            hidden_states=hidden_states,
+            attention_mask=attention_mask,
+            causal_attention_mask=causal_attention_mask,
+            output_attentions=output_attentions,
+        )
+        if self.post_layernorm:
+            hidden_states = self.layer_norm1(hidden_states)
+        hidden_states = residual + hidden_states
+        residual = hidden_states
+        if not self.post_layernorm:
+            hidden_states = self.layer_norm2(hidden_states)
+        hidden_states = self.mlp(hidden_states)
+        if self.post_layernorm:
+            hidden_states = self.layer_norm2(hidden_states)
+        hidden_states = residual + hidden_states
+        outputs = (hidden_states,)
+        if output_attentions:
+            outputs += (attn_weights,)
+        return outputs
+class EvaCLIPPreTrainedModel(PreTrainedModel):
+    """
+    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
+    models.
+    """
+    config_class = EvaCLIPConfig
+    base_model_prefix = "clip"
+    supports_gradient_checkpointing = True
+    _keys_to_ignore_on_load_missing = [r"position_ids"]
+    def _init_weights(self, module):
+        """
+        Initialize the weights of `module` according to its type.
+        All standard deviations are multiplied by `config.initializer_factor`. There is no branch for
+        `EvaCLIPTextAttention` or `RMSNorm`, so those modules are not touched by the type-specific chain.
+        """
+        factor = self.config.initializer_factor
+        if isinstance(module, EvaCLIPTextEmbeddings):
+            # Text embeddings use a fixed base std of 0.02.
+            module.token_embedding.weight.data.normal_(mean=0.0, std=factor * 0.02)
+            module.position_embedding.weight.data.normal_(mean=0.0, std=factor * 0.02)
+        elif isinstance(module, EvaCLIPVisionEmbeddings):
+            factor = self.config.initializer_factor
+            # The class token scales with embed_dim; patch and position embeddings use initializer_range.
+            nn.init.normal_(module.class_embedding, mean=0.0, std=module.embed_dim**-0.5 * factor)
+            nn.init.normal_(module.patch_embedding.weight, std=module.config.initializer_range * factor)
+            nn.init.normal_(module.position_embedding.weight, std=module.config.initializer_range * factor)
+        elif isinstance(module, EvaCLIPAttention):
+            factor = self.config.initializer_factor
+            # The q/k/v std additionally shrinks with the number of layers; the output projection does not.
+            in_proj_std = (module.embed_dim**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor
+            out_proj_std = (module.embed_dim**-0.5) * factor
+            nn.init.normal_(module.q_proj.weight, std=in_proj_std)
+            nn.init.normal_(module.k_proj.weight, std=in_proj_std)
+            nn.init.normal_(module.v_proj.weight, std=in_proj_std)
+            nn.init.normal_(module.out_proj.weight, std=out_proj_std)
+        elif isinstance(module, EvaCLIPMLP):
+            factor = self.config.initializer_factor
+            in_proj_std = (
+                (module.config.hidden_size**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor
+            )
+            fc_std = (2 * module.config.hidden_size) ** -0.5 * factor
+            # fc1 gets fc_std, while fc2 gets the depth-scaled in_proj_std.
+            nn.init.normal_(module.fc1.weight, std=fc_std)
+            nn.init.normal_(module.fc2.weight, std=in_proj_std)
+        elif isinstance(module, EvaCLIPModel):
+            # Each projection is scaled by the width of the tower it projects from.
+            nn.init.normal_(
+                module.text_projection.weight,
+                std=module.text_embed_dim**-0.5 * self.config.initializer_factor,
+            )
+            nn.init.normal_(
+                module.visual_projection.weight,
+                std=module.vision_embed_dim**-0.5 * self.config.initializer_factor,
+            )
+        elif isinstance(module, EvaCLIPVisionModelWithProjection):
+            nn.init.normal_(
+                module.visual_projection.weight,
+                std=self.config.hidden_size**-0.5 * self.config.initializer_factor,
+            )
+        elif isinstance(module, EvaCLIPTextModelWithProjection):
+            nn.init.normal_(
+                module.text_projection.weight,
+                std=self.config.hidden_size**-0.5 * self.config.initializer_factor,
+            )
+        # The two checks below are separate `if`s, so they run regardless of the chain above.
+        if isinstance(module, nn.LayerNorm):
+            module.bias.data.zero_()
+            module.weight.data.fill_(1.0)
+        if isinstance(module, nn.Linear) and module.bias is not None:
+            module.bias.data.zero_()
+    def _set_gradient_checkpointing(self, module, value=False):
+        """Set the `gradient_checkpointing` flag on `module` when it is an `EvaCLIPEncoder`."""
+        if isinstance(module, EvaCLIPEncoder):
+            module.gradient_checkpointing = value
+class EvaCLIPEncoder(nn.Module):
+    """
+    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
+    [`EvaCLIPEncoderLayer`].
+    Args:
+        config: EvaCLIPConfig
+    """
+    def __init__(self, config: EvaCLIPConfig):
+        super().__init__()
+        self.config = config
+        self.layers = nn.ModuleList([EvaCLIPEncoderLayer(config) for _ in range(config.num_hidden_layers)])
+        # Off by default; only used by forward() while the module is in training mode.
+        self.gradient_checkpointing = False
+    def forward(
+        self,
+        inputs_embeds,
+        attention_mask: Optional[torch.Tensor] = None,
+        causal_attention_mask: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[Tuple, BaseModelOutput]:
+        """
+        Run `inputs_embeds` through every layer in order.
+        Args:
+            inputs_embeds: embedded input sequence fed to the first layer.
+            attention_mask: optional mask forwarded unchanged to every layer.
+            causal_attention_mask: optional causal mask forwarded unchanged to every layer.
+            output_attentions: collect each layer's attention weights; defaults to the config value.
+            output_hidden_states: collect the input of every layer plus the final output; defaults to the
+                config value.
+            return_dict: return a `BaseModelOutput` rather than a tuple; defaults to the config value.
+        Returns:
+            A `BaseModelOutput`, or a tuple of the non-None values among
+            `(last_hidden_state, hidden_states, attentions)`.
+        """
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        encoder_states = () if output_hidden_states else None
+        all_attentions = () if output_attentions else None
+        hidden_states = inputs_embeds
+        for idx, encoder_layer in enumerate(self.layers):
+            # Record the state entering this layer.
+            if output_hidden_states:
+                encoder_states = encoder_states + (hidden_states,)
+            if self.gradient_checkpointing and self.training:
+                # The wrapper passes `output_attentions` as the fourth positional argument of the layer.
+                def create_custom_forward(module):
+                    def custom_forward(*inputs):
+                        return module(*inputs, output_attentions)
+                    return custom_forward
+                layer_outputs = torch.utils.checkpoint.checkpoint(
+                    create_custom_forward(encoder_layer),
+                    hidden_states,
+                    attention_mask,
+                    causal_attention_mask,
+                )
+            else:
+                layer_outputs = encoder_layer(
+                    hidden_states,
+                    attention_mask,
+                    causal_attention_mask,
+                    output_attentions=output_attentions,
+                )
+            hidden_states = layer_outputs[0]
+            if output_attentions:
+                all_attentions = all_attentions + (layer_outputs[1],)
+        # Append the output of the last layer as well.
+        if output_hidden_states:
+            encoder_states = encoder_states + (hidden_states,)
+        if not return_dict:
+            return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None)
+        return BaseModelOutput(
+            last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions
+        )
+class EvaCLIPTextTransformer(EvaCLIPPreTrainedModel):
+    def __init__(self, config: EvaCLIPTextConfig):
+        super().__init__(config)
+        self.config = config
+        embed_dim = config.hidden_size
+        norm_layer = RMSNorm if config.use_rms_norm else nn.LayerNorm
+        self.embeddings = EvaCLIPTextEmbeddings(config)
+        self.encoder = EvaCLIPEncoder(config)
+        self.final_layer_norm = norm_layer(embed_dim, eps=config.layer_norm_eps)
+    def gradient_checkpointing_enable(self):
+        self.encoder.gradient_checkpointing = True
+    def forward(
+        self,
+        input_ids: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[Tuple, BaseModelOutputWithPooling]:
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        if input_ids is None:
+            raise ValueError("You have to specify input_ids")
+        input_shape = input_ids.size()
+        input_ids = input_ids.view(-1, input_shape[-1])
+        hidden_states = self.embeddings(input_ids=input_ids, position_ids=position_ids)
+        bsz, seq_len = input_shape
+        # CLIP's text model uses causal mask, prepare it here.
+        # https://github.com/openai/CLIP/blob/cfcffb90e69f37bf2ff1e988237a0fbe41f33c04/clip/model.py#L324
+        causal_attention_mask = self._build_causal_attention_mask(bsz, seq_len, hidden_states.dtype).to(
+            hidden_states.device
+        )
+        # expand attention_mask
+        if attention_mask is not None:
+            # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
+            attention_mask = _expand_mask(attention_mask, hidden_states.dtype)
+        encoder_outputs = self.encoder(
+            inputs_embeds=hidden_states,
+            attention_mask=attention_mask,
+            causal_attention_mask=causal_attention_mask,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        last_hidden_state = encoder_outputs[0]
+        last_hidden_state = self.final_layer_norm(last_hidden_state)
+        # text_embeds.shape = [batch_size, sequence_length, transformer.width]
+        # take features from the eot embedding (eot_token is the highest number in each sequence)
+        # casting to torch.int for onnx compatibility: argmax doesn't support int64 inputs with opset 14
+        pooled_output = last_hidden_state[
+            torch.arange(last_hidden_state.shape[0], device=last_hidden_state.device),
+            input_ids.to(dtype=torch.int, device=last_hidden_state.device).argmax(dim=-1),
+        ]
+        if not return_dict:
+            return (last_hidden_state, pooled_output) + encoder_outputs[1:]
+        return BaseModelOutputWithPooling(
+            last_hidden_state=last_hidden_state,
+            pooler_output=pooled_output,
+            hidden_states=encoder_outputs.hidden_states,
+            attentions=encoder_outputs.attentions,
+        )
+    def _build_causal_attention_mask(self, bsz, seq_len, dtype):
+        # lazily create causal attention mask, with full attention between the vision tokens
+        # pytorch uses additive attention mask; fill with -inf
+        mask = torch.empty(bsz, seq_len, seq_len, dtype=dtype)
+        mask.fill_(torch.tensor(torch.finfo(dtype).min))
+        mask.triu_(1)  # zero out the lower diagonal
+        mask = mask.unsqueeze(1)  # expand mask
+        return mask
+class EvaCLIPTextModel(EvaCLIPPreTrainedModel):
+    config_class = EvaCLIPTextConfig
+    _no_split_modules = ["EvaCLIPEncoderLayer"]
+    def __init__(self, config: EvaCLIPTextConfig):
+        super().__init__(config)
+        self.text_model = EvaCLIPTextTransformer(config)
+        # Initialize weights and apply final processing
+        self.post_init()
+    def get_input_embeddings(self) -> nn.Module:
+        return self.text_model.embeddings.token_embedding
+    def set_input_embeddings(self, value):
+        self.text_model.embeddings.token_embedding = value
+    def forward(
+        self,
+        input_ids: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[Tuple, BaseModelOutputWithPooling]:
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        return self.text_model(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+class EvaCLIPVisionTransformer(EvaCLIPPreTrainedModel):
+    def __init__(self, config: EvaCLIPVisionConfig):
+        super().__init__(config)
+        self.config = config
+        embed_dim = config.hidden_size
+        norm_layer = RMSNorm if config.use_rms_norm else nn.LayerNorm
+        self.embeddings = EvaCLIPVisionEmbeddings(config)
+        self.encoder = EvaCLIPEncoder(config)
+        self.post_layernorm = norm_layer(embed_dim, eps=config.layer_norm_eps)
+    def gradient_checkpointing_enable(self):
+        self.encoder.gradient_checkpointing = True
+    def forward(
+        self,
+        pixel_values: Optional[torch.FloatTensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[Tuple, BaseModelOutputWithPooling]:
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        if pixel_values is None:
+            raise ValueError("You have to specify pixel_values")
+        hidden_states = self.embeddings(pixel_values)
+        encoder_outputs = self.encoder(
+            inputs_embeds=hidden_states,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        last_hidden_state = encoder_outputs[0]
+        pooled_output = last_hidden_state[:, 0, :]
+        pooled_output = self.post_layernorm(pooled_output)
+        if not return_dict:
+            return (last_hidden_state, pooled_output) + encoder_outputs[1:]
+        return BaseModelOutputWithPooling(
+            last_hidden_state=last_hidden_state,
+            pooler_output=pooled_output,
+            hidden_states=encoder_outputs.hidden_states,
+            attentions=encoder_outputs.attentions,
+        )
+class EvaCLIPVisionModel(nn.Module):
+    config_class = EvaCLIPVisionConfig
+    main_input_name = "pixel_values"
+    def __init__(self, config: EvaCLIPVisionConfig):
+        super().__init__(config)
+        # super().__init__()
+        self.vision_model = EvaCLIPVisionTransformer(config)
+        # Initialize weights and apply final processing
+        self.post_init()
+    def get_input_embeddings(self) -> nn.Module:
+        return self.vision_model.embeddings.patch_embedding
+    def forward(
+        self,
+        pixel_values: Optional[torch.FloatTensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[Tuple, BaseModelOutputWithPooling]:
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        return self.vision_model(
+            pixel_values=pixel_values,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+class EvaCLIPModel(EvaCLIPPreTrainedModel):
+    config_class = EvaCLIPConfig
+    def __init__(self, config: EvaCLIPConfig):
+        super().__init__(config)
+        if not (type(config.text_config).__name__ == "EvaCLIPTextConfig"):
+            raise ValueError(
+                "config.text_config is expected to be of type EvaCLIPTextConfig but is of type"
+                f" {type(config.text_config)}."
+            )
+        if not (type(config.vision_config).__name__ == "EvaCLIPVisionConfig"):
+            raise ValueError(
+                "config.vision_config is expected to be of type EvaCLIPVisionConfig but is of type"
+                f" {type(config.vision_config)}."
+            )
+        text_config = config.text_config
+        vision_config = config.vision_config
+        self.projection_dim = config.projection_dim
+        self.text_embed_dim = text_config.hidden_size
+        self.vision_embed_dim = vision_config.hidden_size
+        self.text_model = EvaCLIPTextTransformer(text_config)
+        self.vision_model = EvaCLIPVisionTransformer(vision_config)
+        self.visual_projection = nn.Linear(self.vision_embed_dim, self.projection_dim, bias=False)
+        self.text_projection = nn.Linear(self.text_embed_dim, self.projection_dim, bias=False)
+        self.logit_scale = torch.tensor(100., requires_grad=False)
+        # Initialize weights and apply final processing
+        self.post_init()
+    def encode_text(
+        self,
+        input_ids: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> torch.FloatTensor:
+        # Use CLIP model's config for some fields (if specified) instead of those of vision & text components.
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        text_outputs = self.text_model(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        pooled_output = text_outputs[1]
+        text_features = self.text_projection(pooled_output)
+        return text_features
+    def encode_image(
+        self,
+        pixel_values: Optional[torch.FloatTensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> torch.FloatTensor:
+        # Use EvaCLIP model's config for some fields (if specified) instead of those of vision & text components.
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        vision_outputs = self.vision_model(
+            pixel_values=pixel_values,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        pooled_output = vision_outputs[1]  # pooled_output
+        image_features = self.visual_projection(pooled_output)
+        return image_features
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        pixel_values: Optional[torch.FloatTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        return_loss: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[Tuple, EvaCLIPOutput]:
+        # Use CLIP model's config for some fields (if specified) instead of those of vision & text components.
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        vision_outputs = self.vision_model(
+            pixel_values=pixel_values,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        text_outputs = self.text_model(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        image_embeds = vision_outputs[1]
+        image_embeds = self.visual_projection(image_embeds)
+        text_embeds = text_outputs[1]
+        text_embeds = self.text_projection(text_embeds)
+        # normalized features
+        image_embeds = image_embeds / image_embeds.norm(p=2, dim=-1, keepdim=True)
+        text_embeds = text_embeds / text_embeds.norm(p=2, dim=-1, keepdim=True)
+        # cosine similarity as logits
+        logit_scale = self.logit_scale.exp()
+        logits_per_text = torch.matmul(text_embeds, image_embeds.t()) * logit_scale
+        logits_per_image = logits_per_text.t()
+        loss = None
+        if return_loss:
+            loss = clip_loss(logits_per_text)
+        if not return_dict:
+            output = (logits_per_image, logits_per_text, text_embeds, image_embeds, text_outputs, vision_outputs)
+            return ((loss,) + output) if loss is not None else output
+        return EvaCLIPOutput(
+            loss=loss,
+            logits_per_image=logits_per_image,
+            logits_per_text=logits_per_text,
+            text_embeds=text_embeds,
+            image_embeds=image_embeds,
+            text_model_output=text_outputs,
+            vision_model_output=vision_outputs,
+        )
+class EvaCLIPTextModelWithProjection(EvaCLIPPreTrainedModel):
+    config_class = EvaCLIPTextConfig
+    _no_split_modules = ["EvaCLIPEncoderLayer"]
+    def __init__(self, config: EvaCLIPTextConfig):
+        super().__init__(config)
+        self.text_model = EvaCLIPTextTransformer(config)
+        self.text_projection = nn.Linear(config.hidden_size, config.projection_dim, bias=False)
+    def get_input_embeddings(self) -> nn.Module:
+        return self.text_model.embeddings.token_embedding
+    def set_input_embeddings(self, value):
+        self.text_model.embeddings.token_embedding = value
+    def forward(
+        self,
+        input_ids: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[Tuple, EvaCLIPTextModelOutput]:
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        text_outputs = self.text_model(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        pooled_output = text_outputs[1]
+        text_embeds = self.text_projection(pooled_output)
+        if not return_dict:
+            outputs = (text_embeds, text_outputs[0]) + text_outputs[2:]
+            return tuple(output for output in outputs if output is not None)
+        return EvaCLIPTextModelOutput(
+            text_embeds=text_embeds,
+            last_hidden_state=text_outputs.last_hidden_state,
+            hidden_states=text_outputs.hidden_states,
+            attentions=text_outputs.attentions,
+        )
+class EvaCLIPVisionModelWithProjection(EvaCLIPPreTrainedModel):
+    config_class = EvaCLIPVisionConfig
+    main_input_name = "pixel_values"
+    def __init__(self, config: EvaCLIPVisionConfig):
+        super().__init__(config)
+        self.vision_model = EvaCLIPVisionTransformer(config)
+        self.visual_projection = nn.Linear(config.hidden_size, config.projection_dim, bias=False)
+        # Initialize weights and apply final processing
+        self.post_init()
+    def get_input_embeddings(self) -> nn.Module:
+        return self.vision_model.embeddings.patch_embedding
+    def forward(
+        self,
+        pixel_values: Optional[torch.FloatTensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[Tuple, EvaCLIPVisionModelOutput]:
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        vision_outputs = self.vision_model(
+            pixel_values=pixel_values,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        pooled_output = vision_outputs[1]  # pooled_output
+        image_embeds = self.visual_projection(pooled_output)
+        if not return_dict:
+            outputs = (image_embeds, vision_outputs[0]) + vision_outputs[2:]
+            return tuple(output for output in outputs if output is not None)
+        return EvaCLIPVisionModelOutput(
+            image_embeds=image_embeds,
+            last_hidden_state=vision_outputs.last_hidden_state,
+            hidden_states=vision_outputs.hidden_states,
+            attentions=vision_outputs.attentions,
+        )

pytorch_model-00001-of-00004.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:2349b395cde764769d957ad2657d251f4be7888e30cb8c1be941015285c33c05
+size 9980551498

pytorch_model-00002-of-00004.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:6b9d8fd9cbfa0beacae3b36e62f815a18694650f1bc5adb92dae2cfed842109b
+size 9933660099

pytorch_model-00003-of-00004.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b2b3c9647c38f0abd79978806433b619a17ce9f6e86675a104c41ca04bf778ac
+size 9799524012

pytorch_model-00004-of-00004.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:6a4d6d58e2b2d428aac6b3896cf95631f3f7e579f60ddf17f77edb17f9b02a67
+size 3182146573

pytorch_model.bin.index.json ADDED Viewed

	@@ -0,0 +1,882 @@

+{
+  "metadata": {
+    "total_size": 32895577088
+  },
+  "weight_map": {
+    "text_model.embeddings.position_embedding.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.embeddings.token_embedding.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.0.layer_norm1.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.0.layer_norm1.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.0.layer_norm2.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.0.layer_norm2.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.0.mlp.fc1.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.0.mlp.fc1.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.0.mlp.fc2.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.0.mlp.fc2.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.0.self_attn.k_proj.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.0.self_attn.k_proj.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.0.self_attn.out_proj.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.0.self_attn.out_proj.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.0.self_attn.q_proj.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.0.self_attn.q_proj.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.0.self_attn.v_proj.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.0.self_attn.v_proj.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.1.layer_norm1.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.1.layer_norm1.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.1.layer_norm2.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.1.layer_norm2.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.1.mlp.fc1.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.1.mlp.fc1.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.1.mlp.fc2.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.1.mlp.fc2.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.1.self_attn.k_proj.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.1.self_attn.k_proj.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.1.self_attn.out_proj.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.1.self_attn.out_proj.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.1.self_attn.q_proj.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.1.self_attn.q_proj.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.1.self_attn.v_proj.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.1.self_attn.v_proj.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.10.layer_norm1.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.10.layer_norm1.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.10.layer_norm2.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.10.layer_norm2.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.10.mlp.fc1.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.10.mlp.fc1.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.10.mlp.fc2.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.10.mlp.fc2.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.10.self_attn.k_proj.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.10.self_attn.k_proj.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.10.self_attn.out_proj.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.10.self_attn.out_proj.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.10.self_attn.q_proj.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.10.self_attn.q_proj.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.10.self_attn.v_proj.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.10.self_attn.v_proj.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.11.layer_norm1.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.11.layer_norm1.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.11.layer_norm2.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.11.layer_norm2.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.11.mlp.fc1.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.11.mlp.fc1.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.11.mlp.fc2.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.11.mlp.fc2.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.11.self_attn.k_proj.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.11.self_attn.k_proj.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.11.self_attn.out_proj.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.11.self_attn.out_proj.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.11.self_attn.q_proj.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.11.self_attn.q_proj.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.11.self_attn.v_proj.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.11.self_attn.v_proj.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.12.layer_norm1.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.12.layer_norm1.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.12.layer_norm2.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.12.layer_norm2.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.12.mlp.fc1.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.12.mlp.fc1.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.12.mlp.fc2.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.12.mlp.fc2.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.12.self_attn.k_proj.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.12.self_attn.k_proj.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.12.self_attn.out_proj.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.12.self_attn.out_proj.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.12.self_attn.q_proj.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.12.self_attn.q_proj.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.12.self_attn.v_proj.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.12.self_attn.v_proj.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.13.layer_norm1.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.13.layer_norm1.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.13.layer_norm2.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.13.layer_norm2.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.13.mlp.fc1.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.13.mlp.fc1.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.13.mlp.fc2.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.13.mlp.fc2.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.13.self_attn.k_proj.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.13.self_attn.k_proj.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.13.self_attn.out_proj.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.13.self_attn.out_proj.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.13.self_attn.q_proj.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.13.self_attn.q_proj.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.13.self_attn.v_proj.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.13.self_attn.v_proj.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.14.layer_norm1.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.14.layer_norm1.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.14.layer_norm2.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.14.layer_norm2.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.14.mlp.fc1.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.14.mlp.fc1.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.14.mlp.fc2.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.14.mlp.fc2.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.14.self_attn.k_proj.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.14.self_attn.k_proj.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.14.self_attn.out_proj.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.14.self_attn.out_proj.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.14.self_attn.q_proj.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.14.self_attn.q_proj.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.14.self_attn.v_proj.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.14.self_attn.v_proj.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.15.layer_norm1.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.15.layer_norm1.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.15.layer_norm2.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.15.layer_norm2.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.15.mlp.fc1.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.15.mlp.fc1.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.15.mlp.fc2.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.15.mlp.fc2.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.15.self_attn.k_proj.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.15.self_attn.k_proj.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.15.self_attn.out_proj.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.15.self_attn.out_proj.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.15.self_attn.q_proj.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.15.self_attn.q_proj.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.15.self_attn.v_proj.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.15.self_attn.v_proj.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.16.layer_norm1.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.16.layer_norm1.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.16.layer_norm2.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.16.layer_norm2.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.16.mlp.fc1.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.16.mlp.fc1.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.16.mlp.fc2.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.16.mlp.fc2.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.16.self_attn.k_proj.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.16.self_attn.k_proj.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.16.self_attn.out_proj.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.16.self_attn.out_proj.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.16.self_attn.q_proj.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.16.self_attn.q_proj.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.16.self_attn.v_proj.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.16.self_attn.v_proj.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.17.layer_norm1.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.17.layer_norm1.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.17.layer_norm2.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.17.layer_norm2.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.17.mlp.fc1.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.17.mlp.fc1.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.17.mlp.fc2.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.17.mlp.fc2.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.17.self_attn.k_proj.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.17.self_attn.k_proj.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.17.self_attn.out_proj.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.17.self_attn.out_proj.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.17.self_attn.q_proj.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.17.self_attn.q_proj.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.17.self_attn.v_proj.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.17.self_attn.v_proj.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.18.layer_norm1.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.18.layer_norm1.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.18.layer_norm2.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.18.layer_norm2.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.18.mlp.fc1.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.18.mlp.fc1.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.18.mlp.fc2.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.18.mlp.fc2.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.18.self_attn.k_proj.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.18.self_attn.k_proj.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.18.self_attn.out_proj.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.18.self_attn.out_proj.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.18.self_attn.q_proj.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.18.self_attn.q_proj.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.18.self_attn.v_proj.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.18.self_attn.v_proj.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.19.layer_norm1.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.19.layer_norm1.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.19.layer_norm2.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.19.layer_norm2.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.19.mlp.fc1.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.19.mlp.fc1.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.19.mlp.fc2.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.19.mlp.fc2.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.19.self_attn.k_proj.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.19.self_attn.k_proj.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.19.self_attn.out_proj.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.19.self_attn.out_proj.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.19.self_attn.q_proj.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.19.self_attn.q_proj.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.19.self_attn.v_proj.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.19.self_attn.v_proj.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.2.layer_norm1.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.2.layer_norm1.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.2.layer_norm2.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.2.layer_norm2.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.2.mlp.fc1.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.2.mlp.fc1.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.2.mlp.fc2.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.2.mlp.fc2.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.2.self_attn.k_proj.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.2.self_attn.k_proj.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.2.self_attn.out_proj.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.2.self_attn.out_proj.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.2.self_attn.q_proj.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.2.self_attn.q_proj.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.2.self_attn.v_proj.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.2.self_attn.v_proj.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.20.layer_norm1.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.20.layer_norm1.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.20.layer_norm2.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.20.layer_norm2.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.20.mlp.fc1.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.20.mlp.fc1.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.20.mlp.fc2.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.20.mlp.fc2.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.20.self_attn.k_proj.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.20.self_attn.k_proj.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.20.self_attn.out_proj.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.20.self_attn.out_proj.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.20.self_attn.q_proj.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.20.self_attn.q_proj.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.20.self_attn.v_proj.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.20.self_attn.v_proj.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.21.layer_norm1.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.21.layer_norm1.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.21.layer_norm2.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.21.layer_norm2.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.21.mlp.fc1.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.21.mlp.fc1.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.21.mlp.fc2.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.21.mlp.fc2.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.21.self_attn.k_proj.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.21.self_attn.k_proj.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.21.self_attn.out_proj.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.21.self_attn.out_proj.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.21.self_attn.q_proj.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.21.self_attn.q_proj.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.21.self_attn.v_proj.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.21.self_attn.v_proj.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.22.layer_norm1.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.22.layer_norm1.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.22.layer_norm2.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.22.layer_norm2.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.22.mlp.fc1.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.22.mlp.fc1.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.22.mlp.fc2.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.22.mlp.fc2.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.22.self_attn.k_proj.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.22.self_attn.k_proj.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.22.self_attn.out_proj.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.22.self_attn.out_proj.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.22.self_attn.q_proj.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.22.self_attn.q_proj.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.22.self_attn.v_proj.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.22.self_attn.v_proj.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.23.layer_norm1.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.23.layer_norm1.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.23.layer_norm2.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.23.layer_norm2.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.23.mlp.fc1.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.23.mlp.fc1.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.23.mlp.fc2.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.23.mlp.fc2.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.23.self_attn.k_proj.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.23.self_attn.k_proj.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.23.self_attn.out_proj.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.23.self_attn.out_proj.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.23.self_attn.q_proj.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.23.self_attn.q_proj.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.23.self_attn.v_proj.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.23.self_attn.v_proj.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.24.layer_norm1.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.24.layer_norm1.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.24.layer_norm2.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.24.layer_norm2.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.24.mlp.fc1.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.24.mlp.fc1.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.24.mlp.fc2.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.24.mlp.fc2.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.24.self_attn.k_proj.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.24.self_attn.k_proj.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.24.self_attn.out_proj.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.24.self_attn.out_proj.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.24.self_attn.q_proj.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.24.self_attn.q_proj.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.24.self_attn.v_proj.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.24.self_attn.v_proj.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.25.layer_norm1.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.25.layer_norm1.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.25.layer_norm2.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.25.layer_norm2.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.25.mlp.fc1.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.25.mlp.fc1.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.25.mlp.fc2.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.25.mlp.fc2.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.25.self_attn.k_proj.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.25.self_attn.k_proj.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.25.self_attn.out_proj.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.25.self_attn.out_proj.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.25.self_attn.q_proj.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.25.self_attn.q_proj.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.25.self_attn.v_proj.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.25.self_attn.v_proj.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.26.layer_norm1.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.26.layer_norm1.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.26.layer_norm2.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.26.layer_norm2.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.26.mlp.fc1.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.26.mlp.fc1.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.26.mlp.fc2.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.26.mlp.fc2.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.26.self_attn.k_proj.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.26.self_attn.k_proj.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.26.self_attn.out_proj.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.26.self_attn.out_proj.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.26.self_attn.q_proj.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.26.self_attn.q_proj.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.26.self_attn.v_proj.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.26.self_attn.v_proj.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.27.layer_norm1.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.27.layer_norm1.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.27.layer_norm2.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.27.layer_norm2.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.27.mlp.fc1.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.27.mlp.fc1.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.27.mlp.fc2.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.27.mlp.fc2.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.27.self_attn.k_proj.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.27.self_attn.k_proj.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.27.self_attn.out_proj.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.27.self_attn.out_proj.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.27.self_attn.q_proj.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.27.self_attn.q_proj.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.27.self_attn.v_proj.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.27.self_attn.v_proj.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.28.layer_norm1.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.28.layer_norm1.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.28.layer_norm2.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.28.layer_norm2.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.28.mlp.fc1.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.28.mlp.fc1.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.28.mlp.fc2.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.28.mlp.fc2.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.28.self_attn.k_proj.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.28.self_attn.k_proj.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.28.self_attn.out_proj.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.28.self_attn.out_proj.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.28.self_attn.q_proj.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.28.self_attn.q_proj.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.28.self_attn.v_proj.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.28.self_attn.v_proj.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.29.layer_norm1.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.29.layer_norm1.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.29.layer_norm2.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.29.layer_norm2.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.29.mlp.fc1.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.29.mlp.fc1.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.29.mlp.fc2.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.29.mlp.fc2.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.29.self_attn.k_proj.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.29.self_attn.k_proj.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.29.self_attn.out_proj.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.29.self_attn.out_proj.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.29.self_attn.q_proj.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.29.self_attn.q_proj.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.29.self_attn.v_proj.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.29.self_attn.v_proj.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.3.layer_norm1.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.3.layer_norm1.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.3.layer_norm2.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.3.layer_norm2.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.3.mlp.fc1.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.3.mlp.fc1.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.3.mlp.fc2.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.3.mlp.fc2.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.3.self_attn.k_proj.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.3.self_attn.k_proj.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.3.self_attn.out_proj.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.3.self_attn.out_proj.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.3.self_attn.q_proj.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.3.self_attn.q_proj.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.3.self_attn.v_proj.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.3.self_attn.v_proj.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.30.layer_norm1.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.30.layer_norm1.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.30.layer_norm2.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.30.layer_norm2.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.30.mlp.fc1.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.30.mlp.fc1.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.30.mlp.fc2.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.30.mlp.fc2.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.30.self_attn.k_proj.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.30.self_attn.k_proj.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.30.self_attn.out_proj.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.30.self_attn.out_proj.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.30.self_attn.q_proj.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.30.self_attn.q_proj.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.30.self_attn.v_proj.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.30.self_attn.v_proj.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.31.layer_norm1.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.31.layer_norm1.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.31.layer_norm2.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.31.layer_norm2.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.31.mlp.fc1.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.31.mlp.fc1.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.31.mlp.fc2.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.31.mlp.fc2.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.31.self_attn.k_proj.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.31.self_attn.k_proj.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.31.self_attn.out_proj.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.31.self_attn.out_proj.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.31.self_attn.q_proj.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.31.self_attn.q_proj.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.31.self_attn.v_proj.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.31.self_attn.v_proj.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.4.layer_norm1.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.4.layer_norm1.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.4.layer_norm2.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.4.layer_norm2.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.4.mlp.fc1.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.4.mlp.fc1.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.4.mlp.fc2.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.4.mlp.fc2.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.4.self_attn.k_proj.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.4.self_attn.k_proj.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.4.self_attn.out_proj.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.4.self_attn.out_proj.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.4.self_attn.q_proj.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.4.self_attn.q_proj.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.4.self_attn.v_proj.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.4.self_attn.v_proj.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.5.layer_norm1.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.5.layer_norm1.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.5.layer_norm2.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.5.layer_norm2.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.5.mlp.fc1.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.5.mlp.fc1.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.5.mlp.fc2.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.5.mlp.fc2.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.5.self_attn.k_proj.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.5.self_attn.k_proj.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.5.self_attn.out_proj.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.5.self_attn.out_proj.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.5.self_attn.q_proj.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.5.self_attn.q_proj.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.5.self_attn.v_proj.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.5.self_attn.v_proj.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.6.layer_norm1.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.6.layer_norm1.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.6.layer_norm2.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.6.layer_norm2.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.6.mlp.fc1.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.6.mlp.fc1.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.6.mlp.fc2.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.6.mlp.fc2.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.6.self_attn.k_proj.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.6.self_attn.k_proj.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.6.self_attn.out_proj.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.6.self_attn.out_proj.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.6.self_attn.q_proj.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.6.self_attn.q_proj.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.6.self_attn.v_proj.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.6.self_attn.v_proj.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.7.layer_norm1.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.7.layer_norm1.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.7.layer_norm2.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.7.layer_norm2.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.7.mlp.fc1.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.7.mlp.fc1.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.7.mlp.fc2.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.7.mlp.fc2.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.7.self_attn.k_proj.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.7.self_attn.k_proj.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.7.self_attn.out_proj.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.7.self_attn.out_proj.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.7.self_attn.q_proj.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.7.self_attn.q_proj.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.7.self_attn.v_proj.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.7.self_attn.v_proj.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.8.layer_norm1.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.8.layer_norm1.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.8.layer_norm2.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.8.layer_norm2.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.8.mlp.fc1.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.8.mlp.fc1.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.8.mlp.fc2.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.8.mlp.fc2.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.8.self_attn.k_proj.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.8.self_attn.k_proj.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.8.self_attn.out_proj.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.8.self_attn.out_proj.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.8.self_attn.q_proj.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.8.self_attn.q_proj.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.8.self_attn.v_proj.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.8.self_attn.v_proj.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.9.layer_norm1.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.9.layer_norm1.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.9.layer_norm2.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.9.layer_norm2.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.9.mlp.fc1.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.9.mlp.fc1.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.9.mlp.fc2.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.9.mlp.fc2.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.9.self_attn.k_proj.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.9.self_attn.k_proj.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.9.self_attn.out_proj.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.9.self_attn.out_proj.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.9.self_attn.q_proj.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.9.self_attn.q_proj.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.9.self_attn.v_proj.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.encoder.layers.9.self_attn.v_proj.weight": "pytorch_model-00001-of-00004.bin",
+    "text_model.final_layer_norm.bias": "pytorch_model-00001-of-00004.bin",
+    "text_model.final_layer_norm.weight": "pytorch_model-00001-of-00004.bin",
+    "text_projection.weight": "pytorch_model-00004-of-00004.bin",
+    "vision_model.embeddings.class_embedding": "pytorch_model-00001-of-00004.bin",
+    "vision_model.embeddings.patch_embedding.bias": "pytorch_model-00001-of-00004.bin",
+    "vision_model.embeddings.patch_embedding.weight": "pytorch_model-00001-of-00004.bin",
+    "vision_model.embeddings.position_embedding.weight": "pytorch_model-00001-of-00004.bin",
+    "vision_model.encoder.layers.0.layer_norm1.weight": "pytorch_model-00001-of-00004.bin",
+    "vision_model.encoder.layers.0.layer_norm2.weight": "pytorch_model-00001-of-00004.bin",
+    "vision_model.encoder.layers.0.mlp.fc1.bias": "pytorch_model-00001-of-00004.bin",
+    "vision_model.encoder.layers.0.mlp.fc1.weight": "pytorch_model-00001-of-00004.bin",
+    "vision_model.encoder.layers.0.mlp.fc2.bias": "pytorch_model-00001-of-00004.bin",
+    "vision_model.encoder.layers.0.mlp.fc2.weight": "pytorch_model-00001-of-00004.bin",
+    "vision_model.encoder.layers.0.self_attn.k_proj.weight": "pytorch_model-00001-of-00004.bin",
+    "vision_model.encoder.layers.0.self_attn.out_proj.bias": "pytorch_model-00001-of-00004.bin",
+    "vision_model.encoder.layers.0.self_attn.out_proj.weight": "pytorch_model-00001-of-00004.bin",
+    "vision_model.encoder.layers.0.self_attn.q_proj.weight": "pytorch_model-00001-of-00004.bin",
+    "vision_model.encoder.layers.0.self_attn.v_proj.weight": "pytorch_model-00001-of-00004.bin",
+    "vision_model.encoder.layers.1.layer_norm1.weight": "pytorch_model-00001-of-00004.bin",
+    "vision_model.encoder.layers.1.layer_norm2.weight": "pytorch_model-00001-of-00004.bin",
+    "vision_model.encoder.layers.1.mlp.fc1.bias": "pytorch_model-00001-of-00004.bin",
+    "vision_model.encoder.layers.1.mlp.fc1.weight": "pytorch_model-00001-of-00004.bin",
+    "vision_model.encoder.layers.1.mlp.fc2.bias": "pytorch_model-00001-of-00004.bin",
+    "vision_model.encoder.layers.1.mlp.fc2.weight": "pytorch_model-00001-of-00004.bin",
+    "vision_model.encoder.layers.1.self_attn.k_proj.weight": "pytorch_model-00001-of-00004.bin",
+    "vision_model.encoder.layers.1.self_attn.out_proj.bias": "pytorch_model-00001-of-00004.bin",
+    "vision_model.encoder.layers.1.self_attn.out_proj.weight": "pytorch_model-00001-of-00004.bin",
+    "vision_model.encoder.layers.1.self_attn.q_proj.weight": "pytorch_model-00001-of-00004.bin",
+    "vision_model.encoder.layers.1.self_attn.v_proj.weight": "pytorch_model-00001-of-00004.bin",
+    "vision_model.encoder.layers.10.layer_norm1.weight": "pytorch_model-00002-of-00004.bin",
+    "vision_model.encoder.layers.10.layer_norm2.weight": "pytorch_model-00002-of-00004.bin",
+    "vision_model.encoder.layers.10.mlp.fc1.bias": "pytorch_model-00002-of-00004.bin",
+    "vision_model.encoder.layers.10.mlp.fc1.weight": "pytorch_model-00002-of-00004.bin",
+    "vision_model.encoder.layers.10.mlp.fc2.bias": "pytorch_model-00002-of-00004.bin",
+    "vision_model.encoder.layers.10.mlp.fc2.weight": "pytorch_model-00002-of-00004.bin",
+    "vision_model.encoder.layers.10.self_attn.k_proj.weight": "pytorch_model-00002-of-00004.bin",
+    "vision_model.encoder.layers.10.self_attn.out_proj.bias": "pytorch_model-00002-of-00004.bin",
+    "vision_model.encoder.layers.10.self_attn.out_proj.weight": "pytorch_model-00002-of-00004.bin",
+    "vision_model.encoder.layers.10.self_attn.q_proj.weight": "pytorch_model-00002-of-00004.bin",
+    "vision_model.encoder.layers.10.self_attn.v_proj.weight": "pytorch_model-00002-of-00004.bin",
+    "vision_model.encoder.layers.11.layer_norm1.weight": "pytorch_model-00002-of-00004.bin",
+    "vision_model.encoder.layers.11.layer_norm2.weight": "pytorch_model-00002-of-00004.bin",
+    "vision_model.encoder.layers.11.mlp.fc1.bias": "pytorch_model-00002-of-00004.bin",
+    "vision_model.encoder.layers.11.mlp.fc1.weight": "pytorch_model-00002-of-00004.bin",
+    "vision_model.encoder.layers.11.mlp.fc2.bias": "pytorch_model-00002-of-00004.bin",
+    "vision_model.encoder.layers.11.mlp.fc2.weight": "pytorch_model-00002-of-00004.bin",
+    "vision_model.encoder.layers.11.self_attn.k_proj.weight": "pytorch_model-00002-of-00004.bin",
+    "vision_model.encoder.layers.11.self_attn.out_proj.bias": "pytorch_model-00002-of-00004.bin",
+    "vision_model.encoder.layers.11.self_attn.out_proj.weight": "pytorch_model-00002-of-00004.bin",
+    "vision_model.encoder.layers.11.self_attn.q_proj.weight": "pytorch_model-00002-of-00004.bin",
+    "vision_model.encoder.layers.11.self_attn.v_proj.weight": "pytorch_model-00002-of-00004.bin",
+    "vision_model.encoder.layers.12.layer_norm1.weight": "pytorch_model-00002-of-00004.bin",
+    "vision_model.encoder.layers.12.layer_norm2.weight": "pytorch_model-00002-of-00004.bin",
+    "vision_model.encoder.layers.12.mlp.fc1.bias": "pytorch_model-00002-of-00004.bin",
+    "vision_model.encoder.layers.12.mlp.fc1.weight": "pytorch_model-00002-of-00004.bin",
+    "vision_model.encoder.layers.12.mlp.fc2.bias": "pytorch_model-00002-of-00004.bin",
+    "vision_model.encoder.layers.12.mlp.fc2.weight": "pytorch_model-00002-of-00004.bin",
+    "vision_model.encoder.layers.12.self_attn.k_proj.weight": "pytorch_model-00002-of-00004.bin",
+    "vision_model.encoder.layers.12.self_attn.out_proj.bias": "pytorch_model-00002-of-00004.bin",
+    "vision_model.encoder.layers.12.self_attn.out_proj.weight": "pytorch_model-00002-of-00004.bin",
+    "vision_model.encoder.layers.12.self_attn.q_proj.weight": "pytorch_model-00002-of-00004.bin",
+    "vision_model.encoder.layers.12.self_attn.v_proj.weight": "pytorch_model-00002-of-00004.bin",
+    "vision_model.encoder.layers.13.layer_norm1.weight": "pytorch_model-00002-of-00004.bin",
+    "vision_model.encoder.layers.13.layer_norm2.weight": "pytorch_model-00002-of-00004.bin",
+    "vision_model.encoder.layers.13.mlp.fc1.bias": "pytorch_model-00002-of-00004.bin",
+    "vision_model.encoder.layers.13.mlp.fc1.weight": "pytorch_model-00002-of-00004.bin",
+    "vision_model.encoder.layers.13.mlp.fc2.bias": "pytorch_model-00002-of-00004.bin",
+    "vision_model.encoder.layers.13.mlp.fc2.weight": "pytorch_model-00002-of-00004.bin",
+    "vision_model.encoder.layers.13.self_attn.k_proj.weight": "pytorch_model-00002-of-00004.bin",
+    "vision_model.encoder.layers.13.self_attn.out_proj.bias": "pytorch_model-00002-of-00004.bin",
+    "vision_model.encoder.layers.13.self_attn.out_proj.weight": "pytorch_model-00002-of-00004.bin",
+    "vision_model.encoder.layers.13.self_attn.q_proj.weight": "pytorch_model-00002-of-00004.bin",
+    "vision_model.encoder.layers.13.self_attn.v_proj.weight": "pytorch_model-00002-of-00004.bin",
+    "vision_model.encoder.layers.14.layer_norm1.weight": "pytorch_model-00002-of-00004.bin",
+    "vision_model.encoder.layers.14.layer_norm2.weight": "pytorch_model-00002-of-00004.bin",
+    "vision_model.encoder.layers.14.mlp.fc1.bias": "pytorch_model-00002-of-00004.bin",
+    "vision_model.encoder.layers.14.mlp.fc1.weight": "pytorch_model-00002-of-00004.bin",
+    "vision_model.encoder.layers.14.mlp.fc2.bias": "pytorch_model-00002-of-00004.bin",
+    "vision_model.encoder.layers.14.mlp.fc2.weight": "pytorch_model-00002-of-00004.bin",
+    "vision_model.encoder.layers.14.self_attn.k_proj.weight": "pytorch_model-00002-of-00004.bin",
+    "vision_model.encoder.layers.14.self_attn.out_proj.bias": "pytorch_model-00002-of-00004.bin",
+    "vision_model.encoder.layers.14.self_attn.out_proj.weight": "pytorch_model-00002-of-00004.bin",
+    "vision_model.encoder.layers.14.self_attn.q_proj.weight": "pytorch_model-00002-of-00004.bin",
+    "vision_model.encoder.layers.14.self_attn.v_proj.weight": "pytorch_model-00002-of-00004.bin",
+    "vision_model.encoder.layers.15.layer_norm1.weight": "pytorch_model-00002-of-00004.bin",
+    "vision_model.encoder.layers.15.layer_norm2.weight": "pytorch_model-00002-of-00004.bin",
+    "vision_model.encoder.layers.15.mlp.fc1.bias": "pytorch_model-00002-of-00004.bin",
+    "vision_model.encoder.layers.15.mlp.fc1.weight": "pytorch_model-00002-of-00004.bin",
+    "vision_model.encoder.layers.15.mlp.fc2.bias": "pytorch_model-00002-of-00004.bin",
+    "vision_model.encoder.layers.15.mlp.fc2.weight": "pytorch_model-00002-of-00004.bin",
+    "vision_model.encoder.layers.15.self_attn.k_proj.weight": "pytorch_model-00002-of-00004.bin",
+    "vision_model.encoder.layers.15.self_attn.out_proj.bias": "pytorch_model-00002-of-00004.bin",
+    "vision_model.encoder.layers.15.self_attn.out_proj.weight": "pytorch_model-00002-of-00004.bin",
+    "vision_model.encoder.layers.15.self_attn.q_proj.weight": "pytorch_model-00002-of-00004.bin",
+    "vision_model.encoder.layers.15.self_attn.v_proj.weight": "pytorch_model-00002-of-00004.bin",
+    "vision_model.encoder.layers.16.layer_norm1.weight": "pytorch_model-00002-of-00004.bin",
+    "vision_model.encoder.layers.16.layer_norm2.weight": "pytorch_model-00002-of-00004.bin",
+    "vision_model.encoder.layers.16.mlp.fc1.bias": "pytorch_model-00002-of-00004.bin",
+    "vision_model.encoder.layers.16.mlp.fc1.weight": "pytorch_model-00002-of-00004.bin",
+    "vision_model.encoder.layers.16.mlp.fc2.bias": "pytorch_model-00002-of-00004.bin",
+    "vision_model.encoder.layers.16.mlp.fc2.weight": "pytorch_model-00002-of-00004.bin",
+    "vision_model.encoder.layers.16.self_attn.k_proj.weight": "pytorch_model-00002-of-00004.bin",
+    "vision_model.encoder.layers.16.self_attn.out_proj.bias": "pytorch_model-00002-of-00004.bin",
+    "vision_model.encoder.layers.16.self_attn.out_proj.weight": "pytorch_model-00002-of-00004.bin",
+    "vision_model.encoder.layers.16.self_attn.q_proj.weight": "pytorch_model-00002-of-00004.bin",
+    "vision_model.encoder.layers.16.self_attn.v_proj.weight": "pytorch_model-00002-of-00004.bin",
+    "vision_model.encoder.layers.17.layer_norm1.weight": "pytorch_model-00002-of-00004.bin",
+    "vision_model.encoder.layers.17.layer_norm2.weight": "pytorch_model-00002-of-00004.bin",
+    "vision_model.encoder.layers.17.mlp.fc1.bias": "pytorch_model-00002-of-00004.bin",
+    "vision_model.encoder.layers.17.mlp.fc1.weight": "pytorch_model-00002-of-00004.bin",
+    "vision_model.encoder.layers.17.mlp.fc2.bias": "pytorch_model-00002-of-00004.bin",
+    "vision_model.encoder.layers.17.mlp.fc2.weight": "pytorch_model-00002-of-00004.bin",
+    "vision_model.encoder.layers.17.self_attn.k_proj.weight": "pytorch_model-00002-of-00004.bin",
+    "vision_model.encoder.layers.17.self_attn.out_proj.bias": "pytorch_model-00002-of-00004.bin",
+    "vision_model.encoder.layers.17.self_attn.out_proj.weight": "pytorch_model-00002-of-00004.bin",
+    "vision_model.encoder.layers.17.self_attn.q_proj.weight": "pytorch_model-00002-of-00004.bin",
+    "vision_model.encoder.layers.17.self_attn.v_proj.weight": "pytorch_model-00002-of-00004.bin",
+    "vision_model.encoder.layers.18.layer_norm1.weight": "pytorch_model-00003-of-00004.bin",
+    "vision_model.encoder.layers.18.layer_norm2.weight": "pytorch_model-00003-of-00004.bin",
+    "vision_model.encoder.layers.18.mlp.fc1.bias": "pytorch_model-00003-of-00004.bin",
+    "vision_model.encoder.layers.18.mlp.fc1.weight": "pytorch_model-00003-of-00004.bin",
+    "vision_model.encoder.layers.18.mlp.fc2.bias": "pytorch_model-00003-of-00004.bin",
+    "vision_model.encoder.layers.18.mlp.fc2.weight": "pytorch_model-00003-of-00004.bin",
+    "vision_model.encoder.layers.18.self_attn.k_proj.weight": "pytorch_model-00002-of-00004.bin",
+    "vision_model.encoder.layers.18.self_attn.out_proj.bias": "pytorch_model-00003-of-00004.bin",
+    "vision_model.encoder.layers.18.self_attn.out_proj.weight": "pytorch_model-00003-of-00004.bin",
+    "vision_model.encoder.layers.18.self_attn.q_proj.weight": "pytorch_model-00002-of-00004.bin",
+    "vision_model.encoder.layers.18.self_attn.v_proj.weight": "pytorch_model-00002-of-00004.bin",
+    "vision_model.encoder.layers.19.layer_norm1.weight": "pytorch_model-00003-of-00004.bin",
+    "vision_model.encoder.layers.19.layer_norm2.weight": "pytorch_model-00003-of-00004.bin",
+    "vision_model.encoder.layers.19.mlp.fc1.bias": "pytorch_model-00003-of-00004.bin",
+    "vision_model.encoder.layers.19.mlp.fc1.weight": "pytorch_model-00003-of-00004.bin",
+    "vision_model.encoder.layers.19.mlp.fc2.bias": "pytorch_model-00003-of-00004.bin",
+    "vision_model.encoder.layers.19.mlp.fc2.weight": "pytorch_model-00003-of-00004.bin",
+    "vision_model.encoder.layers.19.self_attn.k_proj.weight": "pytorch_model-00003-of-00004.bin",
+    "vision_model.encoder.layers.19.self_attn.out_proj.bias": "pytorch_model-00003-of-00004.bin",
+    "vision_model.encoder.layers.19.self_attn.out_proj.weight": "pytorch_model-00003-of-00004.bin",
+    "vision_model.encoder.layers.19.self_attn.q_proj.weight": "pytorch_model-00003-of-00004.bin",
+    "vision_model.encoder.layers.19.self_attn.v_proj.weight": "pytorch_model-00003-of-00004.bin",
+    "vision_model.encoder.layers.2.layer_norm1.weight": "pytorch_model-00001-of-00004.bin",
+    "vision_model.encoder.layers.2.layer_norm2.weight": "pytorch_model-00001-of-00004.bin",
+    "vision_model.encoder.layers.2.mlp.fc1.bias": "pytorch_model-00001-of-00004.bin",
+    "vision_model.encoder.layers.2.mlp.fc1.weight": "pytorch_model-00001-of-00004.bin",
+    "vision_model.encoder.layers.2.mlp.fc2.bias": "pytorch_model-00001-of-00004.bin",
+    "vision_model.encoder.layers.2.mlp.fc2.weight": "pytorch_model-00001-of-00004.bin",
+    "vision_model.encoder.layers.2.self_attn.k_proj.weight": "pytorch_model-00001-of-00004.bin",
+    "vision_model.encoder.layers.2.self_attn.out_proj.bias": "pytorch_model-00001-of-00004.bin",
+    "vision_model.encoder.layers.2.self_attn.out_proj.weight": "pytorch_model-00001-of-00004.bin",
+    "vision_model.encoder.layers.2.self_attn.q_proj.weight": "pytorch_model-00001-of-00004.bin",
+    "vision_model.encoder.layers.2.self_attn.v_proj.weight": "pytorch_model-00001-of-00004.bin",
+    "vision_model.encoder.layers.20.layer_norm1.weight": "pytorch_model-00003-of-00004.bin",
+    "vision_model.encoder.layers.20.layer_norm2.weight": "pytorch_model-00003-of-00004.bin",
+    "vision_model.encoder.layers.20.mlp.fc1.bias": "pytorch_model-00003-of-00004.bin",
+    "vision_model.encoder.layers.20.mlp.fc1.weight": "pytorch_model-00003-of-00004.bin",
+    "vision_model.encoder.layers.20.mlp.fc2.bias": "pytorch_model-00003-of-00004.bin",
+    "vision_model.encoder.layers.20.mlp.fc2.weight": "pytorch_model-00003-of-00004.bin",
+    "vision_model.encoder.layers.20.self_attn.k_proj.weight": "pytorch_model-00003-of-00004.bin",
+    "vision_model.encoder.layers.20.self_attn.out_proj.bias": "pytorch_model-00003-of-00004.bin",
+    "vision_model.encoder.layers.20.self_attn.out_proj.weight": "pytorch_model-00003-of-00004.bin",
+    "vision_model.encoder.layers.20.self_attn.q_proj.weight": "pytorch_model-00003-of-00004.bin",
+    "vision_model.encoder.layers.20.self_attn.v_proj.weight": "pytorch_model-00003-of-00004.bin",
+    "vision_model.encoder.layers.21.layer_norm1.weight": "pytorch_model-00003-of-00004.bin",
+    "vision_model.encoder.layers.21.layer_norm2.weight": "pytorch_model-00003-of-00004.bin",
+    "vision_model.encoder.layers.21.mlp.fc1.bias": "pytorch_model-00003-of-00004.bin",
+    "vision_model.encoder.layers.21.mlp.fc1.weight": "pytorch_model-00003-of-00004.bin",
+    "vision_model.encoder.layers.21.mlp.fc2.bias": "pytorch_model-00003-of-00004.bin",
+    "vision_model.encoder.layers.21.mlp.fc2.weight": "pytorch_model-00003-of-00004.bin",
+    "vision_model.encoder.layers.21.self_attn.k_proj.weight": "pytorch_model-00003-of-00004.bin",
+    "vision_model.encoder.layers.21.self_attn.out_proj.bias": "pytorch_model-00003-of-00004.bin",
+    "vision_model.encoder.layers.21.self_attn.out_proj.weight": "pytorch_model-00003-of-00004.bin",
+    "vision_model.encoder.layers.21.self_attn.q_proj.weight": "pytorch_model-00003-of-00004.bin",
+    "vision_model.encoder.layers.21.self_attn.v_proj.weight": "pytorch_model-00003-of-00004.bin",
+    "vision_model.encoder.layers.22.layer_norm1.weight": "pytorch_model-00003-of-00004.bin",
+    "vision_model.encoder.layers.22.layer_norm2.weight": "pytorch_model-00003-of-00004.bin",
+    "vision_model.encoder.layers.22.mlp.fc1.bias": "pytorch_model-00003-of-00004.bin",
+    "vision_model.encoder.layers.22.mlp.fc1.weight": "pytorch_model-00003-of-00004.bin",
+    "vision_model.encoder.layers.22.mlp.fc2.bias": "pytorch_model-00003-of-00004.bin",
+    "vision_model.encoder.layers.22.mlp.fc2.weight": "pytorch_model-00003-of-00004.bin",
+    "vision_model.encoder.layers.22.self_attn.k_proj.weight": "pytorch_model-00003-of-00004.bin",
+    "vision_model.encoder.layers.22.self_attn.out_proj.bias": "pytorch_model-00003-of-00004.bin",
+    "vision_model.encoder.layers.22.self_attn.out_proj.weight": "pytorch_model-00003-of-00004.bin",
+    "vision_model.encoder.layers.22.self_attn.q_proj.weight": "pytorch_model-00003-of-00004.bin",
+    "vision_model.encoder.layers.22.self_attn.v_proj.weight": "pytorch_model-00003-of-00004.bin",
+    "vision_model.encoder.layers.23.layer_norm1.weight": "pytorch_model-00003-of-00004.bin",
+    "vision_model.encoder.layers.23.layer_norm2.weight": "pytorch_model-00003-of-00004.bin",
+    "vision_model.encoder.layers.23.mlp.fc1.bias": "pytorch_model-00003-of-00004.bin",
+    "vision_model.encoder.layers.23.mlp.fc1.weight": "pytorch_model-00003-of-00004.bin",
+    "vision_model.encoder.layers.23.mlp.fc2.bias": "pytorch_model-00003-of-00004.bin",
+    "vision_model.encoder.layers.23.mlp.fc2.weight": "pytorch_model-00003-of-00004.bin",
+    "vision_model.encoder.layers.23.self_attn.k_proj.weight": "pytorch_model-00003-of-00004.bin",
+    "vision_model.encoder.layers.23.self_attn.out_proj.bias": "pytorch_model-00003-of-00004.bin",
+    "vision_model.encoder.layers.23.self_attn.out_proj.weight": "pytorch_model-00003-of-00004.bin",
+    "vision_model.encoder.layers.23.self_attn.q_proj.weight": "pytorch_model-00003-of-00004.bin",
+    "vision_model.encoder.layers.23.self_attn.v_proj.weight": "pytorch_model-00003-of-00004.bin",
+    "vision_model.encoder.layers.24.layer_norm1.weight": "pytorch_model-00003-of-00004.bin",
+    "vision_model.encoder.layers.24.layer_norm2.weight": "pytorch_model-00003-of-00004.bin",
+    "vision_model.encoder.layers.24.mlp.fc1.bias": "pytorch_model-00003-of-00004.bin",
+    "vision_model.encoder.layers.24.mlp.fc1.weight": "pytorch_model-00003-of-00004.bin",
+    "vision_model.encoder.layers.24.mlp.fc2.bias": "pytorch_model-00003-of-00004.bin",
+    "vision_model.encoder.layers.24.mlp.fc2.weight": "pytorch_model-00003-of-00004.bin",
+    "vision_model.encoder.layers.24.self_attn.k_proj.weight": "pytorch_model-00003-of-00004.bin",
+    "vision_model.encoder.layers.24.self_attn.out_proj.bias": "pytorch_model-00003-of-00004.bin",
+    "vision_model.encoder.layers.24.self_attn.out_proj.weight": "pytorch_model-00003-of-00004.bin",
+    "vision_model.encoder.layers.24.self_attn.q_proj.weight": "pytorch_model-00003-of-00004.bin",
+    "vision_model.encoder.layers.24.self_attn.v_proj.weight": "pytorch_model-00003-of-00004.bin",
+    "vision_model.encoder.layers.25.layer_norm1.weight": "pytorch_model-00003-of-00004.bin",
+    "vision_model.encoder.layers.25.layer_norm2.weight": "pytorch_model-00003-of-00004.bin",
+    "vision_model.encoder.layers.25.mlp.fc1.bias": "pytorch_model-00003-of-00004.bin",
+    "vision_model.encoder.layers.25.mlp.fc1.weight": "pytorch_model-00003-of-00004.bin",
+    "vision_model.encoder.layers.25.mlp.fc2.bias": "pytorch_model-00003-of-00004.bin",
+    "vision_model.encoder.layers.25.mlp.fc2.weight": "pytorch_model-00003-of-00004.bin",
+    "vision_model.encoder.layers.25.self_attn.k_proj.weight": "pytorch_model-00003-of-00004.bin",
+    "vision_model.encoder.layers.25.self_attn.out_proj.bias": "pytorch_model-00003-of-00004.bin",
+    "vision_model.encoder.layers.25.self_attn.out_proj.weight": "pytorch_model-00003-of-00004.bin",
+    "vision_model.encoder.layers.25.self_attn.q_proj.weight": "pytorch_model-00003-of-00004.bin",
+    "vision_model.encoder.layers.25.self_attn.v_proj.weight": "pytorch_model-00003-of-00004.bin",
+    "vision_model.encoder.layers.26.layer_norm1.weight": "pytorch_model-00003-of-00004.bin",
+    "vision_model.encoder.layers.26.layer_norm2.weight": "pytorch_model-00003-of-00004.bin",
+    "vision_model.encoder.layers.26.mlp.fc1.bias": "pytorch_model-00003-of-00004.bin",
+    "vision_model.encoder.layers.26.mlp.fc1.weight": "pytorch_model-00003-of-00004.bin",
+    "vision_model.encoder.layers.26.mlp.fc2.bias": "pytorch_model-00003-of-00004.bin",
+    "vision_model.encoder.layers.26.mlp.fc2.weight": "pytorch_model-00003-of-00004.bin",
+    "vision_model.encoder.layers.26.self_attn.k_proj.weight": "pytorch_model-00003-of-00004.bin",
+    "vision_model.encoder.layers.26.self_attn.out_proj.bias": "pytorch_model-00003-of-00004.bin",
+    "vision_model.encoder.layers.26.self_attn.out_proj.weight": "pytorch_model-00003-of-00004.bin",
+    "vision_model.encoder.layers.26.self_attn.q_proj.weight": "pytorch_model-00003-of-00004.bin",
+    "vision_model.encoder.layers.26.self_attn.v_proj.weight": "pytorch_model-00003-of-00004.bin",
+    "vision_model.encoder.layers.27.layer_norm1.weight": "pytorch_model-00003-of-00004.bin",
+    "vision_model.encoder.layers.27.layer_norm2.weight": "pytorch_model-00003-of-00004.bin",
+    "vision_model.encoder.layers.27.mlp.fc1.bias": "pytorch_model-00003-of-00004.bin",
+    "vision_model.encoder.layers.27.mlp.fc1.weight": "pytorch_model-00003-of-00004.bin",
+    "vision_model.encoder.layers.27.mlp.fc2.bias": "pytorch_model-00003-of-00004.bin",
+    "vision_model.encoder.layers.27.mlp.fc2.weight": "pytorch_model-00003-of-00004.bin",
+    "vision_model.encoder.layers.27.self_attn.k_proj.weight": "pytorch_model-00003-of-00004.bin",
+    "vision_model.encoder.layers.27.self_attn.out_proj.bias": "pytorch_model-00003-of-00004.bin",
+    "vision_model.encoder.layers.27.self_attn.out_proj.weight": "pytorch_model-00003-of-00004.bin",
+    "vision_model.encoder.layers.27.self_attn.q_proj.weight": "pytorch_model-00003-of-00004.bin",
+    "vision_model.encoder.layers.27.self_attn.v_proj.weight": "pytorch_model-00003-of-00004.bin",
+    "vision_model.encoder.layers.28.layer_norm1.weight": "pytorch_model-00003-of-00004.bin",
+    "vision_model.encoder.layers.28.layer_norm2.weight": "pytorch_model-00004-of-00004.bin",
+    "vision_model.encoder.layers.28.mlp.fc1.bias": "pytorch_model-00003-of-00004.bin",
+    "vision_model.encoder.layers.28.mlp.fc1.weight": "pytorch_model-00003-of-00004.bin",
+    "vision_model.encoder.layers.28.mlp.fc2.bias": "pytorch_model-00004-of-00004.bin",
+    "vision_model.encoder.layers.28.mlp.fc2.weight": "pytorch_model-00004-of-00004.bin",
+    "vision_model.encoder.layers.28.self_attn.k_proj.weight": "pytorch_model-00003-of-00004.bin",
+    "vision_model.encoder.layers.28.self_attn.out_proj.bias": "pytorch_model-00003-of-00004.bin",
+    "vision_model.encoder.layers.28.self_attn.out_proj.weight": "pytorch_model-00003-of-00004.bin",
+    "vision_model.encoder.layers.28.self_attn.q_proj.weight": "pytorch_model-00003-of-00004.bin",
+    "vision_model.encoder.layers.28.self_attn.v_proj.weight": "pytorch_model-00003-of-00004.bin",
+    "vision_model.encoder.layers.29.layer_norm1.weight": "pytorch_model-00004-of-00004.bin",
+    "vision_model.encoder.layers.29.layer_norm2.weight": "pytorch_model-00004-of-00004.bin",
+    "vision_model.encoder.layers.29.mlp.fc1.bias": "pytorch_model-00004-of-00004.bin",
+    "vision_model.encoder.layers.29.mlp.fc1.weight": "pytorch_model-00004-of-00004.bin",
+    "vision_model.encoder.layers.29.mlp.fc2.bias": "pytorch_model-00004-of-00004.bin",
+    "vision_model.encoder.layers.29.mlp.fc2.weight": "pytorch_model-00004-of-00004.bin",
+    "vision_model.encoder.layers.29.self_attn.k_proj.weight": "pytorch_model-00004-of-00004.bin",
+    "vision_model.encoder.layers.29.self_attn.out_proj.bias": "pytorch_model-00004-of-00004.bin",
+    "vision_model.encoder.layers.29.self_attn.out_proj.weight": "pytorch_model-00004-of-00004.bin",
+    "vision_model.encoder.layers.29.self_attn.q_proj.weight": "pytorch_model-00004-of-00004.bin",
+    "vision_model.encoder.layers.29.self_attn.v_proj.weight": "pytorch_model-00004-of-00004.bin",
+    "vision_model.encoder.layers.3.layer_norm1.weight": "pytorch_model-00001-of-00004.bin",
+    "vision_model.encoder.layers.3.layer_norm2.weight": "pytorch_model-00001-of-00004.bin",
+    "vision_model.encoder.layers.3.mlp.fc1.bias": "pytorch_model-00001-of-00004.bin",
+    "vision_model.encoder.layers.3.mlp.fc1.weight": "pytorch_model-00001-of-00004.bin",
+    "vision_model.encoder.layers.3.mlp.fc2.bias": "pytorch_model-00001-of-00004.bin",
+    "vision_model.encoder.layers.3.mlp.fc2.weight": "pytorch_model-00001-of-00004.bin",
+    "vision_model.encoder.layers.3.self_attn.k_proj.weight": "pytorch_model-00001-of-00004.bin",
+    "vision_model.encoder.layers.3.self_attn.out_proj.bias": "pytorch_model-00001-of-00004.bin",
+    "vision_model.encoder.layers.3.self_attn.out_proj.weight": "pytorch_model-00001-of-00004.bin",
+    "vision_model.encoder.layers.3.self_attn.q_proj.weight": "pytorch_model-00001-of-00004.bin",
+    "vision_model.encoder.layers.3.self_attn.v_proj.weight": "pytorch_model-00001-of-00004.bin",
+    "vision_model.encoder.layers.30.layer_norm1.weight": "pytorch_model-00004-of-00004.bin",
+    "vision_model.encoder.layers.30.layer_norm2.weight": "pytorch_model-00004-of-00004.bin",
+    "vision_model.encoder.layers.30.mlp.fc1.bias": "pytorch_model-00004-of-00004.bin",
+    "vision_model.encoder.layers.30.mlp.fc1.weight": "pytorch_model-00004-of-00004.bin",
+    "vision_model.encoder.layers.30.mlp.fc2.bias": "pytorch_model-00004-of-00004.bin",
+    "vision_model.encoder.layers.30.mlp.fc2.weight": "pytorch_model-00004-of-00004.bin",
+    "vision_model.encoder.layers.30.self_attn.k_proj.weight": "pytorch_model-00004-of-00004.bin",
+    "vision_model.encoder.layers.30.self_attn.out_proj.bias": "pytorch_model-00004-of-00004.bin",
+    "vision_model.encoder.layers.30.self_attn.out_proj.weight": "pytorch_model-00004-of-00004.bin",
+    "vision_model.encoder.layers.30.self_attn.q_proj.weight": "pytorch_model-00004-of-00004.bin",
+    "vision_model.encoder.layers.30.self_attn.v_proj.weight": "pytorch_model-00004-of-00004.bin",
+    "vision_model.encoder.layers.31.layer_norm1.weight": "pytorch_model-00004-of-00004.bin",
+    "vision_model.encoder.layers.31.layer_norm2.weight": "pytorch_model-00004-of-00004.bin",
+    "vision_model.encoder.layers.31.mlp.fc1.bias": "pytorch_model-00004-of-00004.bin",
+    "vision_model.encoder.layers.31.mlp.fc1.weight": "pytorch_model-00004-of-00004.bin",
+    "vision_model.encoder.layers.31.mlp.fc2.bias": "pytorch_model-00004-of-00004.bin",
+    "vision_model.encoder.layers.31.mlp.fc2.weight": "pytorch_model-00004-of-00004.bin",
+    "vision_model.encoder.layers.31.self_attn.k_proj.weight": "pytorch_model-00004-of-00004.bin",
+    "vision_model.encoder.layers.31.self_attn.out_proj.bias": "pytorch_model-00004-of-00004.bin",
+    "vision_model.encoder.layers.31.self_attn.out_proj.weight": "pytorch_model-00004-of-00004.bin",
+    "vision_model.encoder.layers.31.self_attn.q_proj.weight": "pytorch_model-00004-of-00004.bin",
+    "vision_model.encoder.layers.31.self_attn.v_proj.weight": "pytorch_model-00004-of-00004.bin",
+    "vision_model.encoder.layers.4.layer_norm1.weight": "pytorch_model-00001-of-00004.bin",
+    "vision_model.encoder.layers.4.layer_norm2.weight": "pytorch_model-00001-of-00004.bin",
+    "vision_model.encoder.layers.4.mlp.fc1.bias": "pytorch_model-00001-of-00004.bin",
+    "vision_model.encoder.layers.4.mlp.fc1.weight": "pytorch_model-00001-of-00004.bin",
+    "vision_model.encoder.layers.4.mlp.fc2.bias": "pytorch_model-00001-of-00004.bin",
+    "vision_model.encoder.layers.4.mlp.fc2.weight": "pytorch_model-00001-of-00004.bin",
+    "vision_model.encoder.layers.4.self_attn.k_proj.weight": "pytorch_model-00001-of-00004.bin",
+    "vision_model.encoder.layers.4.self_attn.out_proj.bias": "pytorch_model-00001-of-00004.bin",
+    "vision_model.encoder.layers.4.self_attn.out_proj.weight": "pytorch_model-00001-of-00004.bin",
+    "vision_model.encoder.layers.4.self_attn.q_proj.weight": "pytorch_model-00001-of-00004.bin",
+    "vision_model.encoder.layers.4.self_attn.v_proj.weight": "pytorch_model-00001-of-00004.bin",
+    "vision_model.encoder.layers.5.layer_norm1.weight": "pytorch_model-00001-of-00004.bin",
+    "vision_model.encoder.layers.5.layer_norm2.weight": "pytorch_model-00001-of-00004.bin",
+    "vision_model.encoder.layers.5.mlp.fc1.bias": "pytorch_model-00001-of-00004.bin",
+    "vision_model.encoder.layers.5.mlp.fc1.weight": "pytorch_model-00001-of-00004.bin",
+    "vision_model.encoder.layers.5.mlp.fc2.bias": "pytorch_model-00001-of-00004.bin",
+    "vision_model.encoder.layers.5.mlp.fc2.weight": "pytorch_model-00001-of-00004.bin",
+    "vision_model.encoder.layers.5.self_attn.k_proj.weight": "pytorch_model-00001-of-00004.bin",
+    "vision_model.encoder.layers.5.self_attn.out_proj.bias": "pytorch_model-00001-of-00004.bin",
+    "vision_model.encoder.layers.5.self_attn.out_proj.weight": "pytorch_model-00001-of-00004.bin",
+    "vision_model.encoder.layers.5.self_attn.q_proj.weight": "pytorch_model-00001-of-00004.bin",
+    "vision_model.encoder.layers.5.self_attn.v_proj.weight": "pytorch_model-00001-of-00004.bin",
+    "vision_model.encoder.layers.6.layer_norm1.weight": "pytorch_model-00001-of-00004.bin",
+    "vision_model.encoder.layers.6.layer_norm2.weight": "pytorch_model-00001-of-00004.bin",
+    "vision_model.encoder.layers.6.mlp.fc1.bias": "pytorch_model-00001-of-00004.bin",
+    "vision_model.encoder.layers.6.mlp.fc1.weight": "pytorch_model-00001-of-00004.bin",
+    "vision_model.encoder.layers.6.mlp.fc2.bias": "pytorch_model-00001-of-00004.bin",
+    "vision_model.encoder.layers.6.mlp.fc2.weight": "pytorch_model-00001-of-00004.bin",
+    "vision_model.encoder.layers.6.self_attn.k_proj.weight": "pytorch_model-00001-of-00004.bin",
+    "vision_model.encoder.layers.6.self_attn.out_proj.bias": "pytorch_model-00001-of-00004.bin",
+    "vision_model.encoder.layers.6.self_attn.out_proj.weight": "pytorch_model-00001-of-00004.bin",
+    "vision_model.encoder.layers.6.self_attn.q_proj.weight": "pytorch_model-00001-of-00004.bin",
+    "vision_model.encoder.layers.6.self_attn.v_proj.weight": "pytorch_model-00001-of-00004.bin",
+    "vision_model.encoder.layers.7.layer_norm1.weight": "pytorch_model-00001-of-00004.bin",
+    "vision_model.encoder.layers.7.layer_norm2.weight": "pytorch_model-00002-of-00004.bin",
+    "vision_model.encoder.layers.7.mlp.fc1.bias": "pytorch_model-00001-of-00004.bin",
+    "vision_model.encoder.layers.7.mlp.fc1.weight": "pytorch_model-00001-of-00004.bin",
+    "vision_model.encoder.layers.7.mlp.fc2.bias": "pytorch_model-00002-of-00004.bin",
+    "vision_model.encoder.layers.7.mlp.fc2.weight": "pytorch_model-00002-of-00004.bin",
+    "vision_model.encoder.layers.7.self_attn.k_proj.weight": "pytorch_model-00001-of-00004.bin",
+    "vision_model.encoder.layers.7.self_attn.out_proj.bias": "pytorch_model-00001-of-00004.bin",
+    "vision_model.encoder.layers.7.self_attn.out_proj.weight": "pytorch_model-00001-of-00004.bin",
+    "vision_model.encoder.layers.7.self_attn.q_proj.weight": "pytorch_model-00001-of-00004.bin",
+    "vision_model.encoder.layers.7.self_attn.v_proj.weight": "pytorch_model-00001-of-00004.bin",
+    "vision_model.encoder.layers.8.layer_norm1.weight": "pytorch_model-00002-of-00004.bin",
+    "vision_model.encoder.layers.8.layer_norm2.weight": "pytorch_model-00002-of-00004.bin",
+    "vision_model.encoder.layers.8.mlp.fc1.bias": "pytorch_model-00002-of-00004.bin",
+    "vision_model.encoder.layers.8.mlp.fc1.weight": "pytorch_model-00002-of-00004.bin",
+    "vision_model.encoder.layers.8.mlp.fc2.bias": "pytorch_model-00002-of-00004.bin",
+    "vision_model.encoder.layers.8.mlp.fc2.weight": "pytorch_model-00002-of-00004.bin",
+    "vision_model.encoder.layers.8.self_attn.k_proj.weight": "pytorch_model-00002-of-00004.bin",
+    "vision_model.encoder.layers.8.self_attn.out_proj.bias": "pytorch_model-00002-of-00004.bin",
+    "vision_model.encoder.layers.8.self_attn.out_proj.weight": "pytorch_model-00002-of-00004.bin",
+    "vision_model.encoder.layers.8.self_attn.q_proj.weight": "pytorch_model-00002-of-00004.bin",
+    "vision_model.encoder.layers.8.self_attn.v_proj.weight": "pytorch_model-00002-of-00004.bin",
+    "vision_model.encoder.layers.9.layer_norm1.weight": "pytorch_model-00002-of-00004.bin",
+    "vision_model.encoder.layers.9.layer_norm2.weight": "pytorch_model-00002-of-00004.bin",
+    "vision_model.encoder.layers.9.mlp.fc1.bias": "pytorch_model-00002-of-00004.bin",
+    "vision_model.encoder.layers.9.mlp.fc1.weight": "pytorch_model-00002-of-00004.bin",
+    "vision_model.encoder.layers.9.mlp.fc2.bias": "pytorch_model-00002-of-00004.bin",
+    "vision_model.encoder.layers.9.mlp.fc2.weight": "pytorch_model-00002-of-00004.bin",
+    "vision_model.encoder.layers.9.self_attn.k_proj.weight": "pytorch_model-00002-of-00004.bin",
+    "vision_model.encoder.layers.9.self_attn.out_proj.bias": "pytorch_model-00002-of-00004.bin",
+    "vision_model.encoder.layers.9.self_attn.out_proj.weight": "pytorch_model-00002-of-00004.bin",
+    "vision_model.encoder.layers.9.self_attn.q_proj.weight": "pytorch_model-00002-of-00004.bin",
+    "vision_model.encoder.layers.9.self_attn.v_proj.weight": "pytorch_model-00002-of-00004.bin",
+    "vision_model.post_layernorm.weight": "pytorch_model-00004-of-00004.bin",
+    "visual_projection.weight": "pytorch_model-00004-of-00004.bin"
+  }
+}

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,24 @@

+{
+  "bos_token": {
+    "content": "<|startoftext|>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": "<|endoftext|>",
+  "unk_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  }
+}

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,31 @@

+{
+  "bos_token": {
+    "__type": "AddedToken",
+    "content": "<|startoftext|>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "__type": "AddedToken",
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "errors": "replace",
+  "model_max_length": 1000000000000000019884624838656,
+  "pad_token": "<|endoftext|>",
+  "special_tokens_map_file": null,
+  "tokenizer_class": "CLIPTokenizer",
+  "unk_token": {
+    "__type": "AddedToken",
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  }
+}

vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff