Add application file
- .gitignore +31 -0
- app.py +29 -0
- images/dog.jpeg +0 -0
- requirements.txt +3 -0
- zhclip/__init__.py +6 -0
- zhclip/configuration_zhclip.py +95 -0
- zhclip/modeling_zhclip.py +239 -0
- zhclip/processing_zhclip.py +135 -0
.gitignore
ADDED
@@ -0,0 +1,31 @@
# Byte-compiled / optimized / DLL files
__pycache__/
**/__pycache__
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# ours
tools/
app.py
ADDED
@@ -0,0 +1,29 @@
import gradio as gr
from typing import List
from PIL import Image
from zhclip import ZhCLIPProcessor, ZhCLIPModel  # From https://www.github.com/thu-ml/zh-clip

version = 'thu-ml/zh-clip-vit-roberta-large-patch14'
model = ZhCLIPModel.from_pretrained(version)
processor = ZhCLIPProcessor.from_pretrained(version)

def inference(image, texts: List[str]):
    texts = [x[0] for x in texts]
    inputs = processor(text=texts, images=image, return_tensors="pt", padding=True)
    outputs = model(**inputs)
    image_features = outputs.image_features
    text_features = outputs.text_features
    text_probs = (image_features @ text_features.T).softmax(dim=-1)[0].detach().cpu().numpy()
    return {i: float(text_probs[i]) for i in range(len(text_probs))}

title = "ZH-CLIP zero-shot classification"
description = "Chinese CLIP model (ZH-CLIP) zero-shot classification"
article = "<p style='text-align: center'><a href='https://www.github.com/thu-ml/zh-clip' target='_blank'>github: zh-clip</a> <a href='https://huggingface.co/thu-ml/zh-clip-vit-roberta-large-patch14' target='_blank'>huggingface model: thu-ml/zh-clip-vit-roberta-large-patch14</a></p>"
examples = [['./images/dog.jpeg', [['一只狗'], ['一只猫']]]]  # labels: "a dog", "a cat"
interpretation = 'default'
enable_queue = True

iface = gr.Interface(fn=inference, inputs=["image", "list"], outputs="label",
                     title=title, description=description, article=article, examples=examples,
                     enable_queue=enable_queue)
iface.launch(server_name='0.0.0.0')
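For quick local testing, the same scoring path can be exercised without the Gradio UI. A minimal sketch, assuming the thu-ml/zh-clip-vit-roberta-large-patch14 weights are reachable and ./images/dog.jpeg exists; it mirrors the inference() logic above:

from PIL import Image
from zhclip import ZhCLIPProcessor, ZhCLIPModel

version = 'thu-ml/zh-clip-vit-roberta-large-patch14'
model = ZhCLIPModel.from_pretrained(version)
processor = ZhCLIPProcessor.from_pretrained(version)

image = Image.open('./images/dog.jpeg')
labels = ['一只狗', '一只猫']  # "a dog", "a cat"
inputs = processor(text=labels, images=image, return_tensors="pt", padding=True)
outputs = model(**inputs)
# Same scoring rule as inference(): dot products over labels, then softmax.
probs = (outputs.image_features @ outputs.text_features.T).softmax(dim=-1)[0]
for label, p in zip(labels, probs.tolist()):
    print(f"{label}: {p:.3f}")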
images/dog.jpeg
ADDED
requirements.txt
ADDED
@@ -0,0 +1,3 @@
torch
transformers==4.26.1
multilingual_clip
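The exact transformers pin matters: the zhclip classes subclass transformers internals (PretrainedConfig, PreTrainedModel, ProcessorMixin), which shift between releases. A purely illustrative, optional startup guard (not part of the app):

import transformers

# Hypothetical check: warn when the installed version drifts from the
# pinned 4.26.1 that this code was written against.
if transformers.__version__ != "4.26.1":
    print(f"Warning: expected transformers==4.26.1, found {transformers.__version__}")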
zhclip/__init__.py
ADDED
@@ -0,0 +1,6 @@
from .modeling_zhclip import (
    ZhCLIPModel,
)

from .configuration_zhclip import ZhCLIPConfig
from .processing_zhclip import ZhCLIPProcessor
zhclip/configuration_zhclip.py
ADDED
@@ -0,0 +1,95 @@
# coding=utf-8
# Copyright The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" ZhClip model configuration"""

import copy

from transformers.configuration_utils import PretrainedConfig
from transformers.utils import logging
from transformers.models.auto.configuration_auto import AutoConfig
from transformers.models.clip.configuration_clip import CLIPVisionConfig
from typing import Union, Dict

logger = logging.get_logger(__name__)


class ZhCLIPConfig(PretrainedConfig):

    model_type = "zhclip"
    is_composition = True

    def __init__(
        self,
        text_config: Union[PretrainedConfig, Dict],
        vision_config: Union[PretrainedConfig, Dict],
        num_token_types=2,
        hidden_size=768,
        num_hidden_layers=6,
        num_attention_heads=12,
        intermediate_size=3072,
        hidden_act="gelu",
        hidden_dropout_prob=0.1,
        attention_probs_dropout_prob=0.1,
        initializer_range=0.02,
        layer_norm_eps=1e-12,
        classifier_dropout=None,
        **kwargs,
    ):
        super().__init__(**kwargs)

        if not isinstance(text_config, PretrainedConfig):
            text_model_type = text_config.pop('model_type')
            text_config = AutoConfig.for_model(text_model_type, **text_config)
        self.text_config = text_config

        if not isinstance(vision_config, PretrainedConfig):
            vision_model_type = vision_config.pop('model_type')
            if vision_model_type == "clip":
                vision_config = AutoConfig.for_model(vision_model_type, **vision_config).vision_config
            elif vision_model_type == "clip_vision_model":
                vision_config = CLIPVisionConfig(**vision_config)
            else:
                vision_config = AutoConfig.for_model(vision_model_type, **vision_config)
            self.vision_config = vision_config
        else:
            vision_model_type = vision_config.model_type
            if vision_model_type == "clip":
                vision_config = vision_config.vision_config
            self.vision_config = vision_config

        # co-attention
        self.num_token_types = num_token_types
        self.hidden_size = hidden_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.intermediate_size = intermediate_size
        self.hidden_act = hidden_act
        self.hidden_dropout_prob = hidden_dropout_prob
        self.attention_probs_dropout_prob = attention_probs_dropout_prob
        self.initializer_range = initializer_range
        self.layer_norm_eps = layer_norm_eps
        self.classifier_dropout = classifier_dropout

    def to_dict(self):
        """
        Serializes this instance to a Python dictionary. Overrides the default [`~PretrainedConfig.to_dict`].

        Returns:
            `Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance.
        """
        output = copy.deepcopy(self.__dict__)
        output["vision_config"] = self.vision_config.to_dict()
        output["text_config"] = self.text_config.to_dict()
        output["model_type"] = self.__class__.model_type
        return output
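ZhCLIPConfig accepts either PretrainedConfig objects or plain dicts; for dicts it pops "model_type" and dispatches through AutoConfig.for_model, with special-casing for CLIP vision configs. A minimal sketch with hypothetical hyperparameter values (not the released checkpoint's):

from zhclip import ZhCLIPConfig

# Hypothetical configs, for illustration only.
text_config = {"model_type": "bert", "hidden_size": 1024, "num_hidden_layers": 24}
vision_config = {"model_type": "clip_vision_model", "hidden_size": 1024, "patch_size": 14}

config = ZhCLIPConfig(text_config=text_config, vision_config=vision_config)
print(type(config.text_config).__name__)    # BertConfig
print(type(config.vision_config).__name__)  # CLIPVisionConfig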
zhclip/modeling_zhclip.py
ADDED
@@ -0,0 +1,239 @@
# coding=utf-8
# Copyright 2021 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" PyTorch ZH-CLIP model."""


from typing import Optional, Tuple, Union
from torch import TensorType

import torch
from torch import nn


from transformers.modeling_utils import PreTrainedModel
from transformers.utils import logging, ModelOutput
from transformers.models.auto.modeling_auto import AutoModel

from transformers.models.clip.modeling_clip import CLIPVisionConfig, CLIPVisionModel
from .configuration_zhclip import ZhCLIPConfig
from dataclasses import dataclass

logger = logging.get_logger(__name__)
_CONFIG_FOR_DOC = "ZhCLIPConfig"


@dataclass
class ZhCLIPModelOutput(ModelOutput):

    text_features: torch.FloatTensor = None
    image_features: torch.FloatTensor = None


class MeanPooler(nn.Module):
    """Mean pooling"""

    def forward(self, last_hidden_state: TensorType, attention_mask: TensorType):
        masked_output = last_hidden_state * attention_mask.unsqueeze(-1)
        return masked_output.sum(dim=1) / attention_mask.sum(-1, keepdim=True)


class ZhCLIPPreTrainedModel(PreTrainedModel):
    """
    An abstract class to handle weights initialization.
    """

    config_class = ZhCLIPConfig
    base_model_prefix = "zhclip"
    supports_gradient_checkpointing = False
    _keys_to_ignore_on_load_missing = [r"position_ids"]

    def _init_weights(self, module):
        """Initialize the weights"""
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)


class ZhCLIPModel(ZhCLIPPreTrainedModel):
    def __init__(
        self,
        config: Optional[ZhCLIPConfig] = None,
        vision_model: Optional[PreTrainedModel] = None,
        text_model: Optional[PreTrainedModel] = None,
    ):

        if config is None and (vision_model is None or text_model is None):
            raise ValueError("Either a configuration or a vision and a text model has to be provided")

        if config is None:
            # ZhCLIPConfig expects the text config first.
            config = ZhCLIPConfig(text_config=text_model.config, vision_config=vision_model.config)
        else:
            if not isinstance(config, self.config_class):
                raise ValueError(f"config: {config} has to be of type {self.config_class}")

        # initialize with config
        super().__init__(config)

        if vision_model is None:
            if isinstance(config.vision_config, CLIPVisionConfig):
                vision_model = CLIPVisionModel(config.vision_config).vision_model
            else:
                vision_model = AutoModel.from_config(config.vision_config)

        if text_model is None:
            text_model = AutoModel.from_config(config.text_config)

        self.vision_model = vision_model
        self.text_model = text_model

        # make sure that the individual model's config refers to the shared config
        # so that the updates to the config will be synced
        self.vision_model.config = self.config.vision_config
        self.text_model.config = self.config.text_config

        self.vision_embed_dim = config.vision_config.hidden_size
        self.text_embed_dim = config.text_config.hidden_size
        self.coattention_dim = config.hidden_size

        # add projection layers
        mlp_hidden_size = (self.text_embed_dim + self.coattention_dim) // 2
        self.text_projection = nn.Sequential(
            nn.Linear(self.text_embed_dim, mlp_hidden_size, bias=False),
            nn.GELU(),
            nn.Linear(mlp_hidden_size, self.coattention_dim, bias=False),
        )
        self.text_pooler = MeanPooler()
        self.visual_projection = nn.Linear(self.vision_embed_dim, self.coattention_dim)

    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        pixel_values: Optional[torch.FloatTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        patch_ids=None,
        extend_token_type_ids=None,
        return_loss: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[torch.Tensor], ZhCLIPModelOutput]:

        return_dict = return_dict if return_dict is not None else self.config.return_dict
        image_features = self.get_image_features(
            pixel_values=pixel_values,
            return_dict=return_dict,
        )
        text_features = self.get_text_features(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            return_dict=return_dict,
        )
        return ZhCLIPModelOutput(
            image_features=image_features,
            text_features=text_features,
        )

    @classmethod
    def from_pretrained(cls, *args, **kwargs):
        # At the moment fast initialization is not supported
        # for composite models
        kwargs["_fast_init"] = False
        return super().from_pretrained(*args, **kwargs)

    def get_text_features(
        self,
        input_ids=None,
        attention_mask=None,
        position_ids=None,
        token_type_ids=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        text_outputs = self.text_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            token_type_ids=token_type_ids,
            # output_attentions=output_attentions,
            # output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        if attention_mask is None:
            attention_mask = (input_ids != self.config.pad_token_id).long()
        text_pool = self.text_pooler(text_outputs[0], attention_mask)
        text_feat = self.text_projection(text_pool)
        return text_feat

    def get_image_features(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> torch.FloatTensor:
        r"""
        Returns:
            image_features (`torch.FloatTensor` of shape `(batch_size, output_dim)`): The image embeddings obtained by
            applying the projection layer to the pooled output of [`CLIPVisionModel`].

        Examples:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, CLIPModel

        >>> model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
        >>> processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(images=image, return_tensors="pt")

        >>> image_features = model.get_image_features(**inputs)
        ```"""
        # Use CLIP model's config for some fields (if specified) instead of those of vision & text components.
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        vision_outputs = self.vision_model(
            pixel_values=pixel_values,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        pooled_output = vision_outputs[1]  # pooled_output
        image_features = self.visual_projection(pooled_output)

        return image_features
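ZhCLIPModelOutput carries only the raw projected features, leaving similarity computation to the caller. app.py softmaxes raw dot products, whereas the original CLIP L2-normalizes features first. A sketch of the normalized variant, under the assumption that the checkpoint's features behave well under normalization (this commit does not say how it was trained):

import torch

def cosine_scores(image_features: torch.Tensor, text_features: torch.Tensor) -> torch.Tensor:
    # L2-normalize each feature vector, then take pairwise dot products,
    # giving cosine similarities in [-1, 1] of shape (num_images, num_texts).
    image_features = image_features / image_features.norm(dim=-1, keepdim=True)
    text_features = text_features / text_features.norm(dim=-1, keepdim=True)
    return image_features @ text_features.T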
zhclip/processing_zhclip.py
ADDED
@@ -0,0 +1,135 @@
# coding=utf-8
# Copyright 2021 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Processor class for ZH-CLIP
"""

import warnings

from transformers.processing_utils import ProcessorMixin
from transformers.tokenization_utils_base import BatchEncoding


class ZhCLIPProcessor(ProcessorMixin):
    r"""
    Constructs a ZH-CLIP processor which wraps an image processor and a tokenizer into a single
    processor.

    [`ZhCLIPProcessor`] offers all the functionalities of [`AutoImageProcessor`] and [`AutoTokenizer`].
    See the [`~ZhCLIPProcessor.__call__`] and [`~ZhCLIPProcessor.decode`] for more
    information.

    Args:
        image_processor ([`AutoImageProcessor`]):
            The image processor is a required input.
        tokenizer ([`PreTrainedTokenizer`]):
            The tokenizer is a required input.
    """
    attributes = ["image_processor", "tokenizer"]
    image_processor_class = "CLIPImageProcessor"
    tokenizer_class = "BertTokenizer"

    def __init__(self, image_processor=None, tokenizer=None, **kwargs):
        feature_extractor = None
        if "feature_extractor" in kwargs:
            warnings.warn(
                "The `feature_extractor` argument is deprecated and will be removed in v5, use `image_processor`"
                " instead.",
                FutureWarning,
            )
            feature_extractor = kwargs.pop("feature_extractor")

        image_processor = image_processor if image_processor is not None else feature_extractor
        if image_processor is None:
            raise ValueError("You need to specify an `image_processor`.")
        if tokenizer is None:
            raise ValueError("You need to specify a `tokenizer`.")

        super().__init__(image_processor, tokenizer)
        self.current_processor = self.image_processor

    def __call__(self, text=None, images=None, return_tensors=None, **kwargs):
        """
        Main method to prepare one or several sequence(s) and image(s) for the model. This method forwards the `text`
        and `kwargs` arguments to the tokenizer's [`~PreTrainedTokenizer.__call__`] if `text` is not
        `None` to encode the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to
        AutoImageProcessor's [`~AutoImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring
        of the above two methods for more information.

        Args:
            text (`str`, `List[str]`, `List[List[str]]`):
                The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
                (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
                `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
            images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`):
                The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
                tensor. In case of a NumPy array/PyTorch tensor, each image should be of shape (C, H, W), where C is a
                number of channels, H and W are image height and width.

            return_tensors (`str` or [`~utils.TensorType`], *optional*):
                If set, will return tensors of a particular framework. Acceptable values are:

                - `'tf'`: Return TensorFlow `tf.constant` objects.
                - `'pt'`: Return PyTorch `torch.Tensor` objects.
                - `'np'`: Return NumPy `np.ndarray` objects.
                - `'jax'`: Return JAX `jnp.ndarray` objects.

        Returns:
            [`BatchEncoding`]: A [`BatchEncoding`] with the following fields:

            - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
            - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
              `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
              `None`).
            - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
        """

        if text is None and images is None:
            raise ValueError("You have to specify either text or images. Both cannot be None.")

        if text is not None:
            encoding = self.tokenizer(text, return_tensors=return_tensors, **kwargs)

        if images is not None:
            image_features = self.image_processor(images, return_tensors=return_tensors, **kwargs)

        if text is not None and images is not None:
            encoding["pixel_values"] = image_features.pixel_values
            return encoding
        elif text is not None:
            return encoding
        else:
            return BatchEncoding(data=dict(**image_features), tensor_type=return_tensors)

    @property
    def model_input_names(self):
        tokenizer_input_names = self.tokenizer.model_input_names
        image_processor_input_names = self.image_processor.model_input_names
        return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))

    @property
    def feature_extractor_class(self):
        warnings.warn(
            "`feature_extractor_class` is deprecated and will be removed in v5. Use `image_processor_class` instead.",
            FutureWarning,
        )
        return self.image_processor_class

    @property
    def feature_extractor(self):
        warnings.warn(
            "`feature_extractor` is deprecated and will be removed in v5. Use `image_processor` instead.",
            FutureWarning,
        )
        return self.image_processor
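__call__ supports three paths: text-only (the tokenizer's encoding), image-only (a BatchEncoding wrapping pixel_values), and combined (pixel_values merged into the text encoding). An illustrative sketch, assuming the same checkpoint and example image as app.py:

from PIL import Image
from zhclip import ZhCLIPProcessor

processor = ZhCLIPProcessor.from_pretrained('thu-ml/zh-clip-vit-roberta-large-patch14')
image = Image.open('./images/dog.jpeg')

text_only = processor(text=['一只狗'], return_tensors="pt", padding=True)
image_only = processor(images=image, return_tensors="pt")
both = processor(text=['一只狗'], images=image, return_tensors="pt", padding=True)
# Expect input_ids / attention_mask from the tokenizer plus pixel_values;
# exact keys depend on the tokenizer's model_input_names.
print(sorted(both.keys()))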