From 1b3a3ed290c3142f0d9918903c8173c0575ca936 Mon Sep 17 00:00:00 2001 From: Nicolas Patry Date: Fri, 11 Jun 2021 21:48:27 +0200 Subject: [PATCH 01/12] [Proposal] Adding ZeroShotImageClassificationPipeline - Based on CLIP --- .../models/auto/tokenization_auto.py | 114 ++++++++++++++ src/transformers/pipelines/__init__.py | 7 + .../zero_shot_image_classification.py | 146 ++++++++++++++++++ ...ipelines_zero_shot_image_classification.py | 118 ++++++++++++++ 4 files changed, 385 insertions(+) create mode 100644 src/transformers/pipelines/zero_shot_image_classification.py create mode 100644 tests/test_pipelines_zero_shot_image_classification.py diff --git a/src/transformers/models/auto/tokenization_auto.py b/src/transformers/models/auto/tokenization_auto.py index 41d44c641f3348..c6c2948c83dff5 100644 --- a/src/transformers/models/auto/tokenization_auto.py +++ b/src/transformers/models/auto/tokenization_auto.py @@ -34,6 +34,59 @@ AutoConfig, config_class_to_model_type, model_type_to_module_name, + BartConfig, + BertConfig, + BertGenerationConfig, + BigBirdConfig, + BigBirdPegasusConfig, + BlenderbotConfig, + BlenderbotSmallConfig, + CamembertConfig, + CanineConfig, + CLIPConfig, + ConvBertConfig, + CTRLConfig, + DebertaConfig, + DebertaV2Config, + DistilBertConfig, + DPRConfig, + ElectraConfig, + EncoderDecoderConfig, + FlaubertConfig, + FSMTConfig, + FunnelConfig, + GPT2Config, + HubertConfig, + IBertConfig, + LayoutLMConfig, + LEDConfig, + LongformerConfig, + LukeConfig, + LxmertConfig, + M2M100Config, + MarianConfig, + MBartConfig, + MobileBertConfig, + MPNetConfig, + MT5Config, + OpenAIGPTConfig, + PegasusConfig, + ProphetNetConfig, + RagConfig, + ReformerConfig, + RetriBertConfig, + RobertaConfig, + RoFormerConfig, + Speech2TextConfig, + SqueezeBertConfig, + T5Config, + TapasConfig, + TransfoXLConfig, + Wav2Vec2Config, + XLMConfig, + XLMProphetNetConfig, + XLMRobertaConfig, + XLNetConfig, replace_list_option_in_docstrings, ) @@ -235,7 +288,68 @@ ] ) +<<<<<<< HEAD TOKENIZER_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, TOKENIZER_MAPPING_NAMES) +======= +TOKENIZER_MAPPING = OrderedDict( + [ + (RetriBertConfig, (RetriBertTokenizer, RetriBertTokenizerFast)), + (RoFormerConfig, (RoFormerTokenizer, RoFormerTokenizerFast)), + (T5Config, (T5Tokenizer, T5TokenizerFast)), + (MT5Config, (MT5Tokenizer, MT5TokenizerFast)), + (MobileBertConfig, (MobileBertTokenizer, MobileBertTokenizerFast)), + (DistilBertConfig, (DistilBertTokenizer, DistilBertTokenizerFast)), + (AlbertConfig, (AlbertTokenizer, AlbertTokenizerFast)), + (CamembertConfig, (CamembertTokenizer, CamembertTokenizerFast)), + (PegasusConfig, (PegasusTokenizer, PegasusTokenizerFast)), + (MBartConfig, (MBartTokenizer, MBartTokenizerFast)), + (XLMRobertaConfig, (XLMRobertaTokenizer, XLMRobertaTokenizerFast)), + (MarianConfig, (MarianTokenizer, None)), + (BlenderbotSmallConfig, (BlenderbotSmallTokenizer, None)), + (BlenderbotConfig, (BlenderbotTokenizer, None)), + (BartConfig, (BartTokenizer, BartTokenizerFast)), + (LongformerConfig, (LongformerTokenizer, LongformerTokenizerFast)), + (RobertaConfig, (RobertaTokenizer, RobertaTokenizerFast)), + (ReformerConfig, (ReformerTokenizer, ReformerTokenizerFast)), + (ElectraConfig, (ElectraTokenizer, ElectraTokenizerFast)), + (FunnelConfig, (FunnelTokenizer, FunnelTokenizerFast)), + (LxmertConfig, (LxmertTokenizer, LxmertTokenizerFast)), + (LayoutLMConfig, (LayoutLMTokenizer, LayoutLMTokenizerFast)), + (DPRConfig, (DPRQuestionEncoderTokenizer, DPRQuestionEncoderTokenizerFast)), + (SqueezeBertConfig, 
(SqueezeBertTokenizer, SqueezeBertTokenizerFast)), + (BertConfig, (BertTokenizer, BertTokenizerFast)), + (OpenAIGPTConfig, (OpenAIGPTTokenizer, OpenAIGPTTokenizerFast)), + (GPT2Config, (GPT2Tokenizer, GPT2TokenizerFast)), + (TransfoXLConfig, (TransfoXLTokenizer, None)), + (XLNetConfig, (XLNetTokenizer, XLNetTokenizerFast)), + (FlaubertConfig, (FlaubertTokenizer, None)), + (XLMConfig, (XLMTokenizer, None)), + (CLIPConfig, (CLIPTokenizer, None)), + (CTRLConfig, (CTRLTokenizer, None)), + (FSMTConfig, (FSMTTokenizer, None)), + (BertGenerationConfig, (BertGenerationTokenizer, None)), + (DebertaConfig, (DebertaTokenizer, DebertaTokenizerFast)), + (DebertaV2Config, (DebertaV2Tokenizer, None)), + (RagConfig, (RagTokenizer, None)), + (XLMProphetNetConfig, (XLMProphetNetTokenizer, None)), + (Speech2TextConfig, (Speech2TextTokenizer, None)), + (M2M100Config, (M2M100Tokenizer, None)), + (ProphetNetConfig, (ProphetNetTokenizer, None)), + (MPNetConfig, (MPNetTokenizer, MPNetTokenizerFast)), + (TapasConfig, (TapasTokenizer, None)), + (LEDConfig, (LEDTokenizer, LEDTokenizerFast)), + (ConvBertConfig, (ConvBertTokenizer, ConvBertTokenizerFast)), + (BigBirdConfig, (BigBirdTokenizer, BigBirdTokenizerFast)), + (IBertConfig, (RobertaTokenizer, RobertaTokenizerFast)), + (Wav2Vec2Config, (Wav2Vec2CTCTokenizer, None)), + (HubertConfig, (Wav2Vec2CTCTokenizer, None)), + (GPTNeoConfig, (GPT2Tokenizer, GPT2TokenizerFast)), + (LukeConfig, (LukeTokenizer, None)), + (BigBirdPegasusConfig, (PegasusTokenizer, PegasusTokenizerFast)), + (CanineConfig, (CanineTokenizer, None)), + ] +) +>>>>>>> 35ebd464f ([Proposal] Adding ZeroShotImageClassificationPipeline) CONFIG_TO_TYPE = {v: k for k, v in CONFIG_MAPPING_NAMES.items()} diff --git a/src/transformers/pipelines/__init__.py b/src/transformers/pipelines/__init__.py index ca6cb37547bbaf..1eea5aeb6de503 100755 --- a/src/transformers/pipelines/__init__.py +++ b/src/transformers/pipelines/__init__.py @@ -62,6 +62,7 @@ TokenClassificationPipeline, ) from .zero_shot_classification import ZeroShotClassificationArgumentHandler, ZeroShotClassificationPipeline +from .zero_shot_image_classification import ZeroShotImageClassificationPipeline if is_tf_available(): @@ -239,6 +240,12 @@ }, "type": "text", }, + "zero-shot-image-classification": { + "impl": ZeroShotImageClassificationPipeline, + "tf": (), + "pt": (AutoModel,) if is_torch_available() else (), + "default": {"pt": "openai/clip-vit-base-patch32"}, + }, "conversational": { "impl": ConversationalPipeline, "tf": (TFAutoModelForSeq2SeqLM, TFAutoModelForCausalLM) if is_tf_available() else (), diff --git a/src/transformers/pipelines/zero_shot_image_classification.py b/src/transformers/pipelines/zero_shot_image_classification.py new file mode 100644 index 00000000000000..ef2863bef616af --- /dev/null +++ b/src/transformers/pipelines/zero_shot_image_classification.py @@ -0,0 +1,146 @@ +import os +from typing import TYPE_CHECKING, List, Optional, Union + +import requests + +from ..feature_extraction_utils import PreTrainedFeatureExtractor +from ..file_utils import add_end_docstrings, is_torch_available, is_vision_available, requires_backends +from ..tokenization_utils import PreTrainedTokenizer +from ..utils import logging +from .base import PIPELINE_INIT_ARGS, Pipeline + + +if TYPE_CHECKING: + from ..modeling_tf_utils import TFPreTrainedModel + from ..modeling_utils import PreTrainedModel + +if is_vision_available(): + from PIL import Image + +if is_torch_available(): + import torch + +logger = logging.get_logger(__name__) + + 
+@add_end_docstrings(PIPELINE_INIT_ARGS) +class ZeroShotImageClassificationPipeline(Pipeline): + """ + Image classification pipeline using any :obj:`AutoModelForZeroShotImageClassification`. This pipeline predicts the + class of an image. + + This image classification pipeline can currently be loaded from :func:`~transformers.pipeline` using the following + task identifier: :obj:`"image-classification"`. + + See the list of available models on `huggingface.co/models + `__. + """ + + def __init__( + self, + model: Union["PreTrainedModel", "TFPreTrainedModel"], + feature_extractor: PreTrainedFeatureExtractor, + tokenizer: PreTrainedTokenizer, + framework: Optional[str] = None, + **kwargs + ): + super().__init__( + model, feature_extractor=feature_extractor, tokenizer=tokenizer, framework=framework, **kwargs + ) + + if self.framework == "tf": + raise ValueError(f"The {self.__class__} is only available in PyTorch.") + + requires_backends(self, "vision") + + # self.check_model_type(MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING) + + self.feature_extractor = feature_extractor + self.tokenizer = tokenizer + + @staticmethod + def load_image(image: Union[str, "Image.Image"]): + if isinstance(image, str): + if image.startswith("http://") or image.startswith("https://"): + # We need to actually check for a real protocol, otherwise it's impossible to use a local file + # like http_huggingface_co.png + return Image.open(requests.get(image, stream=True).raw) + elif os.path.isfile(image): + return Image.open(image) + elif isinstance(image, Image.Image): + return image + + raise ValueError( + "Incorrect format used for image. Should be an url linking to an image, a local path, or a PIL image." + ) + + def __call__( + self, + images: Union[str, List[str], "Image", List["Image"]], + candidate_labels: List[str], + hypothesis_template: str = "a photo of {}", + ): + """ + Assign labels to the image(s) passed as inputs. + + Args: + images (:obj:`str`, :obj:`List[str]`, :obj:`PIL.Image` or :obj:`List[PIL.Image]`): + The pipeline handles three types of images: + + - A string containing a http link pointing to an image + - A string containing a local path to an image + - An image loaded in PIL directly + + The pipeline accepts either a single image or a batch of images, which must then be passed as a string. + Images in a batch must all be in the same format: all as http links, all as local paths, or all as PIL + images. + candidate_labels (:obj:`List[str]`): + The candidate labels for this image + hypothesis_template (:obj:`str`, `optional`, defaults to :obj:`"This is a photo of a {}"`): + The sentence used in cunjunction with `candidate_labels` to attempt the image classification by + replacing the placeholder with the candidate_labels. Then likelihood is estimated by using + likelihood_per_image + + Return: + A dictionary or a list of dictionaries containing result. If the input is a single image, will return a + dictionary, if the input is a list of several images, will return a list of dictionaries corresponding to + the images. + + The dictionaries contain the following keys: + + - **label** (:obj:`str`) -- The label identified by the model. + - **score** (:obj:`int`) -- The score attributed by the model for that label. 
+ """ + is_batched = isinstance(images, list) + + if not is_batched: + images = [images] + + images = [self.load_image(image) for image in images] + + with torch.no_grad(): + images = self.feature_extractor(images=images, return_tensors="pt") + inputs = self.tokenizer(candidate_labels, return_tensors="pt") + inputs["pixel_values"] = images.pixel_values + outputs = self.model(**inputs) + + logits_per_image = outputs.logits_per_image # this is the image-text similarity score + probs = logits_per_image.softmax(dim=1) # we can take the softmax to get the label probabilities + scores = probs.tolist() + + if not is_batched: + scores = scores[0] + labels = [ + {"score": score, "label": candidate_label} + for score, candidate_label in sorted(zip(scores, candidate_labels), key=lambda x: -x[0]) + ] + else: + labels = [] + all_scores = scores + for scores in all_scores: + element_labels = [ + {"score": score, "label": candidate_label} + for score, candidate_label in sorted(zip(scores, candidate_labels), key=lambda x: -x[0]) + ] + labels.append(element_labels) + return labels diff --git a/tests/test_pipelines_zero_shot_image_classification.py b/tests/test_pipelines_zero_shot_image_classification.py new file mode 100644 index 00000000000000..34a2d1a282256d --- /dev/null +++ b/tests/test_pipelines_zero_shot_image_classification.py @@ -0,0 +1,118 @@ +# Copyright 2021 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest + +from transformers import AutoFeatureExtractor, AutoModel, AutoTokenizer, is_vision_available +from transformers.pipelines import ZeroShotImageClassificationPipeline, pipeline +from transformers.testing_utils import require_torch, require_vision + + +if is_vision_available(): + from PIL import Image +else: + + class Image: + @staticmethod + def open(*args, **kwargs): + pass + + +@require_vision +@require_torch +class ZeroShotImageClassificationPipelineTests(unittest.TestCase): + pipeline_task = "zero-shot-image-classification" + small_models = ["openai/clip-vit-base-patch32"] # Models tested without the @slow decorator + simple_inputs = [ + {"images": "http://images.cocodataset.org/val2017/000000039769.jpg"}, + {"images": "./tests/fixtures/tests_samples/COCO/000000039769.png"}, + {"images": Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")}, + ] + batched_inputs = [ + { + "images": [ + "http://images.cocodataset.org/val2017/000000039769.jpg", + "http://images.cocodataset.org/val2017/000000039769.jpg", + ] + }, + { + "images": [ + "./tests/fixtures/tests_samples/COCO/000000039769.png", + "./tests/fixtures/tests_samples/COCO/000000039769.png", + ] + }, + { + "images": [ + Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png"), + Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png"), + ] + }, + { + "images": [ + Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png"), + "./tests/fixtures/tests_samples/COCO/000000039769.png", + ] + }, + ] + + def test_small_model_from_factory(self): + for small_model in self.small_models: + image_classifier = pipeline("zero-shot-image-classification", model=small_model) + + candidate_labels = ["a dog", "a cat"] + + for valid_input in self.simple_inputs: + output = image_classifier(**valid_input, candidate_labels=candidate_labels) + self.assertTrue(isinstance(output, list)) + self.assertEqual(len(output), 2) + for label_result in output: + self.assertTrue(isinstance(label_result, dict)) + self.assertEqual(set(label_result.keys()), {"label", "score"}) + + for valid_input in self.batched_inputs: + output = image_classifier(**valid_input, candidate_labels=candidate_labels) + self.assertTrue(isinstance(output, list)) + self.assertEqual(len(output), 2) + for item in output: + for label_result in item: + self.assertTrue(isinstance(label_result, dict)) + self.assertEqual(set(label_result.keys()), {"label", "score"}) + + def test_small_model_from_pipeline(self): + for small_model in self.small_models: + model = AutoModel.from_pretrained(small_model) + feature_extractor = AutoFeatureExtractor.from_pretrained(small_model) + tokenizer = AutoTokenizer.from_pretrained(small_model) + image_classifier = ZeroShotImageClassificationPipeline( + model=model, feature_extractor=feature_extractor, tokenizer=tokenizer + ) + + candidate_labels = ["a dog", "a cat"] + + for valid_input in self.simple_inputs: + output = image_classifier(**valid_input, candidate_labels=candidate_labels) + self.assertTrue(isinstance(output, list)) + self.assertEqual(len(output), 2) + for label_result in output: + self.assertTrue(isinstance(label_result, dict)) + self.assertEqual(set(label_result.keys()), {"label", "score"}) + + for valid_input in self.batched_inputs: + output = image_classifier(**valid_input, candidate_labels=candidate_labels) + self.assertTrue(isinstance(output, list)) + self.assertEqual(len(output), 2) + for item in output: + for label_result in item: + self.assertTrue(isinstance(label_result, dict)) + 
self.assertEqual(set(label_result.keys()), {"label", "score"}) From aaf48802b7662603e625738e2909f4606c4e721d Mon Sep 17 00:00:00 2001 From: Nicolas Patry Date: Wed, 16 Feb 2022 17:03:33 +0100 Subject: [PATCH 02/12] WIP, Resurection in progress. --- .../models/auto/tokenization_auto.py | 114 ---------------- src/transformers/pipelines/__init__.py | 1 + .../zero_shot_image_classification.py | 21 +-- ...ipelines_zero_shot_image_classification.py | 125 +++++++----------- 4 files changed, 50 insertions(+), 211 deletions(-) diff --git a/src/transformers/models/auto/tokenization_auto.py b/src/transformers/models/auto/tokenization_auto.py index c6c2948c83dff5..41d44c641f3348 100644 --- a/src/transformers/models/auto/tokenization_auto.py +++ b/src/transformers/models/auto/tokenization_auto.py @@ -34,59 +34,6 @@ AutoConfig, config_class_to_model_type, model_type_to_module_name, - BartConfig, - BertConfig, - BertGenerationConfig, - BigBirdConfig, - BigBirdPegasusConfig, - BlenderbotConfig, - BlenderbotSmallConfig, - CamembertConfig, - CanineConfig, - CLIPConfig, - ConvBertConfig, - CTRLConfig, - DebertaConfig, - DebertaV2Config, - DistilBertConfig, - DPRConfig, - ElectraConfig, - EncoderDecoderConfig, - FlaubertConfig, - FSMTConfig, - FunnelConfig, - GPT2Config, - HubertConfig, - IBertConfig, - LayoutLMConfig, - LEDConfig, - LongformerConfig, - LukeConfig, - LxmertConfig, - M2M100Config, - MarianConfig, - MBartConfig, - MobileBertConfig, - MPNetConfig, - MT5Config, - OpenAIGPTConfig, - PegasusConfig, - ProphetNetConfig, - RagConfig, - ReformerConfig, - RetriBertConfig, - RobertaConfig, - RoFormerConfig, - Speech2TextConfig, - SqueezeBertConfig, - T5Config, - TapasConfig, - TransfoXLConfig, - Wav2Vec2Config, - XLMConfig, - XLMProphetNetConfig, - XLMRobertaConfig, - XLNetConfig, replace_list_option_in_docstrings, ) @@ -288,68 +235,7 @@ ] ) -<<<<<<< HEAD TOKENIZER_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, TOKENIZER_MAPPING_NAMES) -======= -TOKENIZER_MAPPING = OrderedDict( - [ - (RetriBertConfig, (RetriBertTokenizer, RetriBertTokenizerFast)), - (RoFormerConfig, (RoFormerTokenizer, RoFormerTokenizerFast)), - (T5Config, (T5Tokenizer, T5TokenizerFast)), - (MT5Config, (MT5Tokenizer, MT5TokenizerFast)), - (MobileBertConfig, (MobileBertTokenizer, MobileBertTokenizerFast)), - (DistilBertConfig, (DistilBertTokenizer, DistilBertTokenizerFast)), - (AlbertConfig, (AlbertTokenizer, AlbertTokenizerFast)), - (CamembertConfig, (CamembertTokenizer, CamembertTokenizerFast)), - (PegasusConfig, (PegasusTokenizer, PegasusTokenizerFast)), - (MBartConfig, (MBartTokenizer, MBartTokenizerFast)), - (XLMRobertaConfig, (XLMRobertaTokenizer, XLMRobertaTokenizerFast)), - (MarianConfig, (MarianTokenizer, None)), - (BlenderbotSmallConfig, (BlenderbotSmallTokenizer, None)), - (BlenderbotConfig, (BlenderbotTokenizer, None)), - (BartConfig, (BartTokenizer, BartTokenizerFast)), - (LongformerConfig, (LongformerTokenizer, LongformerTokenizerFast)), - (RobertaConfig, (RobertaTokenizer, RobertaTokenizerFast)), - (ReformerConfig, (ReformerTokenizer, ReformerTokenizerFast)), - (ElectraConfig, (ElectraTokenizer, ElectraTokenizerFast)), - (FunnelConfig, (FunnelTokenizer, FunnelTokenizerFast)), - (LxmertConfig, (LxmertTokenizer, LxmertTokenizerFast)), - (LayoutLMConfig, (LayoutLMTokenizer, LayoutLMTokenizerFast)), - (DPRConfig, (DPRQuestionEncoderTokenizer, DPRQuestionEncoderTokenizerFast)), - (SqueezeBertConfig, (SqueezeBertTokenizer, SqueezeBertTokenizerFast)), - (BertConfig, (BertTokenizer, BertTokenizerFast)), - (OpenAIGPTConfig, 
(OpenAIGPTTokenizer, OpenAIGPTTokenizerFast)), - (GPT2Config, (GPT2Tokenizer, GPT2TokenizerFast)), - (TransfoXLConfig, (TransfoXLTokenizer, None)), - (XLNetConfig, (XLNetTokenizer, XLNetTokenizerFast)), - (FlaubertConfig, (FlaubertTokenizer, None)), - (XLMConfig, (XLMTokenizer, None)), - (CLIPConfig, (CLIPTokenizer, None)), - (CTRLConfig, (CTRLTokenizer, None)), - (FSMTConfig, (FSMTTokenizer, None)), - (BertGenerationConfig, (BertGenerationTokenizer, None)), - (DebertaConfig, (DebertaTokenizer, DebertaTokenizerFast)), - (DebertaV2Config, (DebertaV2Tokenizer, None)), - (RagConfig, (RagTokenizer, None)), - (XLMProphetNetConfig, (XLMProphetNetTokenizer, None)), - (Speech2TextConfig, (Speech2TextTokenizer, None)), - (M2M100Config, (M2M100Tokenizer, None)), - (ProphetNetConfig, (ProphetNetTokenizer, None)), - (MPNetConfig, (MPNetTokenizer, MPNetTokenizerFast)), - (TapasConfig, (TapasTokenizer, None)), - (LEDConfig, (LEDTokenizer, LEDTokenizerFast)), - (ConvBertConfig, (ConvBertTokenizer, ConvBertTokenizerFast)), - (BigBirdConfig, (BigBirdTokenizer, BigBirdTokenizerFast)), - (IBertConfig, (RobertaTokenizer, RobertaTokenizerFast)), - (Wav2Vec2Config, (Wav2Vec2CTCTokenizer, None)), - (HubertConfig, (Wav2Vec2CTCTokenizer, None)), - (GPTNeoConfig, (GPT2Tokenizer, GPT2TokenizerFast)), - (LukeConfig, (LukeTokenizer, None)), - (BigBirdPegasusConfig, (PegasusTokenizer, PegasusTokenizerFast)), - (CanineConfig, (CanineTokenizer, None)), - ] -) ->>>>>>> 35ebd464f ([Proposal] Adding ZeroShotImageClassificationPipeline) CONFIG_TO_TYPE = {v: k for k, v in CONFIG_MAPPING_NAMES.items()} diff --git a/src/transformers/pipelines/__init__.py b/src/transformers/pipelines/__init__.py index 1eea5aeb6de503..d149d35cc3f90b 100755 --- a/src/transformers/pipelines/__init__.py +++ b/src/transformers/pipelines/__init__.py @@ -245,6 +245,7 @@ "tf": (), "pt": (AutoModel,) if is_torch_available() else (), "default": {"pt": "openai/clip-vit-base-patch32"}, + "type": "multimodal", }, "conversational": { "impl": ConversationalPipeline, diff --git a/src/transformers/pipelines/zero_shot_image_classification.py b/src/transformers/pipelines/zero_shot_image_classification.py index ef2863bef616af..2fecf1fa5c5c67 100644 --- a/src/transformers/pipelines/zero_shot_image_classification.py +++ b/src/transformers/pipelines/zero_shot_image_classification.py @@ -36,27 +36,14 @@ class of an image. `__. 
""" - def __init__( - self, - model: Union["PreTrainedModel", "TFPreTrainedModel"], - feature_extractor: PreTrainedFeatureExtractor, - tokenizer: PreTrainedTokenizer, - framework: Optional[str] = None, - **kwargs - ): - super().__init__( - model, feature_extractor=feature_extractor, tokenizer=tokenizer, framework=framework, **kwargs - ) + def __init__(self, **kwargs): + super().__init__(**kwargs) - if self.framework == "tf": + if self.framework != "pt": raise ValueError(f"The {self.__class__} is only available in PyTorch.") requires_backends(self, "vision") - - # self.check_model_type(MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING) - - self.feature_extractor = feature_extractor - self.tokenizer = tokenizer + self.check_model_type(MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING) @staticmethod def load_image(image: Union[str, "Image.Image"]): diff --git a/tests/test_pipelines_zero_shot_image_classification.py b/tests/test_pipelines_zero_shot_image_classification.py index 34a2d1a282256d..02b3624876d47e 100644 --- a/tests/test_pipelines_zero_shot_image_classification.py +++ b/tests/test_pipelines_zero_shot_image_classification.py @@ -14,9 +14,17 @@ import unittest -from transformers import AutoFeatureExtractor, AutoModel, AutoTokenizer, is_vision_available +from transformers import ( + MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING, + AutoFeatureExtractor, + AutoModel, + AutoTokenizer, + is_vision_available, +) from transformers.pipelines import ZeroShotImageClassificationPipeline, pipeline -from transformers.testing_utils import require_torch, require_vision +from transformers.testing_utils import is_pipeline_test, require_tf, require_torch, require_vision + +from .test_pipelines_common import ANY, PipelineTestCaseMeta if is_vision_available(): @@ -31,88 +39,45 @@ def open(*args, **kwargs): @require_vision @require_torch -class ZeroShotImageClassificationPipelineTests(unittest.TestCase): - pipeline_task = "zero-shot-image-classification" - small_models = ["openai/clip-vit-base-patch32"] # Models tested without the @slow decorator - simple_inputs = [ - {"images": "http://images.cocodataset.org/val2017/000000039769.jpg"}, - {"images": "./tests/fixtures/tests_samples/COCO/000000039769.png"}, - {"images": Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")}, - ] - batched_inputs = [ - { - "images": [ - "http://images.cocodataset.org/val2017/000000039769.jpg", - "http://images.cocodataset.org/val2017/000000039769.jpg", - ] - }, - { - "images": [ - "./tests/fixtures/tests_samples/COCO/000000039769.png", - "./tests/fixtures/tests_samples/COCO/000000039769.png", - ] - }, - { - "images": [ - Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png"), - Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png"), - ] - }, - { - "images": [ - Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png"), - "./tests/fixtures/tests_samples/COCO/000000039769.png", - ] - }, - ] - - def test_small_model_from_factory(self): - for small_model in self.small_models: - image_classifier = pipeline("zero-shot-image-classification", model=small_model) +@is_pipeline_test +class ZeroShotImageClassificationPipelineTests(unittest.TestCase, metaclass=PipelineTestCaseMeta): + model_mapping = MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING - candidate_labels = ["a dog", "a cat"] + def get_test_pipeline(self, model, tokenizer, feature_extractor): + if tokenizer is None: + # Side effect of no Fast Tokenizer class for these model, so skipping + # But the slow tokenizer test should still run as they're quite small + 
self.skipTest("No tokenizer available") + return + # return None, None - for valid_input in self.simple_inputs: - output = image_classifier(**valid_input, candidate_labels=candidate_labels) - self.assertTrue(isinstance(output, list)) - self.assertEqual(len(output), 2) - for label_result in output: - self.assertTrue(isinstance(label_result, dict)) - self.assertEqual(set(label_result.keys()), {"label", "score"}) + speech_recognizer = ZeroShotImageClassificationPipeline( + model=model, tokenizer=tokenizer, feature_extractor=feature_extractor + ) - for valid_input in self.batched_inputs: - output = image_classifier(**valid_input, candidate_labels=candidate_labels) - self.assertTrue(isinstance(output, list)) - self.assertEqual(len(output), 2) - for item in output: - for label_result in item: - self.assertTrue(isinstance(label_result, dict)) - self.assertEqual(set(label_result.keys()), {"label", "score"}) + # test with a raw waveform + image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") + image2 = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") + return speech_recognizer, [image, image2] - def test_small_model_from_pipeline(self): - for small_model in self.small_models: - model = AutoModel.from_pretrained(small_model) - feature_extractor = AutoFeatureExtractor.from_pretrained(small_model) - tokenizer = AutoTokenizer.from_pretrained(small_model) - image_classifier = ZeroShotImageClassificationPipeline( - model=model, feature_extractor=feature_extractor, tokenizer=tokenizer - ) + def run_pipeline_test(self, pipe, examples): + image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") + outputs = pipe(image, candidate_labels=["A", "B"]) + self.assertEqual(outputs, {"text": ANY(str)}) - candidate_labels = ["a dog", "a cat"] + # Batching + outputs = pipe([image] * 3, batch_size=2, candidate_labels=["A", "B"]) - for valid_input in self.simple_inputs: - output = image_classifier(**valid_input, candidate_labels=candidate_labels) - self.assertTrue(isinstance(output, list)) - self.assertEqual(len(output), 2) - for label_result in output: - self.assertTrue(isinstance(label_result, dict)) - self.assertEqual(set(label_result.keys()), {"label", "score"}) + @require_tf + def test_small_model_tf(self): + self.skipTest("Not implemented in Tensorflow") - for valid_input in self.batched_inputs: - output = image_classifier(**valid_input, candidate_labels=candidate_labels) - self.assertTrue(isinstance(output, list)) - self.assertEqual(len(output), 2) - for item in output: - for label_result in item: - self.assertTrue(isinstance(label_result, dict)) - self.assertEqual(set(label_result.keys()), {"label", "score"}) + @require_torch + def test_small_model_pt(self): + speech_recognizer = pipeline( + task="zero-shot-image-classification", + model="hf-internal-testing/tiny-random-clip", + ) + image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") + output = speech_recognizer(image, candidate_labels=["A", "B", "C"]) + self.assertEqual(output, {"text": "(Applaudissements)"}) From 617c6c7e9553c62a6a47904f88b7a35a17c314a3 Mon Sep 17 00:00:00 2001 From: Nicolas Patry Date: Wed, 16 Feb 2022 19:29:46 +0100 Subject: [PATCH 03/12] Resurrection... achieved. 
--- src/transformers/pipelines/base.py | 28 ++-- .../zero_shot_image_classification.py | 128 ++++++++---------- ...ipelines_zero_shot_image_classification.py | 122 ++++++++++++----- 3 files changed, 166 insertions(+), 112 deletions(-) diff --git a/src/transformers/pipelines/base.py b/src/transformers/pipelines/base.py index fbfe56375d86eb..81e4f01628be2c 100644 --- a/src/transformers/pipelines/base.py +++ b/src/transformers/pipelines/base.py @@ -84,7 +84,12 @@ def _pad(items, key, padding_value, padding_side): dtype = items[0][key].dtype if dim == 2: - tensor = torch.zeros((batch_size, max_length), dtype=dtype) + padding_value + try: + tensor = torch.zeros((batch_size, max_length), dtype=dtype) + padding_value + except Exception: + import ipdb + + ipdb.set_trace() elif dim == 3: tensor = torch.zeros((batch_size, max_length, shape[-1]), dtype=dtype) + padding_value @@ -105,7 +110,6 @@ def _pad(items, key, padding_value, padding_side): def pad_collate_fn(tokenizer, feature_extractor): - padding_side = "right" if tokenizer is None and feature_extractor is None: raise ValueError("Pipeline without tokenizer or feature_extractor cannot do batching") if tokenizer is not None: @@ -115,12 +119,12 @@ def pad_collate_fn(tokenizer, feature_extractor): "`pipe.tokenizer.pad_token_id = model.config.eos_token_id`." ) else: - padding_value = tokenizer.pad_token_id - padding_side = tokenizer.padding_side + t_padding_value = tokenizer.pad_token_id + t_padding_side = tokenizer.padding_side if feature_extractor is not None: # Feature extractor can be images, where no padding is expected - padding_value = getattr(feature_extractor, "padding_value", None) - padding_side = getattr(feature_extractor, "padding_side", None) + f_padding_value = getattr(feature_extractor, "padding_value", None) + f_padding_side = getattr(feature_extractor, "padding_side", None) def inner(items): keys = set(items[0].keys()) @@ -132,13 +136,19 @@ def inner(items): # input_values, input_pixels, input_ids, ... 
padded = {} for key in keys: - if key.startswith("input_"): - _padding_value = padding_value + if key == "input_ids": + _padding_value = t_padding_value + _padding_side = t_padding_side + if key in {"input_values", "pixel_values", "input_features"}: + _padding_value = f_padding_value + _padding_side = f_padding_side elif key == "p_mask": _padding_value = 1 + _padding_side = t_padding_side else: _padding_value = 0 - padded[key] = _pad(items, key, _padding_value, padding_side) + _padding_side = f_padding_side + padded[key] = _pad(items, key, _padding_value, _padding_side) return padded return inner diff --git a/src/transformers/pipelines/zero_shot_image_classification.py b/src/transformers/pipelines/zero_shot_image_classification.py index 2fecf1fa5c5c67..0bc9be879ae0cb 100644 --- a/src/transformers/pipelines/zero_shot_image_classification.py +++ b/src/transformers/pipelines/zero_shot_image_classification.py @@ -1,22 +1,15 @@ -import os -from typing import TYPE_CHECKING, List, Optional, Union +from typing import List, Union -import requests - -from ..feature_extraction_utils import PreTrainedFeatureExtractor from ..file_utils import add_end_docstrings, is_torch_available, is_vision_available, requires_backends -from ..tokenization_utils import PreTrainedTokenizer from ..utils import logging -from .base import PIPELINE_INIT_ARGS, Pipeline - +from .base import PIPELINE_INIT_ARGS, ChunkPipeline -if TYPE_CHECKING: - from ..modeling_tf_utils import TFPreTrainedModel - from ..modeling_utils import PreTrainedModel if is_vision_available(): from PIL import Image + from ..image_utils import load_image + if is_torch_available(): import torch @@ -24,7 +17,7 @@ @add_end_docstrings(PIPELINE_INIT_ARGS) -class ZeroShotImageClassificationPipeline(Pipeline): +class ZeroShotImageClassificationPipeline(ChunkPipeline): """ Image classification pipeline using any :obj:`AutoModelForZeroShotImageClassification`. This pipeline predicts the class of an image. @@ -43,30 +36,10 @@ def __init__(self, **kwargs): raise ValueError(f"The {self.__class__} is only available in PyTorch.") requires_backends(self, "vision") - self.check_model_type(MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING) - - @staticmethod - def load_image(image: Union[str, "Image.Image"]): - if isinstance(image, str): - if image.startswith("http://") or image.startswith("https://"): - # We need to actually check for a real protocol, otherwise it's impossible to use a local file - # like http_huggingface_co.png - return Image.open(requests.get(image, stream=True).raw) - elif os.path.isfile(image): - return Image.open(image) - elif isinstance(image, Image.Image): - return image - - raise ValueError( - "Incorrect format used for image. Should be an url linking to an image, a local path, or a PIL image." - ) - - def __call__( - self, - images: Union[str, List[str], "Image", List["Image"]], - candidate_labels: List[str], - hypothesis_template: str = "a photo of {}", - ): + # No specific FOR_XXX available yet + # self.check_model_type(MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING) + + def __call__(self, images: Union[str, List[str], "Image", List["Image"]], **kwargs): """ Assign labels to the image(s) passed as inputs. @@ -98,36 +71,55 @@ def __call__( - **label** (:obj:`str`) -- The label identified by the model. - **score** (:obj:`int`) -- The score attributed by the model for that label. 
""" - is_batched = isinstance(images, list) - - if not is_batched: - images = [images] - - images = [self.load_image(image) for image in images] - - with torch.no_grad(): - images = self.feature_extractor(images=images, return_tensors="pt") - inputs = self.tokenizer(candidate_labels, return_tensors="pt") + return super().__call__(images, **kwargs) + + def _sanitize_parameters(self, **kwargs): + preprocess_params = {} + if "candidate_labels" in kwargs: + preprocess_params["candidate_labels"] = kwargs["candidate_labels"] + if "hypothesis_template" in kwargs: + preprocess_params["hypothesis_template"] = kwargs["hypothesis_template"] + + postprocess_params = {} + if "multi_label" in kwargs: + postprocess_params["multi_label"] = kwargs["multi_label"] + return preprocess_params, {}, postprocess_params + + def preprocess(self, image, candidate_labels=None, hypothesis_template="This is a photo of {}."): + n = len(candidate_labels) + for i, candidate_label in enumerate(candidate_labels): + image = load_image(image) + images = self.feature_extractor(images=[image], return_tensors="pt") + sequence = hypothesis_template.format(candidate_label) + inputs = self.tokenizer(sequence, return_tensors="pt") inputs["pixel_values"] = images.pixel_values - outputs = self.model(**inputs) - - logits_per_image = outputs.logits_per_image # this is the image-text similarity score - probs = logits_per_image.softmax(dim=1) # we can take the softmax to get the label probabilities - scores = probs.tolist() - - if not is_batched: - scores = scores[0] - labels = [ - {"score": score, "label": candidate_label} - for score, candidate_label in sorted(zip(scores, candidate_labels), key=lambda x: -x[0]) - ] - else: - labels = [] - all_scores = scores - for scores in all_scores: - element_labels = [ - {"score": score, "label": candidate_label} - for score, candidate_label in sorted(zip(scores, candidate_labels), key=lambda x: -x[0]) - ] - labels.append(element_labels) - return labels + yield {"is_last": i == n - 1, "candidate_label": candidate_label, **inputs} + + def _forward(self, model_inputs): + is_last = model_inputs.pop("is_last") + candidate_label = model_inputs.pop("candidate_label") + outputs = self.model(**model_inputs) + + # Clip does crossproduct scoring by default, so we're only + # interested in the results where image and text and in the same + # batch position. 
+ logits_per_image = torch.diagonal(outputs.logits_per_image) + + model_outputs = { + "is_last": is_last, + "candidate_label": candidate_label, + "logits_per_image": logits_per_image, + } + return model_outputs + + def postprocess(self, model_outputs, multi_label=False): + candidate_labels = [outputs["candidate_label"] for outputs in model_outputs] + logits = torch.cat([output["logits_per_image"] for output in model_outputs]) + probs = logits.softmax(dim=0) + scores = probs.tolist() + + result = [ + {"score": score, "label": candidate_label} + for score, candidate_label in sorted(zip(scores, candidate_labels), key=lambda x: -x[0]) + ] + return result diff --git a/tests/test_pipelines_zero_shot_image_classification.py b/tests/test_pipelines_zero_shot_image_classification.py index 02b3624876d47e..1d0aa7df1264fe 100644 --- a/tests/test_pipelines_zero_shot_image_classification.py +++ b/tests/test_pipelines_zero_shot_image_classification.py @@ -14,17 +14,18 @@ import unittest -from transformers import ( - MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING, - AutoFeatureExtractor, - AutoModel, - AutoTokenizer, - is_vision_available, +from transformers import is_vision_available +from transformers.pipelines import pipeline +from transformers.testing_utils import ( + is_pipeline_test, + nested_simplify, + require_tf, + require_torch, + require_vision, + slow, ) -from transformers.pipelines import ZeroShotImageClassificationPipeline, pipeline -from transformers.testing_utils import is_pipeline_test, require_tf, require_torch, require_vision -from .test_pipelines_common import ANY, PipelineTestCaseMeta +from .test_pipelines_common import PipelineTestCaseMeta if is_vision_available(): @@ -41,32 +42,34 @@ def open(*args, **kwargs): @require_torch @is_pipeline_test class ZeroShotImageClassificationPipelineTests(unittest.TestCase, metaclass=PipelineTestCaseMeta): - model_mapping = MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING - - def get_test_pipeline(self, model, tokenizer, feature_extractor): - if tokenizer is None: - # Side effect of no Fast Tokenizer class for these model, so skipping - # But the slow tokenizer test should still run as they're quite small - self.skipTest("No tokenizer available") - return - # return None, None - - speech_recognizer = ZeroShotImageClassificationPipeline( - model=model, tokenizer=tokenizer, feature_extractor=feature_extractor - ) + # Deactivating auto tests since we don't have a good MODEL_FOR_XX mapping, + # and only CLIP would be there for now. 
+ # model_mapping = {CLIPConfig: CLIPModel} - # test with a raw waveform - image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") - image2 = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") - return speech_recognizer, [image, image2] + # def get_test_pipeline(self, model, tokenizer, feature_extractor): + # if tokenizer is None: + # # Side effect of no Fast Tokenizer class for these model, so skipping + # # But the slow tokenizer test should still run as they're quite small + # self.skipTest("No tokenizer available") + # return + # # return None, None - def run_pipeline_test(self, pipe, examples): - image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") - outputs = pipe(image, candidate_labels=["A", "B"]) - self.assertEqual(outputs, {"text": ANY(str)}) + # speech_recognizer = ZeroShotImageClassificationPipeline( + # model=model, tokenizer=tokenizer, feature_extractor=feature_extractor + # ) + + # # test with a raw waveform + # image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") + # image2 = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") + # return speech_recognizer, [image, image2] + + # def run_pipeline_test(self, pipe, examples): + # image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") + # outputs = pipe(image, candidate_labels=["A", "B"]) + # self.assertEqual(outputs, {"text": ANY(str)}) - # Batching - outputs = pipe([image] * 3, batch_size=2, candidate_labels=["A", "B"]) + # # Batching + # outputs = pipe([image] * 3, batch_size=2, candidate_labels=["A", "B"]) @require_tf def test_small_model_tf(self): @@ -74,10 +77,59 @@ def test_small_model_tf(self): @require_torch def test_small_model_pt(self): + speech_recognizer = pipeline( + model="hf-internal-testing/tiny-random-clip-zero-shot-image-classification", + ) + image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") + output = speech_recognizer(image, candidate_labels=["a", "b", "c"]) + + self.assertEqual( + nested_simplify(output), + [{"score": 0.333, "label": "a"}, {"score": 0.333, "label": "b"}, {"score": 0.333, "label": "c"}], + ) + + output = speech_recognizer([image] * 5, candidate_labels=["A", "B", "C"], batch_size=2) + self.assertEqual( + nested_simplify(output), + [ + [{"score": 0.333, "label": "B"}, {"score": 0.333, "label": "A"}, {"score": 0.333, "label": "C"}], + # Very odd inversion, but it's a random model, floating errors might account for this since all scores are similar. 
+ [{"score": 0.333, "label": "A"}, {"score": 0.333, "label": "C"}, {"score": 0.333, "label": "B"}], + [{"score": 0.333, "label": "B"}, {"score": 0.333, "label": "A"}, {"score": 0.333, "label": "C"}], + [{"score": 0.333, "label": "A"}, {"score": 0.333, "label": "C"}, {"score": 0.333, "label": "B"}], + [{"score": 0.333, "label": "B"}, {"score": 0.333, "label": "A"}, {"score": 0.333, "label": "C"}], + ], + ) + + @slow + @require_torch + def test_large_model_pt(self): speech_recognizer = pipeline( task="zero-shot-image-classification", - model="hf-internal-testing/tiny-random-clip", + model="openai/clip-vit-base-patch32", ) + # This is an image of 2 cats with remotes and no planes image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") - output = speech_recognizer(image, candidate_labels=["A", "B", "C"]) - self.assertEqual(output, {"text": "(Applaudissements)"}) + output = speech_recognizer(image, candidate_labels=["cat", "plane", "remote"]) + + self.assertEqual( + nested_simplify(output), + [ + {"score": 0.941, "label": "cat"}, + {"score": 0.055, "label": "remote"}, + {"score": 0.003, "label": "plane"}, + ], + ) + + output = speech_recognizer([image] * 5, candidate_labels=["cat", "plane", "remote"], batch_size=2) + self.assertEqual( + nested_simplify(output), + [ + [ + {"score": 0.941, "label": "cat"}, + {"score": 0.055, "label": "remote"}, + {"score": 0.003, "label": "plane"}, + ], + ] + * 5, + ) From 51511a7b0498dfce4c98566856ee5864d338cad9 Mon Sep 17 00:00:00 2001 From: Nicolas Patry Date: Wed, 16 Feb 2022 19:46:14 +0100 Subject: [PATCH 04/12] Reword handling different `padding_value` for `feature_extractor` and `tokenizer`. --- src/transformers/pipelines/base.py | 34 ++++++++++++++++++------------ 1 file changed, 20 insertions(+), 14 deletions(-) diff --git a/src/transformers/pipelines/base.py b/src/transformers/pipelines/base.py index 81e4f01628be2c..8d80075d949d8c 100644 --- a/src/transformers/pipelines/base.py +++ b/src/transformers/pipelines/base.py @@ -84,12 +84,7 @@ def _pad(items, key, padding_value, padding_side): dtype = items[0][key].dtype if dim == 2: - try: - tensor = torch.zeros((batch_size, max_length), dtype=dtype) + padding_value - except Exception: - import ipdb - - ipdb.set_trace() + tensor = torch.zeros((batch_size, max_length), dtype=dtype) + padding_value elif dim == 3: tensor = torch.zeros((batch_size, max_length, shape[-1]), dtype=dtype) + padding_value @@ -110,6 +105,8 @@ def _pad(items, key, padding_value, padding_side): def pad_collate_fn(tokenizer, feature_extractor): + t_padding_side = None + f_padding_side = None if tokenizer is None and feature_extractor is None: raise ValueError("Pipeline without tokenizer or feature_extractor cannot do batching") if tokenizer is not None: @@ -126,6 +123,16 @@ def pad_collate_fn(tokenizer, feature_extractor): f_padding_value = getattr(feature_extractor, "padding_value", None) f_padding_side = getattr(feature_extractor, "padding_side", None) + if t_padding_side is not None and f_padding_side is not None and t_padding_side != f_padding_side: + raise ValueError( + f"The feature extractor, and tokenizer don't agree on padding side {t_padding_side} != {f_padding_side}" + ) + padding_side = "right" + if t_padding_side is not None: + padding_side = t_padding_side + if f_padding_side is not None: + padding_side = f_padding_side + def inner(items): keys = set(items[0].keys()) for item in items: @@ -136,19 +143,18 @@ def inner(items): # input_values, input_pixels, input_ids, ... 
padded = {} for key in keys: - if key == "input_ids": + if key in {"input_ids"}: _padding_value = t_padding_value - _padding_side = t_padding_side - if key in {"input_values", "pixel_values", "input_features"}: + elif key in {"input_values", "pixel_values", "input_features"}: _padding_value = f_padding_value - _padding_side = f_padding_side - elif key == "p_mask": + elif key in {"p_mask"}: _padding_value = 1 - _padding_side = t_padding_side + elif key in {"attention_mask", "token_type_ids"}: + _padding_value = 0 else: + # This is likely another random key maybe even user provided _padding_value = 0 - _padding_side = f_padding_side - padded[key] = _pad(items, key, _padding_value, _padding_side) + padded[key] = _pad(items, key, _padding_value, padding_side) return padded return inner From 453f089fa0469bda4d3e8c1c39adff209793e06f Mon Sep 17 00:00:00 2001 From: Nicolas Patry Date: Wed, 16 Feb 2022 19:53:25 +0100 Subject: [PATCH 05/12] Thanks doc-builder ! --- .../zero_shot_image_classification.py | 21 +++++++++---------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/src/transformers/pipelines/zero_shot_image_classification.py b/src/transformers/pipelines/zero_shot_image_classification.py index 0bc9be879ae0cb..c149fbb38c8212 100644 --- a/src/transformers/pipelines/zero_shot_image_classification.py +++ b/src/transformers/pipelines/zero_shot_image_classification.py @@ -19,14 +19,13 @@ @add_end_docstrings(PIPELINE_INIT_ARGS) class ZeroShotImageClassificationPipeline(ChunkPipeline): """ - Image classification pipeline using any :obj:`AutoModelForZeroShotImageClassification`. This pipeline predicts the + Image classification pipeline using any `AutoModelForZeroShotImageClassification`. This pipeline predicts the class of an image. - This image classification pipeline can currently be loaded from :func:`~transformers.pipeline` using the following - task identifier: :obj:`"image-classification"`. + This image classification pipeline can currently be loaded from [`pipeline`] using the following + task identifier: `"image-classification"`. - See the list of available models on `huggingface.co/models - `__. + See the list of available models on [huggingface.co/models](https://huggingface.co/models?filter=image-classification). """ def __init__(self, **kwargs): @@ -44,7 +43,7 @@ def __call__(self, images: Union[str, List[str], "Image", List["Image"]], **kwar Assign labels to the image(s) passed as inputs. Args: - images (:obj:`str`, :obj:`List[str]`, :obj:`PIL.Image` or :obj:`List[PIL.Image]`): + images (`str`, `List[str]`, `PIL.Image` or `List[PIL.Image]`): The pipeline handles three types of images: - A string containing a http link pointing to an image @@ -54,10 +53,10 @@ def __call__(self, images: Union[str, List[str], "Image", List["Image"]], **kwar The pipeline accepts either a single image or a batch of images, which must then be passed as a string. Images in a batch must all be in the same format: all as http links, all as local paths, or all as PIL images. 
- candidate_labels (:obj:`List[str]`): + candidate_labels (`List[str]`): The candidate labels for this image - hypothesis_template (:obj:`str`, `optional`, defaults to :obj:`"This is a photo of a {}"`): - The sentence used in cunjunction with `candidate_labels` to attempt the image classification by + hypothesis_template (`str`, *optional*, defaults to `"This is a photo of a {}"`): + The sentence used in cunjunction with *candidate_labels* to attempt the image classification by replacing the placeholder with the candidate_labels. Then likelihood is estimated by using likelihood_per_image @@ -68,8 +67,8 @@ def __call__(self, images: Union[str, List[str], "Image", List["Image"]], **kwar The dictionaries contain the following keys: - - **label** (:obj:`str`) -- The label identified by the model. - - **score** (:obj:`int`) -- The score attributed by the model for that label. + - **label** (`str`) -- The label identified by the model. + - **score** (`int`) -- The score attributed by the model for that label. """ return super().__call__(images, **kwargs) From 711ff5202bb88fd4493bde13c3859bf37e37254d Mon Sep 17 00:00:00 2001 From: Nicolas Patry Date: Wed, 16 Feb 2022 20:13:06 +0100 Subject: [PATCH 06/12] Adding docs + global namespace `ZeroShotImageClassificationPipeline`. --- docs/source/main_classes/pipelines.mdx | 6 ++++++ src/transformers/__init__.py | 1 + .../pipelines/zero_shot_image_classification.py | 11 ++++++----- 3 files changed, 13 insertions(+), 5 deletions(-) diff --git a/docs/source/main_classes/pipelines.mdx b/docs/source/main_classes/pipelines.mdx index 6f5b5b74706591..b5c51229ca55d8 100644 --- a/docs/source/main_classes/pipelines.mdx +++ b/docs/source/main_classes/pipelines.mdx @@ -428,6 +428,12 @@ See [`TokenClassificationPipeline`] for all details. - __call__ - all +### ZeroShotImageClassificationPipeline + +[[autodoc]] ZeroShotImageClassificationPipeline + - __call__ + - all + ## Parent class: `Pipeline` [[autodoc]] Pipeline diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index ad05486104ee79..de4f4aa399eea7 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -363,6 +363,7 @@ "TokenClassificationPipeline", "TranslationPipeline", "ZeroShotClassificationPipeline", + "ZeroShotImageClassificationPipeline", "pipeline", ], "processing_utils": ["ProcessorMixin"], diff --git a/src/transformers/pipelines/zero_shot_image_classification.py b/src/transformers/pipelines/zero_shot_image_classification.py index c149fbb38c8212..60071072a78ee0 100644 --- a/src/transformers/pipelines/zero_shot_image_classification.py +++ b/src/transformers/pipelines/zero_shot_image_classification.py @@ -19,13 +19,14 @@ @add_end_docstrings(PIPELINE_INIT_ARGS) class ZeroShotImageClassificationPipeline(ChunkPipeline): """ - Image classification pipeline using any `AutoModelForZeroShotImageClassification`. This pipeline predicts the - class of an image. + Image classification pipeline using any `AutoModelForZeroShotImageClassification`. This pipeline predicts the class + of an image. - This image classification pipeline can currently be loaded from [`pipeline`] using the following - task identifier: `"image-classification"`. + This image classification pipeline can currently be loaded from [`pipeline`] using the following task identifier: + `"image-classification"`. - See the list of available models on [huggingface.co/models](https://huggingface.co/models?filter=image-classification). 
+ See the list of available models on + [huggingface.co/models](https://huggingface.co/models?filter=image-classification). """ def __init__(self, **kwargs): From aaf02ee77fb5ae231ceb8ef391612b69166e8c00 Mon Sep 17 00:00:00 2001 From: Nicolas Patry Date: Wed, 16 Feb 2022 21:26:23 +0100 Subject: [PATCH 07/12] Fixing templates. --- src/transformers/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index de4f4aa399eea7..18d78840ba8134 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -2569,6 +2569,7 @@ TokenClassificationPipeline, TranslationPipeline, ZeroShotClassificationPipeline, + ZeroShotImageClassificationPipeline, pipeline, ) from .processing_utils import ProcessorMixin From 7de48f46d9b210b7a50f406b8647d030dd2e2d95 Mon Sep 17 00:00:00 2001 From: Nicolas Patry Date: Wed, 16 Feb 2022 22:51:54 +0100 Subject: [PATCH 08/12] Make the test pass and be robust to floating error. --- .../zero_shot_image_classification.py | 1 + ...ipelines_zero_shot_image_classification.py | 39 +++++++++++++++---- 2 files changed, 33 insertions(+), 7 deletions(-) diff --git a/src/transformers/pipelines/zero_shot_image_classification.py b/src/transformers/pipelines/zero_shot_image_classification.py index 60071072a78ee0..c85e8d361584e9 100644 --- a/src/transformers/pipelines/zero_shot_image_classification.py +++ b/src/transformers/pipelines/zero_shot_image_classification.py @@ -115,6 +115,7 @@ def _forward(self, model_inputs): def postprocess(self, model_outputs, multi_label=False): candidate_labels = [outputs["candidate_label"] for outputs in model_outputs] logits = torch.cat([output["logits_per_image"] for output in model_outputs]) + print("Logits", logits) probs = logits.softmax(dim=0) scores = probs.tolist() diff --git a/tests/test_pipelines_zero_shot_image_classification.py b/tests/test_pipelines_zero_shot_image_classification.py index 1d0aa7df1264fe..568dc7288d694e 100644 --- a/tests/test_pipelines_zero_shot_image_classification.py +++ b/tests/test_pipelines_zero_shot_image_classification.py @@ -25,7 +25,7 @@ slow, ) -from .test_pipelines_common import PipelineTestCaseMeta +from .test_pipelines_common import ANY, PipelineTestCaseMeta if is_vision_available(): @@ -91,13 +91,38 @@ def test_small_model_pt(self): output = speech_recognizer([image] * 5, candidate_labels=["A", "B", "C"], batch_size=2) self.assertEqual( nested_simplify(output), + # Pipeline outputs are supposed to be deterministic and + # So we could in theory have real values "A", "B", "C" instead + # of ANY(str). + # However it seems that in this particular case, the floating + # scores are so close, we enter floating error approximation + # and the order is not guaranteed anymore with batching. [ - [{"score": 0.333, "label": "B"}, {"score": 0.333, "label": "A"}, {"score": 0.333, "label": "C"}], - # Very odd inversion, but it's a random model, floating errors might account for this since all scores are similar. 
- [{"score": 0.333, "label": "A"}, {"score": 0.333, "label": "C"}, {"score": 0.333, "label": "B"}], - [{"score": 0.333, "label": "B"}, {"score": 0.333, "label": "A"}, {"score": 0.333, "label": "C"}], - [{"score": 0.333, "label": "A"}, {"score": 0.333, "label": "C"}, {"score": 0.333, "label": "B"}], - [{"score": 0.333, "label": "B"}, {"score": 0.333, "label": "A"}, {"score": 0.333, "label": "C"}], + [ + {"score": 0.333, "label": ANY(str)}, + {"score": 0.333, "label": ANY(str)}, + {"score": 0.333, "label": ANY(str)}, + ], + [ + {"score": 0.333, "label": ANY(str)}, + {"score": 0.333, "label": ANY(str)}, + {"score": 0.333, "label": ANY(str)}, + ], + [ + {"score": 0.333, "label": ANY(str)}, + {"score": 0.333, "label": ANY(str)}, + {"score": 0.333, "label": ANY(str)}, + ], + [ + {"score": 0.333, "label": ANY(str)}, + {"score": 0.333, "label": ANY(str)}, + {"score": 0.333, "label": ANY(str)}, + ], + [ + {"score": 0.333, "label": ANY(str)}, + {"score": 0.333, "label": ANY(str)}, + {"score": 0.333, "label": ANY(str)}, + ], ], ) From 5a6401e4919d12a2bb65c003bb2b718692bfa148 Mon Sep 17 00:00:00 2001 From: Nicolas Patry Date: Thu, 17 Feb 2022 09:06:48 +0100 Subject: [PATCH 09/12] Adressing suraj's comments on docs mostly. --- src/transformers/pipelines/base.py | 2 ++ .../zero_shot_image_classification.py | 32 +++++++------------ ...ipelines_zero_shot_image_classification.py | 16 +++++----- 3 files changed, 22 insertions(+), 28 deletions(-) diff --git a/src/transformers/pipelines/base.py b/src/transformers/pipelines/base.py index 8d80075d949d8c..62e3abf37ecd58 100644 --- a/src/transformers/pipelines/base.py +++ b/src/transformers/pipelines/base.py @@ -105,7 +105,9 @@ def _pad(items, key, padding_value, padding_side): def pad_collate_fn(tokenizer, feature_extractor): + # Tokenizer t_padding_side = None + # Feature extractor f_padding_side = None if tokenizer is None and feature_extractor is None: raise ValueError("Pipeline without tokenizer or feature_extractor cannot do batching") diff --git a/src/transformers/pipelines/zero_shot_image_classification.py b/src/transformers/pipelines/zero_shot_image_classification.py index c85e8d361584e9..65836c584dcc46 100644 --- a/src/transformers/pipelines/zero_shot_image_classification.py +++ b/src/transformers/pipelines/zero_shot_image_classification.py @@ -19,14 +19,14 @@ @add_end_docstrings(PIPELINE_INIT_ARGS) class ZeroShotImageClassificationPipeline(ChunkPipeline): """ - Image classification pipeline using any `AutoModelForZeroShotImageClassification`. This pipeline predicts the class - of an image. + Zero shot image classification pipeline using `CLIPModel`. This pipeline predicts the class of an image when you + provide an image and a set of `candidate_labels`. This image classification pipeline can currently be loaded from [`pipeline`] using the following task identifier: - `"image-classification"`. + `"zero-shot-image-classification"`. See the list of available models on - [huggingface.co/models](https://huggingface.co/models?filter=image-classification). + [huggingface.co/models](https://huggingface.co/models?filter=zer-shot-image-classification). """ def __init__(self, **kwargs): @@ -51,25 +51,20 @@ def __call__(self, images: Union[str, List[str], "Image", List["Image"]], **kwar - A string containing a local path to an image - An image loaded in PIL directly - The pipeline accepts either a single image or a batch of images, which must then be passed as a string. 
- Images in a batch must all be in the same format: all as http links, all as local paths, or all as PIL - images. candidate_labels (`List[str]`): The candidate labels for this image - hypothesis_template (`str`, *optional*, defaults to `"This is a photo of a {}"`): + + hypothesis_template (`str`, *optional*, defaults to `"This is a photo of {}"`): The sentence used in cunjunction with *candidate_labels* to attempt the image classification by replacing the placeholder with the candidate_labels. Then likelihood is estimated by using - likelihood_per_image + logits_per_image Return: - A dictionary or a list of dictionaries containing result. If the input is a single image, will return a - dictionary, if the input is a list of several images, will return a list of dictionaries corresponding to - the images. - - The dictionaries contain the following keys: + A list of dictionaries containing result, one dictionnary per proposed label. The dictionaries contain the + following keys: - - **label** (`str`) -- The label identified by the model. - - **score** (`int`) -- The score attributed by the model for that label. + - **label** (`str`) -- The label identified by the model. It is one of the suggested `candidate_label`. + - **score** (`int`) -- The score attributed by the model for that label (between 0 and 1). """ return super().__call__(images, **kwargs) @@ -80,10 +75,7 @@ def _sanitize_parameters(self, **kwargs): if "hypothesis_template" in kwargs: preprocess_params["hypothesis_template"] = kwargs["hypothesis_template"] - postprocess_params = {} - if "multi_label" in kwargs: - postprocess_params["multi_label"] = kwargs["multi_label"] - return preprocess_params, {}, postprocess_params + return preprocess_params, {}, {} def preprocess(self, image, candidate_labels=None, hypothesis_template="This is a photo of {}."): n = len(candidate_labels) diff --git a/tests/test_pipelines_zero_shot_image_classification.py b/tests/test_pipelines_zero_shot_image_classification.py index 568dc7288d694e..65272b3a5bab0d 100644 --- a/tests/test_pipelines_zero_shot_image_classification.py +++ b/tests/test_pipelines_zero_shot_image_classification.py @@ -54,14 +54,14 @@ class ZeroShotImageClassificationPipelineTests(unittest.TestCase, metaclass=Pipe # return # # return None, None - # speech_recognizer = ZeroShotImageClassificationPipeline( + # image_classifier = ZeroShotImageClassificationPipeline( # model=model, tokenizer=tokenizer, feature_extractor=feature_extractor # ) # # test with a raw waveform # image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") # image2 = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") - # return speech_recognizer, [image, image2] + # return image_classifier, [image, image2] # def run_pipeline_test(self, pipe, examples): # image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") @@ -77,18 +77,18 @@ def test_small_model_tf(self): @require_torch def test_small_model_pt(self): - speech_recognizer = pipeline( + image_classifier = pipeline( model="hf-internal-testing/tiny-random-clip-zero-shot-image-classification", ) image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") - output = speech_recognizer(image, candidate_labels=["a", "b", "c"]) + output = image_classifier(image, candidate_labels=["a", "b", "c"]) self.assertEqual( nested_simplify(output), [{"score": 0.333, "label": "a"}, {"score": 0.333, "label": "b"}, {"score": 0.333, "label": "c"}], ) - output = speech_recognizer([image] * 5, candidate_labels=["A", "B", 
"C"], batch_size=2) + output = image_classifier([image] * 5, candidate_labels=["A", "B", "C"], batch_size=2) self.assertEqual( nested_simplify(output), # Pipeline outputs are supposed to be deterministic and @@ -129,13 +129,13 @@ def test_small_model_pt(self): @slow @require_torch def test_large_model_pt(self): - speech_recognizer = pipeline( + image_classifier = pipeline( task="zero-shot-image-classification", model="openai/clip-vit-base-patch32", ) # This is an image of 2 cats with remotes and no planes image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") - output = speech_recognizer(image, candidate_labels=["cat", "plane", "remote"]) + output = image_classifier(image, candidate_labels=["cat", "plane", "remote"]) self.assertEqual( nested_simplify(output), @@ -146,7 +146,7 @@ def test_large_model_pt(self): ], ) - output = speech_recognizer([image] * 5, candidate_labels=["cat", "plane", "remote"], batch_size=2) + output = image_classifier([image] * 5, candidate_labels=["cat", "plane", "remote"], batch_size=2) self.assertEqual( nested_simplify(output), [ From df0faf800351190ff45afecf4088e59bf4e25930 Mon Sep 17 00:00:00 2001 From: Nicolas Patry Date: Thu, 17 Feb 2022 14:51:38 +0100 Subject: [PATCH 10/12] Tf support start. --- src/transformers/pipelines/__init__.py | 4 +- ...ipelines_zero_shot_image_classification.py | 88 +++++++++++++++++-- 2 files changed, 85 insertions(+), 7 deletions(-) diff --git a/src/transformers/pipelines/__init__.py b/src/transformers/pipelines/__init__.py index d149d35cc3f90b..4e20e71240b318 100755 --- a/src/transformers/pipelines/__init__.py +++ b/src/transformers/pipelines/__init__.py @@ -242,9 +242,9 @@ }, "zero-shot-image-classification": { "impl": ZeroShotImageClassificationPipeline, - "tf": (), + "tf": (TFAutoModel,) if is_tf_available() else (), "pt": (AutoModel,) if is_torch_available() else (), - "default": {"pt": "openai/clip-vit-base-patch32"}, + "default": {"pt": "openai/clip-vit-base-patch32", "tf": "openai/clip-vit-base-patch32"}, "type": "multimodal", }, "conversational": { diff --git a/tests/test_pipelines_zero_shot_image_classification.py b/tests/test_pipelines_zero_shot_image_classification.py index 65272b3a5bab0d..c314b92a0b141d 100644 --- a/tests/test_pipelines_zero_shot_image_classification.py +++ b/tests/test_pipelines_zero_shot_image_classification.py @@ -39,7 +39,6 @@ def open(*args, **kwargs): @require_vision -@require_torch @is_pipeline_test class ZeroShotImageClassificationPipelineTests(unittest.TestCase, metaclass=PipelineTestCaseMeta): # Deactivating auto tests since we don't have a good MODEL_FOR_XX mapping, @@ -71,10 +70,6 @@ class ZeroShotImageClassificationPipelineTests(unittest.TestCase, metaclass=Pipe # # Batching # outputs = pipe([image] * 3, batch_size=2, candidate_labels=["A", "B"]) - @require_tf - def test_small_model_tf(self): - self.skipTest("Not implemented in Tensorflow") - @require_torch def test_small_model_pt(self): image_classifier = pipeline( @@ -126,6 +121,57 @@ def test_small_model_pt(self): ], ) + @require_tf + def test_small_model_tf(self): + image_classifier = pipeline( + model="hf-internal-testing/tiny-random-clip-zero-shot-image-classification", framework="tf" + ) + image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") + output = image_classifier(image, candidate_labels=["a", "b", "c"]) + + self.assertEqual( + nested_simplify(output), + [{"score": 0.333, "label": "a"}, {"score": 0.333, "label": "b"}, {"score": 0.333, "label": "c"}], + ) + + output = 
image_classifier([image] * 5, candidate_labels=["A", "B", "C"], batch_size=2) + self.assertEqual( + nested_simplify(output), + # Pipeline outputs are supposed to be deterministic and + # So we could in theory have real values "A", "B", "C" instead + # of ANY(str). + # However it seems that in this particular case, the floating + # scores are so close, we enter floating error approximation + # and the order is not guaranteed anymore with batching. + [ + [ + {"score": 0.333, "label": ANY(str)}, + {"score": 0.333, "label": ANY(str)}, + {"score": 0.333, "label": ANY(str)}, + ], + [ + {"score": 0.333, "label": ANY(str)}, + {"score": 0.333, "label": ANY(str)}, + {"score": 0.333, "label": ANY(str)}, + ], + [ + {"score": 0.333, "label": ANY(str)}, + {"score": 0.333, "label": ANY(str)}, + {"score": 0.333, "label": ANY(str)}, + ], + [ + {"score": 0.333, "label": ANY(str)}, + {"score": 0.333, "label": ANY(str)}, + {"score": 0.333, "label": ANY(str)}, + ], + [ + {"score": 0.333, "label": ANY(str)}, + {"score": 0.333, "label": ANY(str)}, + {"score": 0.333, "label": ANY(str)}, + ], + ], + ) + @slow @require_torch def test_large_model_pt(self): @@ -158,3 +204,35 @@ def test_large_model_pt(self): ] * 5, ) + + @slow + @require_tf + def test_large_model_tf(self): + image_classifier = pipeline( + task="zero-shot-image-classification", model="openai/clip-vit-base-patch32", framework="tf" + ) + # This is an image of 2 cats with remotes and no planes + image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") + output = image_classifier(image, candidate_labels=["cat", "plane", "remote"]) + + self.assertEqual( + nested_simplify(output), + [ + {"score": 0.941, "label": "cat"}, + {"score": 0.055, "label": "remote"}, + {"score": 0.003, "label": "plane"}, + ], + ) + + output = image_classifier([image] * 5, candidate_labels=["cat", "plane", "remote"], batch_size=2) + self.assertEqual( + nested_simplify(output), + [ + [ + {"score": 0.941, "label": "cat"}, + {"score": 0.055, "label": "remote"}, + {"score": 0.003, "label": "plane"}, + ], + ] + * 5, + ) From 08cfaae8e4a2b02b02a9d1a30c3607345d2df044 Mon Sep 17 00:00:00 2001 From: Nicolas Patry Date: Thu, 17 Feb 2022 15:41:23 +0100 Subject: [PATCH 11/12] TF support. 
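CLIP scores every candidate text against every image, so with one candidate
label per forward pass we only keep the entries where image and text share a
batch position, i.e. the diagonal of `logits_per_image`. The ops are now
picked per framework: `torch.diagonal` vs `tf.linalg.diag_part` in `_forward`,
and `Tensor.softmax(dim=0)` vs `tf.nn.softmax(axis=0)` in `postprocess`. A
small illustrative check of that equivalence (this snippet is not part of the
diff below; it assumes both torch and tensorflow are installed and uses
made-up logits):

    import numpy as np
    import tensorflow as tf
    import torch

    # Fake 3x3 logits_per_image: rows = images, columns = candidate texts.
    logits = np.arange(9.0).reshape(3, 3)
    pt_diag = torch.diagonal(torch.from_numpy(logits))   # tensor([0., 4., 8.])
    tf_diag = tf.linalg.diag_part(tf.constant(logits))   # tf.Tensor([0. 4. 8.])
    assert np.allclose(pt_diag.numpy(), tf_diag.numpy())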
---
 .../zero_shot_image_classification.py | 35 ++++++++++++-------
 1 file changed, 23 insertions(+), 12 deletions(-)

diff --git a/src/transformers/pipelines/zero_shot_image_classification.py b/src/transformers/pipelines/zero_shot_image_classification.py
index 65836c584dcc46..062968138448fe 100644
--- a/src/transformers/pipelines/zero_shot_image_classification.py
+++ b/src/transformers/pipelines/zero_shot_image_classification.py
@@ -1,6 +1,12 @@
 from typing import List, Union
 
-from ..file_utils import add_end_docstrings, is_torch_available, is_vision_available, requires_backends
+from ..file_utils import (
+    add_end_docstrings,
+    is_tf_available,
+    is_torch_available,
+    is_vision_available,
+    requires_backends,
+)
 from ..utils import logging
 from .base import PIPELINE_INIT_ARGS, ChunkPipeline
 
@@ -13,6 +19,9 @@
 if is_torch_available():
     import torch
 
+if is_tf_available():
+    import tensorflow as tf
+
 logger = logging.get_logger(__name__)
 
@@ -32,9 +41,6 @@ class ZeroShotImageClassificationPipeline(ChunkPipeline):
 
     def __init__(self, **kwargs):
         super().__init__(**kwargs)
-        if self.framework != "pt":
-            raise ValueError(f"The {self.__class__} is only available in PyTorch.")
-
         requires_backends(self, "vision")
         # No specific FOR_XXX available yet
         # self.check_model_type(MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING)
@@ -81,9 +87,9 @@ def preprocess(self, image, candidate_labels=None, hypothesis_template="This is
         n = len(candidate_labels)
         for i, candidate_label in enumerate(candidate_labels):
             image = load_image(image)
-            images = self.feature_extractor(images=[image], return_tensors="pt")
+            images = self.feature_extractor(images=[image], return_tensors=self.framework)
             sequence = hypothesis_template.format(candidate_label)
-            inputs = self.tokenizer(sequence, return_tensors="pt")
+            inputs = self.tokenizer(sequence, return_tensors=self.framework)
             inputs["pixel_values"] = images.pixel_values
             yield {"is_last": i == n - 1, "candidate_label": candidate_label, **inputs}
 
@@ -95,7 +101,8 @@ def _forward(self, model_inputs):
         # Clip does crossproduct scoring by default, so we're only
         # interested in the results where image and text are in the same
         # batch position.
-        logits_per_image = torch.diagonal(outputs.logits_per_image)
+        diag = torch.diagonal if self.framework == "pt" else tf.linalg.diag_part
+        logits_per_image = diag(outputs.logits_per_image)
 
         model_outputs = {
             "is_last": is_last,
         }
         return model_outputs
 
-    def postprocess(self, model_outputs, multi_label=False):
+    def postprocess(self, model_outputs):
         candidate_labels = [outputs["candidate_label"] for outputs in model_outputs]
-        logits = torch.cat([output["logits_per_image"] for output in model_outputs])
-        print("Logits", logits)
-        probs = logits.softmax(dim=0)
-        scores = probs.tolist()
+        if self.framework == "pt":
+            logits = torch.cat([output["logits_per_image"] for output in model_outputs])
+            probs = logits.softmax(dim=0)
+            scores = probs.tolist()
+        else:
+            logits = tf.concat([output["logits_per_image"] for output in model_outputs], axis=0)
+            probs = tf.nn.softmax(logits, axis=0)
+            scores = probs.numpy().tolist()
 
         result = [
             {"score": score, "label": candidate_label}

From 7bd23df78d213f1940ca9dce970ba7b023aba47a Mon Sep 17 00:00:00 2001
From: Nicolas Patry
Date: Thu, 17 Feb 2022 15:42:50 +0100
Subject: [PATCH 12/12] Update src/transformers/pipelines/zero_shot_image_classification.py

Co-authored-by: Suraj Patil

---
 src/transformers/pipelines/zero_shot_image_classification.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/transformers/pipelines/zero_shot_image_classification.py b/src/transformers/pipelines/zero_shot_image_classification.py
index 062968138448fe..fb4036a9fa3333 100644
--- a/src/transformers/pipelines/zero_shot_image_classification.py
+++ b/src/transformers/pipelines/zero_shot_image_classification.py
@@ -70,7 +70,7 @@ def __call__(self, images: Union[str, List[str], "Image", List["Image"]], **kwar
             following keys:
 
             - **label** (`str`) -- The label identified by the model. It is one of the suggested `candidate_label`.
-            - **score** (`int`) -- The score attributed by the model for that label (between 0 and 1).
+            - **score** (`float`) -- The score attributed by the model for that label (between 0 and 1).
         """
         return super().__call__(images, **kwargs)
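
For reference, a usage sketch of the finished pipeline (based on the slow
PyTorch test above; the image path points at the repository's COCO test
fixture and the scores are the approximate values asserted there):

    from transformers import pipeline

    image_classifier = pipeline(
        task="zero-shot-image-classification",
        model="openai/clip-vit-base-patch32",
    )
    # Any http link, local path or PIL.Image is accepted; this fixture shows two cats with remotes.
    output = image_classifier(
        "./tests/fixtures/tests_samples/COCO/000000039769.png",
        candidate_labels=["cat", "plane", "remote"],
    )
    # Roughly: [{"score": 0.941, "label": "cat"},
    #           {"score": 0.055, "label": "remote"},
    #           {"score": 0.003, "label": "plane"}]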