From 1b3a3ed290c3142f0d9918903c8173c0575ca936 Mon Sep 17 00:00:00 2001 From: Nicolas Patry Date: Fri, 11 Jun 2021 21:48:27 +0200 Subject: [PATCH 01/12] [Proposal] Adding ZeroShotImageClassificationPipeline - Based on CLIP --- .../models/auto/tokenization_auto.py | 114 ++++++++++++++ src/transformers/pipelines/__init__.py | 7 + .../zero_shot_image_classification.py | 146 ++++++++++++++++++ ...ipelines_zero_shot_image_classification.py | 118 ++++++++++++++ 4 files changed, 385 insertions(+) create mode 100644 src/transformers/pipelines/zero_shot_image_classification.py create mode 100644 tests/test_pipelines_zero_shot_image_classification.py diff --git a/src/transformers/models/auto/tokenization_auto.py b/src/transformers/models/auto/tokenization_auto.py index 41d44c641f3348..c6c2948c83dff5 100644 --- a/src/transformers/models/auto/tokenization_auto.py +++ b/src/transformers/models/auto/tokenization_auto.py @@ -34,6 +34,59 @@ AutoConfig, config_class_to_model_type, model_type_to_module_name, + BartConfig, + BertConfig, + BertGenerationConfig, + BigBirdConfig, + BigBirdPegasusConfig, + BlenderbotConfig, + BlenderbotSmallConfig, + CamembertConfig, + CanineConfig, + CLIPConfig, + ConvBertConfig, + CTRLConfig, + DebertaConfig, + DebertaV2Config, + DistilBertConfig, + DPRConfig, + ElectraConfig, + EncoderDecoderConfig, + FlaubertConfig, + FSMTConfig, + FunnelConfig, + GPT2Config, + HubertConfig, + IBertConfig, + LayoutLMConfig, + LEDConfig, + LongformerConfig, + LukeConfig, + LxmertConfig, + M2M100Config, + MarianConfig, + MBartConfig, + MobileBertConfig, + MPNetConfig, + MT5Config, + OpenAIGPTConfig, + PegasusConfig, + ProphetNetConfig, + RagConfig, + ReformerConfig, + RetriBertConfig, + RobertaConfig, + RoFormerConfig, + Speech2TextConfig, + SqueezeBertConfig, + T5Config, + TapasConfig, + TransfoXLConfig, + Wav2Vec2Config, + XLMConfig, + XLMProphetNetConfig, + XLMRobertaConfig, + XLNetConfig, replace_list_option_in_docstrings, ) @@ -235,7 +288,68 @@ ] ) +<<<<<<< HEAD TOKENIZER_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, TOKENIZER_MAPPING_NAMES) +======= +TOKENIZER_MAPPING = OrderedDict( + [ + (RetriBertConfig, (RetriBertTokenizer, RetriBertTokenizerFast)), + (RoFormerConfig, (RoFormerTokenizer, RoFormerTokenizerFast)), + (T5Config, (T5Tokenizer, T5TokenizerFast)), + (MT5Config, (MT5Tokenizer, MT5TokenizerFast)), + (MobileBertConfig, (MobileBertTokenizer, MobileBertTokenizerFast)), + (DistilBertConfig, (DistilBertTokenizer, DistilBertTokenizerFast)), + (AlbertConfig, (AlbertTokenizer, AlbertTokenizerFast)), + (CamembertConfig, (CamembertTokenizer, CamembertTokenizerFast)), + (PegasusConfig, (PegasusTokenizer, PegasusTokenizerFast)), + (MBartConfig, (MBartTokenizer, MBartTokenizerFast)), + (XLMRobertaConfig, (XLMRobertaTokenizer, XLMRobertaTokenizerFast)), + (MarianConfig, (MarianTokenizer, None)), + (BlenderbotSmallConfig, (BlenderbotSmallTokenizer, None)), + (BlenderbotConfig, (BlenderbotTokenizer, None)), + (BartConfig, (BartTokenizer, BartTokenizerFast)), + (LongformerConfig, (LongformerTokenizer, LongformerTokenizerFast)), + (RobertaConfig, (RobertaTokenizer, RobertaTokenizerFast)), + (ReformerConfig, (ReformerTokenizer, ReformerTokenizerFast)), + (ElectraConfig, (ElectraTokenizer, ElectraTokenizerFast)), + (FunnelConfig, (FunnelTokenizer, FunnelTokenizerFast)), + (LxmertConfig, (LxmertTokenizer, LxmertTokenizerFast)), + (LayoutLMConfig, (LayoutLMTokenizer, LayoutLMTokenizerFast)), + (DPRConfig, (DPRQuestionEncoderTokenizer, DPRQuestionEncoderTokenizerFast)), + (SqueezeBertConfig, 
(SqueezeBertTokenizer, SqueezeBertTokenizerFast)), + (BertConfig, (BertTokenizer, BertTokenizerFast)), + (OpenAIGPTConfig, (OpenAIGPTTokenizer, OpenAIGPTTokenizerFast)), + (GPT2Config, (GPT2Tokenizer, GPT2TokenizerFast)), + (TransfoXLConfig, (TransfoXLTokenizer, None)), + (XLNetConfig, (XLNetTokenizer, XLNetTokenizerFast)), + (FlaubertConfig, (FlaubertTokenizer, None)), + (XLMConfig, (XLMTokenizer, None)), + (CLIPConfig, (CLIPTokenizer, None)), + (CTRLConfig, (CTRLTokenizer, None)), + (FSMTConfig, (FSMTTokenizer, None)), + (BertGenerationConfig, (BertGenerationTokenizer, None)), + (DebertaConfig, (DebertaTokenizer, DebertaTokenizerFast)), + (DebertaV2Config, (DebertaV2Tokenizer, None)), + (RagConfig, (RagTokenizer, None)), + (XLMProphetNetConfig, (XLMProphetNetTokenizer, None)), + (Speech2TextConfig, (Speech2TextTokenizer, None)), + (M2M100Config, (M2M100Tokenizer, None)), + (ProphetNetConfig, (ProphetNetTokenizer, None)), + (MPNetConfig, (MPNetTokenizer, MPNetTokenizerFast)), + (TapasConfig, (TapasTokenizer, None)), + (LEDConfig, (LEDTokenizer, LEDTokenizerFast)), + (ConvBertConfig, (ConvBertTokenizer, ConvBertTokenizerFast)), + (BigBirdConfig, (BigBirdTokenizer, BigBirdTokenizerFast)), + (IBertConfig, (RobertaTokenizer, RobertaTokenizerFast)), + (Wav2Vec2Config, (Wav2Vec2CTCTokenizer, None)), + (HubertConfig, (Wav2Vec2CTCTokenizer, None)), + (GPTNeoConfig, (GPT2Tokenizer, GPT2TokenizerFast)), + (LukeConfig, (LukeTokenizer, None)), + (BigBirdPegasusConfig, (PegasusTokenizer, PegasusTokenizerFast)), + (CanineConfig, (CanineTokenizer, None)), + ] +) +>>>>>>> 35ebd464f ([Proposal] Adding ZeroShotImageClassificationPipeline) CONFIG_TO_TYPE = {v: k for k, v in CONFIG_MAPPING_NAMES.items()} diff --git a/src/transformers/pipelines/__init__.py b/src/transformers/pipelines/__init__.py index ca6cb37547bbaf..1eea5aeb6de503 100755 --- a/src/transformers/pipelines/__init__.py +++ b/src/transformers/pipelines/__init__.py @@ -62,6 +62,7 @@ TokenClassificationPipeline, ) from .zero_shot_classification import ZeroShotClassificationArgumentHandler, ZeroShotClassificationPipeline +from .zero_shot_image_classification import ZeroShotImageClassificationPipeline if is_tf_available(): @@ -239,6 +240,12 @@ }, "type": "text", }, + "zero-shot-image-classification": { + "impl": ZeroShotImageClassificationPipeline, + "tf": (), + "pt": (AutoModel,) if is_torch_available() else (), + "default": {"pt": "openai/clip-vit-base-patch32"}, + }, "conversational": { "impl": ConversationalPipeline, "tf": (TFAutoModelForSeq2SeqLM, TFAutoModelForCausalLM) if is_tf_available() else (), diff --git a/src/transformers/pipelines/zero_shot_image_classification.py b/src/transformers/pipelines/zero_shot_image_classification.py new file mode 100644 index 00000000000000..ef2863bef616af --- /dev/null +++ b/src/transformers/pipelines/zero_shot_image_classification.py @@ -0,0 +1,146 @@ +import os +from typing import TYPE_CHECKING, List, Optional, Union + +import requests + +from ..feature_extraction_utils import PreTrainedFeatureExtractor +from ..file_utils import add_end_docstrings, is_torch_available, is_vision_available, requires_backends +from ..tokenization_utils import PreTrainedTokenizer +from ..utils import logging +from .base import PIPELINE_INIT_ARGS, Pipeline + + +if TYPE_CHECKING: + from ..modeling_tf_utils import TFPreTrainedModel + from ..modeling_utils import PreTrainedModel + +if is_vision_available(): + from PIL import Image + +if is_torch_available(): + import torch + +logger = logging.get_logger(__name__) + + 
+@add_end_docstrings(PIPELINE_INIT_ARGS) +class ZeroShotImageClassificationPipeline(Pipeline): + """ + Image classification pipeline using any :obj:`AutoModelForZeroShotImageClassification`. This pipeline predicts the + class of an image. + + This image classification pipeline can currently be loaded from :func:`~transformers.pipeline` using the following + task identifier: :obj:`"image-classification"`. + + See the list of available models on `huggingface.co/models + `__. + """ + + def __init__( + self, + model: Union["PreTrainedModel", "TFPreTrainedModel"], + feature_extractor: PreTrainedFeatureExtractor, + tokenizer: PreTrainedTokenizer, + framework: Optional[str] = None, + **kwargs + ): + super().__init__( + model, feature_extractor=feature_extractor, tokenizer=tokenizer, framework=framework, **kwargs + ) + + if self.framework == "tf": + raise ValueError(f"The {self.__class__} is only available in PyTorch.") + + requires_backends(self, "vision") + + # self.check_model_type(MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING) + + self.feature_extractor = feature_extractor + self.tokenizer = tokenizer + + @staticmethod + def load_image(image: Union[str, "Image.Image"]): + if isinstance(image, str): + if image.startswith("http://") or image.startswith("https://"): + # We need to actually check for a real protocol, otherwise it's impossible to use a local file + # like http_huggingface_co.png + return Image.open(requests.get(image, stream=True).raw) + elif os.path.isfile(image): + return Image.open(image) + elif isinstance(image, Image.Image): + return image + + raise ValueError( + "Incorrect format used for image. Should be an url linking to an image, a local path, or a PIL image." + ) + + def __call__( + self, + images: Union[str, List[str], "Image", List["Image"]], + candidate_labels: List[str], + hypothesis_template: str = "a photo of {}", + ): + """ + Assign labels to the image(s) passed as inputs. + + Args: + images (:obj:`str`, :obj:`List[str]`, :obj:`PIL.Image` or :obj:`List[PIL.Image]`): + The pipeline handles three types of images: + + - A string containing a http link pointing to an image + - A string containing a local path to an image + - An image loaded in PIL directly + + The pipeline accepts either a single image or a batch of images, which must then be passed as a string. + Images in a batch must all be in the same format: all as http links, all as local paths, or all as PIL + images. + candidate_labels (:obj:`List[str]`): + The candidate labels for this image + hypothesis_template (:obj:`str`, `optional`, defaults to :obj:`"This is a photo of a {}"`): + The sentence used in cunjunction with `candidate_labels` to attempt the image classification by + replacing the placeholder with the candidate_labels. Then likelihood is estimated by using + likelihood_per_image + + Return: + A dictionary or a list of dictionaries containing result. If the input is a single image, will return a + dictionary, if the input is a list of several images, will return a list of dictionaries corresponding to + the images. + + The dictionaries contain the following keys: + + - **label** (:obj:`str`) -- The label identified by the model. + - **score** (:obj:`int`) -- The score attributed by the model for that label. 
+ """ + is_batched = isinstance(images, list) + + if not is_batched: + images = [images] + + images = [self.load_image(image) for image in images] + + with torch.no_grad(): + images = self.feature_extractor(images=images, return_tensors="pt") + inputs = self.tokenizer(candidate_labels, return_tensors="pt") + inputs["pixel_values"] = images.pixel_values + outputs = self.model(**inputs) + + logits_per_image = outputs.logits_per_image # this is the image-text similarity score + probs = logits_per_image.softmax(dim=1) # we can take the softmax to get the label probabilities + scores = probs.tolist() + + if not is_batched: + scores = scores[0] + labels = [ + {"score": score, "label": candidate_label} + for score, candidate_label in sorted(zip(scores, candidate_labels), key=lambda x: -x[0]) + ] + else: + labels = [] + all_scores = scores + for scores in all_scores: + element_labels = [ + {"score": score, "label": candidate_label} + for score, candidate_label in sorted(zip(scores, candidate_labels), key=lambda x: -x[0]) + ] + labels.append(element_labels) + return labels diff --git a/tests/test_pipelines_zero_shot_image_classification.py b/tests/test_pipelines_zero_shot_image_classification.py new file mode 100644 index 00000000000000..34a2d1a282256d --- /dev/null +++ b/tests/test_pipelines_zero_shot_image_classification.py @@ -0,0 +1,118 @@ +# Copyright 2021 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest + +from transformers import AutoFeatureExtractor, AutoModel, AutoTokenizer, is_vision_available +from transformers.pipelines import ZeroShotImageClassificationPipeline, pipeline +from transformers.testing_utils import require_torch, require_vision + + +if is_vision_available(): + from PIL import Image +else: + + class Image: + @staticmethod + def open(*args, **kwargs): + pass + + +@require_vision +@require_torch +class ZeroShotImageClassificationPipelineTests(unittest.TestCase): + pipeline_task = "zero-shot-image-classification" + small_models = ["openai/clip-vit-base-patch32"] # Models tested without the @slow decorator + simple_inputs = [ + {"images": "http://images.cocodataset.org/val2017/000000039769.jpg"}, + {"images": "./tests/fixtures/tests_samples/COCO/000000039769.png"}, + {"images": Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")}, + ] + batched_inputs = [ + { + "images": [ + "http://images.cocodataset.org/val2017/000000039769.jpg", + "http://images.cocodataset.org/val2017/000000039769.jpg", + ] + }, + { + "images": [ + "./tests/fixtures/tests_samples/COCO/000000039769.png", + "./tests/fixtures/tests_samples/COCO/000000039769.png", + ] + }, + { + "images": [ + Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png"), + Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png"), + ] + }, + { + "images": [ + Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png"), + "./tests/fixtures/tests_samples/COCO/000000039769.png", + ] + }, + ] + + def test_small_model_from_factory(self): + for small_model in self.small_models: + image_classifier = pipeline("zero-shot-image-classification", model=small_model) + + candidate_labels = ["a dog", "a cat"] + + for valid_input in self.simple_inputs: + output = image_classifier(**valid_input, candidate_labels=candidate_labels) + self.assertTrue(isinstance(output, list)) + self.assertEqual(len(output), 2) + for label_result in output: + self.assertTrue(isinstance(label_result, dict)) + self.assertEqual(set(label_result.keys()), {"label", "score"}) + + for valid_input in self.batched_inputs: + output = image_classifier(**valid_input, candidate_labels=candidate_labels) + self.assertTrue(isinstance(output, list)) + self.assertEqual(len(output), 2) + for item in output: + for label_result in item: + self.assertTrue(isinstance(label_result, dict)) + self.assertEqual(set(label_result.keys()), {"label", "score"}) + + def test_small_model_from_pipeline(self): + for small_model in self.small_models: + model = AutoModel.from_pretrained(small_model) + feature_extractor = AutoFeatureExtractor.from_pretrained(small_model) + tokenizer = AutoTokenizer.from_pretrained(small_model) + image_classifier = ZeroShotImageClassificationPipeline( + model=model, feature_extractor=feature_extractor, tokenizer=tokenizer + ) + + candidate_labels = ["a dog", "a cat"] + + for valid_input in self.simple_inputs: + output = image_classifier(**valid_input, candidate_labels=candidate_labels) + self.assertTrue(isinstance(output, list)) + self.assertEqual(len(output), 2) + for label_result in output: + self.assertTrue(isinstance(label_result, dict)) + self.assertEqual(set(label_result.keys()), {"label", "score"}) + + for valid_input in self.batched_inputs: + output = image_classifier(**valid_input, candidate_labels=candidate_labels) + self.assertTrue(isinstance(output, list)) + self.assertEqual(len(output), 2) + for item in output: + for label_result in item: + self.assertTrue(isinstance(label_result, dict)) + 
self.assertEqual(set(label_result.keys()), {"label", "score"}) From aaf48802b7662603e625738e2909f4606c4e721d Mon Sep 17 00:00:00 2001 From: Nicolas Patry Date: Wed, 16 Feb 2022 17:03:33 +0100 Subject: [PATCH 02/12] WIP, Resurection in progress. --- .../models/auto/tokenization_auto.py | 114 ---------------- src/transformers/pipelines/__init__.py | 1 + .../zero_shot_image_classification.py | 21 +-- ...ipelines_zero_shot_image_classification.py | 125 +++++++----------- 4 files changed, 50 insertions(+), 211 deletions(-) diff --git a/src/transformers/models/auto/tokenization_auto.py b/src/transformers/models/auto/tokenization_auto.py index c6c2948c83dff5..41d44c641f3348 100644 --- a/src/transformers/models/auto/tokenization_auto.py +++ b/src/transformers/models/auto/tokenization_auto.py @@ -34,59 +34,6 @@ AutoConfig, config_class_to_model_type, model_type_to_module_name, - BartConfig, - BertConfig, - BertGenerationConfig, - BigBirdConfig, - BigBirdPegasusConfig, - BlenderbotConfig, - BlenderbotSmallConfig, - CamembertConfig, - CanineConfig, - CLIPConfig, - ConvBertConfig, - CTRLConfig, - DebertaConfig, - DebertaV2Config, - DistilBertConfig, - DPRConfig, - ElectraConfig, - EncoderDecoderConfig, - FlaubertConfig, - FSMTConfig, - FunnelConfig, - GPT2Config, - HubertConfig, - IBertConfig, - LayoutLMConfig, - LEDConfig, - LongformerConfig, - LukeConfig, - LxmertConfig, - M2M100Config, - MarianConfig, - MBartConfig, - MobileBertConfig, - MPNetConfig, - MT5Config, - OpenAIGPTConfig, - PegasusConfig, - ProphetNetConfig, - RagConfig, - ReformerConfig, - RetriBertConfig, - RobertaConfig, - RoFormerConfig, - Speech2TextConfig, - SqueezeBertConfig, - T5Config, - TapasConfig, - TransfoXLConfig, - Wav2Vec2Config, - XLMConfig, - XLMProphetNetConfig, - XLMRobertaConfig, - XLNetConfig, replace_list_option_in_docstrings, ) @@ -288,68 +235,7 @@ ] ) -<<<<<<< HEAD TOKENIZER_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, TOKENIZER_MAPPING_NAMES) -======= -TOKENIZER_MAPPING = OrderedDict( - [ - (RetriBertConfig, (RetriBertTokenizer, RetriBertTokenizerFast)), - (RoFormerConfig, (RoFormerTokenizer, RoFormerTokenizerFast)), - (T5Config, (T5Tokenizer, T5TokenizerFast)), - (MT5Config, (MT5Tokenizer, MT5TokenizerFast)), - (MobileBertConfig, (MobileBertTokenizer, MobileBertTokenizerFast)), - (DistilBertConfig, (DistilBertTokenizer, DistilBertTokenizerFast)), - (AlbertConfig, (AlbertTokenizer, AlbertTokenizerFast)), - (CamembertConfig, (CamembertTokenizer, CamembertTokenizerFast)), - (PegasusConfig, (PegasusTokenizer, PegasusTokenizerFast)), - (MBartConfig, (MBartTokenizer, MBartTokenizerFast)), - (XLMRobertaConfig, (XLMRobertaTokenizer, XLMRobertaTokenizerFast)), - (MarianConfig, (MarianTokenizer, None)), - (BlenderbotSmallConfig, (BlenderbotSmallTokenizer, None)), - (BlenderbotConfig, (BlenderbotTokenizer, None)), - (BartConfig, (BartTokenizer, BartTokenizerFast)), - (LongformerConfig, (LongformerTokenizer, LongformerTokenizerFast)), - (RobertaConfig, (RobertaTokenizer, RobertaTokenizerFast)), - (ReformerConfig, (ReformerTokenizer, ReformerTokenizerFast)), - (ElectraConfig, (ElectraTokenizer, ElectraTokenizerFast)), - (FunnelConfig, (FunnelTokenizer, FunnelTokenizerFast)), - (LxmertConfig, (LxmertTokenizer, LxmertTokenizerFast)), - (LayoutLMConfig, (LayoutLMTokenizer, LayoutLMTokenizerFast)), - (DPRConfig, (DPRQuestionEncoderTokenizer, DPRQuestionEncoderTokenizerFast)), - (SqueezeBertConfig, (SqueezeBertTokenizer, SqueezeBertTokenizerFast)), - (BertConfig, (BertTokenizer, BertTokenizerFast)), - (OpenAIGPTConfig, 
(OpenAIGPTTokenizer, OpenAIGPTTokenizerFast)), - (GPT2Config, (GPT2Tokenizer, GPT2TokenizerFast)), - (TransfoXLConfig, (TransfoXLTokenizer, None)), - (XLNetConfig, (XLNetTokenizer, XLNetTokenizerFast)), - (FlaubertConfig, (FlaubertTokenizer, None)), - (XLMConfig, (XLMTokenizer, None)), - (CLIPConfig, (CLIPTokenizer, None)), - (CTRLConfig, (CTRLTokenizer, None)), - (FSMTConfig, (FSMTTokenizer, None)), - (BertGenerationConfig, (BertGenerationTokenizer, None)), - (DebertaConfig, (DebertaTokenizer, DebertaTokenizerFast)), - (DebertaV2Config, (DebertaV2Tokenizer, None)), - (RagConfig, (RagTokenizer, None)), - (XLMProphetNetConfig, (XLMProphetNetTokenizer, None)), - (Speech2TextConfig, (Speech2TextTokenizer, None)), - (M2M100Config, (M2M100Tokenizer, None)), - (ProphetNetConfig, (ProphetNetTokenizer, None)), - (MPNetConfig, (MPNetTokenizer, MPNetTokenizerFast)), - (TapasConfig, (TapasTokenizer, None)), - (LEDConfig, (LEDTokenizer, LEDTokenizerFast)), - (ConvBertConfig, (ConvBertTokenizer, ConvBertTokenizerFast)), - (BigBirdConfig, (BigBirdTokenizer, BigBirdTokenizerFast)), - (IBertConfig, (RobertaTokenizer, RobertaTokenizerFast)), - (Wav2Vec2Config, (Wav2Vec2CTCTokenizer, None)), - (HubertConfig, (Wav2Vec2CTCTokenizer, None)), - (GPTNeoConfig, (GPT2Tokenizer, GPT2TokenizerFast)), - (LukeConfig, (LukeTokenizer, None)), - (BigBirdPegasusConfig, (PegasusTokenizer, PegasusTokenizerFast)), - (CanineConfig, (CanineTokenizer, None)), - ] -) ->>>>>>> 35ebd464f ([Proposal] Adding ZeroShotImageClassificationPipeline) CONFIG_TO_TYPE = {v: k for k, v in CONFIG_MAPPING_NAMES.items()} diff --git a/src/transformers/pipelines/__init__.py b/src/transformers/pipelines/__init__.py index 1eea5aeb6de503..d149d35cc3f90b 100755 --- a/src/transformers/pipelines/__init__.py +++ b/src/transformers/pipelines/__init__.py @@ -245,6 +245,7 @@ "tf": (), "pt": (AutoModel,) if is_torch_available() else (), "default": {"pt": "openai/clip-vit-base-patch32"}, + "type": "multimodal", }, "conversational": { "impl": ConversationalPipeline, diff --git a/src/transformers/pipelines/zero_shot_image_classification.py b/src/transformers/pipelines/zero_shot_image_classification.py index ef2863bef616af..2fecf1fa5c5c67 100644 --- a/src/transformers/pipelines/zero_shot_image_classification.py +++ b/src/transformers/pipelines/zero_shot_image_classification.py @@ -36,27 +36,14 @@ class of an image. `__. 
""" - def __init__( - self, - model: Union["PreTrainedModel", "TFPreTrainedModel"], - feature_extractor: PreTrainedFeatureExtractor, - tokenizer: PreTrainedTokenizer, - framework: Optional[str] = None, - **kwargs - ): - super().__init__( - model, feature_extractor=feature_extractor, tokenizer=tokenizer, framework=framework, **kwargs - ) + def __init__(self, **kwargs): + super().__init__(**kwargs) - if self.framework == "tf": + if self.framework != "pt": raise ValueError(f"The {self.__class__} is only available in PyTorch.") requires_backends(self, "vision") - - # self.check_model_type(MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING) - - self.feature_extractor = feature_extractor - self.tokenizer = tokenizer + self.check_model_type(MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING) @staticmethod def load_image(image: Union[str, "Image.Image"]): diff --git a/tests/test_pipelines_zero_shot_image_classification.py b/tests/test_pipelines_zero_shot_image_classification.py index 34a2d1a282256d..02b3624876d47e 100644 --- a/tests/test_pipelines_zero_shot_image_classification.py +++ b/tests/test_pipelines_zero_shot_image_classification.py @@ -14,9 +14,17 @@ import unittest -from transformers import AutoFeatureExtractor, AutoModel, AutoTokenizer, is_vision_available +from transformers import ( + MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING, + AutoFeatureExtractor, + AutoModel, + AutoTokenizer, + is_vision_available, +) from transformers.pipelines import ZeroShotImageClassificationPipeline, pipeline -from transformers.testing_utils import require_torch, require_vision +from transformers.testing_utils import is_pipeline_test, require_tf, require_torch, require_vision + +from .test_pipelines_common import ANY, PipelineTestCaseMeta if is_vision_available(): @@ -31,88 +39,45 @@ def open(*args, **kwargs): @require_vision @require_torch -class ZeroShotImageClassificationPipelineTests(unittest.TestCase): - pipeline_task = "zero-shot-image-classification" - small_models = ["openai/clip-vit-base-patch32"] # Models tested without the @slow decorator - simple_inputs = [ - {"images": "http://images.cocodataset.org/val2017/000000039769.jpg"}, - {"images": "./tests/fixtures/tests_samples/COCO/000000039769.png"}, - {"images": Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")}, - ] - batched_inputs = [ - { - "images": [ - "http://images.cocodataset.org/val2017/000000039769.jpg", - "http://images.cocodataset.org/val2017/000000039769.jpg", - ] - }, - { - "images": [ - "./tests/fixtures/tests_samples/COCO/000000039769.png", - "./tests/fixtures/tests_samples/COCO/000000039769.png", - ] - }, - { - "images": [ - Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png"), - Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png"), - ] - }, - { - "images": [ - Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png"), - "./tests/fixtures/tests_samples/COCO/000000039769.png", - ] - }, - ] - - def test_small_model_from_factory(self): - for small_model in self.small_models: - image_classifier = pipeline("zero-shot-image-classification", model=small_model) +@is_pipeline_test +class ZeroShotImageClassificationPipelineTests(unittest.TestCase, metaclass=PipelineTestCaseMeta): + model_mapping = MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING - candidate_labels = ["a dog", "a cat"] + def get_test_pipeline(self, model, tokenizer, feature_extractor): + if tokenizer is None: + # Side effect of no Fast Tokenizer class for these model, so skipping + # But the slow tokenizer test should still run as they're quite small + 
self.skipTest("No tokenizer available") + return + # return None, None - for valid_input in self.simple_inputs: - output = image_classifier(**valid_input, candidate_labels=candidate_labels) - self.assertTrue(isinstance(output, list)) - self.assertEqual(len(output), 2) - for label_result in output: - self.assertTrue(isinstance(label_result, dict)) - self.assertEqual(set(label_result.keys()), {"label", "score"}) + speech_recognizer = ZeroShotImageClassificationPipeline( + model=model, tokenizer=tokenizer, feature_extractor=feature_extractor + ) - for valid_input in self.batched_inputs: - output = image_classifier(**valid_input, candidate_labels=candidate_labels) - self.assertTrue(isinstance(output, list)) - self.assertEqual(len(output), 2) - for item in output: - for label_result in item: - self.assertTrue(isinstance(label_result, dict)) - self.assertEqual(set(label_result.keys()), {"label", "score"}) + # test with a raw waveform + image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") + image2 = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") + return speech_recognizer, [image, image2] - def test_small_model_from_pipeline(self): - for small_model in self.small_models: - model = AutoModel.from_pretrained(small_model) - feature_extractor = AutoFeatureExtractor.from_pretrained(small_model) - tokenizer = AutoTokenizer.from_pretrained(small_model) - image_classifier = ZeroShotImageClassificationPipeline( - model=model, feature_extractor=feature_extractor, tokenizer=tokenizer - ) + def run_pipeline_test(self, pipe, examples): + image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") + outputs = pipe(image, candidate_labels=["A", "B"]) + self.assertEqual(outputs, {"text": ANY(str)}) - candidate_labels = ["a dog", "a cat"] + # Batching + outputs = pipe([image] * 3, batch_size=2, candidate_labels=["A", "B"]) - for valid_input in self.simple_inputs: - output = image_classifier(**valid_input, candidate_labels=candidate_labels) - self.assertTrue(isinstance(output, list)) - self.assertEqual(len(output), 2) - for label_result in output: - self.assertTrue(isinstance(label_result, dict)) - self.assertEqual(set(label_result.keys()), {"label", "score"}) + @require_tf + def test_small_model_tf(self): + self.skipTest("Not implemented in Tensorflow") - for valid_input in self.batched_inputs: - output = image_classifier(**valid_input, candidate_labels=candidate_labels) - self.assertTrue(isinstance(output, list)) - self.assertEqual(len(output), 2) - for item in output: - for label_result in item: - self.assertTrue(isinstance(label_result, dict)) - self.assertEqual(set(label_result.keys()), {"label", "score"}) + @require_torch + def test_small_model_pt(self): + speech_recognizer = pipeline( + task="zero-shot-image-classification", + model="hf-internal-testing/tiny-random-clip", + ) + image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") + output = speech_recognizer(image, candidate_labels=["A", "B", "C"]) + self.assertEqual(output, {"text": "(Applaudissements)"}) From 617c6c7e9553c62a6a47904f88b7a35a17c314a3 Mon Sep 17 00:00:00 2001 From: Nicolas Patry Date: Wed, 16 Feb 2022 19:29:46 +0100 Subject: [PATCH 03/12] Resurrection... achieved. 
--- src/transformers/pipelines/base.py | 28 ++-- .../zero_shot_image_classification.py | 128 ++++++++---------- ...ipelines_zero_shot_image_classification.py | 122 ++++++++++++----- 3 files changed, 166 insertions(+), 112 deletions(-) diff --git a/src/transformers/pipelines/base.py b/src/transformers/pipelines/base.py index fbfe56375d86eb..81e4f01628be2c 100644 --- a/src/transformers/pipelines/base.py +++ b/src/transformers/pipelines/base.py @@ -84,7 +84,12 @@ def _pad(items, key, padding_value, padding_side): dtype = items[0][key].dtype if dim == 2: - tensor = torch.zeros((batch_size, max_length), dtype=dtype) + padding_value + try: + tensor = torch.zeros((batch_size, max_length), dtype=dtype) + padding_value + except Exception: + import ipdb + + ipdb.set_trace() elif dim == 3: tensor = torch.zeros((batch_size, max_length, shape[-1]), dtype=dtype) + padding_value @@ -105,7 +110,6 @@ def _pad(items, key, padding_value, padding_side): def pad_collate_fn(tokenizer, feature_extractor): - padding_side = "right" if tokenizer is None and feature_extractor is None: raise ValueError("Pipeline without tokenizer or feature_extractor cannot do batching") if tokenizer is not None: @@ -115,12 +119,12 @@ def pad_collate_fn(tokenizer, feature_extractor): "`pipe.tokenizer.pad_token_id = model.config.eos_token_id`." ) else: - padding_value = tokenizer.pad_token_id - padding_side = tokenizer.padding_side + t_padding_value = tokenizer.pad_token_id + t_padding_side = tokenizer.padding_side if feature_extractor is not None: # Feature extractor can be images, where no padding is expected - padding_value = getattr(feature_extractor, "padding_value", None) - padding_side = getattr(feature_extractor, "padding_side", None) + f_padding_value = getattr(feature_extractor, "padding_value", None) + f_padding_side = getattr(feature_extractor, "padding_side", None) def inner(items): keys = set(items[0].keys()) @@ -132,13 +136,19 @@ def inner(items): # input_values, input_pixels, input_ids, ... 
padded = {} for key in keys: - if key.startswith("input_"): - _padding_value = padding_value + if key == "input_ids": + _padding_value = t_padding_value + _padding_side = t_padding_side + if key in {"input_values", "pixel_values", "input_features"}: + _padding_value = f_padding_value + _padding_side = f_padding_side elif key == "p_mask": _padding_value = 1 + _padding_side = t_padding_side else: _padding_value = 0 - padded[key] = _pad(items, key, _padding_value, padding_side) + _padding_side = f_padding_side + padded[key] = _pad(items, key, _padding_value, _padding_side) return padded return inner diff --git a/src/transformers/pipelines/zero_shot_image_classification.py b/src/transformers/pipelines/zero_shot_image_classification.py index 2fecf1fa5c5c67..0bc9be879ae0cb 100644 --- a/src/transformers/pipelines/zero_shot_image_classification.py +++ b/src/transformers/pipelines/zero_shot_image_classification.py @@ -1,22 +1,15 @@ -import os -from typing import TYPE_CHECKING, List, Optional, Union +from typing import List, Union -import requests - -from ..feature_extraction_utils import PreTrainedFeatureExtractor from ..file_utils import add_end_docstrings, is_torch_available, is_vision_available, requires_backends -from ..tokenization_utils import PreTrainedTokenizer from ..utils import logging -from .base import PIPELINE_INIT_ARGS, Pipeline - +from .base import PIPELINE_INIT_ARGS, ChunkPipeline -if TYPE_CHECKING: - from ..modeling_tf_utils import TFPreTrainedModel - from ..modeling_utils import PreTrainedModel if is_vision_available(): from PIL import Image + from ..image_utils import load_image + if is_torch_available(): import torch @@ -24,7 +17,7 @@ @add_end_docstrings(PIPELINE_INIT_ARGS) -class ZeroShotImageClassificationPipeline(Pipeline): +class ZeroShotImageClassificationPipeline(ChunkPipeline): """ Image classification pipeline using any :obj:`AutoModelForZeroShotImageClassification`. This pipeline predicts the class of an image. @@ -43,30 +36,10 @@ def __init__(self, **kwargs): raise ValueError(f"The {self.__class__} is only available in PyTorch.") requires_backends(self, "vision") - self.check_model_type(MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING) - - @staticmethod - def load_image(image: Union[str, "Image.Image"]): - if isinstance(image, str): - if image.startswith("http://") or image.startswith("https://"): - # We need to actually check for a real protocol, otherwise it's impossible to use a local file - # like http_huggingface_co.png - return Image.open(requests.get(image, stream=True).raw) - elif os.path.isfile(image): - return Image.open(image) - elif isinstance(image, Image.Image): - return image - - raise ValueError( - "Incorrect format used for image. Should be an url linking to an image, a local path, or a PIL image." - ) - - def __call__( - self, - images: Union[str, List[str], "Image", List["Image"]], - candidate_labels: List[str], - hypothesis_template: str = "a photo of {}", - ): + # No specific FOR_XXX available yet + # self.check_model_type(MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING) + + def __call__(self, images: Union[str, List[str], "Image", List["Image"]], **kwargs): """ Assign labels to the image(s) passed as inputs. @@ -98,36 +71,55 @@ def __call__( - **label** (:obj:`str`) -- The label identified by the model. - **score** (:obj:`int`) -- The score attributed by the model for that label. 
""" - is_batched = isinstance(images, list) - - if not is_batched: - images = [images] - - images = [self.load_image(image) for image in images] - - with torch.no_grad(): - images = self.feature_extractor(images=images, return_tensors="pt") - inputs = self.tokenizer(candidate_labels, return_tensors="pt") + return super().__call__(images, **kwargs) + + def _sanitize_parameters(self, **kwargs): + preprocess_params = {} + if "candidate_labels" in kwargs: + preprocess_params["candidate_labels"] = kwargs["candidate_labels"] + if "hypothesis_template" in kwargs: + preprocess_params["hypothesis_template"] = kwargs["hypothesis_template"] + + postprocess_params = {} + if "multi_label" in kwargs: + postprocess_params["multi_label"] = kwargs["multi_label"] + return preprocess_params, {}, postprocess_params + + def preprocess(self, image, candidate_labels=None, hypothesis_template="This is a photo of {}."): + n = len(candidate_labels) + for i, candidate_label in enumerate(candidate_labels): + image = load_image(image) + images = self.feature_extractor(images=[image], return_tensors="pt") + sequence = hypothesis_template.format(candidate_label) + inputs = self.tokenizer(sequence, return_tensors="pt") inputs["pixel_values"] = images.pixel_values - outputs = self.model(**inputs) - - logits_per_image = outputs.logits_per_image # this is the image-text similarity score - probs = logits_per_image.softmax(dim=1) # we can take the softmax to get the label probabilities - scores = probs.tolist() - - if not is_batched: - scores = scores[0] - labels = [ - {"score": score, "label": candidate_label} - for score, candidate_label in sorted(zip(scores, candidate_labels), key=lambda x: -x[0]) - ] - else: - labels = [] - all_scores = scores - for scores in all_scores: - element_labels = [ - {"score": score, "label": candidate_label} - for score, candidate_label in sorted(zip(scores, candidate_labels), key=lambda x: -x[0]) - ] - labels.append(element_labels) - return labels + yield {"is_last": i == n - 1, "candidate_label": candidate_label, **inputs} + + def _forward(self, model_inputs): + is_last = model_inputs.pop("is_last") + candidate_label = model_inputs.pop("candidate_label") + outputs = self.model(**model_inputs) + + # Clip does crossproduct scoring by default, so we're only + # interested in the results where image and text and in the same + # batch position. 
+ logits_per_image = torch.diagonal(outputs.logits_per_image) + + model_outputs = { + "is_last": is_last, + "candidate_label": candidate_label, + "logits_per_image": logits_per_image, + } + return model_outputs + + def postprocess(self, model_outputs, multi_label=False): + candidate_labels = [outputs["candidate_label"] for outputs in model_outputs] + logits = torch.cat([output["logits_per_image"] for output in model_outputs]) + probs = logits.softmax(dim=0) + scores = probs.tolist() + + result = [ + {"score": score, "label": candidate_label} + for score, candidate_label in sorted(zip(scores, candidate_labels), key=lambda x: -x[0]) + ] + return result diff --git a/tests/test_pipelines_zero_shot_image_classification.py b/tests/test_pipelines_zero_shot_image_classification.py index 02b3624876d47e..1d0aa7df1264fe 100644 --- a/tests/test_pipelines_zero_shot_image_classification.py +++ b/tests/test_pipelines_zero_shot_image_classification.py @@ -14,17 +14,18 @@ import unittest -from transformers import ( - MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING, - AutoFeatureExtractor, - AutoModel, - AutoTokenizer, - is_vision_available, +from transformers import is_vision_available +from transformers.pipelines import pipeline +from transformers.testing_utils import ( + is_pipeline_test, + nested_simplify, + require_tf, + require_torch, + require_vision, + slow, ) -from transformers.pipelines import ZeroShotImageClassificationPipeline, pipeline -from transformers.testing_utils import is_pipeline_test, require_tf, require_torch, require_vision -from .test_pipelines_common import ANY, PipelineTestCaseMeta +from .test_pipelines_common import PipelineTestCaseMeta if is_vision_available(): @@ -41,32 +42,34 @@ def open(*args, **kwargs): @require_torch @is_pipeline_test class ZeroShotImageClassificationPipelineTests(unittest.TestCase, metaclass=PipelineTestCaseMeta): - model_mapping = MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING - - def get_test_pipeline(self, model, tokenizer, feature_extractor): - if tokenizer is None: - # Side effect of no Fast Tokenizer class for these model, so skipping - # But the slow tokenizer test should still run as they're quite small - self.skipTest("No tokenizer available") - return - # return None, None - - speech_recognizer = ZeroShotImageClassificationPipeline( - model=model, tokenizer=tokenizer, feature_extractor=feature_extractor - ) + # Deactivating auto tests since we don't have a good MODEL_FOR_XX mapping, + # and only CLIP would be there for now. 
+ # model_mapping = {CLIPConfig: CLIPModel} - # test with a raw waveform - image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") - image2 = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") - return speech_recognizer, [image, image2] + # def get_test_pipeline(self, model, tokenizer, feature_extractor): + # if tokenizer is None: + # # Side effect of no Fast Tokenizer class for these model, so skipping + # # But the slow tokenizer test should still run as they're quite small + # self.skipTest("No tokenizer available") + # return + # # return None, None - def run_pipeline_test(self, pipe, examples): - image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") - outputs = pipe(image, candidate_labels=["A", "B"]) - self.assertEqual(outputs, {"text": ANY(str)}) + # speech_recognizer = ZeroShotImageClassificationPipeline( + # model=model, tokenizer=tokenizer, feature_extractor=feature_extractor + # ) + + # # test with a raw waveform + # image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") + # image2 = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") + # return speech_recognizer, [image, image2] + + # def run_pipeline_test(self, pipe, examples): + # image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") + # outputs = pipe(image, candidate_labels=["A", "B"]) + # self.assertEqual(outputs, {"text": ANY(str)}) - # Batching - outputs = pipe([image] * 3, batch_size=2, candidate_labels=["A", "B"]) + # # Batching + # outputs = pipe([image] * 3, batch_size=2, candidate_labels=["A", "B"]) @require_tf def test_small_model_tf(self): @@ -74,10 +77,59 @@ def test_small_model_tf(self): @require_torch def test_small_model_pt(self): + speech_recognizer = pipeline( + model="hf-internal-testing/tiny-random-clip-zero-shot-image-classification", + ) + image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") + output = speech_recognizer(image, candidate_labels=["a", "b", "c"]) + + self.assertEqual( + nested_simplify(output), + [{"score": 0.333, "label": "a"}, {"score": 0.333, "label": "b"}, {"score": 0.333, "label": "c"}], + ) + + output = speech_recognizer([image] * 5, candidate_labels=["A", "B", "C"], batch_size=2) + self.assertEqual( + nested_simplify(output), + [ + [{"score": 0.333, "label": "B"}, {"score": 0.333, "label": "A"}, {"score": 0.333, "label": "C"}], + # Very odd inversion, but it's a random model, floating errors might account for this since all scores are similar. 
+ [{"score": 0.333, "label": "A"}, {"score": 0.333, "label": "C"}, {"score": 0.333, "label": "B"}], + [{"score": 0.333, "label": "B"}, {"score": 0.333, "label": "A"}, {"score": 0.333, "label": "C"}], + [{"score": 0.333, "label": "A"}, {"score": 0.333, "label": "C"}, {"score": 0.333, "label": "B"}], + [{"score": 0.333, "label": "B"}, {"score": 0.333, "label": "A"}, {"score": 0.333, "label": "C"}], + ], + ) + + @slow + @require_torch + def test_large_model_pt(self): speech_recognizer = pipeline( task="zero-shot-image-classification", - model="hf-internal-testing/tiny-random-clip", + model="openai/clip-vit-base-patch32", ) + # This is an image of 2 cats with remotes and no planes image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") - output = speech_recognizer(image, candidate_labels=["A", "B", "C"]) - self.assertEqual(output, {"text": "(Applaudissements)"}) + output = speech_recognizer(image, candidate_labels=["cat", "plane", "remote"]) + + self.assertEqual( + nested_simplify(output), + [ + {"score": 0.941, "label": "cat"}, + {"score": 0.055, "label": "remote"}, + {"score": 0.003, "label": "plane"}, + ], + ) + + output = speech_recognizer([image] * 5, candidate_labels=["cat", "plane", "remote"], batch_size=2) + self.assertEqual( + nested_simplify(output), + [ + [ + {"score": 0.941, "label": "cat"}, + {"score": 0.055, "label": "remote"}, + {"score": 0.003, "label": "plane"}, + ], + ] + * 5, + ) From 51511a7b0498dfce4c98566856ee5864d338cad9 Mon Sep 17 00:00:00 2001 From: Nicolas Patry Date: Wed, 16 Feb 2022 19:46:14 +0100 Subject: [PATCH 04/12] Reword handling different `padding_value` for `feature_extractor` and `tokenizer`. --- src/transformers/pipelines/base.py | 34 ++++++++++++++++++------------ 1 file changed, 20 insertions(+), 14 deletions(-) diff --git a/src/transformers/pipelines/base.py b/src/transformers/pipelines/base.py index 81e4f01628be2c..8d80075d949d8c 100644 --- a/src/transformers/pipelines/base.py +++ b/src/transformers/pipelines/base.py @@ -84,12 +84,7 @@ def _pad(items, key, padding_value, padding_side): dtype = items[0][key].dtype if dim == 2: - try: - tensor = torch.zeros((batch_size, max_length), dtype=dtype) + padding_value - except Exception: - import ipdb - - ipdb.set_trace() + tensor = torch.zeros((batch_size, max_length), dtype=dtype) + padding_value elif dim == 3: tensor = torch.zeros((batch_size, max_length, shape[-1]), dtype=dtype) + padding_value @@ -110,6 +105,8 @@ def _pad(items, key, padding_value, padding_side): def pad_collate_fn(tokenizer, feature_extractor): + t_padding_side = None + f_padding_side = None if tokenizer is None and feature_extractor is None: raise ValueError("Pipeline without tokenizer or feature_extractor cannot do batching") if tokenizer is not None: @@ -126,6 +123,16 @@ def pad_collate_fn(tokenizer, feature_extractor): f_padding_value = getattr(feature_extractor, "padding_value", None) f_padding_side = getattr(feature_extractor, "padding_side", None) + if t_padding_side is not None and f_padding_side is not None and t_padding_side != f_padding_side: + raise ValueError( + f"The feature extractor, and tokenizer don't agree on padding side {t_padding_side} != {f_padding_side}" + ) + padding_side = "right" + if t_padding_side is not None: + padding_side = t_padding_side + if f_padding_side is not None: + padding_side = f_padding_side + def inner(items): keys = set(items[0].keys()) for item in items: @@ -136,19 +143,18 @@ def inner(items): # input_values, input_pixels, input_ids, ... 
padded = {} for key in keys: - if key == "input_ids": + if key in {"input_ids"}: _padding_value = t_padding_value - _padding_side = t_padding_side - if key in {"input_values", "pixel_values", "input_features"}: + elif key in {"input_values", "pixel_values", "input_features"}: _padding_value = f_padding_value - _padding_side = f_padding_side - elif key == "p_mask": + elif key in {"p_mask"}: _padding_value = 1 - _padding_side = t_padding_side + elif key in {"attention_mask", "token_type_ids"}: + _padding_value = 0 else: + # This is likely another random key maybe even user provided _padding_value = 0 - _padding_side = f_padding_side - padded[key] = _pad(items, key, _padding_value, _padding_side) + padded[key] = _pad(items, key, _padding_value, padding_side) return padded return inner From 453f089fa0469bda4d3e8c1c39adff209793e06f Mon Sep 17 00:00:00 2001 From: Nicolas Patry Date: Wed, 16 Feb 2022 19:53:25 +0100 Subject: [PATCH 05/12] Thanks doc-builder ! --- .../zero_shot_image_classification.py | 21 +++++++++---------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/src/transformers/pipelines/zero_shot_image_classification.py b/src/transformers/pipelines/zero_shot_image_classification.py index 0bc9be879ae0cb..c149fbb38c8212 100644 --- a/src/transformers/pipelines/zero_shot_image_classification.py +++ b/src/transformers/pipelines/zero_shot_image_classification.py @@ -19,14 +19,13 @@ @add_end_docstrings(PIPELINE_INIT_ARGS) class ZeroShotImageClassificationPipeline(ChunkPipeline): """ - Image classification pipeline using any :obj:`AutoModelForZeroShotImageClassification`. This pipeline predicts the + Image classification pipeline using any `AutoModelForZeroShotImageClassification`. This pipeline predicts the class of an image. - This image classification pipeline can currently be loaded from :func:`~transformers.pipeline` using the following - task identifier: :obj:`"image-classification"`. + This image classification pipeline can currently be loaded from [`pipeline`] using the following + task identifier: `"image-classification"`. - See the list of available models on `huggingface.co/models - `__. + See the list of available models on [huggingface.co/models](https://huggingface.co/models?filter=image-classification). """ def __init__(self, **kwargs): @@ -44,7 +43,7 @@ def __call__(self, images: Union[str, List[str], "Image", List["Image"]], **kwar Assign labels to the image(s) passed as inputs. Args: - images (:obj:`str`, :obj:`List[str]`, :obj:`PIL.Image` or :obj:`List[PIL.Image]`): + images (`str`, `List[str]`, `PIL.Image` or `List[PIL.Image]`): The pipeline handles three types of images: - A string containing a http link pointing to an image @@ -54,10 +53,10 @@ def __call__(self, images: Union[str, List[str], "Image", List["Image"]], **kwar The pipeline accepts either a single image or a batch of images, which must then be passed as a string. Images in a batch must all be in the same format: all as http links, all as local paths, or all as PIL images. 
- candidate_labels (:obj:`List[str]`): + candidate_labels (`List[str]`): The candidate labels for this image - hypothesis_template (:obj:`str`, `optional`, defaults to :obj:`"This is a photo of a {}"`): - The sentence used in cunjunction with `candidate_labels` to attempt the image classification by + hypothesis_template (`str`, *optional*, defaults to `"This is a photo of a {}"`): + The sentence used in cunjunction with *candidate_labels* to attempt the image classification by replacing the placeholder with the candidate_labels. Then likelihood is estimated by using likelihood_per_image @@ -68,8 +67,8 @@ def __call__(self, images: Union[str, List[str], "Image", List["Image"]], **kwar The dictionaries contain the following keys: - - **label** (:obj:`str`) -- The label identified by the model. - - **score** (:obj:`int`) -- The score attributed by the model for that label. + - **label** (`str`) -- The label identified by the model. + - **score** (`int`) -- The score attributed by the model for that label. """ return super().__call__(images, **kwargs) From 711ff5202bb88fd4493bde13c3859bf37e37254d Mon Sep 17 00:00:00 2001 From: Nicolas Patry Date: Wed, 16 Feb 2022 20:13:06 +0100 Subject: [PATCH 06/12] Adding docs + global namespace `ZeroShotImageClassificationPipeline`. --- docs/source/main_classes/pipelines.mdx | 6 ++++++ src/transformers/__init__.py | 1 + .../pipelines/zero_shot_image_classification.py | 11 ++++++----- 3 files changed, 13 insertions(+), 5 deletions(-) diff --git a/docs/source/main_classes/pipelines.mdx b/docs/source/main_classes/pipelines.mdx index 6f5b5b74706591..b5c51229ca55d8 100644 --- a/docs/source/main_classes/pipelines.mdx +++ b/docs/source/main_classes/pipelines.mdx @@ -428,6 +428,12 @@ See [`TokenClassificationPipeline`] for all details. - __call__ - all +### ZeroShotImageClassificationPipeline + +[[autodoc]] ZeroShotImageClassificationPipeline + - __call__ + - all + ## Parent class: `Pipeline` [[autodoc]] Pipeline diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index ad05486104ee79..de4f4aa399eea7 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -363,6 +363,7 @@ "TokenClassificationPipeline", "TranslationPipeline", "ZeroShotClassificationPipeline", + "ZeroShotImageClassificationPipeline", "pipeline", ], "processing_utils": ["ProcessorMixin"], diff --git a/src/transformers/pipelines/zero_shot_image_classification.py b/src/transformers/pipelines/zero_shot_image_classification.py index c149fbb38c8212..60071072a78ee0 100644 --- a/src/transformers/pipelines/zero_shot_image_classification.py +++ b/src/transformers/pipelines/zero_shot_image_classification.py @@ -19,13 +19,14 @@ @add_end_docstrings(PIPELINE_INIT_ARGS) class ZeroShotImageClassificationPipeline(ChunkPipeline): """ - Image classification pipeline using any `AutoModelForZeroShotImageClassification`. This pipeline predicts the - class of an image. + Image classification pipeline using any `AutoModelForZeroShotImageClassification`. This pipeline predicts the class + of an image. - This image classification pipeline can currently be loaded from [`pipeline`] using the following - task identifier: `"image-classification"`. + This image classification pipeline can currently be loaded from [`pipeline`] using the following task identifier: + `"image-classification"`. - See the list of available models on [huggingface.co/models](https://huggingface.co/models?filter=image-classification). 
+ See the list of available models on + [huggingface.co/models](https://huggingface.co/models?filter=image-classification). """ def __init__(self, **kwargs): From aaf02ee77fb5ae231ceb8ef391612b69166e8c00 Mon Sep 17 00:00:00 2001 From: Nicolas Patry Date: Wed, 16 Feb 2022 21:26:23 +0100 Subject: [PATCH 07/12] Fixing templates. --- src/transformers/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index de4f4aa399eea7..18d78840ba8134 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -2569,6 +2569,7 @@ TokenClassificationPipeline, TranslationPipeline, ZeroShotClassificationPipeline, + ZeroShotImageClassificationPipeline, pipeline, ) from .processing_utils import ProcessorMixin From 7de48f46d9b210b7a50f406b8647d030dd2e2d95 Mon Sep 17 00:00:00 2001 From: Nicolas Patry Date: Wed, 16 Feb 2022 22:51:54 +0100 Subject: [PATCH 08/12] Make the test pass and be robust to floating error. --- .../zero_shot_image_classification.py | 1 + ...ipelines_zero_shot_image_classification.py | 39 +++++++++++++++---- 2 files changed, 33 insertions(+), 7 deletions(-) diff --git a/src/transformers/pipelines/zero_shot_image_classification.py b/src/transformers/pipelines/zero_shot_image_classification.py index 60071072a78ee0..c85e8d361584e9 100644 --- a/src/transformers/pipelines/zero_shot_image_classification.py +++ b/src/transformers/pipelines/zero_shot_image_classification.py @@ -115,6 +115,7 @@ def _forward(self, model_inputs): def postprocess(self, model_outputs, multi_label=False): candidate_labels = [outputs["candidate_label"] for outputs in model_outputs] logits = torch.cat([output["logits_per_image"] for output in model_outputs]) + print("Logits", logits) probs = logits.softmax(dim=0) scores = probs.tolist() diff --git a/tests/test_pipelines_zero_shot_image_classification.py b/tests/test_pipelines_zero_shot_image_classification.py index 1d0aa7df1264fe..568dc7288d694e 100644 --- a/tests/test_pipelines_zero_shot_image_classification.py +++ b/tests/test_pipelines_zero_shot_image_classification.py @@ -25,7 +25,7 @@ slow, ) -from .test_pipelines_common import PipelineTestCaseMeta +from .test_pipelines_common import ANY, PipelineTestCaseMeta if is_vision_available(): @@ -91,13 +91,38 @@ def test_small_model_pt(self): output = speech_recognizer([image] * 5, candidate_labels=["A", "B", "C"], batch_size=2) self.assertEqual( nested_simplify(output), + # Pipeline outputs are supposed to be deterministic and + # So we could in theory have real values "A", "B", "C" instead + # of ANY(str). + # However it seems that in this particular case, the floating + # scores are so close, we enter floating error approximation + # and the order is not guaranteed anymore with batching. [ - [{"score": 0.333, "label": "B"}, {"score": 0.333, "label": "A"}, {"score": 0.333, "label": "C"}], - # Very odd inversion, but it's a random model, floating errors might account for this since all scores are similar. 
- [{"score": 0.333, "label": "A"}, {"score": 0.333, "label": "C"}, {"score": 0.333, "label": "B"}], - [{"score": 0.333, "label": "B"}, {"score": 0.333, "label": "A"}, {"score": 0.333, "label": "C"}], - [{"score": 0.333, "label": "A"}, {"score": 0.333, "label": "C"}, {"score": 0.333, "label": "B"}], - [{"score": 0.333, "label": "B"}, {"score": 0.333, "label": "A"}, {"score": 0.333, "label": "C"}], + [ + {"score": 0.333, "label": ANY(str)}, + {"score": 0.333, "label": ANY(str)}, + {"score": 0.333, "label": ANY(str)}, + ], + [ + {"score": 0.333, "label": ANY(str)}, + {"score": 0.333, "label": ANY(str)}, + {"score": 0.333, "label": ANY(str)}, + ], + [ + {"score": 0.333, "label": ANY(str)}, + {"score": 0.333, "label": ANY(str)}, + {"score": 0.333, "label": ANY(str)}, + ], + [ + {"score": 0.333, "label": ANY(str)}, + {"score": 0.333, "label": ANY(str)}, + {"score": 0.333, "label": ANY(str)}, + ], + [ + {"score": 0.333, "label": ANY(str)}, + {"score": 0.333, "label": ANY(str)}, + {"score": 0.333, "label": ANY(str)}, + ], ], ) From 5a6401e4919d12a2bb65c003bb2b718692bfa148 Mon Sep 17 00:00:00 2001 From: Nicolas Patry Date: Thu, 17 Feb 2022 09:06:48 +0100 Subject: [PATCH 09/12] Adressing suraj's comments on docs mostly. --- src/transformers/pipelines/base.py | 2 ++ .../zero_shot_image_classification.py | 32 +++++++------------ ...ipelines_zero_shot_image_classification.py | 16 +++++----- 3 files changed, 22 insertions(+), 28 deletions(-) diff --git a/src/transformers/pipelines/base.py b/src/transformers/pipelines/base.py index 8d80075d949d8c..62e3abf37ecd58 100644 --- a/src/transformers/pipelines/base.py +++ b/src/transformers/pipelines/base.py @@ -105,7 +105,9 @@ def _pad(items, key, padding_value, padding_side): def pad_collate_fn(tokenizer, feature_extractor): + # Tokenizer t_padding_side = None + # Feature extractor f_padding_side = None if tokenizer is None and feature_extractor is None: raise ValueError("Pipeline without tokenizer or feature_extractor cannot do batching") diff --git a/src/transformers/pipelines/zero_shot_image_classification.py b/src/transformers/pipelines/zero_shot_image_classification.py index c85e8d361584e9..65836c584dcc46 100644 --- a/src/transformers/pipelines/zero_shot_image_classification.py +++ b/src/transformers/pipelines/zero_shot_image_classification.py @@ -19,14 +19,14 @@ @add_end_docstrings(PIPELINE_INIT_ARGS) class ZeroShotImageClassificationPipeline(ChunkPipeline): """ - Image classification pipeline using any `AutoModelForZeroShotImageClassification`. This pipeline predicts the class - of an image. + Zero shot image classification pipeline using `CLIPModel`. This pipeline predicts the class of an image when you + provide an image and a set of `candidate_labels`. This image classification pipeline can currently be loaded from [`pipeline`] using the following task identifier: - `"image-classification"`. + `"zero-shot-image-classification"`. See the list of available models on - [huggingface.co/models](https://huggingface.co/models?filter=image-classification). + [huggingface.co/models](https://huggingface.co/models?filter=zer-shot-image-classification). """ def __init__(self, **kwargs): @@ -51,25 +51,20 @@ def __call__(self, images: Union[str, List[str], "Image", List["Image"]], **kwar - A string containing a local path to an image - An image loaded in PIL directly - The pipeline accepts either a single image or a batch of images, which must then be passed as a string. 
- Images in a batch must all be in the same format: all as http links, all as local paths, or all as PIL - images. candidate_labels (`List[str]`): The candidate labels for this image - hypothesis_template (`str`, *optional*, defaults to `"This is a photo of a {}"`): + + hypothesis_template (`str`, *optional*, defaults to `"This is a photo of {}"`): The sentence used in cunjunction with *candidate_labels* to attempt the image classification by replacing the placeholder with the candidate_labels. Then likelihood is estimated by using - likelihood_per_image + logits_per_image Return: - A dictionary or a list of dictionaries containing result. If the input is a single image, will return a - dictionary, if the input is a list of several images, will return a list of dictionaries corresponding to - the images. - - The dictionaries contain the following keys: + A list of dictionaries containing result, one dictionnary per proposed label. The dictionaries contain the + following keys: - - **label** (`str`) -- The label identified by the model. - - **score** (`int`) -- The score attributed by the model for that label. + - **label** (`str`) -- The label identified by the model. It is one of the suggested `candidate_label`. + - **score** (`int`) -- The score attributed by the model for that label (between 0 and 1). """ return super().__call__(images, **kwargs) @@ -80,10 +75,7 @@ def _sanitize_parameters(self, **kwargs): if "hypothesis_template" in kwargs: preprocess_params["hypothesis_template"] = kwargs["hypothesis_template"] - postprocess_params = {} - if "multi_label" in kwargs: - postprocess_params["multi_label"] = kwargs["multi_label"] - return preprocess_params, {}, postprocess_params + return preprocess_params, {}, {} def preprocess(self, image, candidate_labels=None, hypothesis_template="This is a photo of {}."): n = len(candidate_labels) diff --git a/tests/test_pipelines_zero_shot_image_classification.py b/tests/test_pipelines_zero_shot_image_classification.py index 568dc7288d694e..65272b3a5bab0d 100644 --- a/tests/test_pipelines_zero_shot_image_classification.py +++ b/tests/test_pipelines_zero_shot_image_classification.py @@ -54,14 +54,14 @@ class ZeroShotImageClassificationPipelineTests(unittest.TestCase, metaclass=Pipe # return # # return None, None - # speech_recognizer = ZeroShotImageClassificationPipeline( + # image_classifier = ZeroShotImageClassificationPipeline( # model=model, tokenizer=tokenizer, feature_extractor=feature_extractor # ) # # test with a raw waveform # image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") # image2 = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") - # return speech_recognizer, [image, image2] + # return image_classifier, [image, image2] # def run_pipeline_test(self, pipe, examples): # image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") @@ -77,18 +77,18 @@ def test_small_model_tf(self): @require_torch def test_small_model_pt(self): - speech_recognizer = pipeline( + image_classifier = pipeline( model="hf-internal-testing/tiny-random-clip-zero-shot-image-classification", ) image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") - output = speech_recognizer(image, candidate_labels=["a", "b", "c"]) + output = image_classifier(image, candidate_labels=["a", "b", "c"]) self.assertEqual( nested_simplify(output), [{"score": 0.333, "label": "a"}, {"score": 0.333, "label": "b"}, {"score": 0.333, "label": "c"}], ) - output = speech_recognizer([image] * 5, candidate_labels=["A", "B", 
"C"], batch_size=2) + output = image_classifier([image] * 5, candidate_labels=["A", "B", "C"], batch_size=2) self.assertEqual( nested_simplify(output), # Pipeline outputs are supposed to be deterministic and @@ -129,13 +129,13 @@ def test_small_model_pt(self): @slow @require_torch def test_large_model_pt(self): - speech_recognizer = pipeline( + image_classifier = pipeline( task="zero-shot-image-classification", model="openai/clip-vit-base-patch32", ) # This is an image of 2 cats with remotes and no planes image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") - output = speech_recognizer(image, candidate_labels=["cat", "plane", "remote"]) + output = image_classifier(image, candidate_labels=["cat", "plane", "remote"]) self.assertEqual( nested_simplify(output), @@ -146,7 +146,7 @@ def test_large_model_pt(self): ], ) - output = speech_recognizer([image] * 5, candidate_labels=["cat", "plane", "remote"], batch_size=2) + output = image_classifier([image] * 5, candidate_labels=["cat", "plane", "remote"], batch_size=2) self.assertEqual( nested_simplify(output), [ From df0faf800351190ff45afecf4088e59bf4e25930 Mon Sep 17 00:00:00 2001 From: Nicolas Patry Date: Thu, 17 Feb 2022 14:51:38 +0100 Subject: [PATCH 10/12] Tf support start. --- src/transformers/pipelines/__init__.py | 4 +- ...ipelines_zero_shot_image_classification.py | 88 +++++++++++++++++-- 2 files changed, 85 insertions(+), 7 deletions(-) diff --git a/src/transformers/pipelines/__init__.py b/src/transformers/pipelines/__init__.py index d149d35cc3f90b..4e20e71240b318 100755 --- a/src/transformers/pipelines/__init__.py +++ b/src/transformers/pipelines/__init__.py @@ -242,9 +242,9 @@ }, "zero-shot-image-classification": { "impl": ZeroShotImageClassificationPipeline, - "tf": (), + "tf": (TFAutoModel,) if is_tf_available() else (), "pt": (AutoModel,) if is_torch_available() else (), - "default": {"pt": "openai/clip-vit-base-patch32"}, + "default": {"pt": "openai/clip-vit-base-patch32", "tf": "openai/clip-vit-base-patch32"}, "type": "multimodal", }, "conversational": { diff --git a/tests/test_pipelines_zero_shot_image_classification.py b/tests/test_pipelines_zero_shot_image_classification.py index 65272b3a5bab0d..c314b92a0b141d 100644 --- a/tests/test_pipelines_zero_shot_image_classification.py +++ b/tests/test_pipelines_zero_shot_image_classification.py @@ -39,7 +39,6 @@ def open(*args, **kwargs): @require_vision -@require_torch @is_pipeline_test class ZeroShotImageClassificationPipelineTests(unittest.TestCase, metaclass=PipelineTestCaseMeta): # Deactivating auto tests since we don't have a good MODEL_FOR_XX mapping, @@ -71,10 +70,6 @@ class ZeroShotImageClassificationPipelineTests(unittest.TestCase, metaclass=Pipe # # Batching # outputs = pipe([image] * 3, batch_size=2, candidate_labels=["A", "B"]) - @require_tf - def test_small_model_tf(self): - self.skipTest("Not implemented in Tensorflow") - @require_torch def test_small_model_pt(self): image_classifier = pipeline( @@ -126,6 +121,57 @@ def test_small_model_pt(self): ], ) + @require_tf + def test_small_model_tf(self): + image_classifier = pipeline( + model="hf-internal-testing/tiny-random-clip-zero-shot-image-classification", framework="tf" + ) + image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") + output = image_classifier(image, candidate_labels=["a", "b", "c"]) + + self.assertEqual( + nested_simplify(output), + [{"score": 0.333, "label": "a"}, {"score": 0.333, "label": "b"}, {"score": 0.333, "label": "c"}], + ) + + output = 
image_classifier([image] * 5, candidate_labels=["A", "B", "C"], batch_size=2) + self.assertEqual( + nested_simplify(output), + # Pipeline outputs are supposed to be deterministic and + # So we could in theory have real values "A", "B", "C" instead + # of ANY(str). + # However it seems that in this particular case, the floating + # scores are so close, we enter floating error approximation + # and the order is not guaranteed anymore with batching. + [ + [ + {"score": 0.333, "label": ANY(str)}, + {"score": 0.333, "label": ANY(str)}, + {"score": 0.333, "label": ANY(str)}, + ], + [ + {"score": 0.333, "label": ANY(str)}, + {"score": 0.333, "label": ANY(str)}, + {"score": 0.333, "label": ANY(str)}, + ], + [ + {"score": 0.333, "label": ANY(str)}, + {"score": 0.333, "label": ANY(str)}, + {"score": 0.333, "label": ANY(str)}, + ], + [ + {"score": 0.333, "label": ANY(str)}, + {"score": 0.333, "label": ANY(str)}, + {"score": 0.333, "label": ANY(str)}, + ], + [ + {"score": 0.333, "label": ANY(str)}, + {"score": 0.333, "label": ANY(str)}, + {"score": 0.333, "label": ANY(str)}, + ], + ], + ) + @slow @require_torch def test_large_model_pt(self): @@ -158,3 +204,35 @@ def test_large_model_pt(self): ] * 5, ) + + @slow + @require_tf + def test_large_model_tf(self): + image_classifier = pipeline( + task="zero-shot-image-classification", model="openai/clip-vit-base-patch32", framework="tf" + ) + # This is an image of 2 cats with remotes and no planes + image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") + output = image_classifier(image, candidate_labels=["cat", "plane", "remote"]) + + self.assertEqual( + nested_simplify(output), + [ + {"score": 0.941, "label": "cat"}, + {"score": 0.055, "label": "remote"}, + {"score": 0.003, "label": "plane"}, + ], + ) + + output = image_classifier([image] * 5, candidate_labels=["cat", "plane", "remote"], batch_size=2) + self.assertEqual( + nested_simplify(output), + [ + [ + {"score": 0.941, "label": "cat"}, + {"score": 0.055, "label": "remote"}, + {"score": 0.003, "label": "plane"}, + ], + ] + * 5, + ) From 08cfaae8e4a2b02b02a9d1a30c3607345d2df044 Mon Sep 17 00:00:00 2001 From: Nicolas Patry Date: Thu, 17 Feb 2022 15:41:23 +0100 Subject: [PATCH 11/12] TF support. 
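CLIP scores every candidate text against every image, so with one candidate
label per forward pass we only keep the entries where image and text share a
batch position, i.e. the diagonal of `logits_per_image`. The ops are now
picked per framework: `torch.diagonal` vs `tf.linalg.diag_part` in `_forward`,
and `Tensor.softmax(dim=0)` vs `tf.nn.softmax(axis=0)` in `postprocess`. A
small illustrative check of that equivalence (this snippet is not part of the
diff below; it assumes both torch and tensorflow are installed and uses
made-up logits):

    import numpy as np
    import tensorflow as tf
    import torch

    # Fake 3x3 logits_per_image: rows = images, columns = candidate texts.
    logits = np.arange(9.0).reshape(3, 3)
    pt_diag = torch.diagonal(torch.from_numpy(logits))   # tensor([0., 4., 8.])
    tf_diag = tf.linalg.diag_part(tf.constant(logits))   # tf.Tensor([0. 4. 8.])
    assert np.allclose(pt_diag.numpy(), tf_diag.numpy())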
---
 .../zero_shot_image_classification.py | 35 ++++++++++++-------
 1 file changed, 23 insertions(+), 12 deletions(-)

diff --git a/src/transformers/pipelines/zero_shot_image_classification.py b/src/transformers/pipelines/zero_shot_image_classification.py
index 65836c584dcc46..062968138448fe 100644
--- a/src/transformers/pipelines/zero_shot_image_classification.py
+++ b/src/transformers/pipelines/zero_shot_image_classification.py
@@ -1,6 +1,12 @@
 from typing import List, Union
 
-from ..file_utils import add_end_docstrings, is_torch_available, is_vision_available, requires_backends
+from ..file_utils import (
+    add_end_docstrings,
+    is_tf_available,
+    is_torch_available,
+    is_vision_available,
+    requires_backends,
+)
 from ..utils import logging
 from .base import PIPELINE_INIT_ARGS, ChunkPipeline
 
@@ -13,6 +19,9 @@
 if is_torch_available():
     import torch
 
+if is_tf_available():
+    import tensorflow as tf
+
 logger = logging.get_logger(__name__)
 
@@ -32,9 +41,6 @@ class ZeroShotImageClassificationPipeline(ChunkPipeline):
 
     def __init__(self, **kwargs):
         super().__init__(**kwargs)
-        if self.framework != "pt":
-            raise ValueError(f"The {self.__class__} is only available in PyTorch.")
-
         requires_backends(self, "vision")
         # No specific FOR_XXX available yet
         # self.check_model_type(MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING)
@@ -81,9 +87,9 @@ def preprocess(self, image, candidate_labels=None, hypothesis_template="This is
         n = len(candidate_labels)
         for i, candidate_label in enumerate(candidate_labels):
             image = load_image(image)
-            images = self.feature_extractor(images=[image], return_tensors="pt")
+            images = self.feature_extractor(images=[image], return_tensors=self.framework)
             sequence = hypothesis_template.format(candidate_label)
-            inputs = self.tokenizer(sequence, return_tensors="pt")
+            inputs = self.tokenizer(sequence, return_tensors=self.framework)
             inputs["pixel_values"] = images.pixel_values
             yield {"is_last": i == n - 1, "candidate_label": candidate_label, **inputs}
 
@@ -95,7 +101,8 @@ def _forward(self, model_inputs):
         # Clip does crossproduct scoring by default, so we're only
         # interested in the results where image and text are in the same
         # batch position.
-        logits_per_image = torch.diagonal(outputs.logits_per_image)
+        diag = torch.diagonal if self.framework == "pt" else tf.linalg.diag_part
+        logits_per_image = diag(outputs.logits_per_image)
 
         model_outputs = {
             "is_last": is_last,
         }
         return model_outputs
 
-    def postprocess(self, model_outputs, multi_label=False):
+    def postprocess(self, model_outputs):
         candidate_labels = [outputs["candidate_label"] for outputs in model_outputs]
-        logits = torch.cat([output["logits_per_image"] for output in model_outputs])
-        print("Logits", logits)
-        probs = logits.softmax(dim=0)
-        scores = probs.tolist()
+        if self.framework == "pt":
+            logits = torch.cat([output["logits_per_image"] for output in model_outputs])
+            probs = logits.softmax(dim=0)
+            scores = probs.tolist()
+        else:
+            logits = tf.concat([output["logits_per_image"] for output in model_outputs], axis=0)
+            probs = tf.nn.softmax(logits, axis=0)
+            scores = probs.numpy().tolist()
 
         result = [
             {"score": score, "label": candidate_label}

From 7bd23df78d213f1940ca9dce970ba7b023aba47a Mon Sep 17 00:00:00 2001
From: Nicolas Patry
Date: Thu, 17 Feb 2022 15:42:50 +0100
Subject: [PATCH 12/12] Update src/transformers/pipelines/zero_shot_image_classification.py

Co-authored-by: Suraj Patil

---
 src/transformers/pipelines/zero_shot_image_classification.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/transformers/pipelines/zero_shot_image_classification.py b/src/transformers/pipelines/zero_shot_image_classification.py
index 062968138448fe..fb4036a9fa3333 100644
--- a/src/transformers/pipelines/zero_shot_image_classification.py
+++ b/src/transformers/pipelines/zero_shot_image_classification.py
@@ -70,7 +70,7 @@ def __call__(self, images: Union[str, List[str], "Image", List["Image"]], **kwar
             following keys:
 
             - **label** (`str`) -- The label identified by the model. It is one of the suggested `candidate_label`.
-            - **score** (`int`) -- The score attributed by the model for that label (between 0 and 1).
+            - **score** (`float`) -- The score attributed by the model for that label (between 0 and 1).
         """
         return super().__call__(images, **kwargs)
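
For reference, a usage sketch of the finished pipeline (based on the slow
PyTorch test above; the image path points at the repository's COCO test
fixture and the scores are the approximate values asserted there):

    from transformers import pipeline

    image_classifier = pipeline(
        task="zero-shot-image-classification",
        model="openai/clip-vit-base-patch32",
    )
    # Any http link, local path or PIL.Image is accepted; this fixture shows two cats with remotes.
    output = image_classifier(
        "./tests/fixtures/tests_samples/COCO/000000039769.png",
        candidate_labels=["cat", "plane", "remote"],
    )
    # Roughly: [{"score": 0.941, "label": "cat"},
    #           {"score": 0.055, "label": "remote"},
    #           {"score": 0.003, "label": "plane"}]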