import torch
from PIL import Image
from open_clip.factory import get_tokenizer
import pytest
import open_clip
import os

# Force CPU execution so the test does not require a GPU.
os.environ["CUDA_VISIBLE_DEVICES"] = ""


@pytest.mark.parametrize(
    "model_type,pretrained",
    [("ViT-B-32-quickgelu", "laion400m_e32"), ("roberta-ViT-B-32", "laion2b_s12b_b32k")],
)
def test_inference_simple(model_type, pretrained):
    # Load the model, its image preprocessing transforms, and the matching tokenizer.
    model, _, preprocess = open_clip.create_model_and_transforms(model_type, pretrained=pretrained, jit=False)
    tokenizer = get_tokenizer(model_type)

    current_dir = os.path.dirname(os.path.realpath(__file__))

    # Encode the sample image from the repo docs and three candidate captions.
    image = preprocess(Image.open(current_dir + "/../docs/CLIP.png")).unsqueeze(0)
    text = tokenizer(["a diagram", "a dog", "a cat"])

    with torch.no_grad():
        image_features = model.encode_image(image)
        text_features = model.encode_text(text)

        text_probs = (100.0 * image_features @ text_features.T).softmax(dim=-1)

    # The softmax over image-text similarities should put all probability on "a diagram".
    assert text_probs.cpu().numpy()[0].tolist() == [1.0, 0.0, 0.0]