kunyi committed on
Commit f76d30f
1 Parent(s): 330aea7
Upload 30 files
- README.md +279 -3
- README_CN.md +281 -0
- clip/__init__.py +5 -0
- clip/bert_tokenizer.py +436 -0
- clip/configuration_bert.py +86 -0
- clip/model.py +914 -0
- clip/model_configs/RBT3-chinese.json +13 -0
- clip/model_configs/RN50.json +7 -0
- clip/model_configs/RoBERTa-wwm-ext-base-chinese.json +13 -0
- clip/model_configs/RoBERTa-wwm-ext-large-chinese.json +13 -0
- clip/model_configs/ViT-B-16.json +7 -0
- clip/model_configs/ViT-B-32.json +7 -0
- clip/model_configs/ViT-H-14.json +8 -0
- clip/model_configs/ViT-L-14-336.json +7 -0
- clip/model_configs/ViT-L-14.json +7 -0
- clip/modeling_bert.py +484 -0
- clip/utils.py +184 -0
- clip/vocab.txt +0 -0
- eval/cvinw_zeroshot_templates.py +474 -0
- eval/data.py +164 -0
- eval/evaluation.py +157 -0
- eval/evaluation_tr.py +157 -0
- eval/extract_features.py +212 -0
- eval/make_topk_predictions.py +88 -0
- eval/make_topk_predictions_tr.py +88 -0
- eval/transform_ir_annotation_to_tr.py +36 -0
- eval/zeroshot_evaluation.py +267 -0
- examples/pokemon.jpeg +0 -0
- requirements.txt +10 -0
- scripts/zeroshot_eval.sh +34 -0
README.md
CHANGED
@@ -1,3 +1,279 @@
|
|
1 |
-
|
2 |
-
|
3 |
-
|
1 |
+
[**中文说明**](README_CN.md) | [**English**](README.md)
|
2 |
+
# Introduction
|
3 |
+
<br><br>
|
4 |
+
This project aims to provide a better Chinese CLIP model. The training data consists of publicly accessible image URLs with associated Chinese text descriptions, totaling 400 million pairs. After filtering, we ultimately used 100 million pairs for training.
|
5 |
+
This project was developed by the QQ-ARC Joint Lab, Tencent PCG.
|
6 |
+
<br><br>
|
7 |
+
|
8 |
+
# Models and Results
|
9 |
+
<span id="model_card"></span>
|
10 |
+
## Model Card
|
11 |
+
QA-CLIP currently provides three open-source models of different sizes. Their details and download links are listed in the table below:
|
12 |
+
<table border="1" width="100%">
|
13 |
+
<tr align="center">
|
14 |
+
<th>Model</th><th>Checkpoint</th><th>Params</th><th>Vision</th><th>Params of Vision</th><th>Text</th><th>Params of Text</th><th>Resolution</th>
|
15 |
+
</tr>
|
16 |
+
<tr align="center">
|
17 |
+
<td>QA-CLIP<sub>RN50</sub></td><td><a href="https://huggingface.co/TencentARC/QA-CLIP/resolve/main/QA-CLIP-RN50.pt">Download</a></td><td>77M</td><td>ResNet50</td><td>38M</td><td>RBT3</td><td>39M</td><td>224</td>
|
18 |
+
</tr>
|
19 |
+
<tr align="center">
|
20 |
+
<td>QA-CLIP<sub>ViT-B/16</sub></td><td><a href="https://huggingface.co/TencentARC/QA-CLIP/resolve/main/QA-CLIP-base.pt">Download</a></td><td>188M</td><td>ViT-B/16</td><td>86M</td><td>RoBERTa-wwm-Base</td><td>102M</td><td>224</td>
|
21 |
+
</tr>
|
22 |
+
<tr align="center">
|
23 |
+
<td>QA-CLIP<sub>ViT-L/14</sub></td><td><a href="https://huggingface.co/TencentARC/QA-CLIP/resolve/main/QA-CLIP-large.pt">Download</a></td><td>406M</td><td>ViT-L/14</td><td>304M</td><td>RoBERTa-wwm-Base</td><td>102M</td><td>224</td>
|
24 |
+
</tr>
|
25 |
+
</table>
|
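The checkpoints in the table can also be fetched programmatically. A minimal sketch, assuming the `huggingface_hub` package is installed; the repo id and filename follow the download links above:

```python
# Sketch: download a QA-CLIP checkpoint from the Hugging Face Hub.
# Requires `pip install huggingface_hub`; the filename follows the table above.
from huggingface_hub import hf_hub_download

ckpt_path = hf_hub_download(repo_id="TencentARC/QA-CLIP", filename="QA-CLIP-base.pt")
print(ckpt_path)  # local cache path of the downloaded checkpoint
```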
26 |
+
<br>
|
27 |
+
|
28 |
+
## Results
|
29 |
+
We conducted zero-shot image-text retrieval tests on the [MUGE Retrieval](https://tianchi.aliyun.com/muge), [Flickr30K-CN](https://github.com/li-xirong/cross-lingual-cap), and [COCO-CN](https://github.com/li-xirong/coco-cn) datasets, and zero-shot image classification on ImageNet. The results are shown in the tables below:
|
30 |
+
|
31 |
+
**Flickr30K-CN Zero-shot Retrieval (Official Test Set)**:
|
32 |
+
<table border="1" width="120%">
|
33 |
+
<tr align="center">
|
34 |
+
<th>Task</th><th colspan="3">Text-to-Image</th><th colspan="3">Image-to-Text</th>
|
35 |
+
</tr>
|
36 |
+
<tr align="center">
|
37 |
+
<td>Metric</td><td>R@1</td><td>R@5</td><td>R@10</td><td>R@1</td><td>R@5</td><td>R@10</td>
|
38 |
+
</tr>
|
39 |
+
<tr align="center">
|
40 |
+
<td width="120%">CN-CLIP<sub>RN50</sub></td><td>48.8</td><td>76.0</td><td>84.6</td><td>60.0</td><td>85.9</td><td>92.0</td>
|
41 |
+
</tr>
|
42 |
+
<tr align="center">
|
43 |
+
<td width="120%">QA-CLIP<sub>RN50</sub></td><td><b>50.5</b></td><td><b>77.4</b></td><td><b>86.1</b></td><td><b>67.1</b></td><td><b>87.9</b></td><td><b>93.2</b></td>
|
44 |
+
</tr>
|
45 |
+
<tr align="center">
|
46 |
+
<td width="120%">CN-CLIP<sub>ViT-B/16</sub></td><td>62.7</td><td>86.9</td><td>92.8</td><td>74.6</td><td>93.5</td><td>97.1</td>
|
47 |
+
</tr>
|
48 |
+
<tr align="center">
|
49 |
+
<td width="120%">QA-CLIP<sub>ViT-B/16</sub></td><td><b>63.8</b></td><td><b>88.0</b></td><td><b>93.2</b></td><td><b>78.4</b></td><td><b>96.1</b></td><td><b>98.5</b></td>
|
50 |
+
</tr>
|
51 |
+
<tr align="center">
|
52 |
+
<td width="120%">CN-CLIP<sub>ViT-L/14</sub></td><td>68.0</td><td>89.7</td><td>94.4</td><td>80.2</td><td>96.6</td><td>98.2</td>
|
53 |
+
</tr>
|
54 |
+
<tr align="center">
|
55 |
+
<td width="120%">AltClip<sub>ViT-L/14</sub></td><td><b>69.7</b></td><td>90.1</td><td>94.8</td><td>84.8</td><td>97.7</td><td>99.1</td>
|
56 |
+
</tr>
|
57 |
+
<tr align="center">
|
58 |
+
<td width="120%">CN-CLIP<sub>ViT-L/14</sub></td><td>69.3</td><td><b>90.3</b></td><td><b>94.7</b></td><td><b>85.3</b></td><td><b>97.9</b></td><td><b>99.2</b></td>
|
59 |
+
</tr>
|
60 |
+
</table>
|
61 |
+
<br>
|
62 |
+
|
63 |
+
**MUGE Zero-shot Retrieval (Official Validation Set)**:
|
64 |
+
<table border="1" width="120%">
|
65 |
+
<tr align="center">
|
66 |
+
<th>Task</th><th colspan="3">Text-to-Image</th><th colspan="3">Image-to-Text</th>
|
67 |
+
</tr>
|
68 |
+
<tr align="center">
|
69 |
+
<td>Metric</td><td>R@1</td><td>R@5</td><td>R@10</td><td>R@1</td><td>R@5</td><td>R@10</td>
|
70 |
+
</tr>
|
71 |
+
<tr align="center">
|
72 |
+
<td width="120%">CN-CLIP<sub>RN50</sub></td><td>42.6</td><td>68.5</td><td>78.0</td><td>30.0</td><td>56.2</td><td>66.9</td>
|
73 |
+
</tr>
|
74 |
+
<tr align="center">
|
75 |
+
<td width="120%">QA-CLIP<sub>RN50</sub></td><td><b>44.0</b></td><td><b>69.9</b></td><td><b>79.5</b></td><td><b>32.4</b></td><td><b>59.5</b></td><td><b>70.3</b></td>
|
76 |
+
</tr>
|
77 |
+
<tr align="center">
|
78 |
+
<td width="120%">CN-CLIP<sub>ViT-B/16</sub></td><td>52.1</td><td>76.7</td><td>84.4</td><td>38.7</td><td>65.6</td><td>75.1</td>
|
79 |
+
</tr>
|
80 |
+
<tr align="center">
|
81 |
+
<td width="120%">QA-CLIP<sub>ViT-B/16</sub></td><td><b>53.2</b></td><td><b>77.7</b></td><td><b>85.1</b></td><td><b>40.7</b></td><td><b>68.2</b></td><td><b>77.2</b></td>
|
82 |
+
</tr>
|
83 |
+
<tr align="center">
|
84 |
+
<td width="120%">CN-CLIP<sub>ViT-L/14</sub></td><td>56.4</td><td>79.8</td><td>86.2</td><td>42.6</td><td>69.8</td><td>78.6</td>
|
85 |
+
</tr>
|
86 |
+
<tr align="center">
|
87 |
+
<td width="120%">AltClip<sub>ViT-L/14</sub></td><td>29.6</td><td>49.9</td><td>58.8</td><td>21.4</td><td>42.0</td><td>51.9</td>
|
88 |
+
</tr>
|
89 |
+
<tr align="center">
|
90 |
+
<td width="120%">QA-CLIP<sub>ViT-L/14</sub></td><td><b>57.4</b></td><td><b>81.0</b></td><td><b>87.7</b></td><td><b>45.5</b></td><td><b>73.0</b></td><td><b>81.4</b></td>
|
91 |
+
</tr>
|
92 |
+
</table>
|
93 |
+
<br>
|
94 |
+
|
95 |
+
**COCO-CN Zero-shot Retrieval (Official Test Set)**:
|
96 |
+
<table border="1" width="120%">
|
97 |
+
<tr align="center">
|
98 |
+
<th>Task</th><th colspan="3">Text-to-Image</th><th colspan="3">Image-to-Text</th>
|
99 |
+
</tr>
|
100 |
+
<tr align="center">
|
101 |
+
<td>Metric</td><td>R@1</td><td>R@5</td><td>R@10</td><td>R@1</td><td>R@5</td><td>R@10</td>
|
102 |
+
</tr>
|
103 |
+
<tr align="center">
|
104 |
+
<td width="120%">CN-CLIP<sub>RN50</sub></td><td>48.1</td><td>81.3</td><td>90.5</td><td>50.9</td><td>81.1</td><td>90.5</td>
|
105 |
+
</tr>
|
106 |
+
<tr align="center">
|
107 |
+
<td width="120%">QA-CLIP<sub>RN50</sub></td><td><b>50.1</b></td><td><b>82.5</b></td><td><b>91.7</b></td><td><b>56.7</b></td><td><b>85.2</b></td><td><b>92.9</b></td>
|
108 |
+
</tr>
|
109 |
+
<tr align="center">
|
110 |
+
<td width="120%">CN-CLIP<sub>ViT-B/16</sub></td><td>62.2</td><td>87.1</td><td>94.9</td><td>56.3</td><td>84.0</td><td>93.3</td>
|
111 |
+
</tr>
|
112 |
+
<tr align="center">
|
113 |
+
<td width="120%">QA-CLIP<sub>ViT-B/16</sub></td><td><b>62.9</b></td><td><b>87.7</b></td><td><b>94.7</b></td><td><b>61.5</b></td><td><b>87.6</b></td><td><b>94.8</b></td>
|
114 |
+
</tr>
|
115 |
+
<tr align="center">
|
116 |
+
<td width="120%">CN-CLIP<sub>ViT-L/14</sub></td><td>64.9</td><td>88.8</td><td>94.2</td><td>60.6</td><td>84.4</td><td>93.1</td>
|
117 |
+
</tr>
|
118 |
+
<tr align="center">
|
119 |
+
<td width="120%">AltClip<sub>ViT-L/14</sub></td><td>63.5</td><td>87.6</td><td>93.5</td><td>62.6</td><td><b>88.5</b></td><td><b>95.9</b></td>
|
120 |
+
</tr>
|
121 |
+
<tr align="center">
|
122 |
+
<td width="120%">QA-CLIP<sub>ViT-L/14</sub></td><td><b>65.7</b></td><td><b>90.2</b></td><td><b>95.0</b></td><td><b>64.5</b></td><td>88.3</td><td>95.1</td>
|
123 |
+
</tr>
|
124 |
+
</table>
|
125 |
+
<br>
|
126 |
+
|
127 |
+
**Zero-shot Image Classification on ImageNet**:
|
128 |
+
<table border="1" width="120%">
|
129 |
+
<tr align="center">
|
130 |
+
<th>Task</th><th colspan="1">ImageNet</th>
|
131 |
+
</tr>
|
132 |
+
<tr align="center">
|
133 |
+
<td width="120%">CN-CLIP<sub>RN50</sub></td><td>33.5</td>
|
134 |
+
</tr>
|
135 |
+
<tr align="center">
|
136 |
+
<td width="120%">QA-CLIP<sub>RN50</sub></td><td><b>35.5</b></td>
|
137 |
+
</tr>
|
138 |
+
<tr align="center">
|
139 |
+
<td width="120%">CN-CLIP<sub>ViT-B/16</sub></td><td>48.4</td>
|
140 |
+
</tr>
|
141 |
+
<tr align="center">
|
142 |
+
<td width="120%">QA-CLIP<sub>ViT-B/16</sub></td><td><b>49.7</b></td>
|
143 |
+
</tr>
|
144 |
+
<tr align="center">
|
145 |
+
<td width="120%">CN-CLIP<sub>ViT-L/14</sub></td><td>54.7</td>
|
146 |
+
</tr>
|
147 |
+
<tr align="center">
|
148 |
+
<td width="120%">QA-CLIP<sub>ViT-L/14</sub></td><td><b>55.8</b></td>
|
149 |
+
</tr>
|
150 |
+
</table>
|
151 |
+
<br>
|
152 |
+
|
153 |
+
<br><br>
|
154 |
+
|
155 |
+
|
156 |
+
# Getting Started
|
157 |
+
## Installation Requirements
|
158 |
+
Environment configuration requirements:
|
159 |
+
|
160 |
+
* python >= 3.6.4
|
161 |
+
* pytorch >= 1.8.0 (with torchvision >= 0.9.0)
|
162 |
+
* CUDA Version >= 10.2
|
163 |
+
|
164 |
+
Install required packages:
|
165 |
+
```bash
|
166 |
+
cd /yourpath/QA-CLIP-main
|
167 |
+
pip install -r requirements.txt
|
168 |
+
```
|
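To quickly confirm the environment meets the requirements above, a minimal check (nothing project-specific is assumed here):

```python
# Quick sanity check against the environment requirements listed above.
import sys
import torch
import torchvision

print("python     :", sys.version.split()[0])   # expect >= 3.6.4
print("pytorch    :", torch.__version__)        # expect >= 1.8.0
print("torchvision:", torchvision.__version__)  # expect >= 0.9.0
print("CUDA build :", torch.version.cuda)       # expect >= 10.2 (None for CPU-only builds)
print("CUDA usable:", torch.cuda.is_available())
```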
169 |
+
|
170 |
+
## Inference Code
|
171 |
+
```bash
|
172 |
+
export PYTHONPATH=/yourpath/QA-CLIP-main
|
173 |
+
```
|
174 |
+
Inference code example:
|
175 |
+
```python
|
176 |
+
import torch
|
177 |
+
from PIL import Image
|
178 |
+
|
179 |
+
import clip as clip
|
180 |
+
from clip import load_from_name, available_models
|
181 |
+
print("Available models:", available_models())
|
182 |
+
# Available models: ['ViT-B-16', 'ViT-L-14', 'RN50']
|
183 |
+
|
184 |
+
device = "cuda" if torch.cuda.is_available() else "cpu"
|
185 |
+
model, preprocess = load_from_name("ViT-B-16", device=device, download_root='./')
|
186 |
+
model.eval()
|
187 |
+
image = preprocess(Image.open("examples/pokemon.jpeg")).unsqueeze(0).to(device)
|
188 |
+
text = clip.tokenize(["杰尼龟", "妙蛙种子", "小火龙", "皮卡丘"]).to(device)
|
189 |
+
|
190 |
+
with torch.no_grad():
|
191 |
+
image_features = model.encode_image(image)
|
192 |
+
text_features = model.encode_text(text)
|
193 |
+
# Normalize the features. Please use the normalized features for downstream tasks.
|
194 |
+
image_features /= image_features.norm(dim=-1, keepdim=True)
|
195 |
+
text_features /= text_features.norm(dim=-1, keepdim=True)
|
196 |
+
|
197 |
+
logits_per_image, logits_per_text = model.get_similarity(image, text)
|
198 |
+
probs = logits_per_image.softmax(dim=-1).cpu().numpy()
|
199 |
+
|
200 |
+
print("Label probs:", probs)
|
201 |
+
```
|
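As a follow-up to the example above, the normalized features can also be compared directly. A small sketch that reuses the variables from the snippet above and picks the most likely label:

```python
# Continuing the example above: rank the candidate texts for the image by
# cosine similarity of the already-normalized features, then take the best one.
candidates = ["杰尼龟", "妙蛙种子", "小火龙", "皮卡丘"]
with torch.no_grad():
    sims = (image_features @ text_features.T).squeeze(0)  # shape: (num_texts,)
best_idx = int(sims.argmax().item())
print("Best match:", candidates[best_idx], "cosine similarity:", float(sims[best_idx]))
```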
202 |
+
<br><br>
|
203 |
+
|
204 |
+
## Prediction and Evaluation
|
205 |
+
|
206 |
+
### Download Image-text Retrieval Test Dataset
|
207 |
+
The test sets have already been preprocessed in the <b>[Chinese-CLIP](https://github.com/OFA-Sys/Chinese-CLIP)</b> project. Here are the download links they provide:
|
208 |
+
|
209 |
+
MUGE dataset: [download link](https://clip-cn-beijing.oss-cn-beijing.aliyuncs.com/datasets/MUGE.zip)
|
210 |
+
|
211 |
+
Flickr30K-CN dataset: [download link](https://clip-cn-beijing.oss-cn-beijing.aliyuncs.com/datasets/Flickr30k-CN.zip)
|
212 |
+
|
213 |
+
Additionally, obtaining the [COCO-CN](https://github.com/li-xirong/coco-cn) dataset requires applying to the original author.
|
214 |
+
|
215 |
+
### Download ImageNet Dataset
|
216 |
+
Please download the raw ImageNet data yourself. The [Chinese labels](http://clip-cn-beijing.oss-cn-beijing.aliyuncs.com/datasets/ImageNet-1K/label_cn.txt) and [English labels](http://clip-cn-beijing.oss-cn-beijing.aliyuncs.com/datasets/ImageNet-1K/label.txt) are provided by the <b>[Chinese-CLIP](https://github.com/OFA-Sys/Chinese-CLIP)</b> project.
|
217 |
+
### Image-text Retrieval Evaluation
|
218 |
+
The image-text retrieval evaluation can be run as follows:
|
219 |
+
```bash
|
220 |
+
split=test # specify whether to compute features for the valid or test split
|
221 |
+
resume=your_ckp_path
|
222 |
+
DATAPATH=your_DATAPATH
|
223 |
+
dataset_name=Flickr30k-CN
|
224 |
+
# dataset_name=MUGE
|
225 |
+
|
226 |
+
python -u eval/extract_features.py \
|
227 |
+
--extract-image-feats \
|
228 |
+
--extract-text-feats \
|
229 |
+
--image-data="${DATAPATH}/datasets/${dataset_name}/lmdb/${split}/imgs" \
|
230 |
+
--text-data="${DATAPATH}/datasets/${dataset_name}/${split}_texts.jsonl" \
|
231 |
+
--img-batch-size=32 \
|
232 |
+
--text-batch-size=32 \
|
233 |
+
--context-length=52 \
|
234 |
+
--resume=${resume} \
|
235 |
+
--vision-model=ViT-B-16 \
|
236 |
+
--text-model=RoBERTa-wwm-ext-base-chinese
|
237 |
+
|
238 |
+
python -u eval/make_topk_predictions.py \
|
239 |
+
--image-feats="${DATAPATH}/datasets/${dataset_name}/${split}_imgs.img_feat.jsonl" \
|
240 |
+
--text-feats="${DATAPATH}/datasets/${dataset_name}/${split}_texts.txt_feat.jsonl" \
|
241 |
+
--top-k=10 \
|
242 |
+
--eval-batch-size=32768 \
|
243 |
+
--output="${DATAPATH}/datasets/${dataset_name}/${split}_predictions.jsonl"
|
244 |
+
|
245 |
+
python -u eval/make_topk_predictions_tr.py \
|
246 |
+
--image-feats="${DATAPATH}/datasets/${dataset_name}/${split}_imgs.img_feat.jsonl" \
|
247 |
+
--text-feats="${DATAPATH}/datasets/${dataset_name}/${split}_texts.txt_feat.jsonl" \
|
248 |
+
--top-k=10 \
|
249 |
+
--eval-batch-size=32768 \
|
250 |
+
--output="${DATAPATH}/datasets/${dataset_name}/${split}_tr_predictions.jsonl"
|
251 |
+
|
252 |
+
python eval/evaluation.py \
|
253 |
+
${DATAPATH}/datasets/${dataset_name}/${split}_texts.jsonl \
|
254 |
+
${DATAPATH}/datasets/${dataset_name}/${split}_predictions.jsonl \
|
255 |
+
${DATAPATH}/datasets/${dataset_name}/output1.json
|
256 |
+
cat ${DATAPATH}/datasets/${dataset_name}/output1.json
|
257 |
+
|
258 |
+
python eval/transform_ir_annotation_to_tr.py \
|
259 |
+
--input ${DATAPATH}/datasets/${dataset_name}/${split}_texts.jsonl
|
260 |
+
|
261 |
+
python eval/evaluation_tr.py \
|
262 |
+
${DATAPATH}/datasets/${dataset_name}/${split}_texts.tr.jsonl \
|
263 |
+
${DATAPATH}/datasets/${dataset_name}/${split}_tr_predictions.jsonl \
|
264 |
+
${DATAPATH}/datasets/${dataset_name}/output2.json
|
265 |
+
cat ${DATAPATH}/datasets/${dataset_name}/output2.json
|
266 |
+
```
|
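The reported metrics come from `eval/evaluation.py` and `eval/evaluation_tr.py`. For intuition only, the sketch below computes Recall@K from the top-k prediction file, assuming (as in Chinese-CLIP) that each line of `${split}_texts.jsonl` is a JSON object with `text_id` and ground-truth `image_ids`, and each line of `${split}_predictions.jsonl` has `text_id` and a ranked `image_ids` list; these field names are an assumption, so use the official scripts for the actual numbers:

```python
# Illustrative Recall@K over the top-k prediction file (not the official script).
# Assumed jsonl fields: ground truth  {"text_id": ..., "image_ids": [...]}
#                       predictions   {"text_id": ..., "image_ids": [ranked list]}
import json

def recall_at_k(gt_path, pred_path, ks=(1, 5, 10)):
    gt = {}
    with open(gt_path, encoding="utf-8") as f:
        for line in f:
            obj = json.loads(line)
            gt[obj["text_id"]] = set(obj["image_ids"])
    hits = {k: 0 for k in ks}
    total = 0
    with open(pred_path, encoding="utf-8") as f:
        for line in f:
            obj = json.loads(line)
            ranked = obj["image_ids"]
            total += 1
            for k in ks:
                if gt[obj["text_id"]] & set(ranked[:k]):
                    hits[k] += 1
    return {f"R@{k}": hits[k] / total for k in ks}

# Example (paths follow the commands above):
# print(recall_at_k("DATAPATH/datasets/Flickr30k-CN/test_texts.jsonl",
#                   "DATAPATH/datasets/Flickr30k-CN/test_predictions.jsonl"))
```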
267 |
+
|
268 |
+
### ImageNet Zero-shot Classification
|
269 |
+
ImageNet zero-shot classification can be run as follows:
|
270 |
+
```bash
|
271 |
+
bash scripts/zeroshot_eval.sh 0 \
|
272 |
+
${DATAPATH} imagenet \
|
273 |
+
ViT-B-16 RoBERTa-wwm-ext-base-chinese \
|
274 |
+
./pretrained_weights/QA-CLIP-base.pt
|
275 |
+
```
|
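The script above wraps `eval/zeroshot_evaluation.py`. Conceptually, zero-shot classification builds one text embedding per class name and assigns the image to the class whose embedding is closest. A minimal sketch with the inference API shown earlier; the class names and the single prompt template are placeholders, not the label list or templates used by the script:

```python
# Illustrative zero-shot classification with the inference API from above.
# Class names and the prompt template are placeholders (assumptions), not the
# actual ImageNet labels/templates used by scripts/zeroshot_eval.sh.
import torch
from PIL import Image
import clip
from clip import load_from_name

device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = load_from_name("ViT-B-16", device=device, download_root='./')
model.eval()

class_names = ["猫", "狗", "飞机"]  # placeholder classes
texts = clip.tokenize([f"一张{name}的照片" for name in class_names]).to(device)
image = preprocess(Image.open("examples/pokemon.jpeg")).unsqueeze(0).to(device)

with torch.no_grad():
    text_features = model.encode_text(texts)
    text_features /= text_features.norm(dim=-1, keepdim=True)
    image_features = model.encode_image(image)
    image_features /= image_features.norm(dim=-1, keepdim=True)
    pred = (image_features @ text_features.T).argmax(dim=-1).item()

print("Predicted class:", class_names[pred])
```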
276 |
+
# Acknowledgments
|
277 |
+
<br><br>
|
278 |
+
The project code is based on the implementation of <b>[Chinese-CLIP](https://github.com/OFA-Sys/Chinese-CLIP)</b>, and we are very grateful for their outstanding open-source contribution.
|
279 |
+
<br><br>
|
README_CN.md
ADDED
@@ -0,0 +1,281 @@
1 |
+
[**中文说明**](README_CN.md) | [**English**](README.md)
|
2 |
+
# 项目介绍
|
3 |
+
<br><br>
|
4 |
+
本项目旨在提供更好的中文CLIP模型。该项目使用的训练数据均为公开可访问的图像URL及相关中文文本描述,总量达到400M。经过筛选后,我们最终使用了100M的数据进行训练。
|
5 |
+
本项目由QQ-ARC Joint Lab, Tencent PCG完成。
|
6 |
+
<br><br>
|
7 |
+
|
8 |
+
# 模型及实验
|
9 |
+
<span id="model_card"></span>
|
10 |
+
## 模型规模 & 下载链接
|
11 |
+
QA-CLIP目前开源3个不同规模,其模型信息和下载方式见下表:
|
12 |
+
|
13 |
+
<table border="1" width="100%">
|
14 |
+
<tr align="center">
|
15 |
+
<th>模型规模</th><th>下载链接</th><th>参数量</th><th>视觉侧骨架</th><th>视觉侧参数量</th><th>文本侧骨架</th><th>文本侧参数量</th><th>分辨率</th>
|
16 |
+
</tr>
|
17 |
+
<tr align="center">
|
18 |
+
<td>QA-CLIP<sub>RN50</sub></td><td><a href="https://huggingface.co/TencentARC/QA-CLIP/resolve/main/QA-CLIP-RN50.pt">Download</a></td><td>77M</td><td>ResNet50</td><td>38M</td><td>RBT3</td><td>39M</td><td>224</td>
|
19 |
+
</tr>
|
20 |
+
<tr align="center">
|
21 |
+
<td>QA-CLIP<sub>ViT-B/16</sub></td><td><a href="https://huggingface.co/TencentARC/QA-CLIP/resolve/main/QA-CLIP-base.pt">Download</a></td><td>188M</td><td>ViT-B/16</td><td>86M</td><td>RoBERTa-wwm-Base</td><td>102M</td><td>224</td>
|
22 |
+
</tr>
|
23 |
+
<tr align="center">
|
24 |
+
<td>QA-CLIP<sub>ViT-L/14</sub></td><td><a href="https://huggingface.co/TencentARC/QA-CLIP/resolve/main/QA-CLIP-large.pt">Download</a></td><td>406M</td><td>ViT-L/14</td><td>304M</td><td>RoBERTa-wwm-Base</td><td>102M</td><td>224</td>
|
25 |
+
</tr>
|
26 |
+
</table>
|
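表中的权重也可以用`huggingface_hub`直接下载。下面是一段示意代码(假设已安装`huggingface_hub`,repo id与文件名取自上表的下载链接):

```python
# 示意:从Hugging Face Hub下载QA-CLIP权重(需先 pip install huggingface_hub)。
from huggingface_hub import hf_hub_download

ckpt_path = hf_hub_download(repo_id="TencentARC/QA-CLIP", filename="QA-CLIP-base.pt")
print(ckpt_path)  # 下载后权重在本地缓存中的路径
```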
27 |
+
<br>
|
28 |
+
|
29 |
+
## 实验结果
|
30 |
+
针对图文检索任务,我们在[MUGE Retrieval](https://tianchi.aliyun.com/muge)、[Flickr30K-CN](https://github.com/li-xirong/cross-lingual-cap)和[COCO-CN](https://github.com/li-xirong/coco-cn)上进行了zero-shot测试。
|
31 |
+
针对图像零样本分类任务,我们在ImageNet数据集上进行了测试。测试结果见下表:
|
32 |
+
|
33 |
+
|
34 |
+
**Flickr30K-CN Zero-shot Retrieval (Official Test Set)**:
|
35 |
+
<table border="1" width="120%">
|
36 |
+
<tr align="center">
|
37 |
+
<th>Task</th><th colspan="3">Text-to-Image</th><th colspan="3">Image-to-Text</th>
|
38 |
+
</tr>
|
39 |
+
<tr align="center">
|
40 |
+
<td>Metric</td><td>R@1</td><td>R@5</td><td>R@10</td><td>R@1</td><td>R@5</td><td>R@10</td>
|
41 |
+
</tr>
|
42 |
+
<tr align="center">
|
43 |
+
<td width="120%">CN-CLIP<sub>RN50</sub></td><td>48.8</td><td>76.0</td><td>84.6</td><td>60.0</td><td>85.9</td><td>92.0</td>
|
44 |
+
</tr>
|
45 |
+
<tr align="center">
|
46 |
+
<td width="120%">QA-CLIP<sub>RN50</sub></td><td><b>50.5</b></td><td><b>77.4</b></td><td><b>86.1</b></td><td><b>67.1</b></td><td><b>87.9</b></td><td><b>93.2</b></td>
|
47 |
+
</tr>
|
48 |
+
<tr align="center">
|
49 |
+
<td width="120%">CN-CLIP<sub>ViT-B/16</sub></td><td>62.7</td><td>86.9</td><td>92.8</td><td>74.6</td><td>93.5</td><td>97.1</td>
|
50 |
+
</tr>
|
51 |
+
<tr align="center">
|
52 |
+
<td width="120%">QA-CLIP<sub>ViT-B/16</sub></td><td><b>63.8</b></td><td><b>88.0</b></td><td><b>93.2</b></td><td><b>78.4</b></td><td><b>96.1</b></td><td><b>98.5</b></td>
|
53 |
+
</tr>
|
54 |
+
<tr align="center">
|
55 |
+
<td width="120%">CN-CLIP<sub>ViT-L/14</sub></td><td>68.0</td><td>89.7</td><td>94.4</td><td>80.2</td><td>96.6</td><td>98.2</td>
|
56 |
+
</tr>
|
57 |
+
<tr align="center">
|
58 |
+
<td width="120%">AltClip<sub>ViT-L/14</sub></td><td><b>69.7</b></td><td>90.1</td><td>94.8</td><td>84.8</td><td>97.7</td><td>99.1</td>
|
59 |
+
</tr>
|
60 |
+
<tr align="center">
|
61 |
+
<td width="120%">CN-CLIP<sub>ViT-L/14</sub></td><td>69.3</td><td><b>90.3</b></td><td><b>94.7</b></td><td><b>85.3</b></td><td><b>97.9</b></td><td><b>99.2</b></td>
|
62 |
+
</tr>
|
63 |
+
</table>
|
64 |
+
<br>
|
65 |
+
|
66 |
+
**MUGE Zero-shot Retrieval (Official Validation Set)**:
|
67 |
+
<table border="1" width="120%">
|
68 |
+
<tr align="center">
|
69 |
+
<th>Task</th><th colspan="3">Text-to-Image</th><th colspan="3">Image-to-Text</th>
|
70 |
+
</tr>
|
71 |
+
<tr align="center">
|
72 |
+
<td>Metric</td><td>R@1</td><td>R@5</td><td>R@10</td><td>R@1</td><td>R@5</td><td>R@10</td>
|
73 |
+
</tr>
|
74 |
+
<tr align="center">
|
75 |
+
<td width="120%">CN-CLIP<sub>RN50</sub></td><td>42.6</td><td>68.5</td><td>78.0</td><td>30.0</td><td>56.2</td><td>66.9</td>
|
76 |
+
</tr>
|
77 |
+
<tr align="center">
|
78 |
+
<td width="120%">QA-CLIP<sub>RN50</sub></td><td><b>44.0</b></td><td><b>69.9</b></td><td><b>79.5</b></td><td><b>32.4</b></td><td><b>59.5</b></td><td><b>70.3</b></td>
|
79 |
+
</tr>
|
80 |
+
<tr align="center">
|
81 |
+
<td width="120%">CN-CLIP<sub>ViT-B/16</sub></td><td>52.1</td><td>76.7</td><td>84.4</td><td>38.7</td><td>65.6</td><td>75.1</td>
|
82 |
+
</tr>
|
83 |
+
<tr align="center">
|
84 |
+
<td width="120%">QA-CLIP<sub>ViT-B/16</sub></td><td><b>53.2</b></td><td><b>77.7</b></td><td><b>85.1</b></td><td><b>40.7</b></td><td><b>68.2</b></td><td><b>77.2</b></td>
|
85 |
+
</tr>
|
86 |
+
<tr align="center">
|
87 |
+
<td width="120%">CN-CLIP<sub>ViT-L/14</sub></td><td>56.4</td><td>79.8</td><td>86.2</td><td>42.6</td><td>69.8</td><td>78.6</td>
|
88 |
+
</tr>
|
89 |
+
<tr align="center">
|
90 |
+
<td width="120%">AltClip<sub>ViT-L/14</sub></td><td>29.6</td><td>49.9</td><td>58.8</td><td>21.4</td><td>42.0</td><td>51.9</td>
|
91 |
+
</tr>
|
92 |
+
<tr align="center">
|
93 |
+
<td width="120%">QA-CLIP<sub>ViT-L/14</sub></td><td><b>57.4</b></td><td><b>81.0</b></td><td><b>87.7</b></td><td><b>45.5</b></td><td><b>73.0</b></td><td><b>81.4</b></td>
|
94 |
+
</tr>
|
95 |
+
</table>
|
96 |
+
<br>
|
97 |
+
|
98 |
+
**COCO-CN Zero-shot Retrieval (Official Test Set)**:
|
99 |
+
<table border="1" width="120%">
|
100 |
+
<tr align="center">
|
101 |
+
<th>Task</th><th colspan="3">Text-to-Image</th><th colspan="3">Image-to-Text</th>
|
102 |
+
</tr>
|
103 |
+
<tr align="center">
|
104 |
+
<td>Metric</td><td>R@1</td><td>R@5</td><td>R@10</td><td>R@1</td><td>R@5</td><td>R@10</td>
|
105 |
+
</tr>
|
106 |
+
<tr align="center">
|
107 |
+
<td width="120%">CN-CLIP<sub>RN50</sub></td><td>48.1</td><td>81.3</td><td>90.5</td><td>50.9</td><td>81.1</td><td>90.5</td>
|
108 |
+
</tr>
|
109 |
+
<tr align="center">
|
110 |
+
<td width="120%">QA-CLIP<sub>RN50</sub></td><td><b>50.1</b></td><td><b>82.5</b></td><td><b>91.7</b></td><td><b>56.7</b></td><td><b>85.2</b></td><td><b>92.9</b></td>
|
111 |
+
</tr>
|
112 |
+
<tr align="center">
|
113 |
+
<td width="120%">CN-CLIP<sub>ViT-B/16</sub></td><td>62.2</td><td>87.1</td><td>94.9</td><td>56.3</td><td>84.0</td><td>93.3</td>
|
114 |
+
</tr>
|
115 |
+
<tr align="center">
|
116 |
+
<td width="120%">QA-CLIP<sub>ViT-B/16</sub></td><td><b>62.9</b></td><td><b>87.7</b></td><td><b>94.7</b></td><td><b>61.5</b></td><td><b>87.6</b></td><td><b>94.8</b></td>
|
117 |
+
</tr>
|
118 |
+
<tr align="center">
|
119 |
+
<td width="120%">CN-CLIP<sub>ViT-L/14</sub></td><td>64.9</td><td>88.8</td><td>94.2</td><td>60.6</td><td>84.4</td><td>93.1</td>
|
120 |
+
</tr>
|
121 |
+
<tr align="center">
|
122 |
+
<td width="120%">AltClip<sub>ViT-L/14</sub></td><td>63.5</td><td>87.6</td><td>93.5</td><td>62.6</td><td><b>88.5</b></td><td><b>95.9</b></td>
|
123 |
+
</tr>
|
124 |
+
<tr align="center">
|
125 |
+
<td width="120%">QA-CLIP<sub>ViT-L/14</sub></td><td><b>65.7</b></td><td><b>90.2</b></td><td><b>95.0</b></td><td><b>64.5</b></td><td>88.3</td><td>95.1</td>
|
126 |
+
</tr>
|
127 |
+
</table>
|
128 |
+
<br>
|
129 |
+
|
130 |
+
**Zero-shot Image Classification on ImageNet**:
|
131 |
+
<table border="1" width="120%">
|
132 |
+
<tr align="center">
|
133 |
+
<th>Task</th><th colspan="1">ImageNet</th>
|
134 |
+
</tr>
|
135 |
+
<tr align="center">
|
136 |
+
<td width="120%">CN-CLIP<sub>RN50</sub></td><td>33.5</td>
|
137 |
+
</tr>
|
138 |
+
<tr align="center">
|
139 |
+
<td width="120%">QA-CLIP<sub>RN50</sub></td><td><b>35.5</b></td>
|
140 |
+
</tr>
|
141 |
+
<tr align="center">
|
142 |
+
<td width="120%">CN-CLIP<sub>ViT-B/16</sub></td><td>48.4</td>
|
143 |
+
</tr>
|
144 |
+
<tr align="center">
|
145 |
+
<td width="120%">QA-CLIP<sub>ViT-B/16</sub></td><td><b>49.7</b></td>
|
146 |
+
</tr>
|
147 |
+
<tr align="center">
|
148 |
+
<td width="120%">CN-CLIP<sub>ViT-L/14</sub></td><td>54.7</td>
|
149 |
+
</tr>
|
150 |
+
<tr align="center">
|
151 |
+
<td width="120%">QA-CLIP<sub>ViT-L/14</sub></td><td><b>55.8</b></td>
|
152 |
+
</tr>
|
153 |
+
</table>
|
154 |
+
<br>
|
155 |
+
|
156 |
+
<br><br>
|
157 |
+
|
158 |
+
|
159 |
+
# 使用教程
|
160 |
+
## 安装要求
|
161 |
+
环境配置要求:
|
162 |
+
|
163 |
+
* python >= 3.6.4
|
164 |
+
* pytorch >= 1.8.0 (with torchvision >= 0.9.0)
|
165 |
+
* CUDA Version >= 10.2
|
166 |
+
|
167 |
+
安装本项目所需库
|
168 |
+
```bash
|
169 |
+
cd /yourpath/QA-CLIP-main
|
170 |
+
pip install -r requirements.txt
|
171 |
+
```
|
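可以用下面的小脚本快速检查环境是否满足上述要求(仅作示意):

```python
# 针对上述环境要求的快速检查。
import sys
import torch
import torchvision

print("python     :", sys.version.split()[0])   # 期望 >= 3.6.4
print("pytorch    :", torch.__version__)        # 期望 >= 1.8.0
print("torchvision:", torchvision.__version__)  # 期望 >= 0.9.0
print("CUDA build :", torch.version.cuda)       # 期望 >= 10.2(CPU版为None)
print("CUDA usable:", torch.cuda.is_available())
```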
172 |
+
|
173 |
+
## 推理代码
|
174 |
+
```bash
|
175 |
+
export PYTHONPATH=/yourpath/QA-CLIP-main
|
176 |
+
```
|
177 |
+
推理代码示例:
|
178 |
+
```python
|
179 |
+
import torch
|
180 |
+
from PIL import Image
|
181 |
+
|
182 |
+
import clip as clip
|
183 |
+
from clip import load_from_name, available_models
|
184 |
+
print("Available models:", available_models())
|
185 |
+
# Available models: ['ViT-B-16', 'ViT-L-14', 'RN50']
|
186 |
+
|
187 |
+
device = "cuda" if torch.cuda.is_available() else "cpu"
|
188 |
+
model, preprocess = load_from_name("ViT-B-16", device=device, download_root='./')
|
189 |
+
model.eval()
|
190 |
+
image = preprocess(Image.open("examples/pokemon.jpeg")).unsqueeze(0).to(device)
|
191 |
+
text = clip.tokenize(["杰尼龟", "妙蛙种子", "小火龙", "皮卡丘"]).to(device)
|
192 |
+
|
193 |
+
with torch.no_grad():
|
194 |
+
image_features = model.encode_image(image)
|
195 |
+
text_features = model.encode_text(text)
|
196 |
+
# 对特征进行归一化,请使用归一化后的图文特征用于下游任务
|
197 |
+
image_features /= image_features.norm(dim=-1, keepdim=True)
|
198 |
+
text_features /= text_features.norm(dim=-1, keepdim=True)
|
199 |
+
|
200 |
+
logits_per_image, logits_per_text = model.get_similarity(image, text)
|
201 |
+
probs = logits_per_image.softmax(dim=-1).cpu().numpy()
|
202 |
+
|
203 |
+
print("Label probs:", probs)
|
204 |
+
```
|
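接上面的示例,也可以直接用归一化后的图文特征计算余弦相似度,并取得分最高的标签(示意代码,沿用上例中的变量):

```python
# 接上例:用已归一化的特征计算余弦相似度,并取相似度最高的候选文本。
candidates = ["杰尼龟", "妙蛙种子", "小火龙", "皮卡丘"]
with torch.no_grad():
    sims = (image_features @ text_features.T).squeeze(0)  # 形状: (候选文本数,)
best_idx = int(sims.argmax().item())
print("最匹配的标签:", candidates[best_idx], "余弦相似度:", float(sims[best_idx]))
```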
205 |
+
<br><br>
|
206 |
+
|
207 |
+
## 预测及评估
|
208 |
+
|
209 |
+
### 图文检索测试数据集下载
|
210 |
+
<b>[Chinese-CLIP](https://github.com/OFA-Sys/Chinese-CLIP)</b>项目中已经预处理好测试集,这是他们提供的下载链接:
|
211 |
+
|
212 |
+
MUGE数据:[下载链接](https://clip-cn-beijing.oss-cn-beijing.aliyuncs.com/datasets/MUGE.zip)
|
213 |
+
|
214 |
+
Flickr30K-CN数据:[下载链接](https://clip-cn-beijing.oss-cn-beijing.aliyuncs.com/datasets/Flickr30k-CN.zip)
|
215 |
+
|
216 |
+
另外[COCO-CN](https://github.com/li-xirong/coco-cn)数据的获取需要向原作者进行申请
|
217 |
+
### ImageNet数据集下载
|
218 |
+
原始数据请自行下载,[中文标签](http://clip-cn-beijing.oss-cn-beijing.aliyuncs.com/datasets/ImageNet-1K/label_cn.txt)和[英文标签](http://clip-cn-beijing.oss-cn-beijing.aliyuncs.com/datasets/ImageNet-1K/label.txt)同样由<b>[Chinese-CLIP](https://github.com/OFA-Sys/Chinese-CLIP)</b>项目提供
|
219 |
+
### 图文检索评估
|
220 |
+
图文检索评估代码可以参考如下:
|
221 |
+
```bash
|
222 |
+
split=test # 指定计算valid或test集特征
|
223 |
+
resume=your_ckp_path
|
224 |
+
DATAPATH=your_DATAPATH
|
225 |
+
dataset_name=Flickr30k-CN
|
226 |
+
# dataset_name=MUGE
|
227 |
+
|
228 |
+
python -u eval/extract_features.py \
|
229 |
+
--extract-image-feats \
|
230 |
+
--extract-text-feats \
|
231 |
+
--image-data="${DATAPATH}/datasets/${dataset_name}/lmdb/${split}/imgs" \
|
232 |
+
--text-data="${DATAPATH}/datasets/${dataset_name}/${split}_texts.jsonl" \
|
233 |
+
--img-batch-size=32 \
|
234 |
+
--text-batch-size=32 \
|
235 |
+
--context-length=52 \
|
236 |
+
--resume=${resume} \
|
237 |
+
--vision-model=ViT-B-16 \
|
238 |
+
--text-model=RoBERTa-wwm-ext-base-chinese
|
239 |
+
|
240 |
+
python -u eval/make_topk_predictions.py \
|
241 |
+
--image-feats="${DATAPATH}/datasets/${dataset_name}/${split}_imgs.img_feat.jsonl" \
|
242 |
+
--text-feats="${DATAPATH}/datasets/${dataset_name}/${split}_texts.txt_feat.jsonl" \
|
243 |
+
--top-k=10 \
|
244 |
+
--eval-batch-size=32768 \
|
245 |
+
--output="${DATAPATH}/datasets/${dataset_name}/${split}_predictions.jsonl"
|
246 |
+
|
247 |
+
python -u eval/make_topk_predictions_tr.py \
|
248 |
+
--image-feats="${DATAPATH}/datasets/${dataset_name}/${split}_imgs.img_feat.jsonl" \
|
249 |
+
--text-feats="${DATAPATH}/datasets/${dataset_name}/${split}_texts.txt_feat.jsonl" \
|
250 |
+
--top-k=10 \
|
251 |
+
--eval-batch-size=32768 \
|
252 |
+
--output="${DATAPATH}/datasets/${dataset_name}/${split}_tr_predictions.jsonl"
|
253 |
+
|
254 |
+
python eval/evaluation.py \
|
255 |
+
${DATAPATH}/datasets/${dataset_name}/${split}_texts.jsonl \
|
256 |
+
${DATAPATH}/datasets/${dataset_name}/${split}_predictions.jsonl \
|
257 |
+
${DATAPATH}/datasets/${dataset_name}/output1.json
|
258 |
+
cat ${DATAPATH}/datasets/${dataset_name}/output1.json
|
259 |
+
|
260 |
+
python eval/transform_ir_annotation_to_tr.py \
|
261 |
+
--input ${DATAPATH}/datasets/${dataset_name}/${split}_texts.jsonl
|
262 |
+
|
263 |
+
python eval/evaluation_tr.py \
|
264 |
+
${DATAPATH}/datasets/${dataset_name}/${split}_texts.tr.jsonl \
|
265 |
+
${DATAPATH}/datasets/${dataset_name}/${split}_tr_predictions.jsonl \
|
266 |
+
${DATAPATH}/datasets/${dataset_name}/output2.json
|
267 |
+
cat ${DATAPATH}/datasets/${dataset_name}/output2.json
|
268 |
+
```
|
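最终指标请以`eval/evaluation.py`与`eval/evaluation_tr.py`的输出为准。下面的示意代码展示了基于top-k预测文件计算Recall@K的思路(假设jsonl字段与Chinese-CLIP一致:标注文件每行含`text_id`与`image_ids`,预测文件每行含`text_id`与按相关性排序的`image_ids`;字段名仅为假设):

```python
# 示意:基于top-k预测文件计算Recall@K(并非官方评估脚本)。
# 假设字段:标注 {"text_id": ..., "image_ids": [...]}
#          预测 {"text_id": ..., "image_ids": [按相关性排序]}
import json

def recall_at_k(gt_path, pred_path, ks=(1, 5, 10)):
    gt = {}
    with open(gt_path, encoding="utf-8") as f:
        for line in f:
            obj = json.loads(line)
            gt[obj["text_id"]] = set(obj["image_ids"])
    hits = {k: 0 for k in ks}
    total = 0
    with open(pred_path, encoding="utf-8") as f:
        for line in f:
            obj = json.loads(line)
            total += 1
            for k in ks:
                if gt[obj["text_id"]] & set(obj["image_ids"][:k]):
                    hits[k] += 1
    return {f"R@{k}": hits[k] / total for k in ks}
```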
269 |
+
|
270 |
+
### ImageNet零样本分类
|
271 |
+
ImageNet零样本分类的代码参考如下
|
272 |
+
```bash
|
273 |
+
bash scripts/zeroshot_eval.sh 0 \
|
274 |
+
${DATAPATH} imagenet \
|
275 |
+
ViT-B-16 RoBERTa-wwm-ext-base-chinese \
|
276 |
+
./pretrained_weights/QA-CLIP-base.pt
|
277 |
+
```
|
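上面的脚本封装了`eval/zeroshot_evaluation.py`。零样本分类的基本思路是:为每个类别名构造文本特征,再选择与图像特征最接近的类别。下面给出一个基于前文推理接口的示意(类别名与提示模板仅为举例,并非脚本实际使用的标签与模板):

```python
# 示意:用前文的推理接口做零样本分类;类别名与提示模板均为占位示例。
import torch
from PIL import Image
import clip
from clip import load_from_name

device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = load_from_name("ViT-B-16", device=device, download_root='./')
model.eval()

class_names = ["猫", "狗", "飞机"]  # 占位类别
texts = clip.tokenize([f"一张{name}的照片" for name in class_names]).to(device)
image = preprocess(Image.open("examples/pokemon.jpeg")).unsqueeze(0).to(device)

with torch.no_grad():
    text_features = model.encode_text(texts)
    text_features /= text_features.norm(dim=-1, keepdim=True)
    image_features = model.encode_image(image)
    image_features /= image_features.norm(dim=-1, keepdim=True)
    pred = (image_features @ text_features.T).argmax(dim=-1).item()

print("预测类别:", class_names[pred])
```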
278 |
+
# 致谢
|
279 |
+
<br><br>
|
280 |
+
项目代码基于<b>[Chinese-CLIP](https://github.com/OFA-Sys/Chinese-CLIP)</b>实现,非常感谢他们优秀的开源工作。
|
281 |
+
<br><br>
|
clip/__init__.py
ADDED
@@ -0,0 +1,5 @@
1 |
+
from .bert_tokenizer import FullTokenizer
|
2 |
+
|
3 |
+
_tokenizer = FullTokenizer()
|
4 |
+
from .model import convert_state_dict
|
5 |
+
from .utils import load_from_name, available_models, tokenize, image_transform, load
|
clip/bert_tokenizer.py
ADDED
@@ -0,0 +1,436 @@
1 |
+
# coding=utf-8
|
2 |
+
# Copyright 2018 The Google AI Language Team Authors.
|
3 |
+
#
|
4 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
5 |
+
# you may not use this file except in compliance with the License.
|
6 |
+
# You may obtain a copy of the License at
|
7 |
+
#
|
8 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
9 |
+
#
|
10 |
+
# Unless required by applicable law or agreed to in writing, software
|
11 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
12 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
13 |
+
# See the License for the specific language governing permissions and
|
14 |
+
# limitations under the License.
|
15 |
+
|
16 |
+
"""Tokenization classes."""
|
17 |
+
|
18 |
+
from __future__ import absolute_import
|
19 |
+
from __future__ import division
|
20 |
+
from __future__ import print_function
|
21 |
+
|
22 |
+
import collections
|
23 |
+
import re
|
24 |
+
import unicodedata
|
25 |
+
import six
|
26 |
+
from functools import lru_cache
|
27 |
+
import os
|
28 |
+
|
29 |
+
@lru_cache()
|
30 |
+
def default_vocab():
|
31 |
+
return os.path.join(os.path.dirname(os.path.abspath(__file__)), "vocab.txt")
|
32 |
+
|
33 |
+
def validate_case_matches_checkpoint(do_lower_case, init_checkpoint):
|
34 |
+
"""Checks whether the casing config is consistent with the checkpoint name."""
|
35 |
+
|
36 |
+
# The casing has to be passed in by the user and there is no explicit check
|
37 |
+
# as to whether it matches the checkpoint. The casing information probably
|
38 |
+
# should have been stored in the bert_config.json file, but it's not, so
|
39 |
+
# we have to heuristically detect it to validate.
|
40 |
+
|
41 |
+
if not init_checkpoint:
|
42 |
+
return
|
43 |
+
|
44 |
+
m = re.match("^.*?([A-Za-z0-9_-]+)/bert_model.ckpt", init_checkpoint)
|
45 |
+
if m is None:
|
46 |
+
return
|
47 |
+
|
48 |
+
model_name = m.group(1)
|
49 |
+
|
50 |
+
lower_models = [
|
51 |
+
"uncased_L-24_H-1024_A-16", "uncased_L-12_H-768_A-12",
|
52 |
+
"multilingual_L-12_H-768_A-12", "chinese_L-12_H-768_A-12"
|
53 |
+
]
|
54 |
+
|
55 |
+
cased_models = [
|
56 |
+
"cased_L-12_H-768_A-12", "cased_L-24_H-1024_A-16",
|
57 |
+
"multi_cased_L-12_H-768_A-12"
|
58 |
+
]
|
59 |
+
|
60 |
+
is_bad_config = False
|
61 |
+
if model_name in lower_models and not do_lower_case:
|
62 |
+
is_bad_config = True
|
63 |
+
actual_flag = "False"
|
64 |
+
case_name = "lowercased"
|
65 |
+
opposite_flag = "True"
|
66 |
+
|
67 |
+
if model_name in cased_models and do_lower_case:
|
68 |
+
is_bad_config = True
|
69 |
+
actual_flag = "True"
|
70 |
+
case_name = "cased"
|
71 |
+
opposite_flag = "False"
|
72 |
+
|
73 |
+
if is_bad_config:
|
74 |
+
raise ValueError(
|
75 |
+
"You passed in `--do_lower_case=%s` with `--init_checkpoint=%s`. "
|
76 |
+
"However, `%s` seems to be a %s model, so you "
|
77 |
+
"should pass in `--do_lower_case=%s` so that the fine-tuning matches "
|
78 |
+
"how the model was pre-training. If this error is wrong, please "
|
79 |
+
"just comment out this check." % (actual_flag, init_checkpoint,
|
80 |
+
model_name, case_name, opposite_flag))
|
81 |
+
|
82 |
+
|
83 |
+
def convert_to_unicode(text):
|
84 |
+
"""Converts `text` to Unicode (if it's not already), assuming utf-8 input."""
|
85 |
+
if six.PY3:
|
86 |
+
if isinstance(text, str):
|
87 |
+
return text
|
88 |
+
elif isinstance(text, bytes):
|
89 |
+
return text.decode("utf-8", "ignore")
|
90 |
+
else:
|
91 |
+
raise ValueError("Unsupported string type: %s" % (type(text)))
|
92 |
+
elif six.PY2:
|
93 |
+
if isinstance(text, str):
|
94 |
+
return text.decode("utf-8", "ignore")
|
95 |
+
elif isinstance(text, unicode):
|
96 |
+
return text
|
97 |
+
else:
|
98 |
+
raise ValueError("Unsupported string type: %s" % (type(text)))
|
99 |
+
else:
|
100 |
+
raise ValueError("Not running on Python2 or Python 3?")
|
101 |
+
|
102 |
+
|
103 |
+
def printable_text(text):
|
104 |
+
"""Returns text encoded in a way suitable for print or `tf.logging`."""
|
105 |
+
|
106 |
+
# These functions want `str` for both Python2 and Python3, but in one case
|
107 |
+
# it's a Unicode string and in the other it's a byte string.
|
108 |
+
if six.PY3:
|
109 |
+
if isinstance(text, str):
|
110 |
+
return text
|
111 |
+
elif isinstance(text, bytes):
|
112 |
+
return text.decode("utf-8", "ignore")
|
113 |
+
else:
|
114 |
+
raise ValueError("Unsupported string type: %s" % (type(text)))
|
115 |
+
elif six.PY2:
|
116 |
+
if isinstance(text, str):
|
117 |
+
return text
|
118 |
+
elif isinstance(text, unicode):
|
119 |
+
return text.encode("utf-8")
|
120 |
+
else:
|
121 |
+
raise ValueError("Unsupported string type: %s" % (type(text)))
|
122 |
+
else:
|
123 |
+
raise ValueError("Not running on Python2 or Python 3?")
|
124 |
+
|
125 |
+
|
126 |
+
def load_vocab(vocab_file):
|
127 |
+
"""Loads a vocabulary file into a dictionary."""
|
128 |
+
vocab = collections.OrderedDict()
|
129 |
+
index = 0
|
130 |
+
with open(vocab_file, "r", encoding="utf-8") as reader:
|
131 |
+
while True:
|
132 |
+
token = convert_to_unicode(reader.readline())
|
133 |
+
if not token:
|
134 |
+
break
|
135 |
+
token = token.strip()
|
136 |
+
vocab[token] = index
|
137 |
+
index += 1
|
138 |
+
return vocab
|
139 |
+
|
140 |
+
|
141 |
+
def convert_by_vocab(vocab, items):
|
142 |
+
"""Converts a sequence of [tokens|ids] using the vocab."""
|
143 |
+
output = []
|
144 |
+
for item in items:
|
145 |
+
output.append(vocab[item])
|
146 |
+
return output
|
147 |
+
|
148 |
+
|
149 |
+
def convert_tokens_to_ids(vocab, tokens):
|
150 |
+
return convert_by_vocab(vocab, tokens)
|
151 |
+
|
152 |
+
|
153 |
+
def convert_ids_to_tokens(inv_vocab, ids):
|
154 |
+
return convert_by_vocab(inv_vocab, ids)
|
155 |
+
|
156 |
+
|
157 |
+
def whitespace_tokenize(text):
|
158 |
+
"""Runs basic whitespace cleaning and splitting on a piece of text."""
|
159 |
+
text = text.strip()
|
160 |
+
if not text:
|
161 |
+
return []
|
162 |
+
tokens = text.split()
|
163 |
+
return tokens
|
164 |
+
|
165 |
+
|
166 |
+
class FullTokenizer(object):
|
167 |
+
"""Runs end-to-end tokenziation."""
|
168 |
+
|
169 |
+
def __init__(self, vocab_file=default_vocab(), do_lower_case=True):
|
170 |
+
self.vocab = load_vocab(vocab_file)
|
171 |
+
self.inv_vocab = {v: k for k, v in self.vocab.items()}
|
172 |
+
self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case)
|
173 |
+
self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab)
|
174 |
+
|
175 |
+
def tokenize(self, text):
|
176 |
+
split_tokens = []
|
177 |
+
for token in self.basic_tokenizer.tokenize(text):
|
178 |
+
for sub_token in self.wordpiece_tokenizer.tokenize(token):
|
179 |
+
split_tokens.append(sub_token)
|
180 |
+
|
181 |
+
return split_tokens
|
182 |
+
|
183 |
+
def convert_tokens_to_ids(self, tokens):
|
184 |
+
return convert_by_vocab(self.vocab, tokens)
|
185 |
+
|
186 |
+
def convert_ids_to_tokens(self, ids):
|
187 |
+
return convert_by_vocab(self.inv_vocab, ids)
|
188 |
+
|
189 |
+
@staticmethod
|
190 |
+
def convert_tokens_to_string(tokens, clean_up_tokenization_spaces=True):
|
191 |
+
""" Converts a sequence of tokens (string) in a single string. """
|
192 |
+
|
193 |
+
def clean_up_tokenization(out_string):
|
194 |
+
""" Clean up a list of simple English tokenization artifacts
|
195 |
+
like spaces before punctuation and abbreviated forms.
|
196 |
+
"""
|
197 |
+
out_string = (
|
198 |
+
out_string.replace(" .", ".")
|
199 |
+
.replace(" ?", "?")
|
200 |
+
.replace(" !", "!")
|
201 |
+
.replace(" ,", ",")
|
202 |
+
.replace(" ' ", "'")
|
203 |
+
.replace(" n't", "n't")
|
204 |
+
.replace(" 'm", "'m")
|
205 |
+
.replace(" 's", "'s")
|
206 |
+
.replace(" 've", "'ve")
|
207 |
+
.replace(" 're", "'re")
|
208 |
+
)
|
209 |
+
return out_string
|
210 |
+
|
211 |
+
text = ' '.join(tokens).replace(' ##', '').strip()
|
212 |
+
if clean_up_tokenization_spaces:
|
213 |
+
clean_text = clean_up_tokenization(text)
|
214 |
+
return clean_text
|
215 |
+
else:
|
216 |
+
return text
|
217 |
+
|
218 |
+
def vocab_size(self):
|
219 |
+
return len(self.vocab)
|
220 |
+
|
221 |
+
|
222 |
+
class BasicTokenizer(object):
|
223 |
+
"""Runs basic tokenization (punctuation splitting, lower casing, etc.)."""
|
224 |
+
|
225 |
+
def __init__(self, do_lower_case=True):
|
226 |
+
"""Constructs a BasicTokenizer.
|
227 |
+
|
228 |
+
Args:
|
229 |
+
do_lower_case: Whether to lower case the input.
|
230 |
+
"""
|
231 |
+
self.do_lower_case = do_lower_case
|
232 |
+
|
233 |
+
def tokenize(self, text):
|
234 |
+
"""Tokenizes a piece of text."""
|
235 |
+
text = convert_to_unicode(text)
|
236 |
+
text = self._clean_text(text)
|
237 |
+
|
238 |
+
# This was added on November 1st, 2018 for the multilingual and Chinese
|
239 |
+
# models. This is also applied to the English models now, but it doesn't
|
240 |
+
# matter since the English models were not trained on any Chinese data
|
241 |
+
# and generally don't have any Chinese data in them (there are Chinese
|
242 |
+
# characters in the vocabulary because Wikipedia does have some Chinese
|
243 |
+
# words in the English Wikipedia.).
|
244 |
+
text = self._tokenize_chinese_chars(text)
|
245 |
+
|
246 |
+
orig_tokens = whitespace_tokenize(text)
|
247 |
+
split_tokens = []
|
248 |
+
for token in orig_tokens:
|
249 |
+
if self.do_lower_case:
|
250 |
+
token = token.lower()
|
251 |
+
token = self._run_strip_accents(token)
|
252 |
+
split_tokens.extend(self._run_split_on_punc(token))
|
253 |
+
|
254 |
+
output_tokens = whitespace_tokenize(" ".join(split_tokens))
|
255 |
+
return output_tokens
|
256 |
+
|
257 |
+
def _run_strip_accents(self, text):
|
258 |
+
"""Strips accents from a piece of text."""
|
259 |
+
text = unicodedata.normalize("NFD", text)
|
260 |
+
output = []
|
261 |
+
for char in text:
|
262 |
+
cat = unicodedata.category(char)
|
263 |
+
if cat == "Mn":
|
264 |
+
continue
|
265 |
+
output.append(char)
|
266 |
+
return "".join(output)
|
267 |
+
|
268 |
+
def _run_split_on_punc(self, text):
|
269 |
+
"""Splits punctuation on a piece of text."""
|
270 |
+
chars = list(text)
|
271 |
+
i = 0
|
272 |
+
start_new_word = True
|
273 |
+
output = []
|
274 |
+
while i < len(chars):
|
275 |
+
char = chars[i]
|
276 |
+
if _is_punctuation(char):
|
277 |
+
output.append([char])
|
278 |
+
start_new_word = True
|
279 |
+
else:
|
280 |
+
if start_new_word:
|
281 |
+
output.append([])
|
282 |
+
start_new_word = False
|
283 |
+
output[-1].append(char)
|
284 |
+
i += 1
|
285 |
+
|
286 |
+
return ["".join(x) for x in output]
|
287 |
+
|
288 |
+
def _tokenize_chinese_chars(self, text):
|
289 |
+
"""Adds whitespace around any CJK character."""
|
290 |
+
output = []
|
291 |
+
for char in text:
|
292 |
+
cp = ord(char)
|
293 |
+
if self._is_chinese_char(cp):
|
294 |
+
output.append(" ")
|
295 |
+
output.append(char)
|
296 |
+
output.append(" ")
|
297 |
+
else:
|
298 |
+
output.append(char)
|
299 |
+
return "".join(output)
|
300 |
+
|
301 |
+
def _is_chinese_char(self, cp):
|
302 |
+
"""Checks whether CP is the codepoint of a CJK character."""
|
303 |
+
# This defines a "chinese character" as anything in the CJK Unicode block:
|
304 |
+
# https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
|
305 |
+
#
|
306 |
+
# Note that the CJK Unicode block is NOT all Japanese and Korean characters,
|
307 |
+
# despite its name. The modern Korean Hangul alphabet is a different block,
|
308 |
+
# as is Japanese Hiragana and Katakana. Those alphabets are used to write
|
309 |
+
# space-separated words, so they are not treated specially and handled
|
310 |
+
# like all of the other languages.
|
311 |
+
if ((cp >= 0x4E00 and cp <= 0x9FFF) or #
|
312 |
+
(cp >= 0x3400 and cp <= 0x4DBF) or #
|
313 |
+
(cp >= 0x20000 and cp <= 0x2A6DF) or #
|
314 |
+
(cp >= 0x2A700 and cp <= 0x2B73F) or #
|
315 |
+
(cp >= 0x2B740 and cp <= 0x2B81F) or #
|
316 |
+
(cp >= 0x2B820 and cp <= 0x2CEAF) or
|
317 |
+
(cp >= 0xF900 and cp <= 0xFAFF) or #
|
318 |
+
(cp >= 0x2F800 and cp <= 0x2FA1F)): #
|
319 |
+
return True
|
320 |
+
|
321 |
+
return False
|
322 |
+
|
323 |
+
def _clean_text(self, text):
|
324 |
+
"""Performs invalid character removal and whitespace cleanup on text."""
|
325 |
+
output = []
|
326 |
+
for char in text:
|
327 |
+
cp = ord(char)
|
328 |
+
if cp == 0 or cp == 0xfffd or _is_control(char):
|
329 |
+
continue
|
330 |
+
if _is_whitespace(char):
|
331 |
+
output.append(" ")
|
332 |
+
else:
|
333 |
+
output.append(char)
|
334 |
+
return "".join(output)
|
335 |
+
|
336 |
+
|
337 |
+
class WordpieceTokenizer(object):
|
338 |
+
"""Runs WordPiece tokenziation."""
|
339 |
+
|
340 |
+
def __init__(self, vocab, unk_token="[UNK]", max_input_chars_per_word=200):
|
341 |
+
self.vocab = vocab
|
342 |
+
self.unk_token = unk_token
|
343 |
+
self.max_input_chars_per_word = max_input_chars_per_word
|
344 |
+
|
345 |
+
def tokenize(self, text):
|
346 |
+
"""Tokenizes a piece of text into its word pieces.
|
347 |
+
|
348 |
+
This uses a greedy longest-match-first algorithm to perform tokenization
|
349 |
+
using the given vocabulary.
|
350 |
+
|
351 |
+
For example:
|
352 |
+
input = "unaffable"
|
353 |
+
output = ["un", "##aff", "##able"]
|
354 |
+
|
355 |
+
Args:
|
356 |
+
text: A single token or whitespace separated tokens. This should have
|
357 |
+
already been passed through `BasicTokenizer`.
|
358 |
+
|
359 |
+
Returns:
|
360 |
+
A list of wordpiece tokens.
|
361 |
+
"""
|
362 |
+
|
363 |
+
text = convert_to_unicode(text)
|
364 |
+
|
365 |
+
output_tokens = []
|
366 |
+
for token in whitespace_tokenize(text):
|
367 |
+
chars = list(token)
|
368 |
+
if len(chars) > self.max_input_chars_per_word:
|
369 |
+
output_tokens.append(self.unk_token)
|
370 |
+
continue
|
371 |
+
|
372 |
+
is_bad = False
|
373 |
+
start = 0
|
374 |
+
sub_tokens = []
|
375 |
+
while start < len(chars):
|
376 |
+
end = len(chars)
|
377 |
+
cur_substr = None
|
378 |
+
while start < end:
|
379 |
+
substr = "".join(chars[start:end])
|
380 |
+
if start > 0:
|
381 |
+
substr = "##" + substr
|
382 |
+
if substr in self.vocab:
|
383 |
+
cur_substr = substr
|
384 |
+
break
|
385 |
+
end -= 1
|
386 |
+
if cur_substr is None:
|
387 |
+
is_bad = True
|
388 |
+
break
|
389 |
+
sub_tokens.append(cur_substr)
|
390 |
+
start = end
|
391 |
+
|
392 |
+
if is_bad:
|
393 |
+
output_tokens.append(self.unk_token)
|
394 |
+
else:
|
395 |
+
output_tokens.extend(sub_tokens)
|
396 |
+
return output_tokens
|
397 |
+
|
398 |
+
|
399 |
+
def _is_whitespace(char):
|
400 |
+
"""Checks whether `chars` is a whitespace character."""
|
401 |
+
# \t, \n, and \r are technically control characters but we treat them
|
402 |
+
# as whitespace since they are generally considered as such.
|
403 |
+
if char == " " or char == "\t" or char == "\n" or char == "\r":
|
404 |
+
return True
|
405 |
+
cat = unicodedata.category(char)
|
406 |
+
if cat == "Zs":
|
407 |
+
return True
|
408 |
+
return False
|
409 |
+
|
410 |
+
|
411 |
+
def _is_control(char):
|
412 |
+
"""Checks whether `chars` is a control character."""
|
413 |
+
# These are technically control characters but we count them as whitespace
|
414 |
+
# characters.
|
415 |
+
if char == "\t" or char == "\n" or char == "\r":
|
416 |
+
return False
|
417 |
+
cat = unicodedata.category(char)
|
418 |
+
if cat in ("Cc", "Cf"):
|
419 |
+
return True
|
420 |
+
return False
|
421 |
+
|
422 |
+
|
423 |
+
def _is_punctuation(char):
|
424 |
+
"""Checks whether `chars` is a punctuation character."""
|
425 |
+
cp = ord(char)
|
426 |
+
# We treat all non-letter/number ASCII as punctuation.
|
427 |
+
# Characters such as "^", "$", and "`" are not in the Unicode
|
428 |
+
# Punctuation class but we treat them as punctuation anyways, for
|
429 |
+
# consistency.
|
430 |
+
if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or
|
431 |
+
(cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)):
|
432 |
+
return True
|
433 |
+
cat = unicodedata.category(char)
|
434 |
+
if cat.startswith("P"):
|
435 |
+
return True
|
436 |
+
return False
|
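A quick usage sketch for the tokenizer above; the sample sentence is arbitrary, and `FullTokenizer` falls back to the `vocab.txt` bundled with this package:

```python
# Minimal usage of FullTokenizer (defaults to the vocab.txt shipped in clip/).
from clip.bert_tokenizer import FullTokenizer

tokenizer = FullTokenizer()                  # loads clip/vocab.txt by default
tokens = tokenizer.tokenize("一只可爱的猫")    # BasicTokenizer + WordPiece
ids = tokenizer.convert_tokens_to_ids(tokens)
print(tokens)
print(ids)
print(tokenizer.convert_ids_to_tokens(ids))
```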
clip/configuration_bert.py
ADDED
@@ -0,0 +1,86 @@
1 |
+
# coding=utf-8
|
2 |
+
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
|
3 |
+
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
|
4 |
+
#
|
5 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
6 |
+
# you may not use this file except in compliance with the License.
|
7 |
+
# You may obtain a copy of the License at
|
8 |
+
#
|
9 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
10 |
+
#
|
11 |
+
# Unless required by applicable law or agreed to in writing, software
|
12 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
13 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
14 |
+
# See the License for the specific language governing permissions and
|
15 |
+
# limitations under the License.
|
16 |
+
""" BERT model configuration """
|
17 |
+
|
18 |
+
from __future__ import absolute_import, division, print_function, unicode_literals
|
19 |
+
|
20 |
+
import logging
|
21 |
+
|
22 |
+
logger = logging.getLogger(__name__)
|
23 |
+
|
24 |
+
|
25 |
+
class BertConfig(object):
|
26 |
+
r"""
|
27 |
+
:class:`~transformers.BertConfig` is the configuration class to store the configuration of a
|
28 |
+
`BertModel`.
|
29 |
+
|
30 |
+
|
31 |
+
Arguments:
|
32 |
+
vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `BertModel`.
|
33 |
+
hidden_size: Size of the encoder layers and the pooler layer.
|
34 |
+
num_hidden_layers: Number of hidden layers in the Transformer encoder.
|
35 |
+
num_attention_heads: Number of attention heads for each attention layer in
|
36 |
+
the Transformer encoder.
|
37 |
+
intermediate_size: The size of the "intermediate" (i.e., feed-forward)
|
38 |
+
layer in the Transformer encoder.
|
39 |
+
hidden_act: The non-linear activation function (function or string) in the
|
40 |
+
encoder and pooler. If string, "gelu", "relu", "swish" and "gelu_new" are supported.
|
41 |
+
hidden_dropout_prob: The dropout probability for all fully connected
|
42 |
+
layers in the embeddings, encoder, and pooler.
|
43 |
+
attention_probs_dropout_prob: The dropout ratio for the attention
|
44 |
+
probabilities.
|
45 |
+
max_position_embeddings: The maximum sequence length that this model might
|
46 |
+
ever be used with. Typically set this to something large just in case
|
47 |
+
(e.g., 512 or 1024 or 2048).
|
48 |
+
type_vocab_size: The vocabulary size of the `token_type_ids` passed into
|
49 |
+
`BertModel`.
|
50 |
+
initializer_range: The stddev of the truncated_normal_initializer for
|
51 |
+
initializing all weight matrices.
|
52 |
+
layer_norm_eps: The epsilon used by LayerNorm.
|
53 |
+
"""
|
54 |
+
|
55 |
+
def __init__(self,
|
56 |
+
vocab_size_or_config_json_file=30522,
|
57 |
+
hidden_size=768,
|
58 |
+
num_hidden_layers=12,
|
59 |
+
num_attention_heads=12,
|
60 |
+
intermediate_size=3072,
|
61 |
+
hidden_act="gelu",
|
62 |
+
hidden_dropout_prob=0.1,
|
63 |
+
attention_probs_dropout_prob=0.1,
|
64 |
+
max_position_embeddings=512,
|
65 |
+
type_vocab_size=2,
|
66 |
+
initializer_range=0.02,
|
67 |
+
layer_norm_eps=1e-12,
|
68 |
+
output_attentions=False,
|
69 |
+
output_hidden_states=False,
|
70 |
+
use_flash_attention=False
|
71 |
+
):
|
72 |
+
self.vocab_size = vocab_size_or_config_json_file
|
73 |
+
self.hidden_size = hidden_size
|
74 |
+
self.num_hidden_layers = num_hidden_layers
|
75 |
+
self.num_attention_heads = num_attention_heads
|
76 |
+
self.hidden_act = hidden_act
|
77 |
+
self.intermediate_size = intermediate_size
|
78 |
+
self.hidden_dropout_prob = hidden_dropout_prob
|
79 |
+
self.attention_probs_dropout_prob = attention_probs_dropout_prob
|
80 |
+
self.max_position_embeddings = max_position_embeddings
|
81 |
+
self.type_vocab_size = type_vocab_size
|
82 |
+
self.initializer_range = initializer_range
|
83 |
+
self.layer_norm_eps = layer_norm_eps
|
84 |
+
self.output_attentions = output_attentions
|
85 |
+
self.output_hidden_states = output_hidden_states
|
86 |
+
self.use_flash_attention = use_flash_attention
|
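A small sketch of constructing a `BertConfig` directly; the values below are illustrative overrides of the defaults above (the vocab size is an assumption), while the text-tower configs actually used by the models live in `clip/model_configs/*.json`:

```python
# Illustrative construction of a BertConfig; the overridden values are examples,
# not the shipped configs (see clip/model_configs/*.json for those).
from clip.configuration_bert import BertConfig

config = BertConfig(
    vocab_size_or_config_json_file=21128,  # assumed Chinese BERT vocab size, for illustration
    hidden_size=768,
    num_hidden_layers=12,
    num_attention_heads=12,
    intermediate_size=3072,
)
print(config.vocab_size, config.hidden_size, config.num_hidden_layers)
```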
clip/model.py
ADDED
@@ -0,0 +1,914 @@
1 |
+
from collections import OrderedDict
|
2 |
+
from typing import Tuple, Union
|
3 |
+
from itertools import repeat
|
4 |
+
import collections.abc
|
5 |
+
|
6 |
+
import math
|
7 |
+
import logging
|
8 |
+
import numpy as np
|
9 |
+
import torch
|
10 |
+
import torch.nn.functional as F
|
11 |
+
from torch import nn
|
12 |
+
from torch.utils.checkpoint import checkpoint
|
13 |
+
|
14 |
+
import importlib.util
|
15 |
+
if importlib.util.find_spec('flash_attn'):
|
16 |
+
FlashMHA = importlib.import_module('flash_attn.flash_attention').FlashMHA
|
17 |
+
|
18 |
+
from clip import _tokenizer
|
19 |
+
from clip.configuration_bert import BertConfig
|
20 |
+
from clip.modeling_bert import BertModel
|
21 |
+
|
22 |
+
try:
|
23 |
+
from transformers import CLIPTextModelWithProjection
|
24 |
+
except:
|
25 |
+
pass
|
26 |
+
|
27 |
+
class Bottleneck(nn.Module):
|
28 |
+
expansion = 4
|
29 |
+
|
30 |
+
def __init__(self, inplanes, planes, stride=1):
|
31 |
+
super().__init__()
|
32 |
+
|
33 |
+
# all conv layers have stride 1. an avgpool is performed after the second convolution when stride > 1
|
34 |
+
self.conv1 = nn.Conv2d(inplanes, planes, 1, bias=False)
|
35 |
+
self.bn1 = nn.BatchNorm2d(planes)
|
36 |
+
|
37 |
+
self.conv2 = nn.Conv2d(planes, planes, 3, padding=1, bias=False)
|
38 |
+
self.bn2 = nn.BatchNorm2d(planes)
|
39 |
+
|
40 |
+
self.avgpool = nn.AvgPool2d(stride) if stride > 1 else nn.Identity()
|
41 |
+
|
42 |
+
self.conv3 = nn.Conv2d(planes, planes * self.expansion, 1, bias=False)
|
43 |
+
self.bn3 = nn.BatchNorm2d(planes * self.expansion)
|
44 |
+
|
45 |
+
self.relu = nn.ReLU(inplace=True)
|
46 |
+
self.downsample = None
|
47 |
+
self.stride = stride
|
48 |
+
|
49 |
+
if stride > 1 or inplanes != planes * Bottleneck.expansion:
|
50 |
+
# downsampling layer is prepended with an avgpool, and the subsequent convolution has stride 1
|
51 |
+
self.downsample = nn.Sequential(OrderedDict([
|
52 |
+
("-1", nn.AvgPool2d(stride)),
|
53 |
+
("0", nn.Conv2d(inplanes, planes * self.expansion, 1, stride=1, bias=False)),
|
54 |
+
("1", nn.BatchNorm2d(planes * self.expansion))
|
55 |
+
]))
|
56 |
+
|
57 |
+
def forward(self, x: torch.Tensor):
|
58 |
+
identity = x
|
59 |
+
|
60 |
+
out = self.relu(self.bn1(self.conv1(x)))
|
61 |
+
out = self.relu(self.bn2(self.conv2(out)))
|
62 |
+
out = self.avgpool(out)
|
63 |
+
out = self.bn3(self.conv3(out))
|
64 |
+
|
65 |
+
if self.downsample is not None:
|
66 |
+
identity = self.downsample(x)
|
67 |
+
|
68 |
+
out += identity
|
69 |
+
out = self.relu(out)
|
70 |
+
return out
|
71 |
+
|
72 |
+
|
73 |
+
class AttentionPool2d(nn.Module):
|
74 |
+
def __init__(self, spacial_dim: int, embed_dim: int, num_heads: int, output_dim: int = None):
|
75 |
+
super().__init__()
|
76 |
+
self.positional_embedding = nn.Parameter(torch.randn(spacial_dim ** 2 + 1, embed_dim) / embed_dim ** 0.5)
|
77 |
+
self.k_proj = nn.Linear(embed_dim, embed_dim)
|
78 |
+
self.q_proj = nn.Linear(embed_dim, embed_dim)
|
79 |
+
self.v_proj = nn.Linear(embed_dim, embed_dim)
|
80 |
+
self.c_proj = nn.Linear(embed_dim, output_dim or embed_dim)
|
81 |
+
self.num_heads = num_heads
|
82 |
+
|
83 |
+
def forward(self, x):
|
84 |
+
x = x.reshape(x.shape[0], x.shape[1], x.shape[2] * x.shape[3]).permute(2, 0, 1) # NCHW -> (HW)NC
|
85 |
+
x = torch.cat([x.mean(dim=0, keepdim=True), x], dim=0) # (HW+1)NC
|
86 |
+
x = x + self.positional_embedding[:, None, :].to(x.dtype) # (HW+1)NC
|
87 |
+
x, _ = F.multi_head_attention_forward(
|
88 |
+
query=x, key=x, value=x,
|
89 |
+
embed_dim_to_check=x.shape[-1],
|
90 |
+
num_heads=self.num_heads,
|
91 |
+
q_proj_weight=self.q_proj.weight,
|
92 |
+
k_proj_weight=self.k_proj.weight,
|
93 |
+
v_proj_weight=self.v_proj.weight,
|
94 |
+
in_proj_weight=None,
|
95 |
+
in_proj_bias=torch.cat([self.q_proj.bias, self.k_proj.bias, self.v_proj.bias]),
|
96 |
+
bias_k=None,
|
97 |
+
bias_v=None,
|
98 |
+
add_zero_attn=False,
|
99 |
+
dropout_p=0,
|
100 |
+
out_proj_weight=self.c_proj.weight,
|
101 |
+
out_proj_bias=self.c_proj.bias,
|
102 |
+
use_separate_proj_weight=True,
|
103 |
+
training=self.training,
|
104 |
+
need_weights=False
|
105 |
+
)
|
106 |
+
|
107 |
+
return x[0]
|
108 |
+
|
109 |
+
|
110 |
+
class ModifiedResNet(nn.Module):
|
111 |
+
"""
|
112 |
+
A ResNet class that is similar to torchvision's but contains the following changes:
|
113 |
+
- There are now 3 "stem" convolutions as opposed to 1, with an average pool instead of a max pool.
|
114 |
+
- Performs anti-aliasing strided convolutions, where an avgpool is prepended to convolutions with stride > 1
|
115 |
+
- The final pooling layer is a QKV attention instead of an average pool
|
116 |
+
"""
|
117 |
+
|
118 |
+
def __init__(self, layers, output_dim, heads, input_resolution=224, width=64):
|
119 |
+
super().__init__()
|
120 |
+
self.output_dim = output_dim
|
121 |
+
self.input_resolution = input_resolution
|
122 |
+
|
123 |
+
# the 3-layer stem
|
124 |
+
self.conv1 = nn.Conv2d(3, width // 2, kernel_size=3, stride=2, padding=1, bias=False)
|
125 |
+
self.bn1 = nn.BatchNorm2d(width // 2)
|
126 |
+
self.conv2 = nn.Conv2d(width // 2, width // 2, kernel_size=3, padding=1, bias=False)
|
127 |
+
self.bn2 = nn.BatchNorm2d(width // 2)
|
128 |
+
self.conv3 = nn.Conv2d(width // 2, width, kernel_size=3, padding=1, bias=False)
|
129 |
+
self.bn3 = nn.BatchNorm2d(width)
|
130 |
+
self.avgpool = nn.AvgPool2d(2)
|
131 |
+
self.relu = nn.ReLU(inplace=True)
|
132 |
+
|
133 |
+
# residual layers
|
134 |
+
self._inplanes = width # this is a *mutable* variable used during construction
|
135 |
+
self.layer1 = self._make_layer(width, layers[0])
|
136 |
+
self.layer2 = self._make_layer(width * 2, layers[1], stride=2)
|
137 |
+
self.layer3 = self._make_layer(width * 4, layers[2], stride=2)
|
138 |
+
self.layer4 = self._make_layer(width * 8, layers[3], stride=2)
|
139 |
+
|
140 |
+
embed_dim = width * 32 # the ResNet feature dimension
|
141 |
+
self.attnpool = AttentionPool2d(input_resolution // 32, embed_dim, heads, output_dim)
|
142 |
+
|
143 |
+
def _make_layer(self, planes, blocks, stride=1):
|
144 |
+
layers = [Bottleneck(self._inplanes, planes, stride)]
|
145 |
+
|
146 |
+
self._inplanes = planes * Bottleneck.expansion
|
147 |
+
for _ in range(1, blocks):
|
148 |
+
layers.append(Bottleneck(self._inplanes, planes))
|
149 |
+
|
150 |
+
return nn.Sequential(*layers)
|
151 |
+
|
152 |
+
@torch.jit.ignore
|
153 |
+
def set_grad_checkpointing(self, enable=True):
|
154 |
+
# FIXME support for non-transformer
|
155 |
+
pass
|
156 |
+
|
157 |
+
def forward(self, x):
|
158 |
+
def stem(x):
|
159 |
+
for conv, bn in [(self.conv1, self.bn1), (self.conv2, self.bn2), (self.conv3, self.bn3)]:
|
160 |
+
x = self.relu(bn(conv(x)))
|
161 |
+
x = self.avgpool(x)
|
162 |
+
return x
|
163 |
+
|
164 |
+
x = x.type(self.conv1.weight.dtype)
|
165 |
+
x = stem(x)
|
166 |
+
x = self.layer1(x)
|
167 |
+
x = self.layer2(x)
|
168 |
+
x = self.layer3(x)
|
169 |
+
x = self.layer4(x)
|
170 |
+
x = self.attnpool(x)
|
171 |
+
|
172 |
+
return x
|
173 |
+
|
174 |
+
|
175 |
+
class LayerNorm(nn.LayerNorm):
|
176 |
+
"""Subclass torch's LayerNorm to handle fp16."""
|
177 |
+
|
178 |
+
def forward(self, x: torch.Tensor):
|
179 |
+
orig_type = x.dtype
|
180 |
+
ret = super().forward(x.type(torch.float32))
|
181 |
+
return ret.type(orig_type)
|
182 |
+
|
183 |
+
|
184 |
+
class QuickGELU(nn.Module):
|
185 |
+
def forward(self, x: torch.Tensor):
|
186 |
+
return x * torch.sigmoid(1.702 * x)
|
187 |
+
|
188 |
+
|
189 |
+
class ResidualAttentionBlock(nn.Module):
|
190 |
+
def __init__(self, d_model: int, n_head: int, attn_mask: torch.Tensor = None, use_flash_attention: bool = False):
|
191 |
+
super().__init__()
|
192 |
+
|
193 |
+
self.attn = nn.MultiheadAttention(d_model, n_head) if not use_flash_attention else FlashMHA(d_model, n_head)
|
194 |
+
self.ln_1 = LayerNorm(d_model)
|
195 |
+
self.mlp = nn.Sequential(OrderedDict([
|
196 |
+
("c_fc", nn.Linear(d_model, d_model * 4)),
|
197 |
+
("gelu", QuickGELU()),
|
198 |
+
("c_proj", nn.Linear(d_model * 4, d_model))
|
199 |
+
]))
|
200 |
+
self.ln_2 = LayerNorm(d_model)
|
201 |
+
self.attn_mask = attn_mask
|
202 |
+
self.use_flash_attention = use_flash_attention
|
203 |
+
|
204 |
+
def attention(self, x: torch.Tensor):
|
205 |
+
self.attn_mask = self.attn_mask.to(dtype=x.dtype, device=x.device) if self.attn_mask is not None else None
|
206 |
+
if self.use_flash_attention:
|
207 |
+
# Batch first is needed for FlashAttention. See https://github.com/HazyResearch/flash-attention/issues/84 for more information.
|
208 |
+
return self.attn(x.transpose(1, 0))[0].transpose(1, 0)
|
209 |
+
else:
|
210 |
+
return self.attn(x, x, x, need_weights=False, attn_mask=self.attn_mask)[0]
|
211 |
+
|
212 |
+
def forward(self, x: torch.Tensor):
|
213 |
+
x = x + self.attention(self.ln_1(x))
|
214 |
+
x = x + self.mlp(self.ln_2(x))
|
215 |
+
return x
|
216 |
+
|
217 |
+
|
218 |
+
class Transformer(nn.Module):
|
219 |
+
def __init__(self, width: int, layers: int, heads: int, attn_mask: torch.Tensor = None, use_flash_attention: bool = False):
|
220 |
+
super().__init__()
|
221 |
+
self.width = width
|
222 |
+
self.layers = layers
|
223 |
+
self.grad_checkpointing = False
|
224 |
+
self.resblocks = nn.Sequential(*[ResidualAttentionBlock(width, heads, attn_mask, use_flash_attention) for _ in range(layers)])
|
225 |
+
|
226 |
+
def forward(self, x: torch.Tensor):
|
227 |
+
if self.grad_checkpointing and not torch.jit.is_scripting():
|
228 |
+
for r in self.resblocks:
|
229 |
+
x = checkpoint(r, x)
|
230 |
+
return x
|
231 |
+
return self.resblocks(x)
|
232 |
+
|
233 |
+
|
234 |
+
class VisualTransformer(nn.Module):
|
235 |
+
def __init__(self, input_resolution: int, patch_size: int, width: int, layers: int, heads: int, output_dim: int, use_flash_attention: bool = False):
|
236 |
+
super().__init__()
|
237 |
+
self.input_resolution = input_resolution
|
238 |
+
self.grid_size = (self.input_resolution // patch_size, self.input_resolution // patch_size)
|
239 |
+
self.output_dim = output_dim
|
240 |
+
self.conv1 = nn.Conv2d(in_channels=3, out_channels=width, kernel_size=patch_size, stride=patch_size, bias=False)
|
241 |
+
|
242 |
+
scale = width ** -0.5
|
243 |
+
self.class_embedding = nn.Parameter(scale * torch.randn(width))
|
244 |
+
self.positional_embedding = nn.Parameter(scale * torch.randn((input_resolution // patch_size) ** 2 + 1, width))
|
245 |
+
self.ln_pre = LayerNorm(width)
|
246 |
+
|
247 |
+
self.transformer = Transformer(width, layers, heads, use_flash_attention=use_flash_attention)
|
248 |
+
|
249 |
+
self.ln_post = LayerNorm(width)
|
250 |
+
self.proj = nn.Parameter(scale * torch.randn(width, output_dim))
|
251 |
+
|
252 |
+
@torch.jit.ignore
|
253 |
+
def set_grad_checkpointing(self, enable=True):
|
254 |
+
self.transformer.grad_checkpointing = enable
|
255 |
+
|
256 |
+
def random_masking(self, x, mask_ratio):
|
257 |
+
N, L, D = x.shape # batch, length, dim
|
258 |
+
len_keep = int((L - 1) * (1 - mask_ratio))
|
259 |
+
|
260 |
+
noise = torch.rand(N, L - 1, device=x.device)
|
261 |
+
ids_shuffle = torch.argsort(noise, dim=1) + torch.ones(N, L - 1, device=x.device,
|
262 |
+
dtype=int)
|
263 |
+
ids_keep = ids_shuffle[:, :len_keep]
|
264 |
+
|
265 |
+
x_masked = torch.gather(x, dim=1, index=ids_keep.unsqueeze(-1).repeat(1, 1, D))
|
266 |
+
|
267 |
+
x0 = x[:, 0, :]
|
268 |
+
x0 = x0.reshape(N, 1, D)
|
269 |
+
x_masked_add = torch.cat([x0, x_masked], axis=1)
|
270 |
+
return x_masked_add
|
271 |
+
|
272 |
+
def forward(self, x: torch.Tensor, mask_ratio: float = 0.0):
|
273 |
+
x = self.conv1(x) # shape = [*, width, grid, grid]
|
274 |
+
x = x.reshape(x.shape[0], x.shape[1], -1) # shape = [*, width, grid ** 2]
|
275 |
+
x = x.permute(0, 2, 1) # shape = [*, grid ** 2, width]
|
276 |
+
x = torch.cat([self.class_embedding.to(x.dtype) + torch.zeros(x.shape[0], 1, x.shape[-1], dtype=x.dtype, device=x.device), x], dim=1) # shape = [*, grid ** 2 + 1, width]
|
277 |
+
x = x + self.positional_embedding.to(x.dtype)
|
278 |
+
if mask_ratio != 0:
|
279 |
+
x = self.random_masking(x, mask_ratio)
|
280 |
+
x = self.ln_pre(x)
|
281 |
+
|
282 |
+
x = x.permute(1, 0, 2) # NLD -> LND
|
283 |
+
x = self.transformer(x)
|
284 |
+
x = x.permute(1, 0, 2) # LND -> NLD
|
285 |
+
|
286 |
+
x = self.ln_post(x[:, 0, :])
|
287 |
+
|
288 |
+
if self.proj is not None:
|
289 |
+
x = x @ self.proj
|
290 |
+
|
291 |
+
return x
|
292 |
+
|
293 |
+
|
294 |
+
class CLIP(nn.Module):
|
295 |
+
def __init__(self,
|
296 |
+
embed_dim: int,
|
297 |
+
# vision
|
298 |
+
image_resolution: int,
|
299 |
+
vision_layers: Union[Tuple[int, int, int, int], int],
|
300 |
+
vision_width: int,
|
301 |
+
vision_patch_size: int,
|
302 |
+
# text
|
303 |
+
vocab_size: int,
|
304 |
+
text_attention_probs_dropout_prob: float,
|
305 |
+
text_hidden_act: str,
|
306 |
+
text_hidden_dropout_prob: float,
|
307 |
+
text_hidden_size: int,
|
308 |
+
text_initializer_range: float,
|
309 |
+
text_intermediate_size: int,
|
310 |
+
text_max_position_embeddings: int,
|
311 |
+
text_num_attention_heads: int,
|
312 |
+
text_num_hidden_layers: int,
|
313 |
+
text_type_vocab_size: int,
|
314 |
+
tokenizer = _tokenizer,
|
315 |
+
# vision head width, added this param for ViT-H
|
316 |
+
vision_head_width: int = 64,
|
317 |
+
use_flash_attention: bool = False,
|
318 |
+
):
|
319 |
+
super().__init__()
|
320 |
+
|
321 |
+
if isinstance(vision_layers, (tuple, list)):
|
322 |
+
vision_heads = vision_width * 32 // vision_head_width
|
323 |
+
self.visual = ModifiedResNet(
|
324 |
+
layers=vision_layers,
|
325 |
+
output_dim=embed_dim,
|
326 |
+
heads=vision_heads,
|
327 |
+
input_resolution=image_resolution,
|
328 |
+
width=vision_width
|
329 |
+
)
|
330 |
+
else:
|
331 |
+
vision_heads = vision_width // vision_head_width
|
332 |
+
self.visual = VisualTransformer(
|
333 |
+
input_resolution=image_resolution,
|
334 |
+
patch_size=vision_patch_size,
|
335 |
+
width=vision_width,
|
336 |
+
layers=vision_layers,
|
337 |
+
heads=vision_heads,
|
338 |
+
output_dim=embed_dim,
|
339 |
+
use_flash_attention=use_flash_attention
|
340 |
+
)
|
341 |
+
|
342 |
+
self.bert_config = BertConfig(
|
343 |
+
vocab_size_or_config_json_file=vocab_size,
|
344 |
+
hidden_size=text_hidden_size,
|
345 |
+
num_hidden_layers=text_num_hidden_layers,
|
346 |
+
num_attention_heads=text_num_attention_heads,
|
347 |
+
intermediate_size=text_intermediate_size,
|
348 |
+
hidden_act=text_hidden_act,
|
349 |
+
hidden_dropout_prob=text_hidden_dropout_prob,
|
350 |
+
attention_probs_dropout_prob=text_attention_probs_dropout_prob,
|
351 |
+
max_position_embeddings=text_max_position_embeddings,
|
352 |
+
type_vocab_size=text_type_vocab_size,
|
353 |
+
initializer_range=text_initializer_range,
|
354 |
+
layer_norm_eps=1e-12,
|
355 |
+
use_flash_attention=use_flash_attention
|
356 |
+
)
|
357 |
+
self.bert = BertModel(self.bert_config)
|
358 |
+
|
359 |
+
self.text_projection = nn.Parameter(torch.empty(text_hidden_size, embed_dim))
|
360 |
+
self.logit_scale = nn.Parameter(torch.ones([]) * np.log(1 / 0.07))
|
361 |
+
|
362 |
+
self.tokenizer = tokenizer
|
363 |
+
|
364 |
+
self.initialize_parameters()
|
365 |
+
|
366 |
+
def initialize_parameters(self):
|
367 |
+
self.logit_scale = nn.Parameter(torch.ones([]) * np.log(1 / 0.07))
|
368 |
+
|
369 |
+
if isinstance(self.visual, ModifiedResNet):
|
370 |
+
if self.visual.attnpool is not None:
|
371 |
+
std = self.visual.attnpool.c_proj.in_features ** -0.5
|
372 |
+
nn.init.normal_(self.visual.attnpool.q_proj.weight, std=std)
|
373 |
+
nn.init.normal_(self.visual.attnpool.k_proj.weight, std=std)
|
374 |
+
nn.init.normal_(self.visual.attnpool.v_proj.weight, std=std)
|
375 |
+
nn.init.normal_(self.visual.attnpool.c_proj.weight, std=std)
|
376 |
+
|
377 |
+
for resnet_block in [self.visual.layer1, self.visual.layer2, self.visual.layer3, self.visual.layer4]:
|
378 |
+
for name, param in resnet_block.named_parameters():
|
379 |
+
if name.endswith("bn3.weight"):
|
380 |
+
nn.init.zeros_(param)
|
381 |
+
|
382 |
+
if self.text_projection is not None:
|
383 |
+
nn.init.normal_(self.text_projection, std=self.bert_config.hidden_size ** -0.5)
|
384 |
+
|
385 |
+
@torch.jit.ignore
|
386 |
+
def set_grad_checkpointing(self, enable=True):
|
387 |
+
self.visual.set_grad_checkpointing(enable)
|
388 |
+
self.bert.set_grad_checkpointing(enable)
|
389 |
+
|
390 |
+
@property
|
391 |
+
def dtype(self):
|
392 |
+
return self.visual.conv1.weight.dtype
|
393 |
+
|
394 |
+
def encode_image(self, image, mask_ratio=0):
|
395 |
+
if isinstance(self.visual, ModifiedResNet):
|
396 |
+
# mask_ratio > 0 (FLIP strategy) is currently only implemented for VisualTransformer.
|
397 |
+
return self.visual(image.type(self.dtype))
|
398 |
+
return self.visual(image.type(self.dtype), mask_ratio)
|
399 |
+
|
400 |
+
def encode_text(self, text):
|
401 |
+
pad_index = self.tokenizer.vocab['[PAD]']
|
402 |
+
attn_mask = text.ne(pad_index).type(self.dtype)
|
403 |
+
x = self.bert(text, attention_mask=attn_mask)[0].type(self.dtype) # [batch_size, seq_length, hidden_size]
|
404 |
+
return x[:, 0, :] @ self.text_projection
|
405 |
+
|
406 |
+
def forward(self, image, text, mask_ratio=0):
|
407 |
+
assert image is not None or text is not None, "text and image cannot both be None!"
|
408 |
+
|
409 |
+
if image is None:
|
410 |
+
return self.encode_text(text)
|
411 |
+
elif text is None:
|
412 |
+
return self.encode_image(image, mask_ratio)
|
413 |
+
image_features = self.encode_image(image, mask_ratio)
|
414 |
+
text_features = self.encode_text(text)
|
415 |
+
|
416 |
+
image_features = image_features / image_features.norm(dim=-1, keepdim=True)
|
417 |
+
text_features = text_features / text_features.norm(dim=-1, keepdim=True)
|
418 |
+
|
419 |
+
return image_features, text_features, self.logit_scale.exp()
|
420 |
+
|
421 |
+
def get_similarity(self, image, text):
|
422 |
+
image_features = self.encode_image(image)
|
423 |
+
text_features = self.encode_text(text)
|
424 |
+
|
425 |
+
# normalized features
|
426 |
+
image_features = image_features / image_features.norm(dim=1, keepdim=True)
|
427 |
+
text_features = text_features / text_features.norm(dim=1, keepdim=True)
|
428 |
+
|
429 |
+
# cosine similarity as logits
|
430 |
+
logit_scale = self.logit_scale.exp()
|
431 |
+
logits_per_image = logit_scale * image_features @ text_features.t()
|
432 |
+
logits_per_text = logits_per_image.t()
|
433 |
+
|
434 |
+
# shape = [global_batch_size, global_batch_size]
|
435 |
+
return logits_per_image, logits_per_text
|
436 |
+
|
437 |
+
class CLIPWithTwoTextEncoder(nn.Module):
|
438 |
+
def __init__(self,
|
439 |
+
embed_dim: int,
|
440 |
+
# vision
|
441 |
+
image_resolution: int,
|
442 |
+
vision_layers: Union[Tuple[int, int, int, int], int],
|
443 |
+
vision_width: int,
|
444 |
+
vision_patch_size: int,
|
445 |
+
# text
|
446 |
+
vocab_size: int,
|
447 |
+
text_attention_probs_dropout_prob: float,
|
448 |
+
text_hidden_act: str,
|
449 |
+
text_hidden_dropout_prob: float,
|
450 |
+
text_hidden_size: int,
|
451 |
+
text_initializer_range: float,
|
452 |
+
text_intermediate_size: int,
|
453 |
+
text_max_position_embeddings: int,
|
454 |
+
text_num_attention_heads: int,
|
455 |
+
text_num_hidden_layers: int,
|
456 |
+
text_type_vocab_size: int,
|
457 |
+
tokenizer = _tokenizer,
|
458 |
+
# vision head width, added this param for ViT-H
|
459 |
+
vision_head_width: int = 64,
|
460 |
+
use_flash_attention: bool = False,
|
461 |
+
openai_clip_path: str = "/group/30042/kunyi/CLIP/clip-vit-large-patch14/",
|
462 |
+
):
|
463 |
+
super().__init__()
|
464 |
+
|
465 |
+
if isinstance(vision_layers, (tuple, list)):
|
466 |
+
vision_heads = vision_width * 32 // vision_head_width
|
467 |
+
self.visual = ModifiedResNet(
|
468 |
+
layers=vision_layers,
|
469 |
+
output_dim=embed_dim,
|
470 |
+
heads=vision_heads,
|
471 |
+
input_resolution=image_resolution,
|
472 |
+
width=vision_width
|
473 |
+
)
|
474 |
+
else:
|
475 |
+
vision_heads = vision_width // vision_head_width
|
476 |
+
self.visual = VisualTransformer(
|
477 |
+
input_resolution=image_resolution,
|
478 |
+
patch_size=vision_patch_size,
|
479 |
+
width=vision_width,
|
480 |
+
layers=vision_layers,
|
481 |
+
heads=vision_heads,
|
482 |
+
output_dim=embed_dim,
|
483 |
+
use_flash_attention=use_flash_attention
|
484 |
+
)
|
485 |
+
|
486 |
+
self.bert_config = BertConfig(
|
487 |
+
vocab_size_or_config_json_file=vocab_size,
|
488 |
+
hidden_size=text_hidden_size,
|
489 |
+
num_hidden_layers=text_num_hidden_layers,
|
490 |
+
num_attention_heads=text_num_attention_heads,
|
491 |
+
intermediate_size=text_intermediate_size,
|
492 |
+
hidden_act=text_hidden_act,
|
493 |
+
hidden_dropout_prob=text_hidden_dropout_prob,
|
494 |
+
attention_probs_dropout_prob=text_attention_probs_dropout_prob,
|
495 |
+
max_position_embeddings=text_max_position_embeddings,
|
496 |
+
type_vocab_size=text_type_vocab_size,
|
497 |
+
initializer_range=text_initializer_range,
|
498 |
+
layer_norm_eps=1e-12,
|
499 |
+
use_flash_attention=use_flash_attention
|
500 |
+
)
|
501 |
+
self.bert = BertModel(self.bert_config)
|
502 |
+
|
503 |
+
self.text_projection = nn.Parameter(torch.empty(text_hidden_size, embed_dim))
|
504 |
+
self.logit_scale = nn.Parameter(torch.ones([]) * np.log(1 / 0.07))
|
505 |
+
|
506 |
+
self.tokenizer = tokenizer
|
507 |
+
|
508 |
+
print('loading openai clip text encoder')
|
509 |
+
self.openai_clip_text_encoder = CLIPTextModelWithProjection.from_pretrained(openai_clip_path)
|
510 |
+
|
511 |
+
self.initialize_parameters()
|
512 |
+
|
513 |
+
|
514 |
+
def initialize_parameters(self):
|
515 |
+
self.logit_scale = nn.Parameter(torch.ones([]) * np.log(1 / 0.07))
|
516 |
+
|
517 |
+
if isinstance(self.visual, ModifiedResNet):
|
518 |
+
if self.visual.attnpool is not None:
|
519 |
+
std = self.visual.attnpool.c_proj.in_features ** -0.5
|
520 |
+
nn.init.normal_(self.visual.attnpool.q_proj.weight, std=std)
|
521 |
+
nn.init.normal_(self.visual.attnpool.k_proj.weight, std=std)
|
522 |
+
nn.init.normal_(self.visual.attnpool.v_proj.weight, std=std)
|
523 |
+
nn.init.normal_(self.visual.attnpool.c_proj.weight, std=std)
|
524 |
+
|
525 |
+
for resnet_block in [self.visual.layer1, self.visual.layer2, self.visual.layer3, self.visual.layer4]:
|
526 |
+
for name, param in resnet_block.named_parameters():
|
527 |
+
if name.endswith("bn3.weight"):
|
528 |
+
nn.init.zeros_(param)
|
529 |
+
|
530 |
+
if self.text_projection is not None:
|
531 |
+
nn.init.normal_(self.text_projection, std=self.bert_config.hidden_size ** -0.5)
|
532 |
+
|
533 |
+
@torch.jit.ignore
|
534 |
+
def set_grad_checkpointing(self, enable=True):
|
535 |
+
self.visual.set_grad_checkpointing(enable)
|
536 |
+
self.bert.set_grad_checkpointing(enable)
|
537 |
+
|
538 |
+
@property
|
539 |
+
def dtype(self):
|
540 |
+
return self.visual.conv1.weight.dtype
|
541 |
+
|
542 |
+
def encode_image(self, image, mask_ratio=0):
|
543 |
+
if isinstance(self.visual, ModifiedResNet):
|
544 |
+
# mask_ratio > 0 (FLIP strategy) is currently only implemented for VisualTransformer.
|
545 |
+
return self.visual(image.type(self.dtype))
|
546 |
+
return self.visual(image.type(self.dtype), mask_ratio)
|
547 |
+
|
548 |
+
def encode_text(self, text):
|
549 |
+
pad_index = self.tokenizer.vocab['[PAD]']
|
550 |
+
attn_mask = text.ne(pad_index).type(self.dtype)
|
551 |
+
x = self.bert(text, attention_mask=attn_mask)[0].type(self.dtype) # [batch_size, seq_length, hidden_size]
|
552 |
+
return x[:, 0, :] @ self.text_projection
|
553 |
+
|
554 |
+
def encode_text_ENG(self, text):
|
555 |
+
text_emb = self.openai_clip_text_encoder(text).text_embeds
|
556 |
+
return text_emb
|
557 |
+
|
558 |
+
def forward(self, image, text, is_ENG=False, mask_ratio=0):
|
559 |
+
assert image is not None or text is not None, "text and image cannot both be None!"
|
560 |
+
|
561 |
+
if image is None:
|
562 |
+
if not is_ENG:
|
563 |
+
return self.encode_text(text)
|
564 |
+
else:
|
565 |
+
return self.encode_text_ENG(text)
|
566 |
+
elif text is None:
|
567 |
+
return self.encode_image(image, mask_ratio)
|
568 |
+
image_features = self.encode_image(image, mask_ratio)
|
569 |
+
|
570 |
+
if not is_ENG:
|
571 |
+
text_features = self.encode_text(text)
|
572 |
+
else:
|
573 |
+
text_features = self.encode_text_ENG(text)
|
574 |
+
|
575 |
+
image_features = image_features / image_features.norm(dim=-1, keepdim=True)
|
576 |
+
text_features = text_features / text_features.norm(dim=-1, keepdim=True)
|
577 |
+
|
578 |
+
return image_features, text_features, self.logit_scale.exp()
|
579 |
+
|
580 |
+
def get_similarity(self, image, text):
|
581 |
+
image_features = self.encode_image(image)
|
582 |
+
text_features = self.encode_text(text)
|
583 |
+
|
584 |
+
# normalized features
|
585 |
+
image_features = image_features / image_features.norm(dim=1, keepdim=True)
|
586 |
+
text_features = text_features / text_features.norm(dim=1, keepdim=True)
|
587 |
+
|
588 |
+
# cosine similarity as logits
|
589 |
+
logit_scale = self.logit_scale.exp()
|
590 |
+
logits_per_image = logit_scale * image_features @ text_features.t()
|
591 |
+
logits_per_text = logits_per_image.t()
|
592 |
+
|
593 |
+
# shape = [global_batch_size, global_batch_size]
|
594 |
+
return logits_per_image, logits_per_text
|
595 |
+
|
596 |
+
class CLIP4SD(nn.Module):
|
597 |
+
def __init__(self,
|
598 |
+
embed_dim: int,
|
599 |
+
# vision
|
600 |
+
image_resolution: int,
|
601 |
+
vision_layers: Union[Tuple[int, int, int, int], int],
|
602 |
+
vision_width: int,
|
603 |
+
vision_patch_size: int,
|
604 |
+
# text
|
605 |
+
vocab_size: int,
|
606 |
+
text_attention_probs_dropout_prob: float,
|
607 |
+
text_hidden_act: str,
|
608 |
+
text_hidden_dropout_prob: float,
|
609 |
+
text_hidden_size: int,
|
610 |
+
text_initializer_range: float,
|
611 |
+
text_intermediate_size: int,
|
612 |
+
text_max_position_embeddings: int,
|
613 |
+
text_num_attention_heads: int,
|
614 |
+
text_num_hidden_layers: int,
|
615 |
+
text_type_vocab_size: int,
|
616 |
+
tokenizer = _tokenizer,
|
617 |
+
# vision head width, added this param for ViT-H
|
618 |
+
vision_head_width: int = 64,
|
619 |
+
use_flash_attention: bool = False,
|
620 |
+
):
|
621 |
+
super().__init__()
|
622 |
+
|
623 |
+
if isinstance(vision_layers, (tuple, list)):
|
624 |
+
vision_heads = vision_width * 32 // vision_head_width
|
625 |
+
self.visual = ModifiedResNet(
|
626 |
+
layers=vision_layers,
|
627 |
+
output_dim=embed_dim,
|
628 |
+
heads=vision_heads,
|
629 |
+
input_resolution=image_resolution,
|
630 |
+
width=vision_width
|
631 |
+
)
|
632 |
+
else:
|
633 |
+
vision_heads = vision_width // vision_head_width
|
634 |
+
self.visual = VisualTransformer(
|
635 |
+
input_resolution=image_resolution,
|
636 |
+
patch_size=vision_patch_size,
|
637 |
+
width=vision_width,
|
638 |
+
layers=vision_layers,
|
639 |
+
heads=vision_heads,
|
640 |
+
output_dim=embed_dim,
|
641 |
+
use_flash_attention=use_flash_attention
|
642 |
+
)
|
643 |
+
|
644 |
+
self.bert_config = BertConfig(
|
645 |
+
vocab_size_or_config_json_file=vocab_size,
|
646 |
+
hidden_size=text_hidden_size,
|
647 |
+
num_hidden_layers=text_num_hidden_layers,
|
648 |
+
num_attention_heads=text_num_attention_heads,
|
649 |
+
intermediate_size=text_intermediate_size,
|
650 |
+
hidden_act=text_hidden_act,
|
651 |
+
hidden_dropout_prob=text_hidden_dropout_prob,
|
652 |
+
attention_probs_dropout_prob=text_attention_probs_dropout_prob,
|
653 |
+
max_position_embeddings=text_max_position_embeddings,
|
654 |
+
type_vocab_size=text_type_vocab_size,
|
655 |
+
initializer_range=text_initializer_range,
|
656 |
+
layer_norm_eps=1e-12,
|
657 |
+
use_flash_attention=use_flash_attention
|
658 |
+
)
|
659 |
+
self.bert = BertModel(self.bert_config)
|
660 |
+
|
661 |
+
self.text_projection = nn.Parameter(torch.empty(text_hidden_size, embed_dim))
|
662 |
+
self.logit_scale = nn.Parameter(torch.ones([]) * np.log(1 / 0.07))
|
663 |
+
|
664 |
+
self.tokenizer = tokenizer
|
665 |
+
self.ln_final = LayerNorm(text_hidden_size)
|
666 |
+
|
667 |
+
self.initialize_parameters()
|
668 |
+
|
669 |
+
def initialize_parameters(self):
|
670 |
+
self.logit_scale = nn.Parameter(torch.ones([]) * np.log(1 / 0.07))
|
671 |
+
|
672 |
+
if isinstance(self.visual, ModifiedResNet):
|
673 |
+
if self.visual.attnpool is not None:
|
674 |
+
std = self.visual.attnpool.c_proj.in_features ** -0.5
|
675 |
+
nn.init.normal_(self.visual.attnpool.q_proj.weight, std=std)
|
676 |
+
nn.init.normal_(self.visual.attnpool.k_proj.weight, std=std)
|
677 |
+
nn.init.normal_(self.visual.attnpool.v_proj.weight, std=std)
|
678 |
+
nn.init.normal_(self.visual.attnpool.c_proj.weight, std=std)
|
679 |
+
|
680 |
+
for resnet_block in [self.visual.layer1, self.visual.layer2, self.visual.layer3, self.visual.layer4]:
|
681 |
+
for name, param in resnet_block.named_parameters():
|
682 |
+
if name.endswith("bn3.weight"):
|
683 |
+
nn.init.zeros_(param)
|
684 |
+
|
685 |
+
if self.text_projection is not None:
|
686 |
+
nn.init.normal_(self.text_projection, std=self.bert_config.hidden_size ** -0.5)
|
687 |
+
|
688 |
+
@torch.jit.ignore
|
689 |
+
def set_grad_checkpointing(self, enable=True):
|
690 |
+
self.visual.set_grad_checkpointing(enable)
|
691 |
+
self.bert.set_grad_checkpointing(enable)
|
692 |
+
|
693 |
+
@property
|
694 |
+
def dtype(self):
|
695 |
+
return self.visual.conv1.weight.dtype
|
696 |
+
|
697 |
+
def encode_image(self, image, mask_ratio=0):
|
698 |
+
if isinstance(self.visual, ModifiedResNet):
|
699 |
+
# mask_ratio > 0 (FLIP strategy) is currently only implemented for VisualTransformer.
|
700 |
+
return self.visual(image.type(self.dtype))
|
701 |
+
return self.visual(image.type(self.dtype), mask_ratio)
|
702 |
+
|
703 |
+
# def encode_text(self, text):
|
704 |
+
# pad_index = self.tokenizer.vocab['[PAD]']
|
705 |
+
# attn_mask = text.ne(pad_index).type(self.dtype)
|
706 |
+
# x = self.bert(text, attention_mask=attn_mask)[0].type(self.dtype) # [batch_size, seq_length, hidden_size]
|
707 |
+
# return x[:, 0, :] @ self.text_projection
|
708 |
+
def encode_text(self, text):
|
709 |
+
pad_index = self.tokenizer.vocab['[PAD]']
|
710 |
+
attn_mask = text.ne(pad_index).type(self.dtype)
|
711 |
+
x = self.bert(text, attention_mask=attn_mask)[0].type(self.dtype) # [batch_size, seq_length, hidden_size]
|
712 |
+
x = self.ln_final(x).type(self.dtype)
|
713 |
+
x = x[torch.arange(x.shape[0]), text.argmax(dim=-1)] @ self.text_projection
|
714 |
+
return x
|
715 |
+
|
716 |
+
def forward(self, image, text, mask_ratio=0):
|
717 |
+
assert image is not None or text is not None, "text and image cannot both be None!"
|
718 |
+
|
719 |
+
if image is None:
|
720 |
+
return self.encode_text(text)
|
721 |
+
elif text is None:
|
722 |
+
return self.encode_image(image)
|
723 |
+
image_features = self.encode_image(image, mask_ratio)
|
724 |
+
text_features = self.encode_text(text)
|
725 |
+
|
726 |
+
image_features = image_features / image_features.norm(dim=-1, keepdim=True)
|
727 |
+
text_features = text_features / text_features.norm(dim=-1, keepdim=True)
|
728 |
+
|
729 |
+
return image_features, text_features, self.logit_scale.exp()
|
730 |
+
|
731 |
+
def get_similarity(self, image, text):
|
732 |
+
image_features = self.encode_image(image)
|
733 |
+
text_features = self.encode_text(text)
|
734 |
+
|
735 |
+
# normalized features
|
736 |
+
image_features = image_features / image_features.norm(dim=1, keepdim=True)
|
737 |
+
text_features = text_features / text_features.norm(dim=1, keepdim=True)
|
738 |
+
|
739 |
+
# cosine similarity as logits
|
740 |
+
logit_scale = self.logit_scale.exp()
|
741 |
+
logits_per_image = logit_scale * image_features @ text_features.t()
|
742 |
+
logits_per_text = logits_per_image.t()
|
743 |
+
|
744 |
+
# shape = [global_batch_size, global_batch_size]
|
745 |
+
return logits_per_image, logits_per_text
|
746 |
+
|
747 |
+
def convert_models_to_fp32(model):
|
748 |
+
for p in model.parameters():
|
749 |
+
p.data = p.data.float()
|
750 |
+
if p.grad is not None:
|
751 |
+
p.grad.data = p.grad.data.float()
|
752 |
+
|
753 |
+
|
754 |
+
def convert_weights(model: nn.Module):
|
755 |
+
"""Convert applicable model parameters to fp16"""
|
756 |
+
|
757 |
+
def _convert_weights_to_fp16(l):
|
758 |
+
if isinstance(l, (nn.Conv1d, nn.Conv2d, nn.Linear)):
|
759 |
+
l.weight.data = l.weight.data.half()
|
760 |
+
if l.bias is not None:
|
761 |
+
l.bias.data = l.bias.data.half()
|
762 |
+
|
763 |
+
if isinstance(l, nn.MultiheadAttention):
|
764 |
+
for attr in [*[f"{s}_proj_weight" for s in ["in", "q", "k", "v"]], "in_proj_bias", "bias_k", "bias_v"]:
|
765 |
+
tensor = getattr(l, attr)
|
766 |
+
if tensor is not None:
|
767 |
+
tensor.data = tensor.data.half()
|
768 |
+
|
769 |
+
if isinstance(l, BertModel):
|
770 |
+
l.to(torch.half)
|
771 |
+
|
772 |
+
for name in ["text_projection", "proj"]:
|
773 |
+
try:
|
774 |
+
if hasattr(l, name):
|
775 |
+
attr = getattr(l, name)
|
776 |
+
if attr is not None:
|
777 |
+
attr.data = attr.data.half()
|
778 |
+
except:
|
779 |
+
print('name', name)
|
780 |
+
|
781 |
+
model.apply(_convert_weights_to_fp16)
|
782 |
+
|
783 |
+
|
784 |
+
def restore_model(model, clip_state_dict: dict, bert_state_dict: dict, use_flash_attention: bool):
|
785 |
+
merged_state_dict = {}
|
786 |
+
|
787 |
+
# use clip_state_dict to initialize the image encoder & logit scale
|
788 |
+
if clip_state_dict is not None:
|
789 |
+
for k, v in clip_state_dict.items():
|
790 |
+
if k.startswith("visual") or k == "logit_scale":
|
791 |
+
merged_state_dict[k] = v
|
792 |
+
|
793 |
+
# use bert_state_dict to initialize the text encoder
|
794 |
+
if bert_state_dict is not None:
|
795 |
+
for k, v in bert_state_dict.items():
|
796 |
+
if k.startswith("bert") and "bert.pooler" not in k:
|
797 |
+
merged_state_dict[k] = v
|
798 |
+
|
799 |
+
# adapt flash attention
|
800 |
+
if use_flash_attention:
|
801 |
+
merged_state_dict = convert_state_dict(merged_state_dict)
|
802 |
+
|
803 |
+
convert_weights(model)
|
804 |
+
resize_pos_embed(merged_state_dict, model)
|
805 |
+
model.load_state_dict(merged_state_dict, strict=False)
|
806 |
+
return model.eval()
|
807 |
+
|
808 |
+
|
809 |
+
def convert_state_dict(state_dict):
|
810 |
+
"""Adapt to Flash Attention"""
|
811 |
+
if not state_dict:
|
812 |
+
return state_dict
|
813 |
+
|
814 |
+
prefix = 'module.' if list(state_dict.keys())[0].startswith('module') else ''
|
815 |
+
|
816 |
+
if f'{prefix}visual.transformer.resblocks.0.attn.in_proj_weight' in state_dict:
|
817 |
+
for k in list(state_dict.keys()):
|
818 |
+
if 'attn.in_proj_weight' in k:
|
819 |
+
state_dict[k.replace('attn.in_proj_weight', 'attn.Wqkv.weight')] = state_dict.pop(k)
|
820 |
+
elif 'attn.in_proj_bias' in k:
|
821 |
+
state_dict[k.replace('attn.in_proj_bias', 'attn.Wqkv.bias')] = state_dict.pop(k)
|
822 |
+
elif f'{prefix}visual.transformer.resblocks.0.attn.Wqkv.weight' in state_dict:
|
823 |
+
for k in list(state_dict.keys()):
|
824 |
+
if 'attn.Wqkv.weight' in k:
|
825 |
+
state_dict[k.replace('attn.Wqkv.weight', 'attn.in_proj_weight')] = state_dict.pop(k)
|
826 |
+
elif 'attn.Wqkv.bias' in k:
|
827 |
+
state_dict[k.replace('attn.Wqkv.bias', 'attn.in_proj_bias')] = state_dict.pop(k)
|
828 |
+
|
829 |
+
if f'{prefix}bert.encoder.layer.0.attention.self.query.weight' in state_dict:
|
830 |
+
i = 0
|
831 |
+
while f'{prefix}bert.encoder.layer.{i}.attention.self.query.weight' in state_dict:
|
832 |
+
state_dict[f'{prefix}bert.encoder.layer.{i}.attention.self.Wqkv.weight'] = torch.cat(
|
833 |
+
(state_dict.pop(f'{prefix}bert.encoder.layer.{i}.attention.self.query.weight'),
|
834 |
+
state_dict.pop(f'{prefix}bert.encoder.layer.{i}.attention.self.key.weight'),
|
835 |
+
state_dict.pop(f'{prefix}bert.encoder.layer.{i}.attention.self.value.weight'))
|
836 |
+
)
|
837 |
+
state_dict[f'{prefix}bert.encoder.layer.{i}.attention.self.Wqkv.bias'] = torch.cat(
|
838 |
+
(state_dict.pop(f'{prefix}bert.encoder.layer.{i}.attention.self.query.bias'),
|
839 |
+
state_dict.pop(f'{prefix}bert.encoder.layer.{i}.attention.self.key.bias'),
|
840 |
+
state_dict.pop(f'{prefix}bert.encoder.layer.{i}.attention.self.value.bias'))
|
841 |
+
)
|
842 |
+
state_dict[f'{prefix}bert.encoder.layer.{i}.attention.self.out_proj.weight'] = \
|
843 |
+
state_dict.pop(f'{prefix}bert.encoder.layer.{i}.attention.output.dense.weight')
|
844 |
+
state_dict[f'{prefix}bert.encoder.layer.{i}.attention.self.out_proj.bias'] = \
|
845 |
+
state_dict.pop(f'{prefix}bert.encoder.layer.{i}.attention.output.dense.bias')
|
846 |
+
i += 1
|
847 |
+
elif f'{prefix}bert.encoder.layer.0.attention.self.Wqkv.weight' in state_dict:
|
848 |
+
i = 0
|
849 |
+
while f'{prefix}bert.encoder.layer.{i}.attention.self.Wqkv.weight' in state_dict:
|
850 |
+
state_dict[f'{prefix}bert.encoder.layer.{i}.attention.self.query.weight'], \
|
851 |
+
state_dict[f'{prefix}bert.encoder.layer.{i}.attention.self.key.weight'], \
|
852 |
+
state_dict[f'{prefix}bert.encoder.layer.{i}.attention.self.value.weight'] = \
|
853 |
+
torch.chunk(state_dict.pop(f'{prefix}bert.encoder.layer.{i}.attention.self.Wqkv.weight'), chunks=3)
|
854 |
+
state_dict[f'{prefix}bert.encoder.layer.{i}.attention.self.query.bias'], \
|
855 |
+
state_dict[f'{prefix}bert.encoder.layer.{i}.attention.self.key.bias'], \
|
856 |
+
state_dict[f'{prefix}bert.encoder.layer.{i}.attention.self.value.bias'] = \
|
857 |
+
torch.chunk(state_dict.pop(f'{prefix}bert.encoder.layer.{i}.attention.self.Wqkv.bias'), chunks=3)
|
858 |
+
state_dict[f'{prefix}bert.encoder.layer.{i}.attention.output.dense.weight'] = \
|
859 |
+
state_dict.pop(f'{prefix}bert.encoder.layer.{i}.attention.self.out_proj.weight')
|
860 |
+
state_dict[f'{prefix}bert.encoder.layer.{i}.attention.output.dense.bias'] = \
|
861 |
+
state_dict.pop(f'{prefix}bert.encoder.layer.{i}.attention.self.out_proj.bias')
|
862 |
+
i += 1
|
863 |
+
|
864 |
+
return state_dict
|
865 |
+
|
866 |
+
|
867 |
+
def resize_pos_embed(state_dict, model, interpolation: str = 'bicubic', seq_dim=1, prefix=""):
|
868 |
+
# Rescale the grid of position embeddings when loading from state_dict
|
869 |
+
old_pos_embed = state_dict.get(prefix + 'visual.positional_embedding', None)
|
870 |
+
model = model.module if hasattr(model, 'module') else model
|
871 |
+
if old_pos_embed is None or not hasattr(model.visual, 'grid_size'):
|
872 |
+
return
|
873 |
+
grid_size = to_2tuple(model.visual.grid_size)
|
874 |
+
extra_tokens = 1 # FIXME detect different token configs (ie no class token, or more)
|
875 |
+
new_seq_len = grid_size[0] * grid_size[1] + extra_tokens
|
876 |
+
if new_seq_len == old_pos_embed.shape[0]:
|
877 |
+
return
|
878 |
+
|
879 |
+
if extra_tokens:
|
880 |
+
pos_emb_tok, pos_emb_img = old_pos_embed[:extra_tokens], old_pos_embed[extra_tokens:]
|
881 |
+
else:
|
882 |
+
pos_emb_tok, pos_emb_img = None, old_pos_embed
|
883 |
+
old_grid_size = to_2tuple(int(math.sqrt(len(pos_emb_img))))
|
884 |
+
|
885 |
+
logging.info('Resizing position embedding grid-size from %s to %s', old_grid_size, grid_size)
|
886 |
+
pos_emb_img = pos_emb_img.reshape(1, old_grid_size[0], old_grid_size[1], -1).permute(0, 3, 1, 2)
|
887 |
+
pos_emb_img = F.interpolate(
|
888 |
+
pos_emb_img,
|
889 |
+
size=grid_size,
|
890 |
+
mode=interpolation,
|
891 |
+
align_corners=True,
|
892 |
+
)
|
893 |
+
pos_emb_img = pos_emb_img.permute(0, 2, 3, 1).reshape(1, grid_size[0] * grid_size[1], -1)[0]
|
894 |
+
if pos_emb_tok is not None:
|
895 |
+
new_pos_embed = torch.cat([pos_emb_tok, pos_emb_img], dim=0)
|
896 |
+
else:
|
897 |
+
new_pos_embed = pos_emb_img
|
898 |
+
state_dict[prefix + 'visual.positional_embedding'] = new_pos_embed
|
899 |
+
|
900 |
+
|
901 |
+
# From PyTorch internals
|
902 |
+
def _ntuple(n):
|
903 |
+
def parse(x):
|
904 |
+
if isinstance(x, collections.abc.Iterable):
|
905 |
+
return x
|
906 |
+
return tuple(repeat(x, n))
|
907 |
+
return parse
|
908 |
+
|
909 |
+
|
910 |
+
to_1tuple = _ntuple(1)
|
911 |
+
to_2tuple = _ntuple(2)
|
912 |
+
to_3tuple = _ntuple(3)
|
913 |
+
to_4tuple = _ntuple(4)
|
914 |
+
to_ntuple = lambda n, x: _ntuple(n)(x)
|
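A hedged inference sketch for the CLIP class defined above (not part of the commit). `model`, `image_tensor` and `text_ids` are assumed to be prepared elsewhere (e.g. by the preprocessing and tokenization utilities shipped in clip/utils.py); only the call pattern of forward() and the use of its already L2-normalized outputs are illustrated.

```python
import torch

# Assumed to exist: `model` (an instance of the CLIP class above), a preprocessed
# image batch `image_tensor` of shape [N, 3, H, W], and tokenized ids `text_ids` [M, seq_len].
with torch.no_grad():
    image_features, text_features, logit_scale = model(image_tensor, text_ids)
    # forward() already L2-normalizes both feature sets, so the scaled dot
    # product below is a temperature-scaled cosine similarity.
    logits_per_image = logit_scale * image_features @ text_features.t()
    probs = logits_per_image.softmax(dim=-1)
```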
clip/model_configs/RBT3-chinese.json
ADDED
@@ -0,0 +1,13 @@
{
    "vocab_size": 21128,
    "text_attention_probs_dropout_prob": 0.1,
    "text_hidden_act": "gelu",
    "text_hidden_dropout_prob": 0.1,
    "text_hidden_size": 768,
    "text_initializer_range": 0.02,
    "text_intermediate_size": 3072,
    "text_max_position_embeddings": 512,
    "text_num_attention_heads": 12,
    "text_num_hidden_layers": 3,
    "text_type_vocab_size": 2
}
clip/model_configs/RN50.json
ADDED
@@ -0,0 +1,7 @@
{
    "embed_dim": 1024,
    "image_resolution": 224,
    "vision_layers": "[3,4,6,3]",
    "vision_width": 64,
    "vision_patch_size": null
}
clip/model_configs/RoBERTa-wwm-ext-base-chinese.json
ADDED
@@ -0,0 +1,13 @@
{
    "vocab_size": 21128,
    "text_attention_probs_dropout_prob": 0.1,
    "text_hidden_act": "gelu",
    "text_hidden_dropout_prob": 0.1,
    "text_hidden_size": 768,
    "text_initializer_range": 0.02,
    "text_intermediate_size": 3072,
    "text_max_position_embeddings": 512,
    "text_num_attention_heads": 12,
    "text_num_hidden_layers": 12,
    "text_type_vocab_size": 2
}
clip/model_configs/RoBERTa-wwm-ext-large-chinese.json
ADDED
@@ -0,0 +1,13 @@
{
    "vocab_size": 21128,
    "text_attention_probs_dropout_prob": 0.1,
    "text_hidden_act": "gelu",
    "text_hidden_dropout_prob": 0.1,
    "text_hidden_size": 1024,
    "text_initializer_range": 0.02,
    "text_intermediate_size": 4096,
    "text_max_position_embeddings": 512,
    "text_num_attention_heads": 16,
    "text_num_hidden_layers": 24,
    "text_type_vocab_size": 2
}
clip/model_configs/ViT-B-16.json
ADDED
@@ -0,0 +1,7 @@
{
    "embed_dim": 512,
    "image_resolution": 224,
    "vision_layers": 12,
    "vision_width": 768,
    "vision_patch_size": 16
}
clip/model_configs/ViT-B-32.json
ADDED
@@ -0,0 +1,7 @@
{
    "embed_dim": 512,
    "image_resolution": 224,
    "vision_layers": 12,
    "vision_width": 768,
    "vision_patch_size": 32
}
clip/model_configs/ViT-H-14.json
ADDED
@@ -0,0 +1,8 @@
{
    "embed_dim": 1024,
    "image_resolution": 224,
    "vision_layers": 32,
    "vision_width": 1280,
    "vision_head_width": 80,
    "vision_patch_size": 14
}
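Note (not part of the commit): `vision_head_width` is set only in the ViT-H config because its per-head width differs from the default of 64; with `"vision_width": 1280` and `"vision_head_width": 80`, the CLIP constructor above computes `vision_heads = 1280 // 80 = 16` attention heads for the vision tower.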
clip/model_configs/ViT-L-14-336.json
ADDED
@@ -0,0 +1,7 @@
{
    "embed_dim": 768,
    "image_resolution": 336,
    "vision_layers": 24,
    "vision_width": 1024,
    "vision_patch_size": 14
}
clip/model_configs/ViT-L-14.json
ADDED
@@ -0,0 +1,7 @@
{
    "embed_dim": 768,
    "image_resolution": 224,
    "vision_layers": 24,
    "vision_width": 1024,
    "vision_patch_size": 14
}
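A hedged sketch (not part of the commit) of how one vision config and one text config from clip/model_configs/ could be merged into constructor arguments for the CLIP class; the repository's own loading code (e.g. clip/utils.py and the eval scripts) is assumed to do the equivalent, so treat the file paths and the parsing of the ResNet layer string as illustrative.

```python
import json
from clip.model import CLIP

# Illustrative pairing: a ViT-B/16 vision tower with a RoBERTa-wwm-ext-base text tower.
with open("clip/model_configs/ViT-B-16.json") as f:
    model_info = json.load(f)
with open("clip/model_configs/RoBERTa-wwm-ext-base-chinese.json") as f:
    model_info.update(json.load(f))

# ResNet configs (e.g. RN50.json) store vision_layers as the string "[3,4,6,3]";
# ViT configs already store an int, so only the string case needs parsing.
if isinstance(model_info["vision_layers"], str):
    model_info["vision_layers"] = eval(model_info["vision_layers"])

model = CLIP(**model_info)  # the JSON keys map 1:1 onto the CLIP.__init__ parameters above
```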
clip/modeling_bert.py
ADDED
@@ -0,0 +1,484 @@
1 |
+
# coding=utf-8
|
2 |
+
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
|
3 |
+
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
|
4 |
+
#
|
5 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
6 |
+
# you may not use this file except in compliance with the License.
|
7 |
+
# You may obtain a copy of the License at
|
8 |
+
#
|
9 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
10 |
+
#
|
11 |
+
# Unless required by applicable law or agreed to in writing, software
|
12 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
13 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
14 |
+
# See the License for the specific language governing permissions and
|
15 |
+
# limitations under the License.
|
16 |
+
"""PyTorch BERT model. """
|
17 |
+
|
18 |
+
from __future__ import absolute_import, division, print_function, unicode_literals
|
19 |
+
|
20 |
+
import json
|
21 |
+
import logging
|
22 |
+
import math
|
23 |
+
import os
|
24 |
+
import sys
|
25 |
+
from io import open
|
26 |
+
|
27 |
+
import torch
|
28 |
+
from torch import nn
|
29 |
+
from torch.utils.checkpoint import checkpoint
|
30 |
+
|
31 |
+
import importlib.util
|
32 |
+
if importlib.util.find_spec('flash_attn'):
|
33 |
+
FlashMHA = importlib.import_module('flash_attn.flash_attention').FlashMHA
|
34 |
+
|
35 |
+
from .configuration_bert import BertConfig
|
36 |
+
|
37 |
+
logger = logging.getLogger(__name__)
|
38 |
+
|
39 |
+
def gelu(x):
|
40 |
+
""" Original Implementation of the gelu activation function in Google Bert repo when initially created.
|
41 |
+
For information: OpenAI GPT's gelu is slightly different (and gives slightly different results):
|
42 |
+
0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
|
43 |
+
Also see https://arxiv.org/abs/1606.08415
|
44 |
+
"""
|
45 |
+
return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))
|
46 |
+
|
47 |
+
def gelu_new(x):
|
48 |
+
""" Implementation of the gelu activation function currently in Google Bert repo (identical to OpenAI GPT).
|
49 |
+
Also see https://arxiv.org/abs/1606.08415
|
50 |
+
"""
|
51 |
+
return 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
|
52 |
+
|
53 |
+
def swish(x):
|
54 |
+
return x * torch.sigmoid(x)
|
55 |
+
|
56 |
+
|
57 |
+
ACT2FN = {"gelu": gelu, "relu": torch.nn.functional.relu, "swish": swish, "gelu_new": gelu_new}
|
58 |
+
|
59 |
+
|
60 |
+
BertLayerNorm = torch.nn.LayerNorm
|
61 |
+
|
62 |
+
class BertEmbeddings(nn.Module):
|
63 |
+
"""Construct the embeddings from word, position and token_type embeddings.
|
64 |
+
"""
|
65 |
+
def __init__(self, config):
|
66 |
+
super(BertEmbeddings, self).__init__()
|
67 |
+
self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=0)
|
68 |
+
self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
|
69 |
+
self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)
|
70 |
+
|
71 |
+
# self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load
|
72 |
+
# any TensorFlow checkpoint file
|
73 |
+
self.LayerNorm = BertLayerNorm(config.hidden_size, eps=config.layer_norm_eps)
|
74 |
+
self.dropout = nn.Dropout(config.hidden_dropout_prob)
|
75 |
+
|
76 |
+
def forward(self, input_ids, token_type_ids=None, position_ids=None):
|
77 |
+
seq_length = input_ids.size(1)
|
78 |
+
if position_ids is None:
|
79 |
+
position_ids = torch.arange(seq_length, dtype=torch.long, device=input_ids.device)
|
80 |
+
position_ids = position_ids.unsqueeze(0).expand_as(input_ids)
|
81 |
+
if token_type_ids is None:
|
82 |
+
token_type_ids = torch.zeros_like(input_ids)
|
83 |
+
|
84 |
+
words_embeddings = self.word_embeddings(input_ids)
|
85 |
+
position_embeddings = self.position_embeddings(position_ids)
|
86 |
+
token_type_embeddings = self.token_type_embeddings(token_type_ids)
|
87 |
+
|
88 |
+
embeddings = words_embeddings + position_embeddings + token_type_embeddings
|
89 |
+
embeddings = self.LayerNorm(embeddings)
|
90 |
+
embeddings = self.dropout(embeddings)
|
91 |
+
return embeddings
|
92 |
+
|
93 |
+
|
94 |
+
class BertSelfAttention(nn.Module):
|
95 |
+
def __init__(self, config):
|
96 |
+
super(BertSelfAttention, self).__init__()
|
97 |
+
if config.hidden_size % config.num_attention_heads != 0:
|
98 |
+
raise ValueError(
|
99 |
+
"The hidden size (%d) is not a multiple of the number of attention "
|
100 |
+
"heads (%d)" % (config.hidden_size, config.num_attention_heads))
|
101 |
+
self.output_attentions = config.output_attentions
|
102 |
+
|
103 |
+
self.num_attention_heads = config.num_attention_heads
|
104 |
+
self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
|
105 |
+
self.all_head_size = self.num_attention_heads * self.attention_head_size
|
106 |
+
|
107 |
+
self.query = nn.Linear(config.hidden_size, self.all_head_size)
|
108 |
+
self.key = nn.Linear(config.hidden_size, self.all_head_size)
|
109 |
+
self.value = nn.Linear(config.hidden_size, self.all_head_size)
|
110 |
+
|
111 |
+
self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
|
112 |
+
|
113 |
+
def transpose_for_scores(self, x):
|
114 |
+
new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
|
115 |
+
x = x.view(*new_x_shape)
|
116 |
+
return x.permute(0, 2, 1, 3)
|
117 |
+
|
118 |
+
def forward(self, hidden_states, attention_mask=None, head_mask=None):
|
119 |
+
mixed_query_layer = self.query(hidden_states)
|
120 |
+
mixed_key_layer = self.key(hidden_states)
|
121 |
+
mixed_value_layer = self.value(hidden_states)
|
122 |
+
|
123 |
+
query_layer = self.transpose_for_scores(mixed_query_layer)
|
124 |
+
key_layer = self.transpose_for_scores(mixed_key_layer)
|
125 |
+
value_layer = self.transpose_for_scores(mixed_value_layer)
|
126 |
+
|
127 |
+
# Take the dot product between "query" and "key" to get the raw attention scores.
|
128 |
+
attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
|
129 |
+
attention_scores = attention_scores / math.sqrt(self.attention_head_size)
|
130 |
+
if attention_mask is not None:
|
131 |
+
# Apply the attention mask is (precomputed for all layers in BertModel forward() function)
|
132 |
+
attention_scores = attention_scores + attention_mask
|
133 |
+
|
134 |
+
# Normalize the attention scores to probabilities.
|
135 |
+
attention_probs = nn.Softmax(dim=-1)(attention_scores)
|
136 |
+
|
137 |
+
# This is actually dropping out entire tokens to attend to, which might
|
138 |
+
# seem a bit unusual, but is taken from the original Transformer paper.
|
139 |
+
attention_probs = self.dropout(attention_probs)
|
140 |
+
|
141 |
+
# Mask heads if we want to
|
142 |
+
if head_mask is not None:
|
143 |
+
attention_probs = attention_probs * head_mask
|
144 |
+
|
145 |
+
context_layer = torch.matmul(attention_probs, value_layer)
|
146 |
+
|
147 |
+
context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
|
148 |
+
new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
|
149 |
+
context_layer = context_layer.view(*new_context_layer_shape)
|
150 |
+
|
151 |
+
outputs = (context_layer, attention_probs) if self.output_attentions else (context_layer,)
|
152 |
+
return outputs
|
153 |
+
|
154 |
+
|
155 |
+
class BertSelfOutput(nn.Module):
|
156 |
+
def __init__(self, config):
|
157 |
+
super(BertSelfOutput, self).__init__()
|
158 |
+
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
|
159 |
+
self.LayerNorm = BertLayerNorm(config.hidden_size, eps=config.layer_norm_eps)
|
160 |
+
self.dropout = nn.Dropout(config.hidden_dropout_prob)
|
161 |
+
|
162 |
+
def forward(self, hidden_states, input_tensor):
|
163 |
+
hidden_states = self.dense(hidden_states)
|
164 |
+
hidden_states = self.dropout(hidden_states)
|
165 |
+
hidden_states = self.LayerNorm(hidden_states + input_tensor)
|
166 |
+
return hidden_states
|
167 |
+
|
168 |
+
|
169 |
+
class BertAttention(nn.Module):
|
170 |
+
def __init__(self, config):
|
171 |
+
super(BertAttention, self).__init__()
|
172 |
+
self.self = BertSelfAttention(config) if not config.use_flash_attention else FlashMHA(config.hidden_size, config.num_attention_heads)
|
173 |
+
self.output = BertSelfOutput(config) if not config.use_flash_attention else BertSelfOutputForFlashAttention(config)
|
174 |
+
self.pruned_heads = set()
|
175 |
+
self.config = config
|
176 |
+
|
177 |
+
def forward(self, input_tensor, attention_mask=None, head_mask=None):
|
178 |
+
if not self.config.use_flash_attention:
|
179 |
+
self_outputs = self.self(input_tensor, attention_mask, head_mask)
|
180 |
+
else:
|
181 |
+
key_padding_mask = self.get_key_padding_mask(attention_mask)
|
182 |
+
self_outputs = self.self(input_tensor, key_padding_mask=key_padding_mask)
|
183 |
+
attention_output = self.output(self_outputs[0], input_tensor)
|
184 |
+
outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them
|
185 |
+
return outputs
|
186 |
+
|
187 |
+
def get_key_padding_mask(self, attention_mask):
|
188 |
+
# key_padding_mask: bool tensor of shape (batch, seqlen)
|
189 |
+
return attention_mask.squeeze(1).squeeze(1) == 0
|
190 |
+
|
191 |
+
|
192 |
+
class BertIntermediate(nn.Module):
|
193 |
+
def __init__(self, config):
|
194 |
+
super(BertIntermediate, self).__init__()
|
195 |
+
self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
|
196 |
+
if isinstance(config.hidden_act, str) or (sys.version_info[0] == 2 and isinstance(config.hidden_act, unicode)):
|
197 |
+
self.intermediate_act_fn = ACT2FN[config.hidden_act]
|
198 |
+
else:
|
199 |
+
self.intermediate_act_fn = config.hidden_act
|
200 |
+
|
201 |
+
def forward(self, hidden_states):
|
202 |
+
hidden_states = self.dense(hidden_states)
|
203 |
+
hidden_states = self.intermediate_act_fn(hidden_states)
|
204 |
+
return hidden_states
|
205 |
+
|
206 |
+
|
207 |
+
class BertOutput(nn.Module):
|
208 |
+
def __init__(self, config):
|
209 |
+
super(BertOutput, self).__init__()
|
210 |
+
self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
|
211 |
+
self.LayerNorm = BertLayerNorm(config.hidden_size, eps=config.layer_norm_eps)
|
212 |
+
self.dropout = nn.Dropout(config.hidden_dropout_prob)
|
213 |
+
|
214 |
+
def forward(self, hidden_states, input_tensor):
|
215 |
+
hidden_states = self.dense(hidden_states)
|
216 |
+
hidden_states = self.dropout(hidden_states)
|
217 |
+
hidden_states = self.LayerNorm(hidden_states + input_tensor)
|
218 |
+
return hidden_states
|
219 |
+
|
220 |
+
|
221 |
+
class BertSelfOutputForFlashAttention(nn.Module): # remove linear layer
|
222 |
+
def __init__(self, config):
|
223 |
+
super(BertSelfOutputForFlashAttention, self).__init__()
|
224 |
+
self.LayerNorm = BertLayerNorm(config.hidden_size, eps=config.layer_norm_eps)
|
225 |
+
self.dropout = nn.Dropout(config.hidden_dropout_prob)
|
226 |
+
|
227 |
+
def forward(self, hidden_states, input_tensor):
|
228 |
+
hidden_states = self.dropout(hidden_states)
|
229 |
+
hidden_states = self.LayerNorm(hidden_states + input_tensor)
|
230 |
+
return hidden_states
|
231 |
+
|
232 |
+
|
233 |
+
class BertLayer(nn.Module):
|
234 |
+
def __init__(self, config):
|
235 |
+
super(BertLayer, self).__init__()
|
236 |
+
self.attention = BertAttention(config)
|
237 |
+
self.intermediate = BertIntermediate(config)
|
238 |
+
self.output = BertOutput(config)
|
239 |
+
|
240 |
+
def forward(self, hidden_states, attention_mask=None, head_mask=None):
|
241 |
+
attention_outputs = self.attention(hidden_states, attention_mask, head_mask)
|
242 |
+
attention_output = attention_outputs[0]
|
243 |
+
intermediate_output = self.intermediate(attention_output)
|
244 |
+
layer_output = self.output(intermediate_output, attention_output)
|
245 |
+
outputs = (layer_output,) + attention_outputs[1:] # add attentions if we output them
|
246 |
+
if len(outputs) == 1:
|
247 |
+
return outputs[0]
|
248 |
+
return outputs
|
249 |
+
|
250 |
+
|
251 |
+
class BertEncoder(nn.Module):
|
252 |
+
def __init__(self, config):
|
253 |
+
super(BertEncoder, self).__init__()
|
254 |
+
self.output_attentions = config.output_attentions
|
255 |
+
self.output_hidden_states = config.output_hidden_states
|
256 |
+
self.grad_checkpointing = False
|
257 |
+
self.layer = nn.ModuleList([BertLayer(config) for _ in range(config.num_hidden_layers)])
|
258 |
+
|
259 |
+
def forward(self, hidden_states, attention_mask=None, head_mask=None):
|
260 |
+
all_hidden_states = ()
|
261 |
+
all_attentions = ()
|
262 |
+
for i, layer_module in enumerate(self.layer):
|
263 |
+
if self.output_hidden_states:
|
264 |
+
all_hidden_states = all_hidden_states + (hidden_states,)
|
265 |
+
|
266 |
+
if self.grad_checkpointing and not torch.jit.is_scripting():
|
267 |
+
layer_outputs = checkpoint(layer_module, hidden_states, attention_mask, head_mask[i])
|
268 |
+
else:
|
269 |
+
layer_outputs = layer_module(hidden_states, attention_mask, head_mask[i])
|
270 |
+
if not isinstance(layer_outputs, tuple):
|
271 |
+
layer_outputs = (layer_outputs, )
|
272 |
+
hidden_states = layer_outputs[0]
|
273 |
+
|
274 |
+
if self.output_attentions:
|
275 |
+
all_attentions = all_attentions + (layer_outputs[1],)
|
276 |
+
|
277 |
+
# Add last layer
|
278 |
+
if self.output_hidden_states:
|
279 |
+
all_hidden_states = all_hidden_states + (hidden_states,)
|
280 |
+
|
281 |
+
outputs = (hidden_states,)
|
282 |
+
if self.output_hidden_states:
|
283 |
+
outputs = outputs + (all_hidden_states,)
|
284 |
+
if self.output_attentions:
|
285 |
+
outputs = outputs + (all_attentions,)
|
286 |
+
return outputs # last-layer hidden state, (all hidden states), (all attentions)
|
287 |
+
|
288 |
+
|
289 |
+
class BertPooler(nn.Module):
|
290 |
+
def __init__(self, config):
|
291 |
+
super(BertPooler, self).__init__()
|
292 |
+
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
|
293 |
+
self.activation = nn.Tanh()
|
294 |
+
|
295 |
+
def forward(self, hidden_states):
|
296 |
+
# We "pool" the model by simply taking the hidden state corresponding
|
297 |
+
# to the first token.
|
298 |
+
first_token_tensor = hidden_states[:, 0]
|
299 |
+
pooled_output = self.dense(first_token_tensor)
|
300 |
+
pooled_output = self.activation(pooled_output)
|
301 |
+
return pooled_output
|
302 |
+
|
303 |
+
|
304 |
+
class BertPredictionHeadTransform(nn.Module):
|
305 |
+
def __init__(self, config):
|
306 |
+
super(BertPredictionHeadTransform, self).__init__()
|
307 |
+
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
|
308 |
+
if isinstance(config.hidden_act, str) or (sys.version_info[0] == 2 and isinstance(config.hidden_act, unicode)):
|
309 |
+
self.transform_act_fn = ACT2FN[config.hidden_act]
|
310 |
+
else:
|
311 |
+
self.transform_act_fn = config.hidden_act
|
312 |
+
self.LayerNorm = BertLayerNorm(config.hidden_size, eps=config.layer_norm_eps)
|
313 |
+
|
314 |
+
def forward(self, hidden_states):
|
315 |
+
hidden_states = self.dense(hidden_states)
|
316 |
+
hidden_states = self.transform_act_fn(hidden_states)
|
317 |
+
hidden_states = self.LayerNorm(hidden_states)
|
318 |
+
return hidden_states
|
319 |
+
|
320 |
+
|
321 |
+
class BertLMPredictionHead(nn.Module):
|
322 |
+
def __init__(self, config):
|
323 |
+
super(BertLMPredictionHead, self).__init__()
|
324 |
+
self.transform = BertPredictionHeadTransform(config)
|
325 |
+
|
326 |
+
# The output weights are the same as the input embeddings, but there is
|
327 |
+
# an output-only bias for each token.
|
328 |
+
self.decoder = nn.Linear(config.hidden_size,
|
329 |
+
config.vocab_size,
|
330 |
+
bias=False)
|
331 |
+
|
332 |
+
self.bias = nn.Parameter(torch.zeros(config.vocab_size))
|
333 |
+
|
334 |
+
def forward(self, hidden_states):
|
335 |
+
hidden_states = self.transform(hidden_states)
|
336 |
+
hidden_states = self.decoder(hidden_states) + self.bias
|
337 |
+
return hidden_states
|
338 |
+
|
339 |
+
|
340 |
+
class BertOnlyMLMHead(nn.Module):
|
341 |
+
def __init__(self, config):
|
342 |
+
super(BertOnlyMLMHead, self).__init__()
|
343 |
+
self.predictions = BertLMPredictionHead(config)
|
344 |
+
|
345 |
+
def forward(self, sequence_output):
|
346 |
+
prediction_scores = self.predictions(sequence_output)
|
347 |
+
return prediction_scores
|
348 |
+
|
349 |
+
|
350 |
+
class BertOnlyNSPHead(nn.Module):
|
351 |
+
def __init__(self, config):
|
352 |
+
super(BertOnlyNSPHead, self).__init__()
|
353 |
+
self.seq_relationship = nn.Linear(config.hidden_size, 2)
|
354 |
+
|
355 |
+
def forward(self, pooled_output):
|
356 |
+
seq_relationship_score = self.seq_relationship(pooled_output)
|
357 |
+
return seq_relationship_score
|
358 |
+
|
359 |
+
|
360 |
+
class BertPreTrainingHeads(nn.Module):
|
361 |
+
def __init__(self, config):
|
362 |
+
super(BertPreTrainingHeads, self).__init__()
|
363 |
+
self.predictions = BertLMPredictionHead(config)
|
364 |
+
self.seq_relationship = nn.Linear(config.hidden_size, 2)
|
365 |
+
|
366 |
+
def forward(self, sequence_output, pooled_output):
|
367 |
+
prediction_scores = self.predictions(sequence_output)
|
368 |
+
seq_relationship_score = self.seq_relationship(pooled_output)
|
369 |
+
return prediction_scores, seq_relationship_score
|
370 |
+
|
371 |
+
|
372 |
+
class BertPreTrainedModel(nn.Module):
|
373 |
+
config_class = BertConfig
|
374 |
+
base_model_prefix = "bert"
|
375 |
+
|
376 |
+
def __init__(self, config):
|
377 |
+
super(BertPreTrainedModel, self).__init__()
|
378 |
+
self.config = config
|
379 |
+
|
380 |
+
def _init_weights(self, module):
|
381 |
+
""" Initialize the weights """
|
382 |
+
if isinstance(module, (nn.Linear, nn.Embedding)):
|
383 |
+
# Slightly different from the TF version which uses truncated_normal for initialization
|
384 |
+
# cf https://github.com/pytorch/pytorch/pull/5617
|
385 |
+
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
|
386 |
+
elif isinstance(module, BertLayerNorm):
|
387 |
+
module.bias.data.zero_()
|
388 |
+
module.weight.data.fill_(1.0)
|
389 |
+
if isinstance(module, nn.Linear) and module.bias is not None:
|
390 |
+
module.bias.data.zero_()
|
391 |
+
|
392 |
+
|
393 |
+
class BertModel(BertPreTrainedModel):
|
394 |
+
r"""
|
395 |
+
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
|
396 |
+
**last_hidden_state**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, hidden_size)``
|
397 |
+
Sequence of hidden-states at the output of the last layer of the model.
|
398 |
+
**pooler_output**: ``torch.FloatTensor`` of shape ``(batch_size, hidden_size)``
|
399 |
+
Last layer hidden-state of the first token of the sequence (classification token)
|
400 |
+
further processed by a Linear layer and a Tanh activation function. The Linear
|
401 |
+
layer weights are trained from the next sentence prediction (classification)
|
402 |
+
objective during Bert pretraining. This output is usually *not* a good summary
|
403 |
+
of the semantic content of the input, you're often better with averaging or pooling
|
404 |
+
the sequence of hidden-states for the whole input sequence.
|
405 |
+
**hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
|
406 |
+
list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
|
407 |
+
of shape ``(batch_size, sequence_length, hidden_size)``:
|
408 |
+
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
|
409 |
+
**attentions**: (`optional`, returned when ``config.output_attentions=True``)
|
410 |
+
list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
|
411 |
+
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
|
412 |
+
|
413 |
+
Examples::
|
414 |
+
|
415 |
+
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
|
416 |
+
model = BertModel.from_pretrained('bert-base-uncased')
|
417 |
+
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1
|
418 |
+
outputs = model(input_ids)
|
419 |
+
last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
|
420 |
+
|
421 |
+
"""
|
422 |
+
def __init__(self, config):
|
423 |
+
super(BertModel, self).__init__(config)
|
424 |
+
|
425 |
+
self.embeddings = BertEmbeddings(config)
|
426 |
+
self.encoder = BertEncoder(config)
|
427 |
+
# self.pooler = BertPooler(config)
|
428 |
+
|
429 |
+
self.apply(self._init_weights)
|
430 |
+
|
431 |
+
@torch.jit.ignore
|
432 |
+
def set_grad_checkpointing(self, enable=True):
|
433 |
+
if enable:
|
434 |
+
assert not self.config.output_attentions, \
|
435 |
+
"Grad checkpointing is currently conflict with output_attentions for BertEncoder, \
|
436 |
+
please set it to False in BertConfig"
|
437 |
+
self.encoder.grad_checkpointing = enable
|
438 |
+
|
439 |
+
def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None):
|
440 |
+
if attention_mask is None:
|
441 |
+
attention_mask = torch.ones_like(input_ids)
|
442 |
+
if token_type_ids is None:
|
443 |
+
token_type_ids = torch.zeros_like(input_ids)
|
444 |
+
|
445 |
+
# We create a 3D attention mask from a 2D tensor mask.
|
446 |
+
# Sizes are [batch_size, 1, 1, to_seq_length]
|
447 |
+
# So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length]
|
448 |
+
# this attention mask is more simple than the triangular masking of causal attention
|
449 |
+
# used in OpenAI GPT, we just need to prepare the broadcast dimension here.
|
450 |
+
extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)
|
451 |
+
|
452 |
+
# Since attention_mask is 1.0 for positions we want to attend and 0.0 for
|
453 |
+
# masked positions, this operation will create a tensor which is 0.0 for
|
454 |
+
# positions we want to attend and -10000.0 for masked positions.
|
455 |
+
# Since we are adding it to the raw scores before the softmax, this is
|
456 |
+
# effectively the same as removing these entirely.
|
457 |
+
extended_attention_mask = extended_attention_mask.to(dtype=next(self.parameters()).dtype) # fp16 compatibility
|
458 |
+
extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0
|
459 |
+
|
460 |
+
# Prepare head mask if needed
|
461 |
+
# 1.0 in head_mask indicate we keep the head
|
462 |
+
# attention_probs has shape bsz x n_heads x N x N
|
463 |
+
# input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
|
464 |
+
# and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
|
465 |
+
if head_mask is not None:
|
466 |
+
if head_mask.dim() == 1:
|
467 |
+
head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1)
|
468 |
+
head_mask = head_mask.expand(self.config.num_hidden_layers, -1, -1, -1, -1)
|
469 |
+
elif head_mask.dim() == 2:
|
470 |
+
head_mask = head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1) # We can specify head_mask for each layer
|
471 |
+
head_mask = head_mask.to(dtype=next(self.parameters()).dtype) # switch to fload if need + fp16 compatibility
|
472 |
+
else:
|
473 |
+
head_mask = [None] * self.config.num_hidden_layers
|
474 |
+
|
475 |
+
embedding_output = self.embeddings(input_ids, position_ids=position_ids, token_type_ids=token_type_ids)
|
476 |
+
encoder_outputs = self.encoder(embedding_output,
|
477 |
+
extended_attention_mask,
|
478 |
+
head_mask=head_mask)
|
479 |
+
sequence_output = encoder_outputs[0]
|
480 |
+
# pooled_output = self.pooler(sequence_output)
|
481 |
+
pooled_output = None
|
482 |
+
|
483 |
+
outputs = (sequence_output, pooled_output,) + encoder_outputs[1:] # add hidden_states and attentions if they are here
|
484 |
+
return outputs # sequence_output, pooled_output, (hidden_states), (attentions)
|
clip/utils.py
ADDED
@@ -0,0 +1,184 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Code modified from https://github.com/openai/CLIP
|
2 |
+
|
3 |
+
import json
|
4 |
+
import os
|
5 |
+
from pathlib import Path
|
6 |
+
from typing import Union, List
|
7 |
+
import urllib
|
8 |
+
|
9 |
+
import torch
|
10 |
+
from torchvision.transforms import Compose, ToTensor, Normalize, Resize, InterpolationMode
|
11 |
+
from tqdm import tqdm
|
12 |
+
|
13 |
+
from clip import _tokenizer
|
14 |
+
from clip.model import convert_weights, CLIP, restore_model
|
15 |
+
|
16 |
+
__all__ = ["load", "tokenize", "available_models", "image_transform", "load_from_name"]
|
17 |
+
|
18 |
+
_MODELS = {
|
19 |
+
"ViT-B-16": "https://huggingface.co/TencentARC/QA-CLIP/resolve/main/QA-CLIP-base.pt",
|
20 |
+
"ViT-L-14": "https://huggingface.co/TencentARC/QA-CLIP/resolve/main/QA-CLIP-large.pt",
|
21 |
+
"RN50": "https://huggingface.co/TencentARC/QA-CLIP/resolve/main/QA-CLIP-RN50.pt",
|
22 |
+
}
|
23 |
+
_MODEL_INFO = {
|
24 |
+
"ViT-B-16": {
|
25 |
+
"struct": "ViT-B-16@RoBERTa-wwm-ext-base-chinese",
|
26 |
+
"input_resolution": 224
|
27 |
+
},
|
28 |
+
"ViT-L-14": {
|
29 |
+
"struct": "ViT-L-14@RoBERTa-wwm-ext-base-chinese",
|
30 |
+
"input_resolution": 224
|
31 |
+
},
|
32 |
+
"RN50": {
|
33 |
+
"struct": "RN50@RBT3-chinese",
|
34 |
+
"input_resolution": 224
|
35 |
+
},
|
36 |
+
}
|
37 |
+
|
38 |
+
|
39 |
+
def _download(url: str, root: str):
|
40 |
+
os.makedirs(root, exist_ok=True)
|
41 |
+
filename = os.path.basename(url)
|
42 |
+
|
43 |
+
download_target = os.path.join(root, filename)
|
44 |
+
|
45 |
+
if os.path.exists(download_target) and not os.path.isfile(download_target):
|
46 |
+
raise RuntimeError(f"{download_target} exists and is not a regular file")
|
47 |
+
|
48 |
+
if os.path.isfile(download_target):
|
49 |
+
return download_target
|
50 |
+
|
51 |
+
with urllib.request.urlopen(url) as source, open(download_target, "wb") as output:
|
52 |
+
with tqdm(total=int(source.info().get("Content-Length")), ncols=80, unit='iB', unit_scale=True,
|
53 |
+
unit_divisor=1024) as loop:
|
54 |
+
while True:
|
55 |
+
buffer = source.read(8192)
|
56 |
+
if not buffer:
|
57 |
+
break
|
58 |
+
|
59 |
+
output.write(buffer)
|
60 |
+
loop.update(len(buffer))
|
61 |
+
|
62 |
+
return download_target
|
63 |
+
|
64 |
+
|
65 |
+
def _convert_image_to_rgb(image):
|
66 |
+
return image.convert("RGB")
|
67 |
+
|
68 |
+
|
69 |
+
def available_models() -> List[str]:
|
70 |
+
"""Returns the names of available CLIP models"""
|
71 |
+
return list(_MODELS.keys())
|
72 |
+
|
73 |
+
|
74 |
+
def load_from_name(name: str, device: Union[str, torch.device] = "cuda" if torch.cuda.is_available() else "cpu",
|
75 |
+
download_root: str = None, vision_model_name: str = None, text_model_name: str = None, input_resolution: int = None):
|
76 |
+
if name in _MODELS:
|
77 |
+
model_path = _download(_MODELS[name], download_root or os.path.expanduser("~/.cache/clip"))
|
78 |
+
model_name, model_input_resolution = _MODEL_INFO[name]['struct'], _MODEL_INFO[name]['input_resolution']
|
79 |
+
elif os.path.isfile(name):
|
80 |
+
assert vision_model_name and text_model_name and input_resolution, "Please specify specific 'vision_model_name', 'text_model_name', and 'input_resolution'"
|
81 |
+
model_path = name
|
82 |
+
model_name, model_input_resolution = f'{vision_model_name}@{text_model_name}', input_resolution
|
83 |
+
else:
|
84 |
+
raise RuntimeError(f"Model {name} not found; available models = {available_models()}")
|
85 |
+
|
86 |
+
with open(model_path, 'rb') as opened_file:
|
87 |
+
# loading saved checkpoint
|
88 |
+
checkpoint = torch.load(opened_file, map_location="cpu")
|
89 |
+
|
90 |
+
model = create_model(model_name, checkpoint)
|
91 |
+
if str(device) == "cpu":
|
92 |
+
model.float()
|
93 |
+
else:
|
94 |
+
model.to(device)
|
95 |
+
return model, image_transform(model_input_resolution)
|
96 |
+
|
97 |
+
|
98 |
+
def load(model, device: Union[str, torch.device] = "cuda" if torch.cuda.is_available() else "cpu", clip_path=None,
|
99 |
+
bert_path=None, use_flash_attention=False):
|
100 |
+
"""Load CLIP and BERT model weights
|
101 |
+
"""
|
102 |
+
|
103 |
+
bert_state_dict = torch.load(bert_path, map_location="cpu") if bert_path else None
|
104 |
+
clip_state_dict = torch.load(clip_path, map_location="cpu") if clip_path else None
|
105 |
+
|
106 |
+
restore_model(model, clip_state_dict, bert_state_dict, use_flash_attention).to(device)
|
107 |
+
|
108 |
+
if str(device) == "cpu":
|
109 |
+
model.float()
|
110 |
+
return model
|
111 |
+
|
112 |
+
|
113 |
+
def tokenize(texts: Union[str, List[str]], context_length: int = 52) -> torch.LongTensor:
|
114 |
+
"""
|
115 |
+
Returns the tokenized representation of given input string(s)
|
116 |
+
Parameters
|
117 |
+
----------
|
118 |
+
texts : Union[str, List[str]]
|
119 |
+
An input string or a list of input strings to tokenize
|
120 |
+
context_length : int
|
121 |
+
The context length to use; all baseline models use 52 as the context length
|
122 |
+
Returns
|
123 |
+
-------
|
124 |
+
A two-dimensional tensor containing the resulting tokens, shape = [number of input strings, context_length]
|
125 |
+
"""
|
126 |
+
if isinstance(texts, str):
|
127 |
+
texts = [texts]
|
128 |
+
|
129 |
+
all_tokens = []
|
130 |
+
for text in texts:
|
131 |
+
all_tokens.append([_tokenizer.vocab['[CLS]']] + _tokenizer.convert_tokens_to_ids(_tokenizer.tokenize(text))[
|
132 |
+
:context_length - 2] + [_tokenizer.vocab['[SEP]']])
|
133 |
+
|
134 |
+
result = torch.zeros(len(all_tokens), context_length, dtype=torch.long)
|
135 |
+
|
136 |
+
for i, tokens in enumerate(all_tokens):
|
137 |
+
assert len(tokens) <= context_length
|
138 |
+
result[i, :len(tokens)] = torch.tensor(tokens)
|
139 |
+
|
140 |
+
return result
|
141 |
+
|
142 |
+
|
143 |
+
def _convert_to_rgb(image):
|
144 |
+
return image.convert('RGB')
|
145 |
+
|
146 |
+
|
147 |
+
def image_transform(image_size=224):
|
148 |
+
transform = Compose([
|
149 |
+
Resize((image_size, image_size), interpolation=InterpolationMode.BICUBIC),
|
150 |
+
_convert_to_rgb,
|
151 |
+
ToTensor(),
|
152 |
+
Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711)),
|
153 |
+
])
|
154 |
+
return transform
|
155 |
+
|
156 |
+
|
157 |
+
def create_model(model_name, checkpoint=None):
|
158 |
+
vision_model, text_model = model_name.split('@')
|
159 |
+
# Initialize the model.
|
160 |
+
vision_model_config_file = Path(
|
161 |
+
__file__).parent / f"model_configs/{vision_model.replace('/', '-')}.json"
|
162 |
+
print('Loading vision model config from', vision_model_config_file)
|
163 |
+
assert os.path.exists(vision_model_config_file)
|
164 |
+
|
165 |
+
text_model_config_file = Path(
|
166 |
+
__file__).parent / f"model_configs/{text_model.replace('/', '-')}.json"
|
167 |
+
print('Loading text model config from', text_model_config_file)
|
168 |
+
assert os.path.exists(text_model_config_file)
|
169 |
+
|
170 |
+
with open(vision_model_config_file, 'r') as fv, open(text_model_config_file, 'r') as ft:
|
171 |
+
model_info = json.load(fv)
|
172 |
+
for k, v in json.load(ft).items():
|
173 |
+
model_info[k] = v
|
174 |
+
if isinstance(model_info['vision_layers'], str):
|
175 |
+
model_info['vision_layers'] = eval(model_info['vision_layers'])
|
176 |
+
print('Model info', model_info)
|
177 |
+
model = CLIP(**model_info)
|
178 |
+
convert_weights(model)
|
179 |
+
if checkpoint:
|
180 |
+
sd = checkpoint["state_dict"]
|
181 |
+
if next(iter(sd.items()))[0].startswith('module'):
|
182 |
+
sd = {k[len('module.'):]: v for k, v in sd.items() if "bert.pooler" not in k}
|
183 |
+
model.load_state_dict(sd)
|
184 |
+
return model
|
clip/vocab.txt
ADDED
The diff for this file is too large to render.
See raw diff
|
|
eval/cvinw_zeroshot_templates.py
ADDED
@@ -0,0 +1,474 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
This script provides templates for manual prompting for zero-shot image classification.
|
3 |
+
"""
|
4 |
+
|
5 |
+
|
6 |
+
openai_templates = [
|
7 |
+
lambda c: f"{c}的照片",
|
8 |
+
lambda c: f"质量差的{c}的照片",
|
9 |
+
lambda c: f"许多{c}的照片",
|
10 |
+
lambda c: f"{c}的雕塑",
|
11 |
+
lambda c: f"难以看到{c}的照片",
|
12 |
+
lambda c: f"{c}的低分辨率照片",
|
13 |
+
lambda c: f"{c}的渲染",
|
14 |
+
lambda c: f"涂鸦{c}",
|
15 |
+
lambda c: f"{c}的糟糕照片",
|
16 |
+
lambda c: f"{c}的裁剪照片",
|
17 |
+
lambda c: f"{c}的纹身",
|
18 |
+
lambda c: f"{c}的刺绣照片",
|
19 |
+
lambda c: f"很难看到{c}的照片",
|
20 |
+
lambda c: f"{c}的明亮照片",
|
21 |
+
lambda c: f"一张干净的{c}的照片",
|
22 |
+
lambda c: f"一张包含{c}的照片",
|
23 |
+
lambda c: f"{c}的深色照片",
|
24 |
+
lambda c: f"{c}的手绘画",
|
25 |
+
lambda c: f"我的{c}的照片",
|
26 |
+
lambda c: f"不自然的{c}的照片",
|
27 |
+
lambda c: f"一张酷的{c}的照片",
|
28 |
+
lambda c: f"{c}的特写照片",
|
29 |
+
lambda c: f"{c}的黑白照片",
|
30 |
+
lambda c: f"一幅{c}的画",
|
31 |
+
lambda c: f"一幅{c}的绘画",
|
32 |
+
lambda c: f"一张{c}的像素照片",
|
33 |
+
lambda c: f"{c}的雕像",
|
34 |
+
lambda c: f"一张{c}的明亮照片",
|
35 |
+
lambda c: f"{c}的裁剪照片",
|
36 |
+
lambda c: f"人造的{c}的照片",
|
37 |
+
lambda c: f"一张关于{c}的照片",
|
38 |
+
lambda c: f"损坏的{c}的jpeg照片",
|
39 |
+
lambda c: f"{c}的模糊照片",
|
40 |
+
lambda c: f"{c}的相片",
|
41 |
+
lambda c: f"一张{c}的好照片",
|
42 |
+
lambda c: f"{c}的渲染照",
|
43 |
+
lambda c: f"视频游戏中的{c}",
|
44 |
+
lambda c: f"一张{c}的照片",
|
45 |
+
lambda c: f"{c}的涂鸦",
|
46 |
+
lambda c: f"{c}的近距离照片",
|
47 |
+
lambda c: f"{c}的折纸",
|
48 |
+
lambda c: f"{c}在视频游戏中",
|
49 |
+
lambda c: f"{c}的草图",
|
50 |
+
lambda c: f"{c}的涂鸦照",
|
51 |
+
lambda c: f"{c}的折纸形状",
|
52 |
+
lambda c: f"低分辨率的{c}的照片",
|
53 |
+
lambda c: f"玩具{c}",
|
54 |
+
lambda c: f"{c}的副本",
|
55 |
+
lambda c: f"{c}的干净的照片",
|
56 |
+
lambda c: f"一张大{c}的照片",
|
57 |
+
lambda c: f"{c}的重现",
|
58 |
+
lambda c: f"一张漂亮的{c}的照片",
|
59 |
+
lambda c: f"一张奇怪的{c}的照片",
|
60 |
+
lambda c: f"模糊的{c}的照片",
|
61 |
+
lambda c: f"卡通{c}",
|
62 |
+
lambda c: f"{c}的艺术作品",
|
63 |
+
lambda c: f"{c}的素描",
|
64 |
+
lambda c: f"刺绣{c}",
|
65 |
+
lambda c: f"{c}的像素照",
|
66 |
+
lambda c: f"{c}的拍照",
|
67 |
+
lambda c: f"{c}的损坏的照片",
|
68 |
+
lambda c: f"高质量的{c}的照片",
|
69 |
+
lambda c: f"毛绒玩具{c}",
|
70 |
+
lambda c: f"漂亮的{c}的照片",
|
71 |
+
lambda c: f"小{c}的照片",
|
72 |
+
lambda c: f"照片是奇怪的{c}",
|
73 |
+
lambda c: f"漫画{c}",
|
74 |
+
lambda c: f"{c}的艺术照",
|
75 |
+
lambda c: f"{c}的图形",
|
76 |
+
lambda c: f"大{c}的照片",
|
77 |
+
lambda c: f"黑白的{c}的照片",
|
78 |
+
lambda c: f"{c}毛绒玩具",
|
79 |
+
lambda c: f"一张{c}的深色照片",
|
80 |
+
lambda c: f"{c}的摄影图",
|
81 |
+
lambda c: f"{c}的涂鸦照",
|
82 |
+
lambda c: f"玩具形状的{c}",
|
83 |
+
lambda c: f"拍了{c}的照片",
|
84 |
+
lambda c: f"酷酷的{c}的照片",
|
85 |
+
lambda c: f"照片里的小{c}",
|
86 |
+
lambda c: f"{c}的刺青",
|
87 |
+
lambda c: f"{c}的可爱的照片",
|
88 |
+
lambda c: f"一张{c}可爱的照片",
|
89 |
+
lambda c: f"{c}可爱图片",
|
90 |
+
lambda c: f"{c}酷炫图片",
|
91 |
+
lambda c: f"一张{c}的酷炫的照片",
|
92 |
+
lambda c: f"一张{c}的酷炫图片",
|
93 |
+
lambda c: f"这是{c}",
|
94 |
+
lambda c: f"{c}的好看照片",
|
95 |
+
lambda c: f"一张{c}的好看的图片",
|
96 |
+
lambda c: f"{c}的好看图片",
|
97 |
+
lambda c: f"{c}的照片。",
|
98 |
+
lambda c: f"质量差的{c}的照片。",
|
99 |
+
lambda c: f"许多{c}的照片。",
|
100 |
+
lambda c: f"{c}的雕塑。",
|
101 |
+
lambda c: f"难以看到{c}的照片。",
|
102 |
+
lambda c: f"{c}的低分辨率照片。",
|
103 |
+
lambda c: f"{c}的渲染。",
|
104 |
+
lambda c: f"涂鸦{c}。",
|
105 |
+
lambda c: f"{c}的糟糕照片。",
|
106 |
+
lambda c: f"{c}的裁剪照片。",
|
107 |
+
lambda c: f"{c}的纹身。",
|
108 |
+
lambda c: f"{c}的刺绣照片。",
|
109 |
+
lambda c: f"很难看到{c}的照片。",
|
110 |
+
lambda c: f"{c}的明亮照片。",
|
111 |
+
lambda c: f"一张干净的{c}的照片。",
|
112 |
+
lambda c: f"一张包含{c}的照片。",
|
113 |
+
lambda c: f"{c}的深色照片。",
|
114 |
+
lambda c: f"{c}的手绘画。",
|
115 |
+
lambda c: f"我的{c}的照片。",
|
116 |
+
lambda c: f"不自然的{c}的照片。",
|
117 |
+
lambda c: f"一张酷的{c}的照片。",
|
118 |
+
lambda c: f"{c}的特写照片。",
|
119 |
+
lambda c: f"{c}的黑白照片。",
|
120 |
+
lambda c: f"一幅{c}的画。",
|
121 |
+
lambda c: f"一幅{c}的绘画。",
|
122 |
+
lambda c: f"一张{c}的像素照片。",
|
123 |
+
lambda c: f"{c}的雕像。",
|
124 |
+
lambda c: f"一张{c}的明亮照片。",
|
125 |
+
lambda c: f"{c}的裁剪照片。",
|
126 |
+
lambda c: f"人造的{c}的照片。",
|
127 |
+
lambda c: f"一张关于{c}的照片。",
|
128 |
+
lambda c: f"损坏的{c}的jpeg照片。",
|
129 |
+
lambda c: f"{c}的模糊照片。",
|
130 |
+
lambda c: f"{c}的相片。",
|
131 |
+
lambda c: f"一张{c}的好照片。",
|
132 |
+
lambda c: f"{c}的渲染照。",
|
133 |
+
lambda c: f"视频游戏中的{c}。",
|
134 |
+
lambda c: f"一张{c}的照片。",
|
135 |
+
lambda c: f"{c}的涂鸦。",
|
136 |
+
lambda c: f"{c}的近距离照片。",
|
137 |
+
lambda c: f"{c}的折纸。",
|
138 |
+
lambda c: f"{c}在视频游戏中。",
|
139 |
+
lambda c: f"{c}的草图。",
|
140 |
+
lambda c: f"{c}的涂鸦照。",
|
141 |
+
lambda c: f"{c}的折纸形状。",
|
142 |
+
lambda c: f"低分辨率的{c}的照片。",
|
143 |
+
lambda c: f"玩具{c}。",
|
144 |
+
lambda c: f"{c}的副本。",
|
145 |
+
lambda c: f"{c}的干净的照片。",
|
146 |
+
lambda c: f"一张大{c}的照片。",
|
147 |
+
lambda c: f"{c}的重现。",
|
148 |
+
lambda c: f"一张漂亮的{c}的照片。",
|
149 |
+
lambda c: f"一张奇怪的{c}的照片。",
|
150 |
+
lambda c: f"模糊的{c}的照片。",
|
151 |
+
lambda c: f"卡通{c}。",
|
152 |
+
lambda c: f"{c}的艺术作品。",
|
153 |
+
lambda c: f"{c}的素描。",
|
154 |
+
lambda c: f"刺绣{c}。",
|
155 |
+
lambda c: f"{c}的像素照。",
|
156 |
+
lambda c: f"{c}的拍照。",
|
157 |
+
lambda c: f"{c}的损坏的照片。",
|
158 |
+
lambda c: f"高质量的{c}的照片。",
|
159 |
+
lambda c: f"毛绒玩具{c}。",
|
160 |
+
lambda c: f"漂亮的{c}的照片。",
|
161 |
+
lambda c: f"小{c}的照片。",
|
162 |
+
lambda c: f"照片是奇怪的{c}。",
|
163 |
+
lambda c: f"漫画{c}。",
|
164 |
+
lambda c: f"{c}的艺术照。",
|
165 |
+
lambda c: f"{c}的图形。",
|
166 |
+
lambda c: f"大{c}的照片。",
|
167 |
+
lambda c: f"黑白的{c}的照片。",
|
168 |
+
lambda c: f"{c}毛绒玩具。",
|
169 |
+
lambda c: f"一张{c}的深色照片。",
|
170 |
+
lambda c: f"{c}的摄影图。",
|
171 |
+
lambda c: f"{c}的涂鸦照。",
|
172 |
+
lambda c: f"玩具形状的{c}。",
|
173 |
+
lambda c: f"拍了{c}的照片。",
|
174 |
+
lambda c: f"酷酷的{c}的照片。",
|
175 |
+
lambda c: f"照片里的小{c}。",
|
176 |
+
lambda c: f"{c}的刺青。",
|
177 |
+
lambda c: f"{c}的可爱的照片。",
|
178 |
+
lambda c: f"一张{c}可爱的照片。",
|
179 |
+
lambda c: f"{c}可爱图片。",
|
180 |
+
lambda c: f"{c}酷炫图片。",
|
181 |
+
lambda c: f"一张{c}的酷炫的照片。",
|
182 |
+
lambda c: f"一张{c}的酷炫图片。",
|
183 |
+
lambda c: f"这是{c}。",
|
184 |
+
lambda c: f"{c}的好看照片。",
|
185 |
+
lambda c: f"一张{c}的好看的图片。",
|
186 |
+
lambda c: f"{c}的好看图片。",
|
187 |
+
lambda c: f"一种叫{c}的花的照片",
|
188 |
+
lambda c: f"一种叫{c}的食物的照片",
|
189 |
+
lambda c: f"{c}的卫星照片"
|
190 |
+
]
|
191 |
+
|
192 |
+
normal_templates = [lambda c: f"{c}的图片"]
|
193 |
+
|
194 |
+
flower_templates = [
|
195 |
+
lambda c: f"一种叫{c}的花的照片",
|
196 |
+
lambda c: f"一种叫{c}的花卉的照片",
|
197 |
+
lambda c: f"一种叫{c}的花朵的照片",
|
198 |
+
lambda c: f"一种叫{c}的鲜花的照片",
|
199 |
+
lambda c: f"一种叫{c}的花的高清图",
|
200 |
+
lambda c: f"一种叫{c}的花卉的高清图",
|
201 |
+
lambda c: f"一种叫{c}的花朵的高清图",
|
202 |
+
lambda c: f"一种叫{c}的鲜花的高清图",
|
203 |
+
lambda c: f"一种叫{c}的花的模糊图片",
|
204 |
+
lambda c: f"一种叫{c}的花朵的模糊图片",
|
205 |
+
lambda c: f"一种叫{c}的花卉的模糊图片",
|
206 |
+
lambda c: f"一种叫{c}的鲜花的模糊图片",
|
207 |
+
lambda c: f"一种叫{c}的花的缩放图片",
|
208 |
+
lambda c: f"一种叫{c}的花朵的缩放图片",
|
209 |
+
lambda c: f"一种叫{c}的花卉的缩放图片",
|
210 |
+
lambda c: f"一种叫{c}的鲜花的缩放图片",
|
211 |
+
lambda c: f"一种叫{c}的花的摄影图",
|
212 |
+
lambda c: f"一种叫{c}的花卉的摄影图",
|
213 |
+
lambda c: f"一种叫{c}的花朵的摄影图",
|
214 |
+
lambda c: f"一种叫{c}的鲜花的摄影图",
|
215 |
+
lambda c: f"一种叫{c}的花的近距离照片",
|
216 |
+
lambda c: f"一种叫{c}的花朵的近距离照片",
|
217 |
+
lambda c: f"一种叫{c}的花卉的近距离照片",
|
218 |
+
lambda c: f"一种叫{c}的鲜花的近距离照片",
|
219 |
+
lambda c: f"一种叫{c}的花的裁剪照片",
|
220 |
+
lambda c: f"一种叫{c}的花朵的裁剪照片",
|
221 |
+
lambda c: f"一种叫{c}的花卉的裁剪照片",
|
222 |
+
lambda c: f"一种叫{c}的鲜花的裁剪照片",
|
223 |
+
lambda c: f"一种叫{c}的花的好看的图片",
|
224 |
+
lambda c: f"一种叫{c}的花朵的好看的图片",
|
225 |
+
lambda c: f"一种叫{c}的花卉的好看的图片",
|
226 |
+
lambda c: f"一种叫{c}的鲜花的好看的图片",
|
227 |
+
]
|
228 |
+
|
229 |
+
food_templates = [
|
230 |
+
lambda c: f"一种叫{c}的食物的照片",
|
231 |
+
lambda c: f"一种叫{c}的美食的照片",
|
232 |
+
lambda c: f"一种叫{c}的菜的照片",
|
233 |
+
lambda c: f"一种叫{c}的食物的高清图",
|
234 |
+
lambda c: f"一种叫{c}的美食的高清图",
|
235 |
+
lambda c: f"一种叫{c}的菜的高清图",
|
236 |
+
lambda c: f"一种叫{c}的食物的模糊图片",
|
237 |
+
lambda c: f"一种叫{c}的美食的模糊图片",
|
238 |
+
lambda c: f"一种叫{c}的菜的模糊图片",
|
239 |
+
lambda c: f"一种叫{c}的食物的缩放图片",
|
240 |
+
lambda c: f"一种叫{c}的美食的缩放图片",
|
241 |
+
lambda c: f"一种叫{c}的菜的缩放图片",
|
242 |
+
lambda c: f"一种叫{c}的食物的摄影图",
|
243 |
+
lambda c: f"一种叫{c}的美食的摄影图",
|
244 |
+
lambda c: f"一种叫{c}的菜的摄影图",
|
245 |
+
lambda c: f"一种叫{c}的食物的近距离照片",
|
246 |
+
lambda c: f"一种叫{c}的美食的近距离照片",
|
247 |
+
lambda c: f"一种叫{c}的菜的近距离照片",
|
248 |
+
lambda c: f"一种叫{c}的食物的���剪照片",
|
249 |
+
lambda c: f"一种叫{c}的美食的裁剪照片",
|
250 |
+
lambda c: f"一种叫{c}的菜的裁剪照片",
|
251 |
+
]
|
252 |
+
|
253 |
+
aircraft_templates = [
|
254 |
+
lambda c: f"{c},飞机的照片",
|
255 |
+
lambda c: f"{c},飞机的高清图",
|
256 |
+
lambda c: f"{c},飞机的模糊图片",
|
257 |
+
lambda c: f"{c},飞机的缩放图片",
|
258 |
+
lambda c: f"{c},飞机的摄影图",
|
259 |
+
lambda c: f"{c},战斗机的照片",
|
260 |
+
lambda c: f"{c},战斗机的高清图",
|
261 |
+
lambda c: f"{c},战斗机的模糊图片",
|
262 |
+
lambda c: f"{c},战斗机的缩放图片",
|
263 |
+
lambda c: f"{c},战斗机的摄影图",
|
264 |
+
lambda c: f"{c},老飞机的照片",
|
265 |
+
lambda c: f"{c},老飞机的高清图",
|
266 |
+
lambda c: f"{c},老飞机的模糊图片",
|
267 |
+
lambda c: f"{c},老飞机的缩放图片",
|
268 |
+
lambda c: f"{c},老飞机的摄影图",
|
269 |
+
lambda c: f"{c},大飞机的照片",
|
270 |
+
lambda c: f"{c},大飞机的高清图",
|
271 |
+
lambda c: f"{c},大飞机的模糊图片",
|
272 |
+
lambda c: f"{c},大飞机的缩放图片",
|
273 |
+
lambda c: f"{c},大飞机的摄影图",
|
274 |
+
lambda c: f"{c},小飞机的照片",
|
275 |
+
lambda c: f"{c},小飞机的高清图",
|
276 |
+
lambda c: f"{c},小飞机的模糊图片",
|
277 |
+
lambda c: f"{c},小飞机的缩放图片",
|
278 |
+
lambda c: f"{c},小飞机的摄影图",
|
279 |
+
lambda c: f"{c},军用飞机的照片",
|
280 |
+
lambda c: f"{c},军用飞机的高清图",
|
281 |
+
lambda c: f"{c},军用飞机的模糊图片",
|
282 |
+
lambda c: f"{c},军用飞机的缩放图片",
|
283 |
+
lambda c: f"{c},军用飞机的摄影图",
|
284 |
+
lambda c: f"{c},运输机的照片",
|
285 |
+
lambda c: f"{c},运输机的高清图",
|
286 |
+
lambda c: f"{c},运输机的模糊图片",
|
287 |
+
lambda c: f"{c},运输机的缩放图片",
|
288 |
+
lambda c: f"{c},运输机的摄影图",
|
289 |
+
lambda c: f"{c},公务机的照片",
|
290 |
+
lambda c: f"{c},公务机的高清图",
|
291 |
+
lambda c: f"{c},公务机的模糊图片",
|
292 |
+
lambda c: f"{c},公务机的缩放图片",
|
293 |
+
lambda c: f"{c},公务机的摄影图",
|
294 |
+
lambda c: f"{c},客机的照片",
|
295 |
+
lambda c: f"{c},客机的高清图",
|
296 |
+
lambda c: f"{c},客机的模糊图片",
|
297 |
+
lambda c: f"{c},客机的缩放图片",
|
298 |
+
lambda c: f"{c},客机的摄影图",
|
299 |
+
lambda c: f"{c},喷气机的照片",
|
300 |
+
lambda c: f"{c},喷气机的高清图",
|
301 |
+
lambda c: f"{c},喷气机的模糊图片",
|
302 |
+
lambda c: f"{c},喷气机的缩放图片",
|
303 |
+
lambda c: f"{c},喷气机的摄影图",
|
304 |
+
lambda c: f"一种叫{c}的飞机的照片",
|
305 |
+
lambda c: f"一种叫{c}的飞机的高清图",
|
306 |
+
lambda c: f"一种叫{c}的飞机的模糊图片",
|
307 |
+
lambda c: f"一种叫{c}的飞机的缩放图片",
|
308 |
+
lambda c: f"一种叫{c}的飞机的摄影图",
|
309 |
+
lambda c: f"一种叫{c}的战斗机的照片",
|
310 |
+
lambda c: f"一种叫{c}的战斗机的高清图",
|
311 |
+
lambda c: f"一种叫{c}的战斗机的模糊图片",
|
312 |
+
lambda c: f"一种叫{c}的战斗机的缩放图片",
|
313 |
+
lambda c: f"一种叫{c}的战斗机的摄影图",
|
314 |
+
lambda c: f"一种叫{c}的老飞机的照片",
|
315 |
+
lambda c: f"一种叫{c}的老飞机的高清图",
|
316 |
+
lambda c: f"一种叫{c}的老飞机的模糊图片",
|
317 |
+
lambda c: f"一种叫{c}的老飞机的缩放图片",
|
318 |
+
lambda c: f"一种叫{c}的老飞机的摄影图",
|
319 |
+
lambda c: f"一种叫{c}的大飞机的照片",
|
320 |
+
lambda c: f"一种叫{c}的大飞机的高清图",
|
321 |
+
lambda c: f"一种叫{c}的大飞机的模糊图片",
|
322 |
+
lambda c: f"一种叫{c}的大飞机的缩放图片",
|
323 |
+
lambda c: f"一种叫{c}的大飞机的摄影图",
|
324 |
+
lambda c: f"一种叫{c}的小飞机的照片",
|
325 |
+
lambda c: f"一种叫{c}的小飞机的高清图",
|
326 |
+
lambda c: f"一种叫{c}的小飞机的模糊图片",
|
327 |
+
lambda c: f"一种叫{c}的小飞机的缩放图片",
|
328 |
+
lambda c: f"一种叫{c}的小飞机的摄影图",
|
329 |
+
lambda c: f"一种叫{c}的军用飞机的照片",
|
330 |
+
lambda c: f"一种叫{c}的军用飞机的高清图",
|
331 |
+
lambda c: f"一种叫{c}的军用飞机的模糊图片",
|
332 |
+
lambda c: f"一种叫{c}的军用飞机的缩放图片",
|
333 |
+
lambda c: f"一种叫{c}的军用飞机的摄影图",
|
334 |
+
lambda c: f"一种叫{c}的运输机的照片",
|
335 |
+
lambda c: f"一种叫{c}的运输机的高清图",
|
336 |
+
lambda c: f"一种叫{c}的运输机的模糊图片",
|
337 |
+
lambda c: f"一种叫{c}的运输机的缩放图片",
|
338 |
+
lambda c: f"一种叫{c}的运输机的摄影图",
|
339 |
+
lambda c: f"一种叫{c}的公务机的照片",
|
340 |
+
lambda c: f"一种叫{c}的公务机的高清图",
|
341 |
+
lambda c: f"一种叫{c}的公务机的模糊图片",
|
342 |
+
lambda c: f"一种叫{c}的公务机的缩放图片",
|
343 |
+
lambda c: f"一种叫{c}的公务机的摄影图",
|
344 |
+
lambda c: f"一种叫{c}的客机的照片",
|
345 |
+
lambda c: f"一种叫{c}的客机的高清图",
|
346 |
+
lambda c: f"一种叫{c}的客机的模糊图片",
|
347 |
+
lambda c: f"一种叫{c}的客机的缩放图片",
|
348 |
+
lambda c: f"一种叫{c}的客机的摄影图",
|
349 |
+
lambda c: f"一种叫{c}的喷气机的照片",
|
350 |
+
lambda c: f"���种叫{c}的喷气机的高清图",
|
351 |
+
lambda c: f"一种叫{c}的喷气机的模糊图片",
|
352 |
+
lambda c: f"一种叫{c}的喷气机的缩放图片",
|
353 |
+
lambda c: f"一种叫{c}的喷气机的摄影图",
|
354 |
+
]
|
355 |
+
|
356 |
+
eurosat_templates = [
|
357 |
+
lambda c: f"一张{c}的卫星照片",
|
358 |
+
lambda c: f"{c}的卫星照片",
|
359 |
+
lambda c: f"一张{c}的高清卫星照片",
|
360 |
+
lambda c: f"{c}的高清卫星照片",
|
361 |
+
lambda c: f"一张{c}的清晰的卫星照片",
|
362 |
+
lambda c: f"{c}的清晰的卫星照片",
|
363 |
+
lambda c: f"一张{c}的高质量的卫星照片",
|
364 |
+
lambda c: f"{c}的高质量的卫星照片",
|
365 |
+
lambda c: f"一张{c}的卫星图",
|
366 |
+
lambda c: f"{c}的卫星图",
|
367 |
+
lambda c: f"一张{c}的高清卫星图",
|
368 |
+
lambda c: f"{c}的高清卫星图",
|
369 |
+
lambda c: f"一张{c}的清晰的卫星图",
|
370 |
+
lambda c: f"{c}的清晰的卫星图",
|
371 |
+
lambda c: f"一张{c}的高质量的卫星图",
|
372 |
+
lambda c: f"{c}的高质量的卫星图",
|
373 |
+
lambda c: f"一张{c}的卫星图片",
|
374 |
+
lambda c: f"{c}的卫星图片",
|
375 |
+
lambda c: f"一张{c}的高清卫星图片",
|
376 |
+
lambda c: f"{c}的高清卫星图片",
|
377 |
+
lambda c: f"一张{c}的清晰的卫星图片",
|
378 |
+
lambda c: f"{c}的清晰的卫星图片",
|
379 |
+
lambda c: f"一张{c}的高质量的卫星图片",
|
380 |
+
lambda c: f"{c}的高质量的卫星图片",
|
381 |
+
]
|
382 |
+
|
383 |
+
hatefulmemes_templates = [
|
384 |
+
lambda c: f"一个{c}",
|
385 |
+
lambda c: f"{c}",
|
386 |
+
]
|
387 |
+
|
388 |
+
kitti_templates = [
|
389 |
+
lambda c: f"照片里{c}",
|
390 |
+
lambda c: f"图片里{c}",
|
391 |
+
lambda c: f"{c}",
|
392 |
+
]
|
393 |
+
|
394 |
+
cars_templates = [
|
395 |
+
lambda c: f"一张{c}的照片",
|
396 |
+
lambda c: f"一张我的{c}的照片",
|
397 |
+
lambda c: f"我爱我的{c}",
|
398 |
+
lambda c: f"一张我肮脏的{c}的照片",
|
399 |
+
lambda c: f"一张我干净的{c}的照片",
|
400 |
+
lambda c: f"一张我新买的{c}的照片",
|
401 |
+
lambda c: f"一张我旧的{c}的照片",
|
402 |
+
]
|
403 |
+
|
404 |
+
dtd_templates = [
|
405 |
+
lambda c: f"一张{c}纹理的照片",
|
406 |
+
lambda c: f"一张{c}图案的照片",
|
407 |
+
lambda c: f"一张{c}物体的照片",
|
408 |
+
lambda c: f"一张{c}纹理的图片",
|
409 |
+
lambda c: f"一张{c}图案的图片",
|
410 |
+
lambda c: f"一张{c}物体的图片",
|
411 |
+
]
|
412 |
+
|
413 |
+
country211_templates = [
|
414 |
+
lambda c: f"一张在{c}拍的照片",
|
415 |
+
lambda c: f"一张在{c}旅行时拍的照片",
|
416 |
+
lambda c: f"一张我家乡{c}的照片",
|
417 |
+
lambda c: f"一张展示{c}风光的照片",
|
418 |
+
]
|
419 |
+
|
420 |
+
patch_templates = [
|
421 |
+
lambda c: f"一张{c}的医疗照片",
|
422 |
+
lambda c: f"一张{c}的ct照片",
|
423 |
+
lambda c: f"一张{c}的化验照片",
|
424 |
+
]
|
425 |
+
|
426 |
+
pet_templates = [
|
427 |
+
lambda c: f"一种叫{c}的宠物的照片",
|
428 |
+
lambda c: f"一种叫{c}的宠物的图片",
|
429 |
+
lambda c: f"一种叫{c}的宠物的可爱图片",
|
430 |
+
lambda c: f"一种叫{c}的宠物的高清图片",
|
431 |
+
lambda c: f"一种叫{c}的宠物的模糊图片",
|
432 |
+
lambda c: f"一种叫{c}的宠物的特写照片",
|
433 |
+
]
|
434 |
+
|
435 |
+
cifar100_templates = [
|
436 |
+
lambda c: f"一张{c}的照片",
|
437 |
+
lambda c: f"一张{c}的模糊照片",
|
438 |
+
lambda c: f"一张{c}",
|
439 |
+
lambda c: f"一张{c}的低对比度照片",
|
440 |
+
lambda c: f"一张{c}的高对比度照片",
|
441 |
+
lambda c: f"一张{c}的好照片",
|
442 |
+
lambda c: f"一张小{c}的照片",
|
443 |
+
lambda c: f"一张大{c}的照片",
|
444 |
+
lambda c: f"一张{c}的黑白照片",
|
445 |
+
lambda c: f"一张{c}的低对比度的照片",
|
446 |
+
lambda c: f"一张{c}的高对比度的照片",
|
447 |
+
]
|
448 |
+
|
449 |
+
caltech101_templates = [
|
450 |
+
lambda c: f"{c}的照片",
|
451 |
+
lambda c: f"{c}的绘画",
|
452 |
+
lambda c: f"{c}的塑料",
|
453 |
+
lambda c: f"{c}的雕像",
|
454 |
+
lambda c: f"{c}的草图",
|
455 |
+
lambda c: f"{c}的刺青",
|
456 |
+
lambda c: f"{c}的玩具",
|
457 |
+
lambda c: f"{c}的演绎",
|
458 |
+
lambda c: f"{c}的装饰",
|
459 |
+
lambda c: f"{c}的卡通画",
|
460 |
+
lambda c: f"{c}在游戏中",
|
461 |
+
lambda c: f"一个豪华的{c}.",
|
462 |
+
lambda c: f"{c}的折纸",
|
463 |
+
lambda c: f"{c}的艺术画",
|
464 |
+
lambda c: f"{c}的涂鸦画",
|
465 |
+
lambda c: f"{c}的画",
|
466 |
+
]
|
467 |
+
|
468 |
+
fer_templates = [
|
469 |
+
lambda c: f"一张表情{c}的照片",
|
470 |
+
lambda c: f"一张表达{c}情绪的照片",
|
471 |
+
lambda c: f"一张看起来很{c}的照片",
|
472 |
+
lambda c: f"他的脸看起来{c}",
|
473 |
+
lambda c: f"他们看起来很{c}",
|
474 |
+
]
|
eval/data.py
ADDED
@@ -0,0 +1,164 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import logging
|
3 |
+
import json
|
4 |
+
from dataclasses import dataclass
|
5 |
+
from pathlib import Path
|
6 |
+
from PIL import Image
|
7 |
+
import base64
|
8 |
+
from io import BytesIO
|
9 |
+
import torch
|
10 |
+
import lmdb
|
11 |
+
from torchvision.transforms import Compose, Resize, ToTensor, Normalize, InterpolationMode
|
12 |
+
from torch.utils.data import Dataset, DataLoader, SubsetRandomSampler
|
13 |
+
from torch.utils.data.distributed import DistributedSampler
|
14 |
+
from torch.utils.data.sampler import SequentialSampler
|
15 |
+
import torchvision.datasets as datasets
|
16 |
+
from clip import tokenize
|
17 |
+
|
18 |
+
|
19 |
+
def _convert_to_rgb(image):
|
20 |
+
return image.convert('RGB')
|
21 |
+
|
22 |
+
|
23 |
+
def _preprocess_text(text):
|
24 |
+
# adapt the text to Chinese BERT vocab
|
25 |
+
text = text.lower().replace("“", "\"").replace("”", "\"")
|
26 |
+
return text
|
27 |
+
|
28 |
+
|
29 |
+
class EvalTxtDataset(Dataset):
|
30 |
+
def __init__(self, jsonl_filename, max_txt_length=24):
|
31 |
+
assert os.path.exists(jsonl_filename), "The annotation datafile {} not exists!".format(jsonl_filename)
|
32 |
+
|
33 |
+
logging.debug(f'Loading jsonl data from {jsonl_filename}.')
|
34 |
+
self.texts = []
|
35 |
+
with open(jsonl_filename, "r", encoding="utf-8") as fin:
|
36 |
+
for line in fin:
|
37 |
+
obj = json.loads(line.strip())
|
38 |
+
text_id = obj['text_id']
|
39 |
+
text = obj['text']
|
40 |
+
self.texts.append((text_id, text))
|
41 |
+
logging.debug(f'Finished loading jsonl data from {jsonl_filename}.')
|
42 |
+
|
43 |
+
self.max_txt_length = max_txt_length
|
44 |
+
|
45 |
+
def __len__(self):
|
46 |
+
return len(self.texts)
|
47 |
+
|
48 |
+
def __getitem__(self, idx):
|
49 |
+
text_id, text = self.texts[idx]
|
50 |
+
text = tokenize([_preprocess_text(str(text))], context_length=self.max_txt_length)[0]
|
51 |
+
return text_id, text
|
52 |
+
|
53 |
+
|
54 |
+
class EvalImgDataset(Dataset):
|
55 |
+
def __init__(self, lmdb_imgs, resolution=224):
|
56 |
+
assert os.path.isdir(lmdb_imgs), "The image LMDB directory {} not exists!".format(lmdb_imgs)
|
57 |
+
|
58 |
+
logging.debug(f'Loading image LMDB from {lmdb_imgs}.')
|
59 |
+
|
60 |
+
self.env_imgs = lmdb.open(lmdb_imgs, readonly=True, create=False, lock=False, readahead=False, meminit=False)
|
61 |
+
self.txn_imgs = self.env_imgs.begin(buffers=True)
|
62 |
+
self.cursor_imgs = self.txn_imgs.cursor()
|
63 |
+
self.iter_imgs = iter(self.cursor_imgs)
|
64 |
+
self.number_images = int(self.txn_imgs.get(key=b'num_images').tobytes().decode('utf-8'))
|
65 |
+
logging.info("The specified LMDB directory contains {} images.".format(self.number_images))
|
66 |
+
|
67 |
+
self.transform = self._build_transform(resolution)
|
68 |
+
|
69 |
+
def _build_transform(self, resolution):
|
70 |
+
normalize = Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711))
|
71 |
+
return Compose([
|
72 |
+
Resize((resolution, resolution), interpolation=InterpolationMode.BICUBIC),
|
73 |
+
_convert_to_rgb,
|
74 |
+
ToTensor(),
|
75 |
+
normalize,
|
76 |
+
])
|
77 |
+
|
78 |
+
def __len__(self):
|
79 |
+
return self.number_images
|
80 |
+
|
81 |
+
def __getitem__(self, idx):
|
82 |
+
img_id, image_b64 = next(self.iter_imgs)
|
83 |
+
if img_id == b"num_images":
|
84 |
+
img_id, image_b64 = next(self.iter_imgs)
|
85 |
+
|
86 |
+
img_id = img_id.tobytes()
|
87 |
+
image_b64 = image_b64.tobytes()
|
88 |
+
|
89 |
+
img_id = int(img_id.decode(encoding="utf8", errors="ignore"))
|
90 |
+
image_b64 = image_b64.decode(encoding="utf8", errors="ignore")
|
91 |
+
image = Image.open(BytesIO(base64.urlsafe_b64decode(image_b64))) # already resized
|
92 |
+
image = self.transform(image)
|
93 |
+
|
94 |
+
return img_id, image
|
95 |
+
|
96 |
+
|
97 |
+
@dataclass
|
98 |
+
class DataInfo:
|
99 |
+
dataloader: DataLoader
|
100 |
+
sampler: DistributedSampler
|
101 |
+
|
102 |
+
|
103 |
+
def get_eval_txt_dataset(args, max_txt_length=24):
|
104 |
+
input_filename = args.text_data
|
105 |
+
dataset = EvalTxtDataset(
|
106 |
+
input_filename,
|
107 |
+
max_txt_length=max_txt_length)
|
108 |
+
num_samples = len(dataset)
|
109 |
+
sampler = SequentialSampler(dataset)
|
110 |
+
|
111 |
+
dataloader = DataLoader(
|
112 |
+
dataset,
|
113 |
+
batch_size=args.text_batch_size,
|
114 |
+
num_workers=0,
|
115 |
+
pin_memory=True,
|
116 |
+
sampler=sampler,
|
117 |
+
drop_last=False,
|
118 |
+
)
|
119 |
+
dataloader.num_samples = num_samples
|
120 |
+
dataloader.num_batches = len(dataloader)
|
121 |
+
|
122 |
+
return DataInfo(dataloader, sampler)
|
123 |
+
|
124 |
+
|
125 |
+
def fetch_resolution(vision_model):
|
126 |
+
# fetch the resolution from the vision model config
|
127 |
+
vision_model_config_file = Path(__file__).parent.parent / f"clip/model_configs/{vision_model.replace('/', '-')}.json"
|
128 |
+
with open(vision_model_config_file, 'r') as fv:
|
129 |
+
model_info = json.load(fv)
|
130 |
+
return model_info["image_resolution"]
|
131 |
+
|
132 |
+
|
133 |
+
def get_eval_img_dataset(args):
|
134 |
+
lmdb_imgs = args.image_data
|
135 |
+
dataset = EvalImgDataset(
|
136 |
+
lmdb_imgs, resolution=fetch_resolution(args.vision_model))
|
137 |
+
num_samples = len(dataset)
|
138 |
+
sampler = SequentialSampler(dataset)
|
139 |
+
|
140 |
+
dataloader = DataLoader(
|
141 |
+
dataset,
|
142 |
+
batch_size=args.img_batch_size,
|
143 |
+
num_workers=0,
|
144 |
+
pin_memory=True,
|
145 |
+
sampler=sampler,
|
146 |
+
drop_last=False,
|
147 |
+
)
|
148 |
+
dataloader.num_samples = num_samples
|
149 |
+
dataloader.num_batches = len(dataloader)
|
150 |
+
|
151 |
+
return DataInfo(dataloader, sampler)
|
152 |
+
|
153 |
+
|
154 |
+
def get_zeroshot_dataset(args, preprocess_fn):
|
155 |
+
dataset = datasets.ImageFolder(args.datapath, transform=preprocess_fn)
|
156 |
+
|
157 |
+
dataloader = torch.utils.data.DataLoader(
|
158 |
+
dataset,
|
159 |
+
batch_size=args.img_batch_size,
|
160 |
+
num_workers=args.num_workers,
|
161 |
+
sampler=None,
|
162 |
+
)
|
163 |
+
|
164 |
+
return DataInfo(dataloader, None)
|
eval/evaluation.py
ADDED
@@ -0,0 +1,157 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# -*- coding: utf-8 -*-
|
2 |
+
'''
|
3 |
+
This script computes the recall scores given the ground-truth annotations and predictions.
|
4 |
+
'''
|
5 |
+
|
6 |
+
import json
|
7 |
+
import sys
|
8 |
+
import os
|
9 |
+
import string
|
10 |
+
import numpy as np
|
11 |
+
import time
|
12 |
+
|
13 |
+
NUM_K = 10
|
14 |
+
|
15 |
+
def read_submission(submit_path, reference, k=5):
|
16 |
+
# check whether the path of submitted file exists
|
17 |
+
if not os.path.exists(submit_path):
|
18 |
+
raise Exception("The submission file is not found!")
|
19 |
+
|
20 |
+
submission_dict = {}
|
21 |
+
ref_qids = set(reference.keys())
|
22 |
+
|
23 |
+
with open(submit_path, encoding="utf-8") as fin:
|
24 |
+
for line in fin:
|
25 |
+
line = line.strip()
|
26 |
+
try:
|
27 |
+
pred_obj = json.loads(line)
|
28 |
+
except:
|
29 |
+
raise Exception('Cannot parse this line into json object: {}'.format(line))
|
30 |
+
if "text_id" not in pred_obj:
|
31 |
+
raise Exception('There exists one line not containing text_id: {}'.format(line))
|
32 |
+
if not isinstance(pred_obj['text_id'], int):
|
33 |
+
raise Exception('Found an invalid text_id {}, it should be an integer (not string), please check your schema'.format(qid))
|
34 |
+
qid = pred_obj["text_id"]
|
35 |
+
if "image_ids" not in pred_obj:
|
36 |
+
raise Exception('There exists one line not containing the predicted image_ids: {}'.format(line))
|
37 |
+
image_ids = pred_obj["image_ids"]
|
38 |
+
if not isinstance(image_ids, list):
|
39 |
+
raise Exception('The image_ids field of text_id {} is not a list, please check your schema'.format(qid))
|
40 |
+
# check whether there are K products for each text
|
41 |
+
if len(image_ids) != k:
|
42 |
+
raise Exception('Text_id {} has wrong number of predicted image_ids! Require {}, but {} founded.'.format(qid, k, len(image_ids)))
|
43 |
+
# check whether there exist an invalid prediction for any text
|
44 |
+
for rank, image_id in enumerate(image_ids):
|
45 |
+
if not isinstance(image_id, int):
|
46 |
+
raise Exception('Text_id {} has an invalid predicted image_id {} at rank {}, it should be an integer (not string), please check your schema'.format(qid, image_id, rank + 1))
|
47 |
+
# check whether there are duplicate predicted products for a single text
|
48 |
+
if len(set(image_ids)) != k:
|
49 |
+
raise Exception('Text_id {} has duplicate products in your prediction. Pleace check again!'.format(qid))
|
50 |
+
submission_dict[qid] = image_ids # here we save the list of product ids
|
51 |
+
|
52 |
+
# check if any text is missing in the submission
|
53 |
+
pred_qids = set(submission_dict.keys())
|
54 |
+
nopred_qids = ref_qids - pred_qids
|
55 |
+
if len(nopred_qids) != 0:
|
56 |
+
raise Exception('The following text_ids have no prediction in your submission, please check again: {}'.format(", ".join([str(idx) for idx in nopred_qids])))
|
57 |
+
|
58 |
+
return submission_dict
|
59 |
+
|
60 |
+
|
61 |
+
def dump_2_json(info, path):
|
62 |
+
with open(path, 'w', encoding="utf-8") as output_json_file:
|
63 |
+
json.dump(info, output_json_file)
|
64 |
+
|
65 |
+
|
66 |
+
def report_error_msg(detail, showMsg, out_p):
|
67 |
+
error_dict=dict()
|
68 |
+
error_dict['errorDetail']=detail
|
69 |
+
error_dict['errorMsg']=showMsg
|
70 |
+
error_dict['score']=0
|
71 |
+
error_dict['scoreJson']={}
|
72 |
+
error_dict['success']=False
|
73 |
+
dump_2_json(error_dict,out_p)
|
74 |
+
|
75 |
+
|
76 |
+
def report_score(r1, r5, r10, out_p):
|
77 |
+
result = dict()
|
78 |
+
result['success']=True
|
79 |
+
mean_recall = (r1 + r5 + r10) / 3.0
|
80 |
+
result['score'] = mean_recall * 100
|
81 |
+
result['scoreJson'] = {'score': mean_recall * 100, 'mean_recall': mean_recall * 100, 'r1': r1 * 100, 'r5': r5 * 100, 'r10': r10 * 100}
|
82 |
+
dump_2_json(result,out_p)
|
83 |
+
|
84 |
+
|
85 |
+
def read_reference(path):
|
86 |
+
fin = open(path, encoding="utf-8")
|
87 |
+
reference = dict()
|
88 |
+
for line in fin:
|
89 |
+
line = line.strip()
|
90 |
+
obj = json.loads(line)
|
91 |
+
reference[obj['text_id']] = obj['image_ids']
|
92 |
+
return reference
|
93 |
+
|
94 |
+
def compute_score(golden_file, predict_file):
|
95 |
+
# read ground-truth
|
96 |
+
reference = read_reference(golden_file)
|
97 |
+
|
98 |
+
# read predictions
|
99 |
+
k = 10
|
100 |
+
predictions = read_submission(predict_file, reference, k)
|
101 |
+
|
102 |
+
# compute score for each text
|
103 |
+
r1_stat, r5_stat, r10_stat = 0, 0, 0
|
104 |
+
for qid in reference.keys():
|
105 |
+
ground_truth_ids = set(reference[qid])
|
106 |
+
top10_pred_ids = predictions[qid]
|
107 |
+
if any([idx in top10_pred_ids[:1] for idx in ground_truth_ids]):
|
108 |
+
r1_stat += 1
|
109 |
+
if any([idx in top10_pred_ids[:5] for idx in ground_truth_ids]):
|
110 |
+
r5_stat += 1
|
111 |
+
if any([idx in top10_pred_ids[:10] for idx in ground_truth_ids]):
|
112 |
+
r10_stat += 1
|
113 |
+
# the higher score, the better
|
114 |
+
r1, r5, r10 = r1_stat * 1.0 / len(reference), r5_stat * 1.0 / len(reference), r10_stat * 1.0 / len(reference)
|
115 |
+
mean_recall = (r1 + r5 + r10) / 3.0
|
116 |
+
result = [mean_recall, r1, r5, r10]
|
117 |
+
result = [score * 100 for score in result]
|
118 |
+
return result
|
119 |
+
|
120 |
+
|
121 |
+
if __name__=="__main__":
|
122 |
+
# the path of answer json file (eg. test_queries_answers.jsonl)
|
123 |
+
standard_path = sys.argv[1]
|
124 |
+
# the path of prediction file (eg. example_pred.jsonl)
|
125 |
+
submit_path = sys.argv[2]
|
126 |
+
# the score will be dumped into this output json file
|
127 |
+
out_path = sys.argv[3]
|
128 |
+
|
129 |
+
print("Read standard from %s" % standard_path)
|
130 |
+
print("Read user submit file from %s" % submit_path)
|
131 |
+
|
132 |
+
try:
|
133 |
+
# read ground-truth
|
134 |
+
reference = read_reference(standard_path)
|
135 |
+
|
136 |
+
# read predictions
|
137 |
+
k = 10
|
138 |
+
predictions = read_submission(submit_path, reference, k)
|
139 |
+
|
140 |
+
# compute score for each text
|
141 |
+
r1_stat, r5_stat, r10_stat = 0, 0, 0
|
142 |
+
for qid in reference.keys():
|
143 |
+
ground_truth_ids = set(reference[qid])
|
144 |
+
top10_pred_ids = predictions[qid]
|
145 |
+
if any([idx in top10_pred_ids[:1] for idx in ground_truth_ids]):
|
146 |
+
r1_stat += 1
|
147 |
+
if any([idx in top10_pred_ids[:5] for idx in ground_truth_ids]):
|
148 |
+
r5_stat += 1
|
149 |
+
if any([idx in top10_pred_ids[:10] for idx in ground_truth_ids]):
|
150 |
+
r10_stat += 1
|
151 |
+
# the higher score, the better
|
152 |
+
r1, r5, r10 = r1_stat * 1.0 / len(reference), r5_stat * 1.0 / len(reference), r10_stat * 1.0 / len(reference)
|
153 |
+
report_score(r1, r5, r10, out_path)
|
154 |
+
print("The evaluation finished successfully.")
|
155 |
+
except Exception as e:
|
156 |
+
report_error_msg(e.args[0], e.args[0], out_path)
|
157 |
+
print("The evaluation failed: {}".format(e.args[0]))
|
eval/evaluation_tr.py
ADDED
@@ -0,0 +1,157 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# -*- coding: utf-8 -*-
'''
This script computes the recall scores given the ground-truth annotations and predictions.
'''

import json
import sys
import os
import string
import numpy as np
import time

NUM_K = 10

def read_submission(submit_path, reference, k=5):
    # check whether the path of submitted file exists
    if not os.path.exists(submit_path):
        raise Exception("The submission file is not found!")

    submission_dict = {}
    ref_image_ids = set(reference.keys())

    with open(submit_path, encoding="utf-8") as fin:
        for line in fin:
            line = line.strip()
            try:
                pred_obj = json.loads(line)
            except:
                raise Exception('Cannot parse this line into json object: {}'.format(line))
            if "image_id" not in pred_obj:
                raise Exception('There exists one line not containing image_id: {}'.format(line))
            if not isinstance(pred_obj['image_id'], int):
                raise Exception('Found an invalid image_id {}, it should be an integer (not string), please check your schema'.format(pred_obj['image_id']))
            image_id = pred_obj['image_id']
            if "text_ids" not in pred_obj:
                raise Exception('There exists one line not containing the predicted text_ids: {}'.format(line))
            text_ids = pred_obj["text_ids"]
            if not isinstance(text_ids, list):
                raise Exception('The text_ids field of image_id {} is not a list, please check your schema'.format(image_id))
            # check whether there are K products for each text
            if len(text_ids) != k:
                raise Exception('Image_id {} has wrong number of predicted text_ids! Require {}, but {} found.'.format(image_id, k, len(text_ids)))
            # check whether there exist an invalid prediction for any text
            for rank, text_id in enumerate(text_ids):
                if not isinstance(text_id, int):
                    raise Exception('Image_id {} has an invalid predicted text_id {} at rank {}, it should be an integer (not string), please check your schema'.format(image_id, text_id, rank + 1))
            # check whether there are duplicate predicted products for a single text
            if len(set(text_ids)) != k:
                raise Exception('Image_id {} has duplicate products in your prediction. Please check again!'.format(image_id))
            submission_dict[image_id] = text_ids  # here we save the list of product ids

    # check if any image is missing in the submission
    pred_image_ids = set(submission_dict.keys())
    nopred_image_ids = ref_image_ids - pred_image_ids
    if len(nopred_image_ids) != 0:
        raise Exception('The following image_ids have no prediction in your submission, please check again: {}'.format(", ".join([str(idx) for idx in nopred_image_ids])))

    return submission_dict


def dump_2_json(info, path):
    with open(path, 'w', encoding="utf-8") as output_json_file:
        json.dump(info, output_json_file)


def report_error_msg(detail, showMsg, out_p):
    error_dict = dict()
    error_dict['errorDetail'] = detail
    error_dict['errorMsg'] = showMsg
    error_dict['score'] = 0
    error_dict['scoreJson'] = {}
    error_dict['success'] = False
    dump_2_json(error_dict, out_p)


def report_score(r1, r5, r10, out_p):
    result = dict()
    result['success'] = True
    mean_recall = (r1 + r5 + r10) / 3.0
    result['score'] = mean_recall * 100
    result['scoreJson'] = {'score': mean_recall * 100, 'mean_recall': mean_recall * 100, 'r1': r1 * 100, 'r5': r5 * 100, 'r10': r10 * 100}
    dump_2_json(result, out_p)


def read_reference(path):
    fin = open(path, encoding="utf-8")
    reference = dict()
    for line in fin:
        line = line.strip()
        obj = json.loads(line)
        reference[obj['image_id']] = obj['text_ids']
    return reference

def compute_score(golden_file, predict_file):
    # read ground-truth
    reference = read_reference(golden_file)

    # read predictions
    k = 10
    predictions = read_submission(predict_file, reference, k)

    # compute score for each image
    r1_stat, r5_stat, r10_stat = 0, 0, 0
    for qid in reference.keys():
        ground_truth_ids = set(reference[qid])
        top10_pred_ids = predictions[qid]
        if any([idx in top10_pred_ids[:1] for idx in ground_truth_ids]):
            r1_stat += 1
        if any([idx in top10_pred_ids[:5] for idx in ground_truth_ids]):
            r5_stat += 1
        if any([idx in top10_pred_ids[:10] for idx in ground_truth_ids]):
            r10_stat += 1
    # the higher score, the better
    r1, r5, r10 = r1_stat * 1.0 / len(reference), r5_stat * 1.0 / len(reference), r10_stat * 1.0 / len(reference)
    mean_recall = (r1 + r5 + r10) / 3.0
    result = [mean_recall, r1, r5, r10]
    result = [score * 100 for score in result]
    return result


if __name__ == "__main__":
    # the path of answer json file (eg. test_queries_answers.jsonl)
    standard_path = sys.argv[1]
    # the path of prediction file (eg. example_pred.jsonl)
    submit_path = sys.argv[2]
    # the score will be dumped into this output json file
    out_path = sys.argv[3]

    print("Read standard from %s" % standard_path)
    print("Read user submit file from %s" % submit_path)

    try:
        # read ground-truth
        reference = read_reference(standard_path)

        # read predictions
        k = 10
        predictions = read_submission(submit_path, reference, k)

        # compute score for each image
        r1_stat, r5_stat, r10_stat = 0, 0, 0
        for qid in reference.keys():
            ground_truth_ids = set(reference[qid])
            top10_pred_ids = predictions[qid]
            if any([idx in top10_pred_ids[:1] for idx in ground_truth_ids]):
                r1_stat += 1
            if any([idx in top10_pred_ids[:5] for idx in ground_truth_ids]):
                r5_stat += 1
            if any([idx in top10_pred_ids[:10] for idx in ground_truth_ids]):
                r10_stat += 1
        # the higher score, the better
        r1, r5, r10 = r1_stat * 1.0 / len(reference), r5_stat * 1.0 / len(reference), r10_stat * 1.0 / len(reference)
        report_score(r1, r5, r10, out_path)
        print("The evaluation finished successfully.")
    except Exception as e:
        report_error_msg(e.args[0], e.args[0], out_path)
        print("The evaluation failed: {}".format(e.args[0]))
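A minimal usage sketch for the two recall scorers (this one and eval/evaluation.py above); the file names below are placeholders, not files shipped with the repo. Each script takes the ground-truth jsonl, the prediction jsonl and an output path, and writes a JSON report containing r1/r5/r10 and their mean.

# Hypothetical file names, for illustration only.
# evaluation.py scores text-to-image predictions, one JSON object per line: {"text_id": ..., "image_ids": [...]}
# evaluation_tr.py scores image-to-text predictions, one JSON object per line: {"image_id": ..., "text_ids": [...]}
python eval/evaluation.py valid_texts.jsonl valid_ir_predictions.jsonl ir_score.json
python eval/evaluation_tr.py valid_texts.tr.jsonl valid_tr_predictions.jsonl tr_score.json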
eval/extract_features.py
ADDED
@@ -0,0 +1,212 @@
# -*- coding: utf-8 -*-
'''
This script extracts image and text features for evaluation. (with single-GPU)
'''

import os
import argparse
import logging
from pathlib import Path
import json

import torch
from tqdm import tqdm

from clip.model import convert_weights, CLIP
from eval.data import get_eval_img_dataset, get_eval_txt_dataset

def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--extract-image-feats',
        action="store_true",
        default=False,
        help="Whether to extract image features."
    )
    parser.add_argument(
        '--extract-text-feats',
        action="store_true",
        default=False,
        help="Whether to extract text features."
    )
    parser.add_argument(
        '--image-data',
        type=str,
        default="../Multimodal_Retrieval/lmdb/test/imgs",
        help="If --extract-image-feats is True, specify the path of the LMDB directory storing input image base64 strings."
    )
    parser.add_argument(
        '--text-data',
        type=str,
        default="../Multimodal_Retrieval/test_texts.jsonl",
        help="If --extract-text-feats is True, specify the path of input text Jsonl file."
    )
    parser.add_argument(
        '--image-feat-output-path',
        type=str,
        default=None,
        help="If --extract-image-feats is True, specify the path of output image features."
    )
    parser.add_argument(
        '--text-feat-output-path',
        type=str,
        default=None,
        help="If --extract-text-feats is True, specify the path of output text features."
    )
    parser.add_argument(
        "--img-batch-size", type=int, default=64, help="Image batch size."
    )
    parser.add_argument(
        "--text-batch-size", type=int, default=64, help="Text batch size."
    )
    parser.add_argument(
        "--context-length", type=int, default=64, help="The maximum length of input text (include [CLS] & [SEP] tokens)."
    )
    parser.add_argument(
        "--resume",
        default=None,
        type=str,
        help="path to latest checkpoint (default: none)",
    )
    parser.add_argument(
        "--precision",
        choices=["amp", "fp16", "fp32"],
        default="amp",
        help="Floating point precision."
    )
    parser.add_argument(
        "--vision-model",
        choices=["ViT-B-16", "ViT-L-14", "RN50"],
        default="ViT-B-16",
        help="Name of the vision backbone to use.",
    )
    parser.add_argument(
        "--text-model",
        choices=["RoBERTa-wwm-ext-base-chinese", "RoBERTa-wwm-ext-large-chinese", "RBT3-chinese"],
        default="RoBERTa-wwm-ext-base-chinese",
        help="Name of the text backbone to use.",
    )
    parser.add_argument(
        "--debug",
        default=False,
        action="store_true",
        help="If true, more information is logged."
    )
    args = parser.parse_args()

    return args

# Used by https://github.com/openai/CLIP/issues/83 but not below.
# Keeping it in case needed.
def convert_models_to_fp32(model):
    for p in model.parameters():
        p.data = p.data.float()
        if p.grad:
            p.grad.data = p.grad.data.float()


if __name__ == "__main__":
    args = parse_args()

    assert args.extract_image_feats or args.extract_text_feats, "--extract-image-feats and --extract-text-feats cannot both be False!"

    # Log params.
    print("Params:")
    for name in sorted(vars(args)):
        val = getattr(args, name)
        print(f"  {name}: {val}")

    args.gpu = 0
    torch.cuda.set_device(args.gpu)

    # Initialize the model.
    vision_model_config_file = Path(__file__).parent.parent.parent / f"clip/model_configs/{args.vision_model.replace('/', '-')}.json"
    print('Loading vision model config from', vision_model_config_file)
    assert os.path.exists(vision_model_config_file)

    text_model_config_file = Path(__file__).parent.parent.parent / f"clip/model_configs/{args.text_model.replace('/', '-')}.json"
    print('Loading text model config from', text_model_config_file)
    assert os.path.exists(text_model_config_file)

    with open(vision_model_config_file, 'r') as fv, open(text_model_config_file, 'r') as ft:
        model_info = json.load(fv)
        if isinstance(model_info['vision_layers'], str):
            model_info['vision_layers'] = eval(model_info['vision_layers'])
        for k, v in json.load(ft).items():
            model_info[k] = v

    model = CLIP(**model_info)
    convert_weights(model)

    # See https://discuss.pytorch.org/t/valueerror-attemting-to-unscale-fp16-gradients/81372
    if args.precision == "amp" or args.precision == "fp32":
        convert_models_to_fp32(model)
    model.cuda(args.gpu)
    if args.precision == "fp16":
        convert_weights(model)

    # Get data.
    if args.extract_image_feats:
        print("Preparing image inference dataset.")
        img_data = get_eval_img_dataset(args)
    if args.extract_text_feats:
        print("Preparing text inference dataset.")
        text_data = get_eval_txt_dataset(args, max_txt_length=args.context_length)

    # Resume from a checkpoint.
    print("Begin to load model checkpoint from {}.".format(args.resume))
    assert os.path.exists(args.resume), "The checkpoint file {} does not exist!".format(args.resume)
    # Map model to be loaded to specified single gpu.
    loc = "cuda:{}".format(args.gpu)
    checkpoint = torch.load(args.resume, map_location='cpu')
    start_epoch = checkpoint["epoch"]
    sd = checkpoint["state_dict"]
    if next(iter(sd.items()))[0].startswith('module'):
        sd = {k[len('module.'):]: v for k, v in sd.items() if "bert.pooler" not in k}
    model.load_state_dict(sd)
    print(
        f"=> loaded checkpoint '{args.resume}' (epoch {checkpoint['epoch']} @ {checkpoint['step']} steps)"
    )

    # Make inference for texts
    if args.extract_text_feats:
        print('Make inference for texts...')
        if args.text_feat_output_path is None:
            args.text_feat_output_path = "{}.txt_feat.jsonl".format(args.text_data[:-6])
        write_cnt = 0
        with open(args.text_feat_output_path, "w") as fout:
            model.eval()
            dataloader = text_data.dataloader
            with torch.no_grad():
                for batch in tqdm(dataloader):
                    text_ids, texts = batch
                    texts = texts.cuda(args.gpu, non_blocking=True)
                    text_features = model(None, texts)
                    text_features /= text_features.norm(dim=-1, keepdim=True)
                    for text_id, text_feature in zip(text_ids.tolist(), text_features.tolist()):
                        fout.write("{}\n".format(json.dumps({"text_id": text_id, "feature": text_feature})))
                        write_cnt += 1
        print('{} text features are stored in {}'.format(write_cnt, args.text_feat_output_path))

    # Make inference for images
    if args.extract_image_feats:
        print('Make inference for images...')
        if args.image_feat_output_path is None:
            # by default, we store the image features under the same directory with the text features
            args.image_feat_output_path = "{}.img_feat.jsonl".format(args.text_data.replace("_texts.jsonl", "_imgs"))
        write_cnt = 0
        with open(args.image_feat_output_path, "w") as fout:
            model.eval()
            dataloader = img_data.dataloader
            with torch.no_grad():
                for batch in tqdm(dataloader):
                    image_ids, images = batch
                    images = images.cuda(args.gpu, non_blocking=True)
                    image_features = model(images, None)
                    image_features /= image_features.norm(dim=-1, keepdim=True)
                    for image_id, image_feature in zip(image_ids.tolist(), image_features.tolist()):
                        fout.write("{}\n".format(json.dumps({"image_id": image_id, "feature": image_feature})))
                        write_cnt += 1
        print('{} image features are stored in {}'.format(write_cnt, args.image_feat_output_path))

    print("Done!")
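A hedged invocation sketch for the extractor above: the flag names are taken from its argument parser, while the data and checkpoint paths are placeholders to adapt; it is assumed to be run from the repository root (or with PYTHONPATH set as in scripts/zeroshot_eval.sh).

# Placeholder paths for illustration.
python -u eval/extract_features.py \
    --extract-image-feats \
    --extract-text-feats \
    --image-data="../Multimodal_Retrieval/lmdb/test/imgs" \
    --text-data="../Multimodal_Retrieval/test_texts.jsonl" \
    --img-batch-size=64 \
    --text-batch-size=64 \
    --resume=${ckpt_path} \
    --vision-model=ViT-B-16 \
    --text-model=RoBERTa-wwm-ext-base-chinese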
eval/make_topk_predictions.py
ADDED
@@ -0,0 +1,88 @@
# -*- coding: utf-8 -*-
'''
This script performs kNN search on the extracted image and text features (on single-GPU) and outputs a text-to-image prediction file for evaluation.
'''

import argparse
import numpy
from tqdm import tqdm
import json

import numpy as np
import torch

def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--image-feats',
        type=str,
        required=True,
        help="Specify the path of image features."
    )
    parser.add_argument(
        '--text-feats',
        type=str,
        required=True,
        help="Specify the path of text features."
    )
    parser.add_argument(
        '--top-k',
        type=int,
        default=10,
        help="Specify the k value of top-k predictions."
    )
    parser.add_argument(
        '--eval-batch-size',
        type=int,
        default=32768,
        help="Specify the image-side batch size when computing the inner products, defaults to 32768."
    )
    parser.add_argument(
        '--output',
        type=str,
        required=True,
        help="Specify the output jsonl prediction filepath."
    )
    return parser.parse_args()

if __name__ == "__main__":
    args = parse_args()

    # Log params.
    print("Params:")
    for name in sorted(vars(args)):
        val = getattr(args, name)
        print(f"  {name}: {val}")

    print("Begin to load image features...")
    image_ids = []
    image_feats = []
    with open(args.image_feats, "r") as fin:
        for line in tqdm(fin):
            obj = json.loads(line.strip())
            image_ids.append(obj['image_id'])
            image_feats.append(obj['feature'])
    image_feats_array = np.array(image_feats, dtype=np.float32)
    print("Finished loading image features.")

    print("Begin to compute top-{} predictions for texts...".format(args.top_k))
    with open(args.output, "w") as fout:
        with open(args.text_feats, "r") as fin:
            for line in tqdm(fin):
                obj = json.loads(line.strip())
                text_id = obj['text_id']
                text_feat = obj['feature']
                score_tuples = []
                text_feat_tensor = torch.tensor([text_feat], dtype=torch.float).cuda()  # [1, feature_dim]
                idx = 0
                while idx < len(image_ids):
                    img_feats_tensor = torch.from_numpy(image_feats_array[idx : min(idx + args.eval_batch_size, len(image_ids))]).cuda()  # [batch_size, feature_dim]
                    batch_scores = text_feat_tensor @ img_feats_tensor.t()  # [1, batch_size]
                    for image_id, score in zip(image_ids[idx : min(idx + args.eval_batch_size, len(image_ids))], batch_scores.squeeze(0).tolist()):
                        score_tuples.append((image_id, score))
                    idx += args.eval_batch_size
                top_k_predictions = sorted(score_tuples, key=lambda x: x[1], reverse=True)[:args.top_k]
                fout.write("{}\n".format(json.dumps({"text_id": text_id, "image_ids": [entry[0] for entry in top_k_predictions]})))

    print("Top-{} predictions are saved in {}".format(args.top_k, args.output))
    print("Done!")
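A possible next step after feature extraction, sketched with placeholder file names: the *.img_feat.jsonl / *.txt_feat.jsonl files written by extract_features.py are fed to the kNN script to produce top-10 text-to-image predictions; the _tr variant below accepts the same flags for the image-to-text direction.

# Placeholder paths for illustration.
python -u eval/make_topk_predictions.py \
    --image-feats="test_imgs.img_feat.jsonl" \
    --text-feats="test_texts.txt_feat.jsonl" \
    --top-k=10 \
    --eval-batch-size=32768 \
    --output="test_ir_predictions.jsonl"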
eval/make_topk_predictions_tr.py
ADDED
@@ -0,0 +1,88 @@
# -*- coding: utf-8 -*-
'''
This script performs kNN search on the extracted image and text features (on single-GPU) and outputs an image-to-text retrieval prediction file for evaluation.
'''

import argparse
import numpy
from tqdm import tqdm
import json

import numpy as np
import torch

def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--image-feats',
        type=str,
        required=True,
        help="Specify the path of image features."
    )
    parser.add_argument(
        '--text-feats',
        type=str,
        required=True,
        help="Specify the path of text features."
    )
    parser.add_argument(
        '--top-k',
        type=int,
        default=10,
        help="Specify the k value of top-k predictions."
    )
    parser.add_argument(
        '--eval-batch-size',
        type=int,
        default=32768,
        help="Specify the text-side batch size when computing the inner products, defaults to 32768."
    )
    parser.add_argument(
        '--output',
        type=str,
        required=True,
        help="Specify the output jsonl prediction filepath."
    )
    return parser.parse_args()

if __name__ == "__main__":
    args = parse_args()

    # Log params.
    print("Params:")
    for name in sorted(vars(args)):
        val = getattr(args, name)
        print(f"  {name}: {val}")

    print("Begin to load text features...")
    text_ids = []
    text_feats = []
    with open(args.text_feats, "r") as fin:
        for line in tqdm(fin):
            obj = json.loads(line.strip())
            text_ids.append(obj['text_id'])
            text_feats.append(obj['feature'])
    text_feats_array = np.array(text_feats, dtype=np.float32)
    print("Finished loading text features.")

    print("Begin to compute top-{} predictions for images...".format(args.top_k))
    with open(args.output, "w") as fout:
        with open(args.image_feats, "r") as fin:
            for line in tqdm(fin):
                obj = json.loads(line.strip())
                image_id = obj['image_id']
                image_feat = obj['feature']
                score_tuples = []
                image_feat_tensor = torch.tensor([image_feat], dtype=torch.float).cuda()  # [1, feature_dim]
                idx = 0
                while idx < len(text_ids):
                    text_feats_tensor = torch.from_numpy(text_feats_array[idx : min(idx + args.eval_batch_size, len(text_ids))]).cuda()  # [batch_size, feature_dim]
                    batch_scores = image_feat_tensor @ text_feats_tensor.t()  # [1, batch_size]
                    for text_id, score in zip(text_ids[idx : min(idx + args.eval_batch_size, len(text_ids))], batch_scores.squeeze(0).tolist()):
                        score_tuples.append((text_id, score))
                    idx += args.eval_batch_size
                top_k_predictions = sorted(score_tuples, key=lambda x: x[1], reverse=True)[:args.top_k]
                fout.write("{}\n".format(json.dumps({"image_id": image_id, "text_ids": [entry[0] for entry in top_k_predictions]})))

    print("Top-{} predictions are saved in {}".format(args.top_k, args.output))
    print("Done!")
eval/transform_ir_annotation_to_tr.py
ADDED
@@ -0,0 +1,36 @@
# -*- coding: utf-8 -*-
from tqdm import tqdm
import argparse
import json

def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--input',
        type=str,
        required=True,
        help="Input path of text-to-image Jsonl annotation file."
    )
    return parser.parse_args()

if __name__ == "__main__":
    args = parse_args()

    t2i_record = dict()

    with open(args.input, "r", encoding="utf-8") as fin:
        for line in tqdm(fin):
            obj = json.loads(line.strip())
            text_id = obj['text_id']
            image_ids = obj['image_ids']
            for image_id in image_ids:
                if image_id not in t2i_record:
                    t2i_record[image_id] = []
                t2i_record[image_id].append(text_id)

    with open(args.input.replace(".jsonl", "") + ".tr.jsonl", "w", encoding="utf-8") as fout:
        for image_id, text_ids in t2i_record.items():
            out_obj = {"image_id": image_id, "text_ids": text_ids}
            fout.write("{}\n".format(json.dumps(out_obj)))

    print("Done!")
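As a sketch with a placeholder annotation path: the converter above inverts a text-to-image annotation file into its image-to-text counterpart, writing the result next to the input with a .tr.jsonl suffix.

# Input file name is hypothetical; the output would be written to test_texts.tr.jsonl.
python eval/transform_ir_annotation_to_tr.py --input test_texts.jsonl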
eval/zeroshot_evaluation.py
ADDED
@@ -0,0 +1,267 @@
# -*- coding: utf-8 -*-
'''
This script performs zero-shot evaluation on ImageNet-1K. (with single-GPU)
'''

import os
import argparse
from pathlib import Path
import json
from tqdm import tqdm

import torch

from clip.model import convert_weights, CLIP
from clip import tokenize
from clip.utils import image_transform
from eval.data import get_zeroshot_dataset, _preprocess_text
from eval.cvinw_zeroshot_templates import (
    openai_templates,
    flower_templates,
    food_templates,
    aircraft_templates,
    eurosat_templates,
    country211_templates,
)


def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--vision-model",
        choices=["ViT-B-16", "ViT-L-14", "RN50"],
        default="ViT-B-16",
        help="Name of the vision backbone to use.",
    )
    parser.add_argument(
        "--text-model",
        choices=["RoBERTa-wwm-ext-base-chinese", "RoBERTa-wwm-ext-large-chinese", "RBT3-chinese"],
        default="RoBERTa-wwm-ext-base-chinese",
        help="Name of the text backbone to use.",
    )
    parser.add_argument(
        "--precision",
        choices=["amp", "fp16", "fp32"],
        default="amp",
        help="Floating point precision."
    )
    parser.add_argument(
        "--label-file",
        type=str,
        help="file for labels",
    )
    parser.add_argument(
        "--datapath",
        type=str,
        required=True,
        help="Path to the test set for conducting zero shot evaluation.",
    )
    parser.add_argument(
        "--dataset",
        type=str,
        default="imagenet",
        help="Specified dataset.",
    )
    parser.add_argument(
        "--index",
        type=str,
        default="",
        help="Specify image paths.",
    )
    parser.add_argument(
        "--save-dir",
        type=str,
        default="",
        help="Specify the directory for saving outputs.",
    )
    # parser.add_argument(
    #     "--imagenet-val",
    #     type=str,
    #     required=True,
    #     help="Path to imagenet val set for conducting zero shot evaluation.",
    # )
    parser.add_argument(
        "--img-batch-size", type=int, default=64, help="Image batch size."
    )
    parser.add_argument(
        "--context-length",
        type=int,
        default=52,
        help="The maximum length of input text (include [CLS] & [SEP] tokens)."
    )
    parser.add_argument(
        "--resume",
        default=None,
        type=str,
        help="path to latest checkpoint (default: none)",
    )
    parser.add_argument(
        "--num-workers", type=int, default=4, help="Number of workers for ImageNet dataloader."
    )
    args = parser.parse_args()

    return args

# Used by https://github.com/openai/CLIP/issues/83 but not below.
# Keeping it in case needed.
def convert_models_to_fp32(model):
    for p in model.parameters():
        p.data = p.data.float()
        if p.grad:
            p.grad.data = p.grad.data.float()


def zero_shot_classifier(model, classnames, templates, args):
    with torch.no_grad():
        zeroshot_weights = []
        for classname in tqdm(classnames):
            texts = [_preprocess_text(template(classname)) for template in templates]  # format with class
            texts = tokenize(texts, context_length=args.context_length).to(args.gpu)  # tokenize
            class_embeddings = model(None, texts)
            class_embeddings /= class_embeddings.norm(dim=-1, keepdim=True)
            class_embedding = class_embeddings.mean(dim=0)
            class_embedding /= class_embedding.norm()
            zeroshot_weights.append(class_embedding)
        zeroshot_weights = torch.stack(zeroshot_weights, dim=1).to(args.gpu)
    return zeroshot_weights


def accuracy(output, target, topk=(1,)):
    pred = output.topk(max(topk), 1, True, True)[1].t()
    correct = pred.eq(target.view(1, -1).expand_as(pred))
    return [float(correct[:k].reshape(-1).float().sum(0, keepdim=True).cpu().numpy()) for k in topk]


def run(model, classifier, dataloader, args):
    total_logits = []
    total_targets = []
    with torch.no_grad():
        top1, top5, n = 0.0, 0.0, 0.0
        for images, target in tqdm(dataloader):
            images = images.to(args.gpu)
            target = target.to(args.gpu)
            total_targets.append(target)

            # predict
            image_features = model(images, None)
            image_features /= image_features.norm(dim=-1, keepdim=True)
            logits = (100.0 * image_features @ classifier).softmax(dim=-1)
            total_logits.append(logits)

            # measure accuracy
            acc1, acc5 = accuracy(logits, target, topk=(1, 1))
            top1 += acc1
            n += images.size(0)

    outputs = torch.cat(total_logits, dim=0)
    targets = torch.cat(total_targets, dim=0)

    if getattr(args, "index", ""):
        print("Use index to rearrange the logits...")
        with open(args.index, "r", encoding="utf-8") as f:
            index = json.load(f)
        print(index)
        outputs = outputs[index]
        targets = targets[index]
        print(targets)

    top1 = top1 / n

    return top1, outputs


if __name__ == "__main__":
    args = parse_args()

    # Log params.
    print("Params:")
    for name in sorted(vars(args)):
        val = getattr(args, name)
        print(f"  {name}: {val}")

    args.gpu = 0
    torch.cuda.set_device(args.gpu)

    # Initialize the model.
    vision_model_config_file = Path(__file__).parent.parent / f"clip/model_configs/{args.vision_model.replace('/', '-')}.json"
    print('Loading vision model config from', vision_model_config_file)
    assert os.path.exists(vision_model_config_file)

    text_model_config_file = Path(__file__).parent.parent / f"clip/model_configs/{args.text_model.replace('/', '-')}.json"
    print('Loading text model config from', text_model_config_file)
    assert os.path.exists(text_model_config_file)

    with open(vision_model_config_file, 'r') as fv, open(text_model_config_file, 'r') as ft:
        model_info = json.load(fv)
        if isinstance(model_info['vision_layers'], str):
            model_info['vision_layers'] = eval(model_info['vision_layers'])
        for k, v in json.load(ft).items():
            model_info[k] = v

    model = CLIP(**model_info)
    convert_weights(model)

    # See https://discuss.pytorch.org/t/valueerror-attemting-to-unscale-fp16-gradients/81372
    if args.precision == "amp" or args.precision == "fp32":
        convert_models_to_fp32(model)
    model.cuda(args.gpu)
    if args.precision == "fp16":
        convert_weights(model)

    # Get eval data.
    print("Preparing zeroshot dataset.")
    data = {}
    print(f"{model_info['image_resolution']}")
    data[args.dataset] = get_zeroshot_dataset(
        args, image_transform(model_info["image_resolution"])
    )

    # Resume from a checkpoint.
    print("Begin to load model checkpoint from {}.".format(args.resume))
    assert os.path.exists(args.resume), "The checkpoint file {} does not exist!".format(args.resume)
    # Map model to be loaded to specified single gpu.
    loc = "cuda:{}".format(args.gpu)
    checkpoint = torch.load(args.resume, map_location='cpu')
    start_epoch = checkpoint["epoch"]
    sd = checkpoint["state_dict"]
    if next(iter(sd.items()))[0].startswith('module'):
        sd = {k[len('module.'):]: v for k, v in sd.items() if "bert.pooler" not in k}
    model.load_state_dict(sd, strict=False)
    print(
        f"=> loaded checkpoint '{args.resume}' (epoch {checkpoint['epoch']} @ {checkpoint['step']} steps)"
    )

    # Compute ensembled class embeddings
    print('Building zero-shot classifier')

    model.eval()

    f = open(args.label_file, "r", encoding="utf8")
    classnames = [line.strip() for line in f.readlines()]

    template_dict = {
        "fgvc-aircraft-2013b-variants102": aircraft_templates,
        "food-101": food_templates,
        "oxford-flower-102": flower_templates,
        "eurosat_clip": eurosat_templates,
        "resisc45_clip": eurosat_templates,
        "country211": country211_templates,
        "openai": openai_templates,
    }
    if args.dataset in template_dict.keys():
        templates = template_dict[args.dataset]
    else:
        templates = template_dict['openai']

    # Make inference and evaluation
    print('Using classifier')
    classifier = zero_shot_classifier(model, classnames, templates, args)
    results = {}
    top1, logits = run(model, classifier, data[args.dataset].dataloader, args)

    results["zeroshot-top1"] = top1

    print('Result:')
    print(", ".join(["{}: {}".format(k, v) for k, v in results.items()]))
    print('Finished.')
examples/pokemon.jpeg
ADDED
requirements.txt
ADDED
@@ -0,0 +1,10 @@
numpy
tqdm
six
timm
lmdb==1.3.0
torch>=1.7.1
torchvision
webdataset
pandas
transformers
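The dependencies above can be installed in the usual way, for example:

pip install -r requirements.txt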
scripts/zeroshot_eval.sh
ADDED
@@ -0,0 +1,34 @@
#!/bin/bash

# Usage: see example script below.
# bash run_scripts/zeroshot_eval.sh 0 \
#     ${path_to_dataset} ${dataset_name} \
#     ViT-B-16 RoBERTa-wwm-ext-base-chinese \
#     ${ckpt_path}

# only supports single-GPU inference
export CUDA_VISIBLE_DEVICES=${1}
export PYTHONPATH=${PYTHONPATH}:`pwd`/QA-CLIP-main

path=${2}
dataset=${3}
datapath=${path}
savedir=`pwd`/save_predictions
vision_model=${4} # ViT-B-16
text_model=${5}
resume=${6}
label_file=`pwd`/label_cn.txt
index=${7:-}

mkdir -p ${savedir}

python -u eval/zeroshot_evaluation.py \
    --datapath="${datapath}" \
    --label-file=${label_file} \
    --save-dir=${savedir} \
    --dataset=${dataset} \
    --index=${index} \
    --img-batch-size=64 \
    --resume=${resume} \
    --vision-model=${vision_model} \
    --text-model=${text_model}
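Following the usage comment at the top of the script, a concrete call might look as follows; the dataset path, dataset name and checkpoint path are placeholders, and label_cn.txt is expected in the working directory.

# Placeholder paths for illustration.
bash scripts/zeroshot_eval.sh 0 \
    /path/to/dataset imagenet \
    ViT-B-16 RoBERTa-wwm-ext-base-chinese \
    /path/to/QA-CLIP-checkpoint.pt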