Add transformers.js tag + sample code
#10
by Xenova (HF staff) - opened

README.md CHANGED
@@ -5,6 +5,7 @@ tags:
 - mteb
 - clip
 - vision
+- transformers.js
 language: en
 inference: false
 license: apache-2.0
@@ -77,6 +78,44 @@ print(cos_sim(text_embeddings[1], image_embeddings[0])) # text-image cross-modal
 print(cos_sim(text_embeddings[1], image_embeddings[1])) # text-image cross-modal similarity
 ```
 
+3. JavaScript developers can use Jina CLIP via the [Transformers.js](https://huggingface.co/docs/transformers.js) library. Note that to use this model, you need to install Transformers.js [v3](https://github.com/xenova/transformers.js/tree/v3) from source using `npm install xenova/transformers.js#v3`.
+
+```js
+import { AutoTokenizer, CLIPTextModelWithProjection, AutoProcessor, CLIPVisionModelWithProjection, RawImage, cos_sim } from '@xenova/transformers';
+
+// Load tokenizer and text model
+const tokenizer = await AutoTokenizer.from_pretrained('jinaai/jina-clip-v1');
+const text_model = await CLIPTextModelWithProjection.from_pretrained('jinaai/jina-clip-v1');
+
+// Load processor and vision model
+const processor = await AutoProcessor.from_pretrained('Xenova/clip-vit-base-patch32');
+const vision_model = await CLIPVisionModelWithProjection.from_pretrained('jinaai/jina-clip-v1');
+
+// Run tokenization
+const texts = ['Bridge close-shot', 'Bridge in far away'];
+const text_inputs = tokenizer(texts, { padding: true, truncation: true });
+
+// Compute text embeddings
+const { text_embeds } = await text_model(text_inputs);
+
+// Read images and run processor
+const urls = [
+    'https://fastly.picsum.photos/id/74/4288/2848.jpg?hmac=q02MzzHG23nkhJYRXR-_RgKTr6fpfwRgcXgE0EKvNB8',
+    'https://fastly.picsum.photos/id/84/1280/848.jpg?hmac=YFRYDI4UsfbeTzI8ZakNOR98wVU7a-9a2tGF542539s',
+];
+const image = await Promise.all(urls.map(url => RawImage.read(url)));
+const image_inputs = await processor(image);
+
+// Compute vision embeddings
+const { image_embeds } = await vision_model(image_inputs);
+
+// Compute similarities
+console.log(cos_sim(text_embeds[0].data, text_embeds[1].data)) // text embedding similarity
+console.log(cos_sim(text_embeds[0].data, image_embeds[0].data)) // text-image cross-modal similarity
+console.log(cos_sim(text_embeds[0].data, image_embeds[1].data)) // text-image cross-modal similarity
+console.log(cos_sim(text_embeds[1].data, image_embeds[0].data)) // text-image cross-modal similarity
+console.log(cos_sim(text_embeds[1].data, image_embeds[1].data)) // text-image cross-modal similarity
+```
 
 ## Performance
 
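Not part of the diff, but as a usage note: once the snippet has produced `text_embeds` and `image_embeds`, the same `cos_sim` helper can be used to rank the images for each text query. The `rankImages` function below is a hypothetical illustration (it is not in the model card or in Transformers.js) and assumes it runs after the snippet above, so `texts`, `urls`, `text_embeds`, `image_embeds`, and `cos_sim` are already in scope.

```js
// Hypothetical helper (not from the model card): rank images for each text
// query by cosine similarity, reusing cos_sim and the variables defined in
// the snippet above (texts, urls, text_embeds, image_embeds).
function rankImages(texts, urls, text_embeds, image_embeds) {
    return texts.map((text, t) => {
        // Score every image against the current text embedding
        const scores = urls.map((url, i) => ({
            url,
            score: cos_sim(text_embeds[t].data, image_embeds[i].data),
        }));
        // Highest-scoring image first
        scores.sort((a, b) => b.score - a.score);
        return { text, ranking: scores };
    });
}

console.log(rankImages(texts, urls, text_embeds, image_embeds));
```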