Update README.md
Browse files
README.md
CHANGED
@@ -69,24 +69,21 @@ from scipy.spatial.distance import cdist
|
|
69 |
tokenizer = AutoTokenizer.from_pretrained('cmarkea/bloomz-3b-retriever-v2')
|
70 |
model = AutoModel.from_pretrained('cmarkea/bloomz-3b-retriever-v2')
|
71 |
|
72 |
-
def
|
73 |
if isinstance(txt, str):
|
74 |
length = 0
|
75 |
else:
|
76 |
length = len(txt)
|
77 |
-
tok = tokenizer(txt, return_tensors='pt')
|
78 |
-
with torch.
|
79 |
embedding = model(**tok)
|
80 |
-
|
81 |
-
|
82 |
-
|
83 |
-
# Important: take only last token!
|
84 |
-
infer = lambda x: [ii[0][-1] for ii in retriever(x)]
|
85 |
|
86 |
list_of_contexts = [...]
|
87 |
-
emb_contexts =
|
88 |
list_of_queries = [...]
|
89 |
-
emb_queries =
|
90 |
|
91 |
# Important: take cosine distance!
|
92 |
dist = cdist(emb_queries, emb_contexts, 'cosine')
|
@@ -125,4 +122,17 @@ top_k = lambda x: [
|
|
125 |
|
126 |
# top 5 nearest contexts for each query
|
127 |
top_contexts = top_k(5)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
128 |
```
|
|
|
69 |
tokenizer = AutoTokenizer.from_pretrained('cmarkea/bloomz-3b-retriever-v2')
|
70 |
model = AutoModel.from_pretrained('cmarkea/bloomz-3b-retriever-v2')
|
71 |
|
72 |
+
def infer(txt: Union[str, List[str]]):
|
73 |
if isinstance(txt, str):
|
74 |
length = 0
|
75 |
else:
|
76 |
length = len(txt)
|
77 |
+
tok = tokenizer(txt, padding=True, return_tensors='pt')
|
78 |
+
with torch.no_grad():
|
79 |
embedding = model(**tok)
|
80 |
+
# Important: take only last token!
|
81 |
+
return embedding.get('last_hidden_state')[:,-1,:].numpy()
|
|
|
|
|
|
|
82 |
|
83 |
list_of_contexts = [...]
|
84 |
+
emb_contexts = infer(list_of_contexts)
|
85 |
list_of_queries = [...]
|
86 |
+
emb_queries = infer(list_of_queries)
|
87 |
|
88 |
# Important: take cosine distance!
|
89 |
dist = cdist(emb_queries, emb_contexts, 'cosine')
|
|
|
122 |
|
123 |
# top 5 nearest contexts for each query
|
124 |
top_contexts = top_k(5)
|
125 |
+
```
|
126 |
+
|
127 |
+
Citation
|
128 |
+
--------
|
129 |
+
|
130 |
+
```bibtex
|
131 |
+
@online{DeBloomzRetv2,
|
132 |
+
AUTHOR = {Cyrile Delestre},
|
133 |
+
ORGANIZATION = {Cr{\'e}dit Mutuel Ark{\'e}a},
|
134 |
+
URL = {https://huggingface.co/cmarkea/bloomz-3b-retriever-v2},
|
135 |
+
YEAR = {2024},
|
136 |
+
KEYWORDS = {NLP ; Transformers ; LLM ; Bloomz},
|
137 |
+
}
|
138 |
```
|