nreimers committed on
Commit 7c3d562
Parent: 9902299

Update README.md

Files changed (1): README.md (+57 -20)
README.md CHANGED
@@ -10,7 +10,7 @@ tags:
 
 # sentence-transformers/msmarco-distilbert-base-tas-b
 
-This is a port of the [DistilBert TAS-B Model](https://huggingface.co/sebastian-hofstaetter/distilbert-dot-tas_b-b256-msmarco) to [sentence-transformers](https://www.SBERT.net) model: It maps sentences & paragraphs to a 768 dimensional dense vector space and can be used for tasks like clustering or semantic search.
+This is a port of the [DistilBert TAS-B Model](https://huggingface.co/sebastian-hofstaetter/distilbert-dot-tas_b-b256-msmarco) to [sentence-transformers](https://www.SBERT.net) model: It maps sentences & paragraphs to a 768 dimensional dense vector space and is optimized for the task of semantic search.
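To sanity-check the 768 dimensions claimed in the updated description, a minimal sketch using only documented sentence-transformers calls; the values in the comments are expectations, not captured output:

```python
from sentence_transformers import SentenceTransformer

# Load the ported model named above (downloads weights on first use)
model = SentenceTransformer('sentence-transformers/msmarco-distilbert-base-tas-b')

# Should print 768, matching the dimensionality stated in the description
print(model.get_sentence_embedding_dimension())

# encode() on a single string returns one vector of that size
emb = model.encode("How many people live in London?")
print(emb.shape)  # expected: (768,)
```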
 
@@ -25,12 +25,30 @@ pip install -U sentence-transformers
 Then you can use the model like this:
 
 ```python
-from sentence_transformers import SentenceTransformer
-sentences = ["This is an example sentence", "Each sentence is converted"]
+from sentence_transformers import SentenceTransformer, util
 
+query = "How many people live in London?"
+docs = ["Around 9 Million people live in London", "London is known for its financial district"]
+
+#Load the model
 model = SentenceTransformer('sentence-transformers/msmarco-distilbert-base-tas-b')
-embeddings = model.encode(sentences)
-print(embeddings)
+
+#Encode query and documents
+query_emb = model.encode(query)
+doc_emb = model.encode(docs)
+
+#Compute dot score between query and all document embeddings
+scores = util.dot_score(query_emb, doc_emb)[0].cpu().tolist()
+
+#Combine docs & scores
+doc_score_pairs = list(zip(docs, scores))
+
+#Sort by decreasing score
+doc_score_pairs = sorted(doc_score_pairs, key=lambda x: x[1], reverse=True)
+
+#Output passages & scores
+for doc, score in doc_score_pairs:
+    print(score, doc)
 ```
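The rewritten snippet ranks documents with `util.dot_score` rather than cosine similarity: the upstream model (note the `dot` in `distilbert-dot-tas_b-b256-msmarco`) was trained against dot-product scores, so unnormalized dot products are the intended ranking function. For comparison, a sketch of the same search with `util.cos_sim` (also part of `sentence_transformers.util`); the scores, and potentially the ranking, will differ from the dot-product run:

```python
from sentence_transformers import SentenceTransformer, util

model = SentenceTransformer('sentence-transformers/msmarco-distilbert-base-tas-b')

query_emb = model.encode("How many people live in London?")
doc_emb = model.encode(["Around 9 Million people live in London",
                        "London is known for its financial district"])

# cos_sim normalizes both vectors before the dot product, so scores land in [-1, 1];
# this model was trained for unnormalized dot products, so prefer dot_score for ranking
cos_scores = util.cos_sim(query_emb, doc_emb)[0].cpu().tolist()
print(cos_scores)
```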
@@ -42,30 +60,49 @@ Without [sentence-transformers](https://www.SBERT.net), you can use the model li
 from transformers import AutoTokenizer, AutoModel
 import torch
 
-def cls_pooling(model_output, attention_mask):
-    return model_output[0][:,0]
+#CLS Pooling - Take output from first token
+def cls_pooling(model_output):
+    return model_output.last_hidden_state[:,0]
+
+#Encode text
+def encode(texts):
+    # Tokenize sentences
+    encoded_input = tokenizer(texts, padding=True, truncation=True, return_tensors='pt')
+
+    # Compute token embeddings
+    with torch.no_grad():
+        model_output = model(**encoded_input, return_dict=True)
 
+    # Perform pooling
+    embeddings = cls_pooling(model_output)
+
+    return embeddings
 
 
 # Sentences we want sentence embeddings for
-sentences = ['This is an example sentence', 'Each sentence is converted']
+query = "How many people live in London?"
+docs = ["Around 9 Million people live in London", "London is known for its financial district"]
 
 # Load model from HuggingFace Hub
-tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/msmarco-distilbert-base-tas-b')
-model = AutoModel.from_pretrained('sentence-transformers/msmarco-distilbert-base-tas-b')
+tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/msmarco-distilbert-base-tas-b")
+model = AutoModel.from_pretrained("sentence-transformers/msmarco-distilbert-base-tas-b")
+
+#Encode query and docs
+query_emb = encode(query)
+doc_emb = encode(docs)
 
-# Tokenize sentences
-encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')
+#Compute dot score between query and all document embeddings
+scores = torch.mm(query_emb, doc_emb.transpose(0, 1))[0].cpu().tolist()
 
-# Compute token embeddings
-with torch.no_grad():
-    model_output = model(**encoded_input)
+#Combine docs & scores
+doc_score_pairs = list(zip(docs, scores))
 
-# Perform pooling. In this case, max pooling.
-sentence_embeddings = cls_pooling(model_output, encoded_input['attention_mask'])
+#Sort by decreasing score
+doc_score_pairs = sorted(doc_score_pairs, key=lambda x: x[1], reverse=True)
 
-print("Sentence embeddings:")
-print(sentence_embeddings)
+#Output passages & scores
+for doc, score in doc_score_pairs:
+    print(score, doc)
 ```
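Since the sentence-transformers model wraps the same DistilBERT weights with CLS pooling, the two usage paths should produce matching embeddings. A minimal sketch to verify that, where `st_model` and `hf_model` are illustrative names chosen to avoid clashing with the snippets above, and the `allclose` tolerance is an assumption:

```python
import torch
from transformers import AutoTokenizer, AutoModel
from sentence_transformers import SentenceTransformer

name = "sentence-transformers/msmarco-distilbert-base-tas-b"
st_model = SentenceTransformer(name)             # sentence-transformers route
tokenizer = AutoTokenizer.from_pretrained(name)  # manual route
hf_model = AutoModel.from_pretrained(name)

text = "How many people live in London?"

# Route 1: encode() applies tokenization, the forward pass, and CLS pooling internally
st_emb = torch.tensor(st_model.encode(text))

# Route 2: the same steps by hand, as in the snippet above
encoded = tokenizer(text, return_tensors='pt')
with torch.no_grad():
    out = hf_model(**encoded, return_dict=True)
hf_emb = out.last_hidden_state[:, 0][0]  # CLS token of the only sequence

# The two vectors should agree up to floating-point tolerance (atol is an assumption)
print(torch.allclose(st_emb, hf_emb, atol=1e-5))
```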
@@ -88,4 +125,4 @@ SentenceTransformer(
 
 ## Citing & Authors
 
-Have a look at: [DistilBert TAS-B Model](https://huggingface.co/sebastian-hofstaetter/distilbert-dot-tas_b-b256-msmarco
+Have a look at: [DistilBert TAS-B Model](https://huggingface.co/sebastian-hofstaetter/distilbert-dot-tas_b-b256-msmarco)