nreimers committed on
Commit 7c3d562
Parent: 9902299

Update README.md

Files changed (1): README.md (+57 -20)
README.md CHANGED
@@ -10,7 +10,7 @@ tags:
 
 # sentence-transformers/msmarco-distilbert-base-tas-b
 
-This is a port of the [DistilBert TAS-B Model](https://huggingface.co/sebastian-hofstaetter/distilbert-dot-tas_b-b256-msmarco) to [sentence-transformers](https://www.SBERT.net) model: It maps sentences & paragraphs to a 768 dimensional dense vector space and can be used for tasks like clustering or semantic search.
+This is a port of the [DistilBert TAS-B Model](https://huggingface.co/sebastian-hofstaetter/distilbert-dot-tas_b-b256-msmarco) to [sentence-transformers](https://www.SBERT.net) model: It maps sentences & paragraphs to a 768 dimensional dense vector space and is optimized for the task of semantic search.
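To sanity-check the 768 dimensions claimed in the updated description, a minimal sketch using only documented sentence-transformers calls; the values in the comments are expectations, not captured output:

```python
from sentence_transformers import SentenceTransformer

# Load the ported model named above (downloads weights on first use)
model = SentenceTransformer('sentence-transformers/msmarco-distilbert-base-tas-b')

# Should print 768, matching the dimensionality stated in the description
print(model.get_sentence_embedding_dimension())

# encode() on a single string returns one vector of that size
emb = model.encode("How many people live in London?")
print(emb.shape)  # expected: (768,)
```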
 
@@ -25,12 +25,30 @@ pip install -U sentence-transformers
 Then you can use the model like this:
 
 ```python
-from sentence_transformers import SentenceTransformer
-sentences = ["This is an example sentence", "Each sentence is converted"]
+from sentence_transformers import SentenceTransformer, util
 
+query = "How many people live in London?"
+docs = ["Around 9 Million people live in London", "London is known for its financial district"]
+
+#Load the model
 model = SentenceTransformer('sentence-transformers/msmarco-distilbert-base-tas-b')
-embeddings = model.encode(sentences)
-print(embeddings)
+
+#Encode query and documents
+query_emb = model.encode(query)
+doc_emb = model.encode(docs)
+
+#Compute dot score between query and all document embeddings
+scores = util.dot_score(query_emb, doc_emb)[0].cpu().tolist()
+
+#Combine docs & scores
+doc_score_pairs = list(zip(docs, scores))
+
+#Sort by decreasing score
+doc_score_pairs = sorted(doc_score_pairs, key=lambda x: x[1], reverse=True)
+
+#Output passages & scores
+for doc, score in doc_score_pairs:
+    print(score, doc)
 ```
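The rewritten snippet ranks documents with `util.dot_score` rather than cosine similarity: the upstream model (note the `dot` in `distilbert-dot-tas_b-b256-msmarco`) was trained against dot-product scores, so unnormalized dot products are the intended ranking function. For comparison, a sketch of the same search with `util.cos_sim` (also part of `sentence_transformers.util`); the scores, and potentially the ranking, will differ from the dot-product run:

```python
from sentence_transformers import SentenceTransformer, util

model = SentenceTransformer('sentence-transformers/msmarco-distilbert-base-tas-b')

query_emb = model.encode("How many people live in London?")
doc_emb = model.encode(["Around 9 Million people live in London",
                        "London is known for its financial district"])

# cos_sim normalizes both vectors before the dot product, so scores land in [-1, 1];
# this model was trained for unnormalized dot products, so prefer dot_score for ranking
cos_scores = util.cos_sim(query_emb, doc_emb)[0].cpu().tolist()
print(cos_scores)
```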
@@ -42,30 +60,49 @@ Without [sentence-transformers](https://www.SBERT.net), you can use the model li
 from transformers import AutoTokenizer, AutoModel
 import torch
 
-def cls_pooling(model_output, attention_mask):
-    return model_output[0][:,0]
+#CLS Pooling - Take output from first token
+def cls_pooling(model_output):
+    return model_output.last_hidden_state[:,0]
+
+#Encode text
+def encode(texts):
+    # Tokenize sentences
+    encoded_input = tokenizer(texts, padding=True, truncation=True, return_tensors='pt')
+
+    # Compute token embeddings
+    with torch.no_grad():
+        model_output = model(**encoded_input, return_dict=True)
 
+    # Perform pooling
+    embeddings = cls_pooling(model_output)
+
+    return embeddings
 
 
 # Sentences we want sentence embeddings for
-sentences = ['This is an example sentence', 'Each sentence is converted']
+query = "How many people live in London?"
+docs = ["Around 9 Million people live in London", "London is known for its financial district"]
 
 # Load model from HuggingFace Hub
-tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/msmarco-distilbert-base-tas-b')
-model = AutoModel.from_pretrained('sentence-transformers/msmarco-distilbert-base-tas-b')
+tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/msmarco-distilbert-base-tas-b")
+model = AutoModel.from_pretrained("sentence-transformers/msmarco-distilbert-base-tas-b")
+
+#Encode query and docs
+query_emb = encode(query)
+doc_emb = encode(docs)
 
-# Tokenize sentences
-encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')
+#Compute dot score between query and all document embeddings
+scores = torch.mm(query_emb, doc_emb.transpose(0, 1))[0].cpu().tolist()
 
-# Compute token embeddings
-with torch.no_grad():
-    model_output = model(**encoded_input)
+#Combine docs & scores
+doc_score_pairs = list(zip(docs, scores))
 
-# Perform pooling. In this case, max pooling.
-sentence_embeddings = cls_pooling(model_output, encoded_input['attention_mask'])
+#Sort by decreasing score
+doc_score_pairs = sorted(doc_score_pairs, key=lambda x: x[1], reverse=True)
 
-print("Sentence embeddings:")
-print(sentence_embeddings)
+#Output passages & scores
+for doc, score in doc_score_pairs:
+    print(score, doc)
 ```
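Since the sentence-transformers model wraps the same DistilBERT weights with CLS pooling, the two usage paths should produce matching embeddings. A minimal sketch to verify that, where `st_model` and `hf_model` are illustrative names chosen to avoid clashing with the snippets above, and the `allclose` tolerance is an assumption:

```python
import torch
from transformers import AutoTokenizer, AutoModel
from sentence_transformers import SentenceTransformer

name = "sentence-transformers/msmarco-distilbert-base-tas-b"
st_model = SentenceTransformer(name)             # sentence-transformers route
tokenizer = AutoTokenizer.from_pretrained(name)  # manual route
hf_model = AutoModel.from_pretrained(name)

text = "How many people live in London?"

# Route 1: encode() applies tokenization, the forward pass, and CLS pooling internally
st_emb = torch.tensor(st_model.encode(text))

# Route 2: the same steps by hand, as in the snippet above
encoded = tokenizer(text, return_tensors='pt')
with torch.no_grad():
    out = hf_model(**encoded, return_dict=True)
hf_emb = out.last_hidden_state[:, 0][0]  # CLS token of the only sequence

# The two vectors should agree up to floating-point tolerance (atol is an assumption)
print(torch.allclose(st_emb, hf_emb, atol=1e-5))
```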
@@ -88,4 +125,4 @@ SentenceTransformer(
 
 ## Citing & Authors
 
-Have a look at: [DistilBert TAS-B Model](https://huggingface.co/sebastian-hofstaetter/distilbert-dot-tas_b-b256-msmarco
+Have a look at: [DistilBert TAS-B Model](https://huggingface.co/sebastian-hofstaetter/distilbert-dot-tas_b-b256-msmarco)