erfan226 committed on
Commit
c17fe90
1 Parent(s): 22cef6b

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +18 -43
README.md CHANGED
@@ -1,53 +1,28 @@
1
  erfan226/persian-t5-paraphraser
2
 
3
- This is a paraphrase model that works on the Persian language. It is based on [the monolingual T5 model](https://huggingface.co/Ahmad/parsT5-base)
4
- language:
5
- - {fa}
6
- # Usage (Sentence-Transformers)
7
 
8
- ```python
9
-
10
- pip install -U sentence-transformers
11
- Then you can use the model like this:
12
-
13
- from sentence_transformers import SentenceTransformer
14
- sentences = ["This is an example sentence", "Each sentence is converted"]
15
-
16
- model = SentenceTransformer('sentence-transformers/paraphrase-MiniLM-L6-v2')
17
- embeddings = model.encode(sentences)
18
- print(embeddings)
19
- Usage (HuggingFace Transformers)
20
- Without sentence-transformers, you can use the model like this: First, you pass your input through the transformer model, then you have to apply the right pooling operation on top of the contextualized word embeddings.
21
-
22
- from transformers import AutoTokenizer, AutoModel
23
- import torch
24
 
 
25
 
26
- #Mean Pooling - Take attention mask into account for correct averaging
27
- def mean_pooling(model_output, attention_mask):
28
- token_embeddings = model_output[0] #First element of model_output contains all token embeddings
29
- input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
30
- return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
31
-
32
-
33
- # Sentences we want sentence embeddings for
34
- sentences = ['This is an example sentence', 'Each sentence is converted']
35
-
36
- # Load model from HuggingFace Hub
37
- tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/paraphrase-MiniLM-L6-v2')
38
- model = AutoModel.from_pretrained('sentence-transformers/paraphrase-MiniLM-L6-v2')
39
-
40
- # Tokenize sentences
41
- encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')
42
 
43
- # Compute token embeddings
44
- with torch.no_grad():
45
- model_output = model(**encoded_input)
46
 
47
- # Perform pooling. In this case, mean pooling.
48
- sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
 
 
 
49
 
50
- print("Sentence embeddings:")
51
- print(sentence_embeddings)
 
 
52
 
53
  ```
 
1
  erfan226/persian-t5-paraphraser
2
 
3
+ This is a paraphrasing model for the Persian language. It is based on [the monolingual T5 model for Persian](https://huggingface.co/Ahmad/parsT5-base).
 
 
 
4
 
5
+ # Usage
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6
 
7
+ ```python
8
 
9
+ >>> pip install transformers
10
+ >>> from transformers import (T5ForConditionalGeneration, AutoTokenizer)
11
+ >>> import torch
 
 
 
 
 
 
 
 
 
 
 
 
 
12
 
13
+ model_path = 'erfan226/persian-t5-paraphraser'
14
+ model = T5ForConditionalGeneration.from_pretrained(model_path)
15
+ tokenizer = AutoTokenizer.from_pretrained(model_path)
16
 
17
+ def paraphrase(text):
18
+ input = tokenizer(text, return_tensors='pt', padding=True).to(model.device)
19
+ max_size = int(input.input_ids.shape[1] * 1.5 + 10)
20
+ out = model.generate(**input, encoder_no_repeat_ngram_size=4, do_sample=False, num_beams=10, max_length=max_size, no_repeat_ngram_size=4,)
21
+ return tokenizer.decode(out[0], skip_special_tokens=True)
22
 
23
+ for text1, text2 in zip(x, y):
24
+ print("Original:", text1)
25
+ print("Paraphrase:", paraphrase(text1))
26
+ print("Original Paraphrase:", text2)
27
 
28
  ```