FrancescoPeriti committed
Commit 9615074
1 Parent(s): 517ddd6

Update README.md

Files changed (1):
  1. README.md +61 -1
README.md CHANGED
@@ -22,10 +22,13 @@ The following `bitsandbytes` quantization config was used during training:
 
 ## Get it started
 ```python
-from peft import PeftModel, PeftConfig
+import torch
+from datasets import Dataset
 from huggingface_hub import login
+from peft import PeftModel, PeftConfig
 from transformers import AutoModelForCausalLM, AutoTokenizer, AddedToken
 
+# load model and tokenizer
 login("[YOUR HF TOKEN HERE FOR USING LLAMA]")
 config = PeftConfig.from_pretrained("ChangeIsKey/llama-7b-lexical-substitution")
 base_model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf", device_map='auto')
@@ -39,4 +42,61 @@ base_model.resize_token_embeddings(len(tokenizer))
 
 model = PeftModel.from_pretrained(base_model, "ChangeIsKey/llama-7b-lexical-substitution")
 model.eval()
+
+# let's use this model
+def formatting_func(records):
+    text_batch = []
+
+    for i in range(len(records['example'])):
+        example = records[i]['example']
+        start, end = records[i]['start'], records[i]['end']
+
+        target = f'**{example[start:end]}**'
+        input_text = f'{example[:start]} {target} {example[end:]}'
+        text_batch.append(f"{input_text}<|answer|>")
+
+    return text_batch
+
+def tokenization(dataset):
+    return tokenizer(formatting_func(dataset),
+                     truncation=True,
+                     max_length=512,
+                     padding=True,
+                     return_tensors="pt").to("cuda")
+
+
+# a toy example
+examples = [{'example': 'The traffic jam on the highway made everyone late for work.', 'start': 12, 'end': 15},
+            {'example': 'I spread a generous layer of strawberry jam on my toast this morning', 'start': 40, 'end': 43}]
+dataset = Dataset.from_list(examples)
+
+
+batch_size = 32
+output = list()
+
+with torch.no_grad():
+    for i in range(0, len(dataset), batch_size):
+        model_input = tokenization(dataset.select(range(i, min(dataset.num_rows, i + batch_size))))
+
+        output_ids = model.generate(**model_input,
+                                    do_sample=True,
+                                    num_return_sequences=1,
+                                    max_new_tokens=30,
+                                    temperature=0.00001,
+                                    repetition_penalty=1/0.85,
+                                    top_k=40,
+                                    top_p=0.1)
+
+        answers = tokenizer.batch_decode(output_ids, skip_special_tokens=False)
+
+        for answer in answers:
+            answer = " ".join(answer.split('<|answer|>')[1:])
+            substitutes = [s.strip() for s in answer.split('<|end|>')[:-1] if s.strip() != ""]
+            output.append(", ".join(substitutes))
+
+# output
+dataset = dataset.add_column('substitutes', output)
+for row in dataset:
+    target = row['example'][row['start']:row['end']]
+    print(f"Target: {target}\nExample: {row['example']}\nSubstitutes: {row['substitutes']}\n")
 ```
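
As a quick sanity check on the committed snippet, here is a minimal standalone sketch of the two string conventions it relies on: how `formatting_func` marks the target word and appends the `<|answer|>` prompt token, and how substitutes are parsed back out of the `<|end|>`-delimited generation. The continuation after `<|answer|>` (`congestion`, `gridlock`) is hypothetical, shown only to illustrate the delimiter layout; the string handling itself follows the diff literally.

```python
# Prompt construction, mirroring formatting_func in the diff above
example = 'The traffic jam on the highway made everyone late for work.'
start, end = 12, 15
target = f'**{example[start:end]}**'                              # '**jam**'
prompt = f'{example[:start]} {target} {example[end:]}<|answer|>'
print(prompt)
# The traffic  **jam**  on the highway made everyone late for work.<|answer|>

# Parsing, mirroring the decoding loop; the text after <|answer|> is a
# HYPOTHETICAL model output, used only to show the <|end|> delimiters
decoded = prompt + ' congestion<|end|> gridlock<|end|>'
answer = " ".join(decoded.split('<|answer|>')[1:])
substitutes = [s.strip() for s in answer.split('<|end|>')[:-1] if s.strip() != ""]
print(substitutes)
# ['congestion', 'gridlock']
```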