JerryLiJinyi committed on
Commit
10b912d
·
verified ·
1 Parent(s): 9277ca2

Upload 127 files

Browse files

Upload everything

This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. SCRL_new/Makefile +93 -0
  2. SCRL_new/README.md +137 -0
  3. SCRL_new/bin/evaluate.py +144 -0
  4. SCRL_new/bin/evaluate_hc_output.py +132 -0
  5. SCRL_new/bin/predict.py +53 -0
  6. SCRL_new/bin/run_hc.py +80 -0
  7. SCRL_new/bin/train.py +157 -0
  8. SCRL_new/config/example.json +30 -0
  9. SCRL_new/config/gigaword-L8.json +37 -0
  10. SCRL_new/config/hc.json +16 -0
  11. SCRL_new/config/newsroom-CR75.json +37 -0
  12. SCRL_new/config/newsroom-L11.json +37 -0
  13. SCRL_new/data/test-data/bnc.jsonl +0 -0
  14. SCRL_new/data/test-data/broadcast.jsonl +0 -0
  15. SCRL_new/data/test-data/duc2004.jsonl +0 -0
  16. SCRL_new/data/test-data/gigaword.jsonl +0 -0
  17. SCRL_new/data/test-data/google.jsonl +0 -0
  18. SCRL_new/data/test-data/newsroom.jsonl +280 -0
  19. SCRL_new/example.py +23 -0
  20. SCRL_new/images/model.png +0 -0
  21. SCRL_new/loaders/gigaword.py +52 -0
  22. SCRL_new/loaders/newsroom.py +51 -0
  23. SCRL_new/requirements.txt +5 -0
  24. SCRL_new/scrl/__init__.py +0 -0
  25. SCRL_new/scrl/config.py +65 -0
  26. SCRL_new/scrl/config_hc.py +50 -0
  27. SCRL_new/scrl/data.py +24 -0
  28. SCRL_new/scrl/eval_metrics.py +24 -0
  29. SCRL_new/scrl/hill_climbing.py +166 -0
  30. SCRL_new/scrl/model.py +75 -0
  31. SCRL_new/scrl/rewards.py +330 -0
  32. SCRL_new/scrl/sampling.py +99 -0
  33. SCRL_new/scrl/training.py +346 -0
  34. SCRL_new/scrl/utils.py +86 -0
  35. SCRL_new/setup.py +8 -0
  36. abs_compressor.py +44 -0
  37. kis.py +47 -0
  38. models/gigaword-L8/checkpoints/best_val_reward-7700/classifier.bin +3 -0
  39. models/gigaword-L8/checkpoints/best_val_reward-7700/encoder.bin/config.json +3 -0
  40. models/gigaword-L8/checkpoints/best_val_reward-7700/encoder.bin/pytorch_model.bin +3 -0
  41. models/gigaword-L8/config.json +37 -0
  42. models/gigaword-L8/series/argmax_len.npy +3 -0
  43. models/gigaword-L8/series/argmax_reward.npy +3 -0
  44. models/gigaword-L8/series/label_variance.npy +3 -0
  45. models/gigaword-L8/series/loss.npy +3 -0
  46. models/gigaword-L8/series/mean_max_prob.npy +3 -0
  47. models/gigaword-L8/series/reward_Fluency.npy +3 -0
  48. models/gigaword-L8/series/reward_GaussianLength.npy +3 -0
  49. models/gigaword-L8/series/reward_SentenceMeanSimilarity.npy +3 -0
  50. models/gigaword-L8/series/sample_prob.npy +3 -0
SCRL_new/Makefile ADDED
@@ -0,0 +1,93 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# User-tunable settings. Override any of them on the command line,
# e.g. `make train DEVICE=cuda CONFIG=config/gigaword-L8.json`.
# PYTHON selects the interpreter used by every recipe (defaults to `python`).
PYTHON ?= python
CONFIG ?= config/example.json
DEVICE ?= cpu
MODELDIR ?= models/newsroom-P75/model-dirs/best_val_reward-7950
# NOTE: TESTSET is not referenced by any rule in this file; kept so existing
# invocations that pass it keep working.
TESTSET ?= data/test-data/broadcast.jsonl
HC_OUTPUT ?= data/hc-outputs/hc.L11.google.jsonl

# TRAINING

# Train a model from the settings in $(CONFIG) on $(DEVICE).
.PHONY: train
train:
	$(PYTHON) bin/train.py --verbose --config $(CONFIG) --device $(DEVICE)

# EVALUATING SCRL MODELS (predict + evaluate)
# Each target evaluates the model in $(MODELDIR) on one test set; the extra
# flags (--max-chars, --pretokenized) differ per dataset.

.PHONY: eval-google
eval-google:
	$(PYTHON) bin/evaluate.py \
		--model-dir $(MODELDIR) \
		--device $(DEVICE) \
		--dataset data/test-data/google.jsonl


.PHONY: eval-duc2004
eval-duc2004:
	$(PYTHON) bin/evaluate.py \
		--model-dir $(MODELDIR) \
		--device $(DEVICE) \
		--dataset data/test-data/duc2004.jsonl \
		--max-chars 75


.PHONY: eval-gigaword
eval-gigaword:
	$(PYTHON) bin/evaluate.py \
		--model-dir $(MODELDIR) \
		--device $(DEVICE) \
		--dataset data/test-data/gigaword.jsonl \
		--pretokenized


.PHONY: eval-broadcast
eval-broadcast:
	$(PYTHON) bin/evaluate.py \
		--model-dir $(MODELDIR) \
		--device $(DEVICE) \
		--dataset data/test-data/broadcast.jsonl \
		--pretokenized


.PHONY: eval-bnc
eval-bnc:
	$(PYTHON) bin/evaluate.py \
		--model-dir $(MODELDIR) \
		--device $(DEVICE) \
		--dataset data/test-data/bnc.jsonl \
		--pretokenized


# EVALUATE HILL CLIMBING SEARCH
# Each target scores the hill-climbing outputs in $(HC_OUTPUT) against one
# test set; pass the matching HC_OUTPUT file for the dataset being evaluated.

.PHONY: hc-eval-google
hc-eval-google:
	$(PYTHON) bin/evaluate_hc_output.py \
		--dataset data/test-data/google.jsonl \
		--outputs $(HC_OUTPUT)


.PHONY: hc-eval-duc2004
hc-eval-duc2004:
	$(PYTHON) bin/evaluate_hc_output.py \
		--dataset data/test-data/duc2004.jsonl \
		--outputs $(HC_OUTPUT)


.PHONY: hc-eval-gigaword
hc-eval-gigaword:
	$(PYTHON) bin/evaluate_hc_output.py \
		--dataset data/test-data/gigaword.jsonl \
		--outputs $(HC_OUTPUT)


.PHONY: hc-eval-broadcast
hc-eval-broadcast:
	$(PYTHON) bin/evaluate_hc_output.py \
		--dataset data/test-data/broadcast.jsonl \
		--outputs $(HC_OUTPUT)


.PHONY: hc-eval-bnc
hc-eval-bnc:
	$(PYTHON) bin/evaluate_hc_output.py \
		--dataset data/test-data/bnc.jsonl \
		--outputs $(HC_OUTPUT)
SCRL_new/README.md ADDED
@@ -0,0 +1,137 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+
3
+ # Sentence Compression with Reinforcement Learning
4
+
5
+ Code for the ACL 2022 paper [Efficient Unsupervised Sentence Compression by Fine-tuning Transformers with Reinforcement Learning](https://arxiv.org/abs/2205.08221).
6
+
7
+ Model architecture used in this work:
8
+
9
+ <img src="images/model.png" alt="drawing" width="350"/>
10
+
11
+ ### Install `scrl` library
12
+ The library is used for training, for producing summaries with existing models, and for evaluation. It works with Python 3.7/3.8.
13
+
14
+ 1. Create environment <br>
15
+ `conda create -n my_env python=3.8` with conda, or with venv: `python3.8 -m venv <env path>` <br>
16
+
17
+ 2. Activate the environment <br>
18
+ `conda activate my_env` with conda, otherwise: `source <env path>/bin/activate`
19
+
20
+ 3. Install dependencies & library in development mode: <br>
21
+ `pip install -r requirements.txt` <br>
22
+ `pip install -e .`
23
+
24
+ ### Data
25
+ The full contents of the `data` folder can be found in [this google drive folder](https://drive.google.com/drive/folders/1grkgZhtdd-Bw45GAnHza9RRb5OVQG4pK?usp=sharing).
26
+ In particular, `models` are required to use and evaluate our trained models, `train-data` to train new models, and `hc-outputs` to analyse/evaluate outputs of the hill climbing baseline.
27
+
28
+ ### Using a model
29
+
30
+ We trained 3 models which were used in our evaluation:
31
+ * `gigaword-L8` - trained to predict summaries of 8 tokens; trained on Gigaword to match preprocessing of test set
32
+ * `newsroom-L11` - trained to predict summaries of 11 tokens
33
+ * `newsroom-P75` - trained to reduce sentences to 75% of their original length
34
+
35
+ To use a trained model in Python, we need its model directory and the correct pretrained model ID for the tokenizer corresponding to the original pretrained model that the sentence compression model was initialised with:
36
+ ```python
37
+ from scrl.model import load_model
38
+ from transformers import AutoTokenizer
39
+
40
+ # model_dir = "data/models/gigaword-L8/"
41
+ # model_dir = "data/models/newsroom-L11/"
42
+ model_dir = "data/models/newsroom-P75/"
43
+ device = "cpu"
44
+ model = load_model(model_dir, device)
45
+ tokenizer = AutoTokenizer.from_pretrained("distilroberta-base")
46
+ sources = [
47
+ """
48
+ Most remaining Covid restrictions in Victoria have now been removed for those who are fully vaccinated, with the state about to hit its 90% vaccinated target.
49
+ """.strip()
50
+ ]
51
+ summaries = model.predict(sources, tokenizer, device)
52
+ for s in summaries:
53
+ print(s)
54
+ ```
55
+
56
+ You can run this code with [example.py](example.py)
57
+
58
+
59
+ ### Training a new model
60
+
61
+ A new model needs a new config file (examples in [config](config)) for various settings, e.g. training dataset, reward functions, model directory, steps.
62
+
63
+
64
+ `python bin/train.py --verbose --config config/example.json --device cuda`
65
+
66
+ You can also change the device to `cpu` to try it out locally.
67
+
68
+ Training can be interrupted with `Ctrl+C` and continued by re-running the same command which will pick up from the latest saved checkpoint. Add `--fresh` to delete the previous training progress and start from scratch.
69
+
70
+
71
+ ### Evaluation
72
+
73
+ The evaluation results can be replicated with the following Make commands, which run with slightly different settings depending on the dataset:
74
+
75
+ ```bash
76
+ make eval-google MODELDIR=data/models/newsroom-L11
77
+ make eval-duc2004 MODELDIR=data/models/newsroom-L11
78
+ make eval-gigaword MODELDIR=data/models/gigaword-L8
79
+ make eval-broadcast MODELDIR=data/models/newsroom-P75
80
+ make eval-bnc MODELDIR=data/models/newsroom-P75
81
+ ```
82
+
83
+ To evaluate on a custom dataset, check out [bin/evaluate.py](bin/evaluate.py) and its arguments.
84
+
85
+
86
+ ### Hill Climbing Baseline
87
+
88
+ We implemented a search-based baseline for sentence compression using hill climbing, based on [Discrete Optimization for Unsupervised Sentence Summarization with Word-Level Extraction](https://arxiv.org/abs/2005.01791). One difference from the original method is that we only restart the search when no unknown neighbour state can be found, i.e. dynamically rather than at fixed intervals.
89
+
90
+ **Producing summaries**<br>
91
+ The budget of search steps is controlled with `--steps`.
92
+ ```bash
93
+ python bin/run_hc.py \
94
+ --config config/hc.json \
95
+ --steps 10 \
96
+ --target-len 11 \
97
+ --dataset data/test-data/google.jsonl \
98
+ --output data/hc-outputs/example.jsonl \
99
+ --device cpu
100
+ ```
101
+
102
+
103
+ **Evaluation** <br>
104
+
105
+ For datasets used in the paper:
106
+ ```bash
107
+ make hc-eval-google HC_OUTPUT=data/hc-outputs/hc.L11.google.jsonl
108
+ make hc-eval-duc2004 HC_OUTPUT=data/hc-outputs/hc.L11.duc2004.jsonl
109
+ make hc-eval-gigaword HC_OUTPUT=data/hc-outputs/hc.L8.gigaword.jsonl
110
+ make hc-eval-broadcast HC_OUTPUT=data/hc-outputs/hc.P75.broadcast.jsonl
111
+ make hc-eval-bnc HC_OUTPUT=data/hc-outputs/hc.P75.bnc.jsonl
112
+ ```
113
+
114
+ Example for custom dataset:
115
+ ```
116
+ python bin/evaluate_hc_output.py \
117
+ --dataset data/test-data/google.jsonl \
118
+ --outputs data/hc-outputs/hc.L11.google.jsonl
119
+ ```
120
+
121
+ ### Citation
122
+
123
+ ⚠️ Please refer to the arXiv version of the paper; there is a typo in the original ACL version (Table 3, ROUGE-1 column, Gigaword-SCRL-8 row).
124
+
125
+ ```
126
+ @inproceedings{ghalandari-etal-2022-efficient,
127
+ title = "Efficient Unsupervised Sentence Compression by Fine-tuning Transformers with Reinforcement Learning",
128
+ author = "Gholipour Ghalandari, Demian and Hokamp, Chris and Ifrim, Georgiana",
129
+ booktitle = "Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)",
130
+ month = may,
131
+ year = "2022",
132
+ address = "Dublin, Ireland",
133
+ publisher = "Association for Computational Linguistics",
134
+ url = "https://arxiv.org/abs/2205.08221",
135
+ pages = "1267--1280",
136
+ }
137
+ ```
SCRL_new/bin/evaluate.py ADDED
@@ -0,0 +1,144 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import json
3
+ import numpy as np
4
+ import tqdm
5
+ from pathlib import Path
6
+ from pprint import pprint
7
+ from collections import defaultdict, Counter
8
+
9
+ from transformers import AutoTokenizer
10
+ import sys
11
+ sys.path.append("/home/hdd/lijinyi/CompressionInAvalon/promptcompressor/SCRL_new")
12
+ print(sys.path)
13
+ import scrl.utils as utils
14
+ from scrl.model import load_checkpoint, load_model
15
+ from scrl.eval_metrics import compute_token_f1, rouge_scorer, ROUGE_TYPES
16
+ from nltk import word_tokenize
17
+ import nltk
18
+
19
+ nltk.download('punkt')
20
+ print("punkt done!")
21
+
22
+
23
def main(args):
    """Evaluate a trained model on a JSONL dataset and print aggregate metrics.

    Exactly one of ``args.model_dir`` / ``args.checkpoint`` must be given.
    Each dataset item supplies a source sentence (``"text"``) and a list of
    reference summaries (``"summaries"``). Per-item scores are averaged over
    the references, then averaged over the dataset and printed.

    Raises:
        Exception: if both or neither of ``--model-dir`` / ``--checkpoint``
            are provided.
    """
    if args.model_dir is not None and args.checkpoint is None:
        model = load_model(
            Path(args.model_dir), device=args.device, prefix="best"
        )
    elif args.model_dir is None and args.checkpoint is not None:
        model = load_checkpoint(Path(args.checkpoint), device=args.device)
    else:
        raise Exception("Provide either a model directory or checkpoint.")

    # The model selected above is used as-is. An earlier unconditional
    # `load_model(Path(args.model_dir), ...)` here overrode a model loaded via
    # --checkpoint and crashed on Path(None) when only --checkpoint was given.
    tokenizer = AutoTokenizer.from_pretrained("distilroberta-base")

    dataset = list(utils.read_jsonl(args.dataset))

    all_scores = defaultdict(list)

    for item in tqdm.tqdm(dataset):
        src = item["text"]
        if args.lower_src:
            src = src.lower()
        tgts = item["summaries"]
        pred = model.predict([src], tokenizer, args.device)[0]

        # Optional hard character budget on the prediction.
        if args.max_chars > 0:
            pred = pred[:args.max_chars]

        pred_tokens = word_tokenize(pred)
        if args.lower_summary:
            pred_tokens = [t.lower() for t in pred_tokens]

        # Source is tokenized once, with the method matching the dataset.
        if args.pretokenized:
            src_tokens = src.split()
        else:
            src_tokens = word_tokenize(src)

        # Scores against every reference; averaged per item below.
        item_scores = defaultdict(list)
        for tgt in tgts:
            if args.pretokenized:
                tgt_tokens = tgt.split()
            else:
                tgt_tokens = word_tokenize(tgt)
            if args.lower_summary:
                tgt_tokens = [t.lower() for t in tgt_tokens]

            token_fscore = compute_token_f1(tgt_tokens, pred_tokens, use_counts=True)

            rouge_scores = rouge_scorer.score(tgt, pred)
            for rouge_type, rouge_type_scores in rouge_scores.items():
                item_scores[f"{rouge_type}-p"].append(rouge_type_scores.precision)
                item_scores[f"{rouge_type}-r"].append(rouge_type_scores.recall)
                item_scores[f"{rouge_type}-f"].append(rouge_type_scores.fmeasure)

            item_scores["token-f1"].append(token_fscore)
            item_scores["tgt-len"].append(len(tgt_tokens))
            item_scores["tgt-cr"].append(len(tgt_tokens) / len(src_tokens))

        for k, values in item_scores.items():
            item_mean = np.mean(values)
            all_scores[k].append(item_mean)

        all_scores["pred-len"].append(len(pred_tokens))
        all_scores["src-len"].append(len(src_tokens))
        all_scores["pred-cr"].append(len(pred_tokens) / len(src_tokens))

        if args.verbose:
            print("SRC:", src)
            print("TGT:", tgts[0])
            print("PRED:", pred)
            print("=" * 100)

    print("="*100)
    print("RESULTS:")

    print("="*20, "Length (#tokens):", "="*20)
    for metric in ("src-len", "tgt-len", "pred-len"):
        mean = np.mean(all_scores[metric])
        print(f"{metric}: {mean:.2f}")
    print()

    print("="*20, "Compression ratio:", "="*20)
    for metric in ("tgt-cr", "pred-cr"):
        mean = np.mean(all_scores[metric])
        print(f"{metric}: {mean:.2f}")
    print()

    print("="*20, "Token F1-Score:", "="*20)
    mean = np.mean(all_scores["token-f1"])
    print(f"f1-score: {mean:.3f}")
    print()

    print("="*20, "ROUGE F1-Scores:", "="*20)
    for rouge_type in ROUGE_TYPES:
        mean = np.mean(all_scores[f"{rouge_type}-f"])
        print(f"{rouge_type}: {mean:.4f}")
    print()

    print("="*20, "ROUGE Recall:", "="*20)
    for rouge_type in ROUGE_TYPES:
        mean = np.mean(all_scores[f"{rouge_type}-r"])
        print(f"{rouge_type}: {mean:.4f}")
    print()
128
+
129
def parse_args():
    """Build the command-line interface of the evaluation script and parse it.

    Returns the parsed namespace: ``dataset`` (required), optional
    ``model_dir`` / ``checkpoint``, ``device`` (default ``"cpu"``),
    ``max_chars`` (default ``-1``) and the boolean switches
    ``pretokenized``, ``verbose``, ``lower_src`` and ``lower_summary``.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument('--dataset', required=True)
    # Model sources: both optional at the parser level.
    for model_source in ('--model-dir', '--checkpoint'):
        cli.add_argument(model_source, required=False)
    cli.add_argument('--device', default="cpu")
    cli.add_argument('--pretokenized', action="store_true")
    cli.add_argument('--max-chars', type=int, default=-1)
    # Remaining boolean switches, all off by default.
    for switch in ('--verbose', '--lower-src', '--lower-summary'):
        cli.add_argument(switch, action="store_true")
    return cli.parse_args()
141
+
142
+
143
+ if __name__ == '__main__':
144
+ main(parse_args())
SCRL_new/bin/evaluate_hc_output.py ADDED
@@ -0,0 +1,132 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import json
3
+ import numpy as np
4
+ import tqdm
5
+ from pathlib import Path
6
+ from pprint import pprint
7
+ from collections import defaultdict, Counter
8
+
9
+ from transformers import AutoTokenizer
10
+ import scrl.utils as utils
11
+ from scrl.model import load_checkpoint
12
+ from scrl.eval_metrics import compute_token_f1, rouge_scorer, ROUGE_TYPES
13
+ from nltk import word_tokenize
14
+
15
+
16
def get_hc_summary(output):
    """Return the highest-scoring summary of one hill-climbing output record.

    Args:
        output: mapping with parallel sequences ``"scores"`` and
            ``"summaries"``.

    Returns:
        The entry of ``output["summaries"]`` at the index of the maximum
        score (first maximum on ties, per ``np.argmax``).
    """
    # Only scores and summaries are needed; the previously read-but-unused
    # "masks" entry is no longer required to be present.
    best = np.argmax(output["scores"])
    return output["summaries"][best]
21
+
22
+
23
def main(args):
    """Score stored hill-climbing outputs against a dataset and print metrics.

    ``args.outputs`` and ``args.dataset`` are JSONL files aligned by
    position: the i-th output record belongs to the i-th dataset item.
    Per-item scores are averaged over the item's reference summaries, then
    averaged over the dataset and printed.

    Raises:
        ValueError: if the outputs file has fewer records than the dataset.
    """
    outputs = list(utils.read_jsonl(args.outputs))
    dataset = list(utils.read_jsonl(args.dataset))

    # Fail early with a clear message instead of an IndexError mid-run.
    if len(outputs) < len(dataset):
        raise ValueError(
            f"{args.outputs} has {len(outputs)} records but "
            f"{args.dataset} has {len(dataset)} items"
        )

    all_scores = defaultdict(list)

    # tqdm wraps the list (not the enumerate) so the bar knows the total.
    for i, item in enumerate(tqdm.tqdm(dataset)):

        src = item["text"]
        if args.lower_src:
            src = src.lower()
        tgts = item["summaries"]
        pred = get_hc_summary(outputs[i])

        # Optional hard character budget on the prediction.
        if args.max_chars > 0:
            pred = pred[:args.max_chars]

        pred_tokens = word_tokenize(pred)
        if args.lower_summary:
            pred_tokens = [t.lower() for t in pred_tokens]

        # Source is tokenized once, with the method matching the dataset.
        if args.pretokenized:
            src_tokens = src.split()
        else:
            src_tokens = word_tokenize(src)

        # Scores against every reference; averaged per item below.
        item_scores = defaultdict(list)
        for tgt in tgts:
            if args.pretokenized:
                tgt_tokens = tgt.split()
            else:
                tgt_tokens = word_tokenize(tgt)
            if args.lower_summary:
                tgt_tokens = [t.lower() for t in tgt_tokens]

            token_fscore = compute_token_f1(tgt_tokens, pred_tokens, use_counts=True)

            rouge_scores = rouge_scorer.score(tgt, pred)
            for rouge_type, rouge_type_scores in rouge_scores.items():
                item_scores[f"{rouge_type}-p"].append(rouge_type_scores.precision)
                item_scores[f"{rouge_type}-r"].append(rouge_type_scores.recall)
                item_scores[f"{rouge_type}-f"].append(rouge_type_scores.fmeasure)

            item_scores["token-f1"].append(token_fscore)
            item_scores["tgt-len"].append(len(tgt_tokens))
            item_scores["tgt-cr"].append(len(tgt_tokens) / len(src_tokens))

        for k, values in item_scores.items():
            item_mean = np.mean(values)
            all_scores[k].append(item_mean)

        all_scores["pred-len"].append(len(pred_tokens))
        all_scores["src-len"].append(len(src_tokens))
        all_scores["pred-cr"].append(len(pred_tokens) / len(src_tokens))

        if args.verbose:
            print("SRC:", src)
            print("TGT:", tgts[0])
            print("PRED:", pred)
            print("=" * 100)

    print("="*100)
    print("RESULTS:")

    print("="*20, "Length (#tokens):", "="*20)
    for metric in ("src-len", "tgt-len", "pred-len"):
        mean = np.mean(all_scores[metric])
        print(f"{metric}: {mean:.2f}")
    print()

    print("="*20, "Compression ratio:", "="*20)
    for metric in ("tgt-cr", "pred-cr"):
        mean = np.mean(all_scores[metric])
        print(f"{metric}: {mean:.2f}")
    print()

    print("="*20, "Token F1-Score:", "="*20)
    mean = np.mean(all_scores["token-f1"])
    print(f"f1-score: {mean:.3f}")
    print()

    print("="*20, "ROUGE F1-Scores:", "="*20)
    for rouge_type in ROUGE_TYPES:
        mean = np.mean(all_scores[f"{rouge_type}-f"])
        print(f"{rouge_type}: {mean:.4f}")
    print()

    print("="*20, "ROUGE Recall:", "="*20)
    for rouge_type in ROUGE_TYPES:
        mean = np.mean(all_scores[f"{rouge_type}-r"])
        print(f"{rouge_type}: {mean:.4f}")
    print()
118
+
119
def parse_args():
    """Build the command-line interface of the HC-output evaluator and parse it.

    Returns the parsed namespace: required ``dataset`` and ``outputs``
    paths, ``max_chars`` (default ``-1``) and the boolean switches
    ``pretokenized``, ``verbose``, ``lower_src`` and ``lower_summary``.
    """
    cli = argparse.ArgumentParser()
    # Both input files are mandatory.
    for required_path in ('--dataset', '--outputs'):
        cli.add_argument(required_path, required=True)
    cli.add_argument('--pretokenized', action="store_true")
    cli.add_argument('--max-chars', type=int, default=-1)
    # Remaining boolean switches, all off by default.
    for switch in ('--verbose', '--lower-src', '--lower-summary'):
        cli.add_argument(switch, action="store_true")
    return cli.parse_args()
129
+
130
+
131
+ if __name__ == '__main__':
132
+ main(parse_args())
SCRL_new/bin/predict.py ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import json
3
+ import numpy as np
4
+ import tqdm
5
+ from pathlib import Path
6
+ from pprint import pprint
7
+ from collections import defaultdict, Counter
8
+
9
+ from transformers import AutoTokenizer
10
+ import scrl.utils as utils
11
+ from scrl.model import load_checkpoint
12
# Metrics live in scrl/eval_metrics.py (the module the other bin/ scripts
# import from); this commit ships no scrl/metrics.py.
from scrl.eval_metrics import compute_token_f1, rouge_scorer, ROUGE_TYPES
13
+ from nltk import word_tokenize
14
+
15
+ from scrl.rewards import load_rewards
16
+ from scrl.config import load_config
17
+ import time
18
+
19
+
20
def main(args):
    """Predict summaries for a JSONL dataset with a checkpointed model.

    The dataset is processed in batches of ``args.batch_size``; each item
    contributes its ``"id"`` and the predicted summary. The elapsed
    prediction time is printed, and the predictions are written to
    ``args.output`` when that option is set.
    """
    model = load_checkpoint(Path(args.checkpoint), device=args.device)
    tokenizer = AutoTokenizer.from_pretrained("distilroberta-base")
    records = list(utils.read_jsonl(args.dataset))

    predictions = []
    started = time.time()
    for chunk in tqdm.tqdm(utils.batchify(records, args.batch_size)):
        texts = [record["text"] for record in chunk]
        chunk_summaries = model.predict(texts, tokenizer, args.device)
        # Pair every input item with its predicted summary.
        predictions.extend(
            {"id": record["id"], "pred-summary": text_summary}
            for record, text_summary in zip(chunk, chunk_summaries)
        )
    finished = time.time()
    print("Seconds:", finished - started)

    if args.output:
        utils.write_jsonl(predictions, args.output, "w")
40
+
41
+
42
def parse_args():
    """Build the command-line interface of the prediction script and parse it.

    Returns the parsed namespace: required ``dataset`` and ``checkpoint``,
    optional ``output``, ``device`` (default ``"cpu"``) and ``batch_size``
    (default ``4``).
    """
    cli = argparse.ArgumentParser()
    # (flag, keyword arguments) in the order they appear in --help.
    option_table = (
        ('--dataset', dict(required=True)),
        ('--output', dict(required=False)),
        ('--checkpoint', dict(required=True)),
        ('--device', dict(default="cpu")),
        ('--batch-size', dict(type=int, default=4)),
    )
    for flag, settings in option_table:
        cli.add_argument(flag, **settings)
    return cli.parse_args()
50
+
51
+
52
+ if __name__ == '__main__':
53
+ main(parse_args())
SCRL_new/bin/run_hc.py ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ from scrl.hill_climbing import DynamicRestartHCSC, PunktTokenizer, WhiteSpaceTokenizer
3
+ from scrl.config_hc import load_config
4
+ from scrl.rewards import load_rewards
5
+ from scrl import utils
6
+ import tqdm
7
+ from pathlib import Path
8
+
9
+
10
def run_on_dataset(
    searcher,
    dataset,
    target_len,
    target_ratio,
    n_steps,
    outpath,
):
    """Run the hill-climbing searcher over a dataset, appending states to a file.

    Items are processed in batches of 4 and the searcher's returned states
    are appended to ``outpath`` after every batch. If ``outpath`` already
    exists, as many leading items as it has records are skipped, so an
    interrupted run can be resumed.

    Args:
        searcher: callable taking ``(sources, target_lens=..., n_steps=...)``;
            its ``tokenizer`` attribute is used when a ratio is given.
        dataset: sequence of items with a ``"text"`` field.
        target_len: fixed target length for every summary, or ``None``.
        target_ratio: target length as a fraction of the tokenized input
            length; used only when ``target_len`` is ``None``.
        n_steps: search budget passed to the searcher.
        outpath: JSONL file the states are appended to.

    Raises:
        ValueError: if both ``target_len`` and ``target_ratio`` are ``None``.
    """
    outpath = Path(outpath)

    # Number of records already written by a previous run.
    start = 0
    if outpath.exists():
        start = sum(1 for _ in utils.read_jsonl(outpath))
    passed = 0

    batches = utils.batchify(dataset, batch_size=4)
    for batch in tqdm.tqdm(batches):
        passed += len(batch)
        if passed <= start:
            # Batch was fully covered by the previous run.
            continue
        elif passed == start + len(batch):
            print(f"starting at position {passed - len(batch)}")

        sources = [x["text"] for x in batch]
        if target_len is not None:
            target_lens = [target_len for _ in batch]
        else:
            if target_ratio is None:
                # Previously surfaced as an opaque TypeError on None * int.
                raise ValueError(
                    "Provide either target_len or target_ratio."
                )
            input_lens = [len(tokens) for tokens in searcher.tokenizer(sources)]
            target_lens = [round(target_ratio * l) for l in input_lens]
            print(input_lens)
            print(target_lens)
        states = searcher(
            sources,
            target_lens=target_lens,
            n_steps=n_steps,
        )
        utils.write_jsonl(states, outpath, "a")
50
+
51
+
52
def main(args):
    """Set up the hill-climbing searcher and run it over ``args.dataset``.

    The reward objective is built from the loaded config, and the tokenizer
    is chosen by ``args.pretokenized``. At most one of ``args.target_len``
    and ``args.target_ratio`` may be set.

    NOTE(review): the script entry point already passes the result of
    ``load_config`` to this function, so ``load_config`` is applied twice —
    confirm that this is intended.
    """
    config = load_config(args)
    print("DEVICE:", config.device)
    objective = load_rewards(config)

    # Whitespace splitting for pre-tokenized data, Punkt otherwise.
    if args.pretokenized:
        tokenizer = WhiteSpaceTokenizer()
    else:
        tokenizer = PunktTokenizer()

    searcher = DynamicRestartHCSC(tokenizer, objective)
    dataset = list(utils.read_jsonl(args.dataset))
    assert (args.target_len is None or args.target_ratio is None)
    run_on_dataset(
        searcher=searcher,
        dataset=dataset,
        target_len=args.target_len,
        target_ratio=args.target_ratio,
        n_steps=args.steps,
        outpath=args.output,
    )
68
+
69
+
70
if __name__ == '__main__':
    # Command-line entry point: declare the options, then hand the loaded
    # configuration to main().
    parser = argparse.ArgumentParser()

    # Required inputs and outputs.
    parser.add_argument("--config", help="path to JSON config file", required=True)
    parser.add_argument("--output", required=True)
    parser.add_argument("--dataset", required=True)

    # Optional behaviour switches and search settings.
    parser.add_argument("--pretokenized", action="store_true")
    parser.add_argument("--device", default="cuda")
    parser.add_argument("--target-len", type=int, default=None)
    parser.add_argument("--target-ratio", type=float, default=None)
    parser.add_argument("--steps", default=1000, type=int)

    cli_args = parser.parse_args()
    main(load_config(cli_args))
SCRL_new/bin/train.py ADDED
@@ -0,0 +1,157 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import numpy as np
3
+ from pathlib import Path
4
+ import tqdm
5
+ from pprint import pprint
6
+ import torch
7
+ from torch.nn.utils.rnn import pad_sequence
8
+ from scrl.config import load_config
9
+ from scrl.training import setup_and_train
10
+ from scrl.model import labels_to_summary
11
+ from scrl.eval_metrics import compute_token_f1
12
+ import scrl.utils as utils
13
+ from nltk import word_tokenize
14
+
15
+
16
def evaluate_validation_reward(args, manager, model, tokenizer, reward_generator, dataset):
    """Return the mean reward of the model's argmax summaries on held-out data.

    The dataset is visited in batches of ``args.batch_size``; at most
    ``args.max_val_steps`` batches are evaluated when that limit is set.
    For each batch the argmax label per token is turned into a summary and
    scored by ``reward_generator`` against the batch's ``"document"`` field.

    ``manager`` is accepted for signature compatibility and is not used here.
    """
    device = args.device
    idx_range = list(range(len(dataset)))
    dataset_indices = list(utils.batchify(idx_range, args.batch_size))
    rewards = []
    for i, indices in enumerate(dataset_indices):
        if args.max_val_steps is not None and i >= args.max_val_steps:
            break
        batch = dataset[indices]
        input_ids = pad_sequence(
            [torch.tensor(ids) for ids in batch["input_ids"]], batch_first=True
        )
        # Validation only needs predictions, so skip gradient tracking.
        with torch.no_grad():
            logits = model(input_ids.to(device))
        argmax_labels = torch.argmax(logits, dim=2).to(device)
        argmax_summaries = labels_to_summary(input_ids, argmax_labels, tokenizer)
        argmax_rewards, _ = reward_generator(batch["document"], argmax_summaries)
        rewards += argmax_rewards
    avg_reward = np.mean(rewards)
    return avg_reward
37
+
38
+
39
+
40
def evaluate_validation_dataset(args, manager, model, tokenizer, reward_generator, dataset_path):
    """Return the mean token-F1 of argmax predictions on a labelled JSONL dataset.

    Each item supplies a source (``"text"``) and reference summaries
    (``"summaries"``). The prediction is compared, lower-cased, against
    every reference; the item score is the mean over references and the
    returned score is the mean over items.

    When ``args.dump`` is set, per-item probabilities, labels and the score
    against the first reference are written to
    ``<manager.dir>/dump-<dataset name>/step-<manager.step>.jsonl``.

    ``reward_generator`` is accepted for signature compatibility and is not
    used here. ``dataset_path`` must be a ``pathlib.Path`` (its ``.name`` is
    read).
    """
    f1_scores = []
    dataset = list(utils.read_jsonl(dataset_path))
    dump_data = []

    for item in tqdm.tqdm(dataset):
        src = item["text"]
        tgts = item["summaries"]

        input_ids = torch.tensor(tokenizer([src])["input_ids"]).to(args.device)
        # Validation only needs predictions, so skip gradient tracking.
        with torch.no_grad():
            logits = model.forward(input_ids)
        argmax_labels = torch.argmax(logits, dim=2)
        pred = labels_to_summary(input_ids, argmax_labels, tokenizer)[0]

        # Lower-case the prediction once instead of once per reference.
        pred_tokens = [t.lower() for t in word_tokenize(pred)]

        item_scores = []
        for tgt in tgts:
            tgt_tokens = [t.lower() for t in word_tokenize(tgt)]
            token_f1 = compute_token_f1(
                tgt_tokens, pred_tokens, use_counts=True
            )
            item_scores.append(token_f1)

        if args.dump:
            probs = torch.softmax(logits, dim=2)[0].detach().tolist()
            dump_item = {
                "probs": probs,
                "source": src,
                "target": tgts[0],
                "f1-score": item_scores[0],
                "pred_summary": pred,
                "pred_labels": argmax_labels[0].tolist(),
            }
            dump_data.append(dump_item)

        item_score = np.mean(item_scores)
        f1_scores.append(item_score)
    score = np.mean(f1_scores)

    if args.dump:
        dataset_name = dataset_path.name.split(".jsonl")[0]
        dump_dir = manager.dir / f"dump-{dataset_name}"
        dump_dir.mkdir(exist_ok=True)
        utils.write_jsonl(
            dump_data,
            dump_dir / f"step-{manager.step}.jsonl",
            "w"
        )
    return score
95
+
96
+
97
def _load_score_history(path):
    """Return ``(results, best_score)`` from a JSONL score file.

    ``results`` is empty and ``best_score`` is ``-inf`` when the file does
    not exist or holds no records.
    """
    results = list(utils.read_jsonl(path)) if path.exists() else []
    best_score = max((x["score"] for x in results), default=float("-inf"))
    return results, best_score


def evaluate(args, manager, model, tokenizer, reward_generator, holdout_data):
    """Run validation at the current step and checkpoint on improvement.

    The validation reward on ``holdout_data`` is appended to
    ``<manager.dir>/val_rewards.jsonl``; when it beats every previously
    recorded score the model is saved as ``"best_val_reward"``. The same
    bookkeeping is repeated with token-F1 for every path in
    ``args.validation_datasets``, using
    ``val_data_results.<dataset name>.jsonl`` and ``best_on_<dataset name>``.

    The comparison baseline is ``-inf`` rather than 0, so the first
    evaluation always produces a "best" checkpoint, including when the
    score is zero or negative.
    """
    step = manager.step
    val_reward = evaluate_validation_reward(args, manager, model, tokenizer, reward_generator, holdout_data)

    reward_path = manager.dir / "val_rewards.jsonl"
    reward_results, prev_max = _load_score_history(reward_path)
    if val_reward > prev_max:
        manager.save_model(model, step, "best_val_reward")
    reward_results.append({"step": step, "score": val_reward})
    utils.write_jsonl(reward_results, reward_path, "w")
    if args.verbose:
        print("Validation Rewards:")
        pprint(reward_results)
        print()

    # only used if a validation dataset is specified in config
    for val_data_path in args.validation_datasets:
        val_data_path = Path(val_data_path)
        dataset_name = val_data_path.name.split(".jsonl")[0]
        dataset_score = evaluate_validation_dataset(
            args, manager, model, tokenizer, reward_generator, val_data_path
        )
        result_path = Path(manager.dir / f"val_data_results.{dataset_name}.jsonl")
        dataset_results, prev_max = _load_score_history(result_path)
        if dataset_score > prev_max:
            manager.save_model(model, step, f"best_on_{dataset_name}")
        dataset_results.append({"step": step, "score": dataset_score})
        utils.write_jsonl(dataset_results, result_path, "w")
        if args.verbose:
            print(f"Validation Dataset Results for {dataset_name}:")
            pprint(dataset_results)
            print()
139
+
140
+
141
def main(args):
    """Seed the random number generators with 0 and start training.

    ``evaluate`` is handed to the trainer as its evaluation callback.
    """
    utils.set_random_seed(0)
    setup_and_train(args, eval_func=evaluate)
144
+
145
+
146
if __name__ == '__main__':
    # Command-line entry point: declare the options, then hand the loaded
    # configuration to main().
    parser = argparse.ArgumentParser()
    parser.add_argument("--config", help="path to JSON config file")
    parser.add_argument("--device", default="cuda")

    # Boolean switches, all off by default.
    parser.add_argument("--dump", action="store_true")
    parser.add_argument("--verbose", action="store_true")
    parser.add_argument(
        "--fresh",
        action="store_true",
        help="delete model directory and start from scratch",
    )

    cli_args = parser.parse_args()
    main(load_config(cli_args))
SCRL_new/config/example.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "loader": "loaders/gigaword.py",
3
+ "dataset": "data/train-data/gigaword",
4
+ "indices": "data/train-data/gigaword/indices.npy",
5
+ "model_dir": "data/models/example",
6
+ "verbose": true,
7
+ "print_every": 1,
8
+ "eval_every": 10,
9
+ "save_every": 10,
10
+ "max_val_steps": 8,
11
+ "max_train_seconds": null,
12
+ "max_train_steps": 1000,
13
+ "batch_size": 1,
14
+ "learning_rate": 1e-05,
15
+ "k_samples": 10,
16
+ "sample_aggregation": "max",
17
+ "loss": "pgb",
18
+ "encoder_model_id": "distilroberta-base",
19
+ "rewards": {
20
+ "BiEncoderSimilarity": {
21
+ "weight": 1,
22
+ "model_id": "all-distilroberta-v1"
23
+ },
24
+ "GaussianCR": {
25
+ "weight": 1,
26
+ "mean": 0.5,
27
+ "std": 0.2
28
+ }
29
+ }
30
+ }
SCRL_new/config/gigaword-L8.json ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "loader": "loaders/gigaword.py",
3
+ "dataset": "data/train-data/gigaword",
4
+ "indices": "data/train-data/gigaword/indices.npy",
5
+ "model_dir": "data/models/gigaword-L8",
6
+ "verbose": true,
7
+ "print_every": 1,
8
+ "eval_every": 50,
9
+ "save_every": 50,
10
+ "max_val_steps": 512,
11
+ "max_train_seconds": null,
12
+ "max_train_steps": 8000,
13
+ "batch_size": 4,
14
+ "learning_rate": 1e-05,
15
+ "k_samples": 100,
16
+ "sample_aggregation": "max",
17
+ "loss": "pgb",
18
+ "encoder_model_id": "distilroberta-base",
19
+ "rewards": {
20
+ "Fluency": {
21
+ "weight": 1,
22
+ "type": "masked",
23
+ "model_id": "distilroberta-base",
24
+ "max_score": 40.0,
25
+ "norm": "max"
26
+ },
27
+ "BiEncoderSimilarity": {
28
+ "weight": 1,
29
+ "model_id": "all-distilroberta-v1"
30
+ },
31
+ "GaussianLength": {
32
+ "weight": 1,
33
+ "mean": 8,
34
+ "std": 3.2
35
+ }
36
+ }
37
+ }
SCRL_new/config/hc.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "batch_size": 4,
3
+ "rewards": {
4
+ "Fluency": {
5
+ "weight": 1,
6
+ "type": "masked",
7
+ "model_id": "distilroberta-base",
8
+ "max_score": 40.0,
9
+ "norm": "max"
10
+ },
11
+ "BiEncoderSimilarity": {
12
+ "weight": 1,
13
+ "model_id": "all-distilroberta-v1"
14
+ }
15
+ }
16
+ }
SCRL_new/config/newsroom-CR75.json ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "loader": "loaders/newsroom.py",
3
+ "dataset": "data/train-data/newsroom",
4
+ "indices": "data/train-data/newsroom/indices.npy",
5
+ "model_dir": "data/models/newsroom-CR75",
6
+ "verbose": true,
7
+ "print_every": 1,
8
+ "eval_every": 50,
9
+ "save_every": 50,
10
+ "max_val_steps": 512,
11
+ "max_train_seconds": null,
12
+ "max_train_steps": 8000,
13
+ "batch_size": 4,
14
+ "learning_rate": 1e-05,
15
+ "k_samples": 100,
16
+ "sample_aggregation": "max",
17
+ "loss": "pgb",
18
+ "encoder_model_id": "distilroberta-base",
19
+ "rewards": {
20
+ "Fluency": {
21
+ "weight": 1,
22
+ "type": "masked",
23
+ "model_id": "distilroberta-base",
24
+ "max_score": 40.0,
25
+ "norm": "max"
26
+ },
27
+ "BiEncoderSimilarity": {
28
+ "weight": 1,
29
+ "model_id": "all-distilroberta-v1"
30
+ },
31
+ "GaussianCR": {
32
+ "weight": 1,
33
+ "mean": 0.75,
34
+ "std": 0.3
35
+ }
36
+ }
37
+ }
SCRL_new/config/newsroom-L11.json ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "loader": "loaders/newsroom.py",
3
+ "dataset": "data/train-data/newsroom",
4
+ "indices": "data/train-data/newsroom/indices.npy",
5
+ "model_dir": "data/models/newsroom-L11",
6
+ "verbose": true,
7
+ "print_every": 1,
8
+ "eval_every": 50,
9
+ "save_every": 50,
10
+ "max_val_steps": 512,
11
+ "max_train_seconds": null,
12
+ "max_train_steps": 8000,
13
+ "batch_size": 4,
14
+ "learning_rate": 1e-05,
15
+ "k_samples": 100,
16
+ "sample_aggregation": "max",
17
+ "loss": "pgb",
18
+ "encoder_model_id": "distilroberta-base",
19
+ "rewards": {
20
+ "Fluency": {
21
+ "weight": 1,
22
+ "type": "masked",
23
+ "model_id": "distilroberta-base",
24
+ "max_score": 40.0,
25
+ "norm": "max"
26
+ },
27
+ "BiEncoderSimilarity": {
28
+ "weight": 1,
29
+ "model_id": "all-distilroberta-v1"
30
+ },
31
+ "GaussianLength": {
32
+ "weight": 1,
33
+ "mean": 11,
34
+ "std": 4.4
35
+ }
36
+ }
37
+ }
SCRL_new/data/test-data/bnc.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
SCRL_new/data/test-data/broadcast.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
SCRL_new/data/test-data/duc2004.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
SCRL_new/data/test-data/gigaword.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
SCRL_new/data/test-data/google.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
SCRL_new/data/test-data/newsroom.jsonl ADDED
@@ -0,0 +1,280 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"id": "newsroom-val-title-0", "summaries": ["Real Madrid sign Javier Hern\u00e1ndez on loan from Manchester United"], "text": "Real Madrid have confirmed they have agreed to sign the Mexican striker Javier Hern\u00e1ndez on a season-long loan from Manchester United."}
2
+ {"id": "newsroom-val-title-1", "summaries": ["American Pie singer Don Mclean arrested on domestic violence charge"], "text": "American Pie singer Don McLean was arrested on a misdemeanor domestic violence charge Monday in Maine, a jail supervisor said."}
3
+ {"id": "newsroom-val-title-2", "summaries": ["Candidate for governor of Mexican state of Tamaulipas killed in shootout"], "text": "A candidate for governor of the northern Mexican state of Tamaulipas, which borders Texas, and three others have been killed in a shootout, the Notimex news agency reports."}
4
+ {"id": "newsroom-val-title-3", "summaries": ["Bill Parcells rejoining ESPN for third time"], "text": "Bill Parcells, the two-time Super Bowl-winning coach, is rejoining ESPN for the third time."}
5
+ {"id": "newsroom-val-title-4", "summaries": ["IBM Watson Health now counts CVS Health as a partner"], "text": "IBM\u2019s data crunching service for the healthcare industry, Watson Health, now counts CVS Health as a partner."}
6
+ {"id": "newsroom-val-title-5", "summaries": ["Chick-fil-A Has Completely Lost Control Of Its Facebook Page"], "text": "Chick-fil-A has completely lost control of its Facebook page; it\u2019s become a message board for activism against the company."}
7
+ {"id": "newsroom-val-title-6", "summaries": ["Safe found in debris of mansion once owned by Pablo Escobar"], "text": "A locked safe has been found in the debris of a Miami Beach mansion once owned by Colombian drug lord Pablo Escobar."}
8
+ {"id": "newsroom-val-title-7", "summaries": ["Man knocked unconscious in alleged one-punch attack at Gold Coast bar"], "text": "Newly obtained CCTV shows the moment a man was knocked unconscious in an alleged one-punch attack at a Gold Coast bar in April."}
9
+ {"id": "newsroom-val-title-8", "summaries": ["Truck engulfed by torrential flood waters in China"], "text": "A truck has been engulfed by torrential flood waters and swept away while crossing a road in China."}
10
+ {"id": "newsroom-val-title-9", "summaries": ["Justin Bieber Cleared in Possible Hit-and-Run"], "text": "Justin Bieber has been cleared in a possible hit-and-run accident involving a paparazzo Monday night."}
11
+ {"id": "newsroom-val-title-10", "summaries": ["Pancho Gonzalez Inducted Into U.S. Open Court Of Champions"], "text": "Pancho Gonzalez was inducted Saturday into the U.S. Open Court of Champions."}
12
+ {"id": "newsroom-val-title-11", "summaries": ["Antarctica is Losing About 160 Billion Metric Tons of Ice a Year"], "text": "A new study shows that the Antarctica is now losing about 160 billion metric tons of ice a year, twice as much as when the continent was last surveyed."}
13
+ {"id": "newsroom-val-title-12", "summaries": ["17-year-old boy fatally stabbed during argument in the Bronx"], "text": "A 17-year-old boy boy was fatally stabbed during an argument in the Bronx early Monday, police said."}
14
+ {"id": "newsroom-val-title-13", "summaries": ["Singapore Former Prime Minister Lee Kuan Yew Has Weakened Further"], "text": "The Singapore government said Sunday that former Prime Minister Lee Kuan Yew has weakened further, a day after it said that his condition had worsened."}
15
+ {"id": "newsroom-val-title-14", "summaries": ["Wholesale businesses restocked faster in April"], "text": "Wholesale businesses restocked faster in April, responding to a strong gain in sales."}
16
+ {"id": "newsroom-val-title-15", "summaries": ["Grateful Dead bassist Phil Lesh battling bladder cancer"], "text": "Grateful Dead bassist Phil Lesh is battling bladder cancer, he revealed in a Facebook post canceling two upcoming shows."}
17
+ {"id": "newsroom-val-title-16", "summaries": ["Eamonn Holmes has quit Sky News after more than a decade on-air"], "text": "TV host and journalist has walked away from his dream job after 11 years EAMONN Holmes has quit his \u201cdream job\u201d at Sky News after more than a decade on-air."}
18
+ {"id": "newsroom-val-title-17", "summaries": ["Big Republican Donors Are Splitting Into Two Camps"], "text": "Big Republican donors are splitting into two rival camps, reports filed yesterday with the Federal Election Commission show."}
19
+ {"id": "newsroom-val-title-18", "summaries": ["Abortion protesters arrested outside John Boehner's office"], "text": "Six anti-abortion protesters were arrested today outside of House Speaker John Boehner's office in Washington."}
20
+ {"id": "newsroom-val-title-19", "summaries": ["Michigan is the first Midwestern state to partner with Google Trekker"], "text": "Michigan is the first Midwestern state to partner with Google for its Trekker program, going off the beaten path to bring amazing views to Street View."}
21
+ {"id": "newsroom-val-title-20", "summaries": ["80-Year-Old Woman Reportedly Arrested For Selling Crack Cocaine"], "text": "An 80-year-old Alabama woman has reportedly been arrested for selling crack cocaine for a second time, authorities said."}
22
+ {"id": "newsroom-val-title-21", "summaries": ["Amanda Bynes bong-throwing case tossed"], "text": "Amanda Bynes' bong-throwing case was officially tossed Monday morning."}
23
+ {"id": "newsroom-val-title-22", "summaries": ["Rundown Chinatown may be revitalized by a destination restaurant"], "text": "Rundown Chinatown may be revitalized by a destination restaurant Grant Avenue used to be one of the city\u2019s showpieces, the main street of Chinatown."}
24
+ {"id": "newsroom-val-title-23", "summaries": ["Florida Woman Found Dead in Motel With Two Caged Monkeys"], "text": "A Florida woman was found dead on Friday in a motel room with two caged monkeys and a note, authorities said."}
25
+ {"id": "newsroom-val-title-24", "summaries": ["Body of missing California woman reportedly discovered in forest"], "text": "The body of a missing California woman was reportedly identified Tuesday after being discovered in a remote part of the Cleveland National Forest."}
26
+ {"id": "newsroom-val-title-25", "summaries": ["Australia's inflation rose 0.4 percent in June quarter"], "text": "Australia's inflation rate rose 0.4 percent in the June quarter of this year, according to the Australian Bureau of Statistics."}
27
+ {"id": "newsroom-val-title-26", "summaries": ["Man shot and killed in Southwest Washington"], "text": "A man was shot and killed Saturday night in Southwest Washington, police said Sunday."}
28
+ {"id": "newsroom-val-title-27", "summaries": ["US border patrol violated agency rules in deporting thousands of children"], "text": "US border patrol agents violated agency rules in deporting thousands of unaccompanied immigrant children from 2009 to 2014, according to a federal audit released this week."}
29
+ {"id": "newsroom-val-title-28", "summaries": ["China debt swap could leave banks in capital hole"], "text": "China\u2019s mooted debt-for-equity swap could leave the country\u2019s banks in a capital hole."}
30
+ {"id": "newsroom-val-title-29", "summaries": ["Key and Peele Will Produce Undercover Cop Comedy for Fox"], "text": "According to The Wrap, Keegan-Michael Key and Jordan Peele will co-produce an undercover cop comedy for Fox."}
31
+ {"id": "newsroom-val-title-30", "summaries": ["Couple Beaten to Death in Bizarre Stop-Smoking Ritual"], "text": "A couple from Malaysia was beaten to death Thursday in a bizarre stop-smoking ritual, Agence France-Presse reported."}
32
+ {"id": "newsroom-val-title-31", "summaries": ["Kansas City Police To Re-Interview Brothers Of Missing Baby Lisa Irwin"], "text": "Kansas City police are planning to re-interview the brothers of missing baby Lisa Irwin and take a DNA sample from them, the family's lawyer said."}
33
+ {"id": "newsroom-val-title-32", "summaries": ["Geithner says economy gradually getting better and that's encouraging"], "text": "Treasury Secretary Tim Geithner says the economy is \"gradually getting better\" and \"that's very encouraging.\""}
34
+ {"id": "newsroom-val-title-33", "summaries": ["Vice News reporter arrested at Trump event in Houston"], "text": "A Vice News reporter was arrested for trespassing at a Donald Trump campaign event in Houston, Vice reported on Saturday."}
35
+ {"id": "newsroom-val-title-34", "summaries": ["Woman Suffering Asthma Attack Denied Inhaler at Pharmacy"], "text": "A New Jersey woman suffering an asthma attack was denied an inhaler at a pharmacy because she was $1.99 short, MyFoxNY.com reported."}
36
+ {"id": "newsroom-val-title-35", "summaries": ["Severe thunderstorm warning for damaging winds and large hail issued for southern parts of WA"], "text": "A severe thunderstorm warning for damaging winds and large hail has been issued for southern parts of WA, including Ravensthorpe, Walpole and Kellerberrin."}
37
+ {"id": "newsroom-val-title-36", "summaries": ["Catholic priests in Montreal banned from being alone with children"], "text": "Catholic priests in Montreal will be banned from being alone with children to provide a \u201csafety net\u201d against allegations of abuse."}
38
+ {"id": "newsroom-val-title-37", "summaries": ["Naked Man Playing a Violin in Portland Arrested, Police Say"], "text": "A naked man playing a violin outside of a courthouse in Portland was arrested Friday afternoon after numerous complaints and warnings, police say."}
39
+ {"id": "newsroom-val-title-38", "summaries": ["Lynne Spears' book about parenting delayed indefinitely"], "text": "Lynne Spears' book about parenting has been delayed indefinitely, her publisher said Wednesday."}
40
+ {"id": "newsroom-val-title-39", "summaries": ["New Orleans Man Sentenced for Making Death Threats Against Obama"], "text": "A 47-year-old New Orleans man has been sentenced for making death threats against President Barack Obama."}
41
+ {"id": "newsroom-val-title-40", "summaries": ["NFL Publishing Commemorative Super Bowl App"], "text": "The NFL is publishing a commemorative Super Bowl app for tablets."}
42
+ {"id": "newsroom-val-title-41", "summaries": ["Eastern cougar declared extinct, confirming decades of suspicion"], "text": "The eastern cougar has been declared extinct by the U.S. Fish and Wildlife Service, confirming decades of suspicion that the elusive subspecies was no more."}
43
+ {"id": "newsroom-val-title-42", "summaries": ["Jonah Hill Involved in Car Accident in Los Angeles"], "text": "Jonah Hill was involved in a car accident in downtown Los Angeles that left his ride very banged up."}
44
+ {"id": "newsroom-val-title-43", "summaries": ["Ruth Rendell in hospital after serious stroke"], "text": "Ruth Rendell is in hospital after suffering a serious stroke last Wednesday, her publisher has announced."}
45
+ {"id": "newsroom-val-title-44", "summaries": ["Father Charged With Murder in Death of 18-Year-Old Daughter"], "text": "A Kansas City father has been charged with murder and sexual abuse in the death of his 18-year-old daughter."}
46
+ {"id": "newsroom-val-title-45", "summaries": ["Klay Thompson says he\u2019s the best shooting guard in the NBA"], "text": "Klay Thompson doesn\u2019t lack for confidence\u2013he says he\u2019s the best shooting guard in the NBA."}
47
+ {"id": "newsroom-val-title-46", "summaries": ["Julian Castro to make history as first Hispanic to deliver keynote at Democratic convention"], "text": "San Antonio Mayor Julian Castro, a rising star among Democrats, is poised to make history Tuesday as the first Hispanic to deliver a keynote address at the Democratic National Convention."}
48
+ {"id": "newsroom-val-title-47", "summaries": ["House destroyed by fire in rural west Ottawa"], "text": "A house has been destroyed by fire in rural west Ottawa this morning, just nine days before Christmas."}
49
+ {"id": "newsroom-val-title-48", "summaries": ["Wilson released after two years behind bars for teen sex conviction"], "text": "Genarlow Wilson was released from prison Friday, after spending more than two years behind bars for a teen sex conviction."}
50
+ {"id": "newsroom-val-title-49", "summaries": ["Son of Fox studio host killed in car accident"], "text": "The 19-year-old son of NASCAR on Fox studio host Chris Myers has been killed in a car accident."}
51
+ {"id": "newsroom-val-title-50", "summaries": ["New jobless numbers a mixed bag for Obama"], "text": "New jobless numbers are a bit of a mixed bag for President Obama and his re-election bid."}
52
+ {"id": "newsroom-val-title-51", "summaries": ["Car filled with Christmas presents stolen in Melbourne"], "text": "A car filled with Christmas presents was stolen from a home in Melbourne\u2019s north-east on Christmas Day."}
53
+ {"id": "newsroom-val-title-52", "summaries": ["A mariachi band has serenaded Donald Trump"], "text": "A mariachi band has serenaded Donald Trump on the sidewalk outside Trump Tower in New York City."}
54
+ {"id": "newsroom-val-title-53", "summaries": ["Lena Dunham Will Undergo Surgery For Ruptured Ovarian Cyst"], "text": "Lena Dunham was taken to the hospital Saturday and will undergo surgery for a ruptured ovarian cyst, her spokeswoman said."}
55
+ {"id": "newsroom-val-title-54", "summaries": ["At least 13 dead after tour bus collided with semi-trailer in California"], "text": "At least 13 people are reportedly dead and many more are seriously injured after a tour bus collided with a semi-trailer in California."}
56
+ {"id": "newsroom-val-title-55", "summaries": ["Chicago cop praised for buying homeless man Chipotle"], "text": "A Chicago cop is being praised for his good deed after an image of him buying a homeless man Chipotle went viral."}
57
+ {"id": "newsroom-val-title-56", "summaries": ["Coffee could be extinct by 2080 because of climate change"], "text": "Coffee could be extinct by 2080 because of the effects of climate change on coffee growing regions."}
58
+ {"id": "newsroom-val-title-57", "summaries": ["American Student and Part-Time Model Found Dead at Australian University"], "text": "An American student and part-time model was found dead in her room at an Australian university, a police spokesman said."}
59
+ {"id": "newsroom-val-title-58", "summaries": ["Apple Watch Will Be Available In Stores In Two Weeks"], "text": "The Apple Watch will be available in retail stores in two weeks, Apple announced Thursday."}
60
+ {"id": "newsroom-val-title-59", "summaries": ["Death of teenager found with cobra bites ruled suicide"], "text": "The death of a teenager found in a North Austin parking lot with multiple cobra bites has been ruled a suicide."}
61
+ {"id": "newsroom-val-title-60", "summaries": ["Man charged with murder after body found in home in Sydney's south"], "text": "A man has been charged with murder after a body was found in a home in Sydney's south yesterday."}
62
+ {"id": "newsroom-val-title-61", "summaries": ["Glen Campbell is suffering from Alzheimer\u2019s disease"], "text": "Glen Campbell is suffering from Alzheimer\u2019s disease, the singer revealed to People."}
63
+ {"id": "newsroom-val-title-62", "summaries": ["Asbestos found in charred remains of beach house on Sydney's Northern Beaches"], "text": "Asbestos has been found in the charred remains of a beach house, gutted by a ferocious fire overnight in Palm Beach on Sydney's Northern Beaches."}
64
+ {"id": "newsroom-val-title-63", "summaries": ["Paris Hilton Arrested for Cocaine in Las Vegas"], "text": "Paris Hilton Arrested for Cocaine in Las Vegas was arrested for possession of cocaine Friday night in Las Vegas."}
65
+ {"id": "newsroom-val-title-64", "summaries": ["Body pulled from Lake Michigan identified as missing medical student Ambrose Monye"], "text": "A body pulled from Lake Michigan has been identified as missing medical student Ambrose Monye, who disappeared weeks before his graduation, Chicago police confirmed to FoxNews.com Thursday."}
66
+ {"id": "newsroom-val-title-65", "summaries": ["Nestle Is Recalling DiGiorno Pizzas, Lean Cuisines, and Stouffer Meals"], "text": "Nestle nestle-s-a announced in a press release that it is voluntarily recalling some DiGiorno pizzas, Lean Cuisines, and Stouffer\u2019s meals that may contain foreign matter."}
67
+ {"id": "newsroom-val-title-66", "summaries": ["University of Cincinnati police officer charged with murder of black man"], "text": "University of Cincinnati police officer Ray Tensing has been charged with murder in the shooting death of Samuel DuBose, a 43-year-old black man."}
68
+ {"id": "newsroom-val-title-67", "summaries": ["Trea Turner named National League Rookie of the Month"], "text": "After hitting .357 and slugging .571 in 28 August games, Trea Turner was named the National League rookie of the month."}
69
+ {"id": "newsroom-val-title-68", "summaries": ["Civil rights leader Walter Fauntroy arrested at Dulles International Airport"], "text": "D.C. civil rights leader Walter Fauntroy was arrested at Dulles International Airport and is being detained at Loudon County jail."}
70
+ {"id": "newsroom-val-title-69", "summaries": ["Death of 2-Month-Old in Connecticut Daycare Ruled a Homicide"], "text": "The death of a 2-month-old girl in a Connecticut home-based daycare has been ruled a homicide, PEOPLE confirms."}
71
+ {"id": "newsroom-val-title-70", "summaries": ["Prince Rushed to Hospital After Emergency Landing"], "text": "Prince was rushed to a hospital early Friday morning after his jet made an emergency landing in Illinois ..."}
72
+ {"id": "newsroom-val-title-71", "summaries": ["Gonorrhea at risk of becoming untreatable"], "text": "England\u2019s chief medical officer says that gonorrhea is at risk of becoming an untreatable disease."}
73
+ {"id": "newsroom-val-title-72", "summaries": ["Four injured in chain-reaction crash outside Southern California concert venue"], "text": "Four people were seriously injured after being struck by an unoccupied police vehicle in a chain-reaction crash outside a Southern California concert venue Saturday night."}
74
+ {"id": "newsroom-val-title-73", "summaries": ["Pakistani Taliban Commander Killed in Afghanistan"], "text": "Now, Mullah Dadullah has become the most senior Pakistani Taliban commander to be killed by NATO in Afghanistan."}
75
+ {"id": "newsroom-val-title-74", "summaries": ["Victorian grandfather found dead in van swept away in floodwaters"], "text": "Tributes are flowing for a Victorian bakery owner and grandfather who was found dead in his van after it was swept away in floodwaters this morning."}
76
+ {"id": "newsroom-val-title-75", "summaries": ["Two 12-year-old boys charged over alleged sexual assault of girl"], "text": "Two 12-year-old boys have been charged over the alleged sexual assault of a young girl at a primary school on Sydney\u2019s northern beaches."}
77
+ {"id": "newsroom-val-title-76", "summaries": ["News producer found dead in Belize was strangled"], "text": "An autopsy shows that an American news producer found dead while staying in a vacation resort in Belize was strangled."}
78
+ {"id": "newsroom-val-title-77", "summaries": ["Man stabbed in central Queensland"], "text": "A man has been stabbed in the back with a knife in central Queensland."}
79
+ {"id": "newsroom-val-title-78", "summaries": ["Man charged with murder of asylum seeker Reza Barati granted bail"], "text": "A Papua New Guinean man charged with the murder of asylum seeker Reza Barati has been granted bail in Papua New Guinea."}
80
+ {"id": "newsroom-val-title-79", "summaries": ["Children hospitalised after a gastro outbreak at a childcare centre"], "text": "A child has died and four children have been hospitalised after a suspected gastro outbreak at a Sydney childcare centre."}
81
+ {"id": "newsroom-val-title-80", "summaries": ["Girls aged two and five raped in separate attacks in Delhi"], "text": "Girls aged two and five have been raped in separate attacks in Delhi, sparking angry protests in the Indian capital over police inaction."}
82
+ {"id": "newsroom-val-title-81", "summaries": ["Bikies charged over ecstasy haul"], "text": "Two Nomads bikies have been charged over an interstate ecstasy haul."}
83
+ {"id": "newsroom-val-title-82", "summaries": ["S.F. sues family that operates residential hotels"], "text": "San Francisco City Attorney Dennis S.F. sues family that operates residential hotels"}
84
+ {"id": "newsroom-val-title-83", "summaries": ["Labor and Greens to use inquiry to investigate royalty regime for oil and gas companies"], "text": "Labor and the Greens plan to use a formal parliamentary inquiry to investigate Australia\u2019s royalty regime for oil and gas companies, following a damning report."}
85
+ {"id": "newsroom-val-title-84", "summaries": ["Drunken deputy arrested for threatening to shoot helicopter"], "text": "A drunken Nevada deputy has been arrested for threatening to shoot down a sheriff's helicopter during a tense six-hour standoff."}
86
+ {"id": "newsroom-val-title-85", "summaries": ["Berkshire Hathaway to Buy Lubrizol for $9 Billion in Cash"], "text": "Warren Buffett's Berkshire Hathaway Monday agreed to buy U.S. chemical maker Lubrizol Corp. for about $9 billion in cash."}
87
+ {"id": "newsroom-val-title-86", "summaries": ["Daytona 500 winner Bayne diagnosed with multiple sclerosis"], "text": "Daytona 500 winner and Roush Fenway Racing driver Trevor Bayne announced Tuesday that he has been diagnosed with multiple sclerosis."}
88
+ {"id": "newsroom-val-title-87", "summaries": ["American who defected from ISIS left D.C. area in mid-December"], "text": "CBS News has learned the American who defected from ISIS and surrendered to Kurdish forces had left the D.C. area in mid-December."}
89
+ {"id": "newsroom-val-title-88", "summaries": ["Pill camera to screen for colon cancer approved in U.S."], "text": "An ingestible pill camera to help screen for polyps and early signs of colon cancer has been approved for use in the U.S."}
90
+ {"id": "newsroom-val-title-89", "summaries": ["Gretchen Carlson Regrets Not Speaking Up About Harassment"], "text": "Former Fox News anchor Gretchen Carlson writes in her memoir that she regrets not speaking up about sexual harassment earlier in her career."}
91
+ {"id": "newsroom-val-title-90", "summaries": ["Former Milwaukee Officer Charged With Homicide in Shooting That Sparked Riots"], "text": "A former Milwaukee police officer was charged Thursday with first-degree reckless homicide in the fatal shooting of a young black man that sparked two days of riots in August."}
92
+ {"id": "newsroom-val-title-91", "summaries": ["Florida woman fighting to keep motorcycle-riding pet alligator"], "text": "A Florida woman was fighting Wednesday to keep her 6-foot-long clothes-wearing, motorcycle-riding pet alligator in her home."}
93
+ {"id": "newsroom-val-title-92", "summaries": ["Pennsylvania state troopers on lookout for Amish buggy in hit and run"], "text": "Pennsylvania state troopers are on the lookout for the driver of an Amish buggy that was involved in a hit and run incident on Sunday."}
94
+ {"id": "newsroom-val-title-93", "summaries": ["Kevin Andrews is prepared to challenge Turnbull for prime ministership"], "text": "Dumped former minister Kevin Andrews has announced he is prepared to challenge Malcolm Turnbull for the prime ministership under the right circumstances."}
95
+ {"id": "newsroom-val-title-94", "summaries": ["Walmart found to be sourcing bottled water from drought-stricken California"], "text": "Walmart is the latest company found to be sourcing its bottled water from drought-stricken California, as state residents push for greater regulation of the bottling industry."}
96
+ {"id": "newsroom-val-title-95", "summaries": ["Ohio high school teacher sentenced to two years after sexual relations with student"], "text": "A former Ohio high school teacher has been sentenced to two years behind bars after found guilty of carrying on sexual relations with a 16-year-old student."}
97
+ {"id": "newsroom-val-title-96", "summaries": ["Australia named worst-performing industrial country on climate change"], "text": "Australia has been named the worst-performing industrial country in the world on climate change in a report released at international negotiations in Peru."}
98
+ {"id": "newsroom-val-title-97", "summaries": ["St Louis Rams receiver Stedman Bailey critical but stable after being shot"], "text": "St Louis Rams wide receiver Stedman Bailey is in critical but stable condition after being shot in the head on Tuesday night, according to multiple reports in the media."}
99
+ {"id": "newsroom-val-title-98", "summaries": ["Man and woman found dead in Tamworth"], "text": "A man and a woman have been found dead in a Tamworth home in northern NSW."}
100
+ {"id": "newsroom-val-title-99", "summaries": ["Rosemary Barton named permanent host of CBC's Power & Politics"], "text": "Veteran political reporter Rosemary Barton has been named the permanent host of CBC News Network's daily political show Power & Politics."}
101
+ {"id": "newsroom-val-title-100", "summaries": ["Asbestos found on Sydney Harbour Bridge"], "text": "A surprise cache of asbestos has been found on Sydney's iconic Harbour Bridge during road works."}
102
+ {"id": "newsroom-val-sent-0", "summaries": ["Donald Trump has emerged victorious in his ugly battle with Miss USA hopeful\u00a0Sheena Monnin"], "text": "In a beauty of a $5 million legal win, Donald Trump has emerged victorious in his ugly battle with Miss USA hopeful Sheena Monnin, who claimed the pageant was \u201cfixed\u201d and \u201ctrashy.\u201d"}
103
+ {"id": "newsroom-val-sent-1", "summaries": ["Biologists say Beijing\u2019s island-building campaign means permanent ecocide"], "text": "In fact, biologists say that while the reefs could have slowly recovered from the clam-harvesting degradation, Beijing\u2019s island-building campaign means permanent ecocide."}
104
+ {"id": "newsroom-val-sent-2", "summaries": ["Twenty-two people and two gunmen were killed in the March 18 attack."], "text": "Twenty-two people, mainly foreigners, and two gunmen were killed in the March 18 attack on the National Bardo Museum."}
105
+ {"id": "newsroom-val-sent-3", "summaries": ["An Ethiopian court convicted former dictator Mengistu Haile Mariam of genocide today."], "text": "An Ethiopian court convicted former dictator Mengistu Haile Mariam of genocide today, but Mr. Mengistu may never face punishment because he remains in exile in Zimbabwe."}
106
+ {"id": "newsroom-val-sent-4", "summaries": ["Erykah Badu squashed the industry's obsession with looking younger in one tweet."], "text": "Erykah Badu squashed the entertainment industry's obsession with perfection and looking younger in just one tweet."}
107
+ {"id": "newsroom-val-sent-5", "summaries": ["No casualties were reported from Sunday's blast in Douma."], "text": "No casualties were reported from Sunday's blast in Douma, which damaged a parked Toyota pickup."}
108
+ {"id": "newsroom-val-sent-6", "summaries": ["Running back Michael Dyer has transferred from Auburn to Arkansas State."], "text": "Arkansas State head football coach Gus Malzahn announced Tuesday on the school's athletic website that running back Michael Dyer has transferred from Auburn to Arkansas State."}
109
+ {"id": "newsroom-val-sent-7", "summaries": ["An Egyptian court upheld a death sentence against former President Mohammed Morsi."], "text": "An Egyptian court on Tuesday upheld a death sentence against former President Mohammed Morsi, in a session that underlined the judiciary\u2019s hostility toward the ousted leader and his Islamist allies."}
110
+ {"id": "newsroom-val-sent-8", "summaries": ["Jessica McCain has been charged with child molestation and child exploitation."], "text": "Jessica McCain, from Lafayette, has been charged with child molestation and child exploitation after recording the disgusting act on video."}
111
+ {"id": "newsroom-val-sent-9", "summaries": ["Republicans sought to keep the pressure on President Obama over high gas prices."], "text": "Republicans sought to keep the pressure on President Obama over high gas prices Saturday with a radio speech claiming his \"lack of leadership\" is creating an \"energy crisis.\""}
112
+ {"id": "newsroom-val-sent-10", "summaries": ["Adolescent medicine is in little demand by doctors seeking to advance their careers."], "text": "But a decade after adolescent medicine became board certified as a subspecialty, it is in little demand by doctors seeking to advance their careers."}
113
+ {"id": "newsroom-val-sent-11", "summaries": ["More people are getting their news on smaller screens"], "text": "More and more people are discarding their old TV sets and getting their news and entertainment on smaller screens."}
114
+ {"id": "newsroom-val-sent-12", "summaries": ["The body was found floating Saturday in the waters between south Brooklyn and Queens."], "text": "The body of a man who is believed to have jumped from a bridge earlier this month was found floating Saturday in the waters between south Brooklyn and Queens, cops said."}
115
+ {"id": "newsroom-val-sent-13", "summaries": ["A man detonated explosives that killed 12 people and injured 20 in northeast Nigeria."], "text": "A man forced his way onto a bus at a crowded bus station and detonated explosives that killed 12 people and injured 20 in northeast Nigeria, according to the bus driver and hospital records."}
116
+ {"id": "newsroom-val-sent-14", "summaries": ["Cellist David Teie has created the first ever album specifically for cats"], "text": "Cellist and musician David Teie, a soloist with America\u2019s National Symphony Orchestra, has created the first ever album specifically for cats to listen to."}
117
+ {"id": "newsroom-val-sent-15", "summaries": ["Clark died Tuesday at his home in Nashville, Tennessee."], "text": "Clark died Tuesday at his home in Nashville, Tennessee, according to his manager, Keith Case."}
118
+ {"id": "newsroom-val-sent-16", "summaries": ["The Illinois Senate recently passed legislation that would put slot machines at airports."], "text": "According to the Chicago Sun-Times, the Illinois Senate recently passed legislation that would put slot machines at airports and other places around the state."}
119
+ {"id": "newsroom-val-sent-17", "summaries": ["In March 1978 Israel invaded Lebanon, an invasion dreamt up years before."], "text": "In March 1978 Israel invaded Lebanon, an invasion dreamt up years before, though this was the first opportunity to put the large-scale operation dubbed Litani, into gear."}
120
+ {"id": "newsroom-val-sent-18", "summaries": ["Philanthropist David Rubenstein is giving $18 million to fix up the Lincoln Memorial."], "text": "Philanthropist David Rubenstein, who has already donated tens of millions of dollars to refurbish the Washington Monument and other icons, is giving $18 million to fix up the Lincoln Memorial."}
121
+ {"id": "newsroom-val-sent-19", "summaries": ["Google made about $11 billion from ad sales on Android phones last year."], "text": "The stakes are higher in the Android case for Google which made about $11 billion from ad sales on Android phones with Google apps such as Maps, Search and Gmail last year."}
122
+ {"id": "newsroom-val-sent-20", "summaries": ["Tina Turner to become Swiss citizen"], "text": "Singer Tina Turner is set to surrender her U.S. passport and become a Swiss citizen."}
123
+ {"id": "newsroom-val-sent-21", "summaries": ["Hillary Clinton declared victory in the Democratic presidential race Tuesday."], "text": "Paying homage to the generations of women who paved the way for her, Hillary Clinton declared a \u201cmilestone\u201d victory in the long and hard-fought Democratic presidential race Tuesday."}
124
+ {"id": "newsroom-val-sent-22", "summaries": ["Georgian cinema is exotic, and can be forbidding as well as fiercely beautiful."], "text": "Georgian cinema is exotic and, like the mountainous terrain celebrated by the Russian writers Lermontov and Tolstoy, can be forbidding as well as fiercely beautiful."}
125
+ {"id": "newsroom-val-sent-23", "summaries": ["Fisher remained hospitalized at the Ronald Reagan UCLA Medical Center in Los Angeles."], "text": "Fisher remained hospitalized at the Ronald Reagan UCLA Medical Center in Los Angeles, where she was rushed Friday after her flight from London touched down."}
126
+ {"id": "newsroom-val-sent-24", "summaries": ["A twitter troll who started rumors about Hurricane Sandy has been identified"], "text": "A Twitter troll who started widely spread rumors about Hurricane Sandy has been identified as Shashank Tripathi, according to reports."}
127
+ {"id": "newsroom-val-sent-25", "summaries": ["Misconceptions may be keeping many women from getting breast reconstruction after a mastectomy."], "text": "Misconceptions may be keeping many women from getting breast reconstruction after a mastectomy, even though the procedure can help improve quality of life for cancer survivors, according to a new review."}
128
+ {"id": "newsroom-val-sent-26", "summaries": ["The rankings assess nearly 1,800 colleges and universities on 16 criteria"], "text": "The rankings, which assess nearly 1,800 colleges and universities on 16 criteria rarely see drastic change from year to year, and the latest iteration is no exception."}
129
+ {"id": "newsroom-val-sent-27", "summaries": ["City Councilman Mark Treyger wants the city to designate the promenade a historic site."], "text": "City Councilman Mark Treyger wants the city\u2019s Landmarks Preservation Commission to designate the popular 2.5-mile wooden promenade a historic site that cannot be converted to cement."}
130
+ {"id": "newsroom-val-sent-28", "summaries": ["Zakaria Bulhan was charged with murder and five counts of attempted murder"], "text": "London\u2019s Metropolitan Police said Zakaria Bulhan, of London, was charged with the murder of 64-year-old Darlene Horton, and five counts of attempted murder."}
131
+ {"id": "newsroom-val-sent-29", "summaries": ["The New York Post Twitter account was apparently compromised on Friday."], "text": "The New York Post's main Twitter account was apparently compromised on Friday, sending a series of apparently untrue tweets on military and macroeconomic topics."}
132
+ {"id": "newsroom-val-sent-30", "summaries": ["Helium wants to be the Android of the Internet of things."], "text": "Helium\u2019s president and chief operating officer Rob Chandhok says it wants to be \u201cthe Android of the Internet of things.\u201d"}
133
+ {"id": "newsroom-val-sent-31", "summaries": ["The concept of a helipad has been discussed in the city for years."], "text": "The concept of a helipad has been discussed in the city for years, after two landing sites available for use by the general public closed in 1999."}
134
+ {"id": "newsroom-val-sent-32", "summaries": ["A Michigan man whose drunken girlfriend froze to death was sentenced to prison."], "text": "A Michigan man whose drunken girlfriend froze to death after he lectured her for being intoxicated was sentenced to prison on Monday."}
135
+ {"id": "newsroom-val-sent-33", "summaries": ["Companies are rising up to provide innovators with compact versions of industrial tools."], "text": "That movement, long identified with 3D printers, is entering a new phase, and new companies are rising up to provide craftspeople and innovators with compact versions of industrial tools."}
136
+ {"id": "newsroom-val-sent-34", "summaries": ["The European Union has agreed to impose broad economic sanctions against Russia."], "text": "The European Union has finally agreed to impose broad economic sanctions against Russia, hoping to force Moscow to reverse course in Ukraine, EU officials say."}
137
+ {"id": "newsroom-val-sent-35", "summaries": ["Terrorists could still send the United States into the dark ages for weeks."], "text": "Terrorists could still send a nation as powerful and modernized as the United States into the dark ages for weeks."}
138
+ {"id": "newsroom-val-sent-36", "summaries": ["Republicans issued subpoenas Wednesday for emails Hillary Clinton stored on her own server."], "text": "Republicans issued subpoenas Wednesday for emails Hillary Clinton stored on her own server, as the name of a former aide surfaced as the person possibly linked to the unusual arrangement."}
139
+ {"id": "newsroom-val-sent-37", "summaries": ["President Obama and Benjamin Netanyahu pledged to work together towards Middle East peace Monday."], "text": "President Obama and Israeli Prime Minister Benjamin Netanyahu pledged to work together towards Middle East peace Monday, looking to ease tensions after a long-running public feud over the Iran nuclear deal."}
140
+ {"id": "newsroom-val-sent-38", "summaries": ["Former porn star Candida Royalle has died after battling ovarian cancer."], "text": "Former porn star Candida Royalle, regarded as a pioneer for her work in front of and behind the camera, has died after battling ovarian cancer."}
141
+ {"id": "newsroom-val-sent-39", "summaries": ["People with Alzheimer's disease may experience depression before their memory starts to fade."], "text": "People with Alzheimer's disease may experience depression and other behavioral changes before their memory starts to fade, according to a new study in the journal Neurology."}
142
+ {"id": "newsroom-val-sent-40", "summaries": ["Seven men are in the running to replace Sepp Blatter as FIFA president."], "text": "Seven men are in the running to replace Sepp Blatter as FIFA president, with Michel Platini's candidature pending because of his suspension from soccer."}
143
+ {"id": "newsroom-val-sent-41", "summaries": ["The royal family attended a Christmas Day service without the Queen"], "text": "The royal family attended a Christmas Day service at St. Mark\u2019s Church in Englefield on Sunday without the Queen, who stayed home with a heavy cold, CBS reports."}
144
+ {"id": "newsroom-val-sent-42", "summaries": ["A former WWE wrestler thwarted an armed would-be robber on Saturday night."], "text": "A former WWE wrestler rumored to appear in the next \"Batman\" movie thwarted an armed would-be robber on Saturday night, saying later that \"I'm not dying in Florida.\""}
145
+ {"id": "newsroom-val-sent-43", "summaries": ["Company trying to diversify its product mix to be less reliant on fragrances."], "text": "The company, founded in 1904 in Paris, has been trying to diversify its product mix to be less reliant on fragrances, which generate more than half of its sales."}
146
+ {"id": "newsroom-val-sent-44", "summaries": ["\"I was just frustrated with myself,\" Murgatroyd told PEOPLE"], "text": "\"I was just frustrated with myself because the blindfold wouldn't work,\" Murgatroyd told PEOPLE at Mixology in Los Angeles after Monday's episode."}
147
+ {"id": "newsroom-val-sent-45", "summaries": ["There are almost 8,000 courses taught in English in non-English speaking countries."], "text": "There are now almost 8,000 courses being taught in English by leading universities in non-English speaking countries, according to a project mapping their expansion."}
148
+ {"id": "newsroom-val-sent-46", "summaries": ["Regulators need to end all varieties of test-gaming in the auto industry."], "text": "Rather than declare the matter closed, regulators in the U.S. and Europe need to redouble efforts to end all varieties of test-gaming in the auto industry."}
149
+ {"id": "newsroom-val-sent-47", "summaries": ["Trump spelled out hard-line immigration priorities in a fiery speech in Phoenix."], "text": "After weeks of opaque public statements regarding his stance on mass deportations, Trump spelled out hard-line immigration priorities in a fiery speech here in Phoenix."}
150
+ {"id": "newsroom-val-sent-48", "summaries": ["Researchers found drug trials funded by drug maker Roche downplayed side effects like diarrhea."], "text": "Danish researchers who reviewed data summaries and journal articles found that seven drug trials funded by the drug maker Roche in the 1990s downplayed the frequency of apparent side effects like diarrhea and incontinence."}
151
+ {"id": "newsroom-val-sent-49", "summaries": ["Lemonis Fischer Acquisition Company completed its purchase of Crumbs in August for $6.5 million"], "text": "Lemonis Fischer Acquisition Company, LLC, a joint venture between Lemonis and Fischer Enterprises, completed its purchase of Crumbs in August for $6.5 million."}
152
+ {"id": "newsroom-val-sent-50", "summaries": ["The rapper announced he had a third son on his social media accounts."], "text": "The rapper, who is notorious for trolling fans and celebritIes on social media, announced he had a third son on his social media accounts."}
153
+ {"id": "newsroom-val-sent-51", "summaries": ["Minnesota Congressman Keith Ellison is tweeting Emojis to talk about minimum wage."], "text": "Minnesota Democratic Congressman Keith Ellison is tweeting food Emojis to talk about minimum wage."}
154
+ {"id": "newsroom-val-sent-52", "summaries": ["Afghans are looking closely at the biographies of Abdullah Abdullah and Ashraf Ghani"], "text": "But in the absence of concrete platforms, Afghans are looking closely at the biographies of Abdullah Abdullah and Ashraf Ghani, speculating about what their lives say about their ability to lead."}
155
+ {"id": "newsroom-val-sent-53", "summaries": ["Americans expect an unnatural level of consistency from their foodstuffs."], "text": "Ugly vegetables may have made their way to grocery store shelves, but Americans still expect an unnatural level of consistency from their foodstuffs."}
156
+ {"id": "newsroom-val-sent-54", "summaries": ["Frontier Airlines will penalize passengers who don't book directly with the airline."], "text": "Frontier Airlines is the latest carrier to jump into the fight, announcing Wednesday that it will penalize passengers who don't book directly with the airline."}
157
+ {"id": "newsroom-val-sent-55", "summaries": ["A proposed merger between Charter Communications and Time Warner Cable could get local support."], "text": "A proposed $78.7 billion merger between Charter Communications and Time Warner Cable could get local support \u2014 if the big-bucks deal helps poor New Yorkers get more online access."}
158
+ {"id": "newsroom-val-sent-56", "summaries": ["Jason Smith charged with cocaine trafficking."], "text": "(Former NRL star Jason Smith has been charged with alleged cocaine trafficking."}
159
+ {"id": "newsroom-val-sent-57", "summaries": ["Sanders turned Bill Clinton\u2019s time in the White House into target practice."], "text": "The Flashback in Flint was the fight Bernie Sanders has been longing to have, and he turned Bill Clinton\u2019s time in the White House into target practice."}
160
+ {"id": "newsroom-val-sent-58", "summaries": ["Virginia is on the national ballot for the first time since Woodrow Wilson."], "text": "Virginia, the mother of presidents \u2014 it\u2019s had eight, more than any other state \u2014 is on the national ballot for the first time since Woodrow Wilson."}
161
+ {"id": "newsroom-val-sent-59", "summaries": ["Trump said that he and Clinton should undergo drug tests before the final debate"], "text": "Offered without evidence during a campaign rally in Portsmouth, N.H., Trump said that he and Clinton are \u201clike athletes\u201d and should undergo drug tests before the final debate on Wednesday."}
162
+ {"id": "newsroom-val-sent-60", "summaries": ["GlaxoSmithKline promoted Emma Walmsley to succeed Andrew Witty as chief executive officer."], "text": "GlaxoSmithKline promoted Emma Walmsley to succeed Andrew Witty as chief executive officer when he retires, singling out Britain\u2019s largest drugmaker as the only major global pharmaceutical company led by a woman."}
163
+ {"id": "newsroom-val-sent-61", "summaries": ["Valeant Pharmaceuticals has decided to keep and invest in its gastrointestinal-drugs division."], "text": "Valeant Pharmaceuticals International Inc. has decided to keep and invest in its gastrointestinal-drugs division after talks to sell the unit to Takeda Pharmaceutical Co. fell apart, according to people familiar with the matter."}
164
+ {"id": "newsroom-val-sent-62", "summaries": ["Father Patrick Fitzgerald of St. Francis of Assisi Church in Manhattan has died."], "text": "Father Patrick Fitzgerald of St. Francis of Assisi Church in Manhattan, who served as clergy liaison for the NYPD\u2019s Midtown South Precinct, has died."}
165
+ {"id": "newsroom-val-sent-63", "summaries": ["Microsoft confirmed its $1.2 billion acquisition of Yammer."], "text": "Microsoft on Monday confirmed its rumored $1.2 billion acquisition of B2B social networking firm Yammer."}
166
+ {"id": "newsroom-val-sent-64", "summaries": ["Granite prism part of national pilot scheme designed to introduce original artworks into traffic system"], "text": "Costing \u00a325,000, the granite prism is part of a national pilot scheme designed to introduce original artworks into the traffic system, replacing conventional plastic bollards and barriers."}
167
+ {"id": "newsroom-val-sent-65", "summaries": ["The world\u2019s largest retailer is being battered by the economy and tough competition."], "text": "The world\u2019s largest retailer is heading into the holiday season with a turnaround plan after being battered by the economy and tough competition."}
168
+ {"id": "newsroom-val-sent-66", "summaries": ["Casino bosses like Wynn are betting new resorts will breathe life back into Macau."], "text": "Casino bosses like Wynn are betting the new resorts on the Cotai strip will breathe life back into Macau, which generates five times more gambling revenue than Las Vegas."}
169
+ {"id": "newsroom-val-sent-67", "summaries": ["Tourists visiting Chongqing in China can now explore an underground nuclear base."], "text": "Tourists visiting Chongqing in China can now explore an underground nuclear base, dubbed the world's largest manmade cave at a size of 1.1-million square feet (or about 20 football fields)."}
170
+ {"id": "newsroom-val-sent-68", "summaries": ["Gift to the nation for permanent public display rejected by council officials"], "text": "A sculpture by Sir Anthony Caro, intended as a gift to the nation for permanent public display, has been rejected by council officials and could now be sold abroad for \u00a32.5m."}
171
+ {"id": "newsroom-val-sent-69", "summaries": ["A Tennessee teen was arrested after his getaway car got stuck on ice."], "text": "Saturday, March 7, 2015, 3:06 PM A Tennessee teen was arrested for robbing a man shoveling snow after his getaway car got stuck on a patch of ice, according to a report."}
172
+ {"id": "newsroom-val-sent-70", "summaries": ["The telescope will be used to reflect radio signals from distant parts of the universe"], "text": "The radio telescope, which measures 500 meters in diameter, will be used to reflect radio signals from distant parts of the universe and help search for extraterrestrial life beyond the galaxy, reports the Guardian."}
173
+ {"id": "newsroom-val-sent-71", "summaries": ["Hansen, 30, is being held on $1 million bail"], "text": "Hansen, 30, who is also a model with her own line of swimsuits, is being held on $1 million bail."}
174
+ {"id": "newsroom-val-sent-72", "summaries": ["The French actor split with his Berry because she was more successful than him."], "text": "The French actor, known to many as just arm candy for his famous wife, split with his Berry because she was more successful than him, sources told People Magazine."}
175
+ {"id": "newsroom-val-sent-73", "summaries": ["Apple needs a visionary who understands how to make really great software."], "text": "What Apple needs is a visionary who understands how to make really great software and who can provide the leadership to make that happen."}
176
+ {"id": "newsroom-val-sent-74", "summaries": ["Russia is the poster child for electoral authoritarianism."], "text": "Russia is the poster child for a type of governance termed electoral, or competitive, authoritarianism."}
177
+ {"id": "newsroom-val-sent-75", "summaries": ["It was 22 years ago Wednesday that \u201cFrasier\u201d aired on television."], "text": "It was 22 years ago Wednesday that \u201cFrasier\u201d \u2014 the hit sitcom of the 1990s that was a spin-off of the award-winning \u201cCheers\u201d \u2014 first aired on television."}
178
+ {"id": "newsroom-val-sent-76", "summaries": ["Country singer Joey Feek died of cervical cancer on Friday afternoon."], "text": "Country singer Joey Feek, whose husband Rory Feek revealed on Monday that she had only days to live, died of cervical cancer on Friday afternoon."}
179
+ {"id": "newsroom-val-sent-77", "summaries": ["Visibility is key to the success of any omnichannel retailing strategy."], "text": "Visibility is a key component to the success of any omnichannel retailing strategy, and both shoppers and retailers stand to benefit."}
180
+ {"id": "newsroom-val-sent-78", "summaries": ["Al-Shabab is responsible for numerous attacks in east Africa"], "text": "Al-Shabab, an Islamic extremist group affiliated with Al-Qaeda, is responsible for numerous attacks in east Africa, particularly Kenya."}
181
+ {"id": "newsroom-val-sent-79", "summaries": ["Breguet watches have been cherished by the most eminent figures of modern history"], "text": "Ever since Swiss-born Abraham-Louis Breguet opened his Paris workshop in 1775, watches bearing his name have been cherished by the most eminent figures of modern history."}
182
+ {"id": "newsroom-val-sent-80", "summaries": ["Brendan Dassey was just ordered released from custody by a federal judge."], "text": "Brendan Dassey -- the man who was convicted for helping Steven Avery in a murder chronicled on \"Making a Murderer\" -- was just ordered released from custody by a federal judge."}
183
+ {"id": "newsroom-val-sent-81", "summaries": ["Tens of thousands of demonstrators on Tuesday continued anti-government protests in Bangkok"], "text": "Tens of thousands of demonstrators on Tuesday continued anti-government protests in Bangkok intended to drive Thailand's Prime Minister out of office."}
184
+ {"id": "newsroom-val-sent-82", "summaries": ["Tom Cruise gave his first interview since his split with Katie Holmes on Tuesday."], "text": "Tom Cruise gave his first interview since his split with Katie Holmes on Tuesday, and he had a story to tell about an injury he sustained on the set of his new movie \u201cJack Reacher.\u201d"}
185
+ {"id": "newsroom-val-sent-83", "summaries": ["The largest hedge funds now control more cash than ever."], "text": "The fat funds are getting fatter\u2014the largest hedge funds now control more cash than ever before."}
186
+ {"id": "newsroom-val-sent-84", "summaries": ["Anytime the SAT is modified, students and parents seek assistance in droves."], "text": "Anytime the SAT is modified, students and parents seek assistance in droves, said Neil Chyten, president of Chyten Premier Tutoring and Test Preparation."}
187
+ {"id": "newsroom-val-sent-85", "summaries": ["Significant signs of improvement could prompt the Fed to raise interest rates sooner."], "text": "Significant signs of improvement, with more job openings and fewer layoffs, may mean the labor market recovery is broadening and could prompt the Fed to raise interest rates sooner."}
188
+ {"id": "newsroom-val-sent-86", "summaries": ["Actor Orlando Bloom listed his Hollywood Hills home for $4.5 million."], "text": "Actor Orlando Bloom listed his four-bedroom Hollywood Hills home for $4.5 million after previously renting it out for $16,500 a month."}
189
+ {"id": "newsroom-val-sent-87", "summaries": ["A weather bomb around Greenland caused seismic tremors as far away as Japan."], "text": "A weather bomb around Greenland caused seismic tremors as far away as Japan, and when researchers went to investigate, they made a rare discovery."}
190
+ {"id": "newsroom-val-sent-88", "summaries": ["Egypt officially started the process of holding its first-ever free presidential elections."], "text": "Egypt officially started on Saturday the process of holding its first-ever free presidential elections with candidates able to submit their applications."}
191
+ {"id": "newsroom-val-sent-89", "summaries": ["Know your risk tolerance and threshold for pain."], "text": "A few of her \u2014 and my \u2014 favorites: Know your risk tolerance and threshold for pain."}
192
+ {"id": "newsroom-val-sent-90", "summaries": ["Ford is recalling 450,000 Escape SUVs and Ford Freestar and Mercury Monterey minivans."], "text": "Ford is recalling 450,000 older Ford Escape SUVs for a fire risk and Ford Freestar and Mercury Monterey minivans for potential sudden loss of power, according to the National Highway Traffic Safety Administration."}
193
+ {"id": "newsroom-val-sent-91", "summaries": ["Romney is campaigning Tuesday and Wednesday in the battleground states of Pennsylvania and Ohio."], "text": "Romney is campaigning Tuesday and Wednesday in the battleground states of Pennsylvania and Ohio as speculation on his vice presidential candidate continues to build."}
194
+ {"id": "newsroom-val-sent-92", "summaries": ["Browns quarterback Connor Shaw will undergo surgery on his right thumb."], "text": "Browns third-string quarterback Connor Shaw will undergo surgery on his right thumb and be out for an indefinite period."}
195
+ {"id": "newsroom-val-sent-93", "summaries": ["Jackman saved two children from a dangerous riptide at Bondi Beach."], "text": "Hugh Jackman pulled a real-life superhero move this week when he saved his two children from a dangerous riptide at Bondi Beach."}
196
+ {"id": "newsroom-val-sent-94", "summaries": ["Microsoft will release a tablet-oriented version of Windows no sooner than 2012."], "text": "Microsoft will release a tablet-oriented version of Windows no sooner than 2012, Bloomberg reports, citing sources familiar with the matter."}
197
+ {"id": "newsroom-val-sent-95", "summaries": ["A South Carolina man was killed when an inflator exploded in December."], "text": "It comes just days after the government announced that a South Carolina man was killed when an inflator exploded in December."}
198
+ {"id": "newsroom-val-sent-96", "summaries": ["The mega explosions are powerful enough to lay an entire planet to waste."], "text": "On the positive side, the mega explosions, which are powerful enough to lay an entire planet to waste, can be detected ahead of time."}
199
+ {"id": "newsroom-val-sent-97", "summaries": ["Uber is partnering with the Communication Service for the Deaf to recruit more drivers."], "text": "Ride-hailing service Uber is partnering with the Communication Service for the Deaf to recruit more deaf drivers and continue to develop more resources for those drivers, the company said on Tuesday."}
200
+ {"id": "newsroom-val-sent-98", "summaries": ["Being childless has allowed me to invest in myself."], "text": "Being childless has also allowed me to invest in myself on a deeper level\u2014things that make me happy."}
201
+ {"id": "newsroom-val-sent-99", "summaries": ["Obama's pivot east has done little to contain China's strategic expansion."], "text": "Thus, Obama's \"pivot east\" has done much to fuel tensions in the Indo-Pacific, but has done little to contain China's strategic expansion."}
202
+ {"id": "newsroom-val-sent-100", "summaries": ["Google launched a free version of its music streaming service on Tuesday."], "text": "Google Inc launched a free version of its music streaming service on Tuesday, as it sought to upstage the debut of Apple's rival service next week."}
203
+ {"id": "newsroom-val-sent-101", "summaries": ["The couple, who have been dating since 2014, announced their engagement on Instagram"], "text": "The couple, who have been dating since 2014, announced their engagement Friday morning on Instagram, sharing similar smooching selfies while on a tropical vacation."}
204
+ {"id": "newsroom-val-sent-102", "summaries": ["Gradifi is hoping to entice employers into helping workers pay down their college loans."], "text": "Gradifi, a 12-person startup backed by about $3 million in seed investment, is hoping to entice more employers \u2014 and eventually, marketers of all stripes \u2014 into helping workers pay down their college loans."}
205
+ {"id": "newsroom-val-sent-103", "summaries": ["Nolen was charged Tuesday with first-degree murder and assault."], "text": "Alton Nolen was charged Tuesday with first-degree murder and assault in the decapitation of a former co-worker at a processing plant in Oklahoma."}
206
+ {"id": "newsroom-val-sent-104", "summaries": ["Samsung Pay will launch during the second half of 2015"], "text": "Samsung Pay will launch on the Galaxy S6 and the Galaxy S6 Edge in the U.S. during the second half of 2015."}
207
+ {"id": "newsroom-val-sent-105", "summaries": ["Ayesha Curry got into a social media throwdown with loudmouth Stephen A. Smith."], "text": "Ayesha Curry, who caused a Twitter storm late Thursday by suggesting the NBA Finals were fixed, got into a social media throwdown with loudmouth Stephen A. Smith on Friday."}
208
+ {"id": "newsroom-val-sent-106", "summaries": ["Retired FDNY Captain Vincent W. Julius died Sunday after a long illness."], "text": "Retired FDNY Captain Vincent W. Julius, one of the department\u2019s most high-profile African-American firefighters during the city\u2019s \u201cWar Years,\u201d died Sunday after a long illness."}
209
+ {"id": "newsroom-val-sent-107", "summaries": ["Avid Life Media announced Friday its CEO stepped down, effective Friday."], "text": "Avid Life Media announced Friday its CEO Noel Biderman stepped down and was no longer with the company, effective Friday."}
210
+ {"id": "newsroom-val-sent-108", "summaries": ["South Australian MP Jamie Briggs has resigned as Minister for Cities and Built Environment."], "text": "South Australian MP Jamie Briggs has resigned as Minister for Cities and the Built Environment over an incident with a female public servant on a trip to Hong Kong."}
211
+ {"id": "newsroom-val-sent-109", "summaries": ["Greek governments and private citizens have pushed for war damages from Germany for decades."], "text": "Greek governments and also private citizens have pushed for war damages from Germany for decades but the Greek government has never officially quantified its reparation claims."}
212
+ {"id": "newsroom-val-sent-110", "summaries": ["Former British Prime Minister Margaret Thatcher has died after suffering a stroke."], "text": "Former British Prime Minister Margaret Thatcher, an outspoken woman known to many as \"The Iron Lady,\" has died at 87 after suffering a stroke."}
213
+ {"id": "newsroom-val-sent-111", "summaries": ["A sea lion pup found strolling a San Francisco sidewalk surprised onlookers."], "text": "A sea lion pup found strolling a San Francisco sidewalk on Thursday may have surprised onlookers, but one expert says it could be \"the new normal.\""}
214
+ {"id": "newsroom-val-sent-112", "summaries": ["The international community has consistently deplored the occupation of the Azerbaijani territories."], "text": "The international community has consistently deplored, in the strongest terms, the use of force by Armenia against Azerbaijan and the occupation of the Azerbaijani territories."}
215
+ {"id": "newsroom-val-sent-113", "summaries": ["The pop star continues to open about her divorce from rocker Gavin Rossdale."], "text": "The pop star continues to open about her divorce from rocker Gavin Rossdale as she plugs her new album and revealed she felt she was \u201cdying\u201d after the collapse of her marriage."}
216
+ {"id": "newsroom-val-sent-114", "summaries": ["The drone scanned for signals being sent to devices that were sneaked into the test"], "text": "The drone, with six propellers and as big as a gas pump, scanned for signals potentially being sent to devices that were sneaked into the test, The Guardian reports."}
217
+ {"id": "newsroom-val-sent-115", "summaries": ["Cruz chose Fiorina as his vice-presidential running mate"], "text": "In a last-minute gambit to prevent Trump from winning the nomination, Cruz chose Fiorina on Wednesday as his vice-presidential running mate."}
218
+ {"id": "newsroom-val-sent-116", "summaries": ["Cavuto underwent surgery on Monday"], "text": "Cavuto hasn't been on air since May 31 and underwent surgery on Monday."}
219
+ {"id": "newsroom-val-sent-117", "summaries": ["Two researchers have created a program that can automatically generate paraphrases of English sentences."], "text": "Now, using several methods, including statistical techniques borrowed from gene analysis, two researchers have created a program that can automatically generate paraphrases of English sentences."}
220
+ {"id": "newsroom-val-sent-118", "summaries": ["Renaissance paintings taken from Jewish couple in 1935 returned to grandchildren by state of California"], "text": "Three renaissance paintings taken from a Jewish couple by the Nazis in 1935 have been returned to their grandchildren by the state of California, in a ceremony attended by governor Arnold Schwarzenegger."}
221
+ {"id": "newsroom-val-sent-119", "summaries": ["The child was rushed to the hospital on Sunday and died Wednesday"], "text": "The child allegedly was rushed to the hospital on Sunday and died Wednesday after being placed on life support, according to an indictment obtained by PEOPLE."}
222
+ {"id": "newsroom-val-sent-120", "summaries": ["The federal government filed two charges against Boston Marathon bombing suspect Dzhokhar Tsarnaev Monday."], "text": "The federal government filed two charges against Boston Marathon bombing suspect Dzhokhar Tsarnaev Monday, counts that could result in the death penalty if he's convicted."}
223
+ {"id": "newsroom-val-sent-121", "summaries": ["Old masters by Francesco Guardi worth \u00a310m have been seized by Scotland Yard"], "text": "A pair of old masters by Francesco Guardi worth \u00a310m have been seized by Scotland Yard after they were allegedly exported from Italy illegally."}
224
+ {"id": "newsroom-val-sent-122", "summaries": ["One of the primary banking regulators Thursday welcomed financial technology disrupters into the fold."], "text": "One of the nation\u2019s primary banking regulators Thursday welcomed financial technology disrupters into the fold and even encouraged them to join forces with traditional brick-and-mortar institutions, but with some rules."}
225
+ {"id": "newsroom-val-sent-123", "summaries": ["Facebook said it now has 1.71 billion monthly active users."], "text": "And Facebook said it now has 1.71 billion monthly active users, up 20 percent from the same period last year."}
226
+ {"id": "newsroom-val-sent-124", "summaries": ["The Waltham company sells cybersecurity software to\u00a0businesses."], "text": "The Waltham company, which has done business as Bit9 + Carbon Black since acquiring Carbon Black last year, sells cybersecurity software to businesses."}
227
+ {"id": "newsroom-val-sent-125", "summaries": ["Specialist units rushed to the mosque after envelopes were discovered there"], "text": "Specialist units rushed to the city\u2019s Grand Mosque after the envelopes were discovered there, fire brigade spokeswoman Malika Abbad told NBC News."}
228
+ {"id": "newsroom-val-sent-126", "summaries": ["Three extremely dangerous inmates have escaped from a jail in California."], "text": "Three extremely dangerous inmates \u2014 including one charged with murder and another charged with cutting off a man\u2019s penis \u2014 have escaped from a jail in California."}
229
+ {"id": "newsroom-val-sent-127", "summaries": ["The AP-GfK poll shows 59 percent of Americans now disapprove of Obama."], "text": "The AP-GfK poll shows 59 percent of Americans now disapprove of Obama -- a point higher than the previous high set in December."}
230
+ {"id": "newsroom-val-sent-128", "summaries": ["Age-based awards are outdated and discriminatory."], "text": "Here\u2019s what\u2019s on my mind: Age-based awards are outdated and discriminatory, even if unintentionally so."}
231
+ {"id": "newsroom-val-sent-129", "summaries": ["Tom and Arnisteen Clark were only separated when Mr Clark served in Korea."], "text": "Tom and Arnisteen Clark have been married 68 years and were only separated when Mr Clark, an Army veteran, served in Korea, ABC News reports."}
232
+ {"id": "newsroom-val-sent-130", "summaries": ["Boston is the fifth city to get the option in Lyft\u2019s app."], "text": "Boston is the fifth city to get the Line option in Lyft\u2019s app, following San Francisco, Los Angeles, New York, and Austin, Texas."}
233
+ {"id": "newsroom-val-sent-131", "summaries": ["Vice President Biden on Monday swore in former Loretta Lynch as attorney general."], "text": "Vice President Biden on Monday swore in former Brooklyn U.S. Attorney Loretta Lynch as the country\u2019s first female African-American attorney general."}
234
+ {"id": "newsroom-val-sent-132", "summaries": ["Transgender children experience a disconnect between their sex and their gender."], "text": "Transgender children experience a disconnect between their sex, which is anatomy, and their gender, which includes behaviors, roles and activities."}
235
+ {"id": "newsroom-val-sent-133", "summaries": ["Careless errors will cause others to question your commitment."], "text": "Double-check your work, as careless errors or poor grammar will cause others to question your commitment and work quality."}
236
+ {"id": "newsroom-val-sent-134", "summaries": ["Microsoft says CEO Steve Ballmer will retire within the next 12 months."], "text": "Microsoft says CEO Steve Ballmer will retire within the next 12 months, the world's biggest software company announced on Friday."}
237
+ {"id": "newsroom-val-sent-135", "summaries": ["The actress and boyfriend Jason Bleick welcomed son Arthur Saint on Monday"], "text": "The actress, 39, and her fashion designer boyfriend, Jason Bleick, welcomed son Arthur Saint Bleick on Monday, her rep tells PEOPLE exclusively."}
238
+ {"id": "newsroom-val-sent-136", "summaries": ["Goldman Sachs CEO Lloyd Blankfein has been diagnosed with lymphoma, he announced Tuesday."], "text": "Goldman Sachs CEO Lloyd Blankfein has been diagnosed with lymphoma but doctors expect he'll be completely cured of the cancer, he announced Tuesday."}
239
+ {"id": "newsroom-val-sent-137", "summaries": ["Starboard Value LP had aimed to overthrow the entire 12-director Darden board."], "text": "Investor Starboard Value LP had aimed to overthrow the entire 12-director Darden DRI board and replace it with a slate of its own, and came away victorious."}
240
+ {"id": "newsroom-val-sent-138", "summaries": ["There may be electricity flying between Amber Heard and billionaire Elon Musk."], "text": "There may be electricity flying between Amber Heard and billionaire Elon Musk amid the actress' high profile and nasty divorce from Johnny Depp."}
241
+ {"id": "newsroom-val-sent-139", "summaries": ["Violence continued to erupt across Israel as Palestinian assailants carried out five stabbing attacks."], "text": "Violence continued to erupt across Israel Saturday as Palestinian assailants carried out five stabbing attacks in Jerusalem and the West Bank."}
242
+ {"id": "newsroom-val-sent-140", "summaries": ["Clinton said half of Trump supporters are racist, sexist, homophobic or Islamophobic."], "text": "Hillary Clinton said Friday night in New York that half of Donald Trump's supporters are racist, sexist, xenophobic, homophobic and/or Islamophobic."}
243
+ {"id": "newsroom-val-sent-141", "summaries": ["Natural gas futures rose after a sharp price slide Monday."], "text": "Natural gas futures rose Tuesday as traders took profits after a sharp price slide Monday."}
244
+ {"id": "newsroom-val-sent-142", "summaries": ["Facebook is planning to release more standalone apps like Facebook Messenger and Instagram."], "text": "Facebook is planning to release more standalone apps like Facebook Messenger and Instagram, the company revealed during its fourth quarter earnings call on Wednesday."}
245
+ {"id": "newsroom-val-sent-143", "summaries": ["Wireless carriers are waiving fees following the terrorist attacks."], "text": "A number of U.S. wireless carriers are waiving fees for calls and texts to and from Belgium following the terrorist attacks in Brussels on Tuesday."}
246
+ {"id": "newsroom-val-sent-144", "summaries": ["A baby boy with two perfectly formed heads was born in Brazil this week"], "text": "A baby boy with two perfectly formed heads was born in Brazil this week, the country's media reported."}
247
+ {"id": "newsroom-val-sent-145", "summaries": ["Japan confirmed swine flu in three people who recently returned from Canada."], "text": "Japan confirmed its first cases of swine flu Saturday in three people who recently returned from Canada, even as the disease's spread appeared to slow in the rest of the world."}
248
+ {"id": "newsroom-val-sent-146", "summaries": ["Sometimes a national tragedy supersedes conventional workplace protocol."], "text": "While you ordinarily might shy away from sensitive subjects at work, sometimes a national tragedy \u2014 particularly one of this magnitude \u2014 supersedes conventional workplace protocol."}
249
+ {"id": "newsroom-val-sent-147", "summaries": ["Chinese state media caution that the space race should not fuel regional tensions."], "text": "Chinese state media give a cautious welcome to the successful launch of India's first Mars probe mission, but caution that the space race should not fuel regional tensions."}
250
+ {"id": "newsroom-val-sent-148", "summaries": ["The Czech Republic announced emergency measures Wednesday to combat a wave of alcohol poisoning."], "text": "The Czech Republic announced emergency measures Wednesday to combat a wave of alcohol poisoning, saying that 19 people have died and 24 have been hospitalized after drinking vodka and rum laced with methanol."}
251
+ {"id": "newsroom-val-sent-149", "summaries": ["Consensual BDSM can transport the lucky participants into an altered mental state."], "text": "Consensual BDSM (that\u2019s bondage/discipline, dominance/submission and sadism/masochism, Grandma) can transport the lucky participants into an altered mental state, according to a small new study."}
252
+ {"id": "newsroom-val-sent-150", "summaries": ["Google has updated the website where it showcases its Street View service."], "text": "Google has updated the website where it showcases its Street View service, now highlighting places of interest, the locations of Street View vehicles and more."}
253
+ {"id": "newsroom-val-sent-151", "summaries": ["People take an average of 1,403 fewer steps than usual on Christmas Day."], "text": "People take an average of 1,403 fewer steps than usual on Christmas Day, according to data from about 500,000 wearers of Jawbone UP, the fitness tracker."}
254
+ {"id": "newsroom-val-sent-152", "summaries": ["The affected specimen dates back roughly 1.7 million years."], "text": "The affected specimen, originally found in South Africa's Swartkrans Cave, dates back roughly 1.7 million years."}
255
+ {"id": "newsroom-val-sent-153", "summaries": ["Automakers continued to beat U.S. fuel-efficiency standards in 2015 model vehicles."], "text": "Automakers continued to beat U.S. fuel-efficiency standards in 2015 model vehicles, putting the industry on track to reach an average 50 miles per gallon by 2025."}
256
+ {"id": "newsroom-val-sent-154", "summaries": ["Skittles replaced its entire homepage with its Twitter stream."], "text": "Few, however, have taken it so far as candy maker Skittles, which replaced its entire homepage with its Twitter stream."}
257
+ {"id": "newsroom-val-sent-155", "summaries": ["Samsung sold 300 million devices so far this year, breaking its previous record."], "text": "Samsung may be still be trying to crack the tablet market, but in mobile devices it is continuing to pick up steam: It sold 300 million devices so far this year, breaking its previous record."}
258
+ {"id": "newsroom-val-sent-156", "summaries": ["Argentinian photography boasts a coterie of brilliant, established artists."], "text": "In this context, Argentinian photography may be slowly finding its feet internationally, but it still boasts a coterie of brilliant, established artists."}
259
+ {"id": "newsroom-val-sent-157", "summaries": ["Nervous investors remained transfixed on next week\u2019s presidential election."], "text": "Stocks retreated for an eighth consecutive day on Thursday as nervous investors remained transfixed on next week\u2019s too-close-to-call presidential election."}
260
+ {"id": "newsroom-val-sent-158", "summaries": ["What a liberal arts education gives you is growing more valuable, not less."], "text": "What a liberal arts education gives you \u2013 critical thinking, clear communication, the lessons of Homer \u2013 is growing more valuable, not less."}
261
+ {"id": "newsroom-val-sent-159", "summaries": ["The couple met at Apple, where they both work."], "text": "The couple met in August 2010 at Apple, in Cupertino, Calif., where they both work."}
262
+ {"id": "newsroom-val-sent-160", "summaries": ["The test for Trump will be avoiding gaffes that would further enrage Muslims."], "text": "The test for Trump will be avoiding gaffes that would further enrage Muslims after his proposal to ban them temporarily from entering the country."}
263
+ {"id": "newsroom-val-sent-161", "summaries": ["Historic grant of work authorization significant for individual workers and the national labor market"], "text": "But backers and opponents of the White House plan agree on one thing \u2014 that this historic grant of work authorization is significant both for individual workers and the national labor market."}
264
+ {"id": "newsroom-val-sent-162", "summaries": ["Wall Street firms say relatively stable markets indicate a Hillary Clinton victory."], "text": "Wall Street firms say relatively stable markets indicate that investors expect a Hillary Clinton victory, with Goldman Sachs Group on Friday citing an 85 percent probability that she would win."}
265
+ {"id": "newsroom-val-sent-163", "summaries": ["The danger of impunity is the very foundation of most violent crimes against humanity."], "text": "The danger of impunity is not merely the lack of legal accountability, but the fact that it is the very foundation of most violent crimes against humanity."}
266
+ {"id": "newsroom-val-sent-164", "summaries": ["Richard Emery abruptly stepped down as head of the police watchdog group on Wednesday."], "text": "Civilian Complaint Review Board head Richard Emery abruptly stepped down as head of the police watchdog group on Wednesday \u2014 a day after he was sued because of crass comments about female co-workers."}
267
+ {"id": "newsroom-val-sent-165", "summaries": ["Angelina Jolie filed for divorce from Brad Pitt."], "text": "Angelina Jolie has filed for divorce from Brad Pitt, a lawyer for the actress told the Associated Press."}
268
+ {"id": "newsroom-val-sent-166", "summaries": ["Health insurers are requesting the right to increase premiums by upwards of 50%"], "text": "According to a report published Friday in the The Wall Street Journal, health insurers are requesting the right in many states to increase premiums by upwards of 50%."}
269
+ {"id": "newsroom-val-sent-167", "summaries": ["Carlson has played in 412 consecutive games."], "text": "Carlson has played in 412 consecutive games, and the team\u2019s top defenseman leads the team in time on ice."}
270
+ {"id": "newsroom-val-sent-168", "summaries": ["The Alberta Party kicked off its campaign in Calgary on Saturday."], "text": "The Alberta Party kicked off its campaign in Calgary on Saturday with a pancake breakfast and promises of a better Alberta."}
271
+ {"id": "newsroom-val-sent-169", "summaries": ["The lead singer and guitarist John Bell shared his favorite places to play."], "text": "Now at the start of a summer/fall 2014 tour that brings the band to Boston on Friday, lead singer and guitarist John Bell shared his favorite places to play."}
272
+ {"id": "newsroom-val-sent-170", "summaries": ["Google announced it will add Google Glass options for prescription glasses."], "text": "On Tuesday, Google announced it will add Google Glass options for prescription glasses, its most requested feature since it launched the face-mounted computers last year."}
273
+ {"id": "newsroom-val-sent-171", "summaries": ["Senator Bernie Sanders has been going after pharmaceutical companies over drug prices."], "text": "Makers of insulin became the latest target for Senator Bernie Sanders, who has been going after pharmaceutical companies one by one over the issue of high U.S. drug prices."}
274
+ {"id": "newsroom-val-sent-172", "summaries": ["Michael Phelps\ufeff dropped $2.5 mil on a HUGE mansion in Scottsdale, Arizona."], "text": "Killing it in water gets you a sick place on land ... just ask Michael Phelps\ufeff, who dropped $2.5 mil on a HUGE mansion in Scottsdale, Arizona ... complete with a sick pool (of course)."}
275
+ {"id": "newsroom-val-sent-173", "summaries": ["Spanish Prime Minister Mariano Rajoy staunchly denied allegations that he received secret cash payments."], "text": "Spanish Prime Minister Mariano Rajoy on Saturday staunchly denied allegations that he and other conservative leaders received secret cash payments for years from the ruling Popular Party."}
276
+ {"id": "newsroom-val-sent-174", "summaries": ["Hillary Clinton wasn\u2019t by any stretch the first woman to run for President."], "text": "Though she just became the first woman to clinch the nomination of either major U.S. political party, Hillary Clinton wasn\u2019t by any stretch the first woman to run for President."}
277
+ {"id": "newsroom-val-sent-175", "summaries": ["Xcel raised $22.5 million in new funding."], "text": "Xcel Pharmaceuticals Inc., San Diego, announced it raised $22.5 million in new funding."}
278
+ {"id": "newsroom-val-sent-176", "summaries": ["Legendary racehorse trainer Bart Cummings has died aged 87."], "text": "Legendary Australian racehorse trainer Bart Cummings has died aged 87, surrounded by his family at his Castlereagh homestead."}
279
+ {"id": "newsroom-val-sent-177", "summaries": ["Fashion designer Ivanka Trump famously converted to Judaism before marrying Jared Kushner in 2009."], "text": "Fashion designer Ivanka Trump, daughter of billionaire real estate mogul and presidential candidate Donald Trump, famously converted to Judaism before marrying Jared Kushner in 2009."}
280
+ {"id": "newsroom-val-sent-178", "summaries": ["A powerful explosion ripped through a religious congregation in Karachi this evening."], "text": "A powerful explosion ripped through a religious congregation in the southern port city of Karachi this evening, leaving at least 47 dead and more than 80 wounded, officials said."}
SCRL_new/example.py ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from scrl.model import load_model
2
+ from transformers import AutoTokenizer
3
+
4
+
5
def main():
    """Compress one hard-coded example sentence and print the result.

    Loads a checkpoint with ``load_model``, fetches the
    ``distilroberta-base`` tokenizer and prints every summary returned by
    ``model.predict`` on its own line. Everything runs on the CPU.
    """
    # Other checkpoint directories that were tried here:
    #   "data/models/gigaword-L8/"
    #   "data/models/newsroom-L11/"
    checkpoint_root = "data/models/newsroom-P75/"
    device = "cpu"

    selector = load_model(checkpoint_root, device)
    tokenizer = AutoTokenizer.from_pretrained("distilroberta-base")

    example_sentence = (
        "Most remaining Covid restrictions in Victoria have now been removed "
        "for those who are fully vaccinated, with the state about to hit its "
        "90% vaccinated target."
    )
    compressed = selector.predict([example_sentence], tokenizer, device)
    for line in compressed:
        print(line)


if __name__ == "__main__":
    main()
SCRL_new/images/model.png ADDED
SCRL_new/loaders/gigaword.py ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ import datasets
4
+ from pathlib import Path
5
+
6
+
7
_DESCRIPTION = "Gigaword dataset"
_DOCUMENT = "document"
_ID = "id"


class GigawordDataset(datasets.GeneratorBasedBuilder):
    """Dataset builder reading JSON-lines files that carry ``id`` and ``text``."""

    VERSION = datasets.Version("1.0.0")

    def _info(self):
        """Declare the two string-valued features of every example."""
        feature_spec = datasets.Features(
            {
                _DOCUMENT: datasets.Value("string"),
                _ID: datasets.Value("string"),
            }
        )
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=feature_spec,
        )

    def _split_generators(self, dl_manager):
        """Return generators for ``train.jsonl`` and ``val.jsonl`` in the data dir."""
        # NOTE(review): `_data_dir` is a private attribute of the download manager.
        root = dl_manager._data_dir
        split_files = (
            (datasets.Split.TRAIN, "train.jsonl", "train"),
            (datasets.Split.VALIDATION, "val.jsonl", "validation"),
        )
        return [
            datasets.SplitGenerator(
                name=split,
                gen_kwargs={"path": os.path.join(root, filename), "name": label},
            )
            for split, filename, label in split_files
        ]

    def _generate_examples(self, path=None, name=None):
        """Yield ``(id, example)`` for each JSON line of the file at ``path``.

        ``name`` is accepted because it is passed through ``gen_kwargs`` but is
        not used here.
        """
        with open(path, encoding="utf-8") as handle:
            for raw_line in handle:
                record = json.loads(raw_line)
                example_id = record["id"]
                yield example_id, {_ID: example_id, _DOCUMENT: record["text"]}
SCRL_new/loaders/newsroom.py ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ import datasets
4
+ from pathlib import Path
5
+
6
+
7
_DESCRIPTION = "Newsroom validation dataset"
_DOCUMENT = "document"
_ID = "id"


class NewsroomDatasetValidation(datasets.GeneratorBasedBuilder):
    """Dataset builder reading JSON-lines files that carry ``id`` and ``sentence``."""

    VERSION = datasets.Version("1.0.0")

    def _info(self):
        """Declare the two string-valued features of every example."""
        columns = {
            _DOCUMENT: datasets.Value("string"),
            _ID: datasets.Value("string"),
        }
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=datasets.Features(columns),
        )

    def _split_generators(self, dl_manager):
        """Return generators for ``train.jsonl`` and ``val.jsonl`` in the data dir."""
        # NOTE(review): `_data_dir` is a private attribute of the download manager.
        base_dir = dl_manager._data_dir

        train_split = datasets.SplitGenerator(
            name=datasets.Split.TRAIN,
            gen_kwargs={"path": os.path.join(base_dir, "train.jsonl"), "name": "train"},
        )
        validation_split = datasets.SplitGenerator(
            name=datasets.Split.VALIDATION,
            gen_kwargs={"path": os.path.join(base_dir, "val.jsonl"), "name": "validation"},
        )
        return [train_split, validation_split]

    def _generate_examples(self, path=None, name=None):
        """Yield ``(id, example)`` for each JSON line of the file at ``path``.

        The document text is taken from the ``sentence`` field of each record.
        ``name`` is accepted because it is passed through ``gen_kwargs`` but is
        not used here.
        """
        with open(path, encoding="utf-8") as handle:
            for raw_line in handle:
                record = json.loads(raw_line)
                example_id = record["id"]
                yield example_id, {_ID: example_id, _DOCUMENT: record["sentence"]}
SCRL_new/requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ transformers==4.11.3
2
+ datasets==1.14.0
3
+ sentence-transformers==2.1.0
4
+ rouge-score==0.0.4
5
+ nltk==3.6.5
SCRL_new/scrl/__init__.py ADDED
File without changes
SCRL_new/scrl/config.py ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ from dataclasses import dataclass, make_dataclass, asdict, field
3
+ from typing import List
4
+
5
+
6
@dataclass
class Config:
    """Default training settings; `load_config` layers overrides on top of these."""

    # paths
    config: str = "config/default.json"
    loader: str = "loaders/newsroom.py"
    dataset: str = ""
    indices: str = ""
    model_dir: str = "default_model_dir"
    validation_datasets: List = field(default_factory=list)

    # training settings/hyperparams
    batch_size: int = 4
    learning_rate: float = 0.00001
    k_samples: int = 1
    sample_aggregation: str = "max"  # must be "max" or "mean", see validate_config
    # the three limits below default to None (annotation kept as in the original)
    max_val_steps: int = None
    max_train_steps: int = None
    max_train_seconds: int = None
    print_every: int = 10
    save_every: int = 100
    eval_every: int = 100
    verbose: bool = True

    # pretrained models
    encoder_model_id: str = "distilroberta-base"
    # reward settings
    rewards: tuple = (
        "FluencyReward",
        "BiEncoderSimilarity",
        "GaussianLength",
    )


def validate_config(args):
    """Check that `args.sample_aggregation` is one of the supported values.

    Raises:
        AssertionError: if the value is neither "max" nor "mean".
    """
    assert args.sample_aggregation in ("max", "mean"), (
        "sample_aggregation must be 'max' or 'mean', "
        f"got {args.sample_aggregation!r}"
    )


def load_config(args):
    """
    Loads settings into a dataclass object, from the following sources:
    - defaults defined above by Config
    - args.config (path to a JSON config file)
    - args (from using argparse in a script)

    Overlapping fields are overwritten in that order. Note that every
    attribute present on `args` wins, including ones whose value is None.

    Example usage:
        (...)
        args = load_config(parser.parse_args())
        args.batch_size

    Args:
        args: object whose attributes are the overrides (e.g. an
            argparse.Namespace). Its optional `config` attribute is the path
            of a JSON file; a missing or falsy value skips the file.

    Returns:
        An instance of a dynamically created dataclass with one field per
        merged setting.

    Raises:
        AssertionError: if the merged settings fail `validate_config`.
    """
    config = asdict(Config())
    config_path = getattr(args, "config", None)
    if config_path:
        with open(config_path, encoding="utf-8") as f:
            config.update(json.load(f))
    config.update(vars(args))
    # make_dataclass expects (name, type) pairs; passing the raw items would
    # register each *value* as the field's type.
    Config_ = make_dataclass(
        "Config",
        fields=[(key, type(value)) for key, value in config.items()],
    )
    config_object = Config_(**config)
    validate_config(config_object)
    return config_object
SCRL_new/scrl/config_hc.py ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ from dataclasses import dataclass, make_dataclass, asdict, field
3
+ from typing import List
4
+
5
+
6
@dataclass
class Config:
    """Default hill-climbing settings; `load_config` layers overrides on top."""

    device: str = "cpu"
    # paths
    config: str = "config/default.json"
    loader: str = "loaders/google_sc.py"
    dataset: str = ""
    indices: str = ""
    model_dir: str = "default_model_dir"
    validation_datasets: List = field(default_factory=list)

    # training settings/hyperparams
    batch_size: int = 4
    verbose: bool = True

    # pretrained models
    encoder_model_id: str = "distilroberta-base"
    # reward settings
    rewards: tuple = (
        "FluencyReward",
        "CrossSimilarityReward",
    )


def load_config(args):
    """
    Loads settings into a dataclass object, from the following sources:
    - defaults defined above by Config
    - args.config (path to a JSON config file)
    - args (from using argparse in a script)

    Overlapping fields are overwritten in that order. Note that every
    attribute present on `args` wins, including ones whose value is None.

    Example usage:
        (...)
        args = load_config(parser.parse_args())
        args.batch_size

    Args:
        args: object whose attributes are the overrides (e.g. an
            argparse.Namespace). Its optional `config` attribute is the path
            of a JSON file; a missing or falsy value skips the file.

    Returns:
        An instance of a dynamically created dataclass with one field per
        merged setting.
    """
    config = asdict(Config())
    config_path = getattr(args, "config", None)
    if config_path:
        with open(config_path, encoding="utf-8") as f:
            config.update(json.load(f))
    config.update(vars(args))
    # make_dataclass expects (name, type) pairs; passing the raw items would
    # register each *value* as the field's type.
    Config_ = make_dataclass(
        "Config",
        fields=[(key, type(value)) for key, value in config.items()],
    )
    return Config_(**config)
SCRL_new/scrl/data.py ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from datasets import load_dataset
2
+
3
+
4
def load_data_for_training(
    tokenizer,
    loader_path,
    dataset_dir,
    max_input_length=256,
):
    """Load a dataset through a loader script and tokenize its documents.

    Args:
        tokenizer: callable applied to a list of document strings together
            with ``max_length`` and ``truncation=True``.
        loader_path: forwarded to ``load_dataset`` as ``path``.
        dataset_dir: forwarded to ``load_dataset`` as ``data_dir``.
        max_input_length: ``max_length`` handed to the tokenizer.

    Returns:
        The result of mapping the tokenizer over the loaded dataset in
        batched mode.
    """

    def tokenize_batch(batch):
        # each batch exposes its texts under the "document" column
        documents = list(batch["document"])
        return tokenizer(
            documents, max_length=max_input_length, truncation=True
        )

    raw_splits = load_dataset(path=loader_path, data_dir=dataset_dir)
    return raw_splits.map(tokenize_batch, batched=True)
SCRL_new/scrl/eval_metrics.py ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from collections import Counter
2
+ from rouge_score import rouge_scorer
3
+
4
+
5
# ROUGE variants computed by the shared scorer below.
ROUGE_TYPES = ["rouge1", "rouge2", "rougeL"]
# NOTE: this rebinds the imported `rouge_scorer` module name to a scorer
# instance; after this line the module itself is no longer reachable here.
rouge_scorer = rouge_scorer.RougeScorer(ROUGE_TYPES, use_stemmer=True)
10
+
11
+
12
def compute_token_f1(tgt_tokens, pred_tokens, use_counts=True):
    """Token-overlap F1 between a target and a predicted token sequence.

    Args:
        tgt_tokens: reference tokens.
        pred_tokens: predicted tokens.
        use_counts: when True, repeated tokens count as often as they occur;
            when False, both inputs are reduced to sets first.

    Returns:
        The harmonic mean of precision and recall, or ``0.`` when the two
        inputs share no token.
    """
    if not use_counts:
        tgt_tokens, pred_tokens = set(tgt_tokens), set(pred_tokens)

    tgt_counts = Counter(tgt_tokens)
    pred_counts = Counter(pred_tokens)
    vocabulary = set(tgt_tokens) | set(pred_tokens)
    overlap = sum(min(tgt_counts[tok], pred_counts[tok]) for tok in vocabulary)

    # no shared token: precision, recall and F1 are all zero
    if not overlap > 0:
        return 0.

    precision = overlap / len(pred_tokens)
    recall = overlap / len(tgt_tokens)
    if min(precision, recall) > 0:
        return (2 * precision * recall) / (precision + recall)
    return 0.
SCRL_new/scrl/hill_climbing.py ADDED
@@ -0,0 +1,166 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import random
2
+ import numpy as np
3
+ from nltk import word_tokenize
4
+ from collections import defaultdict
5
+ from copy import deepcopy
6
+ import tqdm
7
+
8
+
9
class PunktTokenizer:
    """Callable that tokenizes each text with NLTK's ``word_tokenize``."""

    def __call__(self, texts):
        """Return one token list per entry of ``texts``, in order."""
        return list(map(word_tokenize, texts))
12
+
13
+
14
class WhiteSpaceTokenizer:
    """Callable that splits each text on runs of whitespace."""

    def __call__(self, texts):
        """Return one token list per entry of ``texts``, in order."""
        tokenized = []
        for text in texts:
            tokenized.append(text.split())
        return tokenized
17
+
18
+
19
class SearchState:
    """Step-by-step history of one search run over a tokenized sentence.

    Every call to ``update`` records one (mask, summary, score) triple; the
    index of the highest-scoring step so far is kept in ``best_step``.
    """

    def __init__(self, tokens):
        self.tokens = tokens
        self.masks = list()
        self.mask_set = set()  # tuple form of every mask recorded so far
        self.summaries = list()
        self.scores = list()
        self.best_step = None  # stays None until the first update
        self.terminated = False
        self.step = 0

    def update(self, mask, summary, score):
        """Record one step; it becomes the best step only on a strictly higher score."""
        no_best_yet = self.best_step is None
        if no_best_yet or score > self.best_score():
            # the triple appended below lands at index `self.step`
            self.best_step = self.step
        self.masks.append(mask)
        self.mask_set.add(tuple(mask))
        self.summaries.append(summary)
        self.scores.append(score)
        self.step = self.step + 1

    def best_mask(self):
        """Mask recorded at the best step."""
        best = self.best_step
        return self.masks[best]

    def best_score(self):
        """Score recorded at the best step."""
        best = self.best_step
        return self.scores[best]

    def best_summary(self):
        """Summary recorded at the best step."""
        best = self.best_step
        return self.summaries[best]

    def to_dict(self):
        """Return the recorded history plus the best summary and score as a dict."""
        return dict(
            scores=self.scores,
            masks=self.masks,
            summaries=self.summaries,
            best_summary=self.best_summary(),
            best_score=self.best_score(),
        )
56
+
57
+
58
class DynamicRestartHCSC:
    """Hill-climbing search over binary token masks, with restarts.

    Each sentence is tokenized, and a mask selecting ``target_len`` tokens is
    improved by swapping one selected token for an unselected one. When no
    unseen neighbour can be found for a run, the run is archived and a fresh
    one is started for that sentence.
    """

    def __init__(self, tokenizer, objective):
        """
        Args:
            tokenizer: callable mapping a list of sentences to token lists.
            objective: callable invoked as ``objective(sentences, summaries)``;
                the first element of its return value is indexed per sentence
                to obtain the score.
        """
        self.tokenizer = tokenizer
        self.objective = objective
        # number of random swaps tried before a run is declared exhausted
        self.n_trials = 100

    def _mask_to_summary(self, mask, tokens):
        """Join the tokens whose mask entry equals 1 with single spaces."""
        summary = [tokens[i] for i in range(len(mask)) if mask[i] == 1]
        return " ".join(summary)

    def _sample(self, state, sent_len, target_len, from_scratch=False):
        """
        Proposes the next mask for ``state``.

        - If ``target_len >= sent_len`` every token is kept and the run is
          marked terminated.
        - On the first step (or with ``from_scratch``) ``target_len`` positions
          are sampled at random.
        - Otherwise one selected position of the best mask so far is swapped
          for an unselected one, retrying up to ``n_trials`` times until a
          mask not yet in ``state.mask_set`` is found.

        Returns:
            ``(mask, terminated)``; ``terminated`` is True when no new mask
            could be proposed.
        """
        if target_len >= sent_len:
            mask = [1 for _ in range(sent_len)]
            state.terminated = True
            return mask, True
        if state.step == 0 or from_scratch:
            indices = list(range(sent_len))
            sampled = set(random.sample(indices, min(target_len, sent_len)))
            mask = [int(i in sampled) for i in indices]
            return mask, False

        mask = state.masks[state.best_step]
        one_indices = [i for i in range(len(mask)) if mask[i] == 1]
        zero_indices = [i for i in range(len(mask)) if mask[i] == 0]
        if not one_indices or not zero_indices:
            # No swap is possible (e.g. target_len == 0 or an all-ones mask).
            # Always hand back a (mask, terminated) pair so callers can unpack
            # it, instead of a bare mask or an IndexError from random.choice.
            return mask, True
        terminated = True
        # trying to find unknown state, heuristically with fixed no. trials
        for _ in range(self.n_trials):
            i = random.choice(one_indices)
            j = random.choice(zero_indices)
            new_mask = mask.copy()
            new_mask[i] = 0
            new_mask[j] = 1
            if tuple(new_mask) not in state.mask_set:
                terminated = False
                mask = new_mask
                break
        # terminate if no unknown neighbor state is found
        return mask, terminated

    def aggregate_states(self, states):
        """Concatenate the histories of several runs and pick the best step.

        Args:
            states: objects exposing ``masks``, ``summaries`` and ``scores``
                lists of equal length.

        Returns:
            A dict with the concatenated ``masks``, ``summaries`` and
            ``scores`` plus ``best_score`` / ``best_summary`` taken at the
            argmax of the scores.
        """
        masks = [m for s in states for m in s.masks]
        summaries = [x for s in states for x in s.summaries]
        scores = [x for s in states for x in s.scores]
        best_step = np.argmax(scores)
        return {
            "masks": masks,
            "summaries": summaries,
            "scores": scores,
            "best_score": scores[best_step],
            "best_summary": summaries[best_step],
        }

    def __call__(
        self,
        sentences,
        target_lens,
        n_steps=100,
        verbose=False,
        return_states=False,
    ):
        """Run ``n_steps`` search steps for a batch of sentences.

        Args:
            sentences: list of input sentences.
            target_lens: per-sentence number of tokens to keep.
            n_steps: number of search steps applied to every sentence.
            verbose: print restarts and per-step scores/summaries.
            return_states: accepted for compatibility; not used in this method.

        Returns:
            One ``aggregate_states`` dict per sentence, covering all runs
            (archived and current) of that sentence.
        """
        tok_sentences = self.tokenizer(sentences)
        batch_size = len(sentences)
        terminated_states = [[] for _ in range(batch_size)]
        states = [SearchState(s) for s in tok_sentences]

        for t in tqdm.tqdm(list(range(1, n_steps + 1))):
            masks = []
            for i in range(batch_size):
                if states[i].terminated:
                    if verbose:
                        print(f"step {t}, restarting state {i} with score {states[i].best_score()}")
                    # archive the exhausted run and start over for sentence i
                    terminated_states[i].append(states[i])
                    states[i] = SearchState(tok_sentences[i])

                mask, terminated = self._sample(
                    states[i],
                    sent_len=len(tok_sentences[i]),
                    target_len=target_lens[i],
                )
                states[i].terminated = terminated
                masks.append(mask)

            summaries = [
                self._mask_to_summary(m, tokens)
                for m, tokens in zip(masks, tok_sentences)
            ]
            scores, _ = self.objective(sentences, summaries)

            if verbose:
                print(f"t={t}")
                for i in range(batch_size):
                    print(f"[{scores[i]:.3f}][{summaries[i]}]")
                print()

            for i in range(batch_size):
                states[i].update(masks[i], summaries[i], scores[i])

        # include the runs that were still active when the step budget ran out
        for i in range(batch_size):
            terminated_states[i].append(states[i])
        output_states = [
            self.aggregate_states(i_states) for i_states in terminated_states
        ]
        return output_states
SCRL_new/scrl/model.py ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as F
4
+ from torch.nn.utils.rnn import pad_sequence
5
+ from transformers import AutoModel
6
+ from pathlib import Path
7
+
8
+
9
class LinearTokenSelector(nn.Module):
    """Per-token binary classifier on top of a transformer encoder.

    The encoder's last hidden state is projected by a bias-free linear layer
    to two classes per token; label 1 means "keep the token" (see
    ``labels_to_summary``).
    """

    def __init__(self, encoder, embedding_size=768):
        """
        :param encoder: module called as ``encoder(x, output_hidden_states=True)``
            whose output exposes ``["hidden_states"]``; may be ``None`` and
            attached later (as ``load_checkpoint`` does)
        :param embedding_size: width of the encoder's hidden states
        """
        super().__init__()
        self.encoder = encoder
        self.classifier = nn.Linear(embedding_size, 2, bias=False)

    def forward(self, x):
        """Return per-token log-probabilities over the two labels.

        :param x: batch of token ids, fed to the encoder as-is
        :return: tensor of shape B * S * 2 (log-softmax over the last dim)
        """
        output = self.encoder(x, output_hidden_states=True)
        hidden = output["hidden_states"][-1]  # B * S * H
        return F.log_softmax(self.classifier(hidden), dim=2)

    def save(self, classifier_path, encoder_path):
        """Save the classifier weights and the encoder separately.

        Only ``classifier.*`` entries of the state dict go to
        ``classifier_path``; the encoder is written with its own
        ``save_pretrained``.
        """
        state = {
            k: v for k, v in self.state_dict().items()
            if k.startswith("classifier")
        }
        torch.save(state, classifier_path)
        self.encoder.save_pretrained(encoder_path)

    def predict(self, texts, tokenizer, device):
        """Compress ``texts`` by keeping the tokens whose argmax label is 1.

        :param texts: list of input strings
        :param tokenizer: callable returning ``{"input_ids": [...]}`` and
            providing ``decode``
        :param device: device the padded id batch is moved to
        :return: list of summary strings, one per input text
        """
        input_ids = tokenizer(texts)["input_ids"]
        input_ids = pad_sequence(
            [torch.tensor(ids) for ids in input_ids], batch_first=True
        ).to(device)
        # Inference only: no autograd graph is needed for an argmax decode.
        with torch.no_grad():
            logits = self.forward(input_ids)
        argmax_labels = torch.argmax(logits, dim=2)
        return labels_to_summary(input_ids, argmax_labels, tokenizer)
36
+
37
+
38
def load_model(model_dir, device="cuda", prefix="best"):
    """Load the checkpoint under ``model_dir/checkpoints`` whose name starts with ``prefix``.

    :param model_dir: training directory (``str`` or ``Path``)
    :param device: device passed on to ``load_checkpoint``
    :param prefix: checkpoint directory name prefix, e.g. ``"best"`` or ``"latest"``
    :return: whatever ``load_checkpoint`` returns for the selected directory
    :raises FileNotFoundError: if no checkpoint directory matches ``prefix``
        (or the ``checkpoints`` directory itself is missing)
    """
    model_dir = Path(model_dir)
    checkpoints_root = model_dir / "checkpoints"
    wanted = f"{prefix}"
    checkpoint_dir = None
    for p in checkpoints_root.iterdir():
        # If several entries match, the last one yielded by iterdir() wins.
        if p.name.startswith(wanted):
            checkpoint_dir = p
    if checkpoint_dir is None:
        raise FileNotFoundError(
            f"no checkpoint starting with '{wanted}' found in {checkpoints_root}"
        )
    return load_checkpoint(checkpoint_dir, device=device)
45
+
46
+
47
def load_checkpoint(checkpoint_dir, device="cuda"):
    """Rebuild a ``LinearTokenSelector`` from one checkpoint directory.

    The directory must contain ``encoder.bin`` (loaded with
    ``AutoModel.from_pretrained``) and ``classifier.bin`` (a state dict
    holding the ``classifier.*`` weights).

    :param checkpoint_dir: checkpoint directory (``str`` or ``Path``)
    :param device: device the encoder and classifier are placed on
    :return: the assembled selector, moved to ``device``
    """
    root = Path(checkpoint_dir) if isinstance(checkpoint_dir, str) else checkpoint_dir
    encoder_path = root / "encoder.bin"
    classifier_path = root / "classifier.bin"

    encoder = AutoModel.from_pretrained(encoder_path).to(device)
    # The classifier input width is read off the word-embedding matrix.
    word_embeddings = encoder.state_dict()["embeddings.word_embeddings.weight"]
    embedding_size = word_embeddings.shape[1]

    # Build the selector without an encoder first so that only the
    # classifier weights need to be present in the loaded state dict.
    selector = LinearTokenSelector(None, embedding_size).to(device)
    saved = torch.load(classifier_path, map_location=device)
    selector.load_state_dict(
        {k: v for k, v in saved.items() if k.startswith("classifier")}
    )
    selector.encoder = encoder
    return selector.to(device)
66
+
67
+
68
def labels_to_summary(input_batch, label_batch, tokenizer):
    """Decode, for every sequence, the tokens whose label equals 1.

    :param input_batch: iterable of token-id sequences
    :param label_batch: iterable of label sequences aligned with ``input_batch``
    :param tokenizer: object with ``decode(ids, skip_special_tokens=True)``
    :return: list of decoded summary strings
    """
    def _decode_kept(ids, labels):
        kept = []
        for pos in range(len(ids)):
            if labels[pos] == 1:
                kept.append(int(ids[pos]))
        return tokenizer.decode(kept, skip_special_tokens=True)

    return [
        _decode_kept(ids, labels)
        for ids, labels in zip(input_batch, label_batch)
    ]
SCRL_new/scrl/rewards.py ADDED
@@ -0,0 +1,330 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import torch
3
+ import torch.nn.functional as F
4
+ from torch.nn.utils.rnn import pad_sequence
5
+ from sentence_transformers import SentenceTransformer, CrossEncoder
6
+ from sentence_transformers.util import pytorch_cos_sim
7
+ from transformers import AutoTokenizer, AutoModelForMaskedLM, AutoModelForCausalLM
8
+ from nltk import word_tokenize
9
+ from collections import defaultdict
10
+ from pprint import pprint
11
+
12
+
13
+ from collections import Counter
14
+ from rouge_score import rouge_scorer
15
+
16
+
17
# ROUGE variants the shared scorer below is configured with.
ROUGE_TYPES = ["rouge1", "rouge2", "rougeL"]
# NOTE: this rebinds the name `rouge_scorer` from the imported module to a
# RougeScorer instance; from here on the module is no longer reachable under
# that name in this file, and `rouge_scorer.score(...)` calls the instance.
rouge_scorer = rouge_scorer.RougeScorer(
    ROUGE_TYPES,
    use_stemmer=True
)
22
+
23
+
24
def load_rewards(args):
    """Instantiate every reward configured in ``args.rewards``.

    ``args.rewards`` maps the name of a reward class defined in this module
    to its keyword arguments; ``args.device`` is passed to every reward as
    ``device``. The configuration dicts themselves are left untouched.

    :param args: object with ``rewards`` (dict) and ``device`` attributes
    :return: ``RewardAggregator`` over the instantiated rewards, in
        configuration order
    :raises KeyError: if a configured reward name is not defined in this module
    """
    rewards, names = [], []
    for name, settings in args.rewards.items():
        # Work on a copy so the caller's config is not mutated.
        settings = dict(settings, device=args.device)
        print("Loading reward:", name)
        pprint(settings)
        print()
        try:
            reward_cls = globals()[name]
        except KeyError:
            raise KeyError(f"unknown reward '{name}'") from None
        rewards.append(reward_cls(**settings))
        names.append(name)
    return RewardAggregator(rewards, names)
36
+
37
+
38
class RewardAggregator:
    """Combine several reward functions into one weighted-average score."""

    def __init__(self, reward_generators, reward_names):
        """
        :param reward_generators: callables taking ``sources=`` / ``summaries=``
            and exposing a ``weight`` attribute
        :param reward_names: names aligned with ``reward_generators``
        """
        self.reward_generators = reward_generators
        self.reward_names = reward_names
        self.weights = [generator.weight for generator in reward_generators]
        self.n_rewards = len(reward_generators)

    def __call__(self, sources, summaries):
        """Score every (source, summary) pair.

        :return: ``(final_scores, name_to_scores)`` where ``final_scores[i]``
            is the weight-normalised sum of all rewards for item ``i`` and
            ``name_to_scores`` holds each reward's raw per-item scores
        """
        name_to_scores = {
            name: generator(sources=sources, summaries=summaries)
            for generator, name in zip(self.reward_generators, self.reward_names)
        }

        weighted = list(zip(self.reward_names, self.weights))
        final_scores = []
        for idx in range(len(summaries)):
            numerator = sum(
                (name_to_scores[name][idx] * w for name, w in weighted), 0.
            )
            denominator = sum((w for _, w in weighted), 0.)
            final_scores.append(numerator / denominator)

        return final_scores, name_to_scores
61
+
62
+
63
class Fluency:
    """Language-model based fluency reward.

    A summary's score is the mean, over its tokens, of the logit the model
    assigns to that very token at its position, normalised by the configured
    score bounds.
    """

    def __init__(
        self,
        model_id="distilroberta",
        weight=1,
        type="masked",
        device="cuda",
        norm="max",
        max_score=40.,
        min_score=-30.,
    ):
        """
        :param model_id: identifier passed to the tokenizer/model loaders
        :param weight: weight of this reward in the aggregate
        :param type: ``"masked"`` loads a masked LM and pads with the pad
            token; anything else loads a causal LM and pads with the EOS token
        :param device: device the model and inputs are placed on
        :param norm: ``"max"`` (mean / max_score) or ``"minmax"``
        :param max_score: upper bound used for normalisation
        :param min_score: lower bound used by ``"minmax"`` normalisation
        """
        self.tokenizer = AutoTokenizer.from_pretrained(model_id)
        if type == "masked":
            self.pad_token_id = self.tokenizer.pad_token_id
            model_cls = AutoModelForMaskedLM
        else:
            self.pad_token_id = self.tokenizer.eos_token_id
            model_cls = AutoModelForCausalLM
        self.model = model_cls.from_pretrained(model_id).to(device)

        self.weight = weight
        self.device = device
        self.max_score = max_score
        self.min_score = min_score
        self.norm = norm
        assert self.norm in ("max", "minmax")

    def ids_to_tokens(self, ids):
        """Map token ids to their token strings."""
        return [self.tokenizer._convert_id_to_token(token_id) for token_id in ids]

    def __call__(self, sources=None, summaries=None, normalize_len=False):
        """Return one normalised fluency score per summary.

        ``sources`` and ``normalize_len`` are accepted but not used here.
        """
        # The tokenizer breaks on empty strings, so feed a single space instead.
        texts = [" " if text == "" else text for text in summaries]
        encoded = [torch.tensor(self.tokenizer.encode(text)) for text in texts]
        batch = pad_sequence(
            encoded,
            batch_first=True,
            padding_value=self.pad_token_id
        ).to(self.device)
        with torch.no_grad():
            logits = self.model(input_ids=batch, labels=batch)["logits"]

        score_range = self.max_score - self.min_score
        results = []
        for row_ids, row_logits in zip(batch, logits):
            token_scores = []
            for pos, token_id in enumerate(row_ids):
                # Stop at the first padding token of this row.
                if token_id == self.pad_token_id:
                    break
                token_scores.append(row_logits[pos, token_id].item())
            mean_score = np.mean(token_scores)
            if self.norm == "max":
                results.append(mean_score / self.max_score)
            else:
                results.append((mean_score - self.min_score) / score_range)
        return results
127
+
128
+
129
class BiEncoderSimilarity:
    """Reward: cosine similarity between source and summary sentence embeddings."""

    def __init__(
        self,
        model_id="all-distilroberta-v1",
        device="cuda",
        weight=1
    ):
        """
        :param model_id: SentenceTransformer model identifier
        :param device: device the encoder is moved to
        :param weight: weight of this reward in the aggregate
        """
        self.model = SentenceTransformer(model_id).to(device)
        self.weight = weight

    def __call__(self, sources=None, summaries=None):
        """Return one cosine similarity per (source, summary) pair."""
        src_embs = self.model.encode(sources)
        sum_embs = self.model.encode(summaries)

        def _pair_similarity(idx):
            # Each embedding is reshaped to a single-row matrix so the
            # similarity comes back as a 1x1 matrix.
            sim = pytorch_cos_sim(
                src_embs[idx].reshape(1, -1),
                sum_embs[idx].reshape(1, -1),
            )
            return sim[0, 0].item()

        return [_pair_similarity(idx) for idx in range(len(summaries))]
150
+
151
+
152
class CrossEncoderSimilarity:
    """Reward: cross-encoder score of each (source, summary) pair."""

    def __init__(
        self,
        model_id="all-distilroberta-v1",
        device="cuda",
        weight=1
    ):
        """
        :param model_id: CrossEncoder model identifier
        :param device: device handed to the CrossEncoder
        :param weight: weight of this reward in the aggregate
        """
        self.weight = weight
        self.model = CrossEncoder(model_id, device=device)

    def __call__(self, sources=None, summaries=None):
        """Return the model's prediction for every pair as a plain list."""
        pairs = list(zip(sources, summaries))
        predictions = self.model.predict(pairs)
        return predictions.tolist()
167
+
168
+
169
class SelectedTokenSimilarity:
    """Reward comparing each summary token's embedding with its source counterpart.

    Summary tokens are aligned to identical source tokens; the score of a
    summary is the mean, over its tokens, of the best cosine similarity
    between the summary-side and source-side token embeddings (0 for tokens
    that do not occur in the source).
    """

    def __init__(
        self,
        model_id="all-distilroberta-v1",
        device="cuda",
        weight=1
    ):
        """
        :param model_id: SentenceTransformer model identifier
        :param device: device the encoder is moved to
        :param weight: weight of this reward in the aggregate
        """
        self.model = SentenceTransformer(model_id).to(device)
        self.weight = weight
        # The tokenizer comes from the loaded model; the original code read
        # it from an undefined name `model`.
        self.tokenizer = self.model.tokenizer

    def ids_to_tokens(self, ids):
        """Map token ids to their token strings."""
        return [self.tokenizer._convert_id_to_token(id) for id in ids]

    def align_tokens(self, src, summary):
        """Map every summary token position to the matching source positions.

        Both texts are tokenized with truncation to the model's
        ``max_seq_length``.

        :return: dict ``summary_pos -> [source_pos, ...]``; the value is
            ``None`` for summary tokens that do not appear in the source
        """
        src_ids, sum_ids = self.tokenizer(
            [src, summary],
            truncation=True,
            max_length=self.model.max_seq_length,
        ).input_ids
        src_tokens = self.ids_to_tokens(src_ids)
        sum_tokens = self.ids_to_tokens(sum_ids)
        sum_to_src = defaultdict(list)
        for i, sum_tok in enumerate(sum_tokens):
            for j, src_tok in enumerate(src_tokens):
                if sum_tok == src_tok:
                    sum_to_src[i].append(j)
            if len(sum_to_src[i]) == 0:
                sum_to_src[i] = None
        return sum_to_src

    def compute_score(self, x_sum, x_src, sum_to_src):
        """Average the best aligned similarity over all summary tokens.

        :param x_sum: token embeddings of the summary
        :param x_src: token embeddings of the source
        :param sum_to_src: alignment produced by ``align_tokens``
        """
        S = pytorch_cos_sim(x_sum, x_src).cpu().numpy()
        scores = []
        for i, J in sum_to_src.items():
            if J is None:
                # Token not present in the source contributes zero.
                i_score = 0.
            else:
                i_score = max(S[i, j] for j in J)
            scores.append(i_score)
        return np.mean(scores)

    def __call__(self, sources=None, summaries=None):
        """Return one score per (source, summary) pair."""
        src_embs = self.model.encode(sources, output_value="token_embeddings")
        sum_embs = self.model.encode(summaries, output_value="token_embeddings")
        scores = []
        for i in range(len(summaries)):
            sum_to_src = self.align_tokens(sources[i], summaries[i])
            scores.append(self.compute_score(sum_embs[i], src_embs[i], sum_to_src))
        return scores
223
+
224
+
225
class NLIReward():
    """Reward: probability that the source entails the summary, per an NLI cross-encoder."""

    def __init__(
        self,
        model_id="cross-encoder/nli-distilroberta-base",
        device="cuda",
        weight=1
    ):
        """
        :param model_id: CrossEncoder NLI model identifier
        :param device: device handed to the CrossEncoder
        :param weight: weight of this reward in the aggregate
        """
        # `device` must be passed by keyword (as CrossEncoderSimilarity does):
        # the second positional parameter of CrossEncoder is not the device.
        self.model = CrossEncoder(model_id, device=device)
        # Order of the model's output classes; index 1 is read as "entailment".
        self.label_mapping = ['contradiction', 'entailment', 'neutral']
        self.weight = weight

    def __call__(self, sources=None, summaries=None):
        """Return the entailment probability for every (source, summary) pair.

        Summaries that are empty after stripping whitespace get a reward of 0.
        """
        scores = self.model.predict([
            (src, summary) for src, summary in zip(sources, summaries)
        ])
        probs = torch.softmax(torch.tensor(scores), dim=1)
        entailment = self.label_mapping.index('entailment')
        rewards = [probs[i, entailment].item() for i in range(len(summaries))]
        return [
            (0 if summaries[i].strip() == "" else r)
            for i, r in enumerate(rewards)
        ]
250
+
251
+
252
class GaussianLength:
    """Reward peaking at a target summary length (in word tokens).

    Rewards for lengths ``0..max_len`` are precomputed from a Gaussian and
    rescaled so the best length in that range gets 1.
    """

    def __init__(self, mean=11, std=0.3, max_len=100, weight=1, device=None):
        """
        :param mean: target length
        :param std: standard deviation of the Gaussian
        :param max_len: longest length with a non-zero reward
        :param weight: weight of this reward in the aggregate
        :param device: accepted for a uniform reward interface; not used
        """
        self.weight = weight
        self.max_len = max_len
        lengths = np.arange(0, max_len + 1)
        rewards = gaussian(lengths, mean, std)
        rewards /= rewards.max()
        self.len_to_reward = {n: rewards[n] for n in lengths}

    def __call__(self, sources=None, summaries=None):
        """Return the length reward of every summary (0 beyond ``max_len``)."""
        rewards = []
        for text in summaries:
            n_tokens = len(word_tokenize(text))
            if n_tokens <= self.max_len:
                rewards.append(self.len_to_reward[n_tokens])
            else:
                rewards.append(0.)
        return rewards
268
+
269
+
270
class GaussianCR:
    """Reward peaking at a target compression ratio (summary length / source length).

    Rewards are precomputed on a 0.01-spaced ratio grid and rescaled so the
    best grid point gets 1; ratios above 1 are clipped to 1.
    """

    def __init__(self, mean=0.45, std=0.3, weight=1, device=None):
        """
        :param mean: target compression ratio
        :param std: standard deviation of the Gaussian
        :param weight: weight of this reward in the aggregate
        :param device: accepted for a uniform reward interface; not used
        """
        self.weight = weight
        grid = np.arange(0, 1.1, 0.01)
        rewards = gaussian(grid, mean, std)
        rewards /= rewards.max()
        # Keys are rounded so that lookups with rounded ratios hit the grid.
        self.ratio_to_reward = {
            round(ratio, 3): reward for ratio, reward in zip(grid, rewards)
        }

    def __call__(self, sources=None, summaries=None):
        """Return the compression-ratio reward of every (source, summary) pair."""
        rewards = []
        for source, summary in zip(sources, summaries):
            n_source = len(word_tokenize(source))
            n_summary = len(word_tokenize(summary))
            ratio = min(1., round(n_summary / n_source, 2))
            rewards.append(self.ratio_to_reward[round(ratio, 2)])
        return rewards
289
+
290
+
291
class NoDaysReward():
    """Reward 1 for summaries free of day words, 0 otherwise.

    Matching is a case-insensitive substring test against ``day_words``.
    """

    def __init__(self, weight=1, device=None):
        """
        :param weight: weight of this reward in the aggregate
        :param device: accepted for a uniform reward interface; not used
        """
        weekdays = [
            "monday", "tuesday", "wednesday",
            "thursday", "friday", "saturday", "sunday",
        ]
        relative_days = ["today", "tomorrow", "yesterday", "tonight"]
        self.day_words = weekdays + relative_days
        self.weight = weight

    def __call__(self, sources=None, summaries=None):
        """Return 0. for every summary containing a day word, else 1."""
        def _score(summary):
            lowered = summary.lower()
            has_day_word = any(word in lowered for word in self.day_words)
            return 0. if has_day_word else 1.

        return [_score(summary) for summary in summaries]
310
+
311
+
312
def gaussian(x, mu, sig):
    """Unnormalised Gaussian ``exp(-(x - mu)^2 / (2 * sig^2))``; peaks at 1 for ``x == mu``."""
    squared_distance = np.power(x - mu, 2.)
    two_variance = 2 * np.power(sig, 2.)
    return np.exp(-squared_distance / two_variance)
314
+
315
+
316
class RougeReward:
    """Reward: ROUGE F-measure of each summary against a reference target.

    ``targets`` starts as ``None`` and has to be assigned (one reference per
    summary) before the reward is called.
    """

    def __init__(self, rouge_type="rougeL", weight=1, device=None):
        """
        :param rouge_type: key of the ROUGE variant to read from the scorer output
        :param weight: weight of this reward in the aggregate
        :param device: accepted for a uniform reward interface; not used
        """
        self.targets = None
        self.weight = weight
        self.rouge_type = rouge_type

    def __call__(self, sources=None, summaries=None):
        """Return the configured ROUGE F-measure for every (summary, target) pair."""
        return [
            rouge_scorer.score(target, prediction)[self.rouge_type].fmeasure
            for prediction, target in zip(summaries, self.targets)
        ]
329
+
330
+ #
SCRL_new/scrl/sampling.py ADDED
@@ -0,0 +1,99 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import random
3
+ import numpy as np
4
+ from collections import defaultdict
5
+ from torch.distributions import Categorical
6
+ from torch.nn.utils.rnn import pad_sequence
7
+ from scrl.model import labels_to_summary
8
+ from nltk import word_tokenize
9
+ from pprint import pprint
10
+
11
+
12
+ def sample_from_policy(
13
+ input_ids,
14
+ probs,
15
+ device="cuda",
16
+ force_diff=True,
17
+ diff_trials=1000,
18
+ ):
19
+ m = Categorical(probs)
20
+ argmax_labels = torch.argmax(probs, dim=2)
21
+ sample_labels = m.sample()
22
+
23
+ if force_diff:
24
+ for _ in range(diff_trials):
25
+ if (argmax_labels == sample_labels).all():
26
+ sample_labels = m.sample()
27
+ else:
28
+ break
29
+
30
+ sample_probs = m.log_prob(sample_labels)
31
+ return sample_probs, sample_labels
32
+
33
+
34
def best_of_k_samples(
    args,
    manager,
    tokenizer,
    reward_generator,
    input_ids,
    batch,
    probs,
    k_samples=50,
    return_all=False
):
    """Draw ``k_samples`` samples per batch item and keep the best-rewarded one.

    :param args: provides ``device`` for ``sample_from_policy``
    :param manager: accepted but not used here
    :param tokenizer: used to decode sampled labels into summaries
    :param reward_generator: callable ``(documents, summaries) -> (rewards, details)``
    :param input_ids: padded token-id batch
    :param batch: dict with the source texts under ``"document"``
    :param probs: per-token label probabilities
    :param k_samples: number of sampling rounds
    :param return_all: accepted but not used here
    :return: ``(probs, summaries, rewards, details, labels, sample_data)`` where
        the first five hold the best sample of every item and ``sample_data``
        holds everything from all rounds
    """
    batch_size = probs.size(0)

    sample_data = {
        "probs": [],
        "rewards": [],
        "summaries": [],
        "details": [],
        "labels": [],
    }
    for _ in range(k_samples):
        round_probs, round_labels = sample_from_policy(
            input_ids,
            probs,
            device=args.device
        )
        round_summaries = labels_to_summary(
            input_ids, round_labels, tokenizer
        )
        round_rewards, round_details = reward_generator(
            batch["document"], round_summaries
        )
        sample_data["probs"].append(round_probs)
        sample_data["summaries"].append(round_summaries)
        sample_data["rewards"].append(round_rewards)
        sample_data["details"].append(round_details)
        sample_data["labels"].append(round_labels)

    # For every item: index of the round with the highest reward
    # (stable sort, so the earliest round wins ties).
    best_rounds = []
    for item in range(batch_size):
        item_rewards = [sample_data["rewards"][r][item] for r in range(k_samples)]
        ranking = sorted(enumerate(item_rewards), key=lambda pair: pair[1], reverse=True)
        best_rounds.append(ranking[0][0])

    picks = list(enumerate(best_rounds))
    sample_probs = torch.stack([sample_data["probs"][r][item] for item, r in picks])
    sample_summaries = [sample_data["summaries"][r][item] for item, r in picks]
    sample_rewards = [sample_data["rewards"][r][item] for item, r in picks]
    sample_labels = torch.stack([sample_data["labels"][r][item] for item, r in picks])

    sample_details = []
    for item, r in picks:
        detail_keys = sorted(sample_data["details"][0].keys())
        item_details = defaultdict(list)
        for key in detail_keys:
            item_details[key].append(sample_data["details"][r][key][item])
        sample_details.append(item_details)

    return sample_probs, sample_summaries, sample_rewards, sample_details, sample_labels, sample_data
SCRL_new/scrl/training.py ADDED
@@ -0,0 +1,346 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import shutil
3
+ import logging
4
+ import random
5
+ import time
6
+ from pprint import pprint
7
+ from collections import defaultdict
8
+ from pathlib import Path
9
+
10
+ from scrl.rewards import load_rewards
11
+ from scrl.data import load_data_for_training
12
+ from scrl.config import load_config
13
+ from scrl.model import load_model, LinearTokenSelector, labels_to_summary
14
+ import scrl.utils as utils
15
+ import scrl.sampling as sampling
16
+
17
+ import numpy as np
18
+ import torch
19
+ from torch.nn.utils.rnn import pad_sequence
20
+ from transformers import AutoModel, AutoTokenizer
21
+ from sklearn import preprocessing
22
+
23
+ from nltk import word_tokenize
24
+
25
+
26
def print_if(x, do_print=True):
    """Print ``x`` unless ``do_print`` is falsy."""
    if not do_print:
        return
    print(x)
29
+
30
+
31
class TrainingManager:
    """
    Object for saving/loading model checkpoints and for tracking and saving
    metrics measured during training, e.g. loss, rewards.

    The following directory structure is built around one training run:

    dir/
        time.json
        checkpoints/
            latest-model-500/
                classifier.bin
                encoder.bin
            best-model-200/
                [...]
        series/
            loss.npy
            [...]
        totals/
            loss.npy
            [...]
    """
    def __init__(self, dir):
        """Create the run directory (and its sub-directories) if missing.

        :param dir: run directory, ``str`` or ``Path``; missing parents are created
        """
        self.step = 0
        self.total_seconds = 0
        self.start_time = None
        self.series = defaultdict(list)   # metric name -> value per step
        self.totals = defaultdict(float)  # metric name -> running sum
        self.dir = Path(dir)
        self.dir.mkdir(parents=True, exist_ok=True)
        for subdir_name in ("checkpoints", "series", "totals"):
            (self.dir / subdir_name).mkdir(exist_ok=True)

    def start_clock(self):
        """Start timing so that previously accumulated seconds keep counting."""
        self.start_time = time.time() - self.total_seconds

    def load(self):
        """Restore tracked metrics, elapsed time and the step counter from disk.

        Expects a ``latest-model-<step>`` checkpoint and ``time.json`` to exist;
        training resumes at ``<step> + 1``.
        """
        # load tracked data, e.g. loss, rewards etc.
        for p in (self.dir / "series").iterdir():
            k = p.name.split(".npy")[0]
            self.series[k] = list(utils.load_numpy(p))
        for p in (self.dir / "totals").iterdir():
            k = p.name.split(".npy")[0]
            self.totals[k] = utils.load_numpy(p)
        # read latest training step
        latest_model_dir = self.find_old_model("latest-model")
        self.total_seconds = utils.read_json(self.dir / "time.json")["total_seconds"]
        last_step = int(latest_model_dir.name.split("-")[-1])
        self.step = last_step + 1

    def update_metric(self, key, value):
        """Add ``value`` to the running total and the per-step series of ``key``."""
        self.totals[key] += value
        self.series[key].append(value)

    def mean_metric(self, key):
        """Running total of ``key`` divided by the number of steps so far (``step + 1``)."""
        return self.totals[key] / (self.step + 1)

    def save_latest_model(self, model, checkpoint_id):
        """Save ``model`` as the new ``latest-model-<checkpoint_id>`` checkpoint."""
        self.save_model(model, checkpoint_id, prefix="latest-model")

    def save_model(self, model, checkpoint_id, prefix):
        """Save ``model`` under ``checkpoints/<prefix>-<checkpoint_id>``.

        The previous checkpoint with the same prefix is removed only after
        the new one has been written.
        """
        old_model_dir = self.find_old_model(prefix)
        model_dir = self.dir / "checkpoints" / f"{prefix}-{checkpoint_id}"
        model_dir.mkdir()
        model.save(
            classifier_path=model_dir / "classifier.bin",
            encoder_path=model_dir / "encoder.bin"
        )
        if old_model_dir:
            shutil.rmtree(old_model_dir)

    def find_old_model(self, prefix):
        """Return a checkpoint directory whose name starts with ``prefix``, or ``None``.

        If several match, the last one yielded by ``iterdir()`` is returned.
        """
        model_path = None
        for p in (self.dir / "checkpoints").iterdir():
            if p.name.startswith(f"{prefix}"):
                model_path = p
        return model_path

    def is_empty(self):
        """True if no ``latest-model`` checkpoint has been saved yet."""
        return self.find_old_model("latest-model") is None

    def save_data(self):
        """Write all metric series/totals and the timing info to disk."""
        for k, v in self.series.items():
            utils.save_numpy(v, self.dir / "series" / f"{k}.npy")
        for k, v in self.totals.items():
            utils.save_numpy(v, self.dir / "totals" / f"{k}.npy")
        utils.write_json({
            "step": self.step,
            "total_seconds": self.total_seconds
        }, self.dir / "time.json")
122
+
123
+
124
def label_variance(probs):
    """Mean, over the batch, of the per-sequence variance of the label-0 probability.

    :param probs: tensor indexed as (batch, seq, 2)
    :return: Python float; ``nan`` for an empty batch
    """
    # batch, seq, 2 -> batch, seq: probability of label 0 at every position
    label0_probs = probs[:, :, 0]
    # One variance per sequence, then averaged over all sequences. The
    # original returned only the last sequence's variance.
    variances = torch.var(label0_probs, dim=1)
    return variances.mean().item()
132
+
133
+
134
def check_gradient(model):
    """Print which trainable parameters have a zero-sum or a missing gradient.

    Parameters with ``requires_grad`` unset are ignored. Note that a gradient
    is reported as "zero" when its elements *sum* to zero.
    """
    zero_names, none_names = [], []
    for name, param in list(model.named_parameters()):
        if not param.requires_grad:
            continue
        if param.grad is None:
            none_names.append(name)
        elif param.grad.sum().item() == 0:
            zero_names.append(name)
    print("zero-grad:", len(zero_names), zero_names)
    print("none-grad:", len(none_names), none_names)
    print()
149
+
150
+
151
def get_mean_max_prob(probs):
    """Average, over all positions, of the largest probability along dim 2."""
    top_probs, _ = probs.max(dim=2)
    return top_probs.mean().item()
153
+
154
+
155
def print_training_progress(args, manager, model, probs, argmax_summaries, sample_summaries, batch, argmax_details):
    """Print a progress report for the current training step.

    The report covers throughput, running averages and latest values of the
    tracked metrics, the current batch with its argmax and sampled summaries,
    the per-reward breakdown and a gradient check of ``model``.
    ``probs`` is accepted but not used here.
    """
    steps_done = manager.step + 1
    elapsed = manager.total_seconds

    running = [
        f"[step: {manager.step}] [duration(s): {round(elapsed)}]",
        f"[example/s: {(args.batch_size * steps_done) / elapsed:.3f}]",
        f"[s/step: {elapsed / steps_done:.3f}]",
        f"[avg-loss: {manager.mean_metric('loss')}]",
        f"[avg-max-prob: {manager.mean_metric('mean_max_prob'):.3f}]",
        f"[avg-a-reward: {manager.mean_metric('argmax_reward'):.3f}]",
        f"[avg-s-reward: {manager.mean_metric('sample_reward'):.3f}]",
        f"[avg-len: {manager.mean_metric('argmax_len'):.1f}]",
    ]
    for line in running:
        print(line)
    print()

    latest = [
        f"[a-reward: {manager.series['argmax_reward'][-1]:.3f}]",
        f"[s-reward: {manager.series['sample_reward'][-1]:.3f}]",
        f"[max-prob: {manager.series['mean_max_prob'][-1]:.3f}]",
    ]
    for line in latest:
        print(line)
    print()

    sections = [
        ("[sentences]", batch["document"]),
        ("\n[current policy summaries]", argmax_summaries),
        ("\n[sampled summaries]", sample_summaries),
    ]
    for title, texts in sections:
        print(title)
        print("\n".join(texts))
    print()

    print("Reward Breakdown:")
    pprint(argmax_details)
    print()
    check_gradient(model)
    print("="*100)
181
+
182
+
183
def setup_model(args):
    """Create the training manager and a fresh or resumed model.

    If ``args.fresh`` is set and the model directory exists, the user is asked
    whether to delete it. When the directory already holds a ``latest-model``
    checkpoint, metrics and step counter are restored and that checkpoint is
    loaded; otherwise a new ``LinearTokenSelector`` is built on
    ``args.encoder_model_id``. The run's config file is copied into the
    directory once.

    :return: ``(manager, model)``
    """
    # setup/load model manager object
    model_dir = Path(args.model_dir)
    if args.fresh and model_dir.exists():
        utils.ask_rmdir(model_dir)
    manager = TrainingManager(model_dir)
    resuming = not manager.is_empty()
    if resuming:
        manager.load()

    config_copy = model_dir / "config.json"
    if not config_copy.exists():
        shutil.copy(args.config, config_copy)

    # initialize new or load existing model
    if manager.step != 0:
        print("loading latest model from step", manager.step - 1)
        model = load_model(
            model_dir, prefix="latest", device=args.device
        )
        return manager, model

    encoder = AutoModel.from_pretrained(args.encoder_model_id)
    word_embeddings = encoder.state_dict()["embeddings.word_embeddings.weight"]
    model = LinearTokenSelector(encoder, word_embeddings.shape[1]).to(args.device)
    return manager, model
206
+
207
+
208
def setup_dataset_indices(args, step):
    """
    Build the generator of index batches that fixes the order in which the
    dataset is traversed (loaded from ``args.indices``, batched by
    ``args.batch_size``). When resuming at ``step > 0`` the generator is
    advanced past the batches that were already consumed.
    """
    all_indices = utils.load_numpy(args.indices)
    batches = utils.batchify(all_indices, args.batch_size)
    if step > 0:
        utils.move_generator(batches, step)
    return batches
220
+
221
+
222
def train(
    args,
    manager,
    model,
    tokenizer,
    reward_generator,
    dataset,
    dataset_indices,
    eval_func
):
    """Run the policy-gradient training loop.

    Per step: the argmax ("current policy") summaries and the sampled
    summaries of one batch are rewarded; the loss is
    ``(argmax_reward - sample_reward) * log-prob of the sample``.
    With ``args.sample_aggregation == "max"`` only the best of
    ``args.k_samples`` samples per item is used, otherwise the loss is
    averaged over all sampling rounds.

    :param args: run configuration (learning rate, device, limits, intervals, ...)
    :param manager: tracks step, time and metrics; saves checkpoints
    :param model: token selector returning per-token label log-probabilities
    :param tokenizer: tokenizer matching the model's encoder
    :param reward_generator: callable ``(documents, summaries) -> (rewards, details)``
    :param dataset: mapping with ``"train"`` (indexable by a batch of indices)
        and ``"validation"`` splits
    :param dataset_indices: iterable of index batches
    :param eval_func: called every ``args.eval_every`` steps
    """
    optimizer = torch.optim.AdamW(model.parameters(), lr=args.learning_rate)
    device = args.device
    model.train()
    manager.start_clock()

    for indices in dataset_indices:

        step = manager.step
        manager.total_seconds = time.time() - manager.start_time
        if args.max_train_steps and step >= args.max_train_steps + 1:
            break
        if args.max_train_seconds and manager.total_seconds >= args.max_train_seconds:
            break

        optimizer.zero_grad()

        batch = dataset["train"][indices]
        input_ids = pad_sequence(
            [torch.tensor(ids) for ids in batch["input_ids"]],
            batch_first=True
        ).to(device)

        # The model outputs log-probabilities; softmax turns them back into
        # probabilities.
        logits = model(input_ids)
        probs = torch.softmax(logits, dim=2)

        argmax_labels = torch.argmax(logits, dim=2).to(device)
        argmax_summaries = labels_to_summary(input_ids, argmax_labels, tokenizer)
        argmax_rewards, argmax_details = reward_generator(batch["document"], argmax_summaries)
        a_reward = np.mean(argmax_rewards)

        (sample_probs, sample_summaries, sample_rewards, sample_details,
        sample_labels, sample_data) = sampling.best_of_k_samples(
            args, manager, tokenizer, reward_generator,
            input_ids, batch, probs,
            k_samples=args.k_samples,
        )
        s_reward = np.mean(sample_rewards)

        if args.sample_aggregation == "max":
            loss = (a_reward - s_reward) * sample_probs.sum(1).mean()
        else:
            loss = 0.
            for sample_probs_i, s_rewards_i in zip(sample_data["probs"], sample_data["rewards"]):
                s_reward_i = np.mean(s_rewards_i)
                # Baseline is the batch's argmax reward, as in the "max" branch
                # (the original referenced an undefined `a_reward_i` here).
                loss_i = (a_reward - s_reward_i) * sample_probs_i.sum(1).mean()
                loss += loss_i
            loss /= len(sample_data["rewards"])

        if args.sample_aggregation == "mean" or a_reward != s_reward:
            # not updating model if no reward difference, in case of single sample
            loss.backward()
            optimizer.step()

        argmax_len = np.mean([len(word_tokenize(s)) for s in argmax_summaries])

        manager.update_metric("time", time.time())
        manager.update_metric("loss", loss.item())
        manager.update_metric("argmax_reward", a_reward)
        manager.update_metric("sample_reward", s_reward)
        manager.update_metric("sample_prob", sample_probs.detach().cpu().numpy().mean())
        manager.update_metric("mean_max_prob", get_mean_max_prob(probs))
        manager.update_metric("label_variance", label_variance(probs))
        manager.update_metric("argmax_len", argmax_len)
        for rname, rvalues in argmax_details.items():
            manager.update_metric(f"reward|{rname}", np.mean(rvalues))

        if args.eval_every is not None and (step > 0 and step % args.eval_every == 0):
            eval_func(
                args, manager, model, tokenizer, reward_generator,
                dataset["validation"]
            )
            # Switch back to training mode after evaluation.
            model.train()

        if args.save_every is not None and (step % args.save_every == 0):
            manager.save_latest_model(model, step)
            manager.save_data()

        if args.print_every is not None and (args.verbose and step % args.print_every == 0):
            print_training_progress(
                args, manager, model, probs,
                argmax_summaries, sample_summaries, batch,
                argmax_details
            )
        manager.step += 1
318
+
319
+
320
def setup_and_train(args, eval_func):
    """Build model, tokenizer, rewards and data from ``args`` and start training.

    :param args: run configuration; progress messages are printed only when
        ``args.verbose`` is set
    :param eval_func: evaluation callback handed through to ``train``
    """
    print_if("loading model", args.verbose)
    manager, model = setup_model(args)

    print_if("loading tokenizer", args.verbose)
    tokenizer = AutoTokenizer.from_pretrained(args.encoder_model_id)

    print_if("loading rewards", args.verbose)
    reward_generator = load_rewards(args)
    # print_if takes (message, flag): the names must be part of the message,
    # otherwise they are interpreted as the flag and never shown.
    print_if(f"rewards: {reward_generator.reward_names}", args.verbose)

    print_if("loading dataset", args.verbose)
    dataset = load_data_for_training(tokenizer, args.loader, args.dataset)

    # Resume the dataset traversal at the step the manager was restored to.
    dataset_indices = setup_dataset_indices(args, manager.step)

    train(
        args,
        manager,
        model,
        tokenizer,
        reward_generator,
        dataset,
        dataset_indices,
        eval_func
    )
SCRL_new/scrl/utils.py ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import shutil
3
+ import json
4
+ import gzip
5
+ import random
6
+ import torch
7
+
8
+
9
class TransformersTokenizerWrapper:
    """Turn a transformers tokenizer into a plain ``texts -> token strings`` callable."""

    def __init__(self, tokenizer):
        self.T = tokenizer

    def __call__(self, texts):
        """Tokenize ``texts`` into lists of stripped token strings.

        The first and last token of every sequence are dropped before the
        remaining tokens are converted to strings.
        """
        tokens_batch = []
        for token_ids in self.T(texts)["input_ids"]:
            pieces = [self.T._convert_id_to_token(token_id) for token_id in token_ids]
            inner = pieces[1:-1]
            tokens_batch.append(
                [self.T.convert_tokens_to_string(piece).strip() for piece in inner]
            )
        return tokens_batch
18
+
19
+
20
+
21
def set_random_seed(seed):
    """Seed Python's ``random``, NumPy and PyTorch with the same ``seed``."""
    for seed_fn in (torch.manual_seed, random.seed, np.random.seed):
        seed_fn(seed)
25
+
26
+
27
def ask_rmdir(dir):
    """Ask on the console whether to delete ``dir``; delete it only on an exact "yes"."""
    prompt = f"WARNING: Proceed with deleting this directory: {dir} ? (yes|no) "
    answer = input(prompt)
    if answer != "yes":
        return
    shutil.rmtree(dir)
33
+
34
+
35
def load_numpy(path):
    """Read a NumPy object from the binary file at ``path``."""
    with open(path, "rb") as handle:
        return np.load(handle)
39
+
40
+
41
def save_numpy(x, path):
    """Write ``x`` to ``path`` with ``np.save`` (the path is used exactly as given)."""
    with open(path, "wb") as handle:
        np.save(handle, x)
44
+
45
+
46
def batchify(items, batch_size):
    """Yield consecutive slices of ``items`` of length ``batch_size`` (the last may be shorter)."""
    yield from (
        items[start:start + batch_size]
        for start in range(0, len(items), batch_size)
    )
49
+
50
+
51
def move_generator(items, idx):
    """Advance the iterator ``items`` in place by consuming its first ``idx`` elements.

    Nothing is consumed for ``idx == 0``; the function returns ``None``.
    """
    if idx == 0:
        return
    consumed = 0
    for _ in items:
        # Stop right after the element at position idx - 1 has been taken.
        if consumed >= idx - 1:
            break
        consumed += 1
58
+
59
+
60
def read_json(path):
    """Parse the JSON file at ``path`` and return the resulting object."""
    with open(path) as handle:
        return json.load(handle)
64
+
65
+
66
def write_json(obj, path):
    """Serialise ``obj`` as JSON into the file at ``path`` (overwriting it)."""
    with open(path, 'w') as handle:
        json.dump(obj, handle)
69
+
70
+
71
def write_jsonl(items, path, mode):
    """Write ``items`` to ``path`` as JSON lines, one object per line.

    :param items: iterable of JSON-serialisable objects
    :param path: target file
    :param mode: file mode passed to ``open`` (e.g. ``"w"`` or ``"a"``)

    Nothing is written for an empty ``items`` (the original emitted a lone
    blank line, which ``read_jsonl`` cannot parse).
    """
    # Serialise everything first so a failing item leaves no partial output.
    lines = [json.dumps(x) + "\n" for x in items]
    with open(path, mode) as f:
        f.write("".join(lines))
75
+
76
+
77
def read_jsonl(path):
    """Lazily yield one parsed JSON object per line of the file at ``path``."""
    with open(path) as handle:
        yield from map(json.loads, handle)
81
+
82
+
83
def read_jsonl_gz(path):
    """Lazily yield one parsed JSON object per line of the gzip file at ``path``."""
    with gzip.open(path) as handle:
        yield from (json.loads(raw_line) for raw_line in handle)
SCRL_new/setup.py ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ from setuptools import setup
2
+
3
+
4
# Package metadata for the `scrl` library.
setup(
    name="scrl",
    # The version is given as a string, which is what setuptools documents
    # for this keyword (the original passed the float 0.1).
    version="0.1",
    packages=["scrl"]
)
abs_compressor.py ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List, Any
2
+ import tiktoken
3
+
4
+
5
class AbstractCompressor:
    """Common interface for prompt/sentence compressors.

    Concrete compressors subclass this and override :meth:`compress`;
    trainable ones also override :meth:`fit`.
    """

    # Placeholders: nothing in this class assigns them.
    base_model = None
    tokenizer = None
    # tiktoken encoding for "gpt-3.5-turbo-16k", resolved once when the class
    # body is executed.
    gpt_tokenizer = tiktoken.encoding_for_model("gpt-3.5-turbo-16k")

    def compress(self, original_prompt: str, ratio: float) -> dict:
        """
        Compress a prompt/sentence at the requested compression ratio.

        The returned dict is expected to contain:
            'compressed_prompt': the compressed prompt,
            'ratio': the compression ratio,
            'original_tokens': token count of the original prompt,
            'compressed_tokens': token count of the compressed prompt

        :param original_prompt: text to compress
        :param ratio: requested compression ratio
        :return: dict object with the keys listed above
        :raises NotImplementedError: always; subclasses must override
        """
        raise NotImplementedError

    def fit(self, datas: List[dict], valid_size: int) -> None:
        """
        Train the compressor's parameters (trainable methods only).

        :param datas: training records (the original note mentions LongBench)
        :param valid_size: size of the validation set
        :return: None
        :raises NotImplementedError: always; trainable subclasses must override
        """
        raise NotImplementedError

    def set_model(self, model: Any, **kwargs):
        """
        Specify a trained or a pre-trained model.

        The base implementation does nothing.

        :param model: model object to use
        :param kwargs: extra options, unused here
        :return: None
        """
        return None
kis.py ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import AutoTokenizer, AutoModelForCausalLM
2
+ import torch
3
+ from abs_compressor import AbstractCompressor
4
+
5
+
6
class KiSCompressor(AbstractCompressor):
    """Compressor that rewrites a prompt with a causal language model.

    The tokenizer and model are loaded from ``model_dir`` with
    ``AutoTokenizer`` / ``AutoModelForCausalLM``. Token counts in the result
    are measured with the inherited ``gpt_tokenizer``.
    """

    def __init__(self, DEVICE: str = 'cpu', model_dir: str = 'philippelaban/keep_it_simple'):
        """
        :param DEVICE: torch device string the model and inputs are moved to
                       (e.g. ``'cpu'``, ``'cuda'``, ``'cuda:0'``)
        :param model_dir: model id or local path passed to ``from_pretrained``
        """
        self.DEVICE = DEVICE
        # The malformed pad_token literal '<|endoftext|' (missing '>') is no
        # longer passed; the pad token is set from the EOS token just below.
        self.tokenizer = AutoTokenizer.from_pretrained(model_dir, padding_side='right')
        self.tokenizer.pad_token = self.tokenizer.eos_token
        self.kis_model = AutoModelForCausalLM.from_pretrained(model_dir)
        self.kis_model.to(self.DEVICE)

    def compress(self, original_prompt: str, ratio: float = 0.5, max_length: int = 150, num_beams: int = 4, do_sample: bool = True, num_return_sequences: int = 1, target_index: int = 0) -> dict:
        """
        Generate a rewritten version of ``original_prompt`` and report token counts.

        :param original_prompt: text to compress
        :param ratio: accepted for interface compatibility; not used by this method
        :param max_length: forwarded to ``generate``
                           (NOTE(review): presumably counts the prompt tokens as
                           well, so long prompts may yield no new text - confirm)
        :param num_beams: forwarded to ``generate``
        :param do_sample: forwarded to ``generate``
        :param num_return_sequences: forwarded to ``generate``
        :param target_index: index of the generated sequence to return
        :return: dict with 'compressed_prompt', 'ratio' (compressed / original
                 token count, 0.0 when the original has no tokens),
                 'original_tokens' and 'compressed_tokens'
        """
        original_tokens = len(self.gpt_tokenizer.encode(original_prompt))

        # The prompt ids are followed by the BOS id before generation.
        start_id = self.tokenizer.bos_token_id
        tokenized_paragraph = [(self.tokenizer.encode(text=original_prompt) + [start_id])]
        # Move inputs to the same device as the model for any device string,
        # not only the exact value 'cuda'.
        input_ids = torch.LongTensor(tokenized_paragraph).to(self.DEVICE)
        output_ids = self.kis_model.generate(input_ids, max_length=max_length, num_beams=num_beams, do_sample=do_sample,
                                             num_return_sequences=num_return_sequences,
                                             pad_token_id=self.tokenizer.eos_token_id)
        # Keep only the continuation: drop the echoed prompt positions.
        output_ids = output_ids[:, input_ids.shape[1]:]
        output = self.tokenizer.batch_decode(output_ids)
        output = [o.replace(self.tokenizer.eos_token, "") for o in output]
        compressed_prompt = output[target_index]

        compressed_tokens = len(self.gpt_tokenizer.encode(compressed_prompt))

        result = {
            'compressed_prompt': compressed_prompt,
            # Guard the division: an empty prompt has zero tokens.
            'ratio': compressed_tokens / original_tokens if original_tokens else 0.0,
            'original_tokens': original_tokens,
            'compressed_tokens': compressed_tokens,
        }

        return result
46
+
47
+
models/gigaword-L8/checkpoints/best_val_reward-7700/classifier.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a6d9c699efc675ab1515860afcc786d4bf78c3e719c38625f9293ce17ea3828e
3
+ size 6891
models/gigaword-L8/checkpoints/best_val_reward-7700/encoder.bin/config.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:db97dc0f0b3d356c9d972f111ef0cdaf260566d4c4b01e6157a81403201aa081
3
+ size 721
models/gigaword-L8/checkpoints/best_val_reward-7700/encoder.bin/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2920e94cd3916078dfbfc101753f8cf739dd50677a3e8e44f0b6dc4b297287e8
3
+ size 328517361
models/gigaword-L8/config.json ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "loader": "loaders/gigaword.py",
3
+ "dataset": "data/train-data/gigaword",
4
+ "indices": "data/train-data/gigaword/indices.npy",
5
+ "model_dir": "data/models/gigaword-L8-nocr",
6
+ "verbose": true,
7
+ "print_every": 1,
8
+ "eval_every": 50,
9
+ "save_every": 50,
10
+ "max_val_steps": 512,
11
+ "max_train_seconds": null,
12
+ "max_train_steps": 8000,
13
+ "batch_size": 4,
14
+ "learning_rate": 1e-05,
15
+ "k_samples": 100,
16
+ "sample_aggregation": "max",
17
+ "loss": "pgb",
18
+ "encoder_model_id": "distilroberta-base",
19
+ "rewards": {
20
+ "Fluency": {
21
+ "weight": 1,
22
+ "type": "masked",
23
+ "model_id": "distilroberta-base",
24
+ "max_score": 40.0,
25
+ "norm": "max"
26
+ },
27
+ "SentenceMeanSimilarity": {
28
+ "weight": 1,
29
+ "model_id": "all-distilroberta-v1"
30
+ },
31
+ "GaussianLength": {
32
+ "weight": 1,
33
+ "mean": 8,
34
+ "std": 4
35
+ }
36
+ }
37
+ }
models/gigaword-L8/series/argmax_len.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e793d8b7450525766dd39d6abab95f06f52384767a017bb3e00a2f9e20cbe4d7
3
+ size 63736
models/gigaword-L8/series/argmax_reward.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:29e54e1a54f207de1ebe3e9ca910933e8d5be14a3adbc1bf442c49e368f42502
3
+ size 63736
models/gigaword-L8/series/label_variance.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9492e30aa933b59881a473e90cd8aec416f7f161de6670a3ef32e2d7311db848
3
+ size 63736
models/gigaword-L8/series/loss.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:71ea59738950cf170267c59786ab84e5a0cb5636f7f48c7083542fa5108eb65e
3
+ size 63736
models/gigaword-L8/series/mean_max_prob.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d91a94440319efb392d777bcd22a7db47ebd81b45460fc2d5607ef006d244e0d
3
+ size 63736
models/gigaword-L8/series/reward_Fluency.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4d5db6aded20b992d3772f7ed1122307120c3ca1f0931c21e4f63f89948bfb2e
3
+ size 63736
models/gigaword-L8/series/reward_GaussianLength.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2ffc2e2b4c58fc051b99cffc00ba9b6cce022b1b2b8d9a6b46896c77788721b5
3
+ size 63736
models/gigaword-L8/series/reward_SentenceMeanSimilarity.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ab3c0c9fa4e9823bbd800034b731c4ec3f1cffa544b52d307cc6bf50ff9fd40c
3
+ size 63736
models/gigaword-L8/series/sample_prob.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dc52e464317969e7c63f7761a0736817a45c01c79417af0726f793a5b11ac5a2
3
+ size 31932