krotima1 committed on
Commit
713b9ba
1 Parent(s): b0c6b28

feat: add summarizer

Browse files
Files changed (1) hide show
  1. MultilingualSummarizer.ipynb +243 -0
MultilingualSummarizer.ipynb ADDED
@@ -0,0 +1,243 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "metadata": {},
6
+ "source": [
7
+ "# Multilingual Summarizer\n",
8
+ "- Feel free to play with models\n",
9
+ "- GPUs are recommended for faster summarization\n",
10
+ "- First, include the necessary .py files or clone the git repo"
11
+ ]
12
+ },
13
+ {
14
+ "cell_type": "code",
15
+ "execution_count": null,
16
+ "metadata": {},
17
+ "outputs": [],
18
+ "source": []
19
+ },
20
+ {
21
+ "cell_type": "code",
22
+ "execution_count": null,
23
+ "metadata": {},
24
+ "outputs": [],
25
+ "source": [
26
+ "import torch as pt\n",
27
+ "import numpy as np\n",
28
+ "import pandas as pd\n",
29
+ "\n",
30
+ "from collections import OrderedDict\n",
31
+ "\n",
32
+ "from transformers import AutoModelForSeq2SeqLM\n",
33
+ "from transformers import AutoTokenizer\n",
34
+ "import datasets\n",
35
+ "\n",
36
+ "import re\n",
37
+ "import logging\n",
38
+ "logging.basicConfig(level=logging.INFO, format='%(asctime)s | %(name)s | %(levelname)s | %(message)s')\n",
39
+ "\n",
40
+ "# These files need to be included\n",
41
+ "from summarization.tokenizer import DatasetTokenizer\n",
42
+ "from summarization.summarizer import Summarizer\n",
43
+ "from summarization.summarization_metrics import MetricsComputation\n",
44
+ "\n",
45
+ "\n",
46
+ "\n",
47
+ "## READ THIS (To run):\n",
48
+ "# clone this git repo: https://gitlab.fel.cvut.cz/factchecking/experimental-marian-krotil\n",
49
+ "# insert this .ipynb file into the -/tree/main/projects/python/git_krotima1/source directory\n",
50
+ "# --> so that the included files can be imported (or copy them directly into this notebook)\n",
51
+ "\n",
52
+ "class MultiSummarizer:\n",
53
+ " \n",
54
+ " ## Constructor\n",
55
+ " # input: model_name : string : Huggingface checkpoint (ctu-aic/m2m100-418M-multilingual-summarization-multilarge-cs, ctu-aic/mt5-base-multilingual-summarization-multilarge-cs, ctu-aic/mbart25-multilingual-summarization-multilarge-cs)\n",
56
+ " # language : string : cs, en, de, fr, es, tr, ru, zh\n",
57
+ " # inference_cfg : dict : parameters of generation method\n",
58
+ " #\n",
59
+ " # \n",
60
+ " #\n",
61
+ " def __init__(self, model_name, language, inference_cfg=None, **kwargs):\n",
62
+ " logging.info(f\"Initializing multilingual summarizer {model_name}\")\n",
63
+ " self.name = model_name.split('/')[-1]\n",
64
+ " self.model = AutoModelForSeq2SeqLM.from_pretrained(model_name)\n",
65
+ " self.dstTokenizer = DatasetTokenizer(model_name.split('/')[-1], \"../output/trainModel/multilarge/htext2abstract/ml_FR/checkpoint-1078000\", language)\n",
66
+ " self.tokenizer = self.dstTokenizer.get_tokenizer()\n",
67
+ " self.langid = self.dstTokenizer.get_langid()\n",
68
+ " self.lang_token = self.dstTokenizer.get_lang_token()\n",
69
+ " self.inference_cfg = inference_cfg\n",
70
+ " self.enc_max_len = 512\n",
71
+ " self.language = language\n",
72
+ " #cuda/cpu device\n",
73
+ " if pt.cuda.is_available():\n",
74
+ " self.model.cuda()\n",
75
+ "\n",
76
+ "\n",
77
+ " ## Function __call__\n",
78
+ " # input: texts - (list of strings, string, dataset) - texts in selected language to summarize\n",
79
+ " # golds - (None, list of strings, string) - target summary - if provided the ROUGE scores are computed\n",
80
+ " # inference_cfg - dictionary with configuration of generation method\n",
81
+ " # text_column - if texts is a dataset, the name of the column to summarize\n",
82
+ " # \n",
83
+ " # output: tuple (list of summaries, - {} empty dict if no golds)\n",
84
+ " # - dict of ROUGE scores if golds are given)\n",
85
+ " # functionality:\n",
86
+ " # - converts input to the Huggingface datasets if not provided\n",
87
+ " # - tokenize & summarize input texts\n",
88
+ " # - compute scores based on passed arguments\n",
89
+ " # \n",
90
+ " def __call__(self, texts, golds=None, inference_cfg=None, text_column=\"\", **kwargs):\n",
91
+ " \n",
92
+ " #check input\n",
93
+ " golds = [golds] if type(golds) == str else golds\n",
94
+ " assert golds is None or type(golds) == list and type(golds[0]) == str, \"Golds: Expected type: None, string or list of strings\"\n",
95
+ " \n",
96
+ " if type(texts) != datasets.Dataset:\n",
97
+ " texts = [texts] if type(texts) == str else texts\n",
98
+ " assert type(texts) == list and type(texts[0]) == str, \"Texts: Expected type: dataset, string or list of strings\"\n",
99
+ " \n",
100
+ " \n",
101
+ " self.inference_cfg = inference_cfg if inference_cfg is not None else self.inference_cfg\n",
102
+ " logging.info(f\"Summarizing data with the generation config: {self.inference_cfg}\")\n",
103
+ " \n",
104
+ " #get hgft dataset\n",
105
+ " dst = datasets.DatasetDict()\n",
106
+ " if type(texts) != datasets.Dataset:\n",
107
+ " df = pd.DataFrame({'text': texts})\n",
108
+ " dst[\"test\"] = datasets.Dataset.from_pandas(df)\n",
109
+ " else:\n",
110
+ " dst[\"test\"] = texts\n",
111
+ " \n",
112
+ " \n",
113
+ " #Tokenize input texts\n",
114
+ " text_column = 'text' if text_column == \"\" else text_column\n",
115
+ " cfg = {\"text_column\": text_column}\n",
116
+ " \n",
117
+ " tok_dst = self.dstTokenizer.tokenize(dst, encoder_input_ids=self.enc_max_len, decoder_input_ids=None,**cfg)[\"test\"]\n",
118
+ " \n",
119
+ " #Init Summarizer\n",
120
+ " summarizer = Summarizer(model = self.model, tokenizer = self.tokenizer,lcode=self.langid, batch_size = 8)\n",
121
+ " \n",
122
+ " #Summarize texts\n",
123
+ " filter_fc = self._filter_final_summaries if self.name.startswith('mt5') else None\n",
124
+ " summarizer.summarize_dst(tok_dst, filter_fc_batch = filter_fc,**self.inference_cfg)\n",
125
+ " \n",
126
+ " \n",
127
+ " scores = {}\n",
128
+ " if golds is not None:\n",
129
+ " #compute scores if gold texts are provided\n",
130
+ " metrics = MetricsComputation(self.language)\n",
131
+ " scores = metrics.compute_scores(gold = golds, summary=summarizer.summarized_dst['summary'])\n",
132
+ " \n",
133
+ " \n",
134
+ " return (summarizer.summarized_dst['summary'], scores)\n",
135
+ " \n",
136
+ " def _filter_final_summaries(self, batch, **kwargs):\n",
137
+ " batch[\"summary\"] = [ re.sub(self.lang_token, '', tmp) for tmp in batch[\"summary\"]]\n",
138
+ " return batch"
139
+ ]
140
+ },
141
+ {
142
+ "cell_type": "code",
143
+ "execution_count": null,
144
+ "metadata": {},
145
+ "outputs": [],
146
+ "source": [
147
+ "## Configuration of summarization pipeline\n",
148
+ "#\n",
149
+ "def summ_config():\n",
150
+ " cfg = OrderedDict([\n",
151
+ " \n",
152
+ " ## summarization model - checkpoint\n",
153
+ " # ctu-aic/m2m100-418M-multilingual-summarization-multilarge-cs\n",
154
+ " # ctu-aic/mt5-base-multilingual-summarization-multilarge-cs\n",
155
+ " # ctu-aic/mbart25-multilingual-summarization-multilarge-cs\n",
156
+ " (\"model_name\", \"ctu-aic/mbart25-multilingual-summarization-multilarge-cs\"),\n",
157
+ " \n",
158
+ " ## language of summarization task\n",
159
+ " # language : string : cs, en, de, fr, es, tr, ru, zh\n",
160
+ " (\"language\", \"en\"), \n",
161
+ " \n",
162
+ " ## generation method parameters in dictionary\n",
163
+ " #\n",
164
+ " (\"inference_cfg\", OrderedDict([\n",
165
+ " (\"num_beams\", 4),\n",
166
+ " (\"top_k\", 40),\n",
167
+ " (\"top_p\", 0.92),\n",
168
+ " (\"do_sample\", True),\n",
169
+ " (\"temperature\", 0.95),\n",
170
+ " (\"repetition_penalty\", 1.23),\n",
171
+ " (\"no_repeat_ngram_size\", None),\n",
172
+ " (\"early_stopping\", True),\n",
173
+ " (\"max_length\", 128),\n",
174
+ " (\"min_length\", 10),\n",
175
+ " ])),\n",
176
+ " #texts to summarize values = (list of strings, string, dataset)\n",
177
+ " (\"texts\",\n",
178
+ " [\n",
179
+ " '(CNN)The presence of a harmful pesticide at a luxury villa in the U.S. Virgin Islands may have resulted in the illness of a Delaware family, the U.S. Environmental Protection Agency said Friday. Paramedics were called last week to a rented villa at the Sirenusa resort in St. John after the family of four fell ill. They had rented the villa from March 14 to March 22, and were later hospitalized. The illness was reported to the EPA on March 20. \"Our preliminary results do show that there was a presence of methyl bromide in the unit where the family was staying,\" said Elias Rodriguez, an EPA spokesman. Exposure to methyl bromide can result in serious health effects, including central nervous system and respiratory system damage, according to the EPA. The use of the pesticide is restricted in the United States because of its acute toxicity. It\\'s not allowed to be used indoors. Only certified professionals are permitted to use it in certain agricultural settings. \"It\\'s an ongoing investigation; we\\'re still on the island doing our assessment,\" Rodriguez said. \"We have been doing different types of air sampling and wipe sampling.\" Final test results were expected next week. The EPA is working with local government agencies to investigate whether the family was made ill after a fumigation at the resort on March 18 and whether any environmental regulations or laws were violated. \"Pesticides can be very toxic, and it is critically important that they be applied properly and used only as approved by EPA,\" said Judith A. Enck, a regional administrator for the EPA. \"The EPA is actively working to determine how this happened and will make sure steps are taken to prevent this from happening to others at these vacation apartments or elsewhere.\" Depending on the season, the luxury villa where the family stayed rents between $550 and $1,200 per night. 
Sea Glass Vacations, which acts as a rental agent for several units at Sirenusa, said that the unit directly below the one where the family stayed was recently treated for pests, but that their unit was not treated. The company said it licensed an outside company, Terminix, for the pest control services. \"Sea Glass Vacations does not treat the units it manages for pests but instead relies on licensed professionals for pest control services,\" the company said in a statement. The U.S. Department of Justice has initiated a criminal investigation into the matter, according to a U.S. Securities and Exchange Commission filing made Monday by ServiceMaster Global Holdings, the parent company of Terminix. In an email to CNN, a spokesman for Terminix wrote that the company is \"committed to performing all work ... in a manner that is safe for our customers, employees, the public and the environment\" and is \"looking into this matter internally, and cooperating with authorities.\" \"We\\'re thinking about the family, and we join the community in wishing them a speedy recovery,\" Terminix wrote. James Maron, an attorney who has been a spokesman for the family, has not responded to requests for comment. The SEC filing described the injuries to the family members as \"serious.\"'\n",
180
+ " ,\n",
181
+ "\n",
182
+ " '(CNN)The Palestinian Authority officially became the 123rd member of the International Criminal Court on Wednesday, a step that gives the court jurisdiction over alleged crimes in Palestinian territories. The formal accession was marked with a ceremony at The Hague, in the Netherlands, where the court is based. The Palestinians signed the ICC\\'s founding Rome Statute in January, when they also accepted its jurisdiction over alleged crimes committed \"in the occupied Palestinian territory, including East Jerusalem, since June 13, 2014.\" Later that month, the ICC opened a preliminary examination into the situation in Palestinian territories, paving the way for possible war crimes investigations against Israelis. As members of the court, Palestinians may be subject to counter-charges as well. Israel and the United States, neither of which is an ICC member, opposed the Palestinians\\' efforts to join the body. But Palestinian Foreign Minister Riad al-Malki, speaking at Wednesday\\'s ceremony, said it was a move toward greater justice. \"As Palestine formally becomes a State Party to the Rome Statute today, the world is also a step closer to ending a long era of impunity and injustice,\" he said, according to an ICC news release. \"Indeed, today brings us closer to our shared goals of justice and peace.\" Judge Kuniko Ozaki, a vice president of the ICC, said acceding to the treaty was just the first step for the Palestinians. \"As the Rome Statute today enters into force for the State of Palestine, Palestine acquires all the rights as well as responsibilities that come with being a State Party to the Statute. These are substantive commitments, which cannot be taken lightly,\" she said. Rights group Human Rights Watch welcomed the development. 
\"Governments seeking to penalize Palestine for joining the ICC should immediately end their pressure, and countries that support universal acceptance of the court\\'s treaty should speak out to welcome its membership,\" said Balkees Jarrah, international justice counsel for the group. \"What\\'s objectionable is the attempts to undermine international justice, not Palestine\\'s decision to join a treaty to which over 100 countries around the world are members.\" In January, when the preliminary ICC examination was opened, Israeli Prime Minister Benjamin Netanyahu described it as an outrage, saying the court was overstepping its boundaries. The United States also said it \"strongly\" disagreed with the court\\'s decision. \"As we have said repeatedly, we do not believe that Palestine is a state and therefore we do not believe that it is eligible to join the ICC,\" the State Department said in a statement. It urged the warring sides to resolve their differences through direct negotiations. \"We will continue to oppose actions against Israel at the ICC as counterproductive to the cause of peace,\" it said. But the ICC begs to differ with the definition of a state for its purposes and refers to the territories as \"Palestine.\" While a preliminary examination is not a formal investigation, it allows the court to review evidence and determine whether to investigate suspects on both sides. Prosecutor Fatou Bensouda said her office would \"conduct its analysis in full independence and impartiality.\" The war between Israel and Hamas militants in Gaza last summer left more than 2,000 people dead. The inquiry will include alleged war crimes committed since June. The International Criminal Court was set up in 2002 to prosecute genocide, crimes against humanity and war crimes. CNN\\'s Vasco Cotovio, Kareem Khadder and Faith Karimi contributed to this report.'\n",
183
+ " #\"Tady je deset dní držel! První pohled do šíleného úkrytu unesených dětí. Interiér vůbec neodpovídá běžným garážím – stěny jsou obložené izolací, na zemi dlažba a koberec, dvě matrace. „Byl tam nedodělanej sprchovej kout, měl tam vařák na jídlo, stůl, dvě nový matrace na sobě, deky, polštář, ledničku a záchod,“ popsal serveru iDnes.cz Daniel. Garáž měla troje dveře, jedny na klíč, jedny na číselný kód. Protihluková izolace, kterou byl objekt doslova obalený, byla patrná zejména na střeše a zadní stěně zvenku. Zbývá si položit jednu otázku – jak dlouho únosce celou věc plánoval?Už večer před nalezením dvou unesených dětí byla policie v areálu garáží, kde byli vězněni šestnáctiletý chlapec a o tři roky mladší dívka z Litoměřicka. „Nic nenasvědčovalo tomu, že by se tam nacházeli,“ řekl novinářům náměstek ředitele krajské policie Zbyněk Dvořák. Děti nalezli v garáži na ústeckém Střekově minulou středu dopoledne rodiče. Z únosu dětí je obviněn šestatřicetiletý recidivista Zdeněk H. Děti držel od 28. srpna do 7. září. Policie sestavila speciální tým, který bude prověřovat mimo jiné i to, zda se nedopustil dalších skutků.Policie podle Dvořáka při pátrání prověřovala desítky oznámení, podle kterých měly být děti viděny v různých částech Ústeckého kraje, Česka i v zahraničí. Klíčovou roli sehrály v případu dopisy, které dostaly rodiny 6. září. Kriminalisté se primárně zabývali Doksy a okolí, odkud byl dopis odeslaný. „Navíc chlapec měl vztah k Doksům, byli tam na dovolené. Některé indicie šly tam, že je pravděpodobné, že jsou v té lokalitě. I po téhle stopě se 6. a zejména 7. září ráno šlo,“ vysvětlila vedoucí litoměřické policie Helena Pšeničková.Obsah dopisů zkoumali rodiny i písmoznalci. Řada slov byla opakovaně přeškrtaná a přepsaná. „Slova dávala význam střecha, střela, Střekov, současně probíhalo pátrání na Střeleckém ostrově. Dochází k tomu, že večer dostáváme informaci od rodiny, že relevantní má být opravdu Střekov. 
Kolem desáté hodiny večer probíhá prověrka na místě v areálu těch garáží. Děti nebyly nalezeny. Nic nenasvědčuje tomu, že se tam v tu dobu nacházejí,“ řekl Dvořák.Druhý den ráno se ke garážím vypravili rodiče chlapce. „Informují nás o tom, že z jedné z těch garáží je slyšet hudba,“ doplnil Dvořák. Po prověření zvukových signálů policie do garáže vnikla.Muži hrozí podle stávající právní kvalifikace až osm let vězení. Čelí obvinění z vydírání, zbavení osobní svobody, ohrožování výchovy mládeže a šíření pornografie.Kriminalisté potvrdili, že podezřelým je majitel garáže na Střekově. Je jím šestatřicetiletý recidivista, který byl už před lety odsouzen za pokusy o únos malých dívek na Českolipsku. Soud ho v sobotu poslal do vazby.\",\n",
184
+ " ]\n",
185
+ " ),\n",
186
+ " #Target summaries values = (list of strings, string, None)\n",
187
+ " ('golds',\n",
188
+ " [\n",
189
+ " 'Delaware family becomes ill at the Sirenusa resort in the U.S. Virgin Islands. Preliminary EPA results find methyl bromide was present in unit where family stayed. U.S. Justice Department has initiated a criminal investigation into the matter.'\n",
190
+ " ,\n",
191
+ " 'Membership gives the ICC jurisdiction over alleged crimes committed in Palestinian territories since last June. Israel and the United States opposed the move, which could open the door to war crimes investigations against Israelis.'\n",
192
+ " ]),\n",
193
+ " ])\n",
194
+ " return cfg\n",
195
+ "\n",
196
+ "cfg = summ_config()\n",
197
+ "msummarizer = MultiSummarizer(**cfg)\n",
198
+ "ret = msummarizer(**cfg)\n"
199
+ ]
200
+ },
201
+ {
202
+ "cell_type": "code",
203
+ "execution_count": null,
204
+ "metadata": {},
205
+ "outputs": [],
206
+ "source": [
207
+ "ret = msummarizer(**cfg)\n",
208
+ "print(ret)"
209
+ ]
210
+ },
211
+ {
212
+ "cell_type": "code",
213
+ "execution_count": null,
214
+ "metadata": {},
215
+ "outputs": [],
216
+ "source": [
217
+ "ret[1][:]"
218
+ ]
219
+ }
220
+ ],
221
+ "metadata": {
222
+ "kernelspec": {
223
+ "display_name": "Python 3",
224
+ "language": "python",
225
+ "name": "python3"
226
+ },
227
+ "language_info": {
228
+ "codemirror_mode": {
229
+ "name": "ipython",
230
+ "version": 3
231
+ },
232
+ "file_extension": ".py",
233
+ "mimetype": "text/x-python",
234
+ "name": "python",
235
+ "nbconvert_exporter": "python",
236
+ "pygments_lexer": "ipython3",
237
+ "version": "3.6.8"
238
+ },
239
+ "orig_nbformat": 4
240
+ },
241
+ "nbformat": 4,
242
+ "nbformat_minor": 2
243
+ }