arfat-xyz committed on
Commit
c2156fe
1 Parent(s): 3c22569

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +1251 -0
app.py ADDED
@@ -0,0 +1,1251 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "metadata": {
7
+ "colab": {
8
+ "background_save": true
9
+ },
10
+ "id": "8JqpxyBueqTH",
11
+ "outputId": "6c2c3908-9067-496c-ad64-74f21895232a"
12
+ },
13
+ "outputs": [
14
+ {
15
+ "name": "stdout",
16
+ "output_type": "stream",
17
+ "text": [
18
+ " Building wheel for flashtext (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
19
+ "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n",
20
+ "Collecting git+https://github.com/boudinfl/pke.git\n",
21
+ " Cloning https://github.com/boudinfl/pke.git to /tmp/pip-req-build-s0vst_dk\n",
22
+ " Running command git clone -q https://github.com/boudinfl/pke.git /tmp/pip-req-build-s0vst_dk\n",
23
+ "Requirement already satisfied: nltk in /usr/local/lib/python3.7/dist-packages (from pke==2.0.0) (3.7)\n",
24
+ "Requirement already satisfied: networkx in /usr/local/lib/python3.7/dist-packages (from pke==2.0.0) (2.6.3)\n",
25
+ "Requirement already satisfied: numpy in /usr/local/lib/python3.7/dist-packages (from pke==2.0.0) (1.21.6)\n",
26
+ "Requirement already satisfied: scipy in /usr/local/lib/python3.7/dist-packages (from pke==2.0.0) (1.7.3)\n",
27
+ "Collecting sklearn\n",
28
+ " Downloading sklearn-0.0.post1.tar.gz (3.6 kB)\n",
29
+ "Collecting unidecode\n",
30
+ " Downloading Unidecode-1.3.6-py3-none-any.whl (235 kB)\n",
31
+ "\u001b[K |████████████████████████████████| 235 kB 6.2 MB/s \n",
32
+ "\u001b[?25hRequirement already satisfied: future in /usr/local/lib/python3.7/dist-packages (from pke==2.0.0) (0.16.0)\n",
33
+ "Requirement already satisfied: joblib in /usr/local/lib/python3.7/dist-packages (from pke==2.0.0) (1.2.0)\n",
34
+ "Requirement already satisfied: spacy>=3.2.3 in /usr/local/lib/python3.7/dist-packages (from pke==2.0.0) (3.4.3)\n",
35
+ "Requirement already satisfied: cymem<2.1.0,>=2.0.2 in /usr/local/lib/python3.7/dist-packages (from spacy>=3.2.3->pke==2.0.0) (2.0.7)\n",
36
+ "Requirement already satisfied: typing-extensions<4.2.0,>=3.7.4 in /usr/local/lib/python3.7/dist-packages (from spacy>=3.2.3->pke==2.0.0) (4.1.1)\n",
37
+ "Requirement already satisfied: spacy-loggers<2.0.0,>=1.0.0 in /usr/local/lib/python3.7/dist-packages (from spacy>=3.2.3->pke==2.0.0) (1.0.3)\n",
38
+ "Requirement already satisfied: setuptools in /usr/local/lib/python3.7/dist-packages (from spacy>=3.2.3->pke==2.0.0) (57.4.0)\n",
39
+ "Requirement already satisfied: spacy-legacy<3.1.0,>=3.0.10 in /usr/local/lib/python3.7/dist-packages (from spacy>=3.2.3->pke==2.0.0) (3.0.10)\n",
40
+ "Requirement already satisfied: wasabi<1.1.0,>=0.9.1 in /usr/local/lib/python3.7/dist-packages (from spacy>=3.2.3->pke==2.0.0) (0.10.1)\n",
41
+ "Requirement already satisfied: typer<0.8.0,>=0.3.0 in /usr/local/lib/python3.7/dist-packages (from spacy>=3.2.3->pke==2.0.0) (0.7.0)\n",
42
+ "Requirement already satisfied: thinc<8.2.0,>=8.1.0 in /usr/local/lib/python3.7/dist-packages (from spacy>=3.2.3->pke==2.0.0) (8.1.5)\n",
43
+ "Requirement already satisfied: srsly<3.0.0,>=2.4.3 in /usr/local/lib/python3.7/dist-packages (from spacy>=3.2.3->pke==2.0.0) (2.4.5)\n",
44
+ "Requirement already satisfied: preshed<3.1.0,>=3.0.2 in /usr/local/lib/python3.7/dist-packages (from spacy>=3.2.3->pke==2.0.0) (3.0.8)\n",
45
+ "Requirement already satisfied: tqdm<5.0.0,>=4.38.0 in /usr/local/lib/python3.7/dist-packages (from spacy>=3.2.3->pke==2.0.0) (4.64.1)\n",
46
+ "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.7/dist-packages (from spacy>=3.2.3->pke==2.0.0) (21.3)\n",
47
+ "Requirement already satisfied: murmurhash<1.1.0,>=0.28.0 in /usr/local/lib/python3.7/dist-packages (from spacy>=3.2.3->pke==2.0.0) (1.0.9)\n",
48
+ "Requirement already satisfied: pathy>=0.3.5 in /usr/local/lib/python3.7/dist-packages (from spacy>=3.2.3->pke==2.0.0) (0.8.1)\n",
49
+ "Requirement already satisfied: pydantic!=1.8,!=1.8.1,<1.11.0,>=1.7.4 in /usr/local/lib/python3.7/dist-packages (from spacy>=3.2.3->pke==2.0.0) (1.10.2)\n",
50
+ "Requirement already satisfied: requests<3.0.0,>=2.13.0 in /usr/local/lib/python3.7/dist-packages (from spacy>=3.2.3->pke==2.0.0) (2.23.0)\n",
51
+ "Requirement already satisfied: langcodes<4.0.0,>=3.2.0 in /usr/local/lib/python3.7/dist-packages (from spacy>=3.2.3->pke==2.0.0) (3.3.0)\n",
52
+ "Requirement already satisfied: catalogue<2.1.0,>=2.0.6 in /usr/local/lib/python3.7/dist-packages (from spacy>=3.2.3->pke==2.0.0) (2.0.8)\n",
53
+ "Requirement already satisfied: jinja2 in /usr/local/lib/python3.7/dist-packages (from spacy>=3.2.3->pke==2.0.0) (2.11.3)\n",
54
+ "Requirement already satisfied: zipp>=0.5 in /usr/local/lib/python3.7/dist-packages (from catalogue<2.1.0,>=2.0.6->spacy>=3.2.3->pke==2.0.0) (3.10.0)\n",
55
+ "Requirement already satisfied: pyparsing!=3.0.5,>=2.0.2 in /usr/local/lib/python3.7/dist-packages (from packaging>=20.0->spacy>=3.2.3->pke==2.0.0) (3.0.9)\n",
56
+ "Requirement already satisfied: smart-open<6.0.0,>=5.2.1 in /usr/local/lib/python3.7/dist-packages (from pathy>=0.3.5->spacy>=3.2.3->pke==2.0.0) (5.2.1)\n",
57
+ "Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.7/dist-packages (from requests<3.0.0,>=2.13.0->spacy>=3.2.3->pke==2.0.0) (2.10)\n",
58
+ "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.7/dist-packages (from requests<3.0.0,>=2.13.0->spacy>=3.2.3->pke==2.0.0) (2022.9.24)\n",
59
+ "Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.7/dist-packages (from requests<3.0.0,>=2.13.0->spacy>=3.2.3->pke==2.0.0) (3.0.4)\n",
60
+ "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.7/dist-packages (from requests<3.0.0,>=2.13.0->spacy>=3.2.3->pke==2.0.0) (1.24.3)\n",
61
+ "Requirement already satisfied: confection<1.0.0,>=0.0.1 in /usr/local/lib/python3.7/dist-packages (from thinc<8.2.0,>=8.1.0->spacy>=3.2.3->pke==2.0.0) (0.0.3)\n",
62
+ "Requirement already satisfied: blis<0.8.0,>=0.7.8 in /usr/local/lib/python3.7/dist-packages (from thinc<8.2.0,>=8.1.0->spacy>=3.2.3->pke==2.0.0) (0.7.9)\n",
63
+ "Requirement already satisfied: click<9.0.0,>=7.1.1 in /usr/local/lib/python3.7/dist-packages (from typer<0.8.0,>=0.3.0->spacy>=3.2.3->pke==2.0.0) (7.1.2)\n",
64
+ "Requirement already satisfied: MarkupSafe>=0.23 in /usr/local/lib/python3.7/dist-packages (from jinja2->spacy>=3.2.3->pke==2.0.0) (2.0.1)\n",
65
+ "Requirement already satisfied: regex>=2021.8.3 in /usr/local/lib/python3.7/dist-packages (from nltk->pke==2.0.0) (2022.6.2)\n",
66
+ "Building wheels for collected packages: pke, sklearn\n",
67
+ " Building wheel for pke (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
68
+ " Created wheel for pke: filename=pke-2.0.0-py3-none-any.whl size=6160276 sha256=6967c9216d570e0bbc7bab2c16f5f1810ecd62dcc9fad636e26ff35edbab3a68\n",
69
+ " Stored in directory: /tmp/pip-ephem-wheel-cache-_mu5g7sn/wheels/fa/b3/09/612ee93bf3ee4164bcd5783e742942cdfc892a86039d3e0a33\n",
70
+ " Building wheel for sklearn (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
71
+ " Created wheel for sklearn: filename=sklearn-0.0.post1-py3-none-any.whl size=2344 sha256=47f5287c3e5d1518e0617e1db17d093069e553338d6c0e359aa70352e6c78d66\n",
72
+ " Stored in directory: /root/.cache/pip/wheels/42/56/cc/4a8bf86613aafd5b7f1b310477667c1fca5c51c3ae4124a003\n",
73
+ "Successfully built pke sklearn\n",
74
+ "Installing collected packages: unidecode, sklearn, pke\n",
75
+ "Successfully installed pke-2.0.0 sklearn-0.0.post1 unidecode-1.3.6\n"
76
+ ]
77
+ }
78
+ ],
79
+ "source": [
80
+ "!pip install --quiet flashtext==2.7\n",
81
+ "!pip install git+https://github.com/boudinfl/pke.git\n"
82
+ ]
83
+ },
84
+ {
85
+ "cell_type": "code",
86
+ "execution_count": null,
87
+ "metadata": {
88
+ "id": "am3XUlr5evYK"
89
+ },
90
+ "outputs": [],
91
+ "source": [
92
+ "!pip install --quiet transformers==4.8.1\n",
93
+ "!pip install --quiet sentencepiece==0.1.95\n",
94
+ "!pip install --quiet textwrap3==0.9.2\n",
95
+ "!pip install gradio"
96
+ ]
97
+ },
98
+ {
99
+ "cell_type": "code",
100
+ "execution_count": null,
101
+ "metadata": {
102
+ "colab": {
103
+ "background_save": true
104
+ },
105
+ "id": "mhwpLyuBfFUK",
106
+ "outputId": "dc6f4900-429d-4815-c98c-b8625efcbe7b"
107
+ },
108
+ "outputs": [
109
+ {
110
+ "name": "stdout",
111
+ "output_type": "stream",
112
+ "text": [
113
+ "\u001b[?25l\r\u001b[K |███████▊ | 10 kB 27.7 MB/s eta 0:00:01\r\u001b[K |███████████████▌ | 20 kB 34.6 MB/s eta 0:00:01\r\u001b[K |███████████████████████▏ | 30 kB 15.4 MB/s eta 0:00:01\r\u001b[K |███████████████████████████████ | 40 kB 6.6 MB/s eta 0:00:01\r\u001b[K |████████████████████████████████| 42 kB 955 kB/s \n",
114
+ "\u001b[?25h"
115
+ ]
116
+ }
117
+ ],
118
+ "source": [
119
+ "!pip install --quiet strsim==0.0.3\n",
120
+ "!pip install --quiet sense2vec==2.0.0"
121
+ ]
122
+ },
123
+ {
124
+ "cell_type": "code",
125
+ "execution_count": null,
126
+ "metadata": {
127
+ "colab": {
128
+ "background_save": true
129
+ },
130
+ "id": "NcNXz17EfQLJ",
131
+ "outputId": "c90851f7-e320-48e3-d994-fcc5c174c636"
132
+ },
133
+ "outputs": [
134
+ {
135
+ "name": "stdout",
136
+ "output_type": "stream",
137
+ "text": [
138
+ "\u001b[?25l\r\u001b[K |▏ | 10 kB 10.5 MB/s eta 0:00:01\r\u001b[K |▍ | 20 kB 7.8 MB/s eta 0:00:01\r\u001b[K |▋ | 30 kB 11.1 MB/s eta 0:00:01\r\u001b[K |▉ | 40 kB 6.3 MB/s eta 0:00:01\r\u001b[K |█ | 51 kB 6.3 MB/s eta 0:00:01\r\u001b[K |█▎ | 61 kB 7.4 MB/s eta 0:00:01\r\u001b[K |█▌ | 71 kB 7.9 MB/s eta 0:00:01\r\u001b[K |█▊ | 81 kB 8.7 MB/s eta 0:00:01\r\u001b[K |█▉ | 92 kB 8.7 MB/s eta 0:00:01\r\u001b[K |██ | 102 kB 7.5 MB/s eta 0:00:01\r\u001b[K |██▎ | 112 kB 7.5 MB/s eta 0:00:01\r\u001b[K |██▌ | 122 kB 7.5 MB/s eta 0:00:01\r\u001b[K |██▊ | 133 kB 7.5 MB/s eta 0:00:01\r\u001b[K |███ | 143 kB 7.5 MB/s eta 0:00:01\r\u001b[K |███▏ | 153 kB 7.5 MB/s eta 0:00:01\r\u001b[K |███▍ | 163 kB 7.5 MB/s eta 0:00:01\r\u001b[K |███▌ | 174 kB 7.5 MB/s eta 0:00:01\r\u001b[K |███▊ | 184 kB 7.5 MB/s eta 0:00:01\r\u001b[K |████ | 194 kB 7.5 MB/s eta 0:00:01\r\u001b[K |████▏ | 204 kB 7.5 MB/s eta 0:00:01\r\u001b[K |████▍ | 215 kB 7.5 MB/s eta 0:00:01\r\u001b[K |████▋ | 225 kB 7.5 MB/s eta 0:00:01\r\u001b[K |████▉ | 235 kB 7.5 MB/s eta 0:00:01\r\u001b[K |█████ | 245 kB 7.5 MB/s eta 0:00:01\r\u001b[K |█████▎ | 256 kB 7.5 MB/s eta 0:00:01\r\u001b[K |█████▍ | 266 kB 7.5 MB/s eta 0:00:01\r\u001b[K |█████▋ | 276 kB 7.5 MB/s eta 0:00:01\r\u001b[K |█████▉ | 286 kB 7.5 MB/s eta 0:00:01\r\u001b[K |██████ | 296 kB 7.5 MB/s eta 0:00:01\r\u001b[K |██████▎ | 307 kB 7.5 MB/s eta 0:00:01\r\u001b[K |██████▌ | 317 kB 7.5 MB/s eta 0:00:01\r\u001b[K |██████▊ | 327 kB 7.5 MB/s eta 0:00:01\r\u001b[K |███████ | 337 kB 7.5 MB/s eta 0:00:01\r\u001b[K |███████ | 348 kB 7.5 MB/s eta 0:00:01\r\u001b[K |███████▎ | 358 kB 7.5 MB/s eta 0:00:01\r\u001b[K |███████▌ | 368 kB 7.5 MB/s eta 0:00:01\r\u001b[K |███████▊ | 378 kB 7.5 MB/s eta 0:00:01\r\u001b[K |████████ | 389 kB 7.5 MB/s eta 0:00:01\r\u001b[K |████████▏ | 399 kB 7.5 MB/s eta 0:00:01\r\u001b[K |████████▍ | 409 kB 7.5 MB/s eta 0:00:01\r\u001b[K |████████▋ | 419 kB 7.5 MB/s eta 0:00:01\r\u001b[K |████████▊ | 430 kB 7.5 MB/s eta 0:00:01\r\u001b[K |█████████ | 
440 kB 7.5 MB/s eta 0:00:01\r\u001b[K |█████████▏ | 450 kB 7.5 MB/s eta 0:00:01\r\u001b[K |█████████▍ | 460 kB 7.5 MB/s eta 0:00:01\r\u001b[K |█████████▋ | 471 kB 7.5 MB/s eta 0:00:01\r\u001b[K |█████████▉ | 481 kB 7.5 MB/s eta 0:00:01\r\u001b[K |██████████ | 491 kB 7.5 MB/s eta 0:00:01\r\u001b[K |██████████▎ | 501 kB 7.5 MB/s eta 0:00:01\r\u001b[K |██████████▌ | 512 kB 7.5 MB/s eta 0:00:01\r\u001b[K |██████████▋ | 522 kB 7.5 MB/s eta 0:00:01\r\u001b[K |██████████▉ | 532 kB 7.5 MB/s eta 0:00:01\r\u001b[K |███████████ | 542 kB 7.5 MB/s eta 0:00:01\r\u001b[K |███████████▎ | 552 kB 7.5 MB/s eta 0:00:01\r\u001b[K |███████████▌ | 563 kB 7.5 MB/s eta 0:00:01\r\u001b[K |███████████▊ | 573 kB 7.5 MB/s eta 0:00:01\r\u001b[K |██████���█████ | 583 kB 7.5 MB/s eta 0:00:01\r\u001b[K |████████████▏ | 593 kB 7.5 MB/s eta 0:00:01\r\u001b[K |████████████▎ | 604 kB 7.5 MB/s eta 0:00:01\r\u001b[K |████████████▌ | 614 kB 7.5 MB/s eta 0:00:01\r\u001b[K |████████████▊ | 624 kB 7.5 MB/s eta 0:00:01\r\u001b[K |█████████████ | 634 kB 7.5 MB/s eta 0:00:01\r\u001b[K |█████████████▏ | 645 kB 7.5 MB/s eta 0:00:01\r\u001b[K |█████████████▍ | 655 kB 7.5 MB/s eta 0:00:01\r\u001b[K |█████████████▋ | 665 kB 7.5 MB/s eta 0:00:01\r\u001b[K |█████████████▉ | 675 kB 7.5 MB/s eta 0:00:01\r\u001b[K |██████████████ | 686 kB 7.5 MB/s eta 0:00:01\r\u001b[K |██████████████▏ | 696 kB 7.5 MB/s eta 0:00:01\r\u001b[K |██████████████▍ | 706 kB 7.5 MB/s eta 0:00:01\r\u001b[K |██████████████▋ | 716 kB 7.5 MB/s eta 0:00:01\r\u001b[K |██████████████▉ | 727 kB 7.5 MB/s eta 0:00:01\r\u001b[K |███████████████ | 737 kB 7.5 MB/s eta 0:00:01\r\u001b[K |███████████████▎ | 747 kB 7.5 MB/s eta 0:00:01\r\u001b[K |███████████████▌ | 757 kB 7.5 MB/s eta 0:00:01\r\u001b[K |███████████████▊ | 768 kB 7.5 MB/s eta 0:00:01\r\u001b[K |███████████████▉ | 778 kB 7.5 MB/s eta 0:00:01\r\u001b[K |████████████████ | 788 kB 7.5 MB/s eta 0:00:01\r\u001b[K |████████████████▎ | 798 kB 7.5 MB/s eta 0:00:01\r\u001b[K |████████████████▌ | 808 kB 
7.5 MB/s eta 0:00:01\r\u001b[K |████████████████▊ | 819 kB 7.5 MB/s eta 0:00:01\r\u001b[K |█████████████████ | 829 kB 7.5 MB/s eta 0:00:01\r\u001b[K |█████████████████▏ | 839 kB 7.5 MB/s eta 0:00:01\r\u001b[K |█████████████████▍ | 849 kB 7.5 MB/s eta 0:00:01\r\u001b[K |█████████████████▌ | 860 kB 7.5 MB/s eta 0:00:01\r\u001b[K |█████████████████▊ | 870 kB 7.5 MB/s eta 0:00:01\r\u001b[K |██████████████████ | 880 kB 7.5 MB/s eta 0:00:01\r\u001b[K |██████████████████▏ | 890 kB 7.5 MB/s eta 0:00:01\r\u001b[K |██████████████████▍ | 901 kB 7.5 MB/s eta 0:00:01\r\u001b[K |██████████████████▋ | 911 kB 7.5 MB/s eta 0:00:01\r\u001b[K |██████████████████▉ | 921 kB 7.5 MB/s eta 0:00:01\r\u001b[K |███████████████████ | 931 kB 7.5 MB/s eta 0:00:01\r\u001b[K |███████████████████▎ | 942 kB 7.5 MB/s eta 0:00:01\r\u001b[K |███████████████████▍ | 952 kB 7.5 MB/s eta 0:00:01\r\u001b[K |███████████████████▋ | 962 kB 7.5 MB/s eta 0:00:01\r\u001b[K |███████████████████▉ | 972 kB 7.5 MB/s eta 0:00:01\r\u001b[K |████████████████████ | 983 kB 7.5 MB/s eta 0:00:01\r\u001b[K |████████████████████▎ | 993 kB 7.5 MB/s eta 0:00:01\r\u001b[K |████████████████████▌ | 1.0 MB 7.5 MB/s eta 0:00:01\r\u001b[K |████████████████████▊ | 1.0 MB 7.5 MB/s eta 0:00:01\r\u001b[K |█████████████████████ | 1.0 MB 7.5 MB/s eta 0:00:01\r\u001b[K |█████████████████████ | 1.0 MB 7.5 MB/s eta 0:00:01\r\u001b[K |█████████████████████▎ | 1.0 MB 7.5 MB/s eta 0:00:01\r\u001b[K |█████████████████████▌ | 1.1 MB 7.5 MB/s eta 0:00:01\r\u001b[K |█████████████████████▊ | 1.1 MB 7.5 MB/s eta 0:00:01\r\u001b[K |██████████████████████ | 1.1 MB 7.5 MB/s eta 0:00:01\r\u001b[K |██████████████████████▏ | 1.1 MB 7.5 MB/s eta 0:00:01\r\u001b[K |██████████████████████▍ | 1.1 MB 7.5 MB/s eta 0:00:01\r\u001b[K |██████████████████████▋ | 1.1 MB 7.5 MB/s eta 0:00:01\r\u001b[K |██████████████████████▊ | 1.1 MB 7.5 MB/s eta 0:00:01\r\u001b[K |███████████████████████ | 1.1 MB 7.5 MB/s eta 0:00:01\r\u001b[K |███████████████████████▏ | 1.1 MB 7.5 
MB/s eta 0:00:01\r\u001b[K |███████████████████████▍ | 1.1 MB 7.5 MB/s eta 0:00:01\r\u001b[K |███████████████████████▋ | 1.2 MB 7.5 MB/s eta 0:00:01\r\u001b[K |███████████████████████▉ | 1.2 MB 7.5 MB/s eta 0:00:01\r\u001b[K |████████████████████████ | 1.2 MB 7.5 MB/s eta 0:00:01\r\u001b[K |████████████████████████▎ | 1.2 MB 7.5 MB/s eta 0:00:01\r\u001b[K |████████████████████████▌ | 1.2 MB 7.5 MB/s eta 0:00:01\r\u001b[K |████████████████████████▋ | 1.2 MB 7.5 MB/s eta 0:00:01\r\u001b[K |████████████████████████▉ | 1.2 MB 7.5 MB/s eta 0:00:01\r\u001b[K |█████████████████████████ | 1.2 MB 7.5 MB/s eta 0:00:01\r\u001b[K |█████████████████████████▎ | 1.2 MB 7.5 MB/s eta 0:00:01\r\u001b[K |█████████████████████████▌ | 1.2 MB 7.5 MB/s eta 0:00:01\r\u001b[K |█████████████████████████▊ | 1.3 MB 7.5 MB/s eta 0:00:01\r\u001b[K |██████████████████████████ | 1.3 MB 7.5 MB/s eta 0:00:01\r\u001b[K |██████████████████████████▏ | 1.3 MB 7.5 MB/s eta 0:00:01\r\u001b[K |██████████████████████████▎ | 1.3 MB 7.5 MB/s eta 0:00:01\r\u001b[K |██████████████████████████▌ | 1.3 MB 7.5 MB/s eta 0:00:01\r\u001b[K |██████████████████████████▊ | 1.3 MB 7.5 MB/s eta 0:00:01\r\u001b[K |███████████████████████████ | 1.3 MB 7.5 MB/s eta 0:00:01\r\u001b[K |███████████████████████████▏ | 1.3 MB 7.5 MB/s eta 0:00:01\r\u001b[K |███████████████████████████▍ | 1.3 MB 7.5 MB/s eta 0:00:01\r\u001b[K |███████████████████████████▋ | 1.4 MB 7.5 MB/s eta 0:00:01\r\u001b[K |███████████████████████████▉ | 1.4 MB 7.5 MB/s eta 0:00:01\r\u001b[K |████████████████████████████ | 1.4 MB 7.5 MB/s eta 0:00:01\r\u001b[K |████████████████████████████▏ | 1.4 MB 7.5 MB/s eta 0:00:01\r\u001b[K |████████████████████████████▍ | 1.4 MB 7.5 MB/s eta 0:00:01\r\u001b[K |████████████████████████████▋ | 1.4 MB 7.5 MB/s eta 0:00:01\r\u001b[K |████████████████████████████▉ | 1.4 MB 7.5 MB/s eta 0:00:01\r\u001b[K |█████████████████████████████ | 1.4 MB 7.5 MB/s eta 0:00:01\r\u001b[K |█████████████████████████████▎ | 1.4 MB 7.5 MB/s 
eta 0:00:01\r\u001b[K |█████████████████████████████▌ | 1.4 MB 7.5 MB/s eta 0:00:01\r\u001b[K |█████████████████████████████▊ | 1.5 MB 7.5 MB/s eta 0:00:01\r\u001b[K |█████████████████████████████▉ | 1.5 MB 7.5 MB/s eta 0:00:01\r\u001b[K |██████████████████████████████ | 1.5 MB 7.5 MB/s eta 0:00:01\r\u001b[K |██████████████████████████████▎ | 1.5 MB 7.5 MB/s eta 0:00:01\r\u001b[K |██████████████████████████████▌ | 1.5 MB 7.5 MB/s eta 0:00:01\r\u001b[K |██████████████████████████████▊ | 1.5 MB 7.5 MB/s eta 0:00:01\r\u001b[K |███████████████████████████████ | 1.5 MB 7.5 MB/s eta 0:00:01\r\u001b[K |███████████████████████████████▏| 1.5 MB 7.5 MB/s eta 0:00:01\r\u001b[K |███████████████████████████████▍| 1.5 MB 7.5 MB/s eta 0:00:01\r\u001b[K |███████████████████████████████▌| 1.5 MB 7.5 MB/s eta 0:00:01\r\u001b[K |███████████████████████████████▊| 1.6 MB 7.5 MB/s eta 0:00:01\r\u001b[K |████████████████████████████████| 1.6 MB 7.5 MB/s eta 0:00:01\r\u001b[K |████████████████████████████████| 1.6 MB 7.5 MB/s \n",
139
+ "\u001b[?25htime: 506 µs (started: 2022-11-24 06:06:09 +00:00)\n"
140
+ ]
141
+ }
142
+ ],
143
+ "source": [
144
+ "!pip install --quiet ipython-autotime\n",
145
+ "%load_ext autotime"
146
+ ]
147
+ },
148
+ {
149
+ "cell_type": "code",
150
+ "execution_count": null,
151
+ "metadata": {
152
+ "colab": {
153
+ "background_save": true
154
+ },
155
+ "id": "Bijc_hfbfUwp",
156
+ "outputId": "54a7f895-8f08-452d-8f3a-8e5310a1aa6c"
157
+ },
158
+ "outputs": [
159
+ {
160
+ "name": "stdout",
161
+ "output_type": "stream",
162
+ "text": [
163
+ "\u001b[K |████████████████████████████████| 85 kB 3.9 MB/s \n",
164
+ "\u001b[K |████████████████████████████████| 182 kB 49.1 MB/s \n",
165
+ "\u001b[K |████████████████████████████████| 5.5 MB 54.9 MB/s \n",
166
+ "\u001b[K |████████████████████████████████| 7.6 MB 55.0 MB/s \n",
167
+ "\u001b[?25h Building wheel for sentence-transformers (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
168
+ "time: 10.4 s (started: 2022-11-24 06:06:09 +00:00)\n"
169
+ ]
170
+ }
171
+ ],
172
+ "source": [
173
+ "!pip install --quiet sentence-transformers==2.2.2"
174
+ ]
175
+ },
176
+ {
177
+ "cell_type": "markdown",
178
+ "metadata": {
179
+ "id": "bmVx9L0yfgvR"
180
+ },
181
+ "source": [
182
+ "The code below restarts the Colab notebook. Once it has restarted, continue from the next section; there is no need to run this installation section again."
183
+ ]
184
+ },
185
+ {
186
+ "cell_type": "code",
187
+ "execution_count": null,
188
+ "metadata": {
189
+ "colab": {
190
+ "background_save": true
191
+ },
192
+ "id": "uPO9U__1fZWh",
193
+ "outputId": "31e8d745-2a88-4bd6-f136-55cd2147ee3f"
194
+ },
195
+ "outputs": [
196
+ {
197
+ "name": "stdout",
198
+ "output_type": "stream",
199
+ "text": [
200
+ "time: 556 µs (started: 2022-11-24 06:06:20 +00:00)\n"
201
+ ]
202
+ }
203
+ ],
204
+ "source": [
205
+ "# import os\n",
206
+ "# os.kill(os.getpid(), 9)"
207
+ ]
208
+ },
209
+ {
210
+ "cell_type": "markdown",
211
+ "metadata": {
212
+ "id": "POh2_zvgrk0h"
213
+ },
214
+ "source": [
215
+ "## Example 1"
216
+ ]
217
+ },
218
+ {
219
+ "cell_type": "markdown",
220
+ "metadata": {
221
+ "id": "VJP4CDBBrnNY"
222
+ },
223
+ "source": [
224
+ "Text taken from: \n",
225
+ "https://gadgets.ndtv.com/internet/news/dogecoin-price-rally-surge-elon-musk-tweet-twitter-working-developers-improve-transaction-efficiency-2442120"
226
+ ]
227
+ },
228
+ {
229
+ "cell_type": "code",
230
+ "execution_count": null,
231
+ "metadata": {
232
+ "colab": {
233
+ "background_save": true
234
+ },
235
+ "id": "P_jlw7MUfjOp",
236
+ "outputId": "fd3e08da-3595-445d-941f-2c8047e34f08"
237
+ },
238
+ "outputs": [
239
+ {
240
+ "name": "stdout",
241
+ "output_type": "stream",
242
+ "text": [
243
+ "Elon Musk has shown again he can influence the digital currency market with just his tweets. After saying that his electric vehicle-making company\n",
244
+ "Tesla will not accept payments in Bitcoin because of environmental concerns, he tweeted that he was working with developers of Dogecoin to improve\n",
245
+ "system transaction efficiency. Following the two distinct statements from him, the world's largest cryptocurrency hit a two-month low, while Dogecoin\n",
246
+ "rallied by about 20 percent. The SpaceX CEO has in recent months often tweeted in support of Dogecoin, but rarely for Bitcoin. In a recent tweet,\n",
247
+ "Musk put out a statement from Tesla that it was “concerned” about the rapidly increasing use of fossil fuels for Bitcoin (price in India) mining and\n",
248
+ "transaction, and hence was suspending vehicle purchases using the cryptocurrency. A day later he again tweeted saying, “To be clear, I strongly\n",
249
+ "believe in crypto, but it can't drive a massive increase in fossil fuel use, especially coal”. It triggered a downward spiral for Bitcoin value but\n",
250
+ "the cryptocurrency has stabilised since. A number of Twitter users welcomed Musk's statement. One of them said it's time people started realising\n",
251
+ "that Dogecoin “is here to stay” and another referred to Musk's previous assertion that crypto could become the world's future currency.\n",
252
+ "\n",
253
+ "\n",
254
+ "time: 18.8 ms (started: 2022-11-24 06:06:20 +00:00)\n"
255
+ ]
256
+ }
257
+ ],
258
+ "source": [
259
+ "from textwrap3 import wrap\n",
260
+ "\n",
261
+ "text = \"\"\"Elon Musk has shown again he can influence the digital currency market with just his tweets. After saying that his electric vehicle-making company\n",
262
+ "Tesla will not accept payments in Bitcoin because of environmental concerns, he tweeted that he was working with developers of Dogecoin to improve\n",
263
+ "system transaction efficiency. Following the two distinct statements from him, the world's largest cryptocurrency hit a two-month low, while Dogecoin\n",
264
+ "rallied by about 20 percent. The SpaceX CEO has in recent months often tweeted in support of Dogecoin, but rarely for Bitcoin. In a recent tweet,\n",
265
+ "Musk put out a statement from Tesla that it was “concerned” about the rapidly increasing use of fossil fuels for Bitcoin (price in India) mining and\n",
266
+ "transaction, and hence was suspending vehicle purchases using the cryptocurrency. A day later he again tweeted saying, “To be clear, I strongly\n",
267
+ "believe in crypto, but it can't drive a massive increase in fossil fuel use, especially coal”. It triggered a downward spiral for Bitcoin value but\n",
268
+ "the cryptocurrency has stabilised since. A number of Twitter users welcomed Musk's statement. One of them said it's time people started realising\n",
269
+ "that Dogecoin “is here to stay” and another referred to Musk's previous assertion that crypto could become the world's future currency.\"\"\"\n",
270
+ "\n",
271
+ "for wrp in wrap(text, 150):\n",
272
+ " print (wrp)\n",
273
+ "print (\"\\n\")"
274
+ ]
275
+ },
276
+ {
277
+ "cell_type": "markdown",
278
+ "metadata": {
279
+ "id": "ShPNEZz8u7s6"
280
+ },
281
+ "source": [
282
+ "# **Summarization with T5**"
283
+ ]
284
+ },
285
+ {
286
+ "cell_type": "code",
287
+ "execution_count": null,
288
+ "metadata": {
289
+ "colab": {
290
+ "background_save": true,
291
+ "referenced_widgets": [
292
+ "c9c2e5d5824345f780befcf11d6ff946",
293
+ "c39b4e7e424d4f64a8fb25495f8c7026",
294
+ "543714c7a41a4429a57a069bc2eca1dc"
295
+ ]
296
+ },
297
+ "id": "H1eIU521rrn5",
298
+ "outputId": "d3bb1402-1cba-4881-b05f-b8e24bb19278"
299
+ },
300
+ "outputs": [
301
+ {
302
+ "data": {
303
+ "application/vnd.jupyter.widget-view+json": {
304
+ "model_id": "c9c2e5d5824345f780befcf11d6ff946",
305
+ "version_major": 2,
306
+ "version_minor": 0
307
+ },
308
+ "text/plain": [
309
+ "Downloading: 0%| | 0.00/1.20k [00:00<?, ?B/s]"
310
+ ]
311
+ },
312
+ "metadata": {},
313
+ "output_type": "display_data"
314
+ },
315
+ {
316
+ "data": {
317
+ "application/vnd.jupyter.widget-view+json": {
318
+ "model_id": "c39b4e7e424d4f64a8fb25495f8c7026",
319
+ "version_major": 2,
320
+ "version_minor": 0
321
+ },
322
+ "text/plain": [
323
+ "Downloading: 0%| | 0.00/892M [00:00<?, ?B/s]"
324
+ ]
325
+ },
326
+ "metadata": {},
327
+ "output_type": "display_data"
328
+ },
329
+ {
330
+ "data": {
331
+ "application/vnd.jupyter.widget-view+json": {
332
+ "model_id": "543714c7a41a4429a57a069bc2eca1dc",
333
+ "version_major": 2,
334
+ "version_minor": 0
335
+ },
336
+ "text/plain": [
337
+ "Downloading: 0%| | 0.00/792k [00:00<?, ?B/s]"
338
+ ]
339
+ },
340
+ "metadata": {},
341
+ "output_type": "display_data"
342
+ },
343
+ {
344
+ "name": "stderr",
345
+ "output_type": "stream",
346
+ "text": [
347
+ "/usr/local/lib/python3.7/dist-packages/transformers/models/t5/tokenization_t5.py:174: FutureWarning: This tokenizer was incorrectly instantiated with a model max length of 512 which will be corrected in Transformers v5.\n",
348
+ "For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.\n",
349
+ "- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.\n",
350
+ "- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.\n",
351
+ "- To avoid this warning, please instantiate this tokenizer with `model_max_length` set to your preferred value.\n",
352
+ " FutureWarning,\n"
353
+ ]
354
+ },
355
+ {
356
+ "name": "stdout",
357
+ "output_type": "stream",
358
+ "text": [
359
+ "time: 30.6 s (started: 2022-11-24 06:06:20 +00:00)\n"
360
+ ]
361
+ }
362
+ ],
363
+ "source": [
364
+ "import torch\n",
365
+ "from transformers import T5ForConditionalGeneration,T5Tokenizer\n",
366
+ "summary_model = T5ForConditionalGeneration.from_pretrained('t5-base')\n",
367
+ "summary_tokenizer = T5Tokenizer.from_pretrained('t5-base')\n",
368
+ "\n",
369
+ "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
370
+ "summary_model = summary_model.to(device)\n"
371
+ ]
372
+ },
373
+ {
374
+ "cell_type": "code",
375
+ "execution_count": null,
376
+ "metadata": {
377
+ "colab": {
378
+ "background_save": true
379
+ },
380
+ "id": "8mVsjMPTu-bj",
381
+ "outputId": "e0ac198d-4625-4f8f-a2fd-9968c0a5a72d"
382
+ },
383
+ "outputs": [
384
+ {
385
+ "name": "stdout",
386
+ "output_type": "stream",
387
+ "text": [
388
+ "time: 1.03 ms (started: 2022-11-24 06:06:50 +00:00)\n"
389
+ ]
390
+ }
391
+ ],
392
+ "source": [
393
+ "import random\n",
394
+ "import numpy as np\n",
395
+ "\n",
396
+ "def set_seed(seed: int):\n",
397
+ " random.seed(seed)\n",
398
+ " np.random.seed(seed)\n",
399
+ " torch.manual_seed(seed)\n",
400
+ " torch.cuda.manual_seed_all(seed)\n",
401
+ "\n",
402
+ "set_seed(42)"
403
+ ]
404
+ },
405
+ {
406
+ "cell_type": "code",
407
+ "execution_count": null,
408
+ "metadata": {
409
+ "colab": {
410
+ "background_save": true
411
+ },
412
+ "id": "Gh2Xc5JRvQDp",
413
+ "outputId": "c1198166-2a2b-4571-b831-3ed1a8705c9e"
414
+ },
415
+ "outputs": [
416
+ {
417
+ "name": "stderr",
418
+ "output_type": "stream",
419
+ "text": [
420
+ "[nltk_data] Downloading package punkt to /root/nltk_data...\n",
421
+ "[nltk_data] Unzipping tokenizers/punkt.zip.\n",
422
+ "[nltk_data] Downloading package brown to /root/nltk_data...\n",
423
+ "[nltk_data] Unzipping corpora/brown.zip.\n",
424
+ "[nltk_data] Downloading package wordnet to /root/nltk_data...\n"
425
+ ]
426
+ },
427
+ {
428
+ "name": "stdout",
429
+ "output_type": "stream",
430
+ "text": [
431
+ "\n",
432
+ "original Text >>\n",
433
+ "Elon Musk has shown again he can influence the digital currency market with just his tweets. After saying that his electric vehicle-making company\n",
434
+ "Tesla will not accept payments in Bitcoin because of environmental concerns, he tweeted that he was working with developers of Dogecoin to improve\n",
435
+ "system transaction efficiency. Following the two distinct statements from him, the world's largest cryptocurrency hit a two-month low, while Dogecoin\n",
436
+ "rallied by about 20 percent. The SpaceX CEO has in recent months often tweeted in support of Dogecoin, but rarely for Bitcoin. In a recent tweet,\n",
437
+ "Musk put out a statement from Tesla that it was “concerned” about the rapidly increasing use of fossil fuels for Bitcoin (price in India) mining and\n",
438
+ "transaction, and hence was suspending vehicle purchases using the cryptocurrency. A day later he again tweeted saying, “To be clear, I strongly\n",
439
+ "believe in crypto, but it can't drive a massive increase in fossil fuel use, especially coal”. It triggered a downward spiral for Bitcoin value but\n",
440
+ "the cryptocurrency has stabilised since. A number of Twitter users welcomed Musk's statement. One of them said it's time people started realising\n",
441
+ "that Dogecoin “is here to stay” and another referred to Musk's previous assertion that crypto could become the world's future currency.\n",
442
+ "\n",
443
+ "\n",
444
+ "Summarized Text >>\n",
445
+ "Musk tweeted that his electric vehicle-making company tesla will not accept payments in bitcoin because of environmental concerns. He also said that\n",
446
+ "the company was working with developers of dogecoin to improve system transaction efficiency. The world's largest cryptocurrency hit a two-month low,\n",
447
+ "while doge coin rallied by about 20 percent. Musk has in recent months often tweeted in support of crypto, but rarely for bitcoin.\n",
448
+ "\n",
449
+ "\n",
450
+ "time: 6.14 s (started: 2022-11-24 06:06:50 +00:00)\n"
451
+ ]
452
+ }
453
+ ],
454
+ "source": [
455
+ "import nltk\n",
456
+ "nltk.download('punkt')\n",
457
+ "nltk.download('brown')\n",
458
+ "nltk.download('wordnet')\n",
459
+ "from nltk.corpus import wordnet as wn\n",
460
+ "from nltk.tokenize import sent_tokenize\n",
461
+ "\n",
462
+ "def postprocesstext (content):\n",
463
+ " final=\"\"\n",
464
+ " for sent in sent_tokenize(content):\n",
465
+ " sent = sent.capitalize()\n",
466
+ " final = final +\" \"+sent\n",
467
+ " return final\n",
468
+ "\n",
469
+ "\n",
470
+ "def summarizer(text,model,tokenizer):\n",
471
+ " text = text.strip().replace(\"\\n\",\" \")\n",
472
+ " text = \"summarize: \"+text\n",
473
+ " # print (text)\n",
474
+ " max_len = 512\n",
475
+ " encoding = tokenizer.encode_plus(text,max_length=max_len, pad_to_max_length=False,truncation=True, return_tensors=\"pt\").to(device)\n",
476
+ "\n",
477
+ " input_ids, attention_mask = encoding[\"input_ids\"], encoding[\"attention_mask\"]\n",
478
+ "\n",
479
+ " outs = model.generate(input_ids=input_ids,\n",
480
+ " attention_mask=attention_mask,\n",
481
+ " early_stopping=True,\n",
482
+ " num_beams=3,\n",
483
+ " num_return_sequences=1,\n",
484
+ " no_repeat_ngram_size=2,\n",
485
+ " min_length = 75,\n",
486
+ " max_length=300)\n",
487
+ "\n",
488
+ "\n",
489
+ " dec = [tokenizer.decode(ids,skip_special_tokens=True) for ids in outs]\n",
490
+ " summary = dec[0]\n",
491
+ " summary = postprocesstext(summary)\n",
492
+ " summary= summary.strip()\n",
493
+ "\n",
494
+ " return summary\n",
495
+ "\n",
496
+ "\n",
497
+ "summarized_text = summarizer(text,summary_model,summary_tokenizer)\n",
498
+ "\n",
499
+ "\n",
500
+ "print (\"\\noriginal Text >>\")\n",
501
+ "for wrp in wrap(text, 150):\n",
502
+ " print (wrp)\n",
503
+ "print (\"\\n\")\n",
504
+ "print (\"Summarized Text >>\")\n",
505
+ "for wrp in wrap(summarized_text, 150):\n",
506
+ " print (wrp)\n",
507
+ "print (\"\\n\")"
508
+ ]
509
+ },
510
+ {
511
+ "cell_type": "markdown",
512
+ "metadata": {
513
+ "id": "JvBHu5eXv_wp"
514
+ },
515
+ "source": [
516
+ "# **Answer Span Extraction (Keywords and Noun Phrases)**"
517
+ ]
518
+ },
519
+ {
520
+ "cell_type": "code",
521
+ "execution_count": null,
522
+ "metadata": {
523
+ "colab": {
524
+ "background_save": true
525
+ },
526
+ "id": "84DxJGFn4MfD",
527
+ "outputId": "27c39b58-dcaa-4b92-ff9e-0da292be34d9"
528
+ },
529
+ "outputs": [
530
+ {
531
+ "name": "stderr",
532
+ "output_type": "stream",
533
+ "text": [
534
+ "[nltk_data] Downloading package stopwords to /root/nltk_data...\n",
535
+ "[nltk_data] Unzipping corpora/stopwords.zip.\n"
536
+ ]
537
+ },
538
+ {
539
+ "name": "stdout",
540
+ "output_type": "stream",
541
+ "text": [
542
+ "time: 8.23 s (started: 2022-11-24 06:06:56 +00:00)\n"
543
+ ]
544
+ }
545
+ ],
546
+ "source": [
547
+ "import nltk\n",
548
+ "nltk.download('stopwords')\n",
549
+ "from nltk.corpus import stopwords\n",
550
+ "import string\n",
551
+ "import pke\n",
552
+ "import traceback\n",
553
+ "\n",
554
+ "def get_nouns_multipartite(content):\n",
555
+ " out=[]\n",
556
+ " try:\n",
557
+ " extractor = pke.unsupervised.MultipartiteRank()\n",
558
+ " extractor.load_document(input=content,language='en')\n",
559
+ " # Select noun and proper-noun sequences as candidates; the stoplist built below is not passed to candidate_selection.\n",
560
+ " pos = {'PROPN','NOUN'}\n",
561
+ " #pos = {'PROPN','NOUN'}\n",
562
+ " stoplist = list(string.punctuation)\n",
563
+ " stoplist += ['-lrb-', '-rrb-', '-lcb-', '-rcb-', '-lsb-', '-rsb-']\n",
564
+ " stoplist += stopwords.words('english')\n",
565
+ " # extractor.candidate_selection(pos=pos, stoplist=stoplist)\n",
566
+ " extractor.candidate_selection(pos=pos)\n",
567
+ " # 4. build the Multipartite graph and rank candidates using random walk,\n",
568
+ " # alpha controls the weight adjustment mechanism, see TopicRank for\n",
569
+ " # threshold/method parameters.\n",
570
+ " extractor.candidate_weighting(alpha=1.1,\n",
571
+ " threshold=0.75,\n",
572
+ " method='average')\n",
573
+ " keyphrases = extractor.get_n_best(n=15)\n",
574
+ " \n",
575
+ "\n",
576
+ " for val in keyphrases:\n",
577
+ " out.append(val[0])\n",
578
+ " except:\n",
579
+ " out = []\n",
580
+ " traceback.print_exc()\n",
581
+ "\n",
582
+ " return out"
583
+ ]
584
+ },
585
+ {
586
+ "cell_type": "code",
587
+ "execution_count": null,
588
+ "metadata": {
589
+ "colab": {
590
+ "background_save": true
591
+ },
592
+ "id": "E8LNRzDVwDbp",
593
+ "outputId": "c2ae2bda-8250-4e82-ed71-d10568251e68"
594
+ },
595
+ "outputs": [
596
+ {
597
+ "name": "stdout",
598
+ "output_type": "stream",
599
+ "text": [
600
+ "keywords unsummarized: ['elon musk', 'dogecoin', 'bitcoin', 'statements', 'use', 'cryptocurrency', 'tesla', 'tweets', 'musk', 'system transaction efficiency', 'currency market', 'world', 'price', 'payments', 'company']\n",
601
+ "keywords_found in summarized: ['world', 'dogecoin', 'musk', 'cryptocurrency', 'system transaction efficiency', 'payments', 'company', 'bitcoin', 'tesla']\n",
602
+ "['dogecoin', 'bitcoin', 'cryptocurrency', 'tesla', 'musk', 'system transaction efficiency', 'world', 'payments', 'company']\n",
603
+ "time: 785 ms (started: 2022-11-24 06:07:05 +00:00)\n"
604
+ ]
605
+ }
606
+ ],
607
+ "source": [
608
+ "from flashtext import KeywordProcessor\n",
609
+ "\n",
610
+ "\n",
611
+ "def get_keywords(originaltext,summarytext):\n",
612
+ " keywords = get_nouns_multipartite(originaltext)\n",
613
+ " print (\"keywords unsummarized: \",keywords)\n",
614
+ " keyword_processor = KeywordProcessor()\n",
615
+ " for keyword in keywords:\n",
616
+ " keyword_processor.add_keyword(keyword)\n",
617
+ "\n",
618
+ " keywords_found = keyword_processor.extract_keywords(summarytext)\n",
619
+ " keywords_found = list(set(keywords_found))\n",
620
+ " print (\"keywords_found in summarized: \",keywords_found)\n",
621
+ "\n",
622
+ " important_keywords =[]\n",
623
+ " for keyword in keywords:\n",
624
+ " if keyword in keywords_found:\n",
625
+ " important_keywords.append(keyword)\n",
626
+ "\n",
627
+ " return important_keywords[:10]\n",
628
+ "\n",
629
+ "\n",
630
+ "imp_keywords = get_keywords(text,summarized_text)\n",
631
+ "print (imp_keywords)\n"
632
+ ]
633
+ },
634
+ {
635
+ "cell_type": "code",
636
+ "execution_count": null,
637
+ "metadata": {
638
+ "colab": {
639
+ "background_save": true,
640
+ "referenced_widgets": [
641
+ "24334ddee9f74d3c82a575f0edbc8720",
642
+ "c884156893794fa6bad4171a9aacbd2f",
643
+ "2f0d8bf7b60a423383ae6ab2469106eb",
644
+ "70c932999b0f4dcda0525b9a81ceabf3",
645
+ "7897cc69283d475694042ed9cbc6e92c"
646
+ ]
647
+ },
648
+ "id": "m44RM44OwGzR",
649
+ "outputId": "ca45cae8-a813-4425-9adc-3d8e0f886324"
650
+ },
651
+ "outputs": [
652
+ {
653
+ "data": {
654
+ "application/vnd.jupyter.widget-view+json": {
655
+ "model_id": "24334ddee9f74d3c82a575f0edbc8720",
656
+ "version_major": 2,
657
+ "version_minor": 0
658
+ },
659
+ "text/plain": [
660
+ "Downloading: 0%| | 0.00/1.21k [00:00<?, ?B/s]"
661
+ ]
662
+ },
663
+ "metadata": {},
664
+ "output_type": "display_data"
665
+ },
666
+ {
667
+ "data": {
668
+ "application/vnd.jupyter.widget-view+json": {
669
+ "model_id": "c884156893794fa6bad4171a9aacbd2f",
670
+ "version_major": 2,
671
+ "version_minor": 0
672
+ },
673
+ "text/plain": [
674
+ "Downloading: 0%| | 0.00/892M [00:00<?, ?B/s]"
675
+ ]
676
+ },
677
+ "metadata": {},
678
+ "output_type": "display_data"
679
+ },
680
+ {
681
+ "data": {
682
+ "application/vnd.jupyter.widget-view+json": {
683
+ "model_id": "2f0d8bf7b60a423383ae6ab2469106eb",
684
+ "version_major": 2,
685
+ "version_minor": 0
686
+ },
687
+ "text/plain": [
688
+ "Downloading: 0%| | 0.00/792k [00:00<?, ?B/s]"
689
+ ]
690
+ },
691
+ "metadata": {},
692
+ "output_type": "display_data"
693
+ },
694
+ {
695
+ "data": {
696
+ "application/vnd.jupyter.widget-view+json": {
697
+ "model_id": "70c932999b0f4dcda0525b9a81ceabf3",
698
+ "version_major": 2,
699
+ "version_minor": 0
700
+ },
701
+ "text/plain": [
702
+ "Downloading: 0%| | 0.00/1.79k [00:00<?, ?B/s]"
703
+ ]
704
+ },
705
+ "metadata": {},
706
+ "output_type": "display_data"
707
+ },
708
+ {
709
+ "data": {
710
+ "application/vnd.jupyter.widget-view+json": {
711
+ "model_id": "7897cc69283d475694042ed9cbc6e92c",
712
+ "version_major": 2,
713
+ "version_minor": 0
714
+ },
715
+ "text/plain": [
716
+ "Downloading: 0%| | 0.00/1.86k [00:00<?, ?B/s]"
717
+ ]
718
+ },
719
+ "metadata": {},
720
+ "output_type": "display_data"
721
+ },
722
+ {
723
+ "name": "stdout",
724
+ "output_type": "stream",
725
+ "text": [
726
+ "time: 35.2 s (started: 2022-11-24 06:07:05 +00:00)\n"
727
+ ]
728
+ }
729
+ ],
730
+ "source": [
731
+ "question_model = T5ForConditionalGeneration.from_pretrained('ramsrigouthamg/t5_squad_v1')\n",
732
+ "question_tokenizer = T5Tokenizer.from_pretrained('ramsrigouthamg/t5_squad_v1')\n",
733
+ "question_model = question_model.to(device)"
734
+ ]
735
+ },
736
+ {
737
+ "cell_type": "code",
738
+ "execution_count": null,
739
+ "metadata": {
740
+ "colab": {
741
+ "background_save": true
742
+ },
743
+ "id": "1usLabLu5DUB",
744
+ "outputId": "69d364b6-ee46-46d2-ee22-19b1fe5b2411"
745
+ },
746
+ "outputs": [
747
+ {
748
+ "name": "stdout",
749
+ "output_type": "stream",
750
+ "text": [
751
+ "Musk tweeted that his electric vehicle-making company tesla will not accept payments in bitcoin because of environmental concerns. He also said that\n",
752
+ "the company was working with developers of dogecoin to improve system transaction efficiency. The world's largest cryptocurrency hit a two-month low,\n",
753
+ "while doge coin rallied by about 20 percent. Musk has in recent months often tweeted in support of crypto, but rarely for bitcoin.\n",
754
+ "\n",
755
+ "\n",
756
+ "What did Musk say he was working with to improve system transaction efficiency?\n",
757
+ "Dogecoin\n",
758
+ "\n",
759
+ "\n",
760
+ "What cryptocurrency did Musk rarely tweet about?\n",
761
+ "Bitcoin\n",
762
+ "\n",
763
+ "\n",
764
+ "What has Musk often tweeted in support of?\n",
765
+ "Cryptocurrency\n",
766
+ "\n",
767
+ "\n",
768
+ "What company did Musk say would not accept bitcoin payments?\n",
769
+ "Tesla\n",
770
+ "\n",
771
+ "\n",
772
+ "Who said tesla would not accept bitcoin payments?\n",
773
+ "Musk\n",
774
+ "\n",
775
+ "\n",
776
+ "What did Musk want to improve with dogecoin?\n",
777
+ "System transaction efficiency\n",
778
+ "\n",
779
+ "\n",
780
+ "What is the largest cryptocurrency?\n",
781
+ "World\n",
782
+ "\n",
783
+ "\n",
784
+ "What did Musk say his company would not accept in bitcoin?\n",
785
+ "Payments\n",
786
+ "\n",
787
+ "\n",
788
+ "What did Musk say was working with dogecoin developers?\n",
789
+ "Company\n",
790
+ "\n",
791
+ "\n",
792
+ "time: 2.78 s (started: 2022-11-24 06:07:41 +00:00)\n"
793
+ ]
794
+ }
795
+ ],
796
+ "source": [
797
+ "def get_question(context,answer,model,tokenizer):\n",
798
+ " text = \"context: {} answer: {}\".format(context,answer)\n",
799
+ " encoding = tokenizer.encode_plus(text,max_length=384, pad_to_max_length=False,truncation=True, return_tensors=\"pt\").to(device)\n",
800
+ " input_ids, attention_mask = encoding[\"input_ids\"], encoding[\"attention_mask\"]\n",
801
+ "\n",
802
+ " outs = model.generate(input_ids=input_ids,\n",
803
+ " attention_mask=attention_mask,\n",
804
+ " early_stopping=True,\n",
805
+ " num_beams=5,\n",
806
+ " num_return_sequences=1,\n",
807
+ " no_repeat_ngram_size=2,\n",
808
+ " max_length=72)\n",
809
+ "\n",
810
+ "\n",
811
+ " dec = [tokenizer.decode(ids,skip_special_tokens=True) for ids in outs]\n",
812
+ "\n",
813
+ "\n",
814
+ " Question = dec[0].replace(\"question:\",\"\")\n",
815
+ " Question= Question.strip()\n",
816
+ " return Question\n",
817
+ "\n",
818
+ "\n",
819
+ "\n",
820
+ "for wrp in wrap(summarized_text, 150):\n",
821
+ " print (wrp)\n",
822
+ "print (\"\\n\")\n",
823
+ "\n",
824
+ "for answer in imp_keywords:\n",
825
+ " ques = get_question(summarized_text,answer,question_model,question_tokenizer)\n",
826
+ " print (ques)\n",
827
+ " print (answer.capitalize())\n",
828
+ " print (\"\\n\")\n"
829
+ ]
830
+ },
831
+ {
832
+ "cell_type": "code",
833
+ "execution_count": null,
834
+ "metadata": {
835
+ "id": "4kEuH__G6oDK",
836
+ "colab": {
837
+ "base_uri": "https://localhost:8080/",
838
+ "height": 740
839
+ },
840
+ "outputId": "8a8b7911-1e79-403e-9601-6f7221fc8bd7"
841
+ },
842
+ "outputs": [
843
+ {
844
+ "metadata": {
845
+ "tags": null
846
+ },
847
+ "name": "stderr",
848
+ "output_type": "stream",
849
+ "text": [
850
+ "/usr/local/lib/python3.7/dist-packages/gradio/inputs.py:27: UserWarning: Usage of gradio.inputs is deprecated, and will not be supported in the future, please import your component from gradio.components\n",
851
+ " \"Usage of gradio.inputs is deprecated, and will not be supported in the future, please import your component from gradio.components\",\n",
852
+ "/usr/local/lib/python3.7/dist-packages/gradio/deprecation.py:40: UserWarning: `optional` parameter is deprecated, and it has no effect\n",
853
+ " warnings.warn(value)\n",
854
+ "/usr/local/lib/python3.7/dist-packages/gradio/deprecation.py:40: UserWarning: `numeric` parameter is deprecated, and it has no effect\n",
855
+ " warnings.warn(value)\n"
856
+ ]
857
+ },
858
+ {
859
+ "metadata": {
860
+ "tags": null
861
+ },
862
+ "name": "stdout",
863
+ "output_type": "stream",
864
+ "text": [
865
+ "Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().\n",
866
+ "Note: opening Chrome Inspector may crash demo inside Colab notebooks.\n",
867
+ "\n",
868
+ "To create a public link, set `share=True` in `launch()`.\n"
869
+ ]
870
+ },
871
+ {
872
+ "data": {
873
+ "application/javascript": [
874
+ "(async (port, path, width, height, cache, element) => {\n",
875
+ " if (!google.colab.kernel.accessAllowed && !cache) {\n",
876
+ " return;\n",
877
+ " }\n",
878
+ " element.appendChild(document.createTextNode(''));\n",
879
+ " const url = await google.colab.kernel.proxyPort(port, {cache});\n",
880
+ "\n",
881
+ " const external_link = document.createElement('div');\n",
882
+ " external_link.innerHTML = `\n",
883
+ " <div style=\"font-family: monospace; margin-bottom: 0.5rem\">\n",
884
+ " Running on <a href=${new URL(path, url).toString()} target=\"_blank\">\n",
885
+ " https://localhost:${port}${path}\n",
886
+ " </a>\n",
887
+ " </div>\n",
888
+ " `;\n",
889
+ " element.appendChild(external_link);\n",
890
+ "\n",
891
+ " const iframe = document.createElement('iframe');\n",
892
+ " iframe.src = new URL(path, url).toString();\n",
893
+ " iframe.height = height;\n",
894
+ " iframe.allow = \"autoplay; camera; microphone; clipboard-read; clipboard-write;\"\n",
895
+ " iframe.width = width;\n",
896
+ " iframe.style.border = 0;\n",
897
+ " element.appendChild(iframe);\n",
898
+ " })(7860, \"/\", \"100%\", 500, false, window.element)"
899
+ ],
900
+ "text/plain": [
901
+ "<IPython.core.display.Javascript object>"
902
+ ]
903
+ },
904
+ "metadata": {},
905
+ "output_type": "display_data"
906
+ }
907
+ ],
908
+ "source": [
909
+ "import gradio as gr\n",
910
+ "\n",
911
+ "context = gr.inputs.Textbox(lines=10, placeholder=\"Enter paragraph/content here...\")\n",
912
+ "output = gr.outputs.HTML( label=\"Question and Answers\")\n",
913
+ "\n",
914
+ "\n",
915
+ "def generate_question(context):\n",
916
+ " summary_text = summarizer(context,summary_model,summary_tokenizer)\n",
917
+ " for wrp in wrap(summary_text, 150):\n",
918
+ " print (wrp)\n",
919
+ " np = get_keywords(context,summary_text)\n",
920
+ " print (\"\\n\\nNoun phrases\",np)\n",
921
+ " output=\"\"\n",
922
+ " for answer in np:\n",
923
+ " ques = get_question(summary_text,answer,question_model,question_tokenizer)\n",
924
+ " # output= output + ques + \"\\n\" + \"Ans: \"+answer.capitalize() + \"\\n\\n\"\n",
925
+ " output = output + \"<b style='color:blue;'>\" + ques + \"</b>\"\n",
926
+ " output = output + \"<br>\"\n",
927
+ " output = output + \"<b style='color:green;'>\" + \"Ans: \" +answer.capitalize()+ \"</b>\"\n",
928
+ " output = output + \"<br>\"\n",
929
+ "\n",
930
+ " summary =\"Summary: \"+ summary_text\n",
931
+ " for answer in np:\n",
932
+ " summary = summary.replace(answer,\"<b>\"+answer+\"</b>\")\n",
933
+ " summary = summary.replace(answer.capitalize(),\"<b>\"+answer.capitalize()+\"</b>\")\n",
934
+ " output = output + \"<p>\"+summary+\"</p>\"\n",
935
+ " \n",
936
+ " return output\n",
937
+ "\n",
938
+ "iface = gr.Interface(\n",
939
+ " fn=generate_question, \n",
940
+ " inputs=context, \n",
941
+ " outputs=output)\n",
942
+ "iface.launch(debug=True)"
943
+ ]
944
+ },
945
+ {
946
+ "cell_type": "markdown",
947
+ "metadata": {
948
+ "id": "dNmJx7QNfLcy"
949
+ },
950
+ "source": [
951
+ "# **Filter keywords with Maximal Marginal Relevance**"
952
+ ]
953
+ },
954
+ {
955
+ "cell_type": "code",
956
+ "execution_count": null,
957
+ "metadata": {
958
+ "id": "zPBj-IUL7L8x"
959
+ },
960
+ "outputs": [],
961
+ "source": [
962
+ "!wget https://github.com/explosion/sense2vec/releases/download/v1.0.0/s2v_reddit_2015_md.tar.gz\n",
963
+ "!tar -xvf s2v_reddit_2015_md.tar.gz"
964
+ ]
965
+ },
966
+ {
967
+ "cell_type": "code",
968
+ "execution_count": null,
969
+ "metadata": {
970
+ "id": "s5RI3fk9fOOz"
971
+ },
972
+ "outputs": [],
973
+ "source": [
974
+ "import numpy as np\n",
975
+ "from sense2vec import Sense2Vec\n",
976
+ "s2v = Sense2Vec().from_disk('s2v_old')"
977
+ ]
978
+ },
979
+ {
980
+ "cell_type": "code",
981
+ "execution_count": null,
982
+ "metadata": {
983
+ "id": "J2y3unpvfo1y"
984
+ },
985
+ "outputs": [],
986
+ "source": [
987
+ "from sentence_transformers import SentenceTransformer\n",
988
+ "# paraphrase-distilroberta-base-v1\n",
989
+ "sentence_transformer_model = SentenceTransformer('msmarco-distilbert-base-v3')"
990
+ ]
991
+ },
992
+ {
993
+ "cell_type": "code",
994
+ "execution_count": null,
995
+ "metadata": {
996
+ "id": "pvfmhuWVfsJb"
997
+ },
998
+ "outputs": [],
999
+ "source": [
1000
+ "from similarity.normalized_levenshtein import NormalizedLevenshtein\n",
1001
+ "normalized_levenshtein = NormalizedLevenshtein()\n",
1002
+ "\n",
1003
+ "def filter_same_sense_words(original,wordlist):\n",
1004
+ " filtered_words=[]\n",
1005
+ " base_sense =original.split('|')[1] \n",
1006
+ " print (base_sense)\n",
1007
+ " for eachword in wordlist:\n",
1008
+ " if eachword[0].split('|')[1] == base_sense:\n",
1009
+ " filtered_words.append(eachword[0].split('|')[0].replace(\"_\", \" \").title().strip())\n",
1010
+ " return filtered_words\n",
1011
+ "\n",
1012
+ "def get_highest_similarity_score(wordlist,wrd):\n",
1013
+ " score=[]\n",
1014
+ " for each in wordlist:\n",
1015
+ " score.append(normalized_levenshtein.similarity(each.lower(),wrd.lower()))\n",
1016
+ " return max(score)\n",
1017
+ "\n",
1018
+ "def sense2vec_get_words(word,s2v,topn,question):\n",
1019
+ " output = []\n",
1020
+ " print (\"word \",word)\n",
1021
+ " try:\n",
1022
+ " sense = s2v.get_best_sense(word, senses= [\"NOUN\", \"PERSON\",\"PRODUCT\",\"LOC\",\"ORG\",\"EVENT\",\"NORP\",\"WORK OF ART\",\"FAC\",\"GPE\",\"NUM\",\"FACILITY\"])\n",
1023
+ " most_similar = s2v.most_similar(sense, n=topn)\n",
1024
+ " # print (most_similar)\n",
1025
+ " output = filter_same_sense_words(sense,most_similar)\n",
1026
+ " print (\"Similar \",output)\n",
1027
+ " except:\n",
1028
+ " output =[]\n",
1029
+ "\n",
1030
+ " threshold = 0.6\n",
1031
+ " final=[word]\n",
1032
+ " checklist =question.split()\n",
1033
+ " for x in output:\n",
1034
+ " if get_highest_similarity_score(final,x)<threshold and x not in final and x not in checklist:\n",
1035
+ " final.append(x)\n",
1036
+ " \n",
1037
+ " return final[1:]\n",
1038
+ "\n",
1039
+ "def mmr(doc_embedding, word_embeddings, words, top_n, lambda_param):\n",
1040
+ "\n",
1041
+ " # Extract similarity within words, and between words and the document\n",
1042
+ " word_doc_similarity = cosine_similarity(word_embeddings, doc_embedding)\n",
1043
+ " word_similarity = cosine_similarity(word_embeddings)\n",
1044
+ "\n",
1045
+ " # Initialize candidates and already choose best keyword/keyphrase\n",
1046
+ " keywords_idx = [np.argmax(word_doc_similarity)]\n",
1047
+ " candidates_idx = [i for i in range(len(words)) if i != keywords_idx[0]]\n",
1048
+ "\n",
1049
+ " for _ in range(top_n - 1):\n",
1050
+ " # Extract similarities within candidates and\n",
1051
+ " # between candidates and selected keywords/phrases\n",
1052
+ " candidate_similarities = word_doc_similarity[candidates_idx, :]\n",
1053
+ " target_similarities = np.max(word_similarity[candidates_idx][:, keywords_idx], axis=1)\n",
1054
+ "\n",
1055
+ " # Calculate MMR\n",
1056
+ " mmr = (lambda_param) * candidate_similarities - (1-lambda_param) * target_similarities.reshape(-1, 1)\n",
1057
+ " mmr_idx = candidates_idx[np.argmax(mmr)]\n",
1058
+ "\n",
1059
+ " # Update keywords & candidates\n",
1060
+ " keywords_idx.append(mmr_idx)\n",
1061
+ " candidates_idx.remove(mmr_idx)\n",
1062
+ "\n",
1063
+ " return [words[idx] for idx in keywords_idx]"
1064
+ ]
1065
+ },
1066
+ {
1067
+ "cell_type": "code",
1068
+ "execution_count": null,
1069
+ "metadata": {
1070
+ "id": "UCN0-kXEfxwy"
1071
+ },
1072
+ "outputs": [],
1073
+ "source": [
1074
+ "from collections import OrderedDict\n",
1075
+ "from sklearn.metrics.pairwise import cosine_similarity\n",
1076
+ "import nltk\n",
1077
+ "nltk.download('omw-1.4')\n",
1078
+ "\n",
1079
+ "def get_distractors_wordnet(word):\n",
1080
+ " distractors=[]\n",
1081
+ " try:\n",
1082
+ " syn = wn.synsets(word,'n')[0]\n",
1083
+ " \n",
1084
+ " word= word.lower()\n",
1085
+ " orig_word = word\n",
1086
+ " if len(word.split())>0:\n",
1087
+ " word = word.replace(\" \",\"_\")\n",
1088
+ " hypernym = syn.hypernyms()\n",
1089
+ " if len(hypernym) == 0: \n",
1090
+ " return distractors\n",
1091
+ " for item in hypernym[0].hyponyms():\n",
1092
+ " name = item.lemmas()[0].name()\n",
1093
+ " #print (\"name \",name, \" word\",orig_word)\n",
1094
+ " if name == orig_word:\n",
1095
+ " continue\n",
1096
+ " name = name.replace(\"_\",\" \")\n",
1097
+ " name = \" \".join(w.capitalize() for w in name.split())\n",
1098
+ " if name is not None and name not in distractors:\n",
1099
+ " distractors.append(name)\n",
1100
+ " except:\n",
1101
+ " print (\"Wordnet distractors not found\")\n",
1102
+ " return distractors\n",
1103
+ "\n",
1104
+ "def get_distractors (word,origsentence,sense2vecmodel,sentencemodel,top_n,lambdaval):\n",
1105
+ " distractors = sense2vec_get_words(word,sense2vecmodel,top_n,origsentence)\n",
1106
+ " print (\"distractors \",distractors)\n",
1107
+ " if len(distractors) ==0:\n",
1108
+ " return distractors\n",
1109
+ " distractors_new = [word.capitalize()]\n",
1110
+ " distractors_new.extend(distractors)\n",
1111
+ " # print (\"distractors_new .. \",distractors_new)\n",
1112
+ "\n",
1113
+ " embedding_sentence = origsentence+ \" \"+word.capitalize()\n",
1114
+ " # embedding_sentence = word\n",
1115
+ " keyword_embedding = sentencemodel.encode([embedding_sentence])\n",
1116
+ " distractor_embeddings = sentencemodel.encode(distractors_new)\n",
1117
+ "\n",
1118
+ " # filtered_keywords = mmr(keyword_embedding, distractor_embeddings,distractors,4,0.7)\n",
1119
+ " max_keywords = min(len(distractors_new),5)\n",
1120
+ " filtered_keywords = mmr(keyword_embedding, distractor_embeddings,distractors_new,max_keywords,lambdaval)\n",
1121
+ " # filtered_keywords = filtered_keywords[1:]\n",
1122
+ " final = [word.capitalize()]\n",
1123
+ " for wrd in filtered_keywords:\n",
1124
+ " if wrd.lower() !=word.lower():\n",
1125
+ " final.append(wrd.capitalize())\n",
1126
+ " final = final[1:]\n",
1127
+ " return final\n",
1128
+ "\n",
1129
+ "sent = \"What cryptocurrency did Musk rarely tweet about?\"\n",
1130
+ "keyword = \"Bitcoin\"\n",
1131
+ "\n",
1132
+ "# sent = \"What did Musk say he was working with to improve system transaction efficiency?\"\n",
1133
+ "# keyword= \"Dogecoin\"\n",
1134
+ "\n",
1135
+ "\n",
1136
+ "# sent = \"What company did Musk say would not accept bitcoin payments?\"\n",
1137
+ "# keyword= \"Tesla\"\n",
1138
+ "\n",
1139
+ "\n",
1140
+ "# sent = \"What has Musk often tweeted in support of?\"\n",
1141
+ "# keyword = \"Cryptocurrency\"\n",
1142
+ "\n",
1143
+ "print (get_distractors(keyword,sent,s2v,sentence_transformer_model,40,0.2))\n"
1144
+ ]
1145
+ },
1146
+ {
1147
+ "cell_type": "code",
1148
+ "execution_count": null,
1149
+ "metadata": {
1150
+ "id": "s2FX-mGdf08p"
1151
+ },
1152
+ "outputs": [],
1153
+ "source": [
1154
+ "get_distractors_wordnet('lion')"
1155
+ ]
1156
+ },
1157
+ {
1158
+ "cell_type": "code",
1159
+ "execution_count": null,
1160
+ "metadata": {
1161
+ "id": "vgvffLecf4Cq"
1162
+ },
1163
+ "outputs": [],
1164
+ "source": [
1165
+ "import gradio as gr\n",
1166
+ "\n",
1167
+ "context = gr.inputs.Textbox(lines=10, placeholder=\"Enter paragraph/content here...\")\n",
1168
+ "output = gr.outputs.HTML( label=\"Question and Answers\")\n",
1169
+ "radiobutton = gr.inputs.Radio([\"Wordnet\", \"Sense2Vec\"])\n",
1170
+ "\n",
1171
+ "def generate_question(context,radiobutton):\n",
1172
+ " summary_text = summarizer(context,summary_model,summary_tokenizer)\n",
1173
+ " for wrp in wrap(summary_text, 100):\n",
1174
+ " print (wrp)\n",
1175
+ " # np = getnounphrases(summary_text,sentence_transformer_model,3)\n",
1176
+ " np = get_keywords(context,summary_text)\n",
1177
+ " print (\"\\n\\nNoun phrases\",np)\n",
1178
+ " output=\"\"\n",
1179
+ " for answer in np:\n",
1180
+ " ques = get_question(summary_text,answer,question_model,question_tokenizer)\n",
1181
+ " if radiobutton==\"Wordnet\":\n",
1182
+ " distractors = get_distractors_wordnet(answer)\n",
1183
+ " else:\n",
1184
+ " distractors = get_distractors(answer.capitalize(),ques,s2v,sentence_transformer_model,40,0.2)\n",
1185
+ " # output= output + ques + \"\\n\" + \"Ans: \"+answer.capitalize() + \"\\n\\n\"\n",
1186
+ " output = output + \"<b style='color:blue;'>\" + ques + \"</b>\"\n",
1187
+ " output = output + \"<br>\"\n",
1188
+ " output = output + \"<b style='color:green;'>\" + \"Ans: \" +answer.capitalize()+ \"</b>\"+\"<br>\"\n",
1189
+ " if len(distractors)>0:\n",
1190
+ " for distractor in distractors[:4]:\n",
1191
+ " output = output + \"<b style='color:brown;'>\" + distractor+ \"</b>\"+\"<br>\"\n",
1192
+ " output = output + \"<br>\"\n",
1193
+ "\n",
1194
+ " summary =\"Summary: \"+ summary_text\n",
1195
+ " for answer in np:\n",
1196
+ " summary = summary.replace(answer,\"<b>\"+answer+\"</b>\" + \"<br>\")\n",
1197
+ " summary = summary.replace(answer.capitalize(),\"<b>\"+answer.capitalize()+\"</b>\")\n",
1198
+ " output = output + \"<p>\"+summary+\"</p>\"\n",
1199
+ " output = output + \"<br>\"\n",
1200
+ " return output\n",
1201
+ "\n",
1202
+ "\n",
1203
+ "iface = gr.Interface(\n",
1204
+ " fn=generate_question, \n",
1205
+ " inputs=[context,radiobutton], \n",
1206
+ " outputs=output)\n",
1207
+ "iface.launch(debug=True)"
1208
+ ]
1209
+ },
1210
+ {
1211
+ "cell_type": "code",
1212
+ "execution_count": null,
1213
+ "metadata": {
1214
+ "id": "EhKGhA1ff7Hi"
1215
+ },
1216
+ "outputs": [],
1217
+ "source": [
1218
+ "import requests\n",
1219
+ "\n",
1220
+ "url = \"https://question-answer.p.rapidapi.com/question-answer\"\n",
1221
+ "\n",
1222
+ "querystring = {\"question\":\"What are some tips to starting up your own small business?\"}\n",
1223
+ "\n",
1224
+ "headers = {\n",
1225
+ "\t\"X-RapidAPI-Key\": \"SIGN-UP-FOR-KEY\",\n",
1226
+ "\t\"X-RapidAPI-Host\": \"question-answer.p.rapidapi.com\"\n",
1227
+ "}\n",
1228
+ "\n",
1229
+ "response = requests.request(\"GET\", url, headers=headers, params=querystring)\n",
1230
+ "\n",
1231
+ "print(response.text)"
1232
+ ]
1233
+ }
1234
+ ],
1235
+ "metadata": {
1236
+ "accelerator": "GPU",
1237
+ "colab": {
1238
+ "provenance": []
1239
+ },
1240
+ "gpuClass": "standard",
1241
+ "kernelspec": {
1242
+ "display_name": "Python 3",
1243
+ "name": "python3"
1244
+ },
1245
+ "language_info": {
1246
+ "name": "python"
1247
+ }
1248
+ },
1249
+ "nbformat": 4,
1250
+ "nbformat_minor": 0
1251
+ }