|
{ |
|
"cells": [ |
|
{ |
|
"cell_type": "code", |
|
"execution_count": null, |
|
"metadata": { |
|
"colab": { |
|
"background_save": true |
|
}, |
|
"id": "8JqpxyBueqTH", |
|
"outputId": "6c2c3908-9067-496c-ad64-74f21895232a" |
|
}, |
|
"outputs": [ |
|
{ |
|
"name": "stdout", |
|
"output_type": "stream", |
|
"text": [ |
|
" Building wheel for flashtext (setup.py) ... \u001b[?25l\u001b[?25hdone\n", |
|
"Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n", |
|
"Collecting git+https://github.com/boudinfl/pke.git\n", |
|
" Cloning https://github.com/boudinfl/pke.git to /tmp/pip-req-build-s0vst_dk\n", |
|
" Running command git clone -q https://github.com/boudinfl/pke.git /tmp/pip-req-build-s0vst_dk\n", |
|
"Requirement already satisfied: nltk in /usr/local/lib/python3.7/dist-packages (from pke==2.0.0) (3.7)\n", |
|
"Requirement already satisfied: networkx in /usr/local/lib/python3.7/dist-packages (from pke==2.0.0) (2.6.3)\n", |
|
"Requirement already satisfied: numpy in /usr/local/lib/python3.7/dist-packages (from pke==2.0.0) (1.21.6)\n", |
|
"Requirement already satisfied: scipy in /usr/local/lib/python3.7/dist-packages (from pke==2.0.0) (1.7.3)\n", |
|
"Collecting sklearn\n", |
|
" Downloading sklearn-0.0.post1.tar.gz (3.6 kB)\n", |
|
"Collecting unidecode\n", |
|
" Downloading Unidecode-1.3.6-py3-none-any.whl (235 kB)\n", |
|
"\u001b[K |ββββββββββββββββββββββββββββββββ| 235 kB 6.2 MB/s \n", |
|
"\u001b[?25hRequirement already satisfied: future in /usr/local/lib/python3.7/dist-packages (from pke==2.0.0) (0.16.0)\n", |
|
"Requirement already satisfied: joblib in /usr/local/lib/python3.7/dist-packages (from pke==2.0.0) (1.2.0)\n", |
|
"Requirement already satisfied: spacy>=3.2.3 in /usr/local/lib/python3.7/dist-packages (from pke==2.0.0) (3.4.3)\n", |
|
"Requirement already satisfied: cymem<2.1.0,>=2.0.2 in /usr/local/lib/python3.7/dist-packages (from spacy>=3.2.3->pke==2.0.0) (2.0.7)\n", |
|
"Requirement already satisfied: typing-extensions<4.2.0,>=3.7.4 in /usr/local/lib/python3.7/dist-packages (from spacy>=3.2.3->pke==2.0.0) (4.1.1)\n", |
|
"Requirement already satisfied: spacy-loggers<2.0.0,>=1.0.0 in /usr/local/lib/python3.7/dist-packages (from spacy>=3.2.3->pke==2.0.0) (1.0.3)\n", |
|
"Requirement already satisfied: setuptools in /usr/local/lib/python3.7/dist-packages (from spacy>=3.2.3->pke==2.0.0) (57.4.0)\n", |
|
"Requirement already satisfied: spacy-legacy<3.1.0,>=3.0.10 in /usr/local/lib/python3.7/dist-packages (from spacy>=3.2.3->pke==2.0.0) (3.0.10)\n", |
|
"Requirement already satisfied: wasabi<1.1.0,>=0.9.1 in /usr/local/lib/python3.7/dist-packages (from spacy>=3.2.3->pke==2.0.0) (0.10.1)\n", |
|
"Requirement already satisfied: typer<0.8.0,>=0.3.0 in /usr/local/lib/python3.7/dist-packages (from spacy>=3.2.3->pke==2.0.0) (0.7.0)\n", |
|
"Requirement already satisfied: thinc<8.2.0,>=8.1.0 in /usr/local/lib/python3.7/dist-packages (from spacy>=3.2.3->pke==2.0.0) (8.1.5)\n", |
|
"Requirement already satisfied: srsly<3.0.0,>=2.4.3 in /usr/local/lib/python3.7/dist-packages (from spacy>=3.2.3->pke==2.0.0) (2.4.5)\n", |
|
"Requirement already satisfied: preshed<3.1.0,>=3.0.2 in /usr/local/lib/python3.7/dist-packages (from spacy>=3.2.3->pke==2.0.0) (3.0.8)\n", |
|
"Requirement already satisfied: tqdm<5.0.0,>=4.38.0 in /usr/local/lib/python3.7/dist-packages (from spacy>=3.2.3->pke==2.0.0) (4.64.1)\n", |
|
"Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.7/dist-packages (from spacy>=3.2.3->pke==2.0.0) (21.3)\n", |
|
"Requirement already satisfied: murmurhash<1.1.0,>=0.28.0 in /usr/local/lib/python3.7/dist-packages (from spacy>=3.2.3->pke==2.0.0) (1.0.9)\n", |
|
"Requirement already satisfied: pathy>=0.3.5 in /usr/local/lib/python3.7/dist-packages (from spacy>=3.2.3->pke==2.0.0) (0.8.1)\n", |
|
"Requirement already satisfied: pydantic!=1.8,!=1.8.1,<1.11.0,>=1.7.4 in /usr/local/lib/python3.7/dist-packages (from spacy>=3.2.3->pke==2.0.0) (1.10.2)\n", |
|
"Requirement already satisfied: requests<3.0.0,>=2.13.0 in /usr/local/lib/python3.7/dist-packages (from spacy>=3.2.3->pke==2.0.0) (2.23.0)\n", |
|
"Requirement already satisfied: langcodes<4.0.0,>=3.2.0 in /usr/local/lib/python3.7/dist-packages (from spacy>=3.2.3->pke==2.0.0) (3.3.0)\n", |
|
"Requirement already satisfied: catalogue<2.1.0,>=2.0.6 in /usr/local/lib/python3.7/dist-packages (from spacy>=3.2.3->pke==2.0.0) (2.0.8)\n", |
|
"Requirement already satisfied: jinja2 in /usr/local/lib/python3.7/dist-packages (from spacy>=3.2.3->pke==2.0.0) (2.11.3)\n", |
|
"Requirement already satisfied: zipp>=0.5 in /usr/local/lib/python3.7/dist-packages (from catalogue<2.1.0,>=2.0.6->spacy>=3.2.3->pke==2.0.0) (3.10.0)\n", |
|
"Requirement already satisfied: pyparsing!=3.0.5,>=2.0.2 in /usr/local/lib/python3.7/dist-packages (from packaging>=20.0->spacy>=3.2.3->pke==2.0.0) (3.0.9)\n", |
|
"Requirement already satisfied: smart-open<6.0.0,>=5.2.1 in /usr/local/lib/python3.7/dist-packages (from pathy>=0.3.5->spacy>=3.2.3->pke==2.0.0) (5.2.1)\n", |
|
"Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.7/dist-packages (from requests<3.0.0,>=2.13.0->spacy>=3.2.3->pke==2.0.0) (2.10)\n", |
|
"Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.7/dist-packages (from requests<3.0.0,>=2.13.0->spacy>=3.2.3->pke==2.0.0) (2022.9.24)\n", |
|
"Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.7/dist-packages (from requests<3.0.0,>=2.13.0->spacy>=3.2.3->pke==2.0.0) (3.0.4)\n", |
|
"Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.7/dist-packages (from requests<3.0.0,>=2.13.0->spacy>=3.2.3->pke==2.0.0) (1.24.3)\n", |
|
"Requirement already satisfied: confection<1.0.0,>=0.0.1 in /usr/local/lib/python3.7/dist-packages (from thinc<8.2.0,>=8.1.0->spacy>=3.2.3->pke==2.0.0) (0.0.3)\n", |
|
"Requirement already satisfied: blis<0.8.0,>=0.7.8 in /usr/local/lib/python3.7/dist-packages (from thinc<8.2.0,>=8.1.0->spacy>=3.2.3->pke==2.0.0) (0.7.9)\n", |
|
"Requirement already satisfied: click<9.0.0,>=7.1.1 in /usr/local/lib/python3.7/dist-packages (from typer<0.8.0,>=0.3.0->spacy>=3.2.3->pke==2.0.0) (7.1.2)\n", |
|
"Requirement already satisfied: MarkupSafe>=0.23 in /usr/local/lib/python3.7/dist-packages (from jinja2->spacy>=3.2.3->pke==2.0.0) (2.0.1)\n", |
|
"Requirement already satisfied: regex>=2021.8.3 in /usr/local/lib/python3.7/dist-packages (from nltk->pke==2.0.0) (2022.6.2)\n", |
|
"Building wheels for collected packages: pke, sklearn\n", |
|
" Building wheel for pke (setup.py) ... \u001b[?25l\u001b[?25hdone\n", |
|
" Created wheel for pke: filename=pke-2.0.0-py3-none-any.whl size=6160276 sha256=6967c9216d570e0bbc7bab2c16f5f1810ecd62dcc9fad636e26ff35edbab3a68\n", |
|
" Stored in directory: /tmp/pip-ephem-wheel-cache-_mu5g7sn/wheels/fa/b3/09/612ee93bf3ee4164bcd5783e742942cdfc892a86039d3e0a33\n", |
|
" Building wheel for sklearn (setup.py) ... \u001b[?25l\u001b[?25hdone\n", |
|
" Created wheel for sklearn: filename=sklearn-0.0.post1-py3-none-any.whl size=2344 sha256=47f5287c3e5d1518e0617e1db17d093069e553338d6c0e359aa70352e6c78d66\n", |
|
" Stored in directory: /root/.cache/pip/wheels/42/56/cc/4a8bf86613aafd5b7f1b310477667c1fca5c51c3ae4124a003\n", |
|
"Successfully built pke sklearn\n", |
|
"Installing collected packages: unidecode, sklearn, pke\n", |
|
"Successfully installed pke-2.0.0 sklearn-0.0.post1 unidecode-1.3.6\n" |
|
] |
|
} |
|
], |
|
"source": [ |
|
"!pip install --quiet flashtext==2.7\n", |
|
"!pip install git+https://github.com/boudinfl/pke.git\n" |
|
] |
|
}, |
|
{ |
|
"cell_type": "code", |
|
"execution_count": null, |
|
"metadata": { |
|
"id": "am3XUlr5evYK" |
|
}, |
|
"outputs": [], |
|
"source": [ |
|
"!pip install --quiet transformers==4.8.1\n", |
|
"!pip install --quiet sentencepiece==0.1.95\n", |
|
"!pip install --quiet textwrap3==0.9.2\n", |
|
"!pip install gradio" |
|
] |
|
}, |
|
{ |
|
"cell_type": "code", |
|
"execution_count": null, |
|
"metadata": { |
|
"colab": { |
|
"background_save": true |
|
}, |
|
"id": "mhwpLyuBfFUK", |
|
"outputId": "dc6f4900-429d-4815-c98c-b8625efcbe7b" |
|
}, |
|
"outputs": [ |
|
{ |
|
"name": "stdout", |
|
"output_type": "stream", |
|
"text": [ |
|
"\u001b[?25l\r\u001b[K |ββββββββ | 10 kB 27.7 MB/s eta 0:00:01\r\u001b[K |ββββββββββββββββ | 20 kB 34.6 MB/s eta 0:00:01\r\u001b[K |ββββββββββββββββββββββββ | 30 kB 15.4 MB/s eta 0:00:01\r\u001b[K |βββββββββββββββββββββββββββββββ | 40 kB 6.6 MB/s eta 0:00:01\r\u001b[K |ββββββββββββββββββββββββββββββββ| 42 kB 955 kB/s \n", |
|
"\u001b[?25h" |
|
] |
|
} |
|
], |
|
"source": [ |
|
"!pip install --quiet strsim==0.0.3\n", |
|
"!pip install --quiet sense2vec==2.0.0" |
|
] |
|
}, |
|
{ |
|
"cell_type": "code", |
|
"execution_count": null, |
|
"metadata": { |
|
"colab": { |
|
"background_save": true |
|
}, |
|
"id": "NcNXz17EfQLJ", |
|
"outputId": "c90851f7-e320-48e3-d994-fcc5c174c636" |
|
}, |
|
"outputs": [ |
|
{ |
|
"name": "stdout", |
|
"output_type": "stream", |
|
"text": [ |
|
"\u001b[?25l\r\u001b[K |β | 10 kB 10.5 MB/s eta 0:00:01\r\u001b[K |β | 20 kB 7.8 MB/s eta 0:00:01\r\u001b[K |β | 30 kB 11.1 MB/s eta 0:00:01\r\u001b[K |β | 40 kB 6.3 MB/s eta 0:00:01\r\u001b[K |β | 51 kB 6.3 MB/s eta 0:00:01\r\u001b[K |ββ | 61 kB 7.4 MB/s eta 0:00:01\r\u001b[K |ββ | 71 kB 7.9 MB/s eta 0:00:01\r\u001b[K |ββ | 81 kB 8.7 MB/s eta 0:00:01\r\u001b[K |ββ | 92 kB 8.7 MB/s eta 0:00:01\r\u001b[K |ββ | 102 kB 7.5 MB/s eta 0:00:01\r\u001b[K |βββ | 112 kB 7.5 MB/s eta 0:00:01\r\u001b[K |βββ | 122 kB 7.5 MB/s eta 0:00:01\r\u001b[K |βββ | 133 kB 7.5 MB/s eta 0:00:01\r\u001b[K |βββ | 143 kB 7.5 MB/s eta 0:00:01\r\u001b[K |ββββ | 153 kB 7.5 MB/s eta 0:00:01\r\u001b[K |ββββ | 163 kB 7.5 MB/s eta 0:00:01\r\u001b[K |ββββ | 174 kB 7.5 MB/s eta 0:00:01\r\u001b[K |ββββ | 184 kB 7.5 MB/s eta 0:00:01\r\u001b[K |ββββ | 194 kB 7.5 MB/s eta 0:00:01\r\u001b[K |βββββ | 204 kB 7.5 MB/s eta 0:00:01\r\u001b[K |βββββ | 215 kB 7.5 MB/s eta 0:00:01\r\u001b[K |βββββ | 225 kB 7.5 MB/s eta 0:00:01\r\u001b[K |βββββ | 235 kB 7.5 MB/s eta 0:00:01\r\u001b[K |βββββ | 245 kB 7.5 MB/s eta 0:00:01\r\u001b[K |ββββββ | 256 kB 7.5 MB/s eta 0:00:01\r\u001b[K |ββββββ | 266 kB 7.5 MB/s eta 0:00:01\r\u001b[K |ββββββ | 276 kB 7.5 MB/s eta 0:00:01\r\u001b[K |ββββββ | 286 kB 7.5 MB/s eta 0:00:01\r\u001b[K |ββββββ | 296 kB 7.5 MB/s eta 0:00:01\r\u001b[K |βββββββ | 307 kB 7.5 MB/s eta 0:00:01\r\u001b[K |βββββββ | 317 kB 7.5 MB/s eta 0:00:01\r\u001b[K |βββββββ | 327 kB 7.5 MB/s eta 0:00:01\r\u001b[K |βββββββ | 337 kB 7.5 MB/s eta 0:00:01\r\u001b[K |βββββββ | 348 kB 7.5 MB/s eta 0:00:01\r\u001b[K |ββββββββ | 358 kB 7.5 MB/s eta 0:00:01\r\u001b[K |ββββββββ | 368 kB 7.5 MB/s eta 0:00:01\r\u001b[K |ββββββββ | 378 kB 7.5 MB/s eta 0:00:01\r\u001b[K |ββββββββ | 389 kB 7.5 MB/s eta 0:00:01\r\u001b[K |βββββββββ | 399 kB 7.5 MB/s eta 0:00:01\r\u001b[K |βββββββββ | 409 kB 7.5 MB/s eta 0:00:01\r\u001b[K |βββββββββ | 419 kB 7.5 MB/s eta 0:00:01\r\u001b[K |βββββββββ | 430 kB 7.5 MB/s eta 0:00:01\r\u001b[K |βββββββββ | 
440 kB 7.5 MB/s eta 0:00:01\r\u001b[K |ββββββββββ | 450 kB 7.5 MB/s eta 0:00:01\r\u001b[K |ββββββββββ | 460 kB 7.5 MB/s eta 0:00:01\r\u001b[K |ββββββββββ | 471 kB 7.5 MB/s eta 0:00:01\r\u001b[K |ββββββββββ | 481 kB 7.5 MB/s eta 0:00:01\r\u001b[K |ββββββββββ | 491 kB 7.5 MB/s eta 0:00:01\r\u001b[K |βββββββββββ | 501 kB 7.5 MB/s eta 0:00:01\r\u001b[K |βββββββββββ | 512 kB 7.5 MB/s eta 0:00:01\r\u001b[K |βββββββββββ | 522 kB 7.5 MB/s eta 0:00:01\r\u001b[K |βββββββββββ | 532 kB 7.5 MB/s eta 0:00:01\r\u001b[K |βββββββββββ | 542 kB 7.5 MB/s eta 0:00:01\r\u001b[K |ββββββββββββ | 552 kB 7.5 MB/s eta 0:00:01\r\u001b[K |ββββββββββββ | 563 kB 7.5 MB/s eta 0:00:01\r\u001b[K |ββββββββββββ | 573 kB 7.5 MB/s eta 0:00:01\r\u001b[K |ββββββββββββ | 583 kB 7.5 MB/s eta 0:00:01\r\u001b[K |βββββββββββββ | 593 kB 7.5 MB/s eta 0:00:01\r\u001b[K |βββββββββββββ | 604 kB 7.5 MB/s eta 0:00:01\r\u001b[K |βββββββββββββ | 614 kB 7.5 MB/s eta 0:00:01\r\u001b[K |βββββββββββββ | 624 kB 7.5 MB/s eta 0:00:01\r\u001b[K |βββββββββββββ | 634 kB 7.5 MB/s eta 0:00:01\r\u001b[K |ββββββββββββββ | 645 kB 7.5 MB/s eta 0:00:01\r\u001b[K |ββββββββββββββ | 655 kB 7.5 MB/s eta 0:00:01\r\u001b[K |ββββββββββββββ | 665 kB 7.5 MB/s eta 0:00:01\r\u001b[K |ββββββββββββββ | 675 kB 7.5 MB/s eta 0:00:01\r\u001b[K |ββββββββββββββ | 686 kB 7.5 MB/s eta 0:00:01\r\u001b[K |βββββββββββββββ | 696 kB 7.5 MB/s eta 0:00:01\r\u001b[K |βββββββββββββββ | 706 kB 7.5 MB/s eta 0:00:01\r\u001b[K |βββββββββββββββ | 716 kB 7.5 MB/s eta 0:00:01\r\u001b[K |βββββββββββββββ | 727 kB 7.5 MB/s eta 0:00:01\r\u001b[K |βββββββββββββββ | 737 kB 7.5 MB/s eta 0:00:01\r\u001b[K |ββββββββββββββββ | 747 kB 7.5 MB/s eta 0:00:01\r\u001b[K |ββββββββββββββββ | 757 kB 7.5 MB/s eta 0:00:01\r\u001b[K |ββββββββββββββββ | 768 kB 7.5 MB/s eta 0:00:01\r\u001b[K |ββββββββββββββββ | 778 kB 7.5 MB/s eta 0:00:01\r\u001b[K |ββββββββββββββββ | 788 kB 7.5 MB/s eta 0:00:01\r\u001b[K |βββββββββββββββββ | 798 kB 7.5 MB/s eta 0:00:01\r\u001b[K |βββββββββββββββββ | 808 kB 7.5 
MB/s eta 0:00:01\r\u001b[K |βββββββββββββββββ | 819 kB 7.5 MB/s eta 0:00:01\r\u001b[K |βββββββββββββββββ | 829 kB 7.5 MB/s eta 0:00:01\r\u001b[K |ββββββββββββββββββ | 839 kB 7.5 MB/s eta 0:00:01\r\u001b[K |ββββββββββββββββββ | 849 kB 7.5 MB/s eta 0:00:01\r\u001b[K |ββββββββββββββββββ | 860 kB 7.5 MB/s eta 0:00:01\r\u001b[K |ββββββββββββββββββ | 870 kB 7.5 MB/s eta 0:00:01\r\u001b[K |ββββββββββββββββββ | 880 kB 7.5 MB/s eta 0:00:01\r\u001b[K |βββββββββββββββββββ | 890 kB 7.5 MB/s eta 0:00:01\r\u001b[K |βββββββββββββββββββ | 901 kB 7.5 MB/s eta 0:00:01\r\u001b[K |βββββββββββββββββββ | 911 kB 7.5 MB/s eta 0:00:01\r\u001b[K |βββββββββββββββββββ | 921 kB 7.5 MB/s eta 0:00:01\r\u001b[K |βββββββββββββββββββ | 931 kB 7.5 MB/s eta 0:00:01\r\u001b[K |ββββββββββββββββββββ | 942 kB 7.5 MB/s eta 0:00:01\r\u001b[K |ββββββββββββββββββββ | 952 kB 7.5 MB/s eta 0:00:01\r\u001b[K |ββββββββββββββββββββ | 962 kB 7.5 MB/s eta 0:00:01\r\u001b[K |ββββββββββββββββββββ | 972 kB 7.5 MB/s eta 0:00:01\r\u001b[K |ββββββββββββββββββββ | 983 kB 7.5 MB/s eta 0:00:01\r\u001b[K |βββββββββββββββββββββ | 993 kB 7.5 MB/s eta 0:00:01\r\u001b[K |βββββββββββββββββββββ | 1.0 MB 7.5 MB/s eta 0:00:01\r\u001b[K |βββββββββββββββββββββ | 1.0 MB 7.5 MB/s eta 0:00:01\r\u001b[K |βββββββββββββββββββββ | 1.0 MB 7.5 MB/s eta 0:00:01\r\u001b[K |βββββββββββββββββββββ | 1.0 MB 7.5 MB/s eta 0:00:01\r\u001b[K |ββββββββββββββββββββββ | 1.0 MB 7.5 MB/s eta 0:00:01\r\u001b[K |ββββββββββββββββββββββ | 1.1 MB 7.5 MB/s eta 0:00:01\r\u001b[K |ββββββββββββββββββββββ | 1.1 MB 7.5 MB/s eta 0:00:01\r\u001b[K |ββββββββββββββββββββββ | 1.1 MB 7.5 MB/s eta 0:00:01\r\u001b[K |βββββββββββββββββββββββ | 1.1 MB 7.5 MB/s eta 0:00:01\r\u001b[K |βββββββββββββββββββββββ | 1.1 MB 7.5 MB/s eta 0:00:01\r\u001b[K |βββββββββββββββββββββββ | 1.1 MB 7.5 MB/s eta 0:00:01\r\u001b[K |βββββββββββββββββββββββ | 1.1 MB 7.5 MB/s eta 0:00:01\r\u001b[K |βββββββββββββββββββββββ | 1.1 MB 7.5 MB/s eta 0:00:01\r\u001b[K |ββββββββββββββββββββββββ | 1.1 MB 7.5 MB/s 
eta 0:00:01\r\u001b[K |ββββββββββββββββββββββββ | 1.1 MB 7.5 MB/s eta 0:00:01\r\u001b[K |ββββββββββββββββββββββββ | 1.2 MB 7.5 MB/s eta 0:00:01\r\u001b[K |ββββββββββββββββββββββββ | 1.2 MB 7.5 MB/s eta 0:00:01\r\u001b[K |ββββββββββββββββββββββββ | 1.2 MB 7.5 MB/s eta 0:00:01\r\u001b[K |βββββββββββββββββββββββββ | 1.2 MB 7.5 MB/s eta 0:00:01\r\u001b[K |βββββββββββββββββββββββββ | 1.2 MB 7.5 MB/s eta 0:00:01\r\u001b[K |βββββββββββββββββββββββββ | 1.2 MB 7.5 MB/s eta 0:00:01\r\u001b[K |βββββββββββββββββββββββββ | 1.2 MB 7.5 MB/s eta 0:00:01\r\u001b[K |βββββββββββββββββββββββββ | 1.2 MB 7.5 MB/s eta 0:00:01\r\u001b[K |ββββββββββββββββββββββββββ | 1.2 MB 7.5 MB/s eta 0:00:01\r\u001b[K |ββββββββββββββββββββββββββ | 1.2 MB 7.5 MB/s eta 0:00:01\r\u001b[K |ββββββββββββββββββββββββββ | 1.3 MB 7.5 MB/s eta 0:00:01\r\u001b[K |ββββββββββββββββββββββββββ | 1.3 MB 7.5 MB/s eta 0:00:01\r\u001b[K |βββββββββββββββββββββββββββ | 1.3 MB 7.5 MB/s eta 0:00:01\r\u001b[K |βββββββββββββββββββββββββββ | 1.3 MB 7.5 MB/s eta 0:00:01\r\u001b[K |βββββββββββββββββββββββββββ | 1.3 MB 7.5 MB/s eta 0:00:01\r\u001b[K |βββββββββββββββββββββββββββ | 1.3 MB 7.5 MB/s eta 0:00:01\r\u001b[K |βββββββββββββββββββββββββββ | 1.3 MB 7.5 MB/s eta 0:00:01\r\u001b[K |ββββββββββββββββββββββββββββ | 1.3 MB 7.5 MB/s eta 0:00:01\r\u001b[K |ββββββββββββββββββββββββββββ | 1.3 MB 7.5 MB/s eta 0:00:01\r\u001b[K |ββββββββββββββββββββββββββββ | 1.4 MB 7.5 MB/s eta 0:00:01\r\u001b[K |ββββββββββββββββββββββββββββ | 1.4 MB 7.5 MB/s eta 0:00:01\r\u001b[K |ββββββββββββββββββββββββββββ | 1.4 MB 7.5 MB/s eta 0:00:01\r\u001b[K |βββββββββββββββββββββββββββββ | 1.4 MB 7.5 MB/s eta 0:00:01\r\u001b[K |βββββββββββββββββββββββββββββ | 1.4 MB 7.5 MB/s eta 0:00:01\r\u001b[K |βββββββββββββββββββββββββββββ | 1.4 MB 7.5 MB/s eta 0:00:01\r\u001b[K |βββββββββββββββββββββββββββββ | 1.4 MB 7.5 MB/s eta 0:00:01\r\u001b[K |βββββββββββββββββββββββββββββ | 1.4 MB 7.5 MB/s eta 0:00:01\r\u001b[K |ββββββββββββββββββββββββββββββ | 1.4 MB 7.5 MB/s eta 
0:00:01\r\u001b[K |ββββββββββββββββββββββββββββββ | 1.4 MB 7.5 MB/s eta 0:00:01\r\u001b[K |ββββββββββββββββββββββββββββββ | 1.5 MB 7.5 MB/s eta 0:00:01\r\u001b[K |ββββββββββββββββββββββββββββββ | 1.5 MB 7.5 MB/s eta 0:00:01\r\u001b[K |ββββββββββββββββββββββββββββββ | 1.5 MB 7.5 MB/s eta 0:00:01\r\u001b[K |βββββββββββββββββββββββββββββββ | 1.5 MB 7.5 MB/s eta 0:00:01\r\u001b[K |βββββββββββββββββββββββββββββββ | 1.5 MB 7.5 MB/s eta 0:00:01\r\u001b[K |βββββββββββββββββββββββββββββββ | 1.5 MB 7.5 MB/s eta 0:00:01\r\u001b[K |βββββββββββββββββββββββββββββββ | 1.5 MB 7.5 MB/s eta 0:00:01\r\u001b[K |ββββββββββββββββββββββββββββββββ| 1.5 MB 7.5 MB/s eta 0:00:01\r\u001b[K |ββββββββββββββββββββββββββββββββ| 1.5 MB 7.5 MB/s eta 0:00:01\r\u001b[K |ββββββββββββββββββββββββββββββββ| 1.5 MB 7.5 MB/s eta 0:00:01\r\u001b[K |ββββββββββββββββββββββββββββββββ| 1.6 MB 7.5 MB/s eta 0:00:01\r\u001b[K |ββββββββββββββββββββββββββββββββ| 1.6 MB 7.5 MB/s eta 0:00:01\r\u001b[K |ββββββββββββββββββββββββββββββββ| 1.6 MB 7.5 MB/s \n", |
|
"\u001b[?25htime: 506 µs (started: 2022-11-24 06:06:09 +00:00)\n" |
|
] |
|
} |
|
], |
|
"source": [ |
|
"!pip install --quiet ipython-autotime\n", |
|
"%load_ext autotime" |
|
] |
|
}, |
|
{ |
|
"cell_type": "code", |
|
"execution_count": null, |
|
"metadata": { |
|
"colab": { |
|
"background_save": true |
|
}, |
|
"id": "Bijc_hfbfUwp", |
|
"outputId": "54a7f895-8f08-452d-8f3a-8e5310a1aa6c" |
|
}, |
|
"outputs": [ |
|
{ |
|
"name": "stdout", |
|
"output_type": "stream", |
|
"text": [ |
|
"\u001b[K |ββββββββββββββββββββββββββββββββ| 85 kB 3.9 MB/s \n", |
|
"\u001b[K |ββββββββββββββββββββββββββββββββ| 182 kB 49.1 MB/s \n", |
|
"\u001b[K |ββββββββββββββββββββββββββββββββ| 5.5 MB 54.9 MB/s \n", |
|
"\u001b[K |ββββββββββββββββββββββββββββββββ| 7.6 MB 55.0 MB/s \n", |
|
"\u001b[?25h Building wheel for sentence-transformers (setup.py) ... \u001b[?25l\u001b[?25hdone\n", |
|
"time: 10.4 s (started: 2022-11-24 06:06:09 +00:00)\n" |
|
] |
|
} |
|
], |
|
"source": [ |
|
"!pip install --quiet sentence-transformers==2.2.2" |
|
] |
|
}, |
|
{ |
|
"cell_type": "markdown", |
|
"metadata": { |
|
"id": "bmVx9L0yfgvR" |
|
}, |
|
"source": [ |
|
"The code below restarts the Colab notebook. Once it has restarted, continue from the next section; there is no need to run this installation section again." |
|
] |
|
}, |
|
{ |
|
"cell_type": "code", |
|
"execution_count": null, |
|
"metadata": { |
|
"colab": { |
|
"background_save": true |
|
}, |
|
"id": "uPO9U__1fZWh", |
|
"outputId": "31e8d745-2a88-4bd6-f136-55cd2147ee3f" |
|
}, |
|
"outputs": [ |
|
{ |
|
"name": "stdout", |
|
"output_type": "stream", |
|
"text": [ |
|
"time: 556 µs (started: 2022-11-24 06:06:20 +00:00)\n" |
|
] |
|
} |
|
], |
|
"source": [ |
|
"# import os\n", |
|
"# os.kill(os.getpid(), 9)" |
|
] |
|
}, |
|
{ |
|
"cell_type": "markdown", |
|
"metadata": { |
|
"id": "POh2_zvgrk0h" |
|
}, |
|
"source": [ |
|
"## Example 1" |
|
] |
|
}, |
|
{ |
|
"cell_type": "markdown", |
|
"metadata": { |
|
"id": "VJP4CDBBrnNY" |
|
}, |
|
"source": [ |
|
"Text taken from: \n", |
|
"https://gadgets.ndtv.com/internet/news/dogecoin-price-rally-surge-elon-musk-tweet-twitter-working-developers-improve-transaction-efficiency-2442120" |
|
] |
|
}, |
|
{ |
|
"cell_type": "code", |
|
"execution_count": null, |
|
"metadata": { |
|
"colab": { |
|
"background_save": true |
|
}, |
|
"id": "P_jlw7MUfjOp", |
|
"outputId": "fd3e08da-3595-445d-941f-2c8047e34f08" |
|
}, |
|
"outputs": [ |
|
{ |
|
"name": "stdout", |
|
"output_type": "stream", |
|
"text": [ |
|
"Elon Musk has shown again he can influence the digital currency market with just his tweets. After saying that his electric vehicle-making company\n", |
|
"Tesla will not accept payments in Bitcoin because of environmental concerns, he tweeted that he was working with developers of Dogecoin to improve\n", |
|
"system transaction efficiency. Following the two distinct statements from him, the world's largest cryptocurrency hit a two-month low, while Dogecoin\n", |
|
"rallied by about 20 percent. The SpaceX CEO has in recent months often tweeted in support of Dogecoin, but rarely for Bitcoin. In a recent tweet,\n", |
|
"Musk put out a statement from Tesla that it was βconcernedβ about the rapidly increasing use of fossil fuels for Bitcoin (price in India) mining and\n", |
|
"transaction, and hence was suspending vehicle purchases using the cryptocurrency. A day later he again tweeted saying, βTo be clear, I strongly\n", |
|
"believe in crypto, but it can't drive a massive increase in fossil fuel use, especially coalβ. It triggered a downward spiral for Bitcoin value but\n", |
|
"the cryptocurrency has stabilised since. A number of Twitter users welcomed Musk's statement. One of them said it's time people started realising\n", |
|
"that Dogecoin βis here to stayβ and another referred to Musk's previous assertion that crypto could become the world's future currency.\n", |
|
"\n", |
|
"\n", |
|
"time: 18.8 ms (started: 2022-11-24 06:06:20 +00:00)\n" |
|
] |
|
} |
|
], |
|
"source": [ |
|
"from textwrap3 import wrap\n", |
|
"\n", |
|
"text = \"\"\"Elon Musk has shown again he can influence the digital currency market with just his tweets. After saying that his electric vehicle-making company\n", |
|
"Tesla will not accept payments in Bitcoin because of environmental concerns, he tweeted that he was working with developers of Dogecoin to improve\n", |
|
"system transaction efficiency. Following the two distinct statements from him, the world's largest cryptocurrency hit a two-month low, while Dogecoin\n", |
|
"rallied by about 20 percent. The SpaceX CEO has in recent months often tweeted in support of Dogecoin, but rarely for Bitcoin. In a recent tweet,\n", |
|
"Musk put out a statement from Tesla that it was βconcernedβ about the rapidly increasing use of fossil fuels for Bitcoin (price in India) mining and\n", |
|
"transaction, and hence was suspending vehicle purchases using the cryptocurrency. A day later he again tweeted saying, βTo be clear, I strongly\n", |
|
"believe in crypto, but it can't drive a massive increase in fossil fuel use, especially coalβ. It triggered a downward spiral for Bitcoin value but\n", |
|
"the cryptocurrency has stabilised since. A number of Twitter users welcomed Musk's statement. One of them said it's time people started realising\n", |
|
"that Dogecoin βis here to stayβ and another referred to Musk's previous assertion that crypto could become the world's future currency.\"\"\"\n", |
|
"\n", |
|
"# Echo the article wrapped at 150 characters per line, then a blank spacer.\n", |
|
"for wrapped_line in wrap(text, 150):\n", |
|
"    print(wrapped_line)\n", |
|
"print(\"\\n\")" |
|
] |
|
}, |
|
{ |
|
"cell_type": "markdown", |
|
"metadata": { |
|
"id": "ShPNEZz8u7s6" |
|
}, |
|
"source": [ |
|
"# **Summarization with T5**" |
|
] |
|
}, |
|
{ |
|
"cell_type": "code", |
|
"execution_count": null, |
|
"metadata": { |
|
"colab": { |
|
"background_save": true, |
|
"referenced_widgets": [ |
|
"c9c2e5d5824345f780befcf11d6ff946", |
|
"c39b4e7e424d4f64a8fb25495f8c7026", |
|
"543714c7a41a4429a57a069bc2eca1dc" |
|
] |
|
}, |
|
"id": "H1eIU521rrn5", |
|
"outputId": "d3bb1402-1cba-4881-b05f-b8e24bb19278" |
|
}, |
|
"outputs": [ |
|
{ |
|
"data": { |
|
"application/vnd.jupyter.widget-view+json": { |
|
"model_id": "c9c2e5d5824345f780befcf11d6ff946", |
|
"version_major": 2, |
|
"version_minor": 0 |
|
}, |
|
"text/plain": [ |
|
"Downloading: 0%| | 0.00/1.20k [00:00<?, ?B/s]" |
|
] |
|
}, |
|
"metadata": {}, |
|
"output_type": "display_data" |
|
}, |
|
{ |
|
"data": { |
|
"application/vnd.jupyter.widget-view+json": { |
|
"model_id": "c39b4e7e424d4f64a8fb25495f8c7026", |
|
"version_major": 2, |
|
"version_minor": 0 |
|
}, |
|
"text/plain": [ |
|
"Downloading: 0%| | 0.00/892M [00:00<?, ?B/s]" |
|
] |
|
}, |
|
"metadata": {}, |
|
"output_type": "display_data" |
|
}, |
|
{ |
|
"data": { |
|
"application/vnd.jupyter.widget-view+json": { |
|
"model_id": "543714c7a41a4429a57a069bc2eca1dc", |
|
"version_major": 2, |
|
"version_minor": 0 |
|
}, |
|
"text/plain": [ |
|
"Downloading: 0%| | 0.00/792k [00:00<?, ?B/s]" |
|
] |
|
}, |
|
"metadata": {}, |
|
"output_type": "display_data" |
|
}, |
|
{ |
|
"name": "stderr", |
|
"output_type": "stream", |
|
"text": [ |
|
"/usr/local/lib/python3.7/dist-packages/transformers/models/t5/tokenization_t5.py:174: FutureWarning: This tokenizer was incorrectly instantiated with a model max length of 512 which will be corrected in Transformers v5.\n", |
|
"For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.\n", |
|
"- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.\n", |
|
"- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.\n", |
|
"- To avoid this warning, please instantiate this tokenizer with `model_max_length` set to your preferred value.\n", |
|
" FutureWarning,\n" |
|
] |
|
}, |
|
{ |
|
"name": "stdout", |
|
"output_type": "stream", |
|
"text": [ |
|
"time: 30.6 s (started: 2022-11-24 06:06:20 +00:00)\n" |
|
] |
|
} |
|
], |
|
"source": [ |
|
"import torch\n", |
|
"from transformers import T5ForConditionalGeneration,T5Tokenizer\n", |
|
"summary_model = T5ForConditionalGeneration.from_pretrained('t5-base')\n", |
|
"summary_tokenizer = T5Tokenizer.from_pretrained('t5-base')\n", |
|
"\n", |
|
"device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n", |
|
"summary_model = summary_model.to(device)\n" |
|
] |
|
}, |
|
{ |
|
"cell_type": "code", |
|
"execution_count": null, |
|
"metadata": { |
|
"colab": { |
|
"background_save": true |
|
}, |
|
"id": "8mVsjMPTu-bj", |
|
"outputId": "e0ac198d-4625-4f8f-a2fd-9968c0a5a72d" |
|
}, |
|
"outputs": [ |
|
{ |
|
"name": "stdout", |
|
"output_type": "stream", |
|
"text": [ |
|
"time: 1.03 ms (started: 2022-11-24 06:06:50 +00:00)\n" |
|
] |
|
} |
|
], |
|
"source": [ |
|
"import random\n", |
|
"import numpy as np\n", |
|
"\n", |
|
"def set_seed(seed: int):\n", |
|
"    \"\"\"Seed every random number generator this notebook uses.\n", |
|
"\n", |
|
"    Applies `seed` to Python's `random`, NumPy's global generator, and\n", |
|
"    PyTorch (CPU via `manual_seed`, CUDA via `manual_seed_all`), in that order.\n", |
|
"    \"\"\"\n", |
|
"    seeders = (\n", |
|
"        random.seed,\n", |
|
"        np.random.seed,\n", |
|
"        torch.manual_seed,\n", |
|
"        torch.cuda.manual_seed_all,\n", |
|
"    )\n", |
|
"    for apply_seed in seeders:\n", |
|
"        apply_seed(seed)\n", |
|
"\n", |
|
"set_seed(42)" |
|
] |
|
}, |
|
{ |
|
"cell_type": "code", |
|
"execution_count": null, |
|
"metadata": { |
|
"colab": { |
|
"background_save": true |
|
}, |
|
"id": "Gh2Xc5JRvQDp", |
|
"outputId": "c1198166-2a2b-4571-b831-3ed1a8705c9e" |
|
}, |
|
"outputs": [ |
|
{ |
|
"name": "stderr", |
|
"output_type": "stream", |
|
"text": [ |
|
"[nltk_data] Downloading package punkt to /root/nltk_data...\n", |
|
"[nltk_data] Unzipping tokenizers/punkt.zip.\n", |
|
"[nltk_data] Downloading package brown to /root/nltk_data...\n", |
|
"[nltk_data] Unzipping corpora/brown.zip.\n", |
|
"[nltk_data] Downloading package wordnet to /root/nltk_data...\n" |
|
] |
|
}, |
|
{ |
|
"name": "stdout", |
|
"output_type": "stream", |
|
"text": [ |
|
"\n", |
|
"original Text >>\n", |
|
"Elon Musk has shown again he can influence the digital currency market with just his tweets. After saying that his electric vehicle-making company\n", |
|
"Tesla will not accept payments in Bitcoin because of environmental concerns, he tweeted that he was working with developers of Dogecoin to improve\n", |
|
"system transaction efficiency. Following the two distinct statements from him, the world's largest cryptocurrency hit a two-month low, while Dogecoin\n", |
|
"rallied by about 20 percent. The SpaceX CEO has in recent months often tweeted in support of Dogecoin, but rarely for Bitcoin. In a recent tweet,\n", |
|
"Musk put out a statement from Tesla that it was βconcernedβ about the rapidly increasing use of fossil fuels for Bitcoin (price in India) mining and\n", |
|
"transaction, and hence was suspending vehicle purchases using the cryptocurrency. A day later he again tweeted saying, βTo be clear, I strongly\n", |
|
"believe in crypto, but it can't drive a massive increase in fossil fuel use, especially coalβ. It triggered a downward spiral for Bitcoin value but\n", |
|
"the cryptocurrency has stabilised since. A number of Twitter users welcomed Musk's statement. One of them said it's time people started realising\n", |
|
"that Dogecoin βis here to stayβ and another referred to Musk's previous assertion that crypto could become the world's future currency.\n", |
|
"\n", |
|
"\n", |
|
"Summarized Text >>\n", |
|
"Musk tweeted that his electric vehicle-making company tesla will not accept payments in bitcoin because of environmental concerns. He also said that\n", |
|
"the company was working with developers of dogecoin to improve system transaction efficiency. The world's largest cryptocurrency hit a two-month low,\n", |
|
"while doge coin rallied by about 20 percent. Musk has in recent months often tweeted in support of crypto, but rarely for bitcoin.\n", |
|
"\n", |
|
"\n", |
|
"time: 6.14 s (started: 2022-11-24 06:06:50 +00:00)\n" |
|
] |
|
} |
|
], |
|
"source": [ |
|
"import nltk\n", |
|
"nltk.download('punkt')\n", |
|
"nltk.download('brown')\n", |
|
"nltk.download('wordnet')\n", |
|
"from nltk.corpus import wordnet as wn\n", |
|
"from nltk.tokenize import sent_tokenize\n", |
|
"\n", |
|
"def postprocesstext(content):\n", |
|
"    \"\"\"Capitalize every sentence in `content` and join them back together.\n", |
|
"\n", |
|
"    The text is split with NLTK's `sent_tokenize`; each sentence goes through\n", |
|
"    `str.capitalize` (first character upper-cased, the rest lower-cased) and\n", |
|
"    is prefixed with a single space before being concatenated.\n", |
|
"\n", |
|
"    Args:\n", |
|
"        content: Text to normalise.\n", |
|
"\n", |
|
"    Returns:\n", |
|
"        The re-joined text. It starts with a space whenever at least one\n", |
|
"        sentence was found, so callers are expected to `strip()` it.\n", |
|
"    \"\"\"\n", |
|
"    # A single join keeps the leading-space format without repeated string concatenation.\n", |
|
"    return \"\".join(\" \" + sentence.capitalize() for sentence in sent_tokenize(content))\n", |
|
"\n", |
|
"\n", |
|
"def summarizer(text,model,tokenizer):\n", |
|
"    \"\"\"Generate a summary of `text` with a T5-style model.\n", |
|
"\n", |
|
"    Args:\n", |
|
"        text: Source passage. Surrounding whitespace is stripped and newlines\n", |
|
"            are replaced by spaces before the \"summarize: \" prefix is added.\n", |
|
"        model: Model exposing `generate` (here `T5ForConditionalGeneration`).\n", |
|
"        tokenizer: Tokenizer matching `model`.\n", |
|
"\n", |
|
"    Returns:\n", |
|
"        The decoded summary, passed through `postprocesstext` and stripped.\n", |
|
"    \"\"\"\n", |
|
"    max_len = 512  # the encoded prompt is truncated to this many tokens\n", |
|
"    prompt = \"summarize: \" + text.strip().replace(\"\\n\", \" \")\n", |
|
"\n", |
|
"    # Move the inputs to the model's own device instead of relying on a\n", |
|
"    # notebook-level `device` variable defined in a different cell.\n", |
|
"    encoding = tokenizer.encode_plus(\n", |
|
"        prompt,\n", |
|
"        max_length=max_len,\n", |
|
"        padding=False,  # replaces the deprecated pad_to_max_length=False\n", |
|
"        truncation=True,\n", |
|
"        return_tensors=\"pt\",\n", |
|
"    ).to(model.device)\n", |
|
"\n", |
|
"    outs = model.generate(\n", |
|
"        input_ids=encoding[\"input_ids\"],\n", |
|
"        attention_mask=encoding[\"attention_mask\"],\n", |
|
"        early_stopping=True,\n", |
|
"        num_beams=3,\n", |
|
"        num_return_sequences=1,\n", |
|
"        no_repeat_ngram_size=2,\n", |
|
"        min_length=75,\n", |
|
"        max_length=300,\n", |
|
"    )\n", |
|
"\n", |
|
"    # num_return_sequences=1, so only the first generated sequence is decoded.\n", |
|
"    summary = tokenizer.decode(outs[0], skip_special_tokens=True)\n", |
|
"    return postprocesstext(summary).strip()\n", |
|
"\n", |
|
"\n", |
|
"summarized_text = summarizer(text,summary_model,summary_tokenizer)\n", |
|
"\n", |
|
"\n", |
|
"# Print the original article and its summary, each wrapped at 150 characters.\n", |
|
"report_sections = (\n", |
|
"    (\"\\noriginal Text >>\", text),\n", |
|
"    (\"Summarized Text >>\", summarized_text),\n", |
|
")\n", |
|
"for heading, body in report_sections:\n", |
|
"    print(heading)\n", |
|
"    for wrapped_line in wrap(body, 150):\n", |
|
"        print(wrapped_line)\n", |
|
"    print(\"\\n\")" |
|
] |
|
}, |
|
{ |
|
"cell_type": "markdown", |
|
"metadata": { |
|
"id": "JvBHu5eXv_wp" |
|
}, |
|
"source": [ |
|
"# **Answer Span Extraction (Keywords and Noun Phrases)**" |
|
] |
|
}, |
|
{ |
|
"cell_type": "code", |
|
"execution_count": null, |
|
"metadata": { |
|
"colab": { |
|
"background_save": true |
|
}, |
|
"id": "84DxJGFn4MfD", |
|
"outputId": "27c39b58-dcaa-4b92-ff9e-0da292be34d9" |
|
}, |
|
"outputs": [ |
|
{ |
|
"name": "stderr", |
|
"output_type": "stream", |
|
"text": [ |
|
"[nltk_data] Downloading package stopwords to /root/nltk_data...\n", |
|
"[nltk_data] Unzipping corpora/stopwords.zip.\n" |
|
] |
|
}, |
|
{ |
|
"name": "stdout", |
|
"output_type": "stream", |
|
"text": [ |
|
"time: 8.23 s (started: 2022-11-24 06:06:56 +00:00)\n" |
|
] |
|
} |
|
], |
|
"source": [ |
|
"import nltk\n", |
|
"nltk.download('stopwords')\n", |
|
"from nltk.corpus import stopwords\n", |
|
"import string\n", |
|
"import pke\n", |
|
"import traceback\n", |
|
"\n", |
|
"def get_nouns_multipartite(content):\n", |
|
" out=[]\n", |
|
" try:\n", |
|
" extractor = pke.unsupervised.MultipartiteRank()\n", |
|
" extractor.load_document(input=content,language='en')\n", |
|
" # not contain punctuation marks or stopwords as candidates.\n", |
|
" pos = {'PROPN','NOUN'}\n", |
|
" #pos = {'PROPN','NOUN'}\n", |
|
" stoplist = list(string.punctuation)\n", |
|
" stoplist += ['-lrb-', '-rrb-', '-lcb-', '-rcb-', '-lsb-', '-rsb-']\n", |
|
" stoplist += stopwords.words('english')\n", |
|
" # extractor.candidate_selection(pos=pos, stoplist=stoplist)\n", |
|
" extractor.candidate_selection(pos=pos)\n", |
|
" # 4. build the Multipartite graph and rank candidates using random walk,\n", |
|
" # alpha controls the weight adjustment mechanism, see TopicRank for\n", |
|
" # threshold/method parameters.\n", |
|
" extractor.candidate_weighting(alpha=1.1,\n", |
|
" threshold=0.75,\n", |
|
" method='average')\n", |
|
" keyphrases = extractor.get_n_best(n=15)\n", |
|
" \n", |
|
"\n", |
|
" for val in keyphrases:\n", |
|
" out.append(val[0])\n", |
|
" except:\n", |
|
" out = []\n", |
|
" traceback.print_exc()\n", |
|
"\n", |
|
" return out" |
|
] |
|
}, |
|
{ |
|
"cell_type": "code", |
|
"execution_count": null, |
|
"metadata": { |
|
"colab": { |
|
"background_save": true |
|
}, |
|
"id": "E8LNRzDVwDbp", |
|
"outputId": "c2ae2bda-8250-4e82-ed71-d10568251e68" |
|
}, |
|
"outputs": [ |
|
{ |
|
"name": "stdout", |
|
"output_type": "stream", |
|
"text": [ |
|
"keywords unsummarized: ['elon musk', 'dogecoin', 'bitcoin', 'statements', 'use', 'cryptocurrency', 'tesla', 'tweets', 'musk', 'system transaction efficiency', 'currency market', 'world', 'price', 'payments', 'company']\n", |
|
"keywords_found in summarized: ['world', 'dogecoin', 'musk', 'cryptocurrency', 'system transaction efficiency', 'payments', 'company', 'bitcoin', 'tesla']\n", |
|
"['dogecoin', 'bitcoin', 'cryptocurrency', 'tesla', 'musk', 'system transaction efficiency', 'world', 'payments', 'company']\n", |
|
"time: 785 ms (started: 2022-11-24 06:07:05 +00:00)\n" |
|
] |
|
} |
|
], |
|
"source": [ |
|
"from flashtext import KeywordProcessor\n", |
|
"\n", |
|
"\n", |
|
"def get_keywords(originaltext,summarytext):\n", |
|
" keywords = get_nouns_multipartite(originaltext)\n", |
|
" print (\"keywords unsummarized: \",keywords)\n", |
|
" keyword_processor = KeywordProcessor()\n", |
|
" for keyword in keywords:\n", |
|
" keyword_processor.add_keyword(keyword)\n", |
|
"\n", |
|
" keywords_found = keyword_processor.extract_keywords(summarytext)\n", |
|
" keywords_found = list(set(keywords_found))\n", |
|
" print (\"keywords_found in summarized: \",keywords_found)\n", |
|
"\n", |
|
" important_keywords =[]\n", |
|
" for keyword in keywords:\n", |
|
" if keyword in keywords_found:\n", |
|
" important_keywords.append(keyword)\n", |
|
"\n", |
|
" return important_keywords[:10]\n", |
|
"\n", |
|
"\n", |
|
"imp_keywords = get_keywords(text,summarized_text)\n", |
|
"print (imp_keywords)\n" |
|
] |
|
}, |
|
{ |
|
"cell_type": "code", |
|
"execution_count": null, |
|
"metadata": { |
|
"colab": { |
|
"background_save": true, |
|
"referenced_widgets": [ |
|
"24334ddee9f74d3c82a575f0edbc8720", |
|
"c884156893794fa6bad4171a9aacbd2f", |
|
"2f0d8bf7b60a423383ae6ab2469106eb", |
|
"70c932999b0f4dcda0525b9a81ceabf3", |
|
"7897cc69283d475694042ed9cbc6e92c" |
|
] |
|
}, |
|
"id": "m44RM44OwGzR", |
|
"outputId": "ca45cae8-a813-4425-9adc-3d8e0f886324" |
|
}, |
|
"outputs": [ |
|
{ |
|
"data": { |
|
"application/vnd.jupyter.widget-view+json": { |
|
"model_id": "24334ddee9f74d3c82a575f0edbc8720", |
|
"version_major": 2, |
|
"version_minor": 0 |
|
}, |
|
"text/plain": [ |
|
"Downloading: 0%| | 0.00/1.21k [00:00<?, ?B/s]" |
|
] |
|
}, |
|
"metadata": {}, |
|
"output_type": "display_data" |
|
}, |
|
{ |
|
"data": { |
|
"application/vnd.jupyter.widget-view+json": { |
|
"model_id": "c884156893794fa6bad4171a9aacbd2f", |
|
"version_major": 2, |
|
"version_minor": 0 |
|
}, |
|
"text/plain": [ |
|
"Downloading: 0%| | 0.00/892M [00:00<?, ?B/s]" |
|
] |
|
}, |
|
"metadata": {}, |
|
"output_type": "display_data" |
|
}, |
|
{ |
|
"data": { |
|
"application/vnd.jupyter.widget-view+json": { |
|
"model_id": "2f0d8bf7b60a423383ae6ab2469106eb", |
|
"version_major": 2, |
|
"version_minor": 0 |
|
}, |
|
"text/plain": [ |
|
"Downloading: 0%| | 0.00/792k [00:00<?, ?B/s]" |
|
] |
|
}, |
|
"metadata": {}, |
|
"output_type": "display_data" |
|
}, |
|
{ |
|
"data": { |
|
"application/vnd.jupyter.widget-view+json": { |
|
"model_id": "70c932999b0f4dcda0525b9a81ceabf3", |
|
"version_major": 2, |
|
"version_minor": 0 |
|
}, |
|
"text/plain": [ |
|
"Downloading: 0%| | 0.00/1.79k [00:00<?, ?B/s]" |
|
] |
|
}, |
|
"metadata": {}, |
|
"output_type": "display_data" |
|
}, |
|
{ |
|
"data": { |
|
"application/vnd.jupyter.widget-view+json": { |
|
"model_id": "7897cc69283d475694042ed9cbc6e92c", |
|
"version_major": 2, |
|
"version_minor": 0 |
|
}, |
|
"text/plain": [ |
|
"Downloading: 0%| | 0.00/1.86k [00:00<?, ?B/s]" |
|
] |
|
}, |
|
"metadata": {}, |
|
"output_type": "display_data" |
|
}, |
|
{ |
|
"name": "stdout", |
|
"output_type": "stream", |
|
"text": [ |
|
"time: 35.2 s (started: 2022-11-24 06:07:05 +00:00)\n" |
|
] |
|
} |
|
], |
|
"source": [ |
|
"question_model = T5ForConditionalGeneration.from_pretrained('ramsrigouthamg/t5_squad_v1')\n", |
|
"question_tokenizer = T5Tokenizer.from_pretrained('ramsrigouthamg/t5_squad_v1')\n", |
|
"question_model = question_model.to(device)" |
|
] |
|
}, |
|
{ |
|
"cell_type": "code", |
|
"execution_count": null, |
|
"metadata": { |
|
"colab": { |
|
"background_save": true |
|
}, |
|
"id": "1usLabLu5DUB", |
|
"outputId": "69d364b6-ee46-46d2-ee22-19b1fe5b2411" |
|
}, |
|
"outputs": [ |
|
{ |
|
"name": "stdout", |
|
"output_type": "stream", |
|
"text": [ |
|
"Musk tweeted that his electric vehicle-making company tesla will not accept payments in bitcoin because of environmental concerns. He also said that\n", |
|
"the company was working with developers of dogecoin to improve system transaction efficiency. The world's largest cryptocurrency hit a two-month low,\n", |
|
"while doge coin rallied by about 20 percent. Musk has in recent months often tweeted in support of crypto, but rarely for bitcoin.\n", |
|
"\n", |
|
"\n", |
|
"What did Musk say he was working with to improve system transaction efficiency?\n", |
|
"Dogecoin\n", |
|
"\n", |
|
"\n", |
|
"What cryptocurrency did Musk rarely tweet about?\n", |
|
"Bitcoin\n", |
|
"\n", |
|
"\n", |
|
"What has Musk often tweeted in support of?\n", |
|
"Cryptocurrency\n", |
|
"\n", |
|
"\n", |
|
"What company did Musk say would not accept bitcoin payments?\n", |
|
"Tesla\n", |
|
"\n", |
|
"\n", |
|
"Who said tesla would not accept bitcoin payments?\n", |
|
"Musk\n", |
|
"\n", |
|
"\n", |
|
"What did Musk want to improve with dogecoin?\n", |
|
"System transaction efficiency\n", |
|
"\n", |
|
"\n", |
|
"What is the largest cryptocurrency?\n", |
|
"World\n", |
|
"\n", |
|
"\n", |
|
"What did Musk say his company would not accept in bitcoin?\n", |
|
"Payments\n", |
|
"\n", |
|
"\n", |
|
"What did Musk say was working with dogecoin developers?\n", |
|
"Company\n", |
|
"\n", |
|
"\n", |
|
"time: 2.78 s (started: 2022-11-24 06:07:41 +00:00)\n" |
|
] |
|
} |
|
], |
|
"source": [ |
|
"def get_question(context,answer,model,tokenizer):\n", |
|
" text = \"context: {} answer: {}\".format(context,answer)\n", |
|
" encoding = tokenizer.encode_plus(text,max_length=384, pad_to_max_length=False,truncation=True, return_tensors=\"pt\").to(device)\n", |
|
" input_ids, attention_mask = encoding[\"input_ids\"], encoding[\"attention_mask\"]\n", |
|
"\n", |
|
" outs = model.generate(input_ids=input_ids,\n", |
|
" attention_mask=attention_mask,\n", |
|
" early_stopping=True,\n", |
|
" num_beams=5,\n", |
|
" num_return_sequences=1,\n", |
|
" no_repeat_ngram_size=2,\n", |
|
" max_length=72)\n", |
|
"\n", |
|
"\n", |
|
" dec = [tokenizer.decode(ids,skip_special_tokens=True) for ids in outs]\n", |
|
"\n", |
|
"\n", |
|
" Question = dec[0].replace(\"question:\",\"\")\n", |
|
" Question= Question.strip()\n", |
|
" return Question\n", |
|
"\n", |
|
"\n", |
|
"\n", |
|
"for wrp in wrap(summarized_text, 150):\n", |
|
" print (wrp)\n", |
|
"print (\"\\n\")\n", |
|
"\n", |
|
"for answer in imp_keywords:\n", |
|
" ques = get_question(summarized_text,answer,question_model,question_tokenizer)\n", |
|
" print (ques)\n", |
|
" print (answer.capitalize())\n", |
|
" print (\"\\n\")\n" |
|
] |
|
}, |
|
{ |
|
"cell_type": "code", |
|
"execution_count": null, |
|
"metadata": { |
|
"id": "4kEuH__G6oDK", |
|
"colab": { |
|
"base_uri": "https://localhost:8080/", |
|
"height": 740 |
|
}, |
|
"outputId": "8a8b7911-1e79-403e-9601-6f7221fc8bd7" |
|
}, |
|
"outputs": [ |
|
{ |
|
"metadata": { |
|
"tags": null |
|
}, |
|
"name": "stderr", |
|
"output_type": "stream", |
|
"text": [ |
|
"/usr/local/lib/python3.7/dist-packages/gradio/inputs.py:27: UserWarning: Usage of gradio.inputs is deprecated, and will not be supported in the future, please import your component from gradio.components\n", |
|
" \"Usage of gradio.inputs is deprecated, and will not be supported in the future, please import your component from gradio.components\",\n", |
|
"/usr/local/lib/python3.7/dist-packages/gradio/deprecation.py:40: UserWarning: `optional` parameter is deprecated, and it has no effect\n", |
|
" warnings.warn(value)\n", |
|
"/usr/local/lib/python3.7/dist-packages/gradio/deprecation.py:40: UserWarning: `numeric` parameter is deprecated, and it has no effect\n", |
|
" warnings.warn(value)\n" |
|
] |
|
}, |
|
{ |
|
"metadata": { |
|
"tags": null |
|
}, |
|
"name": "stdout", |
|
"output_type": "stream", |
|
"text": [ |
|
"Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().\n", |
|
"Note: opening Chrome Inspector may crash demo inside Colab notebooks.\n", |
|
"\n", |
|
"To create a public link, set `share=True` in `launch()`.\n" |
|
] |
|
}, |
|
{ |
|
"data": { |
|
"application/javascript": [ |
|
"(async (port, path, width, height, cache, element) => {\n", |
|
" if (!google.colab.kernel.accessAllowed && !cache) {\n", |
|
" return;\n", |
|
" }\n", |
|
" element.appendChild(document.createTextNode(''));\n", |
|
" const url = await google.colab.kernel.proxyPort(port, {cache});\n", |
|
"\n", |
|
" const external_link = document.createElement('div');\n", |
|
" external_link.innerHTML = `\n", |
|
" <div style=\"font-family: monospace; margin-bottom: 0.5rem\">\n", |
|
" Running on <a href=${new URL(path, url).toString()} target=\"_blank\">\n", |
|
" https://localhost:${port}${path}\n", |
|
" </a>\n", |
|
" </div>\n", |
|
" `;\n", |
|
" element.appendChild(external_link);\n", |
|
"\n", |
|
" const iframe = document.createElement('iframe');\n", |
|
" iframe.src = new URL(path, url).toString();\n", |
|
" iframe.height = height;\n", |
|
" iframe.allow = \"autoplay; camera; microphone; clipboard-read; clipboard-write;\"\n", |
|
" iframe.width = width;\n", |
|
" iframe.style.border = 0;\n", |
|
" element.appendChild(iframe);\n", |
|
" })(7860, \"/\", \"100%\", 500, false, window.element)" |
|
], |
|
"text/plain": [ |
|
"<IPython.core.display.Javascript object>" |
|
] |
|
}, |
|
"metadata": {}, |
|
"output_type": "display_data" |
|
} |
|
], |
|
"source": [ |
|
"import gradio as gr\n", |
|
"\n", |
|
"context = gr.inputs.Textbox(lines=10, placeholder=\"Enter paragraph/content here...\")\n", |
|
"output = gr.outputs.HTML( label=\"Question and Answers\")\n", |
|
"\n", |
|
"\n", |
|
"def generate_question(context):\n", |
|
" summary_text = summarizer(context,summary_model,summary_tokenizer)\n", |
|
" for wrp in wrap(summary_text, 150):\n", |
|
" print (wrp)\n", |
|
" np = get_keywords(context,summary_text)\n", |
|
" print (\"\\n\\nNoun phrases\",np)\n", |
|
" output=\"\"\n", |
|
" for answer in np:\n", |
|
" ques = get_question(summary_text,answer,question_model,question_tokenizer)\n", |
|
" # output= output + ques + \"\\n\" + \"Ans: \"+answer.capitalize() + \"\\n\\n\"\n", |
|
" output = output + \"<b style='color:blue;'>\" + ques + \"</b>\"\n", |
|
" output = output + \"<br>\"\n", |
|
" output = output + \"<b style='color:green;'>\" + \"Ans: \" +answer.capitalize()+ \"</b>\"\n", |
|
" output = output + \"<br>\"\n", |
|
"\n", |
|
" summary =\"Summary: \"+ summary_text\n", |
|
" for answer in np:\n", |
|
" summary = summary.replace(answer,\"<b>\"+answer+\"</b>\")\n", |
|
" summary = summary.replace(answer.capitalize(),\"<b>\"+answer.capitalize()+\"</b>\")\n", |
|
" output = output + \"<p>\"+summary+\"</p>\"\n", |
|
" \n", |
|
" return output\n", |
|
"\n", |
|
"iface = gr.Interface(\n", |
|
" fn=generate_question, \n", |
|
" inputs=context, \n", |
|
" outputs=output)\n", |
|
"iface.launch(debug=True)" |
|
] |
|
}, |
|
{ |
|
"cell_type": "markdown", |
|
"metadata": { |
|
"id": "dNmJx7QNfLcy" |
|
}, |
|
"source": [ |
|
"# **Filter keywords with Maximum marginal Relevance**" |
|
] |
|
}, |
|
{ |
|
"cell_type": "code", |
|
"execution_count": null, |
|
"metadata": { |
|
"id": "zPBj-IUL7L8x" |
|
}, |
|
"outputs": [], |
|
"source": [ |
|
"!wget https://github.com/explosion/sense2vec/releases/download/v1.0.0/s2v_reddit_2015_md.tar.gz\n", |
|
"!tar -xvf s2v_reddit_2015_md.tar.gz" |
|
] |
|
}, |
|
{ |
|
"cell_type": "code", |
|
"execution_count": null, |
|
"metadata": { |
|
"id": "s5RI3fk9fOOz" |
|
}, |
|
"outputs": [], |
|
"source": [ |
|
"import numpy as np\n", |
|
"from sense2vec import Sense2Vec\n", |
|
"s2v = Sense2Vec().from_disk('s2v_old')" |
|
] |
|
}, |
|
{ |
|
"cell_type": "code", |
|
"execution_count": null, |
|
"metadata": { |
|
"id": "J2y3unpvfo1y" |
|
}, |
|
"outputs": [], |
|
"source": [ |
|
"from sentence_transformers import SentenceTransformer\n", |
|
"# paraphrase-distilroberta-base-v1\n", |
|
"sentence_transformer_model = SentenceTransformer('msmarco-distilbert-base-v3')" |
|
] |
|
}, |
|
{ |
|
"cell_type": "code", |
|
"execution_count": null, |
|
"metadata": { |
|
"id": "pvfmhuWVfsJb" |
|
}, |
|
"outputs": [], |
|
"source": [ |
|
"from similarity.normalized_levenshtein import NormalizedLevenshtein\n", |
|
"normalized_levenshtein = NormalizedLevenshtein()\n", |
|
"\n", |
|
"def filter_same_sense_words(original,wordlist):\n", |
|
" filtered_words=[]\n", |
|
" base_sense =original.split('|')[1] \n", |
|
" print (base_sense)\n", |
|
" for eachword in wordlist:\n", |
|
" if eachword[0].split('|')[1] == base_sense:\n", |
|
" filtered_words.append(eachword[0].split('|')[0].replace(\"_\", \" \").title().strip())\n", |
|
" return filtered_words\n", |
|
"\n", |
|
"def get_highest_similarity_score(wordlist,wrd):\n", |
|
" score=[]\n", |
|
" for each in wordlist:\n", |
|
" score.append(normalized_levenshtein.similarity(each.lower(),wrd.lower()))\n", |
|
" return max(score)\n", |
|
"\n", |
|
"def sense2vec_get_words(word,s2v,topn,question):\n", |
|
" output = []\n", |
|
" print (\"word \",word)\n", |
|
" try:\n", |
|
" sense = s2v.get_best_sense(word, senses= [\"NOUN\", \"PERSON\",\"PRODUCT\",\"LOC\",\"ORG\",\"EVENT\",\"NORP\",\"WORK OF ART\",\"FAC\",\"GPE\",\"NUM\",\"FACILITY\"])\n", |
|
" most_similar = s2v.most_similar(sense, n=topn)\n", |
|
" # print (most_similar)\n", |
|
" output = filter_same_sense_words(sense,most_similar)\n", |
|
" print (\"Similar \",output)\n", |
|
" except:\n", |
|
" output =[]\n", |
|
"\n", |
|
" threshold = 0.6\n", |
|
" final=[word]\n", |
|
" checklist =question.split()\n", |
|
" for x in output:\n", |
|
" if get_highest_similarity_score(final,x)<threshold and x not in final and x not in checklist:\n", |
|
" final.append(x)\n", |
|
" \n", |
|
" return final[1:]\n", |
|
"\n", |
|
"def mmr(doc_embedding, word_embeddings, words, top_n, lambda_param):\n", |
|
"\n", |
|
" # Extract similarity within words, and between words and the document\n", |
|
" word_doc_similarity = cosine_similarity(word_embeddings, doc_embedding)\n", |
|
" word_similarity = cosine_similarity(word_embeddings)\n", |
|
"\n", |
|
" # Initialize candidates and already choose best keyword/keyphrase\n", |
|
" keywords_idx = [np.argmax(word_doc_similarity)]\n", |
|
" candidates_idx = [i for i in range(len(words)) if i != keywords_idx[0]]\n", |
|
"\n", |
|
" for _ in range(top_n - 1):\n", |
|
" # Extract similarities within candidates and\n", |
|
" # between candidates and selected keywords/phrases\n", |
|
" candidate_similarities = word_doc_similarity[candidates_idx, :]\n", |
|
" target_similarities = np.max(word_similarity[candidates_idx][:, keywords_idx], axis=1)\n", |
|
"\n", |
|
" # Calculate MMR\n", |
|
" mmr = (lambda_param) * candidate_similarities - (1-lambda_param) * target_similarities.reshape(-1, 1)\n", |
|
" mmr_idx = candidates_idx[np.argmax(mmr)]\n", |
|
"\n", |
|
" # Update keywords & candidates\n", |
|
" keywords_idx.append(mmr_idx)\n", |
|
" candidates_idx.remove(mmr_idx)\n", |
|
"\n", |
|
" return [words[idx] for idx in keywords_idx]" |
|
] |
|
}, |
|
{ |
|
"cell_type": "code", |
|
"execution_count": null, |
|
"metadata": { |
|
"id": "UCN0-kXEfxwy" |
|
}, |
|
"outputs": [], |
|
"source": [ |
|
"from collections import OrderedDict\n", |
|
"from sklearn.metrics.pairwise import cosine_similarity\n", |
|
"import nltk\n", |
|
"nltk.download('omw-1.4')\n", |
|
"\n", |
|
"def get_distractors_wordnet(word):\n", |
|
" distractors=[]\n", |
|
" try:\n", |
|
" syn = wn.synsets(word,'n')[0]\n", |
|
" \n", |
|
" word= word.lower()\n", |
|
" orig_word = word\n", |
|
" if len(word.split())>0:\n", |
|
" word = word.replace(\" \",\"_\")\n", |
|
" hypernym = syn.hypernyms()\n", |
|
" if len(hypernym) == 0: \n", |
|
" return distractors\n", |
|
" for item in hypernym[0].hyponyms():\n", |
|
" name = item.lemmas()[0].name()\n", |
|
" #print (\"name \",name, \" word\",orig_word)\n", |
|
" if name == orig_word:\n", |
|
" continue\n", |
|
" name = name.replace(\"_\",\" \")\n", |
|
" name = \" \".join(w.capitalize() for w in name.split())\n", |
|
" if name is not None and name not in distractors:\n", |
|
" distractors.append(name)\n", |
|
" except:\n", |
|
" print (\"Wordnet distractors not found\")\n", |
|
" return distractors\n", |
|
"\n", |
|
"def get_distractors (word,origsentence,sense2vecmodel,sentencemodel,top_n,lambdaval):\n", |
|
" distractors = sense2vec_get_words(word,sense2vecmodel,top_n,origsentence)\n", |
|
" print (\"distractors \",distractors)\n", |
|
" if len(distractors) ==0:\n", |
|
" return distractors\n", |
|
" distractors_new = [word.capitalize()]\n", |
|
" distractors_new.extend(distractors)\n", |
|
" # print (\"distractors_new .. \",distractors_new)\n", |
|
"\n", |
|
" embedding_sentence = origsentence+ \" \"+word.capitalize()\n", |
|
" # embedding_sentence = word\n", |
|
" keyword_embedding = sentencemodel.encode([embedding_sentence])\n", |
|
" distractor_embeddings = sentencemodel.encode(distractors_new)\n", |
|
"\n", |
|
" # filtered_keywords = mmr(keyword_embedding, distractor_embeddings,distractors,4,0.7)\n", |
|
" max_keywords = min(len(distractors_new),5)\n", |
|
" filtered_keywords = mmr(keyword_embedding, distractor_embeddings,distractors_new,max_keywords,lambdaval)\n", |
|
" # filtered_keywords = filtered_keywords[1:]\n", |
|
" final = [word.capitalize()]\n", |
|
" for wrd in filtered_keywords:\n", |
|
" if wrd.lower() !=word.lower():\n", |
|
" final.append(wrd.capitalize())\n", |
|
" final = final[1:]\n", |
|
" return final\n", |
|
"\n", |
|
"sent = \"What cryptocurrency did Musk rarely tweet about?\"\n", |
|
"keyword = \"Bitcoin\"\n", |
|
"\n", |
|
"# sent = \"What did Musk say he was working with to improve system transaction efficiency?\"\n", |
|
"# keyword= \"Dogecoin\"\n", |
|
"\n", |
|
"\n", |
|
"# sent = \"What company did Musk say would not accept bitcoin payments?\"\n", |
|
"# keyword= \"Tesla\"\n", |
|
"\n", |
|
"\n", |
|
"# sent = \"What has Musk often tweeted in support of?\"\n", |
|
"# keyword = \"Cryptocurrency\"\n", |
|
"\n", |
|
"print (get_distractors(keyword,sent,s2v,sentence_transformer_model,40,0.2))\n" |
|
] |
|
}, |
|
{ |
|
"cell_type": "code", |
|
"execution_count": null, |
|
"metadata": { |
|
"id": "s2FX-mGdf08p" |
|
}, |
|
"outputs": [], |
|
"source": [ |
|
"get_distractors_wordnet('lion')" |
|
] |
|
}, |
|
{ |
|
"cell_type": "code", |
|
"execution_count": null, |
|
"metadata": { |
|
"id": "vgvffLecf4Cq" |
|
}, |
|
"outputs": [], |
|
"source": [ |
|
"import gradio as gr\n", |
|
"\n", |
|
"context = gr.inputs.Textbox(lines=10, placeholder=\"Enter paragraph/content here...\")\n", |
|
"output = gr.outputs.HTML( label=\"Question and Answers\")\n", |
|
"radiobutton = gr.inputs.Radio([\"Wordnet\", \"Sense2Vec\"])\n", |
|
"\n", |
|
"def generate_question(context,radiobutton):\n", |
|
" summary_text = summarizer(context,summary_model,summary_tokenizer)\n", |
|
" for wrp in wrap(summary_text, 100):\n", |
|
" print (wrp)\n", |
|
" # np = getnounphrases(summary_text,sentence_transformer_model,3)\n", |
|
" np = get_keywords(context,summary_text)\n", |
|
" print (\"\\n\\nNoun phrases\",np)\n", |
|
" output=\"\"\n", |
|
" for answer in np:\n", |
|
" ques = get_question(summary_text,answer,question_model,question_tokenizer)\n", |
|
" if radiobutton==\"Wordnet\":\n", |
|
" distractors = get_distractors_wordnet(answer)\n", |
|
" else:\n", |
|
" distractors = get_distractors(answer.capitalize(),ques,s2v,sentence_transformer_model,40,0.2)\n", |
|
" # output= output + ques + \"\\n\" + \"Ans: \"+answer.capitalize() + \"\\n\\n\"\n", |
|
" output = output + \"<b style='color:blue;'>\" + ques + \"</b>\"\n", |
|
" output = output + \"<br>\"\n", |
|
" output = output + \"<b style='color:green;'>\" + \"Ans: \" +answer.capitalize()+ \"</b>\"+\"<br>\"\n", |
|
" if len(distractors)>0:\n", |
|
" for distractor in distractors[:4]:\n", |
|
" output = output + \"<b style='color:brown;'>\" + distractor+ \"</b>\"+\"<br>\"\n", |
|
" output = output + \"<br>\"\n", |
|
"\n", |
|
" summary =\"Summary: \"+ summary_text\n", |
|
" for answer in np:\n", |
|
" summary = summary.replace(answer,\"<b>\"+answer+\"</b>\" + \"<br>\")\n", |
|
" summary = summary.replace(answer.capitalize(),\"<b>\"+answer.capitalize()+\"</b>\")\n", |
|
" output = output + \"<p>\"+summary+\"</p>\"\n", |
|
" output = output + \"<br>\"\n", |
|
" return output\n", |
|
"\n", |
|
"\n", |
|
"iface = gr.Interface(\n", |
|
" fn=generate_question, \n", |
|
" inputs=[context,radiobutton], \n", |
|
" outputs=output)\n", |
|
"iface.launch(debug=True)" |
|
] |
|
}, |
|
{ |
|
"cell_type": "code", |
|
"execution_count": null, |
|
"metadata": { |
|
"id": "EhKGhA1ff7Hi" |
|
}, |
|
"outputs": [], |
|
"source": [ |
|
"import requests\n", |
|
"\n", |
|
"url = \"https://question-answer.p.rapidapi.com/question-answer\"\n", |
|
"\n", |
|
"querystring = {\"question\":\"What are some tips to starting up your own small business?\"}\n", |
|
"\n", |
|
"headers = {\n", |
|
"\t\"X-RapidAPI-Key\": \"SIGN-UP-FOR-KEY\",\n", |
|
"\t\"X-RapidAPI-Host\": \"question-answer.p.rapidapi.com\"\n", |
|
"}\n", |
|
"\n", |
|
"response = requests.request(\"GET\", url, headers=headers, params=querystring)\n", |
|
"\n", |
|
"print(response.text)" |
|
] |
|
} |
|
], |
|
"metadata": { |
|
"accelerator": "GPU", |
|
"colab": { |
|
"provenance": [] |
|
}, |
|
"gpuClass": "standard", |
|
"kernelspec": { |
|
"display_name": "Python 3", |
|
"name": "python3" |
|
}, |
|
"language_info": { |
|
"name": "python" |
|
} |
|
}, |
|
"nbformat": 4, |
|
"nbformat_minor": 0 |
|
} |