diff --git "a/notebooks/index_creation.ipynb" "b/notebooks/index_creation.ipynb" --- "a/notebooks/index_creation.ipynb" +++ "b/notebooks/index_creation.ipynb" @@ -1 +1 @@ -{"cells":[{"cell_type":"markdown","source":["## Preliminary operations"],"metadata":{"id":"viixGIJcKPSQ"},"id":"viixGIJcKPSQ"},{"cell_type":"code","source":["from google.colab import drive\n","drive.mount('/content/drive')"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"MevE4jEZ5QBT","executionInfo":{"status":"ok","timestamp":1652189481823,"user_tz":-120,"elapsed":25189,"user":{"displayName":"Stefano Fiorucci","userId":"12409279692445770059"}},"outputId":"d4b2a927-e000-442b-ebc6-0d40d8a165d6"},"id":"MevE4jEZ5QBT","execution_count":1,"outputs":[{"output_type":"stream","name":"stdout","text":["Mounted at /content/drive\n"]}]},{"cell_type":"code","source":["# install dependencies\n","! pip install farm-haystack[faiss-gpu]"],"metadata":{"id":"VYWRJ-Lf55nV","colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"status":"ok","timestamp":1652189651623,"user_tz":-120,"elapsed":161669,"user":{"displayName":"Stefano Fiorucci","userId":"12409279692445770059"}},"outputId":"5c860ef6-d4cb-4293-d704-51454a3f88bf"},"id":"VYWRJ-Lf55nV","execution_count":2,"outputs":[{"output_type":"stream","name":"stdout","text":["Collecting farm-haystack[faiss-gpu]\n"," Downloading farm_haystack-1.4.0-py3-none-any.whl (524 kB)\n","\u001b[K |████████████████████████████████| 524 kB 6.8 MB/s \n","\u001b[?25hRequirement already satisfied: pandas in /usr/local/lib/python3.7/dist-packages (from farm-haystack[faiss-gpu]) (1.3.5)\n","Requirement already satisfied: networkx in /usr/local/lib/python3.7/dist-packages (from farm-haystack[faiss-gpu]) (2.6.3)\n","Collecting elastic-apm\n"," Downloading elastic_apm-6.9.1-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (374 kB)\n","\u001b[K |████████████████████████████████| 374 kB 44.4 MB/s \n","\u001b[?25hCollecting rapidfuzz\n"," Downloading rapidfuzz-2.0.11-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.8 MB)\n","\u001b[K |████████████████████████████████| 1.8 MB 45.7 MB/s \n","\u001b[?25hCollecting mmh3\n"," Downloading mmh3-3.0.0-cp37-cp37m-manylinux2010_x86_64.whl (50 kB)\n","\u001b[K |████████████████████████████████| 50 kB 6.7 MB/s \n","\u001b[?25hRequirement already satisfied: importlib-metadata in /usr/local/lib/python3.7/dist-packages (from farm-haystack[faiss-gpu]) (4.11.3)\n","Collecting mlflow\n"," Downloading mlflow-1.25.1-py3-none-any.whl (16.8 MB)\n","\u001b[K |████████████████████████████████| 16.8 MB 720 kB/s \n","\u001b[?25hCollecting seqeval\n"," Downloading seqeval-1.2.2.tar.gz (43 kB)\n","\u001b[K |████████████████████████████████| 43 kB 2.0 MB/s \n","\u001b[?25hRequirement already satisfied: scikit-learn>=1.0.0 in /usr/local/lib/python3.7/dist-packages (from farm-haystack[faiss-gpu]) (1.0.2)\n","Collecting langdetect\n"," Downloading langdetect-1.0.9.tar.gz (981 kB)\n","\u001b[K |████████████████████████████████| 981 kB 17.9 MB/s \n","\u001b[?25hCollecting tika\n"," Downloading tika-1.24.tar.gz (28 kB)\n","Requirement already satisfied: dill in /usr/local/lib/python3.7/dist-packages (from farm-haystack[faiss-gpu]) (0.3.4)\n","Requirement already satisfied: nltk in /usr/local/lib/python3.7/dist-packages (from farm-haystack[faiss-gpu]) (3.2.5)\n","Collecting pydantic\n"," Downloading pydantic-1.9.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (10.9 MB)\n","\u001b[K |████████████████████████████████| 10.9 MB 13.2 MB/s \n","\u001b[?25hRequirement already satisfied: requests in /usr/local/lib/python3.7/dist-packages (from farm-haystack[faiss-gpu]) (2.23.0)\n","Requirement already satisfied: jsonschema in /usr/local/lib/python3.7/dist-packages (from farm-haystack[faiss-gpu]) (4.3.3)\n","Collecting elasticsearch<=7.10,>=7.7\n"," Downloading elasticsearch-7.10.0-py2.py3-none-any.whl (321 kB)\n","\u001b[K |████████████████████████████████| 321 kB 52.3 MB/s \n","\u001b[?25hCollecting azure-core<1.23\n"," Downloading azure_core-1.22.1-py3-none-any.whl (178 kB)\n","\u001b[K |████████████████████████████████| 178 kB 52.2 MB/s \n","\u001b[?25hCollecting python-docx\n"," Downloading python-docx-0.8.11.tar.gz (5.6 MB)\n","\u001b[K |████████████████████████████████| 5.6 MB 20.4 MB/s \n","\u001b[?25hRequirement already satisfied: scipy>=1.3.2 in /usr/local/lib/python3.7/dist-packages (from farm-haystack[faiss-gpu]) (1.4.1)\n","Collecting sentence-transformers>=2.2.0\n"," Downloading sentence-transformers-2.2.0.tar.gz (79 kB)\n","\u001b[K |████████████████████████████████| 79 kB 8.3 MB/s \n","\u001b[?25hRequirement already satisfied: more-itertools in /usr/local/lib/python3.7/dist-packages (from farm-haystack[faiss-gpu]) (8.12.0)\n","Collecting torch<1.11,>1.9\n"," Downloading torch-1.10.2-cp37-cp37m-manylinux1_x86_64.whl (881.9 MB)\n","\u001b[K |██████████████████████████████▎ | 834.1 MB 1.3 MB/s eta 0:00:37tcmalloc: large alloc 1147494400 bytes == 0x399b0000 @ 0x7fe1e1193615 0x592b76 0x4df71e 0x59afff 0x515655 0x549576 0x593fce 0x548ae9 0x51566f 0x549576 0x593fce 0x548ae9 0x5127f1 0x598e3b 0x511f68 0x598e3b 0x511f68 0x598e3b 0x511f68 0x4bc98a 0x532e76 0x594b72 0x515600 0x549576 0x593fce 0x548ae9 0x5127f1 0x549576 0x593fce 0x5118f8 0x593dd7\n","\u001b[K |████████████████████████████████| 881.9 MB 2.0 kB/s \n","\u001b[?25hCollecting transformers==4.13.0\n"," Downloading transformers-4.13.0-py3-none-any.whl (3.3 MB)\n","\u001b[K |████████████████████████████████| 3.3 MB 38.0 MB/s \n","\u001b[?25hRequirement already satisfied: tqdm in /usr/local/lib/python3.7/dist-packages (from farm-haystack[faiss-gpu]) (4.64.0)\n","Collecting quantulum3\n"," Downloading quantulum3-0.7.10-py3-none-any.whl (10.7 MB)\n","\u001b[K |████████████████████████████████| 10.7 MB 34.5 MB/s \n","\u001b[?25hCollecting azure-ai-formrecognizer==3.2.0b2\n"," Downloading azure_ai_formrecognizer-3.2.0b2-py2.py3-none-any.whl (219 kB)\n","\u001b[K |████████████████████████████████| 219 kB 35.3 MB/s \n","\u001b[?25hCollecting posthog\n"," Downloading posthog-1.4.7-py2.py3-none-any.whl (22 kB)\n","Requirement already satisfied: six>=1.11.0 in /usr/local/lib/python3.7/dist-packages (from azure-ai-formrecognizer==3.2.0b2->farm-haystack[faiss-gpu]) (1.15.0)\n","Collecting azure-common~=1.1\n"," Downloading azure_common-1.1.28-py2.py3-none-any.whl (14 kB)\n","Collecting msrest>=0.6.21\n"," Downloading msrest-0.6.21-py2.py3-none-any.whl (85 kB)\n","\u001b[K |████████████████████████████████| 85 kB 4.2 MB/s \n","\u001b[?25hCollecting tokenizers<0.11,>=0.10.1\n"," Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)\n","\u001b[K |████████████████████████████████| 3.3 MB 33.6 MB/s \n","\u001b[?25hRequirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.7/dist-packages (from transformers==4.13.0->farm-haystack[faiss-gpu]) (1.21.6)\n","Requirement already satisfied: filelock in /usr/local/lib/python3.7/dist-packages (from transformers==4.13.0->farm-haystack[faiss-gpu]) (3.6.0)\n","Collecting huggingface-hub<1.0,>=0.1.0\n"," Downloading huggingface_hub-0.5.1-py3-none-any.whl (77 kB)\n","\u001b[K |████████████████████████████████| 77 kB 6.6 MB/s \n","\u001b[?25hCollecting pyyaml>=5.1\n"," Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)\n","\u001b[K |████████████████████████████████| 596 kB 51.0 MB/s \n","\u001b[?25hRequirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.7/dist-packages (from transformers==4.13.0->farm-haystack[faiss-gpu]) (2019.12.20)\n","Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.7/dist-packages (from transformers==4.13.0->farm-haystack[faiss-gpu]) (21.3)\n","Collecting sacremoses\n"," Downloading sacremoses-0.0.53.tar.gz (880 kB)\n","\u001b[K |████████████████████████████████| 880 kB 43.3 MB/s \n","\u001b[?25hRequirement already satisfied: certifi in /usr/local/lib/python3.7/dist-packages (from elasticsearch<=7.10,>=7.7->farm-haystack[faiss-gpu]) (2021.10.8)\n","Requirement already satisfied: urllib3<2,>=1.21.1 in /usr/local/lib/python3.7/dist-packages (from elasticsearch<=7.10,>=7.7->farm-haystack[faiss-gpu]) (1.24.3)\n","Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.7/dist-packages (from huggingface-hub<1.0,>=0.1.0->transformers==4.13.0->farm-haystack[faiss-gpu]) (4.2.0)\n","Collecting isodate>=0.6.0\n"," Downloading isodate-0.6.1-py2.py3-none-any.whl (41 kB)\n","\u001b[K |████████████████████████████████| 41 kB 623 kB/s \n","\u001b[?25hRequirement already satisfied: requests-oauthlib>=0.5.0 in /usr/local/lib/python3.7/dist-packages (from msrest>=0.6.21->azure-ai-formrecognizer==3.2.0b2->farm-haystack[faiss-gpu]) (1.3.1)\n","Requirement already satisfied: pyparsing!=3.0.5,>=2.0.2 in /usr/local/lib/python3.7/dist-packages (from packaging>=20.0->transformers==4.13.0->farm-haystack[faiss-gpu]) (3.0.8)\n","Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.7/dist-packages (from requests->farm-haystack[faiss-gpu]) (3.0.4)\n","Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.7/dist-packages (from requests->farm-haystack[faiss-gpu]) (2.10)\n","Requirement already satisfied: oauthlib>=3.0.0 in /usr/local/lib/python3.7/dist-packages (from requests-oauthlib>=0.5.0->msrest>=0.6.21->azure-ai-formrecognizer==3.2.0b2->farm-haystack[faiss-gpu]) (3.2.0)\n","Requirement already satisfied: joblib>=0.11 in /usr/local/lib/python3.7/dist-packages (from scikit-learn>=1.0.0->farm-haystack[faiss-gpu]) (1.1.0)\n","Requirement already satisfied: threadpoolctl>=2.0.0 in /usr/local/lib/python3.7/dist-packages (from scikit-learn>=1.0.0->farm-haystack[faiss-gpu]) (3.1.0)\n","Requirement already satisfied: torchvision in /usr/local/lib/python3.7/dist-packages (from sentence-transformers>=2.2.0->farm-haystack[faiss-gpu]) (0.12.0+cu113)\n","Collecting sentencepiece\n"," Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)\n","\u001b[K |████████████████████████████████| 1.2 MB 31.5 MB/s \n","\u001b[?25hCollecting sqlalchemy-utils\n"," Downloading SQLAlchemy_Utils-0.38.2-py3-none-any.whl (100 kB)\n","\u001b[K |████████████████████████████████| 100 kB 10.0 MB/s \n","\u001b[?25hRequirement already satisfied: sqlalchemy<2,>=1.4.2 in /usr/local/lib/python3.7/dist-packages (from farm-haystack[faiss-gpu]) (1.4.36)\n","Collecting psycopg2-binary\n"," Downloading psycopg2_binary-2.9.3-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.0 MB)\n","\u001b[K |████████████████████████████████| 3.0 MB 37.1 MB/s \n","\u001b[?25hCollecting faiss-gpu<2,>=1.6.3\n"," Downloading faiss_gpu-1.7.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (85.5 MB)\n","\u001b[K |████████████████████████████████| 85.5 MB 92 kB/s \n","\u001b[?25hRequirement already satisfied: greenlet!=0.4.17 in /usr/local/lib/python3.7/dist-packages (from sqlalchemy<2,>=1.4.2->farm-haystack[faiss-gpu]) (1.1.2)\n","Requirement already satisfied: zipp>=0.5 in /usr/local/lib/python3.7/dist-packages (from importlib-metadata->farm-haystack[faiss-gpu]) (3.8.0)\n","Requirement already satisfied: pyrsistent!=0.17.0,!=0.17.1,!=0.17.2,>=0.14.0 in /usr/local/lib/python3.7/dist-packages (from jsonschema->farm-haystack[faiss-gpu]) (0.18.1)\n","Requirement already satisfied: importlib-resources>=1.4.0 in /usr/local/lib/python3.7/dist-packages (from jsonschema->farm-haystack[faiss-gpu]) (5.7.1)\n","Requirement already satisfied: attrs>=17.4.0 in /usr/local/lib/python3.7/dist-packages (from jsonschema->farm-haystack[faiss-gpu]) (21.4.0)\n","Collecting gunicorn\n"," Downloading gunicorn-20.1.0-py3-none-any.whl (79 kB)\n","\u001b[K |████████████████████████████████| 79 kB 8.4 MB/s \n","\u001b[?25hRequirement already satisfied: entrypoints in /usr/local/lib/python3.7/dist-packages (from mlflow->farm-haystack[faiss-gpu]) (0.4)\n","Requirement already satisfied: sqlparse>=0.3.1 in /usr/local/lib/python3.7/dist-packages (from mlflow->farm-haystack[faiss-gpu]) (0.4.2)\n","Collecting alembic\n"," Downloading alembic-1.7.7-py3-none-any.whl (210 kB)\n","\u001b[K |████████████████████████████████| 210 kB 46.3 MB/s \n","\u001b[?25hCollecting gitpython>=2.1.0\n"," Downloading GitPython-3.1.27-py3-none-any.whl (181 kB)\n","\u001b[K |████████████████████████████████| 181 kB 48.8 MB/s \n","\u001b[?25hRequirement already satisfied: pytz in /usr/local/lib/python3.7/dist-packages (from mlflow->farm-haystack[faiss-gpu]) (2022.1)\n","Collecting querystring-parser\n"," Downloading querystring_parser-1.2.4-py2.py3-none-any.whl (7.9 kB)\n","Collecting prometheus-flask-exporter\n"," Downloading prometheus_flask_exporter-0.20.1-py3-none-any.whl (18 kB)\n","Collecting databricks-cli>=0.8.7\n"," Downloading databricks-cli-0.16.6.tar.gz (62 kB)\n","\u001b[K |████████████████████████████████| 62 kB 622 kB/s \n","\u001b[?25hCollecting docker>=4.0.0\n"," Downloading docker-5.0.3-py2.py3-none-any.whl (146 kB)\n","\u001b[K |████████████████████████████████| 146 kB 52.8 MB/s \n","\u001b[?25hRequirement already satisfied: cloudpickle in /usr/local/lib/python3.7/dist-packages (from mlflow->farm-haystack[faiss-gpu]) (1.3.0)\n","Requirement already satisfied: click>=7.0 in /usr/local/lib/python3.7/dist-packages (from mlflow->farm-haystack[faiss-gpu]) (7.1.2)\n","Requirement already satisfied: Flask in /usr/local/lib/python3.7/dist-packages (from mlflow->farm-haystack[faiss-gpu]) (1.1.4)\n","Requirement already satisfied: protobuf>=3.7.0 in /usr/local/lib/python3.7/dist-packages (from mlflow->farm-haystack[faiss-gpu]) (3.17.3)\n","Collecting pyjwt>=1.7.0\n"," Downloading PyJWT-2.3.0-py3-none-any.whl (16 kB)\n","Requirement already satisfied: tabulate>=0.7.7 in /usr/local/lib/python3.7/dist-packages (from databricks-cli>=0.8.7->mlflow->farm-haystack[faiss-gpu]) (0.8.9)\n","Collecting websocket-client>=0.32.0\n"," Downloading websocket_client-1.3.2-py3-none-any.whl (54 kB)\n","\u001b[K |████████████████████████████████| 54 kB 2.9 MB/s \n","\u001b[?25hCollecting gitdb<5,>=4.0.1\n"," Downloading gitdb-4.0.9-py3-none-any.whl (63 kB)\n","\u001b[K |████████████████████████████████| 63 kB 1.9 MB/s \n","\u001b[?25hCollecting smmap<6,>=3.0.1\n"," Downloading smmap-5.0.0-py3-none-any.whl (24 kB)\n","Collecting Mako\n"," Downloading Mako-1.2.0-py3-none-any.whl (78 kB)\n","\u001b[K |████████████████████████████████| 78 kB 7.3 MB/s \n","\u001b[?25hRequirement already satisfied: itsdangerous<2.0,>=0.24 in /usr/local/lib/python3.7/dist-packages (from Flask->mlflow->farm-haystack[faiss-gpu]) (1.1.0)\n","Requirement already satisfied: Werkzeug<2.0,>=0.15 in /usr/local/lib/python3.7/dist-packages (from Flask->mlflow->farm-haystack[faiss-gpu]) (1.0.1)\n","Requirement already satisfied: Jinja2<3.0,>=2.10.1 in /usr/local/lib/python3.7/dist-packages (from Flask->mlflow->farm-haystack[faiss-gpu]) (2.11.3)\n","Requirement already satisfied: MarkupSafe>=0.23 in /usr/local/lib/python3.7/dist-packages (from Jinja2<3.0,>=2.10.1->Flask->mlflow->farm-haystack[faiss-gpu]) (2.0.1)\n","Requirement already satisfied: setuptools>=3.0 in /usr/local/lib/python3.7/dist-packages (from gunicorn->mlflow->farm-haystack[faiss-gpu]) (57.4.0)\n","Requirement already satisfied: python-dateutil>=2.7.3 in /usr/local/lib/python3.7/dist-packages (from pandas->farm-haystack[faiss-gpu]) (2.8.2)\n","Collecting backoff<2.0.0,>=1.10.0\n"," Downloading backoff-1.11.1-py2.py3-none-any.whl (13 kB)\n","Collecting monotonic>=1.5\n"," Downloading monotonic-1.6-py2.py3-none-any.whl (8.2 kB)\n","Requirement already satisfied: prometheus-client in /usr/local/lib/python3.7/dist-packages (from prometheus-flask-exporter->mlflow->farm-haystack[faiss-gpu]) (0.14.1)\n","Requirement already satisfied: lxml>=2.3.2 in /usr/local/lib/python3.7/dist-packages (from python-docx->farm-haystack[faiss-gpu]) (4.2.6)\n","Collecting num2words\n"," Downloading num2words-0.5.10-py3-none-any.whl (101 kB)\n","\u001b[K |████████████████████████████████| 101 kB 11.4 MB/s \n","\u001b[?25hRequirement already satisfied: inflect in /usr/local/lib/python3.7/dist-packages (from quantulum3->farm-haystack[faiss-gpu]) (2.1.0)\n","Requirement already satisfied: docopt>=0.6.2 in /usr/local/lib/python3.7/dist-packages (from num2words->quantulum3->farm-haystack[faiss-gpu]) (0.6.2)\n","Collecting jarowinkler<1.1.0,>=1.0.2\n"," Downloading jarowinkler-1.0.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (103 kB)\n","\u001b[K |████████████████████████████████| 103 kB 52.7 MB/s \n","\u001b[?25hCollecting torchvision\n"," Downloading torchvision-0.12.0-cp37-cp37m-manylinux1_x86_64.whl (21.0 MB)\n","\u001b[K |████████████████████████████████| 21.0 MB 5.2 MB/s \n","\u001b[?25h Downloading torchvision-0.11.3-cp37-cp37m-manylinux1_x86_64.whl (23.2 MB)\n","\u001b[K |████████████████████████████████| 23.2 MB 1.4 MB/s \n","\u001b[?25hRequirement already satisfied: pillow!=8.3.0,>=5.3.0 in /usr/local/lib/python3.7/dist-packages (from torchvision->sentence-transformers>=2.2.0->farm-haystack[faiss-gpu]) (7.1.2)\n","Building wheels for collected packages: sentence-transformers, langdetect, databricks-cli, python-docx, sacremoses, seqeval, tika\n"," Building wheel for sentence-transformers (setup.py) ... \u001b[?25l\u001b[?25hdone\n"," Created wheel for sentence-transformers: filename=sentence_transformers-2.2.0-py3-none-any.whl size=120747 sha256=78ee0812cc2d1d74eb33df92f06ab47670672b543b3620e0caeec1881ae3ead0\n"," Stored in directory: /root/.cache/pip/wheels/83/c0/df/b6873ab7aac3f2465aa9144b6b4c41c4391cfecc027c8b07e7\n"," Building wheel for langdetect (setup.py) ... \u001b[?25l\u001b[?25hdone\n"," Created wheel for langdetect: filename=langdetect-1.0.9-py3-none-any.whl size=993242 sha256=bef3968e1cceab5b68dd1025df3703a389b0206a219a69768c643a09b011013c\n"," Stored in directory: /root/.cache/pip/wheels/c5/96/8a/f90c59ed25d75e50a8c10a1b1c2d4c402e4dacfa87f3aff36a\n"," Building wheel for databricks-cli (setup.py) ... \u001b[?25l\u001b[?25hdone\n"," Created wheel for databricks-cli: filename=databricks_cli-0.16.6-py3-none-any.whl size=112631 sha256=72e291d5e52fd87e572aec37b8f5f38bc9848e8183a5dea6ac8c0d91abc1f46e\n"," Stored in directory: /root/.cache/pip/wheels/96/c1/f8/d75a22e789ab6a4dff11f18338c3af4360189aa371295cc934\n"," Building wheel for python-docx (setup.py) ... \u001b[?25l\u001b[?25hdone\n"," Created wheel for python-docx: filename=python_docx-0.8.11-py3-none-any.whl size=184507 sha256=63327ac216c3fcdbdaf12c2b06f89b607d9512eb699bf77beb105bf9c8d4df67\n"," Stored in directory: /root/.cache/pip/wheels/f6/6f/b9/d798122a8b55b74ad30b5f52b01482169b445fbb84a11797a6\n"," Building wheel for sacremoses (setup.py) ... \u001b[?25l\u001b[?25hdone\n"," Created wheel for sacremoses: filename=sacremoses-0.0.53-py3-none-any.whl size=895260 sha256=47d211902d48fbf93e6c1e701400ce9a0d097557d10ba112c6412abda8f93abe\n"," Stored in directory: /root/.cache/pip/wheels/87/39/dd/a83eeef36d0bf98e7a4d1933a4ad2d660295a40613079bafc9\n"," Building wheel for seqeval (setup.py) ... \u001b[?25l\u001b[?25hdone\n"," Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16180 sha256=c325eef1ce6ac89a8430038af5d6f149e2e92eaf9a9ad880c7d74f027c738460\n"," Stored in directory: /root/.cache/pip/wheels/05/96/ee/7cac4e74f3b19e3158dce26a20a1c86b3533c43ec72a549fd7\n"," Building wheel for tika (setup.py) ... \u001b[?25l\u001b[?25hdone\n"," Created wheel for tika: filename=tika-1.24-py3-none-any.whl size=32893 sha256=61bb8b103bb3d215af2339c27ad44793b3a7a92182d9658c2f677a9691a583ee\n"," Stored in directory: /root/.cache/pip/wheels/ec/2b/38/58ff05467a742e32f67f5d0de048fa046e764e2fbb25ac93f3\n","Successfully built sentence-transformers langdetect databricks-cli python-docx sacremoses seqeval tika\n","Installing collected packages: smmap, pyyaml, websocket-client, torch, tokenizers, sacremoses, pyjwt, Mako, isodate, huggingface-hub, gitdb, transformers, torchvision, sentencepiece, querystring-parser, prometheus-flask-exporter, num2words, msrest, monotonic, jarowinkler, gunicorn, gitpython, docker, databricks-cli, backoff, azure-core, azure-common, alembic, tika, seqeval, sentence-transformers, rapidfuzz, quantulum3, python-docx, pydantic, posthog, mmh3, mlflow, langdetect, elasticsearch, elastic-apm, azure-ai-formrecognizer, sqlalchemy-utils, psycopg2-binary, farm-haystack, faiss-gpu\n"," Attempting uninstall: pyyaml\n"," Found existing installation: PyYAML 3.13\n"," Uninstalling PyYAML-3.13:\n"," Successfully uninstalled PyYAML-3.13\n"," Attempting uninstall: torch\n"," Found existing installation: torch 1.11.0+cu113\n"," Uninstalling torch-1.11.0+cu113:\n"," Successfully uninstalled torch-1.11.0+cu113\n"," Attempting uninstall: torchvision\n"," Found existing installation: torchvision 0.12.0+cu113\n"," Uninstalling torchvision-0.12.0+cu113:\n"," Successfully uninstalled torchvision-0.12.0+cu113\n","\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n","torchtext 0.12.0 requires torch==1.11.0, but you have torch 1.10.2 which is incompatible.\n","torchaudio 0.11.0+cu113 requires torch==1.11.0, but you have torch 1.10.2 which is incompatible.\u001b[0m\n","Successfully installed Mako-1.2.0 alembic-1.7.7 azure-ai-formrecognizer-3.2.0b2 azure-common-1.1.28 azure-core-1.22.1 backoff-1.11.1 databricks-cli-0.16.6 docker-5.0.3 elastic-apm-6.9.1 elasticsearch-7.10.0 faiss-gpu-1.7.2 farm-haystack-1.4.0 gitdb-4.0.9 gitpython-3.1.27 gunicorn-20.1.0 huggingface-hub-0.5.1 isodate-0.6.1 jarowinkler-1.0.2 langdetect-1.0.9 mlflow-1.25.1 mmh3-3.0.0 monotonic-1.6 msrest-0.6.21 num2words-0.5.10 posthog-1.4.7 prometheus-flask-exporter-0.20.1 psycopg2-binary-2.9.3 pydantic-1.9.0 pyjwt-2.3.0 python-docx-0.8.11 pyyaml-6.0 quantulum3-0.7.10 querystring-parser-1.2.4 rapidfuzz-2.0.11 sacremoses-0.0.53 sentence-transformers-2.2.0 sentencepiece-0.1.96 seqeval-1.2.2 smmap-5.0.0 sqlalchemy-utils-0.38.2 tika-1.24 tokenizers-0.10.3 torch-1.10.2 torchvision-0.11.3 transformers-4.13.0 websocket-client-1.3.2\n"]}]},{"cell_type":"markdown","source":["## Load data"],"metadata":{"id":"QVDuHAMIK4bg"},"id":"QVDuHAMIK4bg"},{"cell_type":"code","execution_count":3,"id":"72139774","metadata":{"execution":{"iopub.execute_input":"2022-01-09T08:40:46.176031Z","iopub.status.busy":"2022-01-09T08:40:46.175755Z","iopub.status.idle":"2022-01-09T08:40:46.179554Z","shell.execute_reply":"2022-01-09T08:40:46.178704Z","shell.execute_reply.started":"2022-01-09T08:40:46.175959Z"},"id":"72139774","executionInfo":{"status":"ok","timestamp":1652189651625,"user_tz":-120,"elapsed":32,"user":{"displayName":"Stefano Fiorucci","userId":"12409279692445770059"}}},"outputs":[],"source":["import glob\n","import json"]},{"cell_type":"code","execution_count":4,"id":"4421e328","metadata":{"execution":{"iopub.execute_input":"2022-01-09T08:40:47.846999Z","iopub.status.busy":"2022-01-09T08:40:47.846757Z","iopub.status.idle":"2022-01-09T08:40:48.327632Z","shell.execute_reply":"2022-01-09T08:40:48.326829Z","shell.execute_reply.started":"2022-01-09T08:40:47.846975Z"},"id":"4421e328","executionInfo":{"status":"ok","timestamp":1652189675961,"user_tz":-120,"elapsed":24363,"user":{"displayName":"Stefano Fiorucci","userId":"12409279692445770059"}}},"outputs":[],"source":["DATA_DIRECTORY = '/content/drive/MyDrive/Colab Notebooks/wklp/data'\n","\n","docs=[]\n","\n","for json_file in glob.glob(f'{DATA_DIRECTORY}/*.json'):\n"," with open(json_file, 'r') as fin:\n"," json_content=json.load(fin)\n"," \n"," doc={'content': json_content['text'],\n"," 'meta': {'name': json_content['name'],\n"," 'url': json_content['url']}}\n"," docs.append(doc)"]},{"cell_type":"code","source":["len(docs)"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"GR6qWQAn72WG","executionInfo":{"status":"ok","timestamp":1652189679928,"user_tz":-120,"elapsed":9,"user":{"displayName":"Stefano Fiorucci","userId":"12409279692445770059"}},"outputId":"3e17336f-1145-43ff-c3ca-fab7604343d1"},"id":"GR6qWQAn72WG","execution_count":5,"outputs":[{"output_type":"execute_result","data":{"text/plain":["1087"]},"metadata":{},"execution_count":5}]},{"cell_type":"code","execution_count":6,"id":"aa231b94","metadata":{"execution":{"iopub.execute_input":"2022-01-09T08:40:48.796741Z","iopub.status.busy":"2022-01-09T08:40:48.796550Z","iopub.status.idle":"2022-01-09T08:40:48.805224Z","shell.execute_reply":"2022-01-09T08:40:48.804705Z","shell.execute_reply.started":"2022-01-09T08:40:48.796722Z"},"colab":{"base_uri":"https://localhost:8080/"},"id":"aa231b94","executionInfo":{"status":"ok","timestamp":1652189681394,"user_tz":-120,"elapsed":10,"user":{"displayName":"Stefano Fiorucci","userId":"12409279692445770059"}},"outputId":"a42147fb-b9a4-4500-cc96-ce73177030f9"},"outputs":[{"output_type":"execute_result","data":{"text/plain":["{'content': \"Pete Lindstrom\\nPete Lindstrom was a citizen of Twin Peaks, Washington who was killed in the Blizzard of 1889.\\nHis death was witnessed by Knut Zimmerman, who reported that wind had plunged a candle from the Annual Candlelighting and Christmas Tree Ceremony into the back of Lindstrom's head, killing him.\",\n"," 'meta': {'name': 'Pete_Lindstrom',\n"," 'url': 'https://twinpeaks.fandom.com/wiki/Pete_Lindstrom'}}"]},"metadata":{},"execution_count":6}],"source":["docs[5]"]},{"cell_type":"markdown","source":["## Define document store ([FAISS](https://github.com/facebookresearch/faiss)) and write documents\n","\n"],"metadata":{"id":"Yu3bAUPoLrPI"},"id":"Yu3bAUPoLrPI"},{"cell_type":"code","execution_count":8,"id":"bfe846df","metadata":{"execution":{"iopub.execute_input":"2022-01-09T08:40:59.678181Z","iopub.status.busy":"2022-01-09T08:40:59.678003Z","iopub.status.idle":"2022-01-09T08:40:59.753228Z","shell.execute_reply":"2022-01-09T08:40:59.752500Z","shell.execute_reply.started":"2022-01-09T08:40:59.678161Z"},"id":"bfe846df","executionInfo":{"status":"ok","timestamp":1652190218453,"user_tz":-120,"elapsed":10410,"user":{"displayName":"Stefano Fiorucci","userId":"12409279692445770059"}},"colab":{"base_uri":"https://localhost:8080/"},"outputId":"187c2d40-470a-4f87-ab50-ec4082bccb33"},"outputs":[{"output_type":"stream","name":"stderr","text":["INFO - haystack.modeling.model.optimization - apex not found, won't use it. See https://nvidia.github.io/apex/\n","ERROR - root - Failed to import 'magic' (from 'python-magic' and 'python-magic-bin' on Windows). FileTypeClassifier will not perform mimetype detection on extensionless files. Please make sure the necessary OS libraries are installed if you need this functionality.\n","INFO - haystack.telemetry - Haystack sends anonymous usage data to understand the actual usage and steer dev efforts towards features that are most meaningful to users. You can opt-out at anytime by calling disable_telemetry() or by manually setting the environment variable HAYSTACK_TELEMETRY_ENABLED as described for different operating systems on the documentation page. More information at https://haystack.deepset.ai/guides/telemetry\n"]}],"source":["from haystack.document_stores import FAISSDocumentStore\n","\n","# the document store settings are those compatible with Embedding Retriever\n","document_store = FAISSDocumentStore(\n"," similarity=\"dot_product\",\n"," embedding_dim=768)"]},{"cell_type":"code","execution_count":9,"id":"bc5adb1c","metadata":{"execution":{"iopub.execute_input":"2022-01-09T08:41:04.538529Z","iopub.status.busy":"2022-01-09T08:41:04.538227Z","iopub.status.idle":"2022-01-09T08:41:05.147190Z","shell.execute_reply":"2022-01-09T08:41:05.146513Z","shell.execute_reply.started":"2022-01-09T08:41:04.538503Z"},"colab":{"base_uri":"https://localhost:8080/"},"id":"bc5adb1c","executionInfo":{"status":"ok","timestamp":1652190317389,"user_tz":-120,"elapsed":2085,"user":{"displayName":"Stefano Fiorucci","userId":"12409279692445770059"}},"outputId":"4cc11a2d-5ce5-41c1-e5eb-a0ee411ab00b"},"outputs":[{"output_type":"stream","name":"stdout","text":["[nltk_data] Downloading package punkt to /root/nltk_data...\n","[nltk_data] Unzipping tokenizers/punkt.zip.\n"]},{"output_type":"stream","name":"stderr","text":[" 0%| | 0/1087 [00:00\n"]}],"source":["print(preprocessed_docs[5])\n"]},{"cell_type":"code","source":["len(preprocessed_docs)"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"b9PS0PkM_1EF","executionInfo":{"status":"ok","timestamp":1652190343399,"user_tz":-120,"elapsed":370,"user":{"displayName":"Stefano Fiorucci","userId":"12409279692445770059"}},"outputId":"25fba54f-46d9-4c53-b0c1-15e8a878cad0"},"id":"b9PS0PkM_1EF","execution_count":12,"outputs":[{"output_type":"execute_result","data":{"text/plain":["2825"]},"metadata":{},"execution_count":12}]},{"cell_type":"code","execution_count":81,"id":"191144b4","metadata":{"execution":{"iopub.execute_input":"2022-01-09T08:41:10.695292Z","iopub.status.busy":"2022-01-09T08:41:10.695064Z","iopub.status.idle":"2022-01-09T08:41:22.144864Z","shell.execute_reply":"2022-01-09T08:41:22.144203Z","shell.execute_reply.started":"2022-01-09T08:41:10.695271Z"},"colab":{"base_uri":"https://localhost:8080/","height":49,"referenced_widgets":["425730d860514e2d87c0870cbb943842","06c58f8fc29343fa96e36d5b1f8dd078","046fa73af99645cc88b49c0f3e5f96b7","e256a26a0f41436a9755c56f3ffebd11","1e2bf8bf2ab14c9e880c06b04f752a1b","1377c76f1051467fb391c2c0119b0634","4d4babe9fcb24dd7996ecbeb7006018f","ff4bc8be1b8041e6a116bc37e366bf96","e004a6c61f2d4e1d8e9d02c51dcc6ebd","88c675dce7bd4247842ffeb6470d31dd","1d447ec86fe84008b29495ecb78a7fac"]},"id":"191144b4","executionInfo":{"status":"ok","timestamp":1652179167100,"user_tz":-120,"elapsed":11491,"user":{"displayName":"Stefano Fiorucci","userId":"12409279692445770059"}},"outputId":"c30f2216-2c6c-4f28-867c-dfc0bd76bc09"},"outputs":[{"output_type":"display_data","data":{"text/plain":["Writing Documents: 0%| | 0/2825 [00:00\n"]}],"source":["print(preprocessed_docs[5])\n"]},{"cell_type":"code","execution_count":12,"id":"b9PS0PkM_1EF","metadata":{"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":370,"status":"ok","timestamp":1652190343399,"user":{"displayName":"Stefano Fiorucci","userId":"12409279692445770059"},"user_tz":-120},"id":"b9PS0PkM_1EF","outputId":"25fba54f-46d9-4c53-b0c1-15e8a878cad0"},"outputs":[{"data":{"text/plain":["2825"]},"execution_count":12,"metadata":{},"output_type":"execute_result"}],"source":["len(preprocessed_docs)"]},{"cell_type":"code","execution_count":81,"id":"191144b4","metadata":{"colab":{"base_uri":"https://localhost:8080/","height":49,"referenced_widgets":["425730d860514e2d87c0870cbb943842","06c58f8fc29343fa96e36d5b1f8dd078","046fa73af99645cc88b49c0f3e5f96b7","e256a26a0f41436a9755c56f3ffebd11","1e2bf8bf2ab14c9e880c06b04f752a1b","1377c76f1051467fb391c2c0119b0634","4d4babe9fcb24dd7996ecbeb7006018f","ff4bc8be1b8041e6a116bc37e366bf96","e004a6c61f2d4e1d8e9d02c51dcc6ebd","88c675dce7bd4247842ffeb6470d31dd","1d447ec86fe84008b29495ecb78a7fac"]},"execution":{"iopub.execute_input":"2022-01-09T08:41:10.695292Z","iopub.status.busy":"2022-01-09T08:41:10.695064Z","iopub.status.idle":"2022-01-09T08:41:22.144864Z","shell.execute_reply":"2022-01-09T08:41:22.144203Z","shell.execute_reply.started":"2022-01-09T08:41:10.695271Z"},"executionInfo":{"elapsed":11491,"status":"ok","timestamp":1652179167100,"user":{"displayName":"Stefano Fiorucci","userId":"12409279692445770059"},"user_tz":-120},"id":"191144b4","outputId":"c30f2216-2c6c-4f28-867c-dfc0bd76bc09"},"outputs":[{"data":{"application/vnd.jupyter.widget-view+json":{"model_id":"425730d860514e2d87c0870cbb943842","version_major":2,"version_minor":0},"text/plain":["Writing Documents: 0%| | 0/2825 [00:00