File size: 118,371 Bytes

87b6881

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "# !python3 -m venv env \n",
    "# !source env/bin/activate \n",
    "# !pip3 install langchain\n",
    "# !pip3 install pypdf2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Collecting transformers\n",
      "  Downloading transformers-4.44.2-py3-none-any.whl.metadata (43 kB)\n",
      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m43.7/43.7 kB\u001b[0m \u001b[31m198.7 kB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0ma \u001b[36m0:00:01\u001b[0m\n",
      "\u001b[?25hCollecting filelock (from transformers)\n",
      "  Downloading filelock-3.16.0-py3-none-any.whl.metadata (3.0 kB)\n",
      "Collecting huggingface-hub<1.0,>=0.23.2 (from transformers)\n",
      "  Downloading huggingface_hub-0.24.6-py3-none-any.whl.metadata (13 kB)\n",
      "Requirement already satisfied: numpy>=1.17 in ./env/lib/python3.11/site-packages (from transformers) (1.26.4)\n",
      "Requirement already satisfied: packaging>=20.0 in ./env/lib/python3.11/site-packages (from transformers) (24.1)\n",
      "Requirement already satisfied: pyyaml>=5.1 in ./env/lib/python3.11/site-packages (from transformers) (6.0.2)\n",
      "Collecting regex!=2019.12.17 (from transformers)\n",
      "  Downloading regex-2024.7.24-cp311-cp311-macosx_11_0_arm64.whl.metadata (40 kB)\n",
      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m40.5/40.5 kB\u001b[0m \u001b[31m992.0 kB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m \u001b[36m0:00:01\u001b[0m\n",
      "\u001b[?25hRequirement already satisfied: requests in ./env/lib/python3.11/site-packages (from transformers) (2.32.3)\n",
      "Collecting safetensors>=0.4.1 (from transformers)\n",
      "  Downloading safetensors-0.4.5-cp311-cp311-macosx_11_0_arm64.whl.metadata (3.8 kB)\n",
      "Collecting tokenizers<0.20,>=0.19 (from transformers)\n",
      "  Downloading tokenizers-0.19.1-cp311-cp311-macosx_11_0_arm64.whl.metadata (6.7 kB)\n",
      "Collecting tqdm>=4.27 (from transformers)\n",
      "  Downloading tqdm-4.66.5-py3-none-any.whl.metadata (57 kB)\n",
      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m57.6/57.6 kB\u001b[0m \u001b[31m966.7 kB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m \u001b[36m0:00:01\u001b[0m\n",
      "\u001b[?25hCollecting fsspec>=2023.5.0 (from huggingface-hub<1.0,>=0.23.2->transformers)\n",
      "  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)\n",
      "Requirement already satisfied: typing-extensions>=3.7.4.3 in ./env/lib/python3.11/site-packages (from huggingface-hub<1.0,>=0.23.2->transformers) (4.12.2)\n",
      "Requirement already satisfied: charset-normalizer<4,>=2 in ./env/lib/python3.11/site-packages (from requests->transformers) (3.3.2)\n",
      "Requirement already satisfied: idna<4,>=2.5 in ./env/lib/python3.11/site-packages (from requests->transformers) (3.8)\n",
      "Requirement already satisfied: urllib3<3,>=1.21.1 in ./env/lib/python3.11/site-packages (from requests->transformers) (2.2.2)\n",
      "Requirement already satisfied: certifi>=2017.4.17 in ./env/lib/python3.11/site-packages (from requests->transformers) (2024.7.4)\n",
      "Downloading transformers-4.44.2-py3-none-any.whl (9.5 MB)\n",
      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m9.5/9.5 MB\u001b[0m \u001b[31m5.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m0m\n",
      "\u001b[?25hDownloading huggingface_hub-0.24.6-py3-none-any.whl (417 kB)\n",
      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m417.5/417.5 kB\u001b[0m \u001b[31m14.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
      "\u001b[?25hDownloading regex-2024.7.24-cp311-cp311-macosx_11_0_arm64.whl (278 kB)\n",
      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m278.9/278.9 kB\u001b[0m \u001b[31m13.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
      "\u001b[?25hDownloading safetensors-0.4.5-cp311-cp311-macosx_11_0_arm64.whl (381 kB)\n",
      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m381.5/381.5 kB\u001b[0m \u001b[31m13.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
      "\u001b[?25hDownloading tokenizers-0.19.1-cp311-cp311-macosx_11_0_arm64.whl (2.4 MB)\n",
      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m2.4/2.4 MB\u001b[0m \u001b[31m3.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0ma \u001b[36m0:00:01\u001b[0mm\n",
      "\u001b[?25hDownloading tqdm-4.66.5-py3-none-any.whl (78 kB)\n",
      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m78.4/78.4 kB\u001b[0m \u001b[31m8.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
      "\u001b[?25hDownloading filelock-3.16.0-py3-none-any.whl (16 kB)\n",
      "Downloading fsspec-2024.9.0-py3-none-any.whl (179 kB)\n",
      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m179.3/179.3 kB\u001b[0m \u001b[31m10.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
      "\u001b[?25hInstalling collected packages: tqdm, safetensors, regex, fsspec, filelock, huggingface-hub, tokenizers, transformers\n",
      "Successfully installed filelock-3.16.0 fsspec-2024.9.0 huggingface-hub-0.24.6 regex-2024.7.24 safetensors-0.4.5 tokenizers-0.19.1 tqdm-4.66.5 transformers-4.44.2\n",
      "\n",
      "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m23.3.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m24.2\u001b[0m\n",
      "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n"
     ]
    }
   ],
   "source": [
    "!pip install transformers\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Collecting tensorflow\n",
      "  Downloading tensorflow-2.17.0-cp311-cp311-macosx_12_0_arm64.whl.metadata (4.1 kB)\n",
      "Collecting absl-py>=1.0.0 (from tensorflow)\n",
      "  Downloading absl_py-2.1.0-py3-none-any.whl.metadata (2.3 kB)\n",
      "Collecting astunparse>=1.6.0 (from tensorflow)\n",
      "  Downloading astunparse-1.6.3-py2.py3-none-any.whl.metadata (4.4 kB)\n",
      "Collecting flatbuffers>=24.3.25 (from tensorflow)\n",
      "  Downloading flatbuffers-24.3.25-py2.py3-none-any.whl.metadata (850 bytes)\n",
      "Collecting gast!=0.5.0,!=0.5.1,!=0.5.2,>=0.2.1 (from tensorflow)\n",
      "  Downloading gast-0.6.0-py3-none-any.whl.metadata (1.3 kB)\n",
      "Collecting google-pasta>=0.1.1 (from tensorflow)\n",
      "  Downloading google_pasta-0.2.0-py3-none-any.whl.metadata (814 bytes)\n",
      "Collecting h5py>=3.10.0 (from tensorflow)\n",
      "  Downloading h5py-3.11.0-cp311-cp311-macosx_11_0_arm64.whl.metadata (2.5 kB)\n",
      "Collecting libclang>=13.0.0 (from tensorflow)\n",
      "  Downloading libclang-18.1.1-1-py2.py3-none-macosx_11_0_arm64.whl.metadata (5.2 kB)\n",
      "Collecting ml-dtypes<0.5.0,>=0.3.1 (from tensorflow)\n",
      "  Downloading ml_dtypes-0.4.0-cp311-cp311-macosx_10_9_universal2.whl.metadata (20 kB)\n",
      "Collecting opt-einsum>=2.3.2 (from tensorflow)\n",
      "  Downloading opt_einsum-3.3.0-py3-none-any.whl.metadata (6.5 kB)\n",
      "Requirement already satisfied: packaging in ./env/lib/python3.11/site-packages (from tensorflow) (24.1)\n",
      "Collecting protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<5.0.0dev,>=3.20.3 (from tensorflow)\n",
      "  Downloading protobuf-4.25.4-cp37-abi3-macosx_10_9_universal2.whl.metadata (541 bytes)\n",
      "Requirement already satisfied: requests<3,>=2.21.0 in ./env/lib/python3.11/site-packages (from tensorflow) (2.32.3)\n",
      "Requirement already satisfied: setuptools in ./env/lib/python3.11/site-packages (from tensorflow) (68.2.2)\n",
      "Requirement already satisfied: six>=1.12.0 in ./env/lib/python3.11/site-packages (from tensorflow) (1.16.0)\n",
      "Collecting termcolor>=1.1.0 (from tensorflow)\n",
      "  Downloading termcolor-2.4.0-py3-none-any.whl.metadata (6.1 kB)\n",
      "Requirement already satisfied: typing-extensions>=3.6.6 in ./env/lib/python3.11/site-packages (from tensorflow) (4.12.2)\n",
      "Collecting wrapt>=1.11.0 (from tensorflow)\n",
      "  Downloading wrapt-1.16.0-cp311-cp311-macosx_11_0_arm64.whl.metadata (6.6 kB)\n",
      "Collecting grpcio<2.0,>=1.24.3 (from tensorflow)\n",
      "  Downloading grpcio-1.66.1-cp311-cp311-macosx_10_9_universal2.whl.metadata (3.9 kB)\n",
      "Collecting tensorboard<2.18,>=2.17 (from tensorflow)\n",
      "  Downloading tensorboard-2.17.1-py3-none-any.whl.metadata (1.6 kB)\n",
      "Collecting keras>=3.2.0 (from tensorflow)\n",
      "  Downloading keras-3.5.0-py3-none-any.whl.metadata (5.8 kB)\n",
      "Collecting tensorflow-io-gcs-filesystem>=0.23.1 (from tensorflow)\n",
      "  Downloading tensorflow_io_gcs_filesystem-0.37.1-cp311-cp311-macosx_12_0_arm64.whl.metadata (14 kB)\n",
      "Requirement already satisfied: numpy<2.0.0,>=1.23.5 in ./env/lib/python3.11/site-packages (from tensorflow) (1.26.4)\n",
      "Collecting wheel<1.0,>=0.23.0 (from astunparse>=1.6.0->tensorflow)\n",
      "  Downloading wheel-0.44.0-py3-none-any.whl.metadata (2.3 kB)\n",
      "Collecting rich (from keras>=3.2.0->tensorflow)\n",
      "  Downloading rich-13.8.1-py3-none-any.whl.metadata (18 kB)\n",
      "Collecting namex (from keras>=3.2.0->tensorflow)\n",
      "  Downloading namex-0.0.8-py3-none-any.whl.metadata (246 bytes)\n",
      "Collecting optree (from keras>=3.2.0->tensorflow)\n",
      "  Downloading optree-0.12.1-cp311-cp311-macosx_11_0_arm64.whl.metadata (47 kB)\n",
      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m47.8/47.8 kB\u001b[0m \u001b[31m263.9 kB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0ma \u001b[36m0:00:01\u001b[0m\n",
      "\u001b[?25hRequirement already satisfied: charset-normalizer<4,>=2 in ./env/lib/python3.11/site-packages (from requests<3,>=2.21.0->tensorflow) (3.3.2)\n",
      "Requirement already satisfied: idna<4,>=2.5 in ./env/lib/python3.11/site-packages (from requests<3,>=2.21.0->tensorflow) (3.8)\n",
      "Requirement already satisfied: urllib3<3,>=1.21.1 in ./env/lib/python3.11/site-packages (from requests<3,>=2.21.0->tensorflow) (2.2.2)\n",
      "Requirement already satisfied: certifi>=2017.4.17 in ./env/lib/python3.11/site-packages (from requests<3,>=2.21.0->tensorflow) (2024.7.4)\n",
      "Collecting markdown>=2.6.8 (from tensorboard<2.18,>=2.17->tensorflow)\n",
      "  Downloading Markdown-3.7-py3-none-any.whl.metadata (7.0 kB)\n",
      "Collecting tensorboard-data-server<0.8.0,>=0.7.0 (from tensorboard<2.18,>=2.17->tensorflow)\n",
      "  Downloading tensorboard_data_server-0.7.2-py3-none-any.whl.metadata (1.1 kB)\n",
      "Collecting werkzeug>=1.0.1 (from tensorboard<2.18,>=2.17->tensorflow)\n",
      "  Downloading werkzeug-3.0.4-py3-none-any.whl.metadata (3.7 kB)\n",
      "Collecting MarkupSafe>=2.1.1 (from werkzeug>=1.0.1->tensorboard<2.18,>=2.17->tensorflow)\n",
      "  Downloading MarkupSafe-2.1.5-cp311-cp311-macosx_10_9_universal2.whl.metadata (3.0 kB)\n",
      "Collecting markdown-it-py>=2.2.0 (from rich->keras>=3.2.0->tensorflow)\n",
      "  Downloading markdown_it_py-3.0.0-py3-none-any.whl.metadata (6.9 kB)\n",
      "Requirement already satisfied: pygments<3.0.0,>=2.13.0 in ./env/lib/python3.11/site-packages (from rich->keras>=3.2.0->tensorflow) (2.18.0)\n",
      "Collecting mdurl~=0.1 (from markdown-it-py>=2.2.0->rich->keras>=3.2.0->tensorflow)\n",
      "  Downloading mdurl-0.1.2-py3-none-any.whl.metadata (1.6 kB)\n",
      "Downloading tensorflow-2.17.0-cp311-cp311-macosx_12_0_arm64.whl (236.2 MB)\n",
      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m236.2/236.2 MB\u001b[0m \u001b[31m6.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n",
      "\u001b[?25hDownloading absl_py-2.1.0-py3-none-any.whl (133 kB)\n",
      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m133.7/133.7 kB\u001b[0m \u001b[31m7.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
      "\u001b[?25hDownloading astunparse-1.6.3-py2.py3-none-any.whl (12 kB)\n",
      "Downloading flatbuffers-24.3.25-py2.py3-none-any.whl (26 kB)\n",
      "Downloading gast-0.6.0-py3-none-any.whl (21 kB)\n",
      "Downloading google_pasta-0.2.0-py3-none-any.whl (57 kB)\n",
      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m57.5/57.5 kB\u001b[0m \u001b[31m6.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
      "\u001b[?25hDownloading grpcio-1.66.1-cp311-cp311-macosx_10_9_universal2.whl (10.6 MB)\n",
      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m10.6/10.6 MB\u001b[0m \u001b[31m9.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n",
      "\u001b[?25hDownloading h5py-3.11.0-cp311-cp311-macosx_11_0_arm64.whl (2.9 MB)\n",
      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m2.9/2.9 MB\u001b[0m \u001b[31m9.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0mta \u001b[36m0:00:01\u001b[0m\n",
      "\u001b[?25hDownloading keras-3.5.0-py3-none-any.whl (1.1 MB)\n",
      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.1/1.1 MB\u001b[0m \u001b[31m10.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0ma \u001b[36m0:00:01\u001b[0m\n",
      "\u001b[?25hDownloading libclang-18.1.1-1-py2.py3-none-macosx_11_0_arm64.whl (25.8 MB)\n",
      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m25.8/25.8 MB\u001b[0m \u001b[31m8.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n",
      "\u001b[?25hDownloading ml_dtypes-0.4.0-cp311-cp311-macosx_10_9_universal2.whl (390 kB)\n",
      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m390.9/390.9 kB\u001b[0m \u001b[31m8.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m:00:01\u001b[0m\n",
      "\u001b[?25hDownloading opt_einsum-3.3.0-py3-none-any.whl (65 kB)\n",
      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m65.5/65.5 kB\u001b[0m \u001b[31m7.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
      "\u001b[?25hDownloading protobuf-4.25.4-cp37-abi3-macosx_10_9_universal2.whl (394 kB)\n",
      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m394.2/394.2 kB\u001b[0m \u001b[31m9.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m:00:01\u001b[0m\n",
      "\u001b[?25hDownloading tensorboard-2.17.1-py3-none-any.whl (5.5 MB)\n",
      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.5/5.5 MB\u001b[0m \u001b[31m8.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0mm\n",
      "\u001b[?25hDownloading tensorflow_io_gcs_filesystem-0.37.1-cp311-cp311-macosx_12_0_arm64.whl (3.5 MB)\n",
      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m3.5/3.5 MB\u001b[0m \u001b[31m8.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0mm\n",
      "\u001b[?25hDownloading termcolor-2.4.0-py3-none-any.whl (7.7 kB)\n",
      "Downloading wrapt-1.16.0-cp311-cp311-macosx_11_0_arm64.whl (38 kB)\n",
      "Downloading Markdown-3.7-py3-none-any.whl (106 kB)\n",
      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m106.3/106.3 kB\u001b[0m \u001b[31m11.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
      "\u001b[?25hDownloading tensorboard_data_server-0.7.2-py3-none-any.whl (2.4 kB)\n",
      "Downloading werkzeug-3.0.4-py3-none-any.whl (227 kB)\n",
      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m227.6/227.6 kB\u001b[0m \u001b[31m10.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
      "\u001b[?25hDownloading wheel-0.44.0-py3-none-any.whl (67 kB)\n",
      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m67.1/67.1 kB\u001b[0m \u001b[31m6.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
      "\u001b[?25hDownloading namex-0.0.8-py3-none-any.whl (5.8 kB)\n",
      "Downloading optree-0.12.1-cp311-cp311-macosx_11_0_arm64.whl (283 kB)\n",
      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m283.7/283.7 kB\u001b[0m \u001b[31m9.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
      "\u001b[?25hDownloading rich-13.8.1-py3-none-any.whl (241 kB)\n",
      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m241.6/241.6 kB\u001b[0m \u001b[31m10.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
      "\u001b[?25hDownloading markdown_it_py-3.0.0-py3-none-any.whl (87 kB)\n",
      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m87.5/87.5 kB\u001b[0m \u001b[31m6.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
      "\u001b[?25hDownloading MarkupSafe-2.1.5-cp311-cp311-macosx_10_9_universal2.whl (18 kB)\n",
      "Downloading mdurl-0.1.2-py3-none-any.whl (10.0 kB)\n",
      "Installing collected packages: namex, libclang, flatbuffers, wrapt, wheel, termcolor, tensorflow-io-gcs-filesystem, tensorboard-data-server, protobuf, optree, opt-einsum, ml-dtypes, mdurl, MarkupSafe, markdown, h5py, grpcio, google-pasta, gast, absl-py, werkzeug, markdown-it-py, astunparse, tensorboard, rich, keras, tensorflow\n",
      "Successfully installed MarkupSafe-2.1.5 absl-py-2.1.0 astunparse-1.6.3 flatbuffers-24.3.25 gast-0.6.0 google-pasta-0.2.0 grpcio-1.66.1 h5py-3.11.0 keras-3.5.0 libclang-18.1.1 markdown-3.7 markdown-it-py-3.0.0 mdurl-0.1.2 ml-dtypes-0.4.0 namex-0.0.8 opt-einsum-3.3.0 optree-0.12.1 protobuf-4.25.4 rich-13.8.1 tensorboard-2.17.1 tensorboard-data-server-0.7.2 tensorflow-2.17.0 tensorflow-io-gcs-filesystem-0.37.1 termcolor-2.4.0 werkzeug-3.0.4 wheel-0.44.0 wrapt-1.16.0\n",
      "\n",
      "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m23.3.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m24.2\u001b[0m\n",
      "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n"
     ]
    }
   ],
   "source": [
    "!pip install tensorflow"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Found existing installation: keras 3.5.0\n",
      "Uninstalling keras-3.5.0:\n",
      "  Would remove:\n",
      "    /Users/camilayepes/Documents/bankruptcy_automation/env/lib/python3.11/site-packages/keras-3.5.0.dist-info/*\n",
      "    /Users/camilayepes/Documents/bankruptcy_automation/env/lib/python3.11/site-packages/keras/*\n",
      "Proceed (Y/n)? ^C\n",
      "\u001b[31mERROR: Operation cancelled by user\u001b[0m\u001b[31m\n",
      "\u001b[0mCollecting tf-keras\n",
      "  Downloading tf_keras-2.17.0-py3-none-any.whl.metadata (1.6 kB)\n",
      "Requirement already satisfied: tensorflow<2.18,>=2.17 in ./env/lib/python3.11/site-packages (from tf-keras) (2.17.0)\n",
      "Requirement already satisfied: absl-py>=1.0.0 in ./env/lib/python3.11/site-packages (from tensorflow<2.18,>=2.17->tf-keras) (2.1.0)\n",
      "Requirement already satisfied: astunparse>=1.6.0 in ./env/lib/python3.11/site-packages (from tensorflow<2.18,>=2.17->tf-keras) (1.6.3)\n",
      "Requirement already satisfied: flatbuffers>=24.3.25 in ./env/lib/python3.11/site-packages (from tensorflow<2.18,>=2.17->tf-keras) (24.3.25)\n",
      "Requirement already satisfied: gast!=0.5.0,!=0.5.1,!=0.5.2,>=0.2.1 in ./env/lib/python3.11/site-packages (from tensorflow<2.18,>=2.17->tf-keras) (0.6.0)\n",
      "Requirement already satisfied: google-pasta>=0.1.1 in ./env/lib/python3.11/site-packages (from tensorflow<2.18,>=2.17->tf-keras) (0.2.0)\n",
      "Requirement already satisfied: h5py>=3.10.0 in ./env/lib/python3.11/site-packages (from tensorflow<2.18,>=2.17->tf-keras) (3.11.0)\n",
      "Requirement already satisfied: libclang>=13.0.0 in ./env/lib/python3.11/site-packages (from tensorflow<2.18,>=2.17->tf-keras) (18.1.1)\n",
      "Requirement already satisfied: ml-dtypes<0.5.0,>=0.3.1 in ./env/lib/python3.11/site-packages (from tensorflow<2.18,>=2.17->tf-keras) (0.4.0)\n",
      "Requirement already satisfied: opt-einsum>=2.3.2 in ./env/lib/python3.11/site-packages (from tensorflow<2.18,>=2.17->tf-keras) (3.3.0)\n",
      "Requirement already satisfied: packaging in ./env/lib/python3.11/site-packages (from tensorflow<2.18,>=2.17->tf-keras) (24.1)\n",
      "Requirement already satisfied: protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<5.0.0dev,>=3.20.3 in ./env/lib/python3.11/site-packages (from tensorflow<2.18,>=2.17->tf-keras) (4.25.4)\n",
      "Requirement already satisfied: requests<3,>=2.21.0 in ./env/lib/python3.11/site-packages (from tensorflow<2.18,>=2.17->tf-keras) (2.32.3)\n",
      "Requirement already satisfied: setuptools in ./env/lib/python3.11/site-packages (from tensorflow<2.18,>=2.17->tf-keras) (68.2.2)\n",
      "Requirement already satisfied: six>=1.12.0 in ./env/lib/python3.11/site-packages (from tensorflow<2.18,>=2.17->tf-keras) (1.16.0)\n",
      "Requirement already satisfied: termcolor>=1.1.0 in ./env/lib/python3.11/site-packages (from tensorflow<2.18,>=2.17->tf-keras) (2.4.0)\n",
      "Requirement already satisfied: typing-extensions>=3.6.6 in ./env/lib/python3.11/site-packages (from tensorflow<2.18,>=2.17->tf-keras) (4.12.2)\n",
      "Requirement already satisfied: wrapt>=1.11.0 in ./env/lib/python3.11/site-packages (from tensorflow<2.18,>=2.17->tf-keras) (1.16.0)\n",
      "Requirement already satisfied: grpcio<2.0,>=1.24.3 in ./env/lib/python3.11/site-packages (from tensorflow<2.18,>=2.17->tf-keras) (1.66.1)\n",
      "Requirement already satisfied: tensorboard<2.18,>=2.17 in ./env/lib/python3.11/site-packages (from tensorflow<2.18,>=2.17->tf-keras) (2.17.1)\n",
      "Requirement already satisfied: keras>=3.2.0 in ./env/lib/python3.11/site-packages (from tensorflow<2.18,>=2.17->tf-keras) (3.5.0)\n",
      "Requirement already satisfied: tensorflow-io-gcs-filesystem>=0.23.1 in ./env/lib/python3.11/site-packages (from tensorflow<2.18,>=2.17->tf-keras) (0.37.1)\n",
      "Requirement already satisfied: numpy<2.0.0,>=1.23.5 in ./env/lib/python3.11/site-packages (from tensorflow<2.18,>=2.17->tf-keras) (1.26.4)\n",
      "Requirement already satisfied: wheel<1.0,>=0.23.0 in ./env/lib/python3.11/site-packages (from astunparse>=1.6.0->tensorflow<2.18,>=2.17->tf-keras) (0.44.0)\n",
      "Requirement already satisfied: rich in ./env/lib/python3.11/site-packages (from keras>=3.2.0->tensorflow<2.18,>=2.17->tf-keras) (13.8.1)\n",
      "Requirement already satisfied: namex in ./env/lib/python3.11/site-packages (from keras>=3.2.0->tensorflow<2.18,>=2.17->tf-keras) (0.0.8)\n",
      "Requirement already satisfied: optree in ./env/lib/python3.11/site-packages (from keras>=3.2.0->tensorflow<2.18,>=2.17->tf-keras) (0.12.1)\n",
      "Requirement already satisfied: charset-normalizer<4,>=2 in ./env/lib/python3.11/site-packages (from requests<3,>=2.21.0->tensorflow<2.18,>=2.17->tf-keras) (3.3.2)\n",
      "Requirement already satisfied: idna<4,>=2.5 in ./env/lib/python3.11/site-packages (from requests<3,>=2.21.0->tensorflow<2.18,>=2.17->tf-keras) (3.8)\n",
      "Requirement already satisfied: urllib3<3,>=1.21.1 in ./env/lib/python3.11/site-packages (from requests<3,>=2.21.0->tensorflow<2.18,>=2.17->tf-keras) (2.2.2)\n",
      "Requirement already satisfied: certifi>=2017.4.17 in ./env/lib/python3.11/site-packages (from requests<3,>=2.21.0->tensorflow<2.18,>=2.17->tf-keras) (2024.7.4)\n",
      "Requirement already satisfied: markdown>=2.6.8 in ./env/lib/python3.11/site-packages (from tensorboard<2.18,>=2.17->tensorflow<2.18,>=2.17->tf-keras) (3.7)\n",
      "Requirement already satisfied: tensorboard-data-server<0.8.0,>=0.7.0 in ./env/lib/python3.11/site-packages (from tensorboard<2.18,>=2.17->tensorflow<2.18,>=2.17->tf-keras) (0.7.2)\n",
      "Requirement already satisfied: werkzeug>=1.0.1 in ./env/lib/python3.11/site-packages (from tensorboard<2.18,>=2.17->tensorflow<2.18,>=2.17->tf-keras) (3.0.4)\n",
      "Requirement already satisfied: MarkupSafe>=2.1.1 in ./env/lib/python3.11/site-packages (from werkzeug>=1.0.1->tensorboard<2.18,>=2.17->tensorflow<2.18,>=2.17->tf-keras) (2.1.5)\n",
      "Requirement already satisfied: markdown-it-py>=2.2.0 in ./env/lib/python3.11/site-packages (from rich->keras>=3.2.0->tensorflow<2.18,>=2.17->tf-keras) (3.0.0)\n",
      "Requirement already satisfied: pygments<3.0.0,>=2.13.0 in ./env/lib/python3.11/site-packages (from rich->keras>=3.2.0->tensorflow<2.18,>=2.17->tf-keras) (2.18.0)\n",
      "Requirement already satisfied: mdurl~=0.1 in ./env/lib/python3.11/site-packages (from markdown-it-py>=2.2.0->rich->keras>=3.2.0->tensorflow<2.18,>=2.17->tf-keras) (0.1.2)\n",
      "Downloading tf_keras-2.17.0-py3-none-any.whl (1.7 MB)\n",
      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.7/1.7 MB\u001b[0m \u001b[31m1.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0ma \u001b[36m0:00:01\u001b[0m0m\n",
      "\u001b[?25hInstalling collected packages: tf-keras\n",
      "Successfully installed tf-keras-2.17.0\n",
      "\n",
      "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m23.3.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m24.2\u001b[0m\n",
      "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n"
     ]
    }
   ],
   "source": [
    "!pip uninstall keras\n",
    "!pip install tf-keras"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "All PyTorch model weights were used when initializing TFDistilBertForQuestionAnswering.\n",
      "\n",
      "All the weights of TFDistilBertForQuestionAnswering were initialized from the PyTorch model.\n",
      "If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertForQuestionAnswering for predictions without further training.\n",
      "/Users/camilayepes/Documents/bankruptcy_automation/env/lib/python3.11/site-packages/transformers/tokenization_utils_base.py:1601: FutureWarning: `clean_up_tokenization_spaces` was not set. It will be set to `True` by default. This behavior will be depracted in transformers v4.45, and will be then set to `False` by default. For more details check this issue: https://github.com/huggingface/transformers/issues/31884\n",
      "  warnings.warn(\n"
     ]
    }
   ],
   "source": [
    "from transformers import pipeline\n",
    "\n",
    "# Load the pre-trained model\n",
    "nlp = pipeline(\"question-answering\", model=\"distilbert-base-uncased-distilled-squad\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "context = \"\"\"\n",
    "Hugging Face is an AI company specializing in Natural Language Processing (NLP). Their transformers library is widely used for various NLP tasks, including text classification, question answering, and summarization.\n",
    "\"\"\"\n",
    "question = \"What does Hugging Face specialize in?\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'score': 0.551713228225708, 'start': 47, 'end': 74, 'answer': 'Natural Language Processing'}\n"
     ]
    }
   ],
   "source": [
    "result = nlp(question=question, context=context)\n",
    "print(result)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "import PyPDF2\n",
    "import re"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 349,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
      "To disable this warning, you can either:\n",
      "\t- Avoid using `tokenizers` before the fork if possible\n",
      "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Collecting pdfplumber\n",
      "  Downloading pdfplumber-0.11.4-py3-none-any.whl.metadata (41 kB)\n",
      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m42.0/42.0 kB\u001b[0m \u001b[31m193.1 kB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0ma \u001b[36m0:00:01\u001b[0m\n",
      "\u001b[?25hCollecting pdfminer.six==20231228 (from pdfplumber)\n",
      "  Downloading pdfminer.six-20231228-py3-none-any.whl.metadata (4.2 kB)\n",
      "Collecting Pillow>=9.1 (from pdfplumber)\n",
      "  Downloading pillow-10.4.0-cp311-cp311-macosx_11_0_arm64.whl.metadata (9.2 kB)\n",
      "Collecting pypdfium2>=4.18.0 (from pdfplumber)\n",
      "  Downloading pypdfium2-4.30.0-py3-none-macosx_11_0_arm64.whl.metadata (48 kB)\n",
      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m48.5/48.5 kB\u001b[0m \u001b[31m955.1 kB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m \u001b[36m0:00:01\u001b[0m\n",
      "\u001b[?25hRequirement already satisfied: charset-normalizer>=2.0.0 in ./env/lib/python3.11/site-packages (from pdfminer.six==20231228->pdfplumber) (3.3.2)\n",
      "Collecting cryptography>=36.0.0 (from pdfminer.six==20231228->pdfplumber)\n",
      "  Downloading cryptography-43.0.1-cp39-abi3-macosx_10_9_universal2.whl.metadata (5.4 kB)\n",
      "Collecting cffi>=1.12 (from cryptography>=36.0.0->pdfminer.six==20231228->pdfplumber)\n",
      "  Downloading cffi-1.17.1-cp311-cp311-macosx_11_0_arm64.whl.metadata (1.5 kB)\n",
      "Collecting pycparser (from cffi>=1.12->cryptography>=36.0.0->pdfminer.six==20231228->pdfplumber)\n",
      "  Downloading pycparser-2.22-py3-none-any.whl.metadata (943 bytes)\n",
      "Downloading pdfplumber-0.11.4-py3-none-any.whl (59 kB)\n",
      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m59.2/59.2 kB\u001b[0m \u001b[31m1.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0ma \u001b[36m0:00:01\u001b[0m\n",
      "\u001b[?25hDownloading pdfminer.six-20231228-py3-none-any.whl (5.6 MB)\n",
      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.6/5.6 MB\u001b[0m \u001b[31m4.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m0m\n",
      "\u001b[?25hDownloading pillow-10.4.0-cp311-cp311-macosx_11_0_arm64.whl (3.4 MB)\n",
      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m3.4/3.4 MB\u001b[0m \u001b[31m1.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0ma \u001b[36m0:00:01\u001b[0mm\n",
      "\u001b[?25hDownloading pypdfium2-4.30.0-py3-none-macosx_11_0_arm64.whl (2.7 MB)\n",
      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m2.7/2.7 MB\u001b[0m \u001b[31m5.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0ma \u001b[36m0:00:01\u001b[0mm\n",
      "\u001b[?25hDownloading cryptography-43.0.1-cp39-abi3-macosx_10_9_universal2.whl (6.2 MB)\n",
      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m6.2/6.2 MB\u001b[0m \u001b[31m8.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0ma \u001b[36m0:00:01\u001b[0mm\n",
      "\u001b[?25hDownloading cffi-1.17.1-cp311-cp311-macosx_11_0_arm64.whl (178 kB)\n",
      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m178.7/178.7 kB\u001b[0m \u001b[31m6.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
      "\u001b[?25hDownloading pycparser-2.22-py3-none-any.whl (117 kB)\n",
      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m117.6/117.6 kB\u001b[0m \u001b[31m6.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
      "\u001b[?25hInstalling collected packages: pypdfium2, pycparser, Pillow, cffi, cryptography, pdfminer.six, pdfplumber\n",
      "Successfully installed Pillow-10.4.0 cffi-1.17.1 cryptography-43.0.1 pdfminer.six-20231228 pdfplumber-0.11.4 pycparser-2.22 pypdfium2-4.30.0\n",
      "\n",
      "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m23.3.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m24.2\u001b[0m\n",
      "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n"
     ]
    }
   ],
   "source": [
    "!pip install pdfplumber\n",
    "#fitz"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 351,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "UNITED STATES BANKRUPTCY COURT\n",
      "MIDDLE DISTRICT OF GEORGIA\n",
      "IN RE: CHAPTER 13\n",
      "Ashley Unshandra Haygood\n",
      "DEBTOR(S) CASE NO.24−50911−AEC\n",
      "ORDER OF DISMISSAL\n",
      "It appearing to the Court that the trustee has moved to dismiss the above−captioned case, and sent notice of said\n",
      "motion and hearing date to the debtor(s) and debtor's(s') attorney, if any; and a hearing was held on this motion\n",
      "wherein just cause was found to grant the trustee's motion; and it is hereby\n",
      "ORDERED that this case be and the same is dismissed; and it is further\n",
      "ORDERED that the Clerk of this Court take such action as is appropriate to close this case.\n",
      "Dated: 8/26/24 /s/ Austin E. Carter\n",
      "United States Bankruptcy Judge\n",
      "Powered by TCPDF (www.tcpdf.org)\n",
      "1 / 1\n"
     ]
    }
   ],
   "source": [
    "import pdfplumber\n",
    "\n",
    "pdf_path = \"BK Examples/Ashley Unshandra Haygood.pdf\"\n",
    "text_all = \"\"\n",
    "\n",
    "# Open the PDF file\n",
    "with pdfplumber.open(pdf_path) as pdf:\n",
    "    # Iterate through each page\n",
    "    for page in pdf.pages:\n",
    "        text = page.extract_text()\n",
    "        text_all += text\n",
    "\n",
    "print(text_all)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 354,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open(\"BK Examples/Ashley Unshandra Haygood.pdf\", \"rb\") as file:\n",
    "    reader = PyPDF2.PdfReader(file)\n",
    "    text_all = ''\n",
    "    # Extract text from each page\n",
    "    for page_num in range(len(reader.pages)):\n",
    "        page = reader.pages[page_num]\n",
    "        text = page.extract_text()\n",
    "        text_all = text_all +text"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 355,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "\" \\nUNITED STATES BANKRUPTCY COURT\\nMIDDLE DISTRICT OF GEORGIA\\nIN RE:                               CHAPTER 13\\nAshley Unshandra Haygood\\nDEBTOR(S)                               CASE NO.24−50911−AEC\\nORDER OF DISMISSAL\\n          It appearing to the Court that the trustee has moved to dismiss the above−captioned case, and sent notice of said\\nmotion and hearing date to the debtor(s) and debtor's(s') attorney, if any; and a hearing was held on this motion\\nwherein just cause was found to grant the trustee's motion; and it is hereby\\nORDERED  that this case be and the same is dismissed; and it is further\\nORDERED  that the Clerk of this Court take such action as is appropriate to close this case.\\nDated: 8/26/24 /s/ Austin E. Carter\\nUnited States Bankruptcy JudgePowered by TCPDF (www.tcpdf.org)\\n                               1 / 1\""
      ]
     },
     "execution_count": 355,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "text_all"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [],
   "source": [
    "context = \"\"\"\n",
    "Hugging Face is an AI company specializing in Natural Language Processing (NLP). Their transformers library is widely used for various NLP tasks, including text classification, question answering, and summarization.\n",
    "\"\"\"\n",
    "question = \"What is the state according to the country? Like Utah\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'score': 0.0804615467786789, 'start': 330, 'end': 383, 'answer': 'United States Bankruptcy Court   District of UtahDate'}\n"
     ]
    }
   ],
   "source": [
    "result = nlp(question=question, context=text_all)\n",
    "print(result)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [],
   "source": [
    "question = \"Which stage is the proccess at? Answer between petition,discharge or dismissed ?\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'score': 0.3394260108470917, 'start': 1563, 'end': 1578, 'answer': 'chapter 13 plan'}\n"
     ]
    }
   ],
   "source": [
    "result = nlp(question=question, context=text_all)\n",
    "print(result)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "metadata": {},
   "outputs": [],
   "source": [
    "question = \"What are the Social Security number or ITIN?\"\n",
    "question = \"List all the Social Security number or ITIN\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "metadata": {},
   "outputs": [],
   "source": [
    "question = \"What are the Social Security number or ITIN?\"\n",
    "question = \"how many Social Security number or ITIN are in the text?\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'score': 0.7864278554916382, 'start': 137, 'end': 148, 'answer': '461−81−0513'}\n"
     ]
    }
   ],
   "source": [
    "result = nlp(question=question, context=text_all)\n",
    "print(result)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 58,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "All PyTorch model weights were used when initializing TFDistilBertForQuestionAnswering.\n",
      "\n",
      "All the weights of TFDistilBertForQuestionAnswering were initialized from the PyTorch model.\n",
      "If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertForQuestionAnswering for predictions without further training.\n"
     ]
    }
   ],
   "source": [
    "from transformers import DistilBertTokenizer, TFDistilBertForQuestionAnswering\n",
    "import tensorflow as tf\n",
    "\n",
    "tokenizer = DistilBertTokenizer.from_pretrained(\"distilbert-base-cased-distilled-squad\")\n",
    "model = TFDistilBertForQuestionAnswering.from_pretrained(\"distilbert-base-cased-distilled-squad\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 119,
   "metadata": {},
   "outputs": [],
   "source": [
    "question = \"What is the fifth Social Security number or ITIN in the text?\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 123,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "' \\nInformation to identify the case:\\nDebtor 1 Cristina Nelson\\nFirst Name      Middle Name      Last NameSocial Security number or ITIN    461−81−0513\\nEIN    _ _−_ _ _ _ _ _ _\\nDebtor 2\\n(Spouse, if filing)Timothy Nelson\\nFirst Name      Middle Name      Last NameSocial Security number or ITIN    529−97−1200\\nEIN    _ _−_ _ _ _ _ _ _\\nUnited States Bankruptcy Court   District of UtahDate case filed for chapter  13:   8/7/24Case number:   24−23963   JTM\\nOfficial Form 309I\\nNotice of Chapter 13 Bankruptcy Case 10/20\\nFor the debtors listed above, a case has been filed under chapter 13 of the Bankruptcy Code. An order for relief has\\nbeen entered.\\nThis notice has important information about the case for creditors, debtors, and trustees, including information about\\nthe meeting of creditors and deadlines. Read both pages carefully.\\nThe filing of the case imposed an automatic stay against most collection activities. This means that creditors generally may not take action to collect debts\\nfrom the debt'"
      ]
     },
     "execution_count": 123,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from transformers import DistilBertTokenizer, TFDistilBertForQuestionAnswering\n",
    "import tensorflow as tf\n",
    "\n",
    "tokenizer = DistilBertTokenizer.from_pretrained(\"distilbert-base-cased-distilled-squad\")\n",
    "model = TFDistilBertForQuestionAnswering.from_pretrained(\"distilbert-base-cased-distilled-squad\")\n",
    "\n",
    "question = \"What is the fifth Social Security number or ITIN in the text?\"\n",
    "inputs = tokenizer(question, text_all[:3000], return_tensors=\"tf\")\n",
    "outputs = model(**inputs)\n",
    "answer_start_index = int(tf.math.argmax(outputs.start_logits, axis=-1)[0])\n",
    "answer_end_index = int(tf.math.argmax(outputs.end_logits, axis=-1)[0])\n",
    "predict_answer_tokens = inputs.input_ids[0, answer_start_index : answer_end_index + 1]\n",
    "tokenizer.decode(predict_answer_tokens)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 155,
   "metadata": {},
   "outputs": [],
   "source": [
    "inputs = inputs = tokenizer(question, text, return_tensors=\"tf\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 156,
   "metadata": {},
   "outputs": [],
   "source": [
    "input_ids = inputs['input_ids'].numpy()[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 157,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "708"
      ]
     },
     "execution_count": 157,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(input_ids)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 159,
   "metadata": {},
   "outputs": [],
   "source": [
    "chunks = []\n",
    "for start in range(0, len(input_ids), 512 - 2):  # -2 for [CLS] and [SEP]\n",
    "    end = min(start + 512 - 2, len(input_ids))\n",
    "    chunk = input_ids[start:end]\n",
    "    chunks.append(chunk)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 164,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "198"
      ]
     },
     "execution_count": 164,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 165,
   "metadata": {},
   "outputs": [],
   "source": [
    "def chunk_text(text, tokenizer, max_length=512):\n",
    "    # Tokenize the text to get input IDs\n",
    "    inputs = tokenizer(question, text, return_tensors=\"tf\")\n",
    "    ##tokenizer(text, return_tensors=\"tf\", max_length=max_length, truncation=True, padding='max_length')\n",
    "    input_ids = inputs['input_ids'].numpy()[0]\n",
    "    \n",
    "    # Chunk the text\n",
    "    chunks = []\n",
    "    for start in range(0, len(input_ids), max_length - 2):  # -2 for [CLS] and [SEP]\n",
    "        end = min(start + max_length - 2, len(input_ids))\n",
    "        chunk = input_ids[start:end]\n",
    "        chunks.append(chunk)\n",
    "    \n",
    "    return chunks"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 209,
   "metadata": {},
   "outputs": [],
   "source": [
    "def chunk_text(text, tokenizer, max_length=512):\n",
    "    # Tokenize the text into chunks\n",
    "    tokens = tokenizer(text, add_special_tokens=False, return_tensors=None)['input_ids']\n",
    "    chunks = []\n",
    "    for start in range(0, len(tokens), max_length - 20):  # -2 for [CLS] and [SEP]\n",
    "        end = min(start + max_length - 20, len(tokens))\n",
    "        chunk_tokens = tokens[start:end]\n",
    "        chunk_text = tokenizer.convert_ids_to_tokens(chunk_tokens, skip_special_tokens=True)\n",
    "        chunk_text = tokenizer.convert_tokens_to_string(chunk_text)\n",
    "        chunks.append(chunk_text)\n",
    "    \n",
    "    return chunks"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 192,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[\"Information to identify the case : Debtor 1 Cristina Nelson First Name Middle Name Last NameSocial Security number or ITIN 461−81−0513 EIN _ _ − _ _ _ _ _ _ _ Debtor 2 ( Spouse , if filing ) Timothy Nelson First Name Middle Name Last NameSocial Security number or ITIN 529−97−1200 EIN _ _ − _ _ _ _ _ _ _ United States Bankruptcy Court District of UtahDate case filed for chapter 13 : 8 / 7 / 24Case number : 24−23963 JTM Official Form 309I Notice of Chapter 13 Bankruptcy Case 10 / 20 For the debtors listed above , a case has been filed under chapter 13 of the Bankruptcy Code . An order for relief has been entered . This notice has important information about the case for creditors , debtors , and trustees , including information about the meeting of creditors and deadlines . Read both pages carefully . The filing of the case imposed an automatic stay against most collection activities . This means that creditors generally may not take action to collect debts from the debtors , the debtors ' property , and certain codebtors . For example , while the stay is in effect , creditors cannot sue , garnish wages , assert a deficiency , repossess property , or otherwise try to collect from the debtors . Creditors cannot demand repayment from debtors by mail , phone , or otherwise . Creditors who violate the stay can be required to pay actual and punitive damages and attorney ' s fees . Under certain circumstances , the stay may be limited to 30 days or not exist at all , although debtors can ask the court to extend or impose a stay . Confirmation of a chapter 13 plan may result in a discharge . Creditors who assert that the debtors are not entitled to a discharge under 11 U . S . C . § 1328 ( f ) must file a motion objecting to discharge in the bankruptcy clerk ' s office within the deadline specified in this notice . Creditors who want to have their debt excepted from discharge may be required to file a complaint in the bankruptcy clerk ' s office by the same deadline . ( See line 13 below for more information . ) To protect your rights , consult an attorney . All documents filed in the case may be inspected at\",\n",
       " \"the bankruptcy clerk ' s office at the address listed below or through PACER ( Public Access to Court Electronic Records at https : / / pacer . uscourts . gov ) . Case status information is available at no charge by calling the Voice Case Information System ( VCIS ) at 1−866−222−8029 . The staff of the bankruptcy clerk ' s office cannot give legal advice . To help creditors correctly identify debtors , debtors submit full Social Security or Individual Taxpayer Identification Numbers , which may appear on a version of this notice . However , the full numbers must not appear on any document filed with the court . Do not file this notice with any proof of claim or other filing in the case . Do not include more than the last four digits of a Social Security or Individual Taxpayer Identification Number in any document , including attachments , that you file with the court . About Debtor 1 : About Debtor 2 : 1 . Debtor ' s full name Cristina Nelson Timothy Nelson 2 . All other names used in the last 8 years 3 . Address3342 N 1825 E Layton , UT 840401788 E 3350 N Layton , UT 84040 4 . Debtor ' s attorney Name and addressE . Kent Winward 4850 Harrison Blvd . Suite 1 Ogden , UT 84403Contact phone ( 801 ) 392−8200 Email : utahbankruptcyfirm @ gmail . com 5 . Bankruptcy trustee Name and addressLon Jenkins tr Ch . 13 Trustee ' s Office 465 South 400 East Suite 200 Salt Lake City , UT 84111Contact phone 801−596−2884 Email : utahtrusteemail @ ch13ut . org 6 . Bankruptcy clerk ' s office Documents in this case may be filed at this address . You may inspect all records filed in this case at this office or online at https : / / pacer . uscourts . gov . United States Bankruptcy Court District of Utah 350 South Main # 301 Salt Lake City , UT 84101 Clerk of Court : David A . SimeHours open : 8 : 00 AM to 4 : 30 PM , Monday − Friday Contact phone : ( 80\",\n",
       " \"##1 ) 524−6687 Website : www . utb . uscourts . gov Official Form 309I Notice of Chapter 13 Bankruptcy Case page 1 Date Generated : 8 / 14 / 24 For more information , see page 2 > 1 / 2 Debtor Cristina Nelson and Timothy Nelson Case number 24−23963 7 . Meeting of creditors Debtors must attend the meeting to be questioned under oath . In a joint case , both spouses must attend . Creditors may attend , but are not required to do so . The meeting may be continued or adjourned to a later date . All individual debtor ( s ) must provide picture identification and proof of social security number to the trustee . Failure to do so may result in your case being dismissed . Meeting to be held on : Sep . 5 , 2024 at 11 : 00 AM Location : Zoom video meeting . Go to Zoom . us / join , Enter Meeting ID 437 037 6107 , and Passcode 3375693596 , OR call 1−385−832−9074 For additional meeting information go to https : / / www . justice . gov / ust / moc 8 . Deadlines The bankruptcy clerk ' s office must receive these documents and any required filing fee by the following deadlines . Deadline to file a complaint to challenge dischargeability of certain debts : Filing deadline : 11 / 4 / 24 You must file : • a motion if you assert that the debtors are not entitled to receive a discharge under U . S . C . § 1328 ( f ) , or • a complaint if you want to have a particular debt excepted from discharge under 11 U . S . C . § 523 ( a ) ( 2 ) or ( 4 ) . Deadline for all creditors to file a proof of claim ( except governmental units ) : Filing deadline : 10 / 16 / 24 Deadline for governmental units to file a proof of claim : Filing deadline : 2 / 3 / 25 Deadlines for filing proof of claim : A proof of claim is a signed statement describing a creditor ' s claim . A proof of claim form may be obtained at www . uscourts . gov or any bankruptcy clerk ' s office . If you do not file a proof of claim by the deadline , you might not be\",\n",
       " 'paid on your claim . To be paid , you must file a proof of claim even if your claim is listed in the schedules that the debtor filed . Secured creditors retain rights in their collateral regardless of whether they file a proof of claim . Filing a proof of claim submits the creditor to the jurisdiction of the bankruptcy court , with consequences a lawyer can explain . For example , a secured creditor who files a proof of claim may surrender important nonmonetary rights , including the right to a jury trial . Deadline to object to exemptions : The law permits debtors to keep certain property as exempt . If you believe that the law does not authorize an exemption claimed , you may file an objection . Filing deadline : 30 days after the conclusion of the meeting of creditors 9 . Filing of plan and confirmation hearing on docket Objections to ConfirmationThe debtor has filed a plan . The hearing on confirmation will be held on : 10 / 10 / 24 at 10 : 00 AM Location : This meeting is by Zoom . Go to , ZoomGov . com / join or call 1 + ( 669 ) 254−5252 , Enter Meeting ID 161 5478 8875 , Passcode 3834658 Objections to confirmation must be filed and served no later than 7 days before the date set for confirmation . If there are no timely filed objections to confirmation pending or if all objections to confirmation are resolved by a court order or a stipulation signed by the debtor , the trustee and the objecting party , a plan may be confirmed without objection , and the hearing stricken . 10 . Creditors with a foreign addressIf you are a creditor receiving a notice mailed to a foreign address , you may file a motion asking the court to extend the deadline in this notice . Consult an attorney familiar with United States bankruptcy law if you have any questions about your rights in this case . 11 . Filing a chapter 13 bankruptcy caseChapter 13 allows an individual with regular income and debts below a specified amount to adjust debts according to a plan . A plan is not effective unless the court confirms it . You may object to confirmation of the plan and appear at the confirmation hearing . A copy of the plan , if not enclosed , will be sent to you later , and if the confirmation hearing is not indicated on this notice , you will',\n",
       " \"be sent notice of the confirmation hearing . The debtor will remain in possession of the property and may continue to operate the business , if any , unless the court orders otherwise . 12 . Exempt property The law allows debtors to keep certain property as exempt . Fully exempt property will not be sold and distributed to creditors , even if the case is converted to chapter 7 . Debtors must file a list of property claimed as exempt . You may inspect that list at the bankruptcy clerk ' s office or online at https : / / pacer . uscourts . gov . If you believe that the law does not authorize an exemption that debtors claimed , you may file an objection by the deadline . 13 . Discharge of debts Confirmation of a chapter 13 plan may result in a discharge of debts , which may include all or part of a debt . However , unless the court orders otherwise , the debts will not be discharged until all payments under the plan are made . A discharge means that creditors may never try to collect the debt from the debtors personally except as provided in the plan . If you want to have a particular debt excepted from discharge under 11 U . S . C . § 523 ( a ) ( 2 ) or ( 4 ) , you must file a complaint and pay the filing fee in the bankruptcy clerk ' s office by the deadline . If you believe that the debtors are not entitled to a discharge of any of their debts under 11 U . S . C . § 1328 ( f ) , you must file a motion by the deadline . Official Form 309I Notice of Chapter 13 Bankruptcy Case page 2Powered by TCPDF ( www . tcpdf . org ) 2 / 2\"]"
      ]
     },
     "execution_count": 192,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "chunk_text(text_all, tokenizer)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 193,
   "metadata": {},
   "outputs": [],
   "source": [
    "chunks = chunk_text(text_all, tokenizer)\n",
    "start_logits = []\n",
    "end_logits = []"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 197,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "5"
      ]
     },
     "execution_count": 197,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(chunks)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 199,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'paid on your claim . To be paid , you must file a proof of claim even if your claim is listed in the schedules that the debtor filed . Secured creditors retain rights in their collateral regardless of whether they file a proof of claim . Filing a proof of claim submits the creditor to the jurisdiction of the bankruptcy court , with consequences a lawyer can explain . For example , a secured creditor who files a proof of claim may surrender important nonmonetary rights , including the right to a jury trial . Deadline to object to exemptions : The law permits debtors to keep certain property as exempt . If you believe that the law does not authorize an exemption claimed , you may file an objection . Filing deadline : 30 days after the conclusion of the meeting of creditors 9 . Filing of plan and confirmation hearing on docket Objections to ConfirmationThe debtor has filed a plan . The hearing on confirmation will be held on : 10 / 10 / 24 at 10 : 00 AM Location : This meeting is by Zoom . Go to , ZoomGov . com / join or call 1 + ( 669 ) 254−5252 , Enter Meeting ID 161 5478 8875 , Passcode 3834658 Objections to confirmation must be filed and served no later than 7 days before the date set for confirmation . If there are no timely filed objections to confirmation pending or if all objections to confirmation are resolved by a court order or a stipulation signed by the debtor , the trustee and the objecting party , a plan may be confirmed without objection , and the hearing stricken . 10 . Creditors with a foreign addressIf you are a creditor receiving a notice mailed to a foreign address , you may file a motion asking the court to extend the deadline in this notice . Consult an attorney familiar with United States bankruptcy law if you have any questions about your rights in this case . 11 . Filing a chapter 13 bankruptcy caseChapter 13 allows an individual with regular income and debts below a specified amount to adjust debts according to a plan . A plan is not effective unless the court confirms it . You may object to confirmation of the plan and appear at the confirmation hearing . A copy of the plan , if not enclosed , will be sent to you later , and if the confirmation hearing is not indicated on this notice , you will'"
      ]
     },
     "execution_count": 199,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "chunks[3]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 195,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'What are the Social Security numbers or ITIN in the text?'"
      ]
     },
     "execution_count": 195,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "question"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 200,
   "metadata": {},
   "outputs": [],
   "source": [
    "inputs = tokenizer(question, chunks[0], return_tensors=\"tf\")\n",
    "outputs = model(**inputs)\n",
    "start_logits.append(outputs.start_logits)\n",
    "end_logits.append(outputs.end_logits)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 201,
   "metadata": {},
   "outputs": [],
   "source": [
    "start_logits.append(outputs.start_logits)\n",
    "end_logits.append(outputs.end_logits)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 208,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'What are the Social Security numbers or ITIN in the text?'"
      ]
     },
     "execution_count": 208,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "question"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 225,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_answer(question, text_all):\n",
    "    chunks = chunk_text(text_all, tokenizer)\n",
    "    start_logits = []\n",
    "    end_logits = []\n",
    "\n",
    "    for chunk in chunks:\n",
    "        inputs = tokenizer(question, chunk, return_tensors=\"tf\", max_length=512, truncation=True)\n",
    "        outputs = model(**inputs)\n",
    "        start_logits.append(outputs.start_logits)\n",
    "        end_logits.append(outputs.end_logits)\n",
    "    \n",
    "    start_logits = tf.concat(start_logits, axis=1)\n",
    "    end_logits = tf.concat(end_logits, axis=1)\n",
    "    \n",
    "    print('1', start_logits)\n",
    "    print('2', end_logits)\n",
    "    # Find the best start and end indices\n",
    "    answer_start_index = int(tf.math.argmax(start_logits, axis=-1)[0])\n",
    "    answer_end_index = int(tf.math.argmax(end_logits, axis=-1)[0])\n",
    "    print('3', answer_start_index)\n",
    "    print('4', answer_end_index)\n",
    "    # Get the predicted answer\n",
    "    predict_answer_tokens = inputs[\"input_ids\"][0, answer_start_index : answer_end_index + 1]\n",
    "    return tokenizer.decode(predict_answer_tokens)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 227,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1 tf.Tensor([[-1.251733  -7.9249597 -9.82178   ... -7.0328403 -4.5499053 -8.900514 ]], shape=(1, 2383), dtype=float32)\n",
      "2 tf.Tensor(\n",
      "[[ -0.8806853   -8.205478   -10.482227   ...  -6.0008802   -0.83891714\n",
      "   -9.523805  ]], shape=(1, 2383), dtype=float32)\n",
      "3 503\n",
      "4 503\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Example usage\n",
    "#question = \"What are is the third Social Security number or ITIN in the text?\"\n",
    "question = \"What is the Notice of Chapter?\"\n",
    "answer = get_answer(question, text_all)\n",
    "print(answer)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 241,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_answer(question, text_all):\n",
    "    chunks = chunk_text(text_all, tokenizer)\n",
    "    start_logits = []\n",
    "    end_logits = []\n",
    "    input_ids_all = []\n",
    "\n",
    "    for chunk in chunks:\n",
    "        # Tokenize the question and chunk text\n",
    "        inputs = tokenizer(question, chunk, return_tensors=\"tf\", max_length=512, truncation=True, padding='max_length')\n",
    "        outputs = model(**inputs)\n",
    "        \n",
    "        # Collect logits and input_ids for all chunks\n",
    "        start_logits.append(outputs.start_logits)\n",
    "        end_logits.append(outputs.end_logits)\n",
    "        input_ids_all.append(inputs[\"input_ids\"])\n",
    "\n",
    "    # Concatenate all logits and input_ids\n",
    "    start_logits = tf.concat(start_logits, axis=1)\n",
    "    end_logits = tf.concat(end_logits, axis=1)\n",
    "    input_ids_all = tf.concat(input_ids_all, axis=1)\n",
    "    \n",
    "    # Find the best start and end indices\n",
    "    answer_start_index = int(tf.math.argmax(start_logits, axis=-1)[0])\n",
    "    answer_end_index = int(tf.math.argmax(end_logits, axis=-1)[0])\n",
    "    \n",
    "    # Get the predicted answer\n",
    "    predict_answer_tokens = input_ids_all[0, answer_start_index : answer_end_index + 1]\n",
    "    return tokenizer.decode(predict_answer_tokens)\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 290,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_answer(question, text_all):\n",
    "    chunks = chunk_text(text_all, tokenizer)\n",
    "    start_logits = []\n",
    "    end_logits = []\n",
    "    input_ids_all = []\n",
    "    all_ands = []\n",
    "    for chunk in chunks:\n",
    "        # Tokenize the question and chunk text\n",
    "        inputs = tokenizer(question, chunk, return_tensors=\"tf\", max_length=512, truncation=True, padding='max_length')\n",
    "        outputs = model(**inputs)\n",
    "        #print(outputs)\n",
    "        answer_start_index = int(tf.math.argmax(outputs.start_logits, axis=-1)[0])\n",
    "        answer_end_index = int(tf.math.argmax(outputs.end_logits, axis=-1)[0])\n",
    "        predict_answer_tokens = inputs.input_ids[0, answer_start_index : answer_end_index + 1]\n",
    "        print(tokenizer.decode(predict_answer_tokens),answer_start_index,answer_end_index)\n",
    "        all_ands.append(tokenizer.decode(predict_answer_tokens))\n",
    "\n",
    "        start_logits.append(outputs.start_logits)\n",
    "        end_logits.append(outputs.end_logits)\n",
    "        input_ids_all.append(inputs[\"input_ids\"])\n",
    "        print(len(start_logits[0][0]),len(end_logits[0][0]))\n",
    "    \n",
    "    # Concatenate all logits and input_ids\n",
    "    start_logits = tf.concat(start_logits, axis=1)\n",
    "    end_logits = tf.concat(end_logits, axis=1)\n",
    "    input_ids_all = tf.concat(input_ids_all, axis=1)\n",
    "    print('xx',start_logits)\n",
    "    print('x0',start_logits[0, 151].numpy())\n",
    "    print('x1',tf.math.argmax(start_logits, axis=-1))\n",
    "    # Find the best start and end indices\n",
    "    answer_start_index = int(tf.math.argmax(start_logits, axis=-1)[0])\n",
    "    answer_end_index = int(tf.math.argmax(end_logits, axis=-1)[0])\n",
    "    # Get the predicted answer\n",
    "    predict_answer_tokens = input_ids_all[0, answer_start_index : answer_end_index + 1]\n",
    "    print('here',tokenizer.decode(predict_answer_tokens),answer_start_index,answer_end_index)\n",
    "\n",
    "    return all_ands\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 300,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_answer(question, text_all):\n",
    "    chunks = chunk_text(text_all, tokenizer)\n",
    "    answer_start_indexs = []\n",
    "    answer_end_indexs = []\n",
    "    input_ids_all = []\n",
    "    all_ands = []\n",
    "    for chunk in chunks:\n",
    "        # Tokenize the question and chunk text\n",
    "        inputs = tokenizer(question, chunk, return_tensors=\"tf\", max_length=512, truncation=True, padding='max_length')\n",
    "        outputs = model(**inputs)\n",
    "        #print(outputs)\n",
    "        answer_start_index = int(tf.math.argmax(outputs.start_logits, axis=-1)[0])\n",
    "        answer_end_index = int(tf.math.argmax(outputs.end_logits, axis=-1)[0])\n",
    "        if answer_start_index != 0 and answer_end_index != 0:\n",
    "            answer_start_indexs.append(answer_start_index)\n",
    "            answer_end_indexs.append(answer_end_index)\n",
    "            break\n",
    "        #predict_answer_tokens = inputs.input_ids[0, answer_start_index : answer_end_index + 1]\n",
    "        #print(tokenizer.decode(predict_answer_tokens),answer_start_index,answer_end_index)\n",
    "        #all_ands.append(tokenizer.decode(predict_answer_tokens))\n",
    "    print(answer_start_indexs[0])\n",
    "    print(answer_end_indexs[0])\n",
    "    predict_answer_tokens = inputs.input_ids[0, answer_start_indexs[0] : answer_end_indexs[0] + 1]\n",
    "    print(tokenizer.decode(predict_answer_tokens),answer_start_index,answer_end_index)\n",
    "    return all_ands\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 361,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "174\n",
      "174\n",
      "dismissed 174 174\n",
      "[]\n"
     ]
    }
   ],
   "source": [
    "# Example usage\n",
    "#text_all = \"...\"  # Your long text here\n",
    "question = \"Which is the bankruptcy stage in the text petitioned,dismissed or discharged?\"\n",
    "#\"which chapter of bankruptcy is referenced in the text?\"\n",
    "#\"Which is the bankruptcy stage in the text petitioned,dismissed or discharged?\"\n",
    "#\"which is the country?\"\n",
    "#\"Which date case was filed?\"\n",
    "#\"which is the Bankruptcy Court district?\"\n",
    "#\"which is the country?\"\n",
    "#\"Which is the bankruptcy stage in the text petitioned,dismissed or discharged?\"\n",
    "#\"What is the second Social Security number or ITIN in the text?\"\n",
    "#\"which chapter of bankruptcy is referenced in the text?\"\n",
    "#\"What is the third Social Security number or ITIN in the text?\"\n",
    "answer = get_answer(question, text_all)\n",
    "print(answer)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 249,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[CLS] 0 0\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[CLS] 0 0\n",
      "[CLS] 0 0\n",
      "[CLS] 0 0\n",
      "[CLS] 0 0\n",
      "['[CLS]', '[CLS]', '[CLS]', '[CLS]', '[CLS]']\n"
     ]
    }
   ],
   "source": [
    "# Example usage\n",
    "#text_all = \"...\"  # Your long text here\n",
    "question = \"which bankruptcy stages does the text refer: petitioned,dismissed/withdrawn or discharged?\"\n",
    "#\"What is the third Social Security number or ITIN in the text?\"\n",
    "answer = get_answer(question, text_all)\n",
    "print(answer)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 250,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[CLS] which country and state? [SEP] Information to identify the case : Debtor 1 Cristina Nelson First Name Middle Name Last NameSocial Security number or ITIN 461−81−0513 EIN _ _ − _ _ _ _ _ _ _ Debtor 2 ( Spouse, if filing ) Timothy Nelson First Name Middle Name Last NameSocial Security number or ITIN 529−97−1200 EIN _ _ − _ _ _ _ _ _ _ United States 0 103\n",
      "[CLS] 0 0\n",
      "[CLS] 0 0\n",
      "[CLS] 0 0\n",
      "[CLS] 0 0\n",
      "['[CLS] which country and state? [SEP] Information to identify the case : Debtor 1 Cristina Nelson First Name Middle Name Last NameSocial Security number or ITIN 461−81−0513 EIN _ _ − _ _ _ _ _ _ _ Debtor 2 ( Spouse, if filing ) Timothy Nelson First Name Middle Name Last NameSocial Security number or ITIN 529−97−1200 EIN _ _ − _ _ _ _ _ _ _ United States', '[CLS]', '[CLS]', '[CLS]', '[CLS]']\n"
     ]
    }
   ],
   "source": [
    "# Example usage\n",
    "#text_all = \"...\"  # Your long text here\n",
    "question3 = \"which country and state?\"\n",
    "#\"What is the third Social Security number or ITIN in the text?\"\n",
    "answer = get_answer(question3, text_all)\n",
    "print(answer)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 132,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "\" \\nInformation to identify the case:\\nDebtor 1 Cristina Nelson\\nFirst Name      Middle Name      Last NameSocial Security number or ITIN    461−81−0513\\nEIN    _ _−_ _ _ _ _ _ _\\nDebtor 2\\n(Spouse, if filing)Timothy Nelson\\nFirst Name      Middle Name      Last NameSocial Security number or ITIN    529−97−1200\\nEIN    _ _−_ _ _ _ _ _ _\\nUnited States Bankruptcy Court   District of UtahDate case filed for chapter  13:   8/7/24Case number:   24−23963   JTM\\nOfficial Form 309I\\nNotice of Chapter 13 Bankruptcy Case 10/20\\nFor the debtors listed above, a case has been filed under chapter 13 of the Bankruptcy Code. An order for relief has\\nbeen entered.\\nThis notice has important information about the case for creditors, debtors, and trustees, including information about\\nthe meeting of creditors and deadlines. Read both pages carefully.\\nThe filing of the case imposed an automatic stay against most collection activities. This means that creditors generally may not take action to collect debts\\nfrom the debtors, the debtors' property, and certain codebtors. For example, while the stay is in effect, creditors cannot sue, garnish wages, assert a\\ndeficiency, repossess property, or otherwise try to collect from the debtors. Creditors cannot demand repayment from debtors by mail, phone, or\\notherwise. Creditors who violate the stay can be required to pay actual and punitive damages and attorney's fees. Under certain circumstances, the stay\\nmay be limited to 30 days or not exist at all, although debtors can ask the court to extend or impose a stay.\\nConfirmation of a chapter 13 plan may result in a discharge. Creditors who assert that the debtors are not entitled to a discharge under 11 U.S.C. §\\n1328(f) must file a motion objecting to discharge in the bankruptcy clerk's office within the deadline specified in this notice. Creditors who want to have\\ntheir debt excepted from discharge may be required to file a complaint in the bankruptcy clerk's office by the same deadline. (See line 13 below for more\\ninformation.)\\nTo protect your rights, consult an attorney. All documents filed in the case may be inspected at the bankruptcy clerk's office at the address listed below or\\nthrough PACER (Public Access to Court Electronic Records at https://pacer.uscourts.gov). Case status information is available at no charge by calling the\\nVoice Case Information System (VCIS) at 1−866−222−8029.\\nThe staff of the bankruptcy clerk's office cannot give legal advice.\\nTo help creditors correctly identify debtors, debtors submit full Social Security or Individual Taxpayer Identification\\nNumbers, which may appear on a version of this notice. However, the full numbers must not appear on any document\\nfiled with the court.\\nDo not file this notice with any proof of claim or other filing in the case. Do not include more than the last four digits of\\na Social Security or Individual Taxpayer Identification Number in any document, including attachments, that you file\\nwith the court.\\nAbout Debtor 1: About Debtor 2:\\n1. Debtor's full name Cristina Nelson Timothy Nelson\\n2.All other names used in the\\nlast 8 years\\n3. Address3342 N 1825 E\\nLayton, UT 840401788 E 3350 N\\nLayton, UT 84040\\n4.Debtor's attorney\\nName and addressE. Kent Winward\\n4850 Harrison Blvd. Suite 1\\nOgden, UT 84403Contact phone (801) 392−8200\\nEmail:  utahbankruptcyfirm@gmail.com\\n5. Bankruptcy trustee\\nName and addressLon Jenkins tr\\nCh. 13 Trustee's Office\\n465 South 400 East\\nSuite 200\\nSalt Lake City, UT 84111Contact phone 801−596−2884\\nEmail:  utahtrusteemail@ch13ut.org\\n6. Bankruptcy clerk's office\\nDocuments in this case may be filed\\nat this address.\\nYou may inspect all records filed in\\nthis case at this office or online at\\nhttps://pacer.uscourts.gov.United States Bankruptcy Court\\nDistrict of Utah\\n350 South Main #301\\nSalt Lake City, UT 84101\\nClerk of Court: David A. SimeHours open: 8:00 AM to 4:30 PM, Monday −\\nFriday\\nContact phone: (801) 524−6687\\nWebsite: www.utb.uscourts.gov\\nOfficial Form 309I    Notice of Chapter 13 Bankruptcy Case page 1\\nDate Generated: 8/14/24 For more information, see page 2 >\\n                               1 / 2 \\nDebtor  Cristina Nelson   and  Timothy Nelson Case number 24−23963\\n7. Meeting of creditors\\n Debtors must attend the meeting to\\nbe questioned under oath. In a joint\\ncase, both spouses must attend.\\nCreditors may attend, but are not\\nrequired to do so.The meeting may be continued or adjourned\\nto a later date.\\nAll individual debtor(s) must provide picture\\nidentification and proof of social security\\nnumber to the trustee. Failure to do so may\\nresult in your case being dismissed.Meeting to be held on:\\nSep. 5, 2024  at 11:00 AM\\nLocation:\\nZoom video meeting. Go to Zoom.us/join,\\nEnter Meeting ID 437 037 6107, and\\nPasscode 3375693596, OR call\\n1−385−832−9074\\nFor additional meeting information go to https://www.justice.gov/ust/moc\\n8. Deadlines\\nThe bankruptcy clerk's office must\\nreceive these documents and any\\nrequired filing fee by the following\\ndeadlines.Deadline to file a complaint to challenge\\ndischargeability of certain debts:Filing deadline: 11/4/24\\nYou must file:\\n•   a motion if you assert that the debtors are not entitled to receive a discharge\\n    under U.S.C. § 1328(f), or\\n•   a complaint if you want to have a particular debt excepted from discharge\\n    under 11 U.S.C. § 523(a)(2) or (4).\\nDeadline for all creditors to file a proof of claim\\n(except governmental units):Filing deadline: 10/16/24\\nDeadline for governmental units to file a proof of\\nclaim:Filing deadline: 2/3/25\\nDeadlines for filing proof of claim:\\n A proof of claim is a signed statement describing a creditor's claim. A proof of claim form may be obtained at\\nwww.uscourts.gov or any bankruptcy clerk's office.\\nIf you do not file a proof of claim by the deadline, you might not be paid on your claim. To be paid, you must file a\\nproof of claim even if your claim is listed in the schedules that the debtor filed.\\nSecured creditors retain rights in their collateral regardless of whether they file a proof of claim. Filing a proof of\\nclaim submits the creditor to the jurisdiction of the bankruptcy court, with consequences a lawyer can explain. For\\nexample, a secured creditor who files a proof of claim may surrender important nonmonetary rights, including the\\nright to a jury trial.\\nDeadline to object to exemptions:\\nThe law permits debtors to keep certain property as exempt. If\\nyou believe that the law does not authorize an exemption\\nclaimed, you may file an objection.Filing\\ndeadline:30 days after the\\nconclusion  of the\\nmeeting of\\ncreditors\\n9. Filing of plan and\\nconfirmation hearing on\\ndocket\\nObjections to ConfirmationThe debtor has filed a plan.\\nThe hearing on confirmation will be held on:  10/10/24  at 10:00 AM\\nLocation:  This meeting is by Zoom. Go to, ZoomGov.com/join or call\\n1+(669)254−5252, Enter Meeting ID 161 5478 8875, Passcode 3834658\\n Objections to confirmation must be filed and served no later than 7 days before the date set for confirmation. If\\nthere are no timely filed objections to confirmation pending or if all objections to confirmation are resolved by a\\ncourt order or a stipulation signed by the debtor, the trustee and the objecting party, a plan may be confirmed\\nwithout objection, and the hearing stricken.\\n10. Creditors with a foreign\\naddressIf you are a creditor receiving a notice mailed to a foreign address, you may file a motion asking the court to\\nextend the deadline in this notice. Consult an attorney familiar with United States bankruptcy law if you have any\\nquestions about your rights in this case.\\n11. Filing a chapter 13\\nbankruptcy caseChapter 13 allows an individual with regular income and debts below a specified amount to adjust debts\\naccording to a plan. A plan is not effective unless the court confirms it. You may object to confirmation of the\\nplan and appear at the confirmation hearing. A copy of the plan, if not enclosed, will be sent to you later, and if\\nthe confirmation hearing is not indicated on this notice, you will be sent notice of the confirmation hearing. The\\ndebtor will remain in possession of the property and may continue to operate the business, if any, unless the\\ncourt orders otherwise.\\n12. Exempt property The law allows debtors to keep certain property as exempt. Fully exempt property will not be sold and distributed\\nto creditors, even if the case is converted to chapter 7. Debtors must file a list of property claimed as exempt.\\nYou may inspect that list at the bankruptcy clerk's office or online at https://pacer.uscourts.gov. If you believe\\nthat the law does not authorize an exemption that debtors claimed, you may file an objection by the deadline.\\n13. Discharge of debts Confirmation of a chapter 13 plan may result in a discharge of debts, which may include all or part of a debt.\\nHowever, unless the court orders otherwise, the debts will not be discharged until all payments under the plan\\nare made. A discharge means that creditors may never try to collect the debt from the debtors personally except\\nas provided in the plan. If you want to have a particular debt excepted from discharge under 11 U.S.C. §\\n523(a)(2) or (4), you must file a complaint and pay the filing fee in the bankruptcy clerk's office by the deadline. If\\nyou believe that the debtors are not entitled to a discharge of any of their debts under 11 U.S.C. § 1328(f), you\\nmust file a motion by the deadline.\\nOfficial Form 309I    Notice of Chapter 13 Bankruptcy Case page 2Powered by TCPDF (www.tcpdf.org)\\n                               2 / 2\\n\""
      ]
     },
     "execution_count": 132,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "text_all"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 126,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "9489"
      ]
     },
     "execution_count": 126,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(text_all)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 131,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2024-09-12 09:24:06.178737: I tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: INVALID_ARGUMENT: indices[0,512] = 512 is not in [0, 512)\n"
     ]
    },
    {
     "ename": "InvalidArgumentError",
     "evalue": "Exception encountered when calling layer 'embeddings' (type TFEmbeddings).\n\n{{function_node __wrapped__ResourceGather_device_/job:localhost/replica:0/task:0/device:CPU:0}} indices[0,512] = 512 is not in [0, 512) [Op:ResourceGather] name: \n\nCall arguments received by layer 'embeddings' (type TFEmbeddings):\n  • input_ids=tf.Tensor(shape=(1, 709), dtype=int32)\n  • position_ids=None\n  • inputs_embeds=None\n  • training=False",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mInvalidArgumentError\u001b[0m                      Traceback (most recent call last)",
      "Cell \u001b[0;32mIn[131], line 2\u001b[0m\n\u001b[1;32m      1\u001b[0m inputs \u001b[38;5;241m=\u001b[39m tokenizer(question, text_all[:\u001b[38;5;241m3000\u001b[39m], return_tensors\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtf\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m----> 2\u001b[0m outputs \u001b[38;5;241m=\u001b[39m \u001b[43mmodel\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43minputs\u001b[49m\u001b[43m)\u001b[49m\n",
      "File \u001b[0;32m~/Documents/bankruptcy_automation/env/lib/python3.11/site-packages/tf_keras/src/utils/traceback_utils.py:70\u001b[0m, in \u001b[0;36mfilter_traceback.<locals>.error_handler\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m     67\u001b[0m     filtered_tb \u001b[38;5;241m=\u001b[39m _process_traceback_frames(e\u001b[38;5;241m.\u001b[39m__traceback__)\n\u001b[1;32m     68\u001b[0m     \u001b[38;5;66;03m# To get the full stack trace, call:\u001b[39;00m\n\u001b[1;32m     69\u001b[0m     \u001b[38;5;66;03m# `tf.debugging.disable_traceback_filtering()`\u001b[39;00m\n\u001b[0;32m---> 70\u001b[0m     \u001b[38;5;28;01mraise\u001b[39;00m e\u001b[38;5;241m.\u001b[39mwith_traceback(filtered_tb) \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m     71\u001b[0m \u001b[38;5;28;01mfinally\u001b[39;00m:\n\u001b[1;32m     72\u001b[0m     \u001b[38;5;28;01mdel\u001b[39;00m filtered_tb\n",
      "File \u001b[0;32m~/Documents/bankruptcy_automation/env/lib/python3.11/site-packages/transformers/modeling_tf_utils.py:437\u001b[0m, in \u001b[0;36munpack_inputs.<locals>.run_call_with_unpacked_inputs\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m    434\u001b[0m     config \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mconfig\n\u001b[1;32m    436\u001b[0m unpacked_inputs \u001b[38;5;241m=\u001b[39m input_processing(func, config, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mfn_args_and_kwargs)\n\u001b[0;32m--> 437\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43munpacked_inputs\u001b[49m\u001b[43m)\u001b[49m\n",
      "File \u001b[0;32m~/Documents/bankruptcy_automation/env/lib/python3.11/site-packages/transformers/models/distilbert/modeling_tf_distilbert.py:1091\u001b[0m, in \u001b[0;36mTFDistilBertForQuestionAnswering.call\u001b[0;34m(self, input_ids, attention_mask, head_mask, inputs_embeds, output_attentions, output_hidden_states, return_dict, start_positions, end_positions, training)\u001b[0m\n\u001b[1;32m   1061\u001b[0m \u001b[38;5;129m@unpack_inputs\u001b[39m\n\u001b[1;32m   1062\u001b[0m \u001b[38;5;129m@add_start_docstrings_to_model_forward\u001b[39m(DISTILBERT_INPUTS_DOCSTRING\u001b[38;5;241m.\u001b[39mformat(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mbatch_size, sequence_length\u001b[39m\u001b[38;5;124m\"\u001b[39m))\n\u001b[1;32m   1063\u001b[0m \u001b[38;5;129m@add_code_sample_docstrings\u001b[39m(\n\u001b[0;32m   (...)\u001b[0m\n\u001b[1;32m   1079\u001b[0m     training: Optional[\u001b[38;5;28mbool\u001b[39m] \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mFalse\u001b[39;00m,\n\u001b[1;32m   1080\u001b[0m ) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m Union[TFQuestionAnsweringModelOutput, Tuple[tf\u001b[38;5;241m.\u001b[39mTensor]]:\n\u001b[1;32m   1081\u001b[0m \u001b[38;5;250m    \u001b[39m\u001b[38;5;124mr\u001b[39m\u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m   1082\u001b[0m \u001b[38;5;124;03m    start_positions (`tf.Tensor` of shape `(batch_size,)`, *optional*):\u001b[39;00m\n\u001b[1;32m   1083\u001b[0m \u001b[38;5;124;03m        Labels for position (index) of the start of the labelled span for computing the token classification loss.\u001b[39;00m\n\u001b[0;32m   (...)\u001b[0m\n\u001b[1;32m   1089\u001b[0m \u001b[38;5;124;03m        are not taken into account for computing the loss.\u001b[39;00m\n\u001b[1;32m   1090\u001b[0m \u001b[38;5;124;03m    \"\"\"\u001b[39;00m\n\u001b[0;32m-> 1091\u001b[0m     distilbert_output \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdistilbert\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m   1092\u001b[0m \u001b[43m        \u001b[49m\u001b[43minput_ids\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43minput_ids\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   1093\u001b[0m \u001b[43m        \u001b[49m\u001b[43mattention_mask\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mattention_mask\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   1094\u001b[0m \u001b[43m        \u001b[49m\u001b[43mhead_mask\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mhead_mask\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   1095\u001b[0m \u001b[43m        \u001b[49m\u001b[43minputs_embeds\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43minputs_embeds\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   1096\u001b[0m \u001b[43m        \u001b[49m\u001b[43moutput_attentions\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43moutput_attentions\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   1097\u001b[0m \u001b[43m        \u001b[49m\u001b[43moutput_hidden_states\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43moutput_hidden_states\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   1098\u001b[0m \u001b[43m        \u001b[49m\u001b[43mreturn_dict\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mreturn_dict\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   1099\u001b[0m \u001b[43m        \u001b[49m\u001b[43mtraining\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtraining\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   1100\u001b[0m \u001b[43m    \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m   1101\u001b[0m     hidden_states \u001b[38;5;241m=\u001b[39m distilbert_output[\u001b[38;5;241m0\u001b[39m]  \u001b[38;5;66;03m# (bs, max_query_len, dim)\u001b[39;00m\n\u001b[1;32m   1102\u001b[0m     hidden_states \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdropout(hidden_states, training\u001b[38;5;241m=\u001b[39mtraining)  \u001b[38;5;66;03m# (bs, max_query_len, dim)\u001b[39;00m\n",
      "File \u001b[0;32m~/Documents/bankruptcy_automation/env/lib/python3.11/site-packages/transformers/modeling_tf_utils.py:437\u001b[0m, in \u001b[0;36munpack_inputs.<locals>.run_call_with_unpacked_inputs\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m    434\u001b[0m     config \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mconfig\n\u001b[1;32m    436\u001b[0m unpacked_inputs \u001b[38;5;241m=\u001b[39m input_processing(func, config, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mfn_args_and_kwargs)\n\u001b[0;32m--> 437\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43munpacked_inputs\u001b[49m\u001b[43m)\u001b[49m\n",
      "File \u001b[0;32m~/Documents/bankruptcy_automation/env/lib/python3.11/site-packages/transformers/models/distilbert/modeling_tf_distilbert.py:454\u001b[0m, in \u001b[0;36mTFDistilBertMainLayer.call\u001b[0;34m(self, input_ids, attention_mask, head_mask, inputs_embeds, output_attentions, output_hidden_states, return_dict, training)\u001b[0m\n\u001b[1;32m    451\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m    452\u001b[0m     head_mask \u001b[38;5;241m=\u001b[39m [\u001b[38;5;28;01mNone\u001b[39;00m] \u001b[38;5;241m*\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mnum_hidden_layers\n\u001b[0;32m--> 454\u001b[0m embedding_output \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43membeddings\u001b[49m\u001b[43m(\u001b[49m\u001b[43minput_ids\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43minputs_embeds\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43minputs_embeds\u001b[49m\u001b[43m)\u001b[49m  \u001b[38;5;66;03m# (bs, seq_length, dim)\u001b[39;00m\n\u001b[1;32m    455\u001b[0m tfmr_output \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mtransformer(\n\u001b[1;32m    456\u001b[0m     embedding_output,\n\u001b[1;32m    457\u001b[0m     attention_mask,\n\u001b[0;32m   (...)\u001b[0m\n\u001b[1;32m    462\u001b[0m     training\u001b[38;5;241m=\u001b[39mtraining,\n\u001b[1;32m    463\u001b[0m )\n\u001b[1;32m    465\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m tfmr_output\n",
      "File \u001b[0;32m~/Documents/bankruptcy_automation/env/lib/python3.11/site-packages/transformers/models/distilbert/modeling_tf_distilbert.py:117\u001b[0m, in \u001b[0;36mTFEmbeddings.call\u001b[0;34m(self, input_ids, position_ids, inputs_embeds, training)\u001b[0m\n\u001b[1;32m    114\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m position_ids \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m    115\u001b[0m     position_ids \u001b[38;5;241m=\u001b[39m tf\u001b[38;5;241m.\u001b[39mexpand_dims(tf\u001b[38;5;241m.\u001b[39mrange(start\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m0\u001b[39m, limit\u001b[38;5;241m=\u001b[39minput_shape[\u001b[38;5;241m-\u001b[39m\u001b[38;5;241m1\u001b[39m]), axis\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m0\u001b[39m)\n\u001b[0;32m--> 117\u001b[0m position_embeds \u001b[38;5;241m=\u001b[39m \u001b[43mtf\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mgather\u001b[49m\u001b[43m(\u001b[49m\u001b[43mparams\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mposition_embeddings\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mindices\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mposition_ids\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    118\u001b[0m final_embeddings \u001b[38;5;241m=\u001b[39m inputs_embeds \u001b[38;5;241m+\u001b[39m position_embeds\n\u001b[1;32m    119\u001b[0m final_embeddings \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mLayerNorm(inputs\u001b[38;5;241m=\u001b[39mfinal_embeddings)\n",
      "\u001b[0;31mInvalidArgumentError\u001b[0m: Exception encountered when calling layer 'embeddings' (type TFEmbeddings).\n\n{{function_node __wrapped__ResourceGather_device_/job:localhost/replica:0/task:0/device:CPU:0}} indices[0,512] = 512 is not in [0, 512) [Op:ResourceGather] name: \n\nCall arguments received by layer 'embeddings' (type TFEmbeddings):\n  • input_ids=tf.Tensor(shape=(1, 709), dtype=int32)\n  • position_ids=None\n  • inputs_embeds=None\n  • training=False"
     ]
    }
   ],
   "source": [
    "inputs = tokenizer(question, text_all[:3000], return_tensors=\"tf\")\n",
    "outputs = model(**inputs)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 125,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "''"
      ]
     },
     "execution_count": 125,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "answer_start_index = int(tf.math.argmax(outputs.start_logits, axis=-1)[0])\n",
    "answer_end_index = int(tf.math.argmax(outputs.end_logits, axis=-1)[0])\n",
    "predict_answer_tokens = inputs.input_ids[0, answer_start_index : answer_end_index + 1]\n",
    "tokenizer.decode(predict_answer_tokens)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 56,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFRobertaForQuestionAnswering: ['roberta.embeddings.position_ids']\n",
      "- This IS expected if you are initializing TFRobertaForQuestionAnswering from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).\n",
      "- This IS NOT expected if you are initializing TFRobertaForQuestionAnswering from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).\n",
      "All the weights of TFRobertaForQuestionAnswering were initialized from the PyTorch model.\n",
      "If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaForQuestionAnswering for predictions without further training.\n",
      "/Users/camilayepes/Documents/bankruptcy_automation/env/lib/python3.11/site-packages/transformers/tokenization_utils_base.py:1601: FutureWarning: `clean_up_tokenization_spaces` was not set. It will be set to `True` by default. This behavior will be depracted in transformers v4.45, and will be then set to `False` by default. For more details check this issue: https://github.com/huggingface/transformers/issues/31884\n",
      "  warnings.warn(\n"
     ]
    },
    {
     "ename": "ImportError",
     "evalue": "\nAutoModelForQuestionAnswering requires the PyTorch library but it was not found in your environment.\nHowever, we were able to find a TensorFlow installation. TensorFlow classes begin\nwith \"TF\", but are otherwise identically named to our PyTorch classes. This\nmeans that the TF equivalent of the class you tried to import would be \"TFAutoModelForQuestionAnswering\".\nIf you want to use TensorFlow, please use TF classes instead!\n\nIf you really do want to use PyTorch please go to\nhttps://pytorch.org/get-started/locally/ and follow the instructions that\nmatch your environment.\n",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mImportError\u001b[0m                               Traceback (most recent call last)",
      "Cell \u001b[0;32mIn[56], line 14\u001b[0m\n\u001b[1;32m     11\u001b[0m res \u001b[38;5;241m=\u001b[39m nlp(QA_input)\n\u001b[1;32m     13\u001b[0m \u001b[38;5;66;03m# b) Load model & tokenizer\u001b[39;00m\n\u001b[0;32m---> 14\u001b[0m model \u001b[38;5;241m=\u001b[39m \u001b[43mAutoModelForQuestionAnswering\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfrom_pretrained\u001b[49m(model_name)\n\u001b[1;32m     15\u001b[0m tokenizer \u001b[38;5;241m=\u001b[39m AutoTokenizer\u001b[38;5;241m.\u001b[39mfrom_pretrained(model_name)\n",
      "File \u001b[0;32m~/Documents/bankruptcy_automation/env/lib/python3.11/site-packages/transformers/utils/import_utils.py:1543\u001b[0m, in \u001b[0;36mDummyObject.__getattribute__\u001b[0;34m(cls, key)\u001b[0m\n\u001b[1;32m   1541\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m key\u001b[38;5;241m.\u001b[39mstartswith(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m_\u001b[39m\u001b[38;5;124m\"\u001b[39m) \u001b[38;5;129;01mand\u001b[39;00m key \u001b[38;5;241m!=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m_from_config\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n\u001b[1;32m   1542\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28msuper\u001b[39m()\u001b[38;5;241m.\u001b[39m\u001b[38;5;21m__getattribute__\u001b[39m(key)\n\u001b[0;32m-> 1543\u001b[0m \u001b[43mrequires_backends\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mcls\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mcls\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_backends\u001b[49m\u001b[43m)\u001b[49m\n",
      "File \u001b[0;32m~/Documents/bankruptcy_automation/env/lib/python3.11/site-packages/transformers/utils/import_utils.py:1522\u001b[0m, in \u001b[0;36mrequires_backends\u001b[0;34m(obj, backends)\u001b[0m\n\u001b[1;32m   1520\u001b[0m \u001b[38;5;66;03m# Raise an error for users who might not realize that classes without \"TF\" are torch-only\u001b[39;00m\n\u001b[1;32m   1521\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtorch\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01min\u001b[39;00m backends \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtf\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;129;01min\u001b[39;00m backends \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m is_torch_available() \u001b[38;5;129;01mand\u001b[39;00m is_tf_available():\n\u001b[0;32m-> 1522\u001b[0m     \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mImportError\u001b[39;00m(PYTORCH_IMPORT_ERROR_WITH_TF\u001b[38;5;241m.\u001b[39mformat(name))\n\u001b[1;32m   1524\u001b[0m \u001b[38;5;66;03m# Raise the inverse error for PyTorch users trying to load TF classes\u001b[39;00m\n\u001b[1;32m   1525\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtf\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01min\u001b[39;00m backends \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtorch\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;129;01min\u001b[39;00m backends \u001b[38;5;129;01mand\u001b[39;00m is_torch_available() \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m is_tf_available():\n",
      "\u001b[0;31mImportError\u001b[0m: \nAutoModelForQuestionAnswering requires the PyTorch library but it was not found in your environment.\nHowever, we were able to find a TensorFlow installation. TensorFlow classes begin\nwith \"TF\", but are otherwise identically named to our PyTorch classes. This\nmeans that the TF equivalent of the class you tried to import would be \"TFAutoModelForQuestionAnswering\".\nIf you want to use TensorFlow, please use TF classes instead!\n\nIf you really do want to use PyTorch please go to\nhttps://pytorch.org/get-started/locally/ and follow the instructions that\nmatch your environment.\n"
     ]
    }
   ],
   "source": [
    "from transformers import AutoModelForQuestionAnswering, AutoTokenizer, pipeline\n",
    "\n",
    "model_name = \"deepset/roberta-base-squad2\"\n",
    "\n",
    "# a) Get predictions\n",
    "nlp = pipeline('question-answering', model=model_name, tokenizer=model_name)\n",
    "QA_input = {\n",
    "    'question': 'Why is model conversion important?',\n",
    "    'context': 'The option to convert models between FARM and transformers gives freedom to the user and let people easily switch between frameworks.'\n",
    "}\n",
    "res = nlp(QA_input)\n",
    "\n",
    "# b) Load model & tokenizer\n",
    "model = AutoModelForQuestionAnswering.from_pretrained(model_name)\n",
    "tokenizer = AutoTokenizer.from_pretrained(model_name)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "res = nlp(QA_input)\n",
    "print(res)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "from typing import Optional\n",
    "\n",
    "from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder\n",
    "from langchain_core.pydantic_v1 import BaseModel, Field\n",
    "\n",
    "# Define a custom prompt to provide instructions and any additional context.\n",
    "# 1) You can add examples into the prompt template to improve extraction quality\n",
    "# 2) Introduce additional parameters to take context into account (e.g., include metadata\n",
    "#    about the document from which the text was extracted.)\n",
    "prompt = ChatPromptTemplate.from_messages(\n",
    "    [\n",
    "        (\n",
    "            \"system\",\n",
    "            \"You are an expert extraction algorithm. \"\n",
    "            \"Only extract relevant information from the text. \"\n",
    "            \"If you do not know the value of an attribute asked to extract, \"\n",
    "            \"return null for the attribute's value.\",\n",
    "        ),\n",
    "        # Please see the how-to about improving performance with\n",
    "        # reference examples.\n",
    "        # MessagesPlaceholder('examples'),\n",
    "        (\"human\", \"{text}\"),\n",
    "    ]\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from typing import Optional\n",
    "\n",
    "from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder\n",
    "from langchain_core.pydantic_v1 import BaseModel, Field\n",
    "\n",
    "# Define a custom prompt to provide instructions and any additional context.\n",
    "# 1) You can add examples into the prompt template to improve extraction quality\n",
    "# 2) Introduce additional parameters to take context into account (e.g., include metadata\n",
    "#    about the document from which the text was extracted.)\n",
    "prompt = ChatPromptTemplate.from_messages(\n",
    "    [\n",
    "        (\n",
    "            \"system\",\n",
    "            \"You are an expert extraction algorithm. \"\n",
    "            \"Only extract relevant information from the text. \"\n",
    "            \"If you do not know the value of an attribute asked to extract, \"\n",
    "            \"return null for the attribute's value.\",\n",
    "        ),\n",
    "        # Please see the how-to about improving performance with\n",
    "        # reference examples.\n",
    "        # MessagesPlaceholder('examples'),\n",
    "        (\"human\", \"{text}\"),\n",
    "    ]\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "ename": "ModuleNotFoundError",
     "evalue": "No module named 'langchain_mistralai'",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mModuleNotFoundError\u001b[0m                       Traceback (most recent call last)",
      "Cell \u001b[0;32mIn[11], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mlangchain_mistralai\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m ChatMistralAI\n\u001b[1;32m      3\u001b[0m llm \u001b[38;5;241m=\u001b[39m ChatMistralAI(model\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mmistral-large-latest\u001b[39m\u001b[38;5;124m\"\u001b[39m, temperature\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m0\u001b[39m)\n\u001b[1;32m      5\u001b[0m runnable \u001b[38;5;241m=\u001b[39m prompt \u001b[38;5;241m|\u001b[39m llm\u001b[38;5;241m.\u001b[39mwith_structured_output(schema\u001b[38;5;241m=\u001b[39mPerson)\n",
      "\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'langchain_mistralai'"
     ]
    }
   ],
   "source": [
    "from langchain_mistralai import ChatMistralAI\n",
    "\n",
    "llm = ChatMistralAI(model=\"mistral-large-latest\", temperature=0)\n",
    "\n",
    "runnable = prompt | llm.with_structured_output(schema=Person)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "from typing import List, Optional\n",
    "\n",
    "from langchain_core.pydantic_v1 import BaseModel, Field\n",
    "\n",
    "\n",
    "class Bankruptcy(BaseModel):\n",
    "    \"\"\"Information about a bankruptcy declaration.\"\"\"\n",
    "\n",
    "    # ^ Doc-string for the entity Person.\n",
    "    # This doc-string is sent to the LLM as the description of the schema Person,\n",
    "    # and it can help to improve extraction results.\n",
    "\n",
    "    # Note that:\n",
    "    # 1. Each field is an `optional` -- this allows the model to decline to extract it!\n",
    "    # 2. Each field has a `description` -- this description is used by the LLM.\n",
    "    # Having a good description can help improve extraction results.\n",
    "    ssns: Optional[list] = Field(default=None, description=\"The ssns of the persons\")\n",
    "    chapter: Optional[str] = Field(\n",
    "        default=None, description=\"The chapter of the bankruptcy declaration\"\n",
    "    )\n",
    "    country: Optional[str] = Field(\n",
    "        default=None, description=\"Country were the bankruptcy declaration is made\"\n",
    "    )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "class Data(BaseModel):\n",
    "    \"\"\"Extracted data about bankruptcy declaration..\"\"\"\n",
    "\n",
    "    # Creates a model so that we can extract multiple entities.\n",
    "    people: List[Bankruptcy]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "ename": "NameError",
     "evalue": "name 'prompt' is not defined",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mNameError\u001b[0m                                 Traceback (most recent call last)",
      "Cell \u001b[0;32mIn[8], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m runnable \u001b[38;5;241m=\u001b[39m \u001b[43mprompt\u001b[49m \u001b[38;5;241m|\u001b[39m llm\u001b[38;5;241m.\u001b[39mwith_structured_output(schema\u001b[38;5;241m=\u001b[39mData)\n\u001b[1;32m      2\u001b[0m runnable\u001b[38;5;241m.\u001b[39minvoke({\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtext\u001b[39m\u001b[38;5;124m\"\u001b[39m: text_all})\n",
      "\u001b[0;31mNameError\u001b[0m: name 'prompt' is not defined"
     ]
    }
   ],
   "source": [
    "runnable = prompt | llm.with_structured_output(schema=Data)\n",
    "runnable.invoke({\"text\": text_all})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "#print(text_all)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Find SSNs\n",
    "ssn_pattern = r'\\b(?:Social Security number|ITIN)\\D*(\\d{3}[−\\s]\\d{2}[−\\s]\\d{4})\\b'\n",
    "ssns = re.findall(ssn_pattern, text_all)\n",
    "\n",
    "def find_ssns(text):\n",
    "    ssns = re.findall(ssn_pattern, text_all)\n",
    "    return ssns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Find chapter\n",
    "chapter_pattern = r'Notice of Chapter (\\d+) Bankruptcy Case \\d{1,2}/\\d{2}'\n",
    "\n",
    "def find_chapter(text):\n",
    "    chapters = re.findall(chapter_pattern, text_all)\n",
    "    return chapters[0]\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "country_code = {\"United States\": \"US\", \"Canada\":\"CA\"}\n",
    "\n",
    "country_pattern = r'\\b(?:United States|Canada)\\b'\n",
    "\n",
    "def find_country_code(text):\n",
    "    country_match = re.search(country_pattern, text, re.IGNORECASE)\n",
    "    return country_code.get(country_match[0],None) "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Find State\n",
    "state_pattern = r'\\nDistrict of (\\w+)'\n",
    "\n",
    "# Dictionaries for state codes\n",
    "us_states = {\n",
    "    \"Alabama\": \"AL\", \"Alaska\": \"AK\", \"Arizona\": \"AZ\", \"Arkansas\": \"AR\", \"California\": \"CA\",\n",
    "    \"Colorado\": \"CO\", \"Connecticut\": \"CT\", \"Delaware\": \"DE\", \"Florida\": \"FL\", \"Georgia\": \"GA\",\n",
    "    \"Hawaii\": \"HI\", \"Idaho\": \"ID\", \"Illinois\": \"IL\", \"Indiana\": \"IN\", \"Iowa\": \"IA\",\n",
    "    \"Kansas\": \"KS\", \"Kentucky\": \"KY\", \"Louisiana\": \"LA\", \"Maine\": \"ME\", \"Maryland\": \"MD\",\n",
    "    \"Massachusetts\": \"MA\", \"Michigan\": \"MI\", \"Minnesota\": \"MN\", \"Mississippi\": \"MS\", \"Missouri\": \"MO\",\n",
    "    \"Montana\": \"MT\", \"Nebraska\": \"NE\", \"Nevada\": \"NV\", \"New Hampshire\": \"NH\", \"New Jersey\": \"NJ\",\n",
    "    \"New Mexico\": \"NM\", \"New York\": \"NY\", \"North Carolina\": \"NC\", \"North Dakota\": \"ND\", \"Ohio\": \"OH\",\n",
    "    \"Oklahoma\": \"OK\", \"Oregon\": \"OR\", \"Pennsylvania\": \"PA\", \"Rhode Island\": \"RI\", \"South Carolina\": \"SC\",\n",
    "    \"South Dakota\": \"SD\", \"Tennessee\": \"TN\", \"Texas\": \"TX\", \"Utah\": \"UT\", \"Vermont\": \"VT\",\n",
    "    \"Virginia\": \"VA\", \"Washington\": \"WA\", \"West Virginia\": \"WV\", \"Wisconsin\": \"WI\", \"Wyoming\": \"WY\"\n",
    "}\n",
    "\n",
    "canadian_provinces = {\n",
    "    \"Alberta\": \"AB\", \"British Columbia\": \"BC\", \"Manitoba\": \"MB\", \"New Brunswick\": \"NB\", \"Newfoundland and Labrador\": \"NL\",\n",
    "    \"Northwest Territories\": \"NT\", \"Nova Scotia\": \"NS\", \"Nunavut\": \"NU\", \"Ontario\": \"ON\", \"Prince Edward Island\": \"PE\",\n",
    "    \"Quebec\": \"QC\", \"Saskatchewan\": \"SK\", \"Yukon\": \"YT\"\n",
    "}\n",
    "\n",
    "def find_state_code(text,country_code):\n",
    "    state_match = re.search(state_pattern, text)\n",
    "    \n",
    "    if state_match:\n",
    "        # Extract the state or province name from the match\n",
    "        state_name = state_match.group(1).strip()\n",
    "    \n",
    "    if country_code == 'US':\n",
    "        state_code = us_states.get(state_name,None)\n",
    "    elif country_code == 'CA':\n",
    "        state_code = canadian_provinces.get(state_name,None)\n",
    "    else:\n",
    "        state_code = None\n",
    "    \n",
    "    return state_code\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Find stage\n",
    "stage_patterns = {\n",
    "    'Petition': r'\\b(case filed|petition filed|automatic stay)\\b',\n",
    "    'Discharge': r'\\b(discharge of debts|discharge order|case discharged)\\b',\n",
    "    'Dismissed': r'\\b(case dismissed|dismissal|converted to Chapter 7)\\b'\n",
    "}\n",
    "\n",
    "# Function to categorize bankruptcy stages from text\n",
    "def categorize_stage(text):\n",
    "    categorized_stages = {'Petition': False, 'Discharge': False, 'Dismissed': False}\n",
    "    \n",
    "    for stage, pattern in stage_patterns.items():\n",
    "        if re.search(pattern, text, re.IGNORECASE):\n",
    "            categorized_stages[stage] = True\n",
    "    \n",
    "    # Determine the final stage based on the presence of keywords\n",
    "    if categorized_stages['Petition']:\n",
    "        return 'Petition'\n",
    "    elif categorized_stages['Discharge']:\n",
    "        return 'Discharge'\n",
    "    elif categorized_stages['Dismissed']:\n",
    "        return 'Dismissed'\n",
    "    else:\n",
    "        return 'Unknown'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Data found: {'ssns': ['461−81−0513', '529−97−1200'], 'chapter': '13', 'country_code': 'US', 'state': 'UT', 'stage': 'Petition'}\n"
     ]
    }
   ],
   "source": [
    "data = { \"ssns\": find_ssns(text_all),\n",
    "        \"chapter\": find_chapter(text_all),\n",
    "        \"country_code\": find_country_code(text_all),\n",
    "        \"state\": find_state_code(text_all, find_country_code(text_all)),\n",
    "        \"stage\": categorize_stage(text_all)\n",
    "        }\n",
    "\n",
    "print(f\"Data found: {data}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}