{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "e9ca44ab-68d4-4361-a7fb-1f887f1b06c0",
   "metadata": {
    "papermill": {
     "duration": 20.056463,
     "end_time": "2023-02-01T13:28:53.560235",
     "exception": false,
     "start_time": "2023-02-01T13:28:33.503772",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "!pip install -q transformers datasets"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "d5482d72-f55e-4b09-befc-a0b71fb0f6b3",
   "metadata": {
    "papermill": {
     "duration": 0.126709,
     "end_time": "2023-02-01T13:28:53.696755",
     "exception": false,
     "start_time": "2023-02-01T13:28:53.570046",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Warning: Unexpected command-line argument -f found.\n",
      "Warning: Unexpected command-line argument /root/.local/share/jupyter/runtime/kernel-92e2dce4-3520-4966-a7b3-b12619e1a0d7.json found.\n"
     ]
    }
   ],
   "source": [
    "import valohai\n",
    "\n",
    "valohai.prepare(\n",
    "    step='train-model',\n",
    "    image='pytorch/pytorch:1.10.0-cuda11.3-cudnn8-runtime',    \n",
    "    default_parameters={        \n",
    "        'epochs': 10,\n",
    "        'model': 'google/mt5-small',\n",
    "    }\n",
    ")\n",
    "output_path = valohai.outputs().path('model')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "7d8321e3-caf8-4f1b-8f4e-568df5e9608c",
   "metadata": {
    "papermill": {
     "duration": 1.139645,
     "end_time": "2023-02-01T13:28:54.844272",
     "exception": false,
     "start_time": "2023-02-01T13:28:53.704627",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/opt/conda/lib/python3.7/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
      "  from .autonotebook import tqdm as notebook_tqdm\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "cuda\n"
     ]
    }
   ],
   "source": [
    "import torch\n",
    "\n",
    "torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'\n",
    "print(torch_device)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "f4484a17-8ba2-45a8-b537-24c44bb5bb7c",
   "metadata": {
    "papermill": {
     "duration": 0.782457,
     "end_time": "2023-02-01T13:28:55.633345",
     "exception": false,
     "start_time": "2023-02-01T13:28:54.850888",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Mon Mar 27 07:02:29 2023       \n",
      "+-----------------------------------------------------------------------------+\n",
      "| NVIDIA-SMI 470.129.06   Driver Version: 470.129.06   CUDA Version: 11.4     |\n",
      "|-------------------------------+----------------------+----------------------+\n",
      "| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |\n",
      "| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |\n",
      "|                               |                      |               MIG M. |\n",
      "|===============================+======================+======================|\n",
      "|   0  NVIDIA RTX A6000    On   | 00000000:05:00.0 Off |                  Off |\n",
      "| 30%   31C    P8    15W / 300W |      3MiB / 48685MiB |      0%      Default |\n",
      "|                               |                      |                  N/A |\n",
      "+-------------------------------+----------------------+----------------------+\n",
      "                                                                               \n",
      "+-----------------------------------------------------------------------------+\n",
      "| Processes:                                                                  |\n",
      "|  GPU   GI   CI        PID   Type   Process name                  GPU Memory |\n",
      "|        ID   ID                                                   Usage      |\n",
      "|=============================================================================|\n",
      "|  No running processes found                                                 |\n",
      "+-----------------------------------------------------------------------------+\n"
     ]
    }
   ],
   "source": [
    "! nvidia-smi"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "73334e06-3bf2-4e94-9870-fe3a487398c3",
   "metadata": {
    "papermill": {
     "duration": 45.306651,
     "end_time": "2023-02-01T13:29:40.951034",
     "exception": false,
     "start_time": "2023-02-01T13:28:55.644383",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Found cached dataset wikisql (/root/.cache/huggingface/datasets/wikisql/default/0.1.0/7037bfe6a42b1ca2b6ac3ccacba5253b1825d31379e9cc626fc79a620977252d)\n",
      "Found cached dataset wikisql (/root/.cache/huggingface/datasets/wikisql/default/0.1.0/7037bfe6a42b1ca2b6ac3ccacba5253b1825d31379e9cc626fc79a620977252d)\n"
     ]
    }
   ],
   "source": [
    "from datasets import load_dataset\n",
    "\n",
    "train_data = load_dataset('wikisql', split='train+validation')\n",
    "test_data = load_dataset('wikisql', split='test')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "cf5379de-aeb5-4a1c-8d23-9ad1e56dc445",
   "metadata": {
    "papermill": {
     "duration": 0.038407,
     "end_time": "2023-02-01T13:29:41.013026",
     "exception": false,
     "start_time": "2023-02-01T13:29:40.974619",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "def format_dataset(example):\n",
    " return {'input': 'translate to SQL: ' + example['question'] + ' table ID: ' + ', '.join(str(x) for x in example['table']['header']), 'target': example['sql']['human_readable']}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "1ce6feef-eab2-4b7a-86f0-c663e5790c5d",
   "metadata": {
    "papermill": {
     "duration": 17.729786,
     "end_time": "2023-02-01T13:29:58.768354",
     "exception": false,
     "start_time": "2023-02-01T13:29:41.038568",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Loading cached processed dataset at /root/.cache/huggingface/datasets/wikisql/default/0.1.0/7037bfe6a42b1ca2b6ac3ccacba5253b1825d31379e9cc626fc79a620977252d/cache-1ea43016a8276f85.arrow\n"
     ]
    }
   ],
   "source": [
    "train_data = train_data.map(format_dataset, remove_columns=train_data.column_names)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "03862b72-56e4-40ab-aae2-81604f69d608",
   "metadata": {
    "papermill": {
     "duration": 4.566604,
     "end_time": "2023-02-01T13:30:03.373278",
     "exception": false,
     "start_time": "2023-02-01T13:29:58.806674",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Loading cached processed dataset at /root/.cache/huggingface/datasets/wikisql/default/0.1.0/7037bfe6a42b1ca2b6ac3ccacba5253b1825d31379e9cc626fc79a620977252d/cache-b9e3da7e258b7aa5.arrow\n"
     ]
    }
   ],
   "source": [
    "test_data = test_data.map(format_dataset, remove_columns=test_data.column_names)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "6246e5c3-4d91-4c65-9ee9-bfc366339e97",
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Requirement already satisfied: sentencepiece in /opt/conda/lib/python3.7/site-packages (0.1.97)\n",
      "Collecting protobuf==3.20.*\n",
      "  Downloading protobuf-3.20.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl (1.0 MB)\n",
      "\u001b[K     |████████████████████████████████| 1.0 MB 4.4 MB/s eta 0:00:01\n",
      "\u001b[?25hInstalling collected packages: protobuf\n",
      "  Attempting uninstall: protobuf\n",
      "    Found existing installation: protobuf 4.22.1\n",
      "    Uninstalling protobuf-4.22.1:\n",
      "      Successfully uninstalled protobuf-4.22.1\n",
      "Successfully installed protobuf-3.20.3\n"
     ]
    }
   ],
   "source": [
    "!pip install sentencepiece\n",
    "!pip install protobuf==3.20.*"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "f162ac75-aeda-409a-af8c-f70f5a1d7cbd",
   "metadata": {
    "papermill": {
     "duration": 16.204849,
     "end_time": "2023-02-01T13:30:19.617815",
     "exception": false,
     "start_time": "2023-02-01T13:30:03.412966",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/opt/conda/lib/python3.7/site-packages/transformers/convert_slow_tokenizer.py:447: UserWarning: The sentencepiece tokenizer that you are converting to a fast tokenizer uses the byte fallback option which is not implemented in the fast tokenizers. In practice this means that the fast version of the tokenizer can produce unknown tokens whereas the sentencepiece version would have converted these unknown tokens into a sequence of byte tokens matching the original piece of text.\n",
      "  \"The sentencepiece tokenizer that you are converting to a fast tokenizer uses the byte fallback option\"\n",
      "You are using a model of type mt5 to instantiate a model of type t5. This is not supported for all configurations of models and can yield errors.\n",
      "Downloading pytorch_model.bin: 100%|██████████| 1.20G/1.20G [00:16<00:00, 72.6MB/s]\n",
      "Downloading (…)neration_config.json: 100%|██████████| 147/147 [00:00<00:00, 31.2kB/s]\n"
     ]
    }
   ],
   "source": [
    "CKPT = valohai.parameters(\"model\").value\n",
    "from transformers import AutoTokenizer, T5ForConditionalGeneration\n",
    "tokenizer = AutoTokenizer.from_pretrained(CKPT)\n",
    "model = T5ForConditionalGeneration.from_pretrained(CKPT).to(torch_device)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "6e2c9c3b-dfd1-4a34-ad77-3c8f69ac4854",
   "metadata": {
    "papermill": {
     "duration": 2.058386,
     "end_time": "2023-02-01T13:30:21.722091",
     "exception": false,
     "start_time": "2023-02-01T13:30:19.663705",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "                                                                                "
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Input Mean: 47.4798, %-Input > 256:0.0,  %-Input > 128:0.001, %-Input > 64:0.0684 Output Mean:19.4288, %-Output > 256:0.0, %-Output > 128:0.0002, %-Output > 64:0.0004\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\r"
     ]
    }
   ],
   "source": [
    "# map article and summary len to dict as well as if sample is longer than 512 tokens\n",
    "def map_to_length(x):\n",
    "  x[\"input_len\"] = len(tokenizer(x[\"input\"]).input_ids)\n",
    "  x[\"input_longer_256\"] = int(x[\"input_len\"] > 256)\n",
    "  x[\"input_longer_128\"] = int(x[\"input_len\"] > 128)\n",
    "  x[\"input_longer_64\"] = int(x[\"input_len\"] > 64)\n",
    "  x[\"out_len\"] = len(tokenizer(x[\"target\"]).input_ids)\n",
    "  x[\"out_longer_256\"] = int(x[\"out_len\"] > 256)\n",
    "  x[\"out_longer_128\"] = int(x[\"out_len\"] > 128)\n",
    "  x[\"out_longer_64\"] = int(x[\"out_len\"] > 64)\n",
    "  return x\n",
    "\n",
    "sample_size = 10000\n",
    "data_stats = train_data.select(range(sample_size)).map(map_to_length, num_proc=4)\n",
    "\n",
    "def compute_and_print_stats(x):\n",
    "  if len(x[\"input_len\"]) == sample_size:\n",
    "    print(\n",
    "        \"Input Mean: {}, %-Input > 256:{},  %-Input > 128:{}, %-Input > 64:{} Output Mean:{}, %-Output > 256:{}, %-Output > 128:{}, %-Output > 64:{}\".format(\n",
    "            sum(x[\"input_len\"]) / sample_size,\n",
    "            sum(x[\"input_longer_256\"]) / sample_size,\n",
    "            sum(x[\"input_longer_128\"]) / sample_size,\n",
    "            sum(x[\"input_longer_64\"]) / sample_size,   \n",
    "            sum(x[\"out_len\"]) / sample_size,\n",
    "            sum(x[\"out_longer_256\"]) / sample_size,\n",
    "            sum(x[\"out_longer_128\"]) / sample_size,\n",
    "            sum(x[\"out_longer_64\"]) / sample_size,\n",
    "        )\n",
    "    )\n",
    "\n",
    "output = data_stats.map(\n",
    "  compute_and_print_stats, \n",
    "  batched=True,\n",
    "  batch_size=-1,\n",
    ")    "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "d6b69f36-bd57-46e4-b77e-a0017ffbf64e",
   "metadata": {
    "papermill": {
     "duration": 0.063495,
     "end_time": "2023-02-01T13:30:21.834853",
     "exception": false,
     "start_time": "2023-02-01T13:30:21.771358",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "# tokenize the examples\n",
    "def convert_to_features(example_batch):\n",
    "    input_encodings = tokenizer.batch_encode_plus(example_batch['input'], pad_to_max_length=True, max_length=100, truncation=True)\n",
    "    target_encodings = tokenizer.batch_encode_plus(example_batch['target'], pad_to_max_length=True, max_length=100, truncation=True)\n",
    "\n",
    "    encodings = {\n",
    "        'input_ids': input_encodings['input_ids'], \n",
    "        'attention_mask': input_encodings['attention_mask'],\n",
    "        'labels': target_encodings['input_ids'],\n",
    "        'decoder_attention_mask': target_encodings['attention_mask']\n",
    "    }\n",
    "\n",
    "    return encodings "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "67b3b61d-e1ae-435e-8f55-46fa219ea3e2",
   "metadata": {
    "papermill": {
     "duration": 23.172287,
     "end_time": "2023-02-01T13:30:45.056685",
     "exception": false,
     "start_time": "2023-02-01T13:30:21.884398",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Map:   0%|          | 0/64776 [00:00<?, ? examples/s]/opt/conda/lib/python3.7/site-packages/transformers/tokenization_utils_base.py:2352: FutureWarning: The `pad_to_max_length` argument is deprecated and will be removed in a future version, use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or use `padding='max_length'` to pad to a max length. In this case, you can give a specific length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the maximal input size of the model (e.g. 512 for Bert).\n",
      "  FutureWarning,\n",
      "                                                                   \r"
     ]
    }
   ],
   "source": [
    "train_data = train_data.map(convert_to_features, batched=True, remove_columns=train_data.column_names)\n",
    "test_data = test_data.map(convert_to_features, batched=True, remove_columns=test_data.column_names)\n",
    "\n",
    "columns = ['input_ids', 'attention_mask', 'labels', 'decoder_attention_mask']\n",
    "\n",
    "train_data.set_format(type='torch', columns=columns)\n",
    "test_data.set_format(type='torch', columns=columns)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "69d37693-8c5a-45c2-a9fd-dfab43ed71fa",
   "metadata": {
    "papermill": {
     "duration": 0.106751,
     "end_time": "2023-02-01T13:30:45.221681",
     "exception": false,
     "start_time": "2023-02-01T13:30:45.114930",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "from transformers import Seq2SeqTrainer\n",
    "from transformers import Seq2SeqTrainingArguments"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "644e81ec-1c23-4a2d-a488-f9354c237815",
   "metadata": {
    "papermill": {
     "duration": 0.069207,
     "end_time": "2023-02-01T13:30:45.347009",
     "exception": false,
     "start_time": "2023-02-01T13:30:45.277802",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "# set training arguments - Feel free to adapt it\n",
    "training_args = Seq2SeqTrainingArguments(\n",
    "    output_dir=output_path,\n",
    "    per_device_train_batch_size=16,\n",
    "    num_train_epochs=valohai.parameters(\"epochs\").value,\n",
    "    per_device_eval_batch_size=16,\n",
    "    predict_with_generate=True,\n",
    "    evaluation_strategy=\"epoch\",\n",
    "    do_train=True,\n",
    "    do_eval=True,\n",
    "    logging_steps=500,\n",
    "    save_strategy=\"epoch\",\n",
    "    #save_steps=1000,\n",
    "    #eval_steps=1000,\n",
    "    overwrite_output_dir=True,\n",
    "    save_total_limit=1,\n",
    "    load_best_model_at_end=True,\n",
    "    push_to_hub=False\n",
    "    #fp16=True, \n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "46d2344c-df83-4495-b700-71e1308f60f1",
   "metadata": {
    "papermill": {
     "duration": 4.757895,
     "end_time": "2023-02-01T13:30:50.160794",
     "exception": false,
     "start_time": "2023-02-01T13:30:45.402899",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "! pip install -q rouge_score"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "63ca930c-9cd3-4880-beb6-dd44057069bb",
   "metadata": {
    "papermill": {
     "duration": 1.098239,
     "end_time": "2023-02-01T13:30:51.318015",
     "exception": false,
     "start_time": "2023-02-01T13:30:50.219776",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/opt/conda/lib/python3.7/site-packages/ipykernel_launcher.py:2: FutureWarning: load_metric is deprecated and will be removed in the next major version of datasets. Use 'evaluate.load' instead, from the new library 🤗 Evaluate: https://huggingface.co/docs/evaluate\n",
      "  \n"
     ]
    }
   ],
   "source": [
    "from datasets import load_metric\n",
    "rouge = load_metric(\"rouge\")\n",
    "\n",
    "def compute_metrics(pred):\n",
    "    labels_ids = pred.label_ids\n",
    "    pred_ids = pred.predictions\n",
    "\n",
    "    # all unnecessary tokens are removed\n",
    "    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)\n",
    "    labels_ids[labels_ids == -100] = tokenizer.pad_token_id\n",
    "    label_str = tokenizer.batch_decode(labels_ids, skip_special_tokens=True)\n",
    "\n",
    "    rouge_output = rouge.compute(predictions=pred_str, references=label_str, rouge_types=[\"rouge2\"])[\"rouge2\"].mid\n",
    "\n",
    "    return {\n",
    "        \"rouge2_precision\": round(rouge_output.precision, 4),\n",
    "        \"rouge2_recall\": round(rouge_output.recall, 4),\n",
    "        \"rouge2_fmeasure\": round(rouge_output.fmeasure, 4),\n",
    "    }"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "2977e566-8714-4164-b7ad-2706dbd26be8",
   "metadata": {
    "papermill": {
     "duration": 0.074325,
     "end_time": "2023-02-01T13:30:51.451387",
     "exception": false,
     "start_time": "2023-02-01T13:30:51.377062",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "# instantiate trainer\n",
    "trainer = Seq2SeqTrainer(\n",
    "    model=model,\n",
    "    args=training_args,\n",
    "    compute_metrics=compute_metrics,\n",
    "    train_dataset=train_data,\n",
    "    eval_dataset=test_data,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "8dce01a3-61b2-4cb4-b5d2-319d0e946083",
   "metadata": {
    "papermill": {
     "duration": 227.616733,
     "end_time": "2023-02-01T13:34:39.125675",
     "exception": false,
     "start_time": "2023-02-01T13:30:51.508942",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "\n",
       "    <div>\n",
       "      \n",
       "      <progress value='1986' max='993' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
       "      [993/993 14:21]\n",
       "    </div>\n",
       "    "
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "{'eval_loss': 42.09397506713867,\n",
       " 'eval_rouge2_precision': 0.002,\n",
       " 'eval_rouge2_recall': 0.0009,\n",
       " 'eval_rouge2_fmeasure': 0.0012,\n",
       " 'eval_runtime': 77.1,\n",
       " 'eval_samples_per_second': 205.94,\n",
       " 'eval_steps_per_second': 12.879}"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "trainer.evaluate()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "55e1a216-8034-49ef-aea4-5fee281d07f3",
   "metadata": {
    "papermill": {
     "duration": 6776.251162,
     "end_time": "2023-02-01T15:27:35.554942",
     "exception": false,
     "start_time": "2023-02-01T13:34:39.303780",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/opt/conda/lib/python3.7/site-packages/transformers/optimization.py:395: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning\n",
      "  FutureWarning,\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "\n",
       "    <div>\n",
       "      \n",
       "      <progress value='40490' max='40490' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
       "      [40490/40490 2:12:07, Epoch 10/10]\n",
       "    </div>\n",
       "    <table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       " <tr style=\"text-align: left;\">\n",
       "      <th>Epoch</th>\n",
       "      <th>Training Loss</th>\n",
       "      <th>Validation Loss</th>\n",
       "      <th>Rouge2 Precision</th>\n",
       "      <th>Rouge2 Recall</th>\n",
       "      <th>Rouge2 Fmeasure</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>0.103200</td>\n",
       "      <td>0.051379</td>\n",
       "      <td>0.901000</td>\n",
       "      <td>0.817300</td>\n",
       "      <td>0.849700</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>0.065800</td>\n",
       "      <td>0.038024</td>\n",
       "      <td>0.917400</td>\n",
       "      <td>0.838200</td>\n",
       "      <td>0.869300</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3</td>\n",
       "      <td>0.054700</td>\n",
       "      <td>0.033012</td>\n",
       "      <td>0.923000</td>\n",
       "      <td>0.844100</td>\n",
       "      <td>0.875000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>4</td>\n",
       "      <td>0.045900</td>\n",
       "      <td>0.030169</td>\n",
       "      <td>0.928600</td>\n",
       "      <td>0.847300</td>\n",
       "      <td>0.880000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>5</td>\n",
       "      <td>0.040100</td>\n",
       "      <td>0.028730</td>\n",
       "      <td>0.930800</td>\n",
       "      <td>0.849800</td>\n",
       "      <td>0.882400</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>6</td>\n",
       "      <td>0.039300</td>\n",
       "      <td>0.027651</td>\n",
       "      <td>0.931800</td>\n",
       "      <td>0.850700</td>\n",
       "      <td>0.883300</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>7</td>\n",
       "      <td>0.036000</td>\n",
       "      <td>0.027332</td>\n",
       "      <td>0.932900</td>\n",
       "      <td>0.852000</td>\n",
       "      <td>0.884600</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>8</td>\n",
       "      <td>0.033500</td>\n",
       "      <td>0.026453</td>\n",
       "      <td>0.933100</td>\n",
       "      <td>0.852300</td>\n",
       "      <td>0.884900</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>9</td>\n",
       "      <td>0.032800</td>\n",
       "      <td>0.026168</td>\n",
       "      <td>0.934200</td>\n",
       "      <td>0.853100</td>\n",
       "      <td>0.885800</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>10</td>\n",
       "      <td>0.032300</td>\n",
       "      <td>0.026122</td>\n",
       "      <td>0.934300</td>\n",
       "      <td>0.853100</td>\n",
       "      <td>0.885900</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table><p>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "TrainOutput(global_step=40490, training_loss=0.2895770631857524, metrics={'train_runtime': 7927.7437, 'train_samples_per_second': 81.708, 'train_steps_per_second': 5.107, 'total_flos': 6.689509761024e+16, 'train_loss': 0.2895770631857524, 'epoch': 10.0})"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "trainer.train()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "id": "0bd48e53-90c3-483f-b17d-ac996f801977",
   "metadata": {
    "papermill": {
     "duration": 1.119331,
     "end_time": "2023-02-01T15:27:37.410693",
     "exception": false,
     "start_time": "2023-02-01T15:27:36.291362",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "('/valohai/outputs/model/tokenizer_config.json',\n",
       " '/valohai/outputs/model/special_tokens_map.json',\n",
       " '/valohai/outputs/model/spiece.model',\n",
       " '/valohai/outputs/model/added_tokens.json',\n",
       " '/valohai/outputs/model/tokenizer.json')"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "trainer.save_model(output_path)\n",
    "tokenizer.save_pretrained(output_path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "id": "38dbdcd0-14b3-4270-8ad8-03059b6d63de",
   "metadata": {
    "papermill": {
     "duration": 1.717893,
     "end_time": "2023-02-01T15:27:39.866396",
     "exception": false,
     "start_time": "2023-02-01T15:27:38.148503",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "CKPT = output_path\n",
    "\n",
    "tokenizer = AutoTokenizer.from_pretrained(CKPT, local_files_only=True)\n",
    "model = T5ForConditionalGeneration.from_pretrained(CKPT, local_files_only=True).to(torch_device)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "id": "90587462-5328-4f32-bad9-4fbfb00d7bb7",
   "metadata": {
    "papermill": {
     "duration": 18.569639,
     "end_time": "2023-02-01T15:27:59.089768",
     "exception": false,
     "start_time": "2023-02-01T15:27:40.520129",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Requirement already satisfied: sentencepiece in /opt/conda/lib/python3.7/site-packages (0.1.97)\n",
      "Collecting pandasql\n",
      "  Downloading pandasql-0.7.3.tar.gz (26 kB)\n",
      "Requirement already satisfied: numpy in /opt/conda/lib/python3.7/site-packages (from pandasql) (1.21.2)\n",
      "Requirement already satisfied: pandas in /opt/conda/lib/python3.7/site-packages (from pandasql) (1.3.5)\n",
      "Collecting sqlalchemy\n",
      "  Downloading SQLAlchemy-2.0.7-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.7 MB)\n",
      "\u001b[K     |████████████████████████████████| 2.7 MB 6.9 MB/s eta 0:00:01\n",
      "\u001b[?25hRequirement already satisfied: python-dateutil>=2.7.3 in /opt/conda/lib/python3.7/site-packages (from pandas->pandasql) (2.8.2)\n",
      "Requirement already satisfied: pytz>=2017.3 in /opt/conda/lib/python3.7/site-packages (from pandas->pandasql) (2021.3)\n",
      "Requirement already satisfied: six>=1.5 in /opt/conda/lib/python3.7/site-packages (from python-dateutil>=2.7.3->pandas->pandasql) (1.16.0)\n",
      "Requirement already satisfied: typing-extensions>=4.2.0 in /opt/conda/lib/python3.7/site-packages (from sqlalchemy->pandasql) (4.5.0)\n",
      "Collecting greenlet!=0.4.17\n",
      "  Downloading greenlet-2.0.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (566 kB)\n",
      "\u001b[K     |████████████████████████████████| 566 kB 111.8 MB/s eta 0:00:01\n",
      "\u001b[?25hRequirement already satisfied: importlib-metadata in /opt/conda/lib/python3.7/site-packages (from sqlalchemy->pandasql) (6.1.0)\n",
      "Requirement already satisfied: zipp>=0.5 in /opt/conda/lib/python3.7/site-packages (from importlib-metadata->sqlalchemy->pandasql) (3.15.0)\n",
      "Building wheels for collected packages: pandasql\n",
      "  Building wheel for pandasql (setup.py) ... \u001b[?25ldone\n",
      "\u001b[?25h  Created wheel for pandasql: filename=pandasql-0.7.3-py3-none-any.whl size=26782 sha256=110b83989487b7b983fb80e3ede92a519027d1ddfd6988e2012175878ee93522\n",
      "  Stored in directory: /root/.cache/pip/wheels/5c/4b/ec/41f4e116c8053c3654e2c2a47c62b4fca34cc67ef7b55deb7f\n",
      "Successfully built pandasql\n",
      "Installing collected packages: greenlet, sqlalchemy, pandasql\n",
      "Successfully installed greenlet-2.0.2 pandasql-0.7.3 sqlalchemy-2.0.7\n",
      "Collecting python-Levenshtein\n",
      "  Downloading python_Levenshtein-0.20.9-py3-none-any.whl (9.4 kB)\n",
      "Collecting Levenshtein==0.20.9\n",
      "  Downloading Levenshtein-0.20.9-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (175 kB)\n",
      "\u001b[K     |████████████████████████████████| 175 kB 4.3 MB/s eta 0:00:01\n",
      "\u001b[?25hCollecting rapidfuzz<3.0.0,>=2.3.0\n",
      "  Downloading rapidfuzz-2.13.7-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.2 MB)\n",
      "\u001b[K     |████████████████████████████████| 2.2 MB 59.5 MB/s eta 0:00:01\n",
      "\u001b[?25hInstalling collected packages: rapidfuzz, Levenshtein, python-Levenshtein\n",
      "Successfully installed Levenshtein-0.20.9 python-Levenshtein-0.20.9 rapidfuzz-2.13.7\n",
      "Collecting sacremoses\n",
      "  Downloading sacremoses-0.0.53.tar.gz (880 kB)\n",
      "\u001b[K     |████████████████████████████████| 880 kB 4.3 MB/s eta 0:00:01\n",
      "\u001b[?25hRequirement already satisfied: regex in /opt/conda/lib/python3.7/site-packages (from sacremoses) (2022.10.31)\n",
      "Requirement already satisfied: six in /opt/conda/lib/python3.7/site-packages (from sacremoses) (1.16.0)\n",
      "Requirement already satisfied: click in /opt/conda/lib/python3.7/site-packages (from sacremoses) (8.1.3)\n",
      "Requirement already satisfied: joblib in /opt/conda/lib/python3.7/site-packages (from sacremoses) (1.2.0)\n",
      "Requirement already satisfied: tqdm in /opt/conda/lib/python3.7/site-packages (from sacremoses) (4.65.0)\n",
      "Requirement already satisfied: importlib-metadata in /opt/conda/lib/python3.7/site-packages (from click->sacremoses) (6.1.0)\n",
      "Requirement already satisfied: typing-extensions>=3.6.4 in /opt/conda/lib/python3.7/site-packages (from importlib-metadata->click->sacremoses) (4.5.0)\n",
      "Requirement already satisfied: zipp>=0.5 in /opt/conda/lib/python3.7/site-packages (from importlib-metadata->click->sacremoses) (3.15.0)\n",
      "Building wheels for collected packages: sacremoses\n",
      "  Building wheel for sacremoses (setup.py) ... \u001b[?25ldone\n",
      "\u001b[?25h  Created wheel for sacremoses: filename=sacremoses-0.0.53-py3-none-any.whl size=895259 sha256=0f511e2624db29b2126dc7c3aec8ba54e44a1d89fa58a852332349de3af597b3\n",
      "  Stored in directory: /root/.cache/pip/wheels/87/39/dd/a83eeef36d0bf98e7a4d1933a4ad2d660295a40613079bafc9\n",
      "Successfully built sacremoses\n",
      "Installing collected packages: sacremoses\n",
      "Successfully installed sacremoses-0.0.53\n"
     ]
    }
   ],
   "source": [
    "!pip install sentencepiece\n",
    "!pip install pandasql\n",
    "!pip install python-Levenshtein\n",
    "!pip install sacremoses"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "id": "403dc883-13b6-4661-bb8c-678ec22840ab",
   "metadata": {
    "papermill": {
     "duration": 0.819632,
     "end_time": "2023-02-01T15:28:00.631919",
     "exception": false,
     "start_time": "2023-02-01T15:27:59.812287",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "import Levenshtein\n",
    "import re\n",
    "from collections import Counter\n",
    "\n",
    "#Get columns in query\n",
    "def get_columns_name_in_query(query):\n",
    "  cols_from_select = get_cols_name_for_select(query) \n",
    "  cols_from_where = get_cols_name_for_where(query)\n",
    "  return list(set(cols_from_select + cols_from_where))\n",
    "\n",
    "#Translate query in natural language from italian to english (input: string; output: string)\n",
    "def translate2en(query):\n",
    "  translated = model_t.generate(**tokenizer_t(query, return_tensors=\"pt\", padding=True))\n",
    "  query = [tokenizer_t.decode(t, skip_special_tokens=True) for t in translated]\n",
    "  return query\n",
    "\n",
    "# Sometime column name maybe ill-defined. This function replace weird chars with underscore (input:list; output:string)\n",
    "def replace_nonalphanumeric_chars_with_us(l):\n",
    "  well_defined = [re.sub('[^0-9a-zA-Z]+', '_', s) for s in l]\n",
    "  return well_defined\n",
    "\n",
    "# Adjust column name using columns name from original table (input: column name in SQL query (string), \n",
    "#list of columns names from table (string); output: corrected column name (if needed) (string))\n",
    "def adjust_col_name(col_name, columns_available): \n",
    "  columns_available = [x.upper() for x in columns_available]\n",
    "  if col_name.upper() in set(columns_available):\n",
    "    return col_name\n",
    "  else:\n",
    "    max = -100\n",
    "    most_similar_column = 'column123456789011'\n",
    "    for col in columns_available:      \n",
    "      score = -Levenshtein.distance(col_name, col)               \n",
    "      if score > max:\n",
    "        most_similar_column = col  \n",
    "        max = score           \n",
    "    return most_similar_column\n",
    "\n",
    "def min_positive(a,b):\n",
    "  if (b < a) and (b > 0): return b\n",
    "  else: return a\n",
    "\n",
    "#Return corrected syntax for aggregator operators (input: string; output: string)\n",
    "#USE only for wikisql dataset\n",
    "def aggregator_parser(query): \n",
    "  query = query.upper() \n",
    "  if query.find('SELECT MAX') > -1:\n",
    "    end = min_positive(query.find('FROM'), query.find(','))    \n",
    "    adjusted_query = query.replace(query[10:end],'(' + query[11:end-1] + ') ')\n",
    "    return adjusted_query\n",
    "  elif query.find('SELECT COUNT') > -1:\n",
    "    end = min_positive(query.find('FROM'), query.find(','))\n",
    "    adjusted_query = query.replace(query[12:end],'(' + query[13:end-1] + ') ')\n",
    "    return adjusted_query\n",
    "  elif query.find('SELECT MIN') > -1:\n",
    "    end = min_positive(query.find('FROM'), query.find(','))\n",
    "    adjusted_query = query.replace(query[10:end],'(' + query[11:end-1] + ') ')\n",
    "    return adjusted_query\n",
    "  #elif query.find('SELECT DISTINCT') > -1:\n",
    "   #end = query.find('FROM')\n",
    "    #adjusted_query = query.replace(query[15:end],'(' + query[16:end-1] + ') ')\n",
    "    #return adjusted_query\n",
    "  else: \n",
    "    return query\n",
    "\n",
    "#Return columns name from SELECT operator (input: string; output: list)\n",
    "def get_cols_name_for_select(query):\n",
    "  query = query.upper()  \n",
    "  if query.find('SELECT DISTINCT') > -1:\n",
    "    end = query.find('FROM')\n",
    "    cols = query[15:end-1].split(',')\n",
    "  elif query.find('SELECT MAX') > -1:\n",
    "    end = query.find('FROM')\n",
    "    cols = query[10:end-1].split(',')  \n",
    "  elif query.find('SELECT MIN') > -1:\n",
    "    end = query.find('FROM')\n",
    "    cols = query[10:end-1].split(',')     \n",
    "  elif query.find('SELECT COUNT') > -1:\n",
    "    end = query.find('FROM')\n",
    "    cols = query[13:end-1].split(',')    \n",
    "  elif query.find('SELECT') > -1:\n",
    "    end = query.find('FROM')\n",
    "    cols = query[7:end-1].split(',')    \n",
    "  else:  \n",
    "    cols = ['']    \n",
    "  return [x.replace(' ','').replace(')','').replace('(','').upper() for x in cols]\n",
    "\n",
    "def get_indexes(l):\n",
    "  ops = []\n",
    "  idx = []\n",
    "  for i in range(len(l)):\n",
    "    if l[i] in ['=', '>', '<', '>=', '<=', '<>', 'LIKE', 'AND', 'OR']:\n",
    "      idx.append(i)\n",
    "  return idx\n",
    "\n",
    "def add_spaces_cmp_operators(string):\n",
    "  ops = ['=', '>', '<', '>=', '<=', '<>']\n",
    "  for op in ops:\n",
    "    string = string.replace(op, ' ' + op + ' ') \n",
    "  return ' '.join(string.split())\n",
    "\n",
    "#Check if string and add quotes (input: string; output: string)\n",
    "#USE only for wikisql dataset\n",
    "def add_quotes_to_string(query):\n",
    "  query = query.upper()\n",
    "  if query.find('WHERE') > 0:\n",
    "    query_list = query.split(' ')\n",
    "    query_list = [x.replace(' ','') for x in query_list]\n",
    "    query_list[:] = filter(None, query_list)  \n",
    "    idx_list = get_indexes(query_list)  \n",
    "    idx_list.append(len(query_list))  \n",
    "    subs = []\n",
    "    for i in range(len(idx_list)):\n",
    "      if i % 2 == 0:\n",
    "        b = idx_list[i] + 1\n",
    "        e = idx_list[i+1] - 1\n",
    "        if b != e:\n",
    "          s = ''\n",
    "          for ix in range(b,e + 1):          \n",
    "            s = s + query_list[ix] + ' ' \n",
    "          s = s[:-1]   \n",
    "        else:\n",
    "          s = query_list[b]     \n",
    "        if not(s.isnumeric()):\n",
    "          s = \"'\" + s + \"'\"\n",
    "        subs.append((idx_list[i] + 1, idx_list[i+1] - 1, s))  \n",
    "    subs = subs[::-1]       \n",
    "    for i in range(len(subs)):\n",
    "      e = subs[i]\n",
    "      if e[0] == e[1]:\n",
    "        query_list[e[0]] = e[2]\n",
    "      else:\n",
    "        query_list[e[0]] = e[2]\n",
    "        idx = e[1]\n",
    "        while idx > e[0]:\n",
    "          query_list.pop(idx)\n",
    "          idx = idx - 1\n",
    "    final_query = ''\n",
    "    for word in query_list:\n",
    "      final_query = final_query + word + ' '     \n",
    "    return final_query[:-1]\n",
    "  else:\n",
    "    return query\n",
    "\n",
    "#Get values from where clause (input: string; output: list)\n",
    "def get_values_for_query_filter(query):\n",
    "  query = query.upper()\n",
    "  if query.find('WHERE') > 0:\n",
    "    query_list = query.split(' ')\n",
    "    query_list = [x.replace(' ','') for x in query_list]\n",
    "    query_list[:] = filter(None, query_list)  \n",
    "    idx_list = get_indexes(query_list)  \n",
    "    idx_list.append(len(query_list))  \n",
    "    subs = []\n",
    "    for i in range(len(idx_list)):\n",
    "      if i % 2 == 0:\n",
    "        b = idx_list[i] + 1\n",
    "        e = idx_list[i+1] - 1\n",
    "        if b != e:\n",
    "          s = ''\n",
    "          for ix in range(b,e + 1):          \n",
    "            s = s + query_list[ix] + ' ' \n",
    "          s = s[:-1]   \n",
    "        else:\n",
    "          s = query_list[b]        \n",
    "        subs.append(s.replace(\"'\",\"\"))\n",
    "  return subs\n",
    "\n",
    "\n",
    "# Get columns name after where (input: string, output: list)\n",
    "def get_cols_name_for_where(query):\n",
    "  query = query.upper()\n",
    "  subs = []  \n",
    "  if query.find('WHERE') > 0:\n",
    "    query_list = query.split(' ')\n",
    "    query_list = [x.replace(' ','') for x in query_list]\n",
    "    query_list[:] = filter(None, query_list)  \n",
    "    idx_list = get_indexes(query_list)  \n",
    "    #idx_list.append(len(query_list))\n",
    "    idx_list.insert(0, query_list.index('WHERE'))      \n",
    "    for i in range(len(idx_list)-1):\n",
    "      if i % 2 == 0:     \n",
    "        b = idx_list[i] + 1\n",
    "        e = idx_list[i+1] - 1\n",
    "        if b != e:\n",
    "          s = ''\n",
    "          for ix in range(b,e + 1):          \n",
    "            s = s + query_list[ix] + ' ' \n",
    "          s = s[:-1]   \n",
    "        else:\n",
    "          s = query_list[b]\n",
    "        subs.append(s)    \n",
    "  return subs   \n",
    "\n",
    "def check_if_number(s):\n",
    "  try:\n",
    "    a = float(s)\n",
    "    return True\n",
    "  except:\n",
    "    return False\n",
    "\n",
    "#Correct missing compare operator (input: string; output: string)\n",
    "#T5 seems to have problem with '<' operator so if there is none this is used.\n",
    "def check_if_correct_cmp_operators(query):\n",
    "  query = query.upper()\n",
    "  if query.find('WHERE') > 0:\n",
    "    query = add_spaces_cmp_operators(query)\n",
    "    query_list = query.split(' ')\n",
    "    w = query_list.index('WHERE')\n",
    "    cmp_operators = ['=', '>', '<', '>=', '<=', '<>', 'LIKE']\n",
    "    s = 0\n",
    "    for op in cmp_operators:\n",
    "      s = s + query_list.count(op)\n",
    "    if s == 0:      \n",
    "      if check_if_number(query_list[-1]):\n",
    "        query_list.insert(len(query_list)-1,'<')\n",
    "      else:\n",
    "        query_list.insert(len(query_list)-1,'=')\n",
    "      return ' '.join(query_list)\n",
    "    else:\n",
    "      return query\n",
    "  else: return query\n",
    "    \n",
    "\n",
    "\n",
    "#Correct SQL syntax using info from table (input: string, list; ouput:string)\n",
    "#Use only for wikisql dataset\n",
    "def correct_query(query, table_columns):  \n",
    "    query = check_if_correct_cmp_operators(query)\n",
    "    query = add_spaces_cmp_operators(query)    \n",
    "  #try: \n",
    "    query = aggregator_parser(query) \n",
    "  #except: pass \n",
    "  #try: \n",
    "    query = add_quotes_to_string(query) \n",
    "  #except: pass \n",
    "  #try:\n",
    "    cols_name = get_columns_name_in_query(query)      \n",
    "    for col in cols_name:    \n",
    "      corrected_col = adjust_col_name(col, table_columns)      \n",
    "      query = query.replace(col, corrected_col)\n",
    "  #except: pass\n",
    "    return query\n",
    "\n",
    "def correct_mispelling(question, query):  \n",
    "  query = query.upper()\n",
    "  if query.find('WHERE') > 0:\n",
    "    question = question.upper()\n",
    "    corrections = []\n",
    "    values = get_values_for_query_filter(query)\n",
    "    for value in values:    \n",
    "      l = len(value.split(' '))\n",
    "      tokens = question.replace('  ', ' ').split(' ')\n",
    "      l_gram = ''\n",
    "      max = -100\n",
    "      for i in range(0, len(tokens)-l+1, 1):\n",
    "        filter = ' '.join(tokens[i:i+l]).strip('.,?')\n",
    "        #filter = re.sub(r\"[,.;@#?!&$]+\\ *\", \" \", filter).strip()    \n",
    "        score = -Levenshtein.distance(value, filter)        \n",
    "        if score > max:\n",
    "          max = score\n",
    "          correct_filter = filter        \n",
    "      corrections.append([value, correct_filter])    \n",
    "    for corr in corrections:\n",
    "      query = query.replace(corr[0], corr[1])\n",
    "  return query"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "id": "4683a145-3f8c-4e0e-a7e1-e8b8573ecc35",
   "metadata": {
    "papermill": {
     "duration": 0.740263,
     "end_time": "2023-02-01T15:28:02.036850",
     "exception": false,
     "start_time": "2023-02-01T15:28:01.296587",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "def translate_to_sql(text):\n",
    "    inputs = tokenizer(text, padding='longest', max_length=64, return_tensors='pt').to(torch_device)\n",
    "    input_ids = inputs.input_ids\n",
    "    attention_mask = inputs.attention_mask\n",
    "    output = model.generate(input_ids, attention_mask=attention_mask, max_length=64)\n",
    "\n",
    "    return tokenizer.decode(output[0], skip_special_tokens=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0be55a1e-2ad4-4a4b-9beb-57623059c768",
   "metadata": {
    "papermill": {
     "duration": 1669.9823,
     "end_time": "2023-02-01T15:55:52.681707",
     "exception": false,
     "start_time": "2023-02-01T15:28:02.699407",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "WARNING:datasets.builder:Found cached dataset wikisql (/root/.cache/huggingface/datasets/wikisql/default/0.1.0/7037bfe6a42b1ca2b6ac3ccacba5253b1825d31379e9cc626fc79a620977252d)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "15878\n",
      "0.0 0.01 %\n",
      "0.6595744680851063 0.51 %\n",
      "0.7263157894736842 1.01 %\n",
      "0.6928571428571428 1.51 %\n",
      "0.6878306878306878 2.01 %\n",
      "0.7058823529411765 2.51 %\n",
      "0.7142857142857143 3.01 %\n",
      "0.6795252225519288 3.51 %\n",
      "0.6909090909090909 4.01 %\n",
      "0.7020785219399538 4.51 %\n",
      "0.7 5.01 %\n",
      "0.7 5.51 %\n",
      "0.6972318339100346 6.01 %\n",
      "0.6990445859872612 6.51 %\n",
      "0.7071005917159763 7.01 %\n",
      "0.7158620689655173 7.51 %\n",
      "0.7174193548387097 8.01 %\n",
      "0.7127272727272728 8.51 %\n",
      "0.7177142857142857 9.01 %\n",
      "0.7096424702058505 9.51 %\n",
      "0.7057613168724279 10.01 %\n",
      "0.7045009784735812 10.51 %\n",
      "0.7052238805970149 11.01 %\n",
      "0.6978609625668449 11.51 %\n",
      "0.7028181041844578 12.01 %\n",
      "0.7 12.51 %\n",
      "0.7021276595744681 13.01 %\n",
      "0.7025796661608498 13.51 %\n",
      "0.706140350877193 14.01 %\n",
      "0.7080394922425952 14.51 %\n",
      "0.7100954979536153 15.01 %\n",
      "0.7058047493403694 15.51 %\n",
      "0.7049808429118773 16.01 %\n",
      "0.7054455445544554 16.51 %\n",
      "0.7063063063063063 17.01 %\n",
      "0.7046117921774664 17.51 %\n",
      "0.7060830017055145 18.01 %\n",
      "0.7037037037037037 18.51 %\n",
      "0.7002152852529602 19.01 %\n",
      "0.7017819706498952 19.51 %\n",
      "0.7017364657814096 20.01 %\n",
      "0.7016932270916335 20.51 %\n",
      "0.6987366375121478 21.01 %\n",
      "0.7009034712315739 21.51 %\n",
      "0.7007910656119125 22.01 %\n",
      "0.6995903504779244 22.51 %\n",
      "0.6994657168299199 23.01 %\n",
      "0.6980392156862745 23.51 %\n",
      "0.6961538461538461 24.01 %\n",
      "0.6961071578066137 24.51 %\n",
      "0.6978269782697827 25.01 %\n",
      "0.6994777018883086 25.51 %\n",
      "0.6992510839574301 26.01 %\n",
      "0.699265558562041 26.51 %\n",
      "0.6994307400379507 27.01 %\n",
      "0.6994413407821229 27.51 %\n",
      "0.6998171846435101 28.01 %\n",
      "0.7023339317773788 28.51 %\n",
      "0.702893436838391 29.01 %\n",
      "0.7018763029881863 29.51 %\n",
      "0.7004781420765027 30.01 %\n",
      "0.7021490933512424 30.51 %\n",
      "0.7007926023778072 31.01 %\n",
      "0.701885565669701 31.51 %\n",
      "0.7037747920665387 32.01 %\n",
      "0.7034005037783375 32.51 %\n",
      "0.703657780533168 33.01 %\n",
      "0.7042124542124543 33.51 %\n",
      "0.7037593984962406 34.01 %\n",
      "0.7045925925925925 34.51 %\n",
      "0.7033576642335766 35.01 %\n",
      "0.7020725388601037 35.51 %\n",
      "0.7021881216254617 36.01 %\n",
      "0.7045135968601065 36.51 %\n",
      "0.703816371681416 37.01 %\n",
      "0.7054009819967266 37.51 %\n",
      "0.7053283100107642 38.01 %\n",
      "0.7063197026022305 38.51 %\n",
      "0.7072851153039832 39.01 %\n",
      "0.707815734989648 39.51 %\n",
      "0.7072049054675523 40.01 %\n",
      "0.7080494574817058 40.51 %\n",
      "0.706556968337073 41.01 %\n",
      "0.7067224821472544 41.51 %\n",
      "0.7041135434207361 42.51 %\n",
      "0.7061340941512125 43.01 %\n",
      "0.7056195626616506 43.51 %\n",
      "0.7048570764582849 44.01 %\n",
      "0.70434183321847 44.51 %\n",
      "0.7051340299863699 45.01 %\n",
      "0.7056179775280899 45.51 %\n",
      "0.7052678372971771 46.01 %\n",
      "0.7047702791822379 46.51 %\n",
      "0.7036312241791693 47.01 %\n",
      "0.703807270380727 47.51 %\n",
      "0.704405192594169 48.01 %\n",
      "0.7039376710886502 48.51 %\n",
      "0.7032715148989372 49.01 %\n",
      "0.7025577557755776 49.51 %\n",
      "0.7021233156390363 50.01 %\n",
      "0.7030523549626035 50.51 %\n",
      "0.7028216930158094 51.01 %\n",
      "0.7049147839873167 51.51 %\n",
      "0.7049469964664311 52.01 %\n",
      "0.7056765163297045 52.51 %\n",
      "0.7060069310743166 53.01 %\n",
      "0.7065673921344024 53.51 %\n",
      "0.7078290468986385 54.01 %\n",
      "0.7077557137504683 54.51 %\n",
      "0.707629478373863 55.01 %\n",
      "0.709475620975161 55.51 %\n",
      "0.7108477666362808 56.01 %\n",
      "0.7125813449023861 56.51 %\n",
      "0.7136200716845879 57.01 %\n",
      "0.7140319715808171 57.51 %\n",
      "0.7151408450704225 58.01 %\n",
      "0.715782122905028 58.51 %\n",
      "0.7158186223606784 59.01 %\n",
      "0.7163289630512515 60.01 %\n",
      "0.7156366092536305 60.51 %\n",
      "0.7153382451440053 61.01 %\n",
      "0.7152108933909 61.51 %\n",
      "0.7151565074135091 62.01 %\n",
      "0.7163398692810458 62.51 %\n",
      "0.7165316045380875 63.01 %\n",
      "0.7157099212091976 63.51 %\n",
      "0.7162226830435476 64.01 %\n",
      "0.7173603418262383 64.51 %\n",
      "0.7172240540116188 65.01 %\n",
      "0.7178927680798005 65.51 %\n",
      "0.7174013921113689 66.01 %\n",
      "0.7172678434382195 66.51 %\n",
      "0.7177456207159177 67.01 %\n",
      "0.7183673469387755 67.51 %\n",
      "0.7195798949737434 68.01 %\n",
      "0.7207743857036486 68.51 %\n",
      "0.721359940872136 69.01 %\n",
      "0.7217901687454146 69.51 %\n",
      "0.7223597960670065 70.01 %\n",
      "0.7228410241573846 70.51 %\n",
      "0.722820623294557 71.01 %\n",
      "0.7224757558471192 71.51 %\n",
      "0.7225998300764656 72.01 %\n",
      "0.7231439820022497 72.51 %\n",
      "0.7228898826159866 73.01 %\n",
      "0.7233865371269952 73.51 %\n",
      "0.7239762856748931 74.01 %\n",
      "0.7236373596274993 74.51 %\n",
      "0.7238846572361263 75.01 %\n",
      "0.7239935152661443 75.51 %\n",
      "0.724771873322598 76.01 %\n",
      "0.7247400693148494 76.51 %\n",
      "0.7251655629139073 77.01 %\n",
      "0.7263157894736842 77.51 %\n",
      "0.727141922825376 78.01 %\n",
      "0.7273554256010396 78.51 %\n",
      "0.7268853305785123 79.01 %\n",
      "0.7272260713369259 79.51 %\n",
      "0.7276785714285714 80.01 %\n",
      "0.7281368821292775 80.51 %\n",
      "0.7278911564625851 81.01 %\n",
      "0.728274480340596 81.51 %\n",
      "0.7276007964161274 82.01 %\n",
      "0.7273964131106988 82.51 %\n",
      "0.7280885064535956 83.01 %\n",
      "0.7286726961623075 83.51 %\n",
      "0.7278911564625851 84.01 %\n",
      "0.7276020284955325 84.51 %\n",
      "0.728233457427645 85.01 %\n",
      "0.7277068162826787 85.51 %\n",
      "0.7281034892000949 86.01 %\n",
      "0.728259587020649 86.51 %\n",
      "0.7276246334310851 87.01 %\n",
      "0.7276630308656301 88.01 %\n",
      "0.7275245239469129 88.51 %\n",
      "0.7265006312406749 89.01 %\n",
      "0.726649920073076 89.51 %\n",
      "0.727262404905189 90.01 %\n",
      "0.7269030946464875 90.51 %\n",
      "0.7272012578616353 91.01 %\n",
      "0.7273235031277927 91.51 %\n",
      "0.7276061346965993 92.01 %\n",
      "0.7284687672747374 92.51 %\n",
      "0.7286327136728633 93.01 %\n",
      "0.7277601488127804 93.51 %\n",
      "0.7278267493742518 94.01 %\n",
      "0.7268593699253004 94.51 %\n",
      "0.7275665194441452 95.01 %\n",
      "0.7268245632836781 95.51 %\n",
      "0.7272824232081911 96.01 %\n",
      "0.7267614601018676 96.51 %\n",
      "0.7260346283783784 97.01 %\n",
      "0.7256878806973325 97.51 %\n",
      "0.7253447555369829 98.01 %\n",
      "0.7251559251559252 98.51 %\n",
      "0.7258248009101251 99.01 %\n",
      "0.7258130918073281 99.51 %\n"
     ]
    }
   ],
   "source": [
    "test_data = load_dataset('wikisql', split='test')\n",
    "\n",
    "print(len(test_data))\n",
    "n =10000\n",
    "\n",
    "count = 0\n",
    "correct_samples = 0\n",
    "for i in range(0,n,1):\n",
    "  #print('processed', 100*(i+1)/n,'%')  \n",
    "  question = 'translate to SQL: ' + test_data[i]['question'] + ' table ID: ' + ', '.join(str(x) for x in test_data[i]['table']['header'])   \n",
    "  sql = translate_to_sql(question)\n",
    "  #print(sql, test_data[i]['question'])\n",
    "  #output = correct_query(sql, test_data[i]['table']['header'])  \n",
    "  #output = correct_mispelling(test_data[i]['question'], output)\n",
    "  #target = correct_query(test_data[i]['sql']['human_readable'], test_data[i]['table']['header'])\n",
    "  try:     \n",
    "    output = correct_query(sql, test_data[i]['table']['header'])\n",
    "    output = correct_mispelling(test_data[i]['question'], output)\n",
    "    target = correct_query(test_data[i]['sql']['human_readable'], test_data[i]['table']['header'])\n",
    "    #output = sql\n",
    "    #target = test_data[i]['sql']['human_readable']\n",
    "    correct_samples = correct_samples + 1\n",
    "    if output.lower() == target.lower():\n",
    "      count = count + 1     \n",
    "    else:\n",
    "      #print(question)\n",
    "      #print(output)   \n",
    "      #print(target)      \n",
    "      pass\n",
    "    if i % 50 == 0:\n",
    "        print(count/correct_samples, 100*(i+1)/n,'%')   \n",
    "  except Exception as err:\n",
    "    #print(f\"Unexpected {err=}, {type(err)=}\")\n",
    "    #print('---Error-- ')  \n",
    "    #print(sql) \n",
    "    #print(test_data[i]['sql']['human_readable'])\n",
    "    #print(test_data[i]['table']['header'])\n",
    "    pass\n",
    "  #output = translate_to_sql(question)\n",
    "  #target = test_data[i]['sql']['human_readable']\n",
    "  #print(question)\n",
    "  #print(output)  \n",
    "  #print(target) \n",
    "print(count/n)\n",
    "print(count/correct_samples)\n",
    "print(correct_samples)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8a64367c-63e8-4621-9dc8-f80c7944809f",
   "metadata": {
    "papermill": {
     "duration": 1.247871,
     "end_time": "2023-02-01T15:55:55.285210",
     "exception": false,
     "start_time": "2023-02-01T15:55:54.037339",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "with valohai.logger() as logger:\n",
    "    logger.log('accuracy', count/correct_samples)\n",
    "    "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "94830d9c-7688-4b66-8b07-0bc5b0b0f8d1",
   "metadata": {
    "papermill": {
     "duration": 1.337796,
     "end_time": "2023-02-01T15:55:57.940900",
     "exception": false,
     "start_time": "2023-02-01T15:55:56.603104",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "print(count)\n",
    "print(correct_samples)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e4d06297-007d-4d42-8fc4-e153a857a953",
   "metadata": {
    "papermill": {
     "duration": 1.317653,
     "end_time": "2023-02-01T15:56:00.574479",
     "exception": false,
     "start_time": "2023-02-01T15:55:59.256826",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.11"
  },
  "papermill": {
   "default_parameters": {},
   "duration": 8853.368468,
   "end_time": "2023-02-01T15:56:05.444982",
   "environment_variables": {},
   "exception": null,
   "input_path": "/valohai/repository/txt2sql_t5_small_training.ipynb",
   "output_path": "/valohai/outputs/txt2sql_t5_small_training.ipynb",
   "parameters": {},
   "start_time": "2023-02-01T13:28:32.076514",
   "version": "2.3.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}