{"nbformat":4,"nbformat_minor":0,"metadata":{"colab":{"provenance":[],"gpuType":"T4","authorship_tag":"ABX9TyM+RDc6oT9ugP7mwCzejJZk"},"kernelspec":{"name":"python3","display_name":"Python 3"},"language_info":{"name":"python"},"accelerator":"GPU","widgets":{"application/vnd.jupyter.widget-state+json":{"a84764be72c94e9b92b4bfa1c39f5a5d":{"model_module":"@jupyter-widgets/controls","model_name":"HBoxModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HBoxModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HBoxView","box_style":"","children":["IPY_MODEL_203be9dfa34d44c48cf81c407af93e7f","IPY_MODEL_23c261bb2a7643d0b157435d23bfa579","IPY_MODEL_e1ab3c1fd28b41baa1b03c41263c5e8a"],"layout":"IPY_MODEL_f0730089d3c148d7930d64d1fb32baf3"}},"203be9dfa34d44c48cf81c407af93e7f":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_7d732638e92f4003a188de33b11728d9","placeholder":"","style":"IPY_MODEL_3c47de8fb43445dd8da87c0c92b66a41","value":"Map: 100%"}},"23c261bb2a7643d0b157435d23bfa579":{"model_module":"@jupyter-widgets/controls","model_name":"FloatProgressModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"FloatProgressModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"ProgressView","bar_style":"success","description":"","description_tooltip":null,"layout":"IPY_MODEL_98ad2005ca124300be0bc8a164a9c47d","max":50001,"min":0,"orientation":"horizontal","style":"IPY_MODEL_bb57322fe0a444bc8a6a4d763b5a048f","value":50001}},"e1ab3c1fd28b41baa1b03c41263c5e8a":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_4476cc1ba6604a17b1edf9f02b61e961","placeholder":"","style":"IPY_MODEL_fe699bfd6b824a7d93986846e6300206","value":" 50001/50001 [00:30<00:00, 1937.62 examples/s]"}},"f0730089d3c148d7930d64d1fb32baf3":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"7d732638e92f4003a188de33b11728d9":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"3c47de8fb43445dd8da87c0c92b66a41":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"98ad2005ca124300be0bc8a164a9c47d":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"bb57322fe0a444bc8a6a4d763b5a048f":{"model_module":"@jupyter-widgets/controls","model_name":"ProgressStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"ProgressStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","bar_color":null,"description_width":""}},"4476cc1ba6604a17b1edf9f02b61e961":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"fe699bfd6b824a7d93986846e6300206":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"c8015686d7b64cd8b538f41e10126dfc":{"model_module":"@jupyter-widgets/controls","model_name":"HBoxModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HBoxModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HBoxView","box_style":"","children":["IPY_MODEL_eafdd13a06d8465687dc3f6313064fb5","IPY_MODEL_c0ec5bdd73b5488fa6804b5d97d12899","IPY_MODEL_444cf550822e42399f2bdd4634c4acf5"],"layout":"IPY_MODEL_ac8428a7457f4c37b0328bb8131c6b31"}},"eafdd13a06d8465687dc3f6313064fb5":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_8c78493333f442ecb93b5b7ee5b80f76","placeholder":"","style":"IPY_MODEL_5506934b176b41eea436c10ca6e95aaa","value":"Saving the dataset (1/1 shards): 100%"}},"c0ec5bdd73b5488fa6804b5d97d12899":{"model_module":"@jupyter-widgets/controls","model_name":"FloatProgressModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"FloatProgressModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"ProgressView","bar_style":"success","description":"","description_tooltip":null,"layout":"IPY_MODEL_d9b238b007c04e7db7539d05cd384ff2","max":50001,"min":0,"orientation":"horizontal","style":"IPY_MODEL_85d38d2774f84bdeaea6468d5306a709","value":50001}},"444cf550822e42399f2bdd4634c4acf5":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_6b22798cff1f493297a2c02882d95c23","placeholder":"","style":"IPY_MODEL_b2bbb20b70c547388993e2464250c48f","value":" 50001/50001 [00:00<00:00, 58265.94 examples/s]"}},"ac8428a7457f4c37b0328bb8131c6b31":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"8c78493333f442ecb93b5b7ee5b80f76":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"5506934b176b41eea436c10ca6e95aaa":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"d9b238b007c04e7db7539d05cd384ff2":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"85d38d2774f84bdeaea6468d5306a709":{"model_module":"@jupyter-widgets/controls","model_name":"ProgressStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"ProgressStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","bar_color":null,"description_width":""}},"6b22798cff1f493297a2c02882d95c23":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"b2bbb20b70c547388993e2464250c48f":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}}}}},"cells":[{"cell_type":"code","execution_count":null,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"0XwNnf3-TEb5","executionInfo":{"status":"ok","timestamp":1733208941177,"user_tz":-300,"elapsed":9307,"user":{"displayName":"qusai aamir","userId":"15867633959240373143"}},"outputId":"5c62bd97-38a9-4770-9eac-0d6deb4befad"},"outputs":[{"output_type":"stream","name":"stdout","text":["Requirement already satisfied: transformers in /usr/local/lib/python3.10/dist-packages (4.46.2)\n","Collecting datasets\n"," Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)\n","Requirement already satisfied: accelerate in /usr/local/lib/python3.10/dist-packages (1.1.1)\n","Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from transformers) (3.16.1)\n","Requirement already satisfied: huggingface-hub<1.0,>=0.23.2 in /usr/local/lib/python3.10/dist-packages (from transformers) (0.26.2)\n","Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.10/dist-packages (from transformers) (1.26.4)\n","Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.10/dist-packages (from transformers) (24.2)\n","Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.10/dist-packages (from transformers) (6.0.2)\n","Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.10/dist-packages (from transformers) (2024.9.11)\n","Requirement already satisfied: requests in /usr/local/lib/python3.10/dist-packages (from transformers) (2.32.3)\n","Requirement already satisfied: safetensors>=0.4.1 in /usr/local/lib/python3.10/dist-packages (from transformers) (0.4.5)\n","Requirement already satisfied: tokenizers<0.21,>=0.20 in /usr/local/lib/python3.10/dist-packages (from transformers) (0.20.3)\n","Requirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.10/dist-packages (from transformers) (4.66.6)\n","Requirement already satisfied: pyarrow>=15.0.0 in /usr/local/lib/python3.10/dist-packages (from datasets) (17.0.0)\n","Collecting dill<0.3.9,>=0.3.0 (from datasets)\n"," Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)\n","Requirement already satisfied: pandas in /usr/local/lib/python3.10/dist-packages (from datasets) (2.2.2)\n","Collecting xxhash (from datasets)\n"," Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)\n","Collecting multiprocess<0.70.17 (from datasets)\n"," Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)\n","Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)\n"," Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)\n","Requirement already satisfied: aiohttp in /usr/local/lib/python3.10/dist-packages (from datasets) (3.11.2)\n","Requirement already satisfied: psutil in /usr/local/lib/python3.10/dist-packages (from accelerate) (5.9.5)\n","Requirement already satisfied: torch>=1.10.0 in /usr/local/lib/python3.10/dist-packages (from accelerate) (2.5.1+cu121)\n","Requirement already satisfied: aiohappyeyeballs>=2.3.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (2.4.3)\n","Requirement already satisfied: aiosignal>=1.1.2 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (1.3.1)\n","Requirement already satisfied: attrs>=17.3.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (24.2.0)\n","Requirement already satisfied: frozenlist>=1.1.1 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (1.5.0)\n","Requirement already satisfied: multidict<7.0,>=4.5 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (6.1.0)\n","Requirement already satisfied: propcache>=0.2.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (0.2.0)\n","Requirement already satisfied: yarl<2.0,>=1.17.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (1.17.2)\n","Requirement already satisfied: async-timeout<6.0,>=4.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (4.0.3)\n","Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub<1.0,>=0.23.2->transformers) (4.12.2)\n","Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (3.4.0)\n","Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (3.10)\n","Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (2.2.3)\n","Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (2024.8.30)\n","Requirement already satisfied: networkx in /usr/local/lib/python3.10/dist-packages (from torch>=1.10.0->accelerate) (3.4.2)\n","Requirement already satisfied: jinja2 in /usr/local/lib/python3.10/dist-packages (from torch>=1.10.0->accelerate) (3.1.4)\n","Requirement already satisfied: sympy==1.13.1 in /usr/local/lib/python3.10/dist-packages (from torch>=1.10.0->accelerate) (1.13.1)\n","Requirement already satisfied: mpmath<1.4,>=1.1.0 in /usr/local/lib/python3.10/dist-packages (from sympy==1.13.1->torch>=1.10.0->accelerate) (1.3.0)\n","Requirement already satisfied: python-dateutil>=2.8.2 in /usr/local/lib/python3.10/dist-packages (from pandas->datasets) (2.8.2)\n","Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.10/dist-packages (from pandas->datasets) (2024.2)\n","Requirement already satisfied: tzdata>=2022.7 in /usr/local/lib/python3.10/dist-packages (from pandas->datasets) (2024.2)\n","Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-packages (from python-dateutil>=2.8.2->pandas->datasets) (1.16.0)\n","Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.10/dist-packages (from jinja2->torch>=1.10.0->accelerate) (3.0.2)\n","Downloading datasets-3.1.0-py3-none-any.whl (480 kB)\n","\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m480.6/480.6 kB\u001b[0m \u001b[31m12.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n","\u001b[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)\n","\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m116.3/116.3 kB\u001b[0m \u001b[31m8.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n","\u001b[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (179 kB)\n","\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m179.3/179.3 kB\u001b[0m \u001b[31m12.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n","\u001b[?25hDownloading multiprocess-0.70.16-py310-none-any.whl (134 kB)\n","\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m134.8/134.8 kB\u001b[0m \u001b[31m9.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n","\u001b[?25hDownloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)\n","\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m194.1/194.1 kB\u001b[0m \u001b[31m12.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n","\u001b[?25hInstalling collected packages: xxhash, fsspec, dill, multiprocess, datasets\n"," Attempting uninstall: fsspec\n"," Found existing installation: fsspec 2024.10.0\n"," Uninstalling fsspec-2024.10.0:\n"," Successfully uninstalled fsspec-2024.10.0\n","\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n","gcsfs 2024.10.0 requires fsspec==2024.10.0, but you have fsspec 2024.9.0 which is incompatible.\u001b[0m\u001b[31m\n","\u001b[0mSuccessfully installed datasets-3.1.0 dill-0.3.8 fsspec-2024.9.0 multiprocess-0.70.16 xxhash-3.5.0\n"]}],"source":["pip install transformers datasets accelerate"]},{"cell_type":"code","source":["!pip install datasets"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"H9d4TyaVya8D","executionInfo":{"status":"ok","timestamp":1733382026740,"user_tz":-300,"elapsed":4691,"user":{"displayName":"qusai aamir","userId":"15867633959240373143"}},"outputId":"bf832afa-8e27-4313-c2ea-5bfee3745ac4"},"execution_count":1,"outputs":[{"output_type":"stream","name":"stdout","text":["Collecting datasets\n"," Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)\n","Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from datasets) (3.16.1)\n","Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.10/dist-packages (from datasets) (1.26.4)\n","Requirement already satisfied: pyarrow>=15.0.0 in /usr/local/lib/python3.10/dist-packages (from datasets) (17.0.0)\n","Collecting dill<0.3.9,>=0.3.0 (from datasets)\n"," Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)\n","Requirement already satisfied: pandas in /usr/local/lib/python3.10/dist-packages (from datasets) (2.2.2)\n","Requirement already satisfied: requests>=2.32.2 in /usr/local/lib/python3.10/dist-packages (from datasets) (2.32.3)\n","Requirement already satisfied: tqdm>=4.66.3 in /usr/local/lib/python3.10/dist-packages (from datasets) (4.66.6)\n","Collecting xxhash (from datasets)\n"," Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)\n","Collecting multiprocess<0.70.17 (from datasets)\n"," Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)\n","Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)\n"," Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)\n","Requirement already satisfied: aiohttp in /usr/local/lib/python3.10/dist-packages (from datasets) (3.11.2)\n","Requirement already satisfied: huggingface-hub>=0.23.0 in /usr/local/lib/python3.10/dist-packages (from datasets) (0.26.2)\n","Requirement already satisfied: packaging in /usr/local/lib/python3.10/dist-packages (from datasets) (24.2)\n","Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.10/dist-packages (from datasets) (6.0.2)\n","Requirement already satisfied: aiohappyeyeballs>=2.3.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (2.4.3)\n","Requirement already satisfied: aiosignal>=1.1.2 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (1.3.1)\n","Requirement already satisfied: attrs>=17.3.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (24.2.0)\n","Requirement already satisfied: frozenlist>=1.1.1 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (1.5.0)\n","Requirement already satisfied: multidict<7.0,>=4.5 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (6.1.0)\n","Requirement already satisfied: propcache>=0.2.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (0.2.0)\n","Requirement already satisfied: yarl<2.0,>=1.17.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (1.17.2)\n","Requirement already satisfied: async-timeout<6.0,>=4.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (4.0.3)\n","Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub>=0.23.0->datasets) (4.12.2)\n","Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests>=2.32.2->datasets) (3.4.0)\n","Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests>=2.32.2->datasets) (3.10)\n","Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests>=2.32.2->datasets) (2.2.3)\n","Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests>=2.32.2->datasets) (2024.8.30)\n","Requirement already satisfied: python-dateutil>=2.8.2 in /usr/local/lib/python3.10/dist-packages (from pandas->datasets) (2.8.2)\n","Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.10/dist-packages (from pandas->datasets) (2024.2)\n","Requirement already satisfied: tzdata>=2022.7 in /usr/local/lib/python3.10/dist-packages (from pandas->datasets) (2024.2)\n","Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-packages (from python-dateutil>=2.8.2->pandas->datasets) (1.16.0)\n","Downloading datasets-3.1.0-py3-none-any.whl (480 kB)\n","\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m480.6/480.6 kB\u001b[0m \u001b[31m14.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n","\u001b[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)\n","\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m116.3/116.3 kB\u001b[0m \u001b[31m11.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n","\u001b[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (179 kB)\n","\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m179.3/179.3 kB\u001b[0m \u001b[31m18.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n","\u001b[?25hDownloading multiprocess-0.70.16-py310-none-any.whl (134 kB)\n","\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m134.8/134.8 kB\u001b[0m \u001b[31m13.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n","\u001b[?25hDownloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)\n","\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m194.1/194.1 kB\u001b[0m \u001b[31m19.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n","\u001b[?25hInstalling collected packages: xxhash, fsspec, dill, multiprocess, datasets\n"," Attempting uninstall: fsspec\n"," Found existing installation: fsspec 2024.10.0\n"," Uninstalling fsspec-2024.10.0:\n"," Successfully uninstalled fsspec-2024.10.0\n","\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n","gcsfs 2024.10.0 requires fsspec==2024.10.0, but you have fsspec 2024.9.0 which is incompatible.\u001b[0m\u001b[31m\n","\u001b[0mSuccessfully installed datasets-3.1.0 dill-0.3.8 fsspec-2024.9.0 multiprocess-0.70.16 xxhash-3.5.0\n"]}]},{"cell_type":"code","source":["from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, DataCollatorForSeq2Seq\n","from datasets import load_dataset\n","\n","# Load GPT-Neo tokenizer and model\n","tokenizer = AutoTokenizer.from_pretrained(\"EleutherAI/gpt-neo-125M\")\n","model = AutoModelForCausalLM.from_pretrained(\"EleutherAI/gpt-neo-125M\")\n","\n","# Set pad token to eos token\n","tokenizer.pad_token = tokenizer.eos_token\n","\n","# Load the subset dataset\n","dataset = load_dataset(\"json\", data_files=\"financial_accounting_subset.json\")\n","\n","# Inspect the first row to verify the structure of the dataset\n","first_row = next(iter(dataset[\"train\"]))\n","print(\"Keys in the dataset:\", first_row.keys())\n","\n","# Define input and output columns based on your dataset structure\n","# Adjust these as necessary for your task\n","input_column = \"Description\" # Example input: financial transaction description\n","output_column = \"Category\" # Example output: transaction category\n","\n","# Check if the specified columns exist in the dataset\n","if input_column not in first_row or output_column not in first_row:\n"," raise KeyError(\n"," f\"Columns '{input_column}' or '{output_column}' not found in dataset. \"\n"," f\"Available keys: {list(first_row.keys())}\"\n"," )\n","\n","# Define the tokenization function\n","def tokenize_function(examples):\n"," inputs = examples[input_column]\n"," targets = examples[output_column]\n"," model_inputs = tokenizer(\n"," inputs,\n"," truncation=True,\n"," padding=\"max_length\",\n"," max_length=512\n"," )\n"," with tokenizer.as_target_tokenizer():\n"," labels = tokenizer(\n"," targets,\n"," truncation=True,\n"," padding=\"max_length\",\n"," max_length=512\n"," )\n"," model_inputs[\"labels\"] = labels[\"input_ids\"]\n"," return model_inputs\n","\n","# Tokenize the dataset\n","tokenized_dataset = dataset.map(tokenize_function, batched=True)\n","\n","# Optional: Save the tokenized dataset for inspection\n","tokenized_dataset.save_to_disk(\"./tokenized_dataset\")\n","\n","print(\"Tokenization complete!\")\n"],"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":174,"referenced_widgets":["a84764be72c94e9b92b4bfa1c39f5a5d","203be9dfa34d44c48cf81c407af93e7f","23c261bb2a7643d0b157435d23bfa579","e1ab3c1fd28b41baa1b03c41263c5e8a","f0730089d3c148d7930d64d1fb32baf3","7d732638e92f4003a188de33b11728d9","3c47de8fb43445dd8da87c0c92b66a41","98ad2005ca124300be0bc8a164a9c47d","bb57322fe0a444bc8a6a4d763b5a048f","4476cc1ba6604a17b1edf9f02b61e961","fe699bfd6b824a7d93986846e6300206","c8015686d7b64cd8b538f41e10126dfc","eafdd13a06d8465687dc3f6313064fb5","c0ec5bdd73b5488fa6804b5d97d12899","444cf550822e42399f2bdd4634c4acf5","ac8428a7457f4c37b0328bb8131c6b31","8c78493333f442ecb93b5b7ee5b80f76","5506934b176b41eea436c10ca6e95aaa","d9b238b007c04e7db7539d05cd384ff2","85d38d2774f84bdeaea6468d5306a709","6b22798cff1f493297a2c02882d95c23","b2bbb20b70c547388993e2464250c48f"]},"id":"h3DHVIvlySz0","executionInfo":{"status":"ok","timestamp":1733382302221,"user_tz":-300,"elapsed":32633,"user":{"displayName":"qusai aamir","userId":"15867633959240373143"}},"outputId":"ae8c24fb-c29b-43cb-9e62-5f352ff3b60d"},"execution_count":5,"outputs":[{"output_type":"stream","name":"stdout","text":["Keys in the dataset: dict_keys(['Date', 'Account', 'Description', 'Debit', 'Credit', 'Category', 'Transaction_Type', 'Customer_Vendor', 'Payment_Method', 'Reference'])\n"]},{"output_type":"display_data","data":{"text/plain":["Map: 0%| | 0/50001 [00:00, ? examples/s]"],"application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"a84764be72c94e9b92b4bfa1c39f5a5d"}},"metadata":{}},{"output_type":"stream","name":"stderr","text":["/usr/local/lib/python3.10/dist-packages/transformers/tokenization_utils_base.py:4114: UserWarning: `as_target_tokenizer` is deprecated and will be removed in v5 of Transformers. You can tokenize your labels by using the argument `text_target` of the regular `__call__` method (either in the same call as your input texts if you use the same keyword arguments, or in a separate call.\n"," warnings.warn(\n"]},{"output_type":"display_data","data":{"text/plain":["Saving the dataset (0/1 shards): 0%| | 0/50001 [00:00, ? examples/s]"],"application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"c8015686d7b64cd8b538f41e10126dfc"}},"metadata":{}},{"output_type":"stream","name":"stdout","text":["Tokenization complete!\n"]}]},{"cell_type":"code","source":["# Define a custom data collator to handle loss computation\n","class CustomDataCollator(DataCollatorForSeq2Seq):\n"," def __call__(self, features):\n"," # Ensure correct padding and label creation\n"," batch = super().__call__(features)\n"," labels = batch['input_ids'].clone()\n"," labels[labels == tokenizer.pad_token_id] = -100 # Ignore padding tokens in the loss computation\n"," batch['labels'] = labels\n"," return batch\n","\n","# Define the training arguments\n","training_args = TrainingArguments(\n"," output_dir=\"./gpt_fine_tuned_model\",\n"," num_train_epochs=1,\n"," per_device_train_batch_size=8,\n"," per_device_eval_batch_size=8,\n"," logging_dir='./logs',\n"," logging_steps=500,\n"," do_train=True,\n"," do_eval=True,\n"," report_to=\"none\", # Change to \"wandb\" if using Weights & Biases\n",")\n","\n","# Initialize the Trainer with the custom data collator\n","trainer = Trainer(\n"," model=model,\n"," args=training_args,\n"," train_dataset=tokenized_dataset[\"train\"], # Use the tokenized training dataset\n"," eval_dataset=tokenized_dataset[\"train\"], # Optional: if you have a validation set\n"," data_collator=CustomDataCollator(tokenizer), # Use custom data collator\n",")\n","\n","# Start training\n","trainer.train()\n","\n","# Save the fine-tuned model\n","trainer.save_model(\"./gpt_fine_tuned_model\")\n"],"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":854},"id":"AGnSEH4KUSqA","executionInfo":{"status":"ok","timestamp":1733389506675,"user_tz":-300,"elapsed":808345,"user":{"displayName":"qusai aamir","userId":"15867633959240373143"}},"outputId":"4bde9e00-89d8-457d-9824-4d67239aec04"},"execution_count":6,"outputs":[{"data":{"text/html":["\n","
Step | \n","Training Loss | \n","
---|---|
500 | \n","5.629600 | \n","
1000 | \n","5.499100 | \n","
1500 | \n","5.465100 | \n","
2000 | \n","5.445500 | \n","
2500 | \n","5.419400 | \n","
3000 | \n","5.433800 | \n","
3500 | \n","5.417200 | \n","
4000 | \n","5.385300 | \n","
4500 | \n","5.385200 | \n","
5000 | \n","5.386900 | \n","
5500 | \n","5.388800 | \n","
"],"text/plain":[" "]},"metadata":{}}]},{"cell_type":"code","source":["from transformers import AutoTokenizer\n","\n","# Save model and tokenizer\n","model.save_pretrained(\"path_to_save_model\")\n","tokenizer.save_pretrained(\"path_to_save_model\")\n"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"6h7qjNWfdzvI","executionInfo":{"status":"ok","timestamp":1733389816795,"user_tz":-300,"elapsed":3653,"user":{"displayName":"qusai aamir","userId":"15867633959240373143"}},"outputId":"da787b7c-fafa-41cb-a3b9-5609496ca751"},"execution_count":7,"outputs":[{"output_type":"execute_result","data":{"text/plain":["('path_to_save_model/tokenizer_config.json',\n"," 'path_to_save_model/special_tokens_map.json',\n"," 'path_to_save_model/vocab.json',\n"," 'path_to_save_model/merges.txt',\n"," 'path_to_save_model/added_tokens.json',\n"," 'path_to_save_model/tokenizer.json')"]},"metadata":{},"execution_count":7}]},{"cell_type":"code","source":["results = trainer.evaluate()\n","print(\"Evaluation Results:\", results)\n"],"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":37},"id":"FmYpQ5iqFcd9","outputId":"51406d29-1ffa-4d54-a6da-8f189197da67"},"execution_count":null,"outputs":[{"output_type":"display_data","data":{"text/plain":["\n"," \n","
\n"," \n"," \n"," \n"," Step \n"," Training Loss \n"," \n"," \n"," 500 \n"," 5.629600 \n"," \n"," \n"," 1000 \n"," 5.499100 \n"," \n"," \n"," 1500 \n"," 5.465100 \n"," \n"," \n"," 2000 \n"," 5.445500 \n"," \n"," \n"," 2500 \n"," 5.419400 \n"," \n"," \n"," 3000 \n"," 5.433800 \n"," \n"," \n"," 3500 \n"," 5.417200 \n"," \n"," \n"," 4000 \n"," 5.385300 \n"," \n"," \n"," 4500 \n"," 5.385200 \n"," \n"," \n"," 5000 \n"," 5.386900 \n"," \n"," \n"," 5500 \n"," 5.388800 \n"," \n"," \n"," \n","6000 \n"," 5.345600 \n","