{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": { "provenance": [] },
    "kernelspec": { "name": "python3", "display_name": "Python 3" },
    "language_info": { "name": "python" }
  },
  "cells": [
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": { "id": "UXKT8SDQQ1tI" },
      "outputs": [],
      "source": [
        "%%capture\n",
        "# Install Unsloth, picking the wheel that matches the GPU's compute capability.\n",
        "# %pip (not !pip) guarantees the install targets this kernel's environment.\n",
        "import torch\n",
        "import re\n",
        "from pprint import pprint\n",
        "\n",
        "major_version, minor_version = torch.cuda.get_device_capability()\n",
        "if major_version >= 8:\n",
        "    # Newer GPUs: Ampere/Hopper (RTX 30xx, RTX 40xx, A100, H100, L40)\n",
        "    %pip install \"unsloth[colab-ampere] @ git+https://github.com/unslothai/unsloth.git\"\n",
        "else:\n",
        "    # Older GPUs (V100, Tesla T4, RTX 20xx)\n",
        "    %pip install \"unsloth[colab] @ git+https://github.com/unslothai/unsloth.git\""
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "from unsloth import FastLanguageModel\n",
        "import torch\n",
        "\n",
        "# Configuration -- everything a reader might tune lives here.\n",
        "max_seq_length = 2048  # choose any; Unsloth auto-supports RoPE scaling internally\n",
        "dtype = None  # None = auto-detect; float16 for Tesla T4/V100, bfloat16 for Ampere+\n",
        "load_in_4bit = True  # 4-bit quantization to cut memory use"
      ],
      "metadata": { "id": "Q6gVomWzQ7hU" },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Load the fine-tuned model and enable Unsloth's fast inference path.\n",
        "model, tokenizer = FastLanguageModel.from_pretrained(\n",
        "    model_name = \"neuralwebtech/mental_health_counseling_gemma_7b_4bit_q\",  # YOUR MODEL YOU USED FOR TRAINING\n",
        "    max_seq_length = max_seq_length,\n",
        "    dtype = dtype,\n",
        "    load_in_4bit = load_in_4bit,\n",
        ")\n",
        "FastLanguageModel.for_inference(model)  # Enable native 2x faster inference\n",
        "\n",
        "# NOTE: this template must match the format used at training time exactly.\n",
        "alpaca_prompt = \"\"\"Below is an instruction that describes a task, paired with an input that provides further context.\n",
        " Write a response that appropriately completes the request.\n",
        "\n",
        "### Context:\n",
        "{}\n",
        "\n",
        "### Response:\n",
        "{}\"\"\""
      ],
      "metadata": { "id": "_ItV-FhgRC5t" },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Context to send to the counseling model -- replace with your own text.\n",
        "# (Previously `text` was never defined, so Restart & Run All raised NameError.)\n",
        "text = \"I feel anxious all the time and I don't know why.\"\n",
        "\n",
        "inputs = tokenizer(\n",
        "    [\n",
        "        alpaca_prompt.format(\n",
        "            text,  # context / instruction\n",
        "            \"\",    # response -- leave blank for generation\n",
        "        )\n",
        "    ],\n",
        "    return_tensors = \"pt\",\n",
        ").to(\"cuda\")\n",
        "\n",
        "outputs = model.generate(**inputs, max_new_tokens = 128, use_cache = True)\n",
        "final_out = tokenizer.batch_decode(outputs)"
      ],
      "metadata": { "id": "8eTx88KiRDiL" },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "def print_response(lines):\n",
        "    \"\"\"Extract the generated answer from decoded model output.\n",
        "\n",
        "    Args:\n",
        "        lines: list of decoded strings (e.g. from tokenizer.batch_decode).\n",
        "\n",
        "    Returns:\n",
        "        Everything after the '### Response:' marker, or 'No response'\n",
        "        if the marker is absent.\n",
        "    \"\"\"\n",
        "    text = '\\n'.join(lines)\n",
        "    # re.DOTALL lets '.' cross newlines so multi-line generations\n",
        "    # are not truncated to their first line.\n",
        "    response_match = re.search(r'### Response:\\s*(.*)', text, re.DOTALL)\n",
        "    if response_match:\n",
        "        return response_match.group(1)\n",
        "    return \"No response\""
      ],
      "metadata": { "id": "z5s-5_0MRHPt" },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "pprint(print_response(final_out))"
      ],
      "metadata": { "id": "_DlE2xjBRHUk" },
      "execution_count": null,
      "outputs": []
    }
  ]
}