{ "nbformat": 4, "nbformat_minor": 0, "metadata": { "colab": { "provenance": [] }, "kernelspec": { "name": "python3", "display_name": "Python 3" }, "language_info": { "name": "python" } }, "cells": [ { "cell_type": "markdown", "source": [ "# nanoBERT Example\n", "\n", "Here we present nanoBERT, a nanobody-specific transformer to predict amino\n", " acids in a given position in a query sequence" ], "metadata": { "id": "JU2dnhr24egK" } }, { "cell_type": "code", "execution_count": 1, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "gxL4QKeNqYXI", "outputId": "ad6c9ed6-8d6a-45f7-ba15-4026b17906d4" }, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Requirement already satisfied: transformers in /usr/local/lib/python3.10/dist-packages (4.34.0)\n", "Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from transformers) (3.12.4)\n", "Requirement already satisfied: huggingface-hub<1.0,>=0.16.4 in /usr/local/lib/python3.10/dist-packages (from transformers) (0.17.3)\n", "Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.10/dist-packages (from transformers) (1.23.5)\n", "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.10/dist-packages (from transformers) (23.2)\n", "Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.10/dist-packages (from transformers) (6.0.1)\n", "Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.10/dist-packages (from transformers) (2023.6.3)\n", "Requirement already satisfied: requests in /usr/local/lib/python3.10/dist-packages (from transformers) (2.31.0)\n", "Requirement already satisfied: tokenizers<0.15,>=0.14 in /usr/local/lib/python3.10/dist-packages (from transformers) (0.14.1)\n", "Requirement already satisfied: safetensors>=0.3.1 in /usr/local/lib/python3.10/dist-packages (from transformers) (0.4.0)\n", "Requirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.10/dist-packages 
(from transformers) (4.66.1)\n", "Requirement already satisfied: fsspec in /usr/local/lib/python3.10/dist-packages (from huggingface-hub<1.0,>=0.16.4->transformers) (2023.6.0)\n", "Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub<1.0,>=0.16.4->transformers) (4.5.0)\n", "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (3.3.0)\n", "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (3.4)\n", "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (2.0.6)\n", "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (2023.7.22)\n" ] } ], "source": [ "# Install the required library (Hugging Face transformers)\n", "! pip install --upgrade transformers" ] }, { "cell_type": "code", "source": [ "from transformers import pipeline, RobertaTokenizer, AutoModel" ], "metadata": { "id": "vG5ndbr_rYjL" }, "execution_count": 2, "outputs": [] }, { "cell_type": "code", "source": [ "# Initialise the tokenizer\n", "tokenizer = RobertaTokenizer.from_pretrained(\"tadsatlawa/nanoBERT\", return_tensors=\"pt\")" ], "metadata": { "id": "1GNqH8HlrzmF" }, "execution_count": 3, "outputs": [] }, { "cell_type": "code", "source": [ "# Initialise model\n", "unmasker = pipeline('fill-mask', model=\"tadsatlawa/nanoBERT\", tokenizer=tokenizer, top_k=20 )" ], "metadata": { "id": "3CYcwIOU3xCY" }, "execution_count": 4, "outputs": [] }, { "cell_type": "code", "source": [ "# Predict the residue probability at one or more masked positions\n", "# Mark each position to predict with '<mask>'\n", "seq = \"QLVSGPEVKKP<mask>ASVKVSCKASGYIFNNYGISWVRQAPGQGLEWMGWISTDNGNTNYAQKVQGRVTMTTDTSTSTAYMELRSLRYDDTAVYYCANNWGSYFEHWGQGTLVTVSS\"\n", "\n", "residueProbability = unmasker(seq)\n", "\n", "# Print residue 
probabilities\n", "for probability in residueProbability:\n", " print(probability)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "6rtUxgbYsygY", "outputId": "38fdd80d-cf30-4573-dbe5-c40f9b306470" }, "execution_count": 5, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "{'score': 0.7448901534080505, 'token': 10, 'token_str': 'G', 'sequence': 'QLVSGPEVKKPGASVKVSCKASGYIFNNYGISWVRQAPGQGLEWMGWISTDNGNTNYAQKVQGRVTMTTDTSTSTAYMELRSLRYDDTAVYYCANNWGSYFEHWGQGTLVTVSS'}\n", "{'score': 0.04520424082875252, 'token': 19, 'token_str': 'R', 'sequence': 'QLVSGPEVKKPRASVKVSCKASGYIFNNYGISWVRQAPGQGLEWMGWISTDNGNTNYAQKVQGRVTMTTDTSTSTAYMELRSLRYDDTAVYYCANNWGSYFEHWGQGTLVTVSS'}\n", "{'score': 0.029332099482417107, 'token': 5, 'token_str': 'A', 'sequence': 'QLVSGPEVKKPAASVKVSCKASGYIFNNYGISWVRQAPGQGLEWMGWISTDNGNTNYAQKVQGRVTMTTDTSTSTAYMELRSLRYDDTAVYYCANNWGSYFEHWGQGTLVTVSS'}\n", "{'score': 0.023554226383566856, 'token': 20, 'token_str': 'S', 'sequence': 'QLVSGPEVKKPSASVKVSCKASGYIFNNYGISWVRQAPGQGLEWMGWISTDNGNTNYAQKVQGRVTMTTDTSTSTAYMELRSLRYDDTAVYYCANNWGSYFEHWGQGTLVTVSS'}\n", "{'score': 0.022556299343705177, 'token': 17, 'token_str': 'P', 'sequence': 'QLVSGPEVKKPPASVKVSCKASGYIFNNYGISWVRQAPGQGLEWMGWISTDNGNTNYAQKVQGRVTMTTDTSTSTAYMELRSLRYDDTAVYYCANNWGSYFEHWGQGTLVTVSS'}\n", "{'score': 0.02046232856810093, 'token': 22, 'token_str': 'V', 'sequence': 'QLVSGPEVKKPVASVKVSCKASGYIFNNYGISWVRQAPGQGLEWMGWISTDNGNTNYAQKVQGRVTMTTDTSTSTAYMELRSLRYDDTAVYYCANNWGSYFEHWGQGTLVTVSS'}\n", "{'score': 0.017790036275982857, 'token': 8, 'token_str': 'E', 'sequence': 'QLVSGPEVKKPEASVKVSCKASGYIFNNYGISWVRQAPGQGLEWMGWISTDNGNTNYAQKVQGRVTMTTDTSTSTAYMELRSLRYDDTAVYYCANNWGSYFEHWGQGTLVTVSS'}\n", "{'score': 0.015881769359111786, 'token': 6, 'token_str': 'C', 'sequence': 'QLVSGPEVKKPCASVKVSCKASGYIFNNYGISWVRQAPGQGLEWMGWISTDNGNTNYAQKVQGRVTMTTDTSTSTAYMELRSLRYDDTAVYYCANNWGSYFEHWGQGTLVTVSS'}\n", "{'score': 0.014478186145424843, 'token': 23, 'token_str': 'W', 'sequence': 
'QLVSGPEVKKPWASVKVSCKASGYIFNNYGISWVRQAPGQGLEWMGWISTDNGNTNYAQKVQGRVTMTTDTSTSTAYMELRSLRYDDTAVYYCANNWGSYFEHWGQGTLVTVSS'}\n", "{'score': 0.013189132325351238, 'token': 14, 'token_str': 'L', 'sequence': 'QLVSGPEVKKPLASVKVSCKASGYIFNNYGISWVRQAPGQGLEWMGWISTDNGNTNYAQKVQGRVTMTTDTSTSTAYMELRSLRYDDTAVYYCANNWGSYFEHWGQGTLVTVSS'}\n", "{'score': 0.010759864002466202, 'token': 9, 'token_str': 'F', 'sequence': 'QLVSGPEVKKPFASVKVSCKASGYIFNNYGISWVRQAPGQGLEWMGWISTDNGNTNYAQKVQGRVTMTTDTSTSTAYMELRSLRYDDTAVYYCANNWGSYFEHWGQGTLVTVSS'}\n", "{'score': 0.010044544003903866, 'token': 7, 'token_str': 'D', 'sequence': 'QLVSGPEVKKPDASVKVSCKASGYIFNNYGISWVRQAPGQGLEWMGWISTDNGNTNYAQKVQGRVTMTTDTSTSTAYMELRSLRYDDTAVYYCANNWGSYFEHWGQGTLVTVSS'}\n", "{'score': 0.00823446735739708, 'token': 21, 'token_str': 'T', 'sequence': 'QLVSGPEVKKPTASVKVSCKASGYIFNNYGISWVRQAPGQGLEWMGWISTDNGNTNYAQKVQGRVTMTTDTSTSTAYMELRSLRYDDTAVYYCANNWGSYFEHWGQGTLVTVSS'}\n", "{'score': 0.005904716905206442, 'token': 24, 'token_str': 'Y', 'sequence': 'QLVSGPEVKKPYASVKVSCKASGYIFNNYGISWVRQAPGQGLEWMGWISTDNGNTNYAQKVQGRVTMTTDTSTSTAYMELRSLRYDDTAVYYCANNWGSYFEHWGQGTLVTVSS'}\n", "{'score': 0.004586651921272278, 'token': 12, 'token_str': 'I', 'sequence': 'QLVSGPEVKKPIASVKVSCKASGYIFNNYGISWVRQAPGQGLEWMGWISTDNGNTNYAQKVQGRVTMTTDTSTSTAYMELRSLRYDDTAVYYCANNWGSYFEHWGQGTLVTVSS'}\n", "{'score': 0.004159640986472368, 'token': 18, 'token_str': 'Q', 'sequence': 'QLVSGPEVKKPQASVKVSCKASGYIFNNYGISWVRQAPGQGLEWMGWISTDNGNTNYAQKVQGRVTMTTDTSTSTAYMELRSLRYDDTAVYYCANNWGSYFEHWGQGTLVTVSS'}\n", "{'score': 0.0033481556456536055, 'token': 15, 'token_str': 'M', 'sequence': 'QLVSGPEVKKPMASVKVSCKASGYIFNNYGISWVRQAPGQGLEWMGWISTDNGNTNYAQKVQGRVTMTTDTSTSTAYMELRSLRYDDTAVYYCANNWGSYFEHWGQGTLVTVSS'}\n", "{'score': 0.0021347403526306152, 'token': 13, 'token_str': 'K', 'sequence': 'QLVSGPEVKKPKASVKVSCKASGYIFNNYGISWVRQAPGQGLEWMGWISTDNGNTNYAQKVQGRVTMTTDTSTSTAYMELRSLRYDDTAVYYCANNWGSYFEHWGQGTLVTVSS'}\n", "{'score': 0.0021168291568756104, 'token': 16, 'token_str': 'N', 'sequence': 
'QLVSGPEVKKPNASVKVSCKASGYIFNNYGISWVRQAPGQGLEWMGWISTDNGNTNYAQKVQGRVTMTTDTSTSTAYMELRSLRYDDTAVYYCANNWGSYFEHWGQGTLVTVSS'}\n", "{'score': 0.0013719164999201894, 'token': 11, 'token_str': 'H', 'sequence': 'QLVSGPEVKKPHASVKVSCKASGYIFNNYGISWVRQAPGQGLEWMGWISTDNGNTNYAQKVQGRVTMTTDTSTSTAYMELRSLRYDDTAVYYCANNWGSYFEHWGQGTLVTVSS'}\n" ] } ] } ] }