{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# CodeBERT docstring-inconsistency smoke test\n",
    "\n",
    "Loads a fine-tuned CodeBERT sequence-classification checkpoint and checks that a\n",
    "(docstring, code) pair is flagged as inconsistent, both via a raw forward pass and\n",
    "via the `text-classification` pipeline."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# All imports consolidated here so the notebook survives Restart & Run All.\n",
    "# (Previously `pipeline` and `torch` were imported mid-notebook and\n",
    "# AutoTokenizer/AutoModelForSequenceClassification were imported twice.)\n",
    "import os\n",
    "\n",
    "import torch\n",
    "from transformers import (\n",
    "    AutoConfig,\n",
    "    AutoModelForSequenceClassification,\n",
    "    AutoTokenizer,\n",
    "    pipeline,\n",
    ")\n",
    "\n",
    "# Set before any tokenizer use to silence the \"process just got forked\"\n",
    "# warning that the pipeline cell used to emit.\n",
    "os.environ[\"TOKENIZERS_PARALLELISM\"] = \"false\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# NOTE(review): hardcoded absolute local path — consider making this configurable.\n",
    "model_name_or_path = \"/datadrive/namlh31/codebridge/Codebert-docstring-inconsistency\"\n",
    "config = AutoConfig.from_pretrained(model_name_or_path)\n",
    "tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)\n",
    "model = AutoModelForSequenceClassification.from_pretrained(\n",
    "    model_name_or_path,\n",
    "    config=config,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Example pair: the docstring deliberately does NOT describe the code.\n",
    "examples = {\n",
    "    'code': \"function(str){\\r\\n var ret = new Array(str.length), len = str.length;\\r\\n while(len--) ret[len] = str.charCodeAt(len);\\r\\n return Uint8Array.from(ret);\\r\\n}\",\n",
    "    'docstring': 'we do not need Buffer pollyfill for now',\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Tokenize as a sentence pair (text, text_pair). The original wrapped the pair in\n",
    "# redundant parentheses and splatted it; passing the two arguments directly is\n",
    "# equivalent and clearer.\n",
    "result = tokenizer(\n",
    "    examples['docstring'],\n",
    "    examples['code'],\n",
    "    padding=\"max_length\",\n",
    "    max_length=512,\n",
    "    truncation=True,\n",
    "    return_tensors='pt',\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# With return_tensors='pt' the encoding is batched with shape (1, 512), so row 0\n",
    "# must be indexed first. The original called len()/decode() on the 2-D tensor:\n",
    "# len() reported the batch size (1), not the sequence length (512).\n",
    "print(len(result['input_ids'][0]))\n",
    "tokenizer.decode(result['input_ids'][0])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Same content as a single concatenated string (docstring + code, no separator).\n",
    "# Renamed from `input`, which shadowed the Python builtin.\n",
    "joined_text = \"\"\"we do not need Buffer pollyfill for nowfunction(str){\\r\\n var ret = new Array(str.length), len = str.length;\\r\\n while(len--) ret[len] = str.charCodeAt(len);\\r\\n return Uint8Array.from(ret);\\r\\n}\"\"\"\n",
    "rs_2 = tokenizer(joined_text, padding=\"max_length\", max_length=512, truncation=True, return_tensors='pt')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Raw forward pass; logits are unnormalized scores over the two classes.\n",
    "model(**rs_2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Renamed from `pipeline` — the original rebound the imported factory function\n",
    "# to its own return value, so the cell crashed when run a second time.\n",
    "device = 0 if torch.cuda.is_available() else -1\n",
    "classifier = pipeline(\"text-classification\", model=model, tokenizer=tokenizer, device=device)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "inputs = \"\"\"we do not need Buffer pollyfill for nowfunction(str){\n",
    " var ret = new Array(str.length), len = str.length;\n",
    " while(len--) ret[len] = str.charCodeAt(len);\n",
    " return Uint8Array.from(ret);\n",
    "}\"\"\"\n",
    "prediction = classifier(inputs)\n",
    "print(prediction)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "namlh31",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.2"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}