first commit

Browse files

Files changed (12) hide show

README.md +131 -0
config.json +36 -0
flax_model.msgpack +3 -0
handler.py +33 -0
merges.txt +0 -0
pytorch_model.bin +3 -0
requirements.txt +2 -0
special_tokens_map.json +15 -0
test.ipynb +186 -0
tokenizer.json +0 -0
tokenizer_config.json +64 -0
vocab.json +0 -0

README.md ADDED Viewed

	@@ -0,0 +1,131 @@

+---
+language:
+- code
+- en
+task_categories:
+- text-classification
+metrics:
+- accuracy
+widget:
+- text: |-
+    Sum two integers</s></s>def sum(a, b):
+        return a + b
+  example_title: Simple toy
+- text: |-
+    Look for methods that might be dynamically defined and define them for lookup.</s></s>def respond_to_missing?(name, include_private = false)
+      if name == :to_ary || name == :empty?
+        false
+      else
+        return true if mapping(name).present?
+        mounting = all_mountings.find{ |mount| mount.respond_to?(name) }
+        return false if mounting.nil?
+      end
+    end
+  example_title: Ruby example
+- text: |-
+    Method that adds a candidate to the party @param c the candidate that will be added to the party</s></s>public void addCandidate(Candidate c)
+    {
+        this.votes += c.getVotes();
+        candidates.add(c);
+    }
+  example_title: Java example
+- text: |-
+    we do not need Buffer pollyfill for now</s></s>function(str){
+      var ret = new Array(str.length), len = str.length;
+      while(len--) ret[len] = str.charCodeAt(len);
+      return Uint8Array.from(ret);
+    }
+  example_title: JavaScript example
+pipeline_tag: text-classification
+---
+## Table of Contents
+- [Model Description](#model-description)
+- [Model Details](#model-details)
+- [Usage](#usage)
+- [Limitations](#limitations)
+- [Additional Information](#additional-information)
+  - [Licensing Information](#licensing-information)
+  - [Citation Information](#citation-information)
+## Model Description
+This model is fine-tuned from [CodeBERT](https://github.com/microsoft/CodeBERT) on a 5M-example subset of [The Vault](https://huggingface.co/datasets/Fsoft-AIC/thevault-function-level) to detect inconsistencies between a docstring/comment and the function it describes. It is used to remove noisy examples from The Vault dataset.
+More information:
+- **Repository:** [FSoft-AI4Code/TheVault](https://github.com/FSoft-AI4Code/TheVault)
+- **Paper:** The Vault: A Comprehensive Multilingual Dataset for Advancing Code Understanding and Generation
+- **Contact:** support.ailab@fpt.com
+## Model Details
+* Developed by: [Fsoft AI Center](https://www.fpt-aicenter.com/ai-residency/)
+* License: Not specified
+* Model type: Transformer-Encoder based Language Model
+* Architecture: RoBERTa-base (CodeBERT)
+* Data set: [The Vault](https://huggingface.co/datasets/Fsoft-AIC/thevault-function-level)
+* Tokenizer: Byte Pair Encoding
+* Vocabulary Size: 50265
+* Sequence Length: 512
+* Language: English and 10 Programming languages (Python, Java, JavaScript, PHP, C#, C, C++, Go, Rust, Ruby)
+* Training details:
+  * Self-supervised learning, binary classification
+  * Positive class: Original code-docstring pair
+  * Negative class: Randomly paired code and docstring
+## Usage
+The input to the model follows the template below:
+```
+<s>{docstring}</s></s>{code}</s>
+```
+Example:
+```python
+from transformers import AutoTokenizer
+#Load tokenizer
+tokenizer = AutoTokenizer.from_pretrained("Fsoft-AIC/Codebert-docstring-inconsistency")
+input = "<s>Sum two integers</s></s>def sum(a, b):\n    return a + b</s>"
+tokenized_input = tokenizer(input, add_special_tokens= False)
+```
+Using the model with JAX
+```python
+from transformers import AutoTokenizer, FlaxAutoModelForSequenceClassification
+#Load jax model
+model = FlaxAutoModelForSequenceClassification.from_pretrained("Fsoft-AIC/Codebert-docstring-inconsistency")
+```
+Using the model with PyTorch
+```python
+from transformers import AutoTokenizer, AutoModelForSequenceClassification
+#Load torch model
+model = AutoModelForSequenceClassification.from_pretrained("Fsoft-AIC/Codebert-docstring-inconsistency")
+```
+## Limitations
+This model is trained on a 5M-example subset of The Vault in a self-supervised manner. Since the negative samples are generated artificially, the model's ability to identify instances that require a strong semantic understanding of the relationship between the code and the docstring may be limited.
+It is hard to evaluate the model because no labeled datasets are available. ChatGPT is adopted as a reference, and we measure the correlation between the model's scores and ChatGPT's scores. However, the result could be influenced by ChatGPT's potential biases and by ambiguous conditions. Therefore, we recommend building a human-labeled dataset and fine-tuning this model on it to achieve the best results.
+## Additional Information
+### Licensing Information
+### Citation Information
+```
+@article{thevault,
+  title={The Vault: A Comprehensive Multilingual Dataset for Advancing Code Understanding and Generation},
+  author={},
+  journal={},
+  pages={},
+  year={2023}
+}
+```

config.json ADDED Viewed

	@@ -0,0 +1,36 @@

+{
+  "_name_or_path": "/datadrive/namlh31/Codebert-docstring-inconsistency",
+  "architectures": [
+    "RobertaForSequenceClassification"
+  ],
+  "attention_probs_dropout_prob": 0.1,
+  "bos_token_id": 0,
+  "classifier_dropout": null,
+  "eos_token_id": 2,
+  "hidden_act": "gelu",
+  "hidden_dropout_prob": 0.1,
+  "hidden_size": 768,
+  "id2label": {
+    "0": "Inconsistency",
+    "1": "Consistency"
+  },
+  "initializer_range": 0.02,
+  "intermediate_size": 3072,
+  "label2id": {
+    "Consistency": "1",
+    "Inconsistency": "0"
+  },
+  "layer_norm_eps": 1e-05,
+  "max_position_embeddings": 514,
+  "model_type": "roberta",
+  "num_attention_heads": 12,
+  "num_hidden_layers": 12,
+  "output_past": true,
+  "pad_token_id": 1,
+  "position_embedding_type": "absolute",
+  "torch_dtype": "float32",
+  "transformers_version": "4.28.0",
+  "type_vocab_size": 1,
+  "use_cache": true,
+  "vocab_size": 50265
+}

flax_model.msgpack ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d1cd31f97dc5d2ee4e85922acc7e7e352644436d57e4ff582d4d8df19192c938
+size 498595901

handler.py ADDED Viewed

	@@ -0,0 +1,33 @@

+import torch
+from typing import Dict, List, Any
+from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
+# Device index handed to the pipeline built in EndpointHandler: 0 when torch reports CUDA as available, otherwise -1 (per the transformers pipeline convention, GPU 0 vs. CPU)
+device = 0 if torch.cuda.is_available() else -1
+# id2label = {
+#     0: "Inconsistency",
+#     1: "Consistency"
+# }
+class EndpointHandler:
+    def __init__(self, path=""):
+        # load the model
+        tokenizer = AutoTokenizer.from_pretrained(path)
+        model = AutoModelForSequenceClassification.from_pretrained(path, low_cpu_mem_usage=True)
+        # create inference pipeline
+        self.pipeline = pipeline("text-classification", model=model, tokenizer=tokenizer, device=device)
+    def __call__(self, data: Any) -> List[List[Dict[str, float]]]:
+        inputs = data.pop("inputs", data)
+        parameters = data.pop("parameters", None)
+        # pass inputs with all kwargs in data
+        if parameters is not None:
+            prediction = self.pipeline(inputs, **parameters)
+        else:
+            prediction = self.pipeline(inputs)
+        # postprocess the prediction
+        return prediction

merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

pytorch_model.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:481e4699a0589ea0af3e2b36671aa677662e830f597f5d1bc60f3cc8bc5cec45
+size 498659253

requirements.txt ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ accelerate
2	+ jax

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,15 @@

+{
+  "bos_token": "<s>",
+  "cls_token": "<s>",
+  "eos_token": "</s>",
+  "mask_token": {
+    "content": "<mask>",
+    "lstrip": true,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": "<pad>",
+  "sep_token": "</s>",
+  "unk_token": "<unk>"
+}

test.ipynb ADDED Viewed

	@@ -0,0 +1,186 @@

+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os \n",
+    "from transformers import AutoConfig, AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "model_name_or_path = \"/datadrive/namlh31/codebridge/Codebert-docstring-inconsistency\"\n",
+    "config = AutoConfig.from_pretrained(\n",
+    " model_name_or_path,\n",
+    ")\n",
+    "tokenizer = AutoTokenizer.from_pretrained(\n",
+    "    model_name_or_path\n",
+    ")\n",
+    "model = AutoModelForSequenceClassification.from_pretrained(\n",
+    "model_name_or_path,\n",
+    "config=config,\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "examples = {'code': \"function(str){\\r\\n  var ret = new Array(str.length), len = str.length;\\r\\n  while(len--) ret[len] = str.charCodeAt(len);\\r\\n  return Uint8Array.from(ret);\\r\\n}\",\n",
+    "            'docstring': 'we do not need Buffer pollyfill for now'}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "texts = (\n",
+    "        (examples['docstring'], examples['code'])\n",
+    "    )\n",
+    "result = tokenizer(*texts, padding=\"max_length\", max_length=512, truncation=True, return_tensors= 'pt')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "512\n"
+     ]
+    }
+   ],
+   "source": [
+    "tokenizer.decode(result['input_ids'])\n",
+    "print(len(result['input_ids']))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 22,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "input = \"\"\"we do not need Buffer pollyfill for now</s></s>function(str){\\r\\n  var ret = new Array(str.length), len = str.length;\\r\\n  while(len--) ret[len] = str.charCodeAt(len);\\r\\n  return Uint8Array.from(ret);\\r\\n}\"\"\"\n",
+    "rs_2 = tokenizer(input, padding=\"max_length\", max_length=512, truncation=True, return_tensors= 'pt')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 23,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "SequenceClassifierOutput(loss=None, logits=tensor([[ 0.2598, -0.2636]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)"
+      ]
+     },
+     "execution_count": 23,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "model(**rs_2)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 24,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
+      "To disable this warning, you can either:\n",
+      "\t- Avoid using `tokenizers` before the fork if possible\n",
+      "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n",
+      "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
+      "To disable this warning, you can either:\n",
+      "\t- Avoid using `tokenizers` before the fork if possible\n",
+      "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n",
+      "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
+      "To disable this warning, you can either:\n",
+      "\t- Avoid using `tokenizers` before the fork if possible\n",
+      "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n"
+     ]
+    }
+   ],
+   "source": [
+    "from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline\n",
+    "import torch\n",
+    "device = 0 if torch.cuda.is_available() else -1\n",
+    "pipeline = pipeline(\"text-classification\", model=model, tokenizer=tokenizer, device=device)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 28,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[{'label': 'Inconsistency', 'score': 0.5601343512535095}]\n"
+     ]
+    }
+   ],
+   "source": [
+    "inputs = \"\"\"we do not need Buffer pollyfill for now</s></s>function(str){\n",
+    "  var ret = new Array(str.length), len = str.length;\n",
+    "  while(len--) ret[len] = str.charCodeAt(len);\n",
+    "  return Uint8Array.from(ret);\n",
+    "}\"\"\"\n",
+    "prediction = pipeline(inputs)\n",
+    "print(prediction)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "namlh31",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.2"
+  },
+  "orig_nbformat": 4
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}

tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,64 @@

+{
+  "add_prefix_space": false,
+  "bos_token": {
+    "__type": "AddedToken",
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "cls_token": {
+    "__type": "AddedToken",
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "__type": "AddedToken",
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "errors": "replace",
+  "mask_token": {
+    "__type": "AddedToken",
+    "content": "<mask>",
+    "lstrip": true,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "model_max_length": 512,
+  "pad_token": {
+    "__type": "AddedToken",
+    "content": "<pad>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "sep_token": {
+    "__type": "AddedToken",
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "special_tokens_map_file": "/home/namlh31aic/.cache/huggingface/hub/models--microsoft--codebert-base/snapshots/3b0952feddeffad0063f274080e3c23d75e7eb39/special_tokens_map.json",
+  "tokenizer_class": "RobertaTokenizer",
+  "trim_offsets": true,
+  "unk_token": {
+    "__type": "AddedToken",
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  }
+}

vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff