{"metadata":{"kernelspec":{"language":"python","display_name":"Python 3","name":"python3"},"language_info":{"name":"python","version":"3.10.13","mimetype":"text/x-python","codemirror_mode":{"name":"ipython","version":3},"pygments_lexer":"ipython3","nbconvert_exporter":"python","file_extension":".py"},"kaggle":{"accelerator":"gpu","dataSources":[{"sourceId":9069761,"sourceType":"datasetVersion","datasetId":5470604},{"sourceId":85584,"sourceType":"modelInstanceVersion","isSourceIdPinned":true,"modelInstanceId":71904,"modelId":96864}],"dockerImageVersionId":30747,"isInternetEnabled":true,"language":"python","sourceType":"notebook","isGpuEnabled":true}},"nbformat_minor":4,"nbformat":4,"cells":[{"cell_type":"code","source":"%%capture\n!pip install sentence_transformers lime","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"# Обучение модели","metadata":{}},{"cell_type":"code","source":"import pandas as pd\nimport numpy as np\nfrom IPython.display import HTML\nfrom functools import partial\nimport lime\nimport torch\nimport torch.nn as nn\nimport torch.nn.functional as F\nfrom torch.optim.lr_scheduler import StepLR\nfrom torch.utils.data import Dataset, DataLoader\nfrom sklearn.metrics import f1_score\nfrom lime.lime_text import LimeTextExplainer\nfrom sklearn.model_selection import train_test_split\nfrom transformers import AutoTokenizer, AutoModel\n\n\nclass TextDataset(Dataset):\n def __init__(self, texts, labels, tokenizer, max_length=512):\n self.texts = texts\n self.labels = labels\n self.tokenizer = tokenizer\n self.max_length = max_length\n\n def __len__(self):\n return len(self.texts)\n\n def __getitem__(self, idx):\n text = self.texts[idx]\n inputs = self.tokenizer.encode_plus(\n text,\n None,\n add_special_tokens=True,\n max_length=self.max_length,\n padding='max_length',\n return_token_type_ids=True,\n return_attention_mask=True,\n truncation=True\n )\n input_ids = inputs['input_ids']\n attention_mask = inputs['attention_mask']\n return {\n 'input_ids': torch.tensor(input_ids, dtype=torch.long),\n 'attention_mask': torch.tensor(attention_mask, dtype=torch.long),\n 'labels': torch.tensor(self.labels[idx], dtype=torch.float)\n }\n\n \ndata = pd.read_csv('/kaggle/input/zvon-v-ushah/zvon_v_ushah.csv').drop(columns='Unnamed: 0')\n\nX, y = data['code'].values, data['by_human'].values\nX_temp, X_test, y_temp, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=data['by_human'])\nX_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=0.3, random_state=42, stratify=y_temp)\n\ndevice = torch.device('cuda' if torch.cuda.is_available() else 'cpu')","metadata":{"execution":{"iopub.status.busy":"2024-07-31T02:55:25.522623Z","iopub.execute_input":"2024-07-31T02:55:25.523439Z","iopub.status.idle":"2024-07-31T02:55:31.332338Z","shell.execute_reply.started":"2024-07-31T02:55:25.523394Z","shell.execute_reply":"2024-07-31T02:55:31.331385Z"},"trusted":true},"execution_count":1,"outputs":[]},{"cell_type":"code","source":"class LogRegClassifier(nn.Module):\n def __init__(self, transformer_output_dim):\n super(LogRegClassifier, self).__init__()\n self.linear = nn.Linear(transformer_output_dim, 1)\n\n def forward(self, x):\n return torch.sigmoid(self.linear(x))\n \n\nclass CombinedModel(nn.Module):\n def __init__(self, transformer, classifier):\n super(CombinedModel, self).__init__()\n self.transformer = transformer\n self.classifier = classifier\n \n# for param in self.transformer.parameters():\n# param.requires_grad 
{"cell_type":"code","source":"class LogRegClassifier(nn.Module):\n    def __init__(self, transformer_output_dim):\n        super(LogRegClassifier, self).__init__()\n        self.linear = nn.Linear(transformer_output_dim, 1)\n\n    def forward(self, x):\n        return torch.sigmoid(self.linear(x))\n\n\nclass CombinedModel(nn.Module):\n    def __init__(self, transformer, classifier):\n        super(CombinedModel, self).__init__()\n        self.transformer = transformer\n        self.classifier = classifier\n\n        # Uncomment to freeze the transformer and train only the classifier head:\n        # for param in self.transformer.parameters():\n        #     param.requires_grad = False\n\n    def forward(self, input_ids, attention_mask):\n        outputs = self.transformer(input_ids=input_ids, attention_mask=attention_mask)\n        # Pool by taking the first ([CLS]-position) token embedding\n        pooled_output = outputs.last_hidden_state[:, 0, :]\n        return self.classifier(pooled_output)\n\n\ndef train_model(combined_model, train_dataloader, val_dataloader, scheduler, epochs=5):\n    # Uses the global criterion and optimizer defined alongside the training call\n    best_f1 = 0\n    best_model_state = None\n\n    for epoch in range(epochs):\n        # validate_model switches to eval(), so re-enable train mode each epoch\n        combined_model.train()\n        for batch in train_dataloader:\n            input_ids = batch['input_ids'].to(device)\n            attention_mask = batch['attention_mask'].to(device)\n            labels = batch['labels'].to(device)\n\n            outputs = combined_model(input_ids=input_ids, attention_mask=attention_mask).squeeze(1)\n            loss = criterion(outputs, labels)\n\n            optimizer.zero_grad()\n            loss.backward()\n            optimizer.step()\n\n        scheduler.step()\n\n        val_f1 = validate_model(combined_model, val_dataloader)\n        if val_f1 > best_f1:\n            best_f1 = val_f1\n            best_model_state = combined_model.state_dict()\n\n        print(f'Epoch [{epoch+1}/{epochs}], Loss: {loss.item():.4f}, Val F1: {val_f1:.4f}')\n\n    if best_model_state:\n        combined_model.load_state_dict(best_model_state)\n        torch.save(combined_model.state_dict(), 'best_model.pth')\n        print(f'Best Validation F1: {best_f1:.4f}')\n\n\ndef validate_model(combined_model, val_dataloader):\n    combined_model.eval()\n    all_labels = []\n    all_predictions = []\n\n    with torch.no_grad():\n        for batch in val_dataloader:\n            input_ids = batch['input_ids'].to(device)\n            attention_mask = batch['attention_mask'].to(device)\n            labels = batch['labels'].to(device)\n\n            # squeeze(1) rather than squeeze() so a final batch of size 1 stays 1-D\n            outputs = combined_model(input_ids=input_ids, attention_mask=attention_mask).squeeze(1)\n            predictions = (outputs > 0.5).int()\n            all_labels.extend(labels.cpu().numpy())\n            all_predictions.extend(predictions.cpu().numpy())\n\n    f1 = f1_score(all_labels, all_predictions)\n    return f1\n\n\ndef test_model(combined_model, test_dataloader):\n    combined_model.eval()\n    all_labels = []\n    all_predictions = []\n    total_loss = 0\n\n    with torch.no_grad():\n        for batch in test_dataloader:\n            input_ids = batch['input_ids'].to(device)\n            attention_mask = batch['attention_mask'].to(device)\n            labels = batch['labels'].to(device)\n\n            outputs = combined_model(input_ids=input_ids, attention_mask=attention_mask).squeeze(1)\n            loss = criterion(outputs, labels)\n            total_loss += loss.item()\n\n            predictions = (outputs > 0.5).int()\n            all_labels.extend(labels.cpu().numpy())\n            all_predictions.extend(predictions.cpu().numpy())\n\n    average_loss = total_loss / len(test_dataloader)\n    f1 = f1_score(all_labels, all_predictions)\n    print(f'Test Loss: {average_loss:.4f}, Test F1: {f1:.4f}')\n\n\ndef predict(texts, tokenizer, combined_model):\n    combined_model.eval()\n    inputs = tokenizer(texts, padding=True, truncation=True, return_tensors=\"pt\")\n    input_ids = inputs['input_ids'].to(device)\n    attention_mask = inputs['attention_mask'].to(device)\n\n    with torch.no_grad():\n        outputs = combined_model(input_ids=input_ids, attention_mask=attention_mask).squeeze(1)\n        predictions = (outputs > 0.5).int().cpu().numpy()\n    return predictions","metadata":{"execution":{"iopub.status.busy":"2024-07-31T02:55:31.334377Z","iopub.execute_input":"2024-07-31T02:55:31.334867Z","iopub.status.idle":"2024-07-31T02:55:31.357883Z","shell.execute_reply.started":"2024-07-31T02:55:31.334834Z","shell.execute_reply":"2024-07-31T02:55:31.357021Z"},"trusted":true},"execution_count":2,"outputs":[]},
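{"cell_type":"markdown","source":"`CombinedModel.forward` pools the transformer output by taking the first-token ([CLS]-position) embedding. A minimal shape sketch with dummy tensors, not part of the original training flow:","metadata":{}},
{"cell_type":"code","source":"# Shape sketch for the CLS pooling used in CombinedModel.forward:\n# last_hidden_state has shape (batch, seq_len, hidden_size); slicing [:, 0, :]\n# keeps the first-token vector per sequence -> (batch, hidden_size),\n# which is what the logistic-regression head expects.\ndummy_hidden = torch.randn(4, 512, 768)  # (batch, seq_len, hidden_size)\npooled = dummy_hidden[:, 0, :]           # (batch, hidden_size)\nprint(pooled.shape)                      # torch.Size([4, 768])","metadata":{"trusted":true},"execution_count":null,"outputs":[]},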
{"cell_type":"code","source":"# tokenizer = AutoTokenizer.from_pretrained('microsoft/graphcodebert-base')\n# transformer = AutoModel.from_pretrained('microsoft/graphcodebert-base')\n\n# train_dataset = TextDataset(X_train, y_train, tokenizer)\n# train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)\n# val_dataset = TextDataset(X_val, y_val, tokenizer)\n# val_dataloader = DataLoader(val_dataset, batch_size=16, shuffle=False)\n# test_dataset = TextDataset(X_test, y_test, tokenizer)\n# test_dataloader = DataLoader(test_dataset, batch_size=16, shuffle=False)\n\n# transformer_output_dim = transformer.config.hidden_size\n# classifier = LogRegClassifier(transformer_output_dim)\n# combined_model = CombinedModel(transformer, classifier).to(device)\n\n# criterion = nn.BCELoss()\n# optimizer = torch.optim.Adam(combined_model.classifier.parameters(), lr=0.01)\n# scheduler = StepLR(optimizer, step_size=6, gamma=0.8)\n\n# train_model(combined_model, train_dataloader, val_dataloader, scheduler, epochs=15)\n# test_model(combined_model, test_dataloader)","metadata":{"execution":{"iopub.status.busy":"2024-07-31T00:07:08.829757Z","iopub.execute_input":"2024-07-31T00:07:08.830461Z","iopub.status.idle":"2024-07-31T00:07:26.689837Z","shell.execute_reply.started":"2024-07-31T00:07:08.830426Z","shell.execute_reply":"2024-07-31T00:07:26.688928Z"},"trusted":true},"execution_count":9,"outputs":[{"name":"stdout","text":"Test Loss: 0.1154, Test F1: 0.9614\n","output_type":"stream"}]},
{"cell_type":"code","source":"tokenizer = AutoTokenizer.from_pretrained('microsoft/graphcodebert-base')\ntransformer = AutoModel.from_pretrained('microsoft/graphcodebert-base')\n\ntrain_dataset = TextDataset(X_train, y_train, tokenizer)\ntrain_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)\nval_dataset = TextDataset(X_val, y_val, tokenizer)\nval_dataloader = DataLoader(val_dataset, batch_size=16, shuffle=False)\ntest_dataset = TextDataset(X_test, y_test, tokenizer)\ntest_dataloader = DataLoader(test_dataset, batch_size=16, shuffle=False)\n\ntransformer_output_dim = transformer.config.hidden_size\nclassifier = LogRegClassifier(transformer_output_dim)\ncombined_model = CombinedModel(transformer, classifier).to(device)","metadata":{"execution":{"iopub.status.busy":"2024-07-31T02:55:50.225832Z","iopub.execute_input":"2024-07-31T02:55:50.226734Z","iopub.status.idle":"2024-07-31T02:55:55.883173Z","shell.execute_reply.started":"2024-07-31T02:55:50.226697Z","shell.execute_reply":"2024-07-31T02:55:55.882234Z"},"trusted":true},"execution_count":3,"outputs":[{"output_type":"display_data","data":{"text/plain":"tokenizer_config.json: 0%|          | 0.00/25.0 [00:00<?, ?B/s]"},"metadata":{}}]},
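{"cell_type":"markdown","source":"Since the training cell above is commented out, the trained weights have to be restored before inference. A minimal sketch, assuming the `best_model.pth` checkpoint written by `train_model` is available in the working directory; the exact path used originally is an assumption:","metadata":{}},
{"cell_type":"code","source":"# Hedged sketch: restore the trained weights saved by train_model() and put\n# the model in inference mode. The checkpoint path is an assumption; adjust it\n# to wherever best_model.pth actually lives (e.g. an attached Kaggle dataset).\nstate = torch.load('best_model.pth', map_location=device)\ncombined_model.load_state_dict(state)\ncombined_model.eval()","metadata":{"trusted":true},"execution_count":null,"outputs":[]},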
{"cell_type":"code","source":"sample_texts = [\n'''\nclass DebugInfoAnchorPrinter {\npublic:\n    void printAnchors(Module &M, Instruction &I) {\n        std::vector<DebugInfo> anchors;\n        for (const auto &anchor : M.getDebugInfoAnchors()) {\n            if (anchor.getInstruction() == &I) {\n                anchors.push_back(anchor.getDebugInfo());\n            }\n        }\n\n        // Print the collected debug info anchors\n        for (const auto &anchor : anchors) {\n            std::cout << anchor << std::endl;\n        }\n    }\n};\n''',\n'''\nvoid llvm::install_out_of_memory_new_handler() {\n    std::new_handler old = std::set_new_handler(out_of_memory_new_handler);\n    (void)old;\n    assert((old == nullptr || old == out_of_memory_new_handler) &&\n           \"new-handler already installed\");\n}\n''',\n'''\nunsigned newRegUnit(CodeGenRegister *R0, CodeGenRegister *R1 = nullptr) {\n    // Create a new register unit\n    CodeGenRegisterUnit *unit = new CodeGenRegisterUnit();\n\n    // Associate the root registers with the unit\n    unit->rootRegisters.push_back(R0);\n    if (R1 != nullptr) {\n        unit->rootRegisters.push_back(R1);\n    }\n\n    // Return the new register unit\n    return unit;\n}\n'''\n]\n# llm, human, llm\npredictions = predict(sample_texts, tokenizer, combined_model)\nprint(\"Predictions:\", predictions)","metadata":{"execution":{"iopub.status.busy":"2024-07-31T02:51:25.926539Z","iopub.execute_input":"2024-07-31T02:51:25.927156Z","iopub.status.idle":"2024-07-31T02:51:27.006592Z","shell.execute_reply.started":"2024-07-31T02:51:25.927124Z","shell.execute_reply":"2024-07-31T02:51:27.005626Z"},"trusted":true},"execution_count":7,"outputs":[{"name":"stdout","text":"Predictions: [0 1 0]\n","output_type":"stream"}]},
{"cell_type":"markdown","source":"# Interpretability","metadata":{}},
{"cell_type":"code","source":"def predict_for_lime(texts, tokenizer, combined_model):\n    combined_model.eval()\n    inputs = tokenizer(texts, padding=True, truncation=True, return_tensors=\"pt\")\n    input_ids = inputs['input_ids'].to(device)\n    attention_mask = inputs['attention_mask'].to(device)\n    with torch.no_grad():\n        outputs = combined_model(input_ids=input_ids, attention_mask=attention_mask).squeeze(1).cpu().numpy()\n    # LIME expects an (n_samples, n_classes) array of class probabilities\n    return np.column_stack([1 - outputs, outputs])\n\n\ntext = '''\nstd::vector<std::vector<float>> predict1(const std::vector<std::string>& texts, \n                                         torch::jit::script::Module& model, \n                                         /* Your tokenizer type */ tokenizer) {\n    std::vector<std::vector<float>> proba_labels;\n\n    // Tokenize the input texts\n    // Assume tokenize function returns tensors\n    auto inputs = tokenize(texts, tokenizer);\n    auto input_ids = std::get<0>(inputs);\n    auto attention_mask = std::get<1>(inputs);\n\n    // Move tensors to the same device as the model\n    input_ids = input_ids.to(torch::kCUDA);\n    attention_mask = attention_mask.to(torch::kCUDA);\n\n    // Perform inference\n    torch::NoGradGuard no_grad;\n    auto outputs = model.forward({input_ids, attention_mask}).toTensor().squeeze();\n\n    // Convert logits to predictions and probabilities\n    auto predictions = (outputs > 0.5).to(torch::kInt32);\n    auto outputs_cpu = outputs.cpu();\n    auto predictions_cpu = predictions.cpu();\n\n    // Prepare the result\n    for (int i = 0; i < predictions_cpu.size(0); ++i) {\n        proba_labels.push_back({outputs_cpu[i].item<float>(), predictions_cpu[i].item<float>()});\n    }\n\n    return proba_labels;\n}\n\n'''\n\nexplainer = LimeTextExplainer(class_names=['0', '1'])\nexp = explainer.explain_instance(text,\n                                 partial(predict_for_lime, tokenizer=tokenizer,\n                                         combined_model=combined_model),\n                                 num_features=20, num_samples=200)\n\ndisplay(HTML(exp.as_html()))","metadata":{"execution":{"iopub.status.busy":"2024-07-31T04:11:06.835804Z","iopub.execute_input":"2024-07-31T04:11:06.836180Z","iopub.status.idle":"2024-07-31T04:11:10.442770Z","shell.execute_reply.started":"2024-07-31T04:11:06.836149Z","shell.execute_reply":"2024-07-31T04:11:10.441379Z"},"trusted":true},"execution_count":79,"outputs":[{"output_type":"display_data","data":{"text/plain":"<IPython.core.display.HTML object>"},"metadata":{}}]},
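{"cell_type":"markdown","source":"Besides the HTML view, the explanation's token weights can be read programmatically. A short example using LIME's `as_list()`, not part of the original run; with the class order above, positive weights push the prediction toward class '1' (human-written):","metadata":{}},
{"cell_type":"code","source":"# Inspect the raw LIME token weights. Positive values push toward class 1\n# (human-written), negative toward class 0 (LLM-generated), given the\n# class_names=['0', '1'] order used above.\nfor token, weight in exp.as_list():\n    print(f'{token:>25s}  {weight:+.4f}')","metadata":{"trusted":true},"execution_count":null,"outputs":[]}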
\n \n \n "},"metadata":{}}]}]}