Spaces:
Runtime error
Runtime error
File size: 3,943 Bytes
f7594ce 643e598 0883338 f7594ce 0883338 f7594ce 643e598 f7594ce 0883338 f7594ce 643e598 f7594ce 0883338 643e598 0883338 643e598 f7594ce 0883338 86b720b f1b8f44 0f0b9bc f1b8f44 0883338 f7594ce 643e598 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 |
import streamlit as st
from streamlit_option_menu import option_menu
import numpy as np
import os
import datasets
import argparse
from typing import Tuple
import transformers
import torch
from torch.utils.data import Dataset
import matplotlib as plt
import random
from tqdm import tqdm
import pandas as pd
from huggingface_hub import login
from torch.optim import lr_scheduler
from typing import Callable, Dict, List, Tuple, Union
import csv
from timeit import default_timer as timer
def load_tokenizer(tokenizer_name:str)->object:
    """
    Load a tokenizer by name.

    Args:
        - tokenizer_name -> name (or local path) of the tokenizer to load
    Returns:
        - tokenizer -> the object returned by
          ``transformers.AutoTokenizer.from_pretrained``
    """
    # Use the argument instead of a hard-coded checkpoint id so the caller
    # actually controls which tokenizer is loaded.
    tokenizer = transformers.AutoTokenizer.from_pretrained(tokenizer_name)
    return tokenizer
def load_model(model_name:str)->object:
    """
    Load the encoder of a T5-style checkpoint in bfloat16.

    Args:
        - model_name -> name (or local path) of the checkpoint to load
    Returns:
        - model_encoder -> the ``transformers.T5EncoderModel`` instance
          (only the model is returned, not a tokenizer)
    """
    print(f'Loading model {model_name}...')
    # Only the encoder is instantiated, so tell transformers to ignore
    # checkpoint keys matching "decoder.*" instead of reporting them as unexpected.
    # NOTE: this sets a class-level attribute and therefore affects every
    # later T5EncoderModel load in the process.
    transformers.T5EncoderModel._keys_to_ignore_on_load_unexpected = ["decoder.*"]
    # Load the checkpoint named by the argument (previously a hard-coded id
    # was used while the log line above printed ``model_name``).
    model_encoder = transformers.T5EncoderModel.from_pretrained(
        model_name,
        torch_dtype=torch.bfloat16,
    )
    print("---MODEL LOADED---")
    return model_encoder
class stylometer_classifier(torch.nn.Module):
    """
    Binary human-vs-LLM classification head on top of a pretrained encoder.

    The encoder output at sequence position 0 is passed through
    Linear -> ReLU -> Dropout -> Linear -> sigmoid, and the resulting score
    is compared against ``threshold`` to pick the label.

    Args:
        - pretrained_encoder -> callable accepting ``input_ids`` and
          ``attention_mask`` keyword arguments; element ``[0]`` of its
          result is used as the hidden states
        - dimensionality -> size of the encoder's hidden representation
        - threshold -> score at or above which the sample is labelled human
          (default 0.07, the value previously hard-coded in ``forward``)
    """

    def __init__(self, pretrained_encoder, dimensionality, threshold=0.07):
        super().__init__()
        self.modelBase = pretrained_encoder
        self.pre_classifier = torch.nn.Linear(dimensionality, 768, dtype=torch.bfloat16)
        self.activation = torch.nn.ReLU()
        self.dropout = torch.nn.Dropout(0.2)
        self.classifier = torch.nn.Linear(768, 1, dtype=torch.bfloat16)
        # Plain Python attribute: not part of the state_dict, so existing
        # checkpoints still load unchanged.
        self.threshold = threshold

    def forward(self, input_ids, padding_mask):
        """
        Classify a single tokenized sample.

        Args:
            - input_ids -> token ids, forwarded to the encoder
            - padding_mask -> attention mask, forwarded to the encoder
        Returns:
            - dict with "my_class" (label string) and "prob" (sigmoid score tensor)
        Raises:
            - ValueError when the score tensor does not hold exactly one
              element (i.e. more than one sample was passed)
        """
        encoder_out = self.modelBase(input_ids=input_ids, attention_mask=padding_mask)
        hidden_state = encoder_out[0]
        # Only the representation at the first sequence position is used
        # for classification.
        cls_output = hidden_state[:, 0]
        pooled = self.pre_classifier(cls_output)
        pooled = self.dropout(self.activation(pooled))
        output = torch.sigmoid(self.classifier(pooled))
        # A single label is returned, so the score must be a single value;
        # fail with a clear message instead of torch's "ambiguous truth value".
        if output.numel() != 1:
            raise ValueError(
                "stylometer_classifier.forward expects exactly one sample, "
                f"got a score tensor of shape {tuple(output.shape)}"
            )
        # Compare on the tensor (not on .item()) so the comparison keeps the
        # same dtype semantics as before.
        label = "It's a Human!" if output >= self.threshold else "It's an LLM!"
        return {"my_class": label,
                "prob": output}
def adapt_model(model:object, dim:int=1024) -> object:
    """
    Wrap an encoder with the stylometer classification head.

    Args:
        - model -> the encoder to wrap
        - dim -> hidden size passed to the head as ``dimensionality``
    Returns:
        - a ``stylometer_classifier`` built around ``model``
    """
    return stylometer_classifier(model, dimensionality=dim)
def main():
    """
    Streamlit entry point: load tokenizer, encoder and classification head,
    restore the fine-tuned weights from ``checkpoint.bin``, then classify the
    code pasted into the text area and display the predicted label.
    """
    print("----starting enviroment----")
    model_name = "Salesforce/codet5p-770m"
    checkpoint = "checkpoint.bin"
    DEVICE = "cpu"
    # NOTE(review): Streamlit re-executes the script on each interaction, so
    # everything below is reloaded every time; consider Streamlit's resource
    # caching if the installed version provides it.
    #load tokenizer
    tokenizer = load_tokenizer(model_name)
    print("tokenizer loaded!")
    #loading the pretrained encoder
    model = load_model(model_name)
    #adding classification head to the model
    model = adapt_model(model, dim=model.shared.embedding_dim)
    # NOTE(review): torch.load unpickles the file; only use a checkpoint from
    # a trusted source.
    model.load_state_dict(torch.load(checkpoint, map_location=DEVICE))
    model = model.eval()
    st.title("Human-AI stylometer - Multilingual")
    st.caption('From the paper: Is This You, LLM? Recognizing AI-written Programs with Multilingual Code Stylometry')
    text = st.text_area("insert your code here")
    button = st.button("send")
    if button or text:
        # Do not run the model on an empty submission.
        if not text.strip():
            st.warning("Please insert some code before sending.")
            return
        encoded = tokenizer([text])
        # Inference only: skip autograd bookkeeping.
        with torch.no_grad():
            out = model(
                torch.tensor(encoded.input_ids),
                torch.tensor(encoded.attention_mask),
            )
        st.write(out["my_class"])
# Script entry point (the stray trailing "|" left by page extraction is removed;
# it made this statement a syntax error).
if __name__ == '__main__':
    main()