File size: 3,943 Bytes
f7594ce
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
643e598
0883338
f7594ce
 
0883338
 
 
f7594ce
643e598
f7594ce
0883338
f7594ce
643e598
 
 
f7594ce
 
0883338
643e598
0883338
643e598
f7594ce
 
 
0883338
 
86b720b
f1b8f44
0f0b9bc
f1b8f44
0883338
 
 
 
 
 
f7594ce
 
 
643e598
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
import streamlit as st
from streamlit_option_menu import option_menu
import numpy as np
import os
import datasets
import argparse
from typing import Tuple
import transformers
import torch
from torch.utils.data import Dataset
import matplotlib as plt
import random
from tqdm import tqdm
import pandas as pd
from huggingface_hub import login
from torch.optim import lr_scheduler
from typing import Callable, Dict, List, Tuple, Union
import csv
from timeit import default_timer as timer





def load_tokenizer(tokenizer_name:str)->object:
    """
    Function to load the tokenizer by the model's name
    Args: 
     - tokenizer_name -> the name (hub id or local path) of the tokenizer to load
    Returns:
     - tokenizer -> the tokenizer built by AutoTokenizer.from_pretrained
    """
    # Use the requested name: the previous version ignored the argument and
    # always loaded "Salesforce/codet5p-770m", whatever the caller asked for.
    tokenizer = transformers.AutoTokenizer.from_pretrained(tokenizer_name)

    return tokenizer


def load_model(model_name:str)->object:
    """
     Function for model loading: builds the T5 encoder of the given checkpoint
     with bfloat16 weights.
     Args: 
     - model_name -> the name (hub id or local path) of the model
     Returns:
     - model_encoder -> the loaded transformers.T5EncoderModel
    """

    print(f'Loading  model {model_name}...')

    # Only the encoder half is instantiated, so checkpoint entries under
    # "decoder.*" are declared ignorable (presumably to silence the
    # unexpected-keys report at load time).
    transformers.T5EncoderModel._keys_to_ignore_on_load_unexpected = ["decoder.*"]

    # Load the requested checkpoint: the previous version printed model_name
    # but always loaded "Salesforce/codet5p-770m".
    model_encoder = transformers.T5EncoderModel.from_pretrained(
        model_name, torch_dtype=torch.bfloat16
    )

    print("---MODEL LOADED---")

    return model_encoder

class stylometer_classifier(torch.nn.Module):
    """
    Binary human-vs-LLM classifier built on top of a pretrained encoder.

    The representation of the first token produced by the encoder goes through
    Linear(dimensionality, 768) -> ReLU -> Dropout(0.2) -> Linear(768, 1) and a
    sigmoid; the resulting score is compared against a decision threshold.

    Args:
     - pretrained_encoder -> callable taking ``input_ids`` and ``attention_mask``
       keyword arguments whose output, indexed with [0], is the hidden states
     - dimensionality -> size of the last dimension of those hidden states
     - threshold -> scores greater than or equal to this value are labelled
       human, lower ones LLM (default 0.07, the value previously hard-coded)
    """

    def __init__(self, pretrained_encoder, dimensionality, threshold=0.07):
        super().__init__()
        # NOTE: the submodule attribute names below become state_dict keys,
        # so they must stay as they are for saved checkpoints to keep loading.
        self.modelBase = pretrained_encoder
        self.pre_classifier = torch.nn.Linear(dimensionality, 768, dtype=torch.bfloat16)
        self.activation = torch.nn.ReLU()
        self.dropout = torch.nn.Dropout(0.2)
        self.classifier = torch.nn.Linear(768, 1, dtype=torch.bfloat16)
        # Plain Python attribute: it is not a parameter or buffer, so it does
        # not appear in the state_dict.
        self.threshold = threshold

    def forward(self, input_ids, padding_mask):
        """
        Classify one tokenized sample.

        Args:
         - input_ids -> token ids, forwarded to the encoder as ``input_ids``
         - padding_mask -> forwarded to the encoder as ``attention_mask``
        Returns:
         - dict with "my_class" (the label string) and "prob" (the sigmoid
           score tensor)

        The threshold comparison is used directly as an ``if`` condition, so
        the score tensor must hold a single element, i.e. one sample per call.
        """
        encoder_out = self.modelBase(input_ids=input_ids, attention_mask=padding_mask)
        hidden_state = encoder_out[0]
        # Only the representation of the first token is used for classification
        cls_output = hidden_state[:, 0]
        pooler = self.pre_classifier(cls_output)
        pooler_after_act = self.dropout(self.activation(pooler))
        output = torch.sigmoid(self.classifier(pooler_after_act))

        if output >= self.threshold:
            my_class = "It's a Human!"
        else:
            my_class = "It's an LLM!"

        # Single exit point; the old trailing ``return output`` was unreachable.
        return {"my_class": my_class,
                "prob": output}

def adapt_model(model:object, dim:int=1024) -> object:
    """
    Wrap the given encoder in a stylometer_classifier, i.e. add the
    classification head on top of it.
    Args:
     - model -> the pretrained encoder to wrap
     - dim -> passed to the classifier as ``dimensionality`` (default 1024)
    Returns:
     - the newly built stylometer_classifier
    """
    return stylometer_classifier(model, dimensionality=dim)





@st.cache_resource
def _load_classifier(model_name: str, checkpoint: str, device: str = "cpu"):
    """
    Build the tokenizer and the fine-tuned classifier.

    Cached with st.cache_resource so that Streamlit reruns of the script
    (one per widget interaction) reuse the loaded objects instead of
    reloading the model every time.

    Args:
     - model_name -> name of the pretrained tokenizer/encoder to load
     - checkpoint -> path of the state_dict file with the fine-tuned weights
     - device -> ``map_location`` used when reading the checkpoint
    Returns:
     - (tokenizer, model) -> the tokenizer and the classifier in eval mode
    """
    #load tokenizer
    tokenizer = load_tokenizer(model_name)
    print("tokenizer  loaded!")

    #load the encoder and add the classification head to it
    encoder = load_model(model_name)
    model = adapt_model(encoder, dim=encoder.shared.embedding_dim)

    # NOTE(review): torch.load unpickles the file; only use checkpoints that
    # come from a trusted source.
    model.load_state_dict(torch.load(checkpoint, map_location=device))

    return tokenizer, model.eval()


def main():
    """
    Streamlit entry point: render the page, read the code typed by the user
    and display the label predicted by the classifier.
    """
    print("----starting enviroment----")

    model_name = "Salesforce/codet5p-770m"
    checkpoint = "checkpoint.bin"
    device = "cpu"

    tokenizer, model = _load_classifier(model_name, checkpoint, device)

    st.title("Human-AI stylometer - Multilingual")

    st.caption('From the paper: Is This You, LLM? Recognizing AI-written Programs with Multilingual Code Stylometry')

    text = st.text_area("insert your code here")
    button = st.button("send")
    if not (button or text):
        return

    # Nothing meaningful to classify: tell the user instead of running the
    # model on an empty string.
    if not text.strip():
        st.warning("Please insert some code to analyse.")
        return

    encoded = tokenizer([text])
    # Inference only: no gradients are needed.
    with torch.no_grad():
        out = model(torch.tensor(encoded.input_ids), torch.tensor(encoded.attention_mask))
    st.write(out["my_class"])



# Run the app only when this file is executed as a script, not when imported.
if __name__ == '__main__':
    main()