# Spaces: isThisYouLLM/Human-Ai (Running) - File size: 3,943 Bytes

import streamlit as st
from streamlit_option_menu import option_menu
import numpy as np
import os
import datasets
import argparse
from typing import Tuple
import transformers
import torch
from torch.utils.data import Dataset
import matplotlib as plt
import random
from tqdm import tqdm
import pandas as pd
from huggingface_hub import login
from torch.optim import lr_scheduler
from typing import Callable, Dict, List, Tuple, Union
import csv
from timeit import default_timer as timer





def load_tokenizer(tokenizer_name:str)->object:
    """
    Function to load the tokenizer by the model's name
    Args:
     - tokenizer_name -> the name of the tokenizer to download
    Returns:
     - tokenizer -> the object returned by transformers.AutoTokenizer.from_pretrained
    """
    # Load the tokenizer that was actually requested instead of a hard-coded
    # checkpoint name, so the argument is no longer silently ignored.
    tokenizer = transformers.AutoTokenizer.from_pretrained(tokenizer_name)

    return tokenizer


def load_model(model_name:str)->object:
    """
     Function for model loading
     Args:
     - model_name -> the name of the checkpoint passed to from_pretrained
     Returns:
     - model_encoder -> the transformers.T5EncoderModel loaded with
       torch_dtype=torch.bfloat16
    """

    print(f'Loading  model {model_name}...')

    # Ask from_pretrained for bfloat16 weights.
    model_kwargs = {"torch_dtype": torch.bfloat16}

    # Class-level setting (affects every later T5EncoderModel load too).
    # NOTE(review): presumably this keeps the checkpoint's "decoder.*" weights
    # from being reported as unexpected keys, since only the encoder is
    # instantiated here - confirm against the transformers version in use.
    transformers.T5EncoderModel._keys_to_ignore_on_load_unexpected = ["decoder.*"]

    # Load the checkpoint named by the caller (previously a hard-coded name was
    # used here and `model_name` only appeared in the log message).
    model_encoder = transformers.T5EncoderModel.from_pretrained(model_name, **model_kwargs)

    print("---MODEL LOADED---")

    return model_encoder

class stylometer_classifier(torch.nn.Module):
    """
    Binary "human vs LLM" classification head stacked on a pretrained encoder.

    Args:
     - pretrained_encoder -> callable accepting `input_ids` and `attention_mask`
       keyword arguments; item 0 of its result is used as the hidden states
     - dimensionality -> input size of the first linear layer (the encoder's
       hidden size)
     - threshold -> minimum sigmoid score for the "It's a Human!" label
       (default 0.07, the value that used to be hard-coded in forward)
    """

    def __init__(self, pretrained_encoder, dimensionality, threshold=0.07):
        super().__init__()
        self.modelBase = pretrained_encoder
        self.pre_classifier = torch.nn.Linear(dimensionality, 768, dtype=torch.bfloat16)
        self.activation = torch.nn.ReLU()
        self.dropout = torch.nn.Dropout(0.2)
        self.classifier = torch.nn.Linear(768, 1, dtype=torch.bfloat16)
        # Plain Python attribute (neither parameter nor buffer), so the
        # state_dict layout is unchanged and existing checkpoints still load.
        self.threshold = threshold

    def forward(self, input_ids, padding_mask):
        """
        Score one tokenized snippet and label it.

        Args:
         - input_ids -> token ids forwarded to the encoder
         - padding_mask -> forwarded to the encoder as `attention_mask`
        Returns:
         - dict with "my_class" (the label string) and "prob" (the sigmoid
           output tensor of the classifier)
        """
        encoder_output = self.modelBase(input_ids=input_ids, attention_mask=padding_mask)
        hidden_state = encoder_output[0]
        # Only the representation at sequence position 0 feeds the classifier.
        first_token = hidden_state[:, 0]
        pooled = self.pre_classifier(first_token)
        pooled = self.dropout(self.activation(pooled))
        output = torch.sigmoid(self.classifier(pooled))

        # Truth-testing the comparison requires `output` to hold exactly one
        # element, i.e. one sample per call; larger batches raise in PyTorch.
        is_human = bool(output >= self.threshold)
        label = "It's a Human!" if is_human else "It's an LLM!"
        return {"my_class": label,
                "prob": output}

def adapt_model(model:object, dim:int=1024) -> object:
    """
    Wrap `model` in a stylometer_classifier, i.e. attach the classification head.
    Args:
     - model -> the encoder handed to stylometer_classifier as its base model
     - dim -> forwarded as `dimensionality` (default 1024)
    Returns:
     - the newly built stylometer_classifier instance
    """
    return stylometer_classifier(model, dimensionality=dim)





def main():
    """
    Entry point of the Streamlit app: load the tokenizer and the classifier,
    restore the checkpoint weights, then label the code typed into the text area.
    """
    print("----starting enviroment----")

    model_name = "Salesforce/codet5p-770m"
    checkpoint = "checkpoint.bin"

    DEVICE = "cpu"

    #load tokenizer
    tokenizer = load_tokenizer(model_name)
    print("tokenizer  loaded!")

    # NOTE(review): Streamlit re-executes the script on each interaction, so
    # the model appears to be reloaded every time - consider caching the
    # loaded model (e.g. st.cache_resource) if the installed version has it.
    #load the pretrained encoder
    model = load_model(model_name)
    #adding classification head to the model
    model = adapt_model(model, dim=model.shared.embedding_dim)

    #restore the checkpoint weights, mapping every tensor onto DEVICE
    model.load_state_dict(torch.load(checkpoint, map_location=DEVICE))
    model = model.eval()
    st.title("Human-AI stylometer - Multilingual")

    st.caption('From the paper: Is This You, LLM? Recognizing AI-written Programs with Multilingual Code Stylometry')

    text = st.text_area("insert your code here")
    button = st.button("send")
    if button or text:
        # `encoded` rather than `input`, which would shadow the builtin.
        encoded = tokenizer([text])
        input_ids = torch.tensor(encoded.input_ids)
        attention_mask = torch.tensor(encoded.attention_mask)
        # Inference only: no_grad avoids building the autograd graph.
        with torch.no_grad():
            out = model(input_ids, attention_mask)
        st.write(out["my_class"])



# Run the app only when this file is executed as a script, not when imported.
if __name__ == '__main__':
    main()