# Human-Ai / app.py
import streamlit as st
import torch
import transformers
def load_tokenizer(tokenizer_name: str) -> transformers.PreTrainedTokenizerBase:
    """
    Load the tokenizer matching the given model name.
    Args:
    - tokenizer_name -> the name of the tokenizer to download
    Returns:
    - tokenizer -> the loaded tokenizer
    """
    tokenizer = transformers.AutoTokenizer.from_pretrained(tokenizer_name)
    return tokenizer
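# Example usage (illustrative sketch; assumes network access to the Hugging Face Hub):
#   tokenizer = load_tokenizer("Salesforce/codet5p-770m")
#   ids = tokenizer(["def f(): pass"]).input_ids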
def load_model(model_name: str) -> transformers.T5EncoderModel:
    """
    Load the encoder of the given pretrained model.
    Args:
    - model_name -> the name of the model to download
    Returns:
    - model_encoder -> the encoder part of the pretrained model
    """
    print(f'Loading model {model_name}...')
    model_kwargs = {}
    model_kwargs.update(dict(torch_dtype=torch.bfloat16))
    # Only the encoder is needed, so unexpected decoder weights are ignored on load.
    transformers.T5EncoderModel._keys_to_ignore_on_load_unexpected = ["decoder.*"]
    model_encoder = transformers.T5EncoderModel.from_pretrained(model_name, **model_kwargs)
    print("---MODEL LOADED---")
    return model_encoder
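# Example usage (illustrative sketch; mirrors the call made in main() below):
#   encoder = load_model("Salesforce/codet5p-770m")
#   print(encoder.config.d_model)  # 1024 for codet5p-770m, matching adapt_model's default dim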
class stylometer_classifier(torch.nn.Module):
    def __init__(self, pretrained_encoder, dimensionality):
        super(stylometer_classifier, self).__init__()
        self.modelBase = pretrained_encoder
        self.pre_classifier = torch.nn.Linear(dimensionality, 768, dtype=torch.bfloat16)
        self.activation = torch.nn.ReLU()
        self.dropout = torch.nn.Dropout(0.2)
        self.classifier = torch.nn.Linear(768, 1, dtype=torch.bfloat16)

    def forward(self, input_ids, padding_mask):
        output_1 = self.modelBase(input_ids=input_ids, attention_mask=padding_mask)
        hidden_state = output_1[0]
        # Only the first-token representation is kept for classification
        cls_output = hidden_state[:, 0]
        pooler = self.pre_classifier(cls_output)
        afterActivation = self.activation(pooler)
        pooler_after_act = self.dropout(afterActivation)
        output = torch.sigmoid(self.classifier(pooler_after_act))
        # Scores at or above the 0.07 threshold are labelled as human-written
        if output >= 0.07:
            return {"my_class": "It's a Human!",
                    "prob": output}
        else:
            return {"my_class": "It's an LLM!",
                    "prob": output}
def adapt_model(model: torch.nn.Module, dim: int = 1024) -> stylometer_classifier:
    """
    Wrap the pretrained encoder with a classification head and return the resulting model.
    """
    newModel = stylometer_classifier(model, dimensionality=dim)
    return newModel
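# Example usage (illustrative sketch; "checkpoint.bin" is the fine-tuned classifier state loaded in main()):
#   clf = adapt_model(load_model("Salesforce/codet5p-770m"), dim=1024)
#   clf.load_state_dict(torch.load("checkpoint.bin", map_location="cpu"))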
def main():
    print("----starting environment----")
    model_name = "Salesforce/codet5p-770m"
    checkpoint = "checkpoint.bin"

    # Load the tokenizer
    tokenizer = load_tokenizer(model_name)
    print("tokenizer loaded!")
    # Load the pretrained encoder
    model = load_model(model_name)
    # Add the classification head and load the fine-tuned weights
    model = adapt_model(model, dim=model.shared.embedding_dim)
    model.load_state_dict(torch.load(checkpoint, map_location='cpu'))
    model = model.eval()

    st.title("Human-AI stylometer - Multilingual")
    st.caption('From the paper: Is This You, LLM? Recognizing AI-written Programs with Multilingual Code Stylometry')
    text = st.text_area("insert your code here")
    button = st.button("send")
    if button or text:
        encoded = tokenizer([text])
        out = model(torch.tensor(encoded.input_ids), torch.tensor(encoded.attention_mask))
        st.write(out["my_class"])


if __name__ == '__main__':
    main()