import streamlit as st
import datasets
import evaluate
from evaluate import evaluator
from huggingface_hub import HfApi, ModelFilter
from transformers import AutoConfig, AutoTokenizer, AutoModelForSequenceClassification
from transformers import pipeline
import matplotlib.pyplot as plt
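
# Streamlit app: pick a text-classification dataset from the Hugging Face Hub,
# evaluate up to five models trained on it with two metrics of your choice,
# and plot the models against each other on those metrics.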

st.title("Metric Compare")

st.markdown("### Choose the dataset you want to use for the comparison:")

api = HfApi()
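# Query the Hub for the 20 most-downloaded text-classification datasets;
# 'glue' is skipped because it is a benchmark collection of several sub-tasks
# rather than a single dataset.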
dsets = [d.id for d in api.list_datasets(filter="task_categories:text-classification", sort="downloads", direction=-1, limit=20) if d.id != 'glue']

dset = st.selectbox('Choose a dataset from the Hub', options=dsets)

info = datasets.get_dataset_infos(dset)
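# get_dataset_infos() returns a dict mapping each config name to its DatasetInfo,
# so the config picker is populated from its keys and the split picker from .splits.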

dset_config = st.selectbox('What config do you want to use?', options=list(info))

splitlist = list(info[dset_config].splits)

dset_split = st.selectbox('Choose a dataset split for evaluation', options=splitlist)


st.markdown("### Now select up to 5 models to compare their performance:")

filt = ModelFilter(trained_dataset=dset)
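# ModelFilter(trained_dataset=...) narrows the listing to models whose card declares
# training on the chosen dataset; models tagged 't5' are skipped below since they
# are text-to-text models rather than sequence-classification heads.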
all_models = [m.modelId for m in api.list_models(filter=filt, sort="downloads", direction=-1, limit=20) if 't5' not in m.tags]

models = st.multiselect(
    'Choose the models that have been trained/finetuned on this dataset',
    options=all_models,
    max_selections=5)  # matches the five-model limit stated above (Streamlit >= 1.12)

st.markdown("### What two metrics do you want to compare?")

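# Each option is the name of a metric script on the Hub, loaded later via evaluate.load().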
metrics = st.multiselect(
    'Choose the metrics for the comparison',
    options=['f1', 'accuracy', 'precision', 'recall'],
    max_selections=2)  # the comparison plot below expects exactly two metrics

st.markdown("### Please wait for the dataset and models to load (this can take some time if they are big!")

### Loading data
try:
    data = datasets.load_dataset(dset, split=dset_split)
    st.text("Loaded the " + str(dset_split) + " split of dataset " + str(dset))
except Exception:
    # Fall back to the test split before giving up entirely.
    try:
        data = datasets.load_dataset(dset, split="test")
        st.text("Loaded the test split of dataset " + str(dset))
    except Exception:
        st.text("Sorry, I can't load this dataset... try another one!")
     
### Loading models

# Keep tokenizers and models in dicts keyed by checkpoint name.
tokenizers, loaded_models = {}, {}
for checkpoint in models:
    try:
        tokenizers[checkpoint] = AutoTokenizer.from_pretrained(checkpoint)
        loaded_models[checkpoint] = AutoModelForSequenceClassification.from_pretrained(checkpoint)
        st.text("Loaded model " + str(checkpoint))
    except Exception:
        st.text("Sorry, I can't load model " + str(checkpoint))

### Defining metrics
loaded_metrics = {}
for metric_name in metrics:
    try:
        loaded_metrics[metric_name] = evaluate.load(metric_name)
    except Exception:
        st.text("Sorry, I can't load metric " + str(metric_name) + "... Try another one!")


### Defining Evaluator
task_evaluator = evaluator("text-classification")
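# evaluator("text-classification") returns a TextClassificationEvaluator; its
# .compute() runs a model or pipeline over the dataset and scores the predictions
# with a given metric, translating pipeline labels via label_mapping.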

### Showing the dataset's labels

st.markdown("### Help us pick the right labels for your models")

st.text("The labels for your dataset are: "+ str(data.features['label'].names))

"""
for i in range (len(model_list)):
  st.text("The labels for your dataset are: "+ str(data.features['label'].names))
  print(model_list[i])
  print(AutoConfig.from_pretrained(models[0]).id2label)

for i in range (len(models)):
  try:
    globals()[f"pipe1_{i}"] = AutoTokenizer.from_pretrained(models[i])
    globals()[f"model_{i}"] = AutoModelForSequenceClassification.from_pretrained(models[i])
     st.text("Loaded model "+ str(models[i]))
  except:
     st.text("Sorry, I can't load model "+ str(models[i]))

pipe1 = pipeline("text-classification", model=model1, tokenizer= tokenizer1, device=0)
res_accuracy1 = eval.compute(model_or_pipeline=pipe1, data=data, metric=accuracy,
                       label_mapping={"NEGATIVE": 0, "POSITIVE": 1},)
res_f11 = eval.compute(model_or_pipeline=pipe1, data=data, metric=f1,
                       label_mapping={"NEGATIVE": 0, "POSITIVE": 1},)
print({**res_accuracy1, **res_f11})

pipe2 = pipeline("text-classification", model=model2, tokenizer= tokenizer2, device=0)
res_accuracy2 = eval.compute(model_or_pipeline=pipe2, data=data, metric=accuracy,
                       label_mapping={"LABEL_0": 0, "LABEL_1": 1},)
res_f12 = eval.compute(model_or_pipeline=pipe2, data=data, metric=f1,
                       label_mapping={"LABEL_0": 0, "LABEL_1": 1},)
print({**res_accuracy2, **res_f12})

pipe3 = pipeline("text-classification", model=model3, tokenizer= tokenizer3, device=0)
res_accuracy3 = eval.compute(model_or_pipeline=pipe3, data=data, metric=accuracy,
                       label_mapping={"neg": 0, "pos": 1},)
res_f13 = eval.compute(model_or_pipeline=pipe3, data=data, metric=f1,
                       label_mapping={"neg": 0, "pos": 1},)
print({**res_accuracy3, **res_f13})

pipe4 = pipeline("text-classification", model=model4, tokenizer= tokenizer4, device=0)
res_accuracy4 = eval.compute(model_or_pipeline=pipe4, data=data, metric=accuracy,
                       label_mapping={"LABEL_0": 0, "LABEL_1": 1},)
res_f14 = eval.compute(model_or_pipeline=pipe4, data=data, metric=f1,
                       label_mapping={"LABEL_0": 0, "LABEL_1": 1},)
print({**res_accuracy4, **res_f14})

pipe5 = pipeline("text-classification", model=model5, tokenizer= tokenizer5, device=0)
res_accuracy5 = eval.compute(model_or_pipeline=pipe5, data=data, metric=accuracy,
                       label_mapping={"LABEL_0": 0, "LABEL_1": 1},)
res_f15 = eval.compute(model_or_pipeline=pipe5, data=data, metric=f1,
                       label_mapping={"LABEL_0": 0, "LABEL_1": 1},)
print({**res_accuracy5, **res_f15})

plt.plot(res_accuracy1['accuracy'], res_f11['f1'], marker='o', markersize=6, color="red")
plt.annotate('distilbert', xy=(res_accuracy1['accuracy']+0.001, res_f11['f1']))
plt.plot(res_accuracy2['accuracy'], res_f12['f1'], marker='o', markersize=6, color="blue")
plt.annotate('distilbert-base-uncased-finetuned', xy=(res_accuracy2['accuracy']+0.001, res_f12['f1']))
plt.plot(res_accuracy3['accuracy'], res_f13['f1'], marker='o', markersize=6, color="green")
plt.annotate('roberta-base', xy=(res_accuracy3['accuracy']-0.009, res_f13['f1']))
plt.plot(res_accuracy4['accuracy'], res_f14['f1'], marker='o', markersize=6, color="purple")
plt.annotate('funnel-transformer-small', xy=(res_accuracy4['accuracy']-0.015, res_f14['f1']))
plt.plot(res_accuracy5['accuracy'], res_f15['f1'], marker='o', markersize=6, color="black")
plt.annotate('SENATOR', xy=(res_accuracy5['accuracy']+0.001, res_f15['f1']))

plt.xlabel('Accuracy')
plt.ylabel('F1 Score') 
#plt.xlim([0.9, 1.0])
#plt.ylim([0.9, 1.0])
plt.title('Comparing the Models')
"""