Spaces:

vonewman
/

ner_app

Runtime error

File size: 4,227 Bytes

import streamlit as st
import pandas as pd
import numpy as np
import re
import json
import base64
import uuid

import transformers
from datasets import Dataset,load_dataset, load_from_disk
from transformers import AutoTokenizer, AutoModelForTokenClassification, Trainer


# Browser-tab title and icon for the app.
st.set_page_config(page_title="Named Entity Recognition Wolof", page_icon="📘")


def convert_df(df:pd.DataFrame):
    """Return *df* serialised as CSV (without the index column), UTF-8 encoded."""
    csv_text = df.to_csv(index=False)
    return csv_text.encode('utf-8')

#@st.cache
def convert_json(df:pd.DataFrame):
    """Return *df* as a JSON string keyed by row index.

    The frame is first serialised by pandas (``orient="index"``), parsed
    back into Python objects and re-dumped with ``json.dumps``, so the
    result uses the standard-library separators.
    """
    rows_by_index = json.loads(df.to_json(orient="index"))
    return json.dumps(rows_by_index)

# Main page heading.
st.title("📘Named Entity Recognition Wolof")

# st.cache is deprecated in current Streamlit releases; st.cache_resource is
# its replacement for unserialisable objects such as models. Fall back to the
# legacy decorator so the app still starts on older Streamlit versions.
_cache_model = getattr(st, "cache_resource", None)
if _cache_model is None:
    _cache_model = st.cache(allow_output_mutation=True)


@_cache_model
def load_model():
    """Load the fine-tuned token-classification model and its tokenizer.

    The result is cached by Streamlit so the checkpoint is not reloaded on
    every script rerun.

    Returns:
        A ``(trainer, model, tokenizer)`` tuple, where ``trainer`` is a
        ``Trainer`` wrapping ``model``.
    """
    checkpoint = "vonewman/wolof-finetuned-ner"

    model = AutoModelForTokenClassification.from_pretrained(checkpoint)
    trainer = Trainer(model=model)

    tokenizer = AutoTokenizer.from_pretrained(checkpoint)

    return trainer, model, tokenizer

# Class index -> entity tag, used to label the model's predictions.
# NOTE(review): presumably this order must match the label mapping the model
# was fine-tuned with — verify against the checkpoint's config.
id2tag = dict(enumerate([
    'O',
    'B-LOC',
    'B-PER',
    'I-PER',
    'B-ORG',
    'I-DATE',
    'B-DATE',
    'I-ORG',
    'I-LOC',
]))

def tag_sentence(text:str):
    """Tag every token of *text* and return the predictions as a DataFrame.

    Relies on the module-level ``tokenizer``, ``model`` and ``id2tag``.

    Returns:
        A DataFrame with one row per token id produced by the tokenizer and
        the columns ``word`` (decoded token), ``tag`` (looked up in
        ``id2tag``) and ``probability`` (percentage, rounded to 2 decimals).
    """
    # Tokenize the text into a single-sequence batch of tensors.
    encoded = tokenizer(text, truncation=True, return_tensors="pt")
    # First model output, first (only) sequence in the batch.
    # NOTE(review): assumes the first output holds the per-token class scores.
    scores = model(**encoded)[0][0]
    # Normalise the scores of each token into probabilities.
    probs = scores.softmax(1)
    token_ids = encoded['input_ids'][0]

    rows = []
    # Keep, for every token, the most probable tag and its probability.
    for position, tag_id in enumerate(probs.argmax(axis=1)):
        token_text = tokenizer.decode(token_ids[position].item())
        tag = id2tag[tag_id.item()]
        percent = np.round(probs[position][tag_id].item() * 100, 2)
        rows.append((token_text, tag, percent))

    return pd.DataFrame(rows, columns=['word', 'tag', 'probability'])


# Input form: a single sentence (at most 250 characters) and a submit button.
# The leftover debug print of the user's input was removed; it wrote every
# submitted sentence to the server's stdout on each rerun.
with st.form(key='my_form'):

    x1 = st.text_input(label='Enter a sentence:', max_chars=250)
    submit_button = st.form_submit_button(label='🏷️ Create tags')


# Validate the submitted sentence, tag it, and offer the results for download.
if submit_button:
    # Nothing, or only whitespace, was submitted.
    if not x1.strip():
        st.error('Please enter a non-empty sentence.')

    # The whole input is a single word: reject it.
    elif re.match(r'\A\s*\w+\s*\Z', x1):
        st.error("Please enter a sentence with more than one word.")

    else:
        st.markdown("### Tagged Sentence")
        st.header("")

        # `model` and `tokenizer` must be module-level names because
        # tag_sentence() reads them as globals. The trainer is bound to a
        # throwaway name so it does not shadow the imported Trainer class.
        _trainer, model, tokenizer = load_model()
        results = tag_sentence(x1)

        # Three download buttons, centred between two narrow spacer columns.
        _, c1, c2, c3, _ = st.columns([0.75, 1.5, 1.5, 1.5, 0.75])

        with c1:
            st.download_button(label="📥 Download .csv", data=convert_df(results),
                               file_name="results.csv", mime='text/csv', key='csv')
        with c2:
            # Same CSV-formatted payload, served as plain text.
            st.download_button(label="📥 Download .txt", data=convert_df(results),
                               file_name="results.txt", mime='text/plain', key='text')
        with c3:
            st.download_button(label="📥 Download .json", data=convert_json(results),
                               file_name="results.json", mime='application/json', key='json')

        st.header("")

        # Show the table in the wide middle column.
        _, table_col, _ = st.columns([1, 3, 1])

        with table_col:
            st.table(results.style.background_gradient(subset=['probability']).format(precision=2))

# Vertical spacing before the "About" section.
st.header("")
st.header("")
st.header("")
# Static description of the app. The entity names follow the tag set in
# id2tag (ORG, LOC, PER, DATE), and the tokenizer named is the one
# XLM-RoBERTa uses (SentencePiece).
with st.expander("ℹ️ - About this app", expanded=True):

    st.write(
        """
-   The **Named Entity Recognition Wolof** app is a tool that performs named entity recognition in Wolof.
-   The available entities are: *organization*, *location*, *person* and *date*.
-   The app uses the [XLMRoberta model](https://huggingface.co/xlm-roberta-base), fine-tuned on the [masakhaNER](https://huggingface.co/datasets/masakhane/masakhaner2) dataset.
-   The model uses the **SentencePiece tokenizer**. Each sentence is first tokenized.
       """
    )