|
import streamlit as st |
|
import pandas as pd |
|
import numpy as np |
|
import re |
|
import json |
|
import base64 |
|
import uuid |
|
|
|
import transformers |
|
from datasets import Dataset,load_dataset, load_from_disk |
|
from transformers import AutoTokenizer, AutoModelForTokenClassification, Trainer |
|
|
|
|
|
# Page-level settings: the title and icon passed to Streamlit for this app.
st.set_page_config(
    page_title="Named Entity Recognition Wolof",
    page_icon="π",
)
|
|
|
|
|
def convert_df(df: pd.DataFrame) -> bytes:
    """Serialize *df* to CSV text (index column omitted), encoded as UTF-8."""
    csv_text = df.to_csv(index=False)
    return csv_text.encode("utf-8")
|
|
|
|
|
def convert_json(df: pd.DataFrame) -> str:
    """Return *df* as a JSON string keyed by row index.

    pandas renders the frame with ``orient="index"``; the result is then
    parsed and re-encoded with the standard ``json`` module, so the returned
    text uses ``json.dumps`` default formatting.
    """
    rows_by_index = json.loads(df.to_json(orient="index"))
    return json.dumps(rows_by_index)
|
|
|
# Main heading rendered on the page.
st.title(
    "πNamed Entity Recognition Wolof"
)
|
|
|
# ``st.cache`` is deprecated in newer Streamlit releases in favour of
# ``st.cache_resource`` for objects such as models. Prefer the new decorator
# when it exists and fall back to the legacy one on older installations.
_cache_model = (
    st.cache_resource
    if hasattr(st, "cache_resource")
    else st.cache(allow_output_mutation=True)
)


@_cache_model
def load_model():
    """Load the fine-tuned Wolof NER checkpoint; the result is cached.

    Returns:
        A ``(trainer, model, tokenizer)`` tuple: a ``Trainer`` wrapping the
        model, the token-classification model itself, and its tokenizer.
    """
    checkpoint = "vonewman/wolof-finetuned-ner"

    model = AutoModelForTokenClassification.from_pretrained(checkpoint)
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    trainer = Trainer(model=model)

    return trainer, model, tokenizer
|
|
|
# Lookup from integer class id to BIO entity tag; the list position is the id.
id2tag = dict(
    enumerate(
        [
            "O",
            "B-LOC",
            "B-PER",
            "I-PER",
            "B-ORG",
            "I-DATE",
            "B-DATE",
            "I-ORG",
            "I-LOC",
        ]
    )
)
|
|
|
def tag_sentence(text: str) -> pd.DataFrame:
    """Tag every token of *text* with its highest-scoring entity label.

    Relies on the module-level ``tokenizer``, ``model`` and ``id2tag``.

    Args:
        text: Sentence to tag.

    Returns:
        A DataFrame with one row per token id produced by the tokenizer and
        the columns ``word`` (decoded token), ``tag`` (label looked up in
        ``id2tag``) and ``probability`` (softmax score of that label, as a
        percentage rounded to two decimals).
    """
    # Local import: the tensors requested below (return_tensors="pt") are
    # PyTorch tensors, and the file has no module-level torch import.
    import torch

    inputs = tokenizer(text, truncation=True, return_tensors="pt")

    # Inference only, so skip autograd bookkeeping for the forward pass.
    with torch.no_grad():
        outputs = model(**inputs)

    # outputs[0] is presumably the logits tensor; [0] selects the single
    # sentence in the batch, and softmax runs over the label dimension.
    probs = outputs[0][0].softmax(1)
    token_ids = inputs["input_ids"][0]

    word_tags = [
        (
            tokenizer.decode(token_ids[i].item()),
            id2tag[tag_id.item()],
            np.round(probs[i][tag_id].item() * 100, 2),
        )
        for i, tag_id in enumerate(probs.argmax(axis=1))
    ]

    return pd.DataFrame(word_tags, columns=["word", "tag", "probability"])
|
|
|
|
|
# Sentence input form. `x1` (the typed text) and `submit_button` are read by
# the submission handling that follows.
with st.form(key='my_form'):
    x1 = st.text_input(label='Enter a sentence:', max_chars=250)
    submit_button = st.form_submit_button(label='🏷️ Create tags')
|
|
|
|
|
if submit_button:
    if not x1.strip():
        # Empty or whitespace-only input.
        st.error('Please enter a non-empty sentence.')

    elif re.match(r'\A\s*\w+\s*\Z', x1):
        # The input consists of exactly one word.
        st.error("Please enter a sentence with more than one word.")

    else:
        st.markdown("### Tagged Sentence")
        st.header("")

        # `model` and `tokenizer` must be module-level names: tag_sentence()
        # reads them as globals. The trainer is not used here, so it is
        # discarded instead of rebinding the imported `Trainer` class.
        _, model, tokenizer = load_model()
        results = tag_sentence(x1)

        # The .csv and .txt downloads share the same CSV payload.
        csv_bytes = convert_df(results)

        # The two outer columns are empty spacers around the three buttons.
        _, csv_col, txt_col, json_col, _ = st.columns([0.75, 1.5, 1.5, 1.5, 0.75])

        with csv_col:
            st.download_button(
                label="📥 Download .csv",
                data=csv_bytes,
                file_name="results.csv",
                mime='text/csv',
                key='csv',
            )

        with txt_col:
            st.download_button(
                label="📥 Download .txt",
                data=csv_bytes,
                file_name="results.txt",
                mime='text/plain',
                key='text',
            )

        with json_col:
            st.download_button(
                label="📥 Download .json",
                data=convert_json(results),
                file_name="results.json",
                mime='application/json',
                key='json',
            )

        st.header("")

        # Render the table in the wide middle column of a 1:3:1 layout.
        _, table_col, _ = st.columns([1, 3, 1])

        with table_col:
            st.table(
                results.style
                .background_gradient(subset=['probability'])
                .format(precision=2)
            )
|
|
|
# Three empty headers act as vertical spacing before the "About" section.
st.header("")
st.header("")
st.header("")

with st.expander("ℹ️ - About this app", expanded=True):
    # NOTE(review): XLM-RoBERTa tokenizers are documented as
    # SentencePiece-based, so the "byte-level BPE" wording below may be
    # inaccurate — confirm against the checkpoint and update the text.
    st.write(
        """
- The **Named Entity Recognition Wolof** app is a tool that performs named entity recognition in Wolof.
- The available entities are: *corporation*, *location*, *person* and *date*.
- The app uses the [XLMRoberta model](https://huggingface.co/xlm-roberta-base), fine-tuned on the [masakhaNER](https://huggingface.co/datasets/masakhane/masakhaner2) dataset.
- The model uses the **byte-level BPE tokenizer**. Each sentence is first tokenized.
        """
    )