File size: 4,339 Bytes
4d9dd77 cb0039e 4d9dd77 cb0039e 4d9dd77 cb0039e 4d9dd77 cb0039e 967c296 cb0039e 4d9dd77 cb0039e |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 |
import streamlit as st
import pandas as pd
import numpy as np
import re
import json
import base64
import uuid
import transformers
from datasets import Dataset,load_dataset, load_from_disk
from transformers import AutoTokenizer, AutoModelForTokenClassification, Trainer
# Configure the browser tab title/icon; must be the first Streamlit call in the script.
st.set_page_config(
    page_title="Named Entity Recognition Tagger", page_icon="π"
)
def convert_df(df: pd.DataFrame):
    """Serialize *df* to UTF-8 encoded CSV bytes, without the index column."""
    csv_text = df.to_csv(index=False)
    return csv_text.encode('utf-8')
def convert_json(df: pd.DataFrame):
    """Serialize *df* to a JSON string keyed by row index.

    The pandas output is round-tripped through json.loads/json.dumps so the
    result uses the stdlib's default formatting (spaces after ':' and ',').
    """
    raw = df.to_json(orient="index")
    return json.dumps(json.loads(raw))
st.title("πNamed Entity Recognition Tagger")
@st.cache(allow_output_mutation=True)
def load_model():
    """Download and cache the Wolof NER model.

    Returns (trainer, model, tokenizer) for the fine-tuned
    xlm-roberta-base checkpoint hosted on the Hugging Face hub.
    """
    checkpoint = "vonewman/xlm-roberta-base-finetuned-wolof"
    ner_model = AutoModelForTokenClassification.from_pretrained(checkpoint)
    ner_tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    return Trainer(model=ner_model), ner_model, ner_tokenizer
# Model class-id -> IOB2 entity tag (ids are contiguous from 0).
id2tag = dict(enumerate([
    'O',
    'B-LOC', 'B-PER', 'I-PER', 'B-ORG',
    'I-DATE', 'B-DATE', 'I-ORG', 'I-LOC',
]))
def tag_sentence(text: str):
    """Tag every token of *text* and return a DataFrame of results.

    Relies on the module-level ``tokenizer``, ``model`` and ``id2tag``
    (populated after ``load_model()`` runs). Columns: word, tag,
    probability (percentage rounded to 2 decimals).
    """
    # Tokenize into a PyTorch batch of size 1, truncating to model max length.
    inputs = tokenizer(text, truncation=True, return_tensors="pt")
    outputs = model(**inputs)
    # Per-token class probabilities for the single sequence in the batch.
    probs = outputs[0][0].softmax(1)
    rows = []
    for idx, tag_id in enumerate(probs.argmax(axis=1)):
        token_text = tokenizer.decode(inputs['input_ids'][0][idx].item())
        confidence = np.round(probs[idx][tag_id].item() * 100, 2)
        rows.append((token_text, id2tag[tag_id.item()], confidence))
    return pd.DataFrame(rows, columns=['word', 'tag', 'probability'])
# Input form: Streamlit defers the rerun until the submit button is pressed.
with st.form(key='my_form'):
    x1 = st.text_input(label='Enter a sentence:', max_chars=250)
    # Removed leftover debug print(x1) that wrote every input to the
    # server console on each rerun.
    submit_button = st.form_submit_button(label='π·οΈ Create tags')
if submit_button:
    # Reject whitespace-only input. (Raw string fixes the invalid escape
    # sequence warning from '\s' in a non-raw literal.)
    if re.sub(r'\s+', '', x1) == '':
        st.error('Please enter a non-empty sentence.')
    # A single bare word carries no sentence context for tagging.
    elif re.match(r'\A\s*\w+\s*\Z', x1):
        st.error("Please enter a sentence with more than one word")
    else:
        st.markdown("### Tagged Sentence")
        st.header("")
        # Lowercase 'trainer': the original assignment shadowed the imported
        # transformers.Trainer class. model/tokenizer become module globals
        # consumed by tag_sentence().
        trainer, model, tokenizer = load_model()
        results = tag_sentence(x1)
        cs, c1, c2, c3, cLast = st.columns([0.75, 1.5, 1.5, 1.5, 0.75])
        with c1:
            csvbutton = st.download_button(label="π₯ Download .csv", data=convert_df(results), file_name="results.csv", mime='text/csv', key='csv')
        with c2:
            # Fixed file_name: was "results.text", which gave the download a
            # non-standard extension for a plain-text file.
            textbutton = st.download_button(label="π₯ Download .txt", data=convert_df(results), file_name="results.txt", mime='text/plain', key='text')
        with c3:
            jsonbutton = st.download_button(label="π₯ Download .json", data=convert_json(results), file_name="results.json", mime='application/json', key='json')
        st.header("")
        c1, c2, c3 = st.columns([1, 3, 1])
        with c2:
            # Color-code the probability column for quick visual scanning.
            st.table(results.style.background_gradient(subset=['probability']).format(precision=2))
st.header("")
st.header("")
st.header("")
# About panel. Text corrected to describe the model this app actually loads
# (vonewman/xlm-roberta-base-finetuned-wolof) and the tag set defined in
# id2tag (LOC/PER/ORG/DATE) — the previous text described roberta-large on
# the wnut_17 dataset with a different entity set.
with st.expander("βΉοΈ - About this app", expanded=True):
    st.write(
        """     
-   The **Named Entity Recognition Tagger** app is a tool that performs named entity recognition.
-   The available entities are: *location* (LOC), *person* (PER), *organization* (ORG) and *date* (DATE).
-   The app uses the [xlm-roberta-base model](https://huggingface.co/vonewman/xlm-roberta-base-finetuned-wolof), fine-tuned for Wolof named entity recognition.
-   The model uses a subword tokenizer; each sentence is tokenized before tagging.
-   For more info regarding the data science part, check this [post](https://towardsdatascience.com/named-entity-recognition-with-deep-learning-bert-the-essential-guide-274c6965e2d?sk=c3c3699e329e45a8ed93d286ae04ef10).
       """
    )