import json
import re

import pandas as pd
import streamlit as st
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification

st.set_page_config(
    page_title="Named Entity Recognition Wolof",
    page_icon="📘"
)

# Mapping from class indices to the BIO tags predicted by the fine-tuned model
id2tag = {0: 'O',
          1: 'B-LOC',
          2: 'B-PER',
          3: 'I-PER',
          4: 'B-ORG',
          5: 'I-DATE',
          6: 'B-DATE',
          7: 'I-ORG',
          8: 'I-LOC'}


def convert_df(df: pd.DataFrame):
    return df.to_csv(index=False).encode('utf-8')


def convert_json(df: pd.DataFrame):
    result = df.to_json(orient="index")
    parsed = json.loads(result)
    return json.dumps(parsed)


@st.cache_resource  # load the model and tokenizer once, not on every rerun
def load_model():
    model = AutoModelForTokenClassification.from_pretrained(
        "vonewman/xlm-roberta-base-finetuned-wolof")
    tokenizer = AutoTokenizer.from_pretrained(
        "vonewman/xlm-roberta-base-finetuned-wolof")
    return model, tokenizer


def align_word_ids(texts, tokenizer, label_all_tokens=False):
    # Build a mask over token positions: special tokens and padding get -100
    # (ignored downstream); by default only the first sub-word of each word
    # is kept, so predictions line up with whole words.
    tokenized_inputs = tokenizer(texts, padding='max_length',
                                 max_length=218, truncation=True)
    word_ids = tokenized_inputs.word_ids()

    previous_word_idx = None
    label_ids = []
    for word_idx in word_ids:
        if word_idx is None:
            label_ids.append(-100)
        elif word_idx != previous_word_idx:
            label_ids.append(1)
        else:
            label_ids.append(1 if label_all_tokens else -100)
        previous_word_idx = word_idx

    return label_ids


def predict_ner_labels(model, tokenizer, sentence):
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    if use_cuda:
        model = model.cuda()

    text = tokenizer(sentence, padding='max_length', max_length=218,
                     truncation=True, return_tensors="pt")

    mask = text['attention_mask'].to(device)
    input_id = text['input_ids'].to(device)
    label_ids = torch.tensor(align_word_ids(sentence, tokenizer)).unsqueeze(0).to(device)

    with torch.no_grad():
        logits = model(input_ids=input_id, attention_mask=mask).logits

    # Keep only the logits of the first sub-word of each word
    logits_clean = logits[label_ids != -100]

    predictions = logits_clean.argmax(dim=1).tolist()
    return [id2tag[i] for i in predictions]


def tag_sentence(text):
    model, tokenizer = load_model()
    predictions = predict_ner_labels(model, tokenizer, text)
    # Build a DataFrame pairing each whitespace-separated word with its
    # predicted tag (assumes this split matches the tokenizer's word ids)
    return pd.DataFrame({'words': text.split(), 'tags': predictions})


st.title("📘 Named Entity Recognition Wolof")

with st.form(key='my_form'):
    x1 = st.text_input(label='Enter a sentence:', max_chars=250)
    submit_button = st.form_submit_button(label='🏷️ Create tags')

if submit_button:
    if re.sub(r'\s+', '', x1) == '':
        st.error('Please enter a non-empty sentence.')
    elif re.match(r'\A\s*\w+\s*\Z', x1):
        st.error("Please enter a sentence with more than one word.")
    else:
        st.markdown("### Tagged Sentence")
        st.header("")

        results = tag_sentence(x1)

        cs, c1, c2, c3, cLast = st.columns([0.75, 1.5, 1.5, 1.5, 0.75])

        with c1:
            csvbutton = st.download_button(label="📥 Download .csv",
                                           data=convert_df(results),
                                           file_name="results.csv",
                                           mime='text/csv', key='csv')
        with c2:
            textbutton = st.download_button(label="📥 Download .txt",
                                            data=convert_df(results),
                                            file_name="results.txt",
                                            mime='text/plain', key='text')
        with c3:
            jsonbutton = st.download_button(label="📥 Download .json",
                                            data=convert_json(results),
                                            file_name="results.json",
                                            mime='application/json', key='json')

        st.header("")

        c1, c2, c3 = st.columns([1, 3, 1])

        with c2:
            st.table(results[['words', 'tags']])

st.header("")
st.header("")
st.header("")

with st.expander("ℹ️ - About this app", expanded=True):
    st.write(
        """
        - The **Named Entity Recognition Wolof** app is a tool that performs named entity recognition on Wolof text.
        - The available entities are: *organization*, *location*, *person*, and *date*.
        - The app uses the [XLMRoberta model](https://huggingface.co/xlm-roberta-base), fine-tuned on the [masakhaNER](https://huggingface.co/datasets/masakhane/masakhaner2) dataset.
        - The model relies on a **SentencePiece tokenizer**: each sentence is first split into sub-word pieces before being tagged.
        """
    )
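
# ---------------------------------------------------------------------------
# Usage sketch: being a Streamlit script, this file is launched from a shell
# rather than executed with the Python interpreter directly. Assuming it is
# saved as `app.py` (a hypothetical filename) and the dependencies seen in
# the imports above are installed, something like the following should work:
#
#     pip install streamlit pandas torch transformers
#     streamlit run app.py
#
# The fine-tuned weights ("vonewman/xlm-roberta-base-finetuned-wolof") are
# downloaded from the Hugging Face Hub on first run, so network access is
# required. To inspect the sub-word split mentioned in the About section,
# one can call the tokenizer directly, e.g.:
#
#     >>> model, tokenizer = load_model()
#     >>> tokenizer.tokenize("Dakar")   # returns the list of sub-word pieces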