vonewman committed on
Commit
cb0039e
·
1 Parent(s): 967c296

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +114 -15
app.py CHANGED
@@ -1,24 +1,123 @@
1
  import streamlit as st
2
- from transformers import pipeline
 
 
 
 
 
3
 
4
- # Créez un widget pour télécharger le fichier
5
- uploaded_file = st.file_uploader("Téléchargez un document (PDF, TXT, CSV, JSON)", type=["pdf", "txt", "csv", "json"])
 
6
 
7
- # Chargement du modèle DistilBERT pour la reconnaissance d'entités nommées
8
- nlp = pipeline("ner", model="distilbert-base-cased",
9
- aggregation_strategy="simple")
10
 
11
- if uploaded_file is not None:
12
- # Lecture du contenu du fichier
13
- text = uploaded_file.read()
14
 
15
- # Utilisation du modèle de traitement du langage naturel pour la reconnaissance d'entités nommées
16
- entities = nlp(text)
17
 
18
- st.subheader("Entités nommées détectées dans le document :")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19
 
20
- for entity in entities:
21
- st.write(f"Texte : {entity['word']}, Étiquette : {entity['entity']}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22
 
23
- # Vous pouvez également afficher d'autres informations sur les entités détectées si nécessaire.
24
 
 
 
 
 
 
 
 
 
 
 
1
  import streamlit as st
2
+ import pandas as pd
3
+ import numpy as np
4
+ import re
5
+ import json
6
+ import base64
7
+ import uuid
8
 
9
+ import transformers
10
+ from datasets import Dataset,load_dataset, load_from_disk
11
+ from transformers import AutoTokenizer, AutoModelForTokenClassification, Trainer
12
 
 
 
 
13
 
14
# Browser-tab title and icon for the app.
st.set_page_config(
    page_title="Named Entity Recognition Tagger",
    page_icon="📘",
)
17
 
 
 
18
 
19
def convert_df(df: pd.DataFrame):
    """Serialize ``df`` to CSV (index column omitted) and return it as UTF-8 bytes."""
    csv_text = df.to_csv(index=False)
    return csv_text.encode('utf-8')
21
+
22
+ #@st.cache
23
def convert_json(df: pd.DataFrame):
    """Return ``df`` as a JSON string keyed by row index.

    The frame is first serialized by pandas with ``orient="index"`` and then
    re-encoded through the standard ``json`` module, so the returned text
    uses ``json.dumps`` default formatting.
    """
    rows_by_index = json.loads(df.to_json(orient="index"))
    return json.dumps(rows_by_index)
29
+
30
# Main page heading.
st.title("📘Named Entity Recognition Tagger")
31
+
32
@st.cache(allow_output_mutation=True)
def load_model():
    """Load the fine-tuned token-classification model and its tokenizer.

    Both are fetched from the same Hugging Face checkpoint.

    Returns:
        A ``(trainer, model, tokenizer)`` tuple, where ``trainer`` is a
        ``Trainer`` wrapping ``model``.
    """
    checkpoint = "vonewman/xlm-roberta-base-finetuned-wolof"

    ner_model = AutoModelForTokenClassification.from_pretrained(checkpoint)
    ner_trainer = Trainer(model=ner_model)
    ner_tokenizer = AutoTokenizer.from_pretrained(checkpoint)

    return ner_trainer, ner_model, ner_tokenizer
41
+
42
# Maps the model's class indices (0-8) to their entity tag names;
# the position of each tag in the list is its class index.
id2tag = dict(enumerate([
    'O',
    'B-LOC',
    'B-PER',
    'I-PER',
    'B-ORG',
    'I-DATE',
    'B-DATE',
    'I-ORG',
    'I-LOC',
]))
52
+
53
def tag_sentence(text: str):
    """Tag every token of ``text`` with its most probable entity label.

    Uses the module-level ``tokenizer``, ``model`` and ``id2tag``, so those
    names must be bound before this function is called.

    Returns:
        A DataFrame with one row per token id produced by the tokenizer and
        the columns ``word`` (the decoded token), ``tag`` (label looked up in
        ``id2tag``) and ``probability`` (softmax score of that label, as a
        percentage rounded to two decimals).
    """
    encoded = tokenizer(text, truncation=True, return_tensors="pt")
    # First output of the model, first (only) sequence in the batch.
    scores = model(**encoded)[0][0]
    probs = scores.softmax(1)
    token_ids = encoded['input_ids'][0]

    rows = []
    for position, best_tag in enumerate(probs.argmax(axis=1)):
        token_text = tokenizer.decode(token_ids[position].item())
        percent = np.round(probs[position][best_tag].item() * 100, 2)
        rows.append((token_text, id2tag[best_tag.item()], percent))

    return pd.DataFrame(rows, columns=['word', 'tag', 'probability'])
66
+
67
+
68
# Input form: the sentence is only processed once the submit button is pressed.
with st.form(key='my_form'):
    x1 = st.text_input(label='Enter a sentence:', max_chars=250)
    submit_button = st.form_submit_button(label='🏷️ Create tags')

if submit_button:
    if not x1.strip():
        # Empty or whitespace-only input.
        st.error('Please enter a non-empty sentence.')

    elif re.match(r'\A\s*\w+\s*\Z', x1):
        # The input is exactly one word; a sentence needs more than that.
        st.error("Please enter a sentence with more than one word.")

    else:
        st.markdown("### Tagged Sentence")
        st.header("")

        # `model` and `tokenizer` must be module-level names because
        # tag_sentence() reads them as globals. The trainer is not used here,
        # and it is bound to a private name so the imported `Trainer` class
        # is not overwritten.
        _trainer, model, tokenizer = load_model()
        results = tag_sentence(x1)

        # NOTE(review): interacting with a download button reruns the script,
        # and on that rerun `submit_button` is presumably False, so this
        # section would disappear — consider keeping `results` in
        # st.session_state; confirm against the deployed Streamlit version.
        cs, c1, c2, c3, cLast = st.columns([0.75, 1.5, 1.5, 1.5, 0.75])

        with c1:
            st.download_button(label="📥 Download .csv", data=convert_df(results),
                               file_name="results.csv", mime='text/csv', key='csv')
        with c2:
            # The plain-text export reuses the CSV serialization.
            st.download_button(label="📥 Download .txt", data=convert_df(results),
                               file_name="results.txt", mime='text/plain', key='text')
        with c3:
            st.download_button(label="📥 Download .json", data=convert_json(results),
                               file_name="results.json", mime='application/json', key='json')

        st.header("")

        c1, c2, c3 = st.columns([1, 3, 1])

        with c2:
            # Shade the probability column and show two decimals.
            st.table(results.style.background_gradient(subset=['probability']).format(precision=2))
108
+
109
# Empty headers act as vertical spacing above the "About" panel.
st.header("")
st.header("")
st.header("")

# The description names the checkpoint loaded by load_model() and the entity
# types present in id2tag (LOC, PER, ORG, DATE).
with st.expander("ℹ️ - About this app", expanded=True):
    st.write(
        """
- The **Named Entity Recognition Tagger** app is a tool that performs named entity recognition.
- The available entities are: *date*, *location*, *organization* and *person*.
- The app uses the [XLM-RoBERTa model](https://huggingface.co/xlm-roberta-base), fine-tuned for Wolof ([vonewman/xlm-roberta-base-finetuned-wolof](https://huggingface.co/vonewman/xlm-roberta-base-finetuned-wolof)).
- The model uses a **SentencePiece tokenizer**. Each sentence is first tokenized.
- For more info regarding the data science part, check this [post](https://towardsdatascience.com/named-entity-recognition-with-deep-learning-bert-the-essential-guide-274c6965e2d?sk=c3c3699e329e45a8ed93d286ae04ef10).
        """
    )