vonewman committed on
Commit
447a922
1 Parent(s): 66f3b3e

Delete app.py

Browse files
Files changed (1) hide show
  1. app.py +0 -116
app.py DELETED
@@ -1,116 +0,0 @@
1
- import streamlit as st
2
- import pandas as pd
3
- import re
4
- import json
5
- import transformers
6
- import torch
7
- from transformers import AutoTokenizer, AutoModelForTokenClassification, Trainer
8
-
9
# Browser-tab title and icon for the app page.
st.set_page_config(page_title="Named Entity Recognition Wolof", page_icon="📘")
13
-
14
def convert_df(df: pd.DataFrame):
    """Serialize *df* as CSV text without the index column and return it as UTF-8 bytes."""
    csv_text = df.to_csv(index=False)
    return csv_text.encode('utf-8')
16
-
17
def convert_json(df: pd.DataFrame):
    """Return *df* as a JSON string keyed by row index.

    The frame is exported with ``orient="index"``, parsed, and re-serialized
    with ``json.dumps`` so the result uses the standard-library formatting.
    """
    rows_by_index = json.loads(df.to_json(orient="index"))
    return json.dumps(rows_by_index)
22
-
23
# Memo for the loaded (trainer, model, tokenizer) tuple so that repeated
# calls within one script run do not reload the checkpoint.
_LOADED_BUNDLE = {}


def load_model():
    """Load the fine-tuned Wolof NER checkpoint and return it.

    The first call builds the token-classification model, a ``Trainer``
    wrapping it, and the matching tokenizer; later calls return the same
    objects instead of loading them again.

    Returns:
        A ``(trainer, model, tokenizer)`` tuple.
    """
    # NOTE(review): Streamlit re-executes the script on each interaction, so
    # this memo presumably only spans a single run; ``st.cache_resource`` could
    # persist the objects across reruns if the installed version provides it.
    if "bundle" not in _LOADED_BUNDLE:
        checkpoint = "vonewman/wolof-finetuned-ner"
        model = AutoModelForTokenClassification.from_pretrained(checkpoint)
        trainer = Trainer(model=model)
        tokenizer = AutoTokenizer.from_pretrained(checkpoint)
        _LOADED_BUNDLE["bundle"] = (trainer, model, tokenizer)
    return _LOADED_BUNDLE["bundle"]
28
-
29
def align_word_ids(texts, tokenizer=None, label_all_tokens=False):
    """Build a per-token mask marking which tokens carry a word-level label.

    The text is tokenized with the same padding/truncation settings used for
    prediction (``max_length=218``), then each token position is mapped to:

    * ``-100`` when its word id is ``None`` (e.g. special or padding tokens),
    * ``1`` when it is the first token of a word,
    * ``1`` if ``label_all_tokens`` else ``-100`` for the remaining tokens of
      the same word.

    Args:
        texts: Input passed straight to the tokenizer.
        tokenizer: Tokenizer to use. When ``None`` it is obtained from
            ``load_model()``.
        label_all_tokens: Whether continuation tokens of a word are marked
            too. Defaults to ``False``, which matches the previous effective
            behavior (the name used to be undefined and the resulting error
            was swallowed, yielding ``-100``).

    Returns:
        A list of ints (``1`` or ``-100``), one per token position.
    """
    if tokenizer is None:
        _, _, tokenizer = load_model()
    tokenized_inputs = tokenizer(texts, padding='max_length', max_length=218, truncation=True)

    label_ids = []
    previous_word_idx = None
    for word_idx in tokenized_inputs.word_ids():
        if word_idx is None:
            label_ids.append(-100)
        elif word_idx != previous_word_idx:
            # First token of a new word: this position carries the label.
            label_ids.append(1)
        else:
            # Continuation token of the current word.
            label_ids.append(1 if label_all_tokens else -100)
        previous_word_idx = word_idx
    return label_ids
50
-
51
def predict_ner_labels(model, tokenizer, sentence):
    """Predict one NER tag per word of *sentence*.

    The sentence is tokenized (padded/truncated to 218 tokens), run through
    the model, and only the logits at positions that ``align_word_ids`` marks
    as word starts are kept. Each kept position's arg-max class id is mapped
    to its tag name through ``id2tag``.

    Args:
        model: Token-classification model; moved to CUDA when available.
        tokenizer: Tokenizer matching the model.
        sentence: The raw input sentence.

    Returns:
        A list of tag strings, one per kept token position.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)

    encoded = tokenizer(sentence, padding='max_length', max_length=218, truncation=True, return_tensors="pt")
    attention_mask = encoded['attention_mask'].to(device)
    input_ids = encoded['input_ids'].to(device)
    # Shape (1, seq_len) so it lines up with the batch dimension of the logits.
    label_ids = torch.tensor(align_word_ids(sentence)).unsqueeze(0).to(device)

    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)

    # outputs[0] holds the logits; keep only the word-start positions.
    word_logits = outputs[0][label_ids != -100]
    predictions = word_logits.argmax(dim=1).tolist()
    return [id2tag[i] for i in predictions]
65
-
66
# Maps the model's class ids to their NER tag names.
id2tag = {
    0: 'O',
    1: 'B-LOC',
    2: 'B-PER',
    3: 'I-PER',
    4: 'B-ORG',
    5: 'I-DATE',
    6: 'B-DATE',
    7: 'I-ORG',
    8: 'I-LOC',
}
67
-
68
def tag_sentence(text):
    """Tag *text* and return a DataFrame pairing each word with its tag.

    Words come from ``text.split()``; tags come from ``predict_ner_labels``
    using the model and tokenizer returned by ``load_model``.
    """
    _trainer, ner_model, ner_tokenizer = load_model()
    tags = predict_ner_labels(ner_model, ner_tokenizer, text)
    # Build a DataFrame with the "words" and "tags" columns.
    return pd.DataFrame({'words': text.split(), 'tags': tags})
74
-
75
st.title("📘 Named Entity Recognition Wolof")

# Input form: the sentence is only processed once the submit button is pressed.
with st.form(key='my_form'):
    x1 = st.text_input(label='Enter a sentence:', max_chars=250)
    submit_button = st.form_submit_button(label='🏷️ Create tags')

if submit_button:
    if not x1.strip():
        # Empty or whitespace-only input.
        st.error('Please enter a non-empty sentence.')
    elif re.match(r'\A\s*\w+\s*\Z', x1):
        # The pattern matches exactly one word, so ask for more than one.
        st.error("Please enter a sentence with at least two words.")
    else:
        st.markdown("### Tagged Sentence")
        st.header("")
        results = tag_sentence(x1)

        # The outer columns only act as horizontal spacers.
        _, c1, c2, c3, _ = st.columns([0.75, 1.5, 1.5, 1.5, 0.75])
        # The .csv and .txt downloads share the same CSV payload.
        csv_bytes = convert_df(results)
        with c1:
            st.download_button(label="📥 Download .csv", data=csv_bytes,
                               file_name="results.csv", mime='text/csv', key='csv')
        with c2:
            st.download_button(label="📥 Download .txt", data=csv_bytes,
                               file_name="results.txt", mime='text/plain', key='text')
        with c3:
            st.download_button(label="📥 Download .json", data=convert_json(results),
                               file_name="results.json", mime='application/json', key='json')

        st.header("")
        # Show the table in the wide middle column.
        c1, c2, c3 = st.columns([1, 3, 1])
        with c2:
            st.table(results[['words', 'tags']])
104
-
105
# Three empty headers act as vertical spacing before the "About" box.
for _ in range(3):
    st.header("")

with st.expander("ℹ️ - About this app", expanded=True):
    # Entity names follow the tag set (LOC, PER, ORG, DATE); the tokenizer
    # line names SentencePiece, which is what XLM-RoBERTa uses.
    about_lines = [
        "- The **Named Entity Recognition Wolof** app is a tool that performs named entity recognition in Wolof.",
        "- The available entities are: *organization*, *location*, *person*, and *date*.",
        "- The app uses the [XLMRoberta model](https://huggingface.co/xlm-roberta-base), fine-tuned on the [masakhaNER](https://huggingface.co/datasets/masakhane/masakhaner2) dataset.",
        "- The model uses the **SentencePiece tokenizer**. Each sentence is first tokenized.",
    ]
    st.write("\n".join(about_lines))