File size: 3,208 Bytes
c6f0cd5
 
 
 
 
8080ccc
 
 
c6f0cd5
 
87a44f6
c6f0cd5
 
 
 
 
 
 
 
 
 
 
8080ccc
 
c6f0cd5
8080ccc
 
 
 
 
 
 
 
 
 
c6f0cd5
8080ccc
 
 
c6f0cd5
 
8080ccc
 
 
 
 
 
 
c6f0cd5
8080ccc
 
c6f0cd5
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60

import streamlit as st
from transformers import T5ForConditionalGeneration, T5TokenizerFast, T5Config


# Demo paragraph shown in the text area the first time the script runs.
DEFAULT_TEXTBOX_CONTENT = "We are not using punctation or capital letters when we are speaking. Actually we hardly even use pauses between words. Still we are able to make sense out of the text. This demo shows that a T5-model is able to reconstruct the sentences, even if these parts of the text is removed. Try removing spaces, capital letters and puctation from this text, and then see if the model can recpnstruct it. Did it work?"

# Seed the 'textbox' session entry only when it is absent, so an existing
# value is never overwritten by this default.
if 'textbox' not in st.session_state:
    st.session_state['textbox'] = DEFAULT_TEXTBOX_CONTENT
 
# NOTE(review): newer Streamlit releases deprecate st.cache in favour of
# st.cache_resource -- confirm the pinned Streamlit version before migrating.
@st.cache(allow_output_mutation=True, suppress_st_warning=True)
def load_model():
    """Load the sentence-fixing checkpoint and its tokenizer.

    The config, model weights and fast tokenizer are all fetched via
    ``from_pretrained`` for the same checkpoint name. The function is
    wrapped in ``st.cache`` so the result can be reused instead of reloaded.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    checkpoint = "pere/multi-sentencefix-mt5-large"
    model_config = T5Config.from_pretrained(checkpoint)
    tokenizer = T5TokenizerFast.from_pretrained(checkpoint)
    model = T5ForConditionalGeneration.from_pretrained(
        checkpoint,
        config=model_config,
    )
    return model, tokenizer

def deuncase(model, tokenizer, text):
    """Run ``text`` through the tokenizer and model and decode the output.

    Args:
        model: Object exposing ``generate(**inputs)``.
        tokenizer: Callable that encodes ``text`` (called with
            ``return_tensors="pt"``) and exposes ``batch_decode``.
        text: The input string to restore.

    Returns:
        Whatever ``tokenizer.batch_decode`` returns for the generated
        tokens, with special tokens skipped.
    """
    model_inputs = tokenizer(text, return_tensors="pt")
    # No generation arguments are passed, so the model's own generation
    # settings apply.
    output_ids = model.generate(**model_inputs)
    decoded = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
    return decoded
def uncase():
    """Replace the 'textbox' session entry with its lower-cased form."""
    current_text = st.session_state['textbox']
    st.session_state['textbox'] = current_text.lower()

def unpunct():
    """Delete punctuation characters and newlines from the 'textbox' entry."""
    # Every character listed here is removed outright (no replacement).
    chars_to_drop = "'\",.:;-_*?/\n"
    deletion_table = str.maketrans("", "", chars_to_drop)
    current_text = st.session_state['textbox']
    st.session_state['textbox'] = current_text.translate(deletion_table)
    
def unspace():
    """Remove every space character from the 'textbox' session entry."""
    # Splitting on a literal space and re-joining drops exactly the spaces;
    # other whitespace is left alone.
    pieces = st.session_state['textbox'].split(" ")
    st.session_state['textbox'] = "".join(pieces)
    
def sidebar_callback():
    """Copy the value stored under 'prefilled' into the 'textbox' entry."""
    chosen_example = st.session_state['prefilled']
    st.session_state['textbox'] = chosen_example

# Page title plus the sidebar introduction.
st.title("DeUnCaser")
st.sidebar.write("This web app adds spaces, punctation and capitalisation back into the text.")
st.sidebar.write("You can use the examples below, but too really test the effect of the model: Write or copy text from the Internet, and then manually remove spaces, puctation, cases etc. Try to restore the text.")

# Ready-made inputs offered in the sidebar selector.
example_texts = (
    "This model is crated by Per Egil Kummervold. This is not a common name in English. Do you think the model will understand that it is a name?",
    "Some Norwegian text: Vi bruker ikke tegnsetting eller store bokstaver når vi prater. Vi slår også sammen ord, og i praksis er dermed heller ikke mellomrom meningsbærende. Prøv å fjerne tegnsetting, store bokstaver og mellomrom fra dette avsnittet. Se om den nye North-T5-modellen greier å sette sammen til et nytt meningsbærende avsnitt.",
    "areyouabletoreadthistextwillthemodelbeableto",
)

# The selection is stored under the 'prefilled' key; sidebar_callback runs
# when the selection changes and copies it into the text area's entry.
option = st.sidebar.selectbox(
    "Examples:",
    example_texts,
    key='prefilled',
    on_change=sidebar_callback,
)

# Create the main-page slot for the text area before the sidebar tool
# buttons are declared; the text area is filled into it below.
placeholder = st.empty()

st.sidebar.write("\nText Tools:")
# Each button rewrites the 'textbox' session entry through its callback.
for tool_label, tool_handler in (
    ('Remove Punctation', unpunct),
    ('Remove Casing', uncase),
    ('Remove Spaces', unspace),
):
    st.sidebar.button(tool_label, on_click=tool_handler)

with placeholder:
    # key="textbox" binds the widget to the session entry the tools modify.
    text = st.text_area(
        "Input text",
        max_chars=1000,
        height=140,
        key="textbox",
    )

run = st.button('Run DeUnCaser')

# Only load the model and run inference when the button was pressed.
if run:
    model, tokenizer = load_model()
    translated_text = deuncase(model, tokenizer, text)
    if translated_text:
        # Show the first decoded sequence.
        st.write(translated_text[0])
    else:
        st.write("Unknown Error Translating Text")