# DeUnCaser: a small Streamlit front end around a T5 checkpoint that puts
# punctuation and capitalisation back into text. The sidebar offers example
# texts and two helper buttons that strip casing / punctuation from the text
# box, so the user can then ask the model to restore them.
#
# NOTE(review): several user-facing strings below contain spelling slips
# ("punctation", "too really", "crated"). They are reproduced unchanged here so
# that the displayed copy and widget labels stay exactly as they were.
import streamlit as st
from transformers import T5Config, T5ForConditionalGeneration, T5TokenizerFast

_MODEL_NAME = "pere/multi-sentencefix-mt5-large"

# Text shown in the input box the first time a session is opened.
_DEFAULT_TEXT = (
    "We are not using punctation or capital letters when we are speaking. "
    "Still we are able to make sense out of the text. "
    "This demo shows that a T5-model is able to reconstruct the sentences, "
    "even if these parts of the text is removed. "
    "Try removing capital letters and punctation from this text, "
    "and then see if the model can reconstruct it. "
    "Did it work?"
)

# Entries of the "Examples:" select box. The first entry contains an embedded
# line break, written here as an escape sequence.
_EXAMPLES = (
    "This model is crated by Per Egil Kummervold. "
    "This is not a common name in English. \n"
    "Do you think the model will understand that it is a name?",
    "Some Norwegian text: Vi bruker ikke tegnsetting eller store bokstaver "
    "når vi prater. Prøv å fjerne tegnsetting og store bokstaver. "
    "Se om denne T5-modellen greier å sette sammen til et nytt "
    "meningsbærende avsnitt.",
)

# Characters deleted by the "Remove Punctation" button (newlines included).
# The translation table is constant, so it is built once instead of per click.
_PUNCTUATION_CHARS = "'\",.:;-_*?/\n"
_PUNCTUATION_TABLE = str.maketrans("", "", _PUNCTUATION_CHARS)

# Seed the text box only once per session; afterwards the widget and the
# sidebar callbacks own the value stored under the "textbox" key.
if "textbox" not in st.session_state:
    st.session_state["textbox"] = _DEFAULT_TEXT

# `st.cache` is the legacy caching decorator; newer Streamlit releases provide
# `st.cache_resource` for shared, unserialised objects such as models. Prefer
# it when present and fall back to the original call on older installations.
if hasattr(st, "cache_resource"):
    _cache_model = st.cache_resource
else:
    _cache_model = st.cache(allow_output_mutation=True, suppress_st_warning=True)


@_cache_model
def load_model():
    """Load the pretrained model and tokenizer, cached across script reruns.

    Returns:
        A ``(model, tokenizer)`` tuple for the checkpoint named by
        ``_MODEL_NAME``.
    """
    config = T5Config.from_pretrained(_MODEL_NAME)
    model = T5ForConditionalGeneration.from_pretrained(_MODEL_NAME, config=config)
    tokenizer = T5TokenizerFast.from_pretrained(_MODEL_NAME)
    return (model, tokenizer)


def deuncase(model, tokenizer, text):
    """Run ``text`` through the model and return the decoded output strings.

    Args:
        model: Object exposing ``generate(**inputs)``.
        tokenizer: Callable that encodes ``text`` and exposes ``batch_decode``.
        text: The input text to restore.

    Returns:
        The list produced by ``tokenizer.batch_decode`` with special tokens
        skipped.
    """
    encoded_txt = tokenizer(text, return_tensors="pt")
    # NOTE(review): no length limit is passed to generate(), so the output
    # length depends on the checkpoint's generation defaults -- confirm they
    # are large enough for inputs of up to 1000 characters.
    generated_tokens = model.generate(**encoded_txt)
    return tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)


def uncase():
    """Button callback: lower-case the text currently in the text box."""
    st.session_state["textbox"] = st.session_state["textbox"].lower()


def unpunct():
    """Button callback: delete every ``_PUNCTUATION_CHARS`` character from the text box."""
    st.session_state["textbox"] = st.session_state["textbox"].translate(
        _PUNCTUATION_TABLE
    )


def unspace():
    """Remove every space character from the text in the text box."""
    st.session_state["textbox"] = st.session_state["textbox"].replace(" ", "")


def sidebar_callback():
    """Select-box callback: copy the chosen example into the text box."""
    st.session_state["textbox"] = st.session_state["prefilled"]


st.title("DeUnCaser")

st.sidebar.write("This web app adds punctation and capitalisation back into the text.")
st.sidebar.write(
    "You can use the examples below, but too really test the effect of the model: "
    "Write or copy text from the Internet, and then manually remove punctation "
    "and casing etc. Try to restore the text."
)

option = st.sidebar.selectbox(
    "Examples:",
    _EXAMPLES,
    key="prefilled",
    on_change=sidebar_callback,
)

# Reserve the main-area slot for the text box before the sidebar tools are
# declared; the text area itself is rendered into this slot further down.
placeholder = st.empty()

st.sidebar.write("\nText Tools:")
st.sidebar.button('Remove Punctation', on_click=unpunct)
st.sidebar.button('Remove Casing', on_click=uncase)

with placeholder:
    text = st.text_area("Input text", max_chars=1000, height=140, key="textbox")

run = st.button('Run DeUnCaser')

if run:
    if not text or not text.strip():
        # Nothing to restore: avoid loading and running the model on empty input.
        st.warning("Please enter some text before running DeUnCaser.")
    else:
        model, tokenizer = load_model()
        translated_text = deuncase(model, tokenizer, text)
        st.write(
            translated_text[0]
            if translated_text
            else "Unknown Error Translating Text"
        )