# AleksBlacky's picture
# update - more checking user input
# a432184
import streamlit as st
import transformers
import pickle
import seaborn as sns
from pandas import DataFrame
from transformers import AutoTokenizer, AutoModelForSequenceClassification
# Page header, rendered on every run of the script.
st.markdown("# Hello, friend!")
st.markdown(" This magic application going to help you with understanding of science paper topic! Cool? Yeah! ")

# Load the tokenizer and the mapping from class index to topic name.
# NOTE(review): pickle.load runs arbitrary code embedded in the file; this is
# only safe while decode_dict.pkl is a trusted artifact shipped with the app.
model_name_global = "allenai/scibert_scivocab_uncased"
try:
    tokenizer_ = AutoTokenizer.from_pretrained(model_name_global)
    with open('./models/scibert/decode_dict.pkl', 'rb') as f:
        decode_dict = pickle.load(f)
except (OSError, ValueError, EOFError, pickle.UnpicklingError):
    # OSError covers a missing/unreadable pickle file and is also how
    # transformers reports tokenizer files it cannot locate or download;
    # EOFError/UnpicklingError cover a truncated or corrupt pickle.
    st.error("Load tokenizer or decode answer dict goes wrong! Pls contact author alekseystepin13@gmail.com")
    # Everything below needs tokenizer_ and decode_dict, so end this run here
    # instead of failing later with a NameError.
    st.stop()
import re

# Upper bounds (in words) on what is taken from each text box.
MAX_WORDS_TITLE, MAX_WORDS_ABSTRACT = 50, 500


def _first_words(text, max_words):
    """Return ``text`` cut right after its ``max_words``-th word.

    A "word" is a ``\\w+`` match, the same definition used for the word
    counts below. Text with fewer words is returned unchanged. ``max_words``
    is expected to be a positive integer.
    """
    for position, match in enumerate(re.finditer(r"\w+", text), start=1):
        if position == max_words:
            return text[: match.end()]
    return text


# Input form: collects the title and abstract, warns about over-long input and
# trims it to the advertised number of words.
with st.form(key="my_form"):
    st.markdown("### 🎈 Do you want a little magic? ")
    st.markdown(" Write your article title and abstract to textboxes bellow and I'll gues topic of your paper! ")

    # Narrow outer columns act as margins; the widgets live in the middle one.
    ce, c2, c3 = st.columns([0.07, 7, 0.07])
    with c2:
        doc_title = st.text_area(
            "Paste your abstract title below (1 to 50 words)",
            height=210,
        )
        doc_abstract = st.text_area(
            "Paste your abstract text below (1 to 500 words)",
            height=410,
        )

        # Word counts of the raw input; they are also used after the form to
        # reject empty fields, so they are not recomputed after trimming.
        len_title = len(re.findall(r"\w+", doc_title))
        len_abstract = len(re.findall(r"\w+", doc_abstract))

        if len_title > MAX_WORDS_TITLE:
            st.warning(
                f"⚠️ Your title contains {len_title} words."
                f" Only the first {MAX_WORDS_TITLE} words will be reviewed."
                " Stay tuned as increased allowance is coming! 😊"
            )
            # Trim by words (a plain slice would count characters instead).
            doc_title = _first_words(doc_title, MAX_WORDS_TITLE)

        if len_abstract > MAX_WORDS_ABSTRACT:
            st.warning(
                f"⚠️ Your abstract contains {len_abstract} words."
                f" Only the first {MAX_WORDS_ABSTRACT} words will be reviewed."
                " Stay tuned as increased allowance is coming! 😊"
            )
            doc_abstract = _first_words(doc_abstract, MAX_WORDS_ABSTRACT)

        submit_button = st.form_submit_button(label="✨ Let's play, try it!")
# Nothing to do until the user presses the submit button.
if not submit_button:
    st.stop()

# Reject a submission whose title or abstract contains no words at all;
# the title is checked first, and the first failing field ends the run.
for word_count, complaint in (
    (len_title, "Article without any words in title? Pls give me correct title!"),
    (len_abstract, "Article without any words in abstract? Pls give me correct abstract!"),
):
    if word_count < 1:
        st.error(complaint)
        st.stop()
# allow_output_mutation=True tells Streamlit not to hash the returned model
# object on cache hits; the model is only read from, never modified here.
# suppress_st_warning=True permits the st.write call inside the cached function.
@st.cache(suppress_st_warning=True, allow_output_mutation=True)
def load_model():
    """Load the sequence-classification model stored under ``models/scibert/``.

    The result is cached by Streamlit, so the "Loading big model" notice is
    written only when the model is actually loaded.

    Returns:
        The model returned by
        ``AutoModelForSequenceClassification.from_pretrained``.
    """
    st.write("Loading big model")
    return AutoModelForSequenceClassification.from_pretrained("models/scibert/")
def make_predict(tokens, decode_dict, *, model=None, threshold=0.1):
    """Return the topics whose predicted probability exceeds ``threshold``.

    Args:
        tokens: Tokenizer output exposing an ``input_ids`` tensor; only the
            first row of the resulting logits is used.
        decode_dict: Mapping from class index to topic name.
        model: Optional callable taking ``input_ids`` and returning a mapping
            with a ``"logits"`` tensor. Defaults to ``load_model()``.
        threshold: Topics with probability strictly above this value are kept.

    Returns:
        dict mapping topic name to probability, in class-index order. May be
        empty when no class clears the threshold.
    """
    import torch  # local import: only this function uses torch directly

    model_ = load_model() if model is None else model

    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        outs = model_(tokens.input_ids)

    probs = outs["logits"].softmax(dim=-1).tolist()[0]
    return {decode_dict[i]: p for i, p in enumerate(probs) if p > threshold}
# Tokenise the submitted text, run the classifier and render the topic table.

# Longest token sequence handed to the model.
# NOTE(review): 512 is the usual position limit of BERT-style encoders such as
# SciBERT - confirm against the config of the model in models/scibert/.
MAX_TOKENS = 512

try:
    # NOTE(review): title and abstract are joined without a separator, so the
    # last title word fuses with the first abstract word. Left as is because
    # the training-time preprocessing is not visible here - verify.
    tokens = tokenizer_(
        doc_title + doc_abstract,
        return_tensors="pt",
        truncation=True,  # over-long input is cut instead of overflowing the model
        max_length=MAX_TOKENS,
    )
except ValueError:
    st.error("Word parsing into tokens went wrong! Is input valid? If yes, pls contact author alekseystepin13@gmail.com")
    # Without tokens there is nothing to predict on; end this run.
    st.stop()

predicts = make_predict(tokens, decode_dict)

st.markdown("## 🎈 Yor article probably about: ")
st.header("")

if not predicts:
    # make_predict only keeps topics above its probability threshold, so the
    # result can be empty; say so rather than showing an empty table.
    st.warning("No topic is confident enough to be shown for this text.")
    st.stop()

# One row per topic, most probable first, numbered from 1.
df = (
    DataFrame(list(predicts.items()), columns=["Topic", "Prob"])
    .sort_values(by="Prob", ascending=False)
    .reset_index(drop=True)
)
df.index += 1

# Add styling: green gradient on the probability column, shown as a percentage.
cmGreen = sns.light_palette("green", as_cmap=True)
format_dictionary = {
    "Prob": "{:.1%}",
}
df = df.style.background_gradient(
    cmap=cmGreen,
    subset=[
        "Prob",
    ],
).format(format_dictionary)

# Centre the table in the wide middle column.
c1, c2, c3 = st.columns([1, 3, 1])
with c2:
    st.table(df)