File size: 2,871 Bytes
2981a00
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
782dfba
2981a00
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4a6ad68
2981a00
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
import requests
import streamlit as st
import torch
from langdetect import detect
from langdetect.lang_detect_exception import LangDetectException
from newspaper import Article
from newspaper.article import ArticleException
from PIL import Image
from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer
from transformers import FSMTForConditionalGeneration, FSMTTokenizer

# Page header: title followed by the logo image read from the working directory.
st.markdown("## Prediction of Fakeness by Given URL")
st.image(Image.open('logo.jpg'))

# URL input box, pre-filled with a sample article address.
st.markdown("### Article URL")
text = st.text_area(
    "Insert some url here",
    value="https://en.globes.co.il/en/article-yandex-looks-to-expand-activities-in-israel-1001406519",
)

# NOTE(review): newer Streamlit releases deprecate st.cache in favour of
# st.cache_resource -- confirm against the pinned Streamlit version.
@st.cache(allow_output_mutation=True)
def get_models_and_tokenizers():
    """Build the classifier and the RU->EN translator with their tokenizers.

    The classifier starts from the
    ``distilbert-base-uncased-finetuned-sst-2-english`` checkpoint with two
    labels and then has its weights replaced by the state dict stored in
    ``./model.pth`` (loaded onto the CPU). The translator is the
    ``facebook/wmt19-ru-en`` FSMT checkpoint. Both models are switched to
    evaluation mode.

    Returns:
        A 4-tuple ``(model, tokenizer, model_translator,
        tokenizer_translator)``.
    """
    classifier_checkpoint = 'distilbert-base-uncased-finetuned-sst-2-english'
    translator_checkpoint = "facebook/wmt19-ru-en"

    # Classifier: base checkpoint first, then overwrite with the local weights.
    classifier = AutoModelForSequenceClassification.from_pretrained(
        classifier_checkpoint, num_labels=2
    )
    classifier.eval()
    classifier_tokenizer = AutoTokenizer.from_pretrained(classifier_checkpoint)
    # NOTE(review): torch.load unpickles the file; only safe for a trusted model.pth.
    local_state = torch.load('./model.pth', map_location='cpu')
    classifier.load_state_dict(local_state)

    # Translator used for articles detected as Russian.
    translator_tokenizer = FSMTTokenizer.from_pretrained(translator_checkpoint)
    translator = FSMTForConditionalGeneration.from_pretrained(translator_checkpoint)
    translator.eval()

    return classifier, classifier_tokenizer, translator, translator_tokenizer

# Obtain the classifier, the translator and their tokenizers.
model, tokenizer, model_translator, tokenizer_translator = get_models_and_tokenizers()

# The text area can return surrounding whitespace or a trailing newline;
# strip it so the downloader receives a clean URL, and bail out early when
# nothing was entered instead of attempting a download.
url = text.strip()
if not url:
    st.warning("Please insert an article URL.")
    st.stop()

# Download and parse the article. Failures are reported by newspaper as
# ArticleException; show a readable message rather than a raw traceback.
article = Article(url)
try:
    article.download()
    article.parse()
except ArticleException as err:
    st.error(f"Could not download or parse the article: {err}")
    st.stop()

# Title and body are classified together as a single piece of text.
concated_text = article.title + '. ' + article.text

# langdetect raises when the text carries no usable features (for example an
# empty page), so guard the call and stop the run with an explanation.
try:
    lang = detect(concated_text)
except LangDetectException:
    st.error("Could not detect the language: no readable text was extracted from this URL.")
    st.stop()

st.markdown("### Language detection")

if lang != 'ru':
    # Any non-Russian article is passed on untouched; show a preview of it.
    st.markdown(f"The language of this article for sure:  {lang.upper()}!")

    st.markdown("### Extracted Text")
    st.markdown(concated_text[:777])
else:
    # Russian article: translate to English before classification.
    with st.spinner('Waiting for translation'):
        st.markdown(f"The language of this article is {lang.upper()} so we translated it!")
        # Input is truncated to 512 tokens before being fed to the translator.
        source_ids = tokenizer_translator.encode(
            concated_text,
            return_tensors="pt",
            max_length=512,
            truncation=True,
        )
        generated = model_translator.generate(source_ids)
        translation = tokenizer_translator.decode(generated[0], skip_special_tokens=True)
        st.markdown("### Translated Text")
        st.markdown(translation[:777])
        # From here on the translated text replaces the original article text.
        concated_text = translation

# Tokenize the (possibly translated) text and run the classifier without
# tracking gradients.
encoded = tokenizer(concated_text, truncation=True, return_tensors="pt")
with torch.no_grad():
    logits = model(**encoded).logits

# Softmax over the logits of the single input; the probability at index 1 is
# reported as the "fake" score, truncated to a whole percentage.
class_probs = torch.nn.functional.softmax(logits[0], dim=0)
fake_percent = int(class_probs[1] * 100)

st.markdown("### Fakeness Prediction")
st.progress(fake_percent)
st.markdown(f"This is fake by **{fake_percent}%**!")

# Verdict bands: up to 40 -> trusted, 41..70 -> unsure, above 70 -> not trusted.
if fake_percent <= 40:
    st.success('We would trust this text!')
elif fake_percent <= 70:
    st.warning('We are not sure about this text!')
else:
    st.error('We would not trust this text!')