istassiy committed on
Commit
4355387
·
1 Parent(s): f232b20

commit from

Browse files
Files changed (1) hide show
  1. app.py +77 -0
app.py ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import numpy as np
3
+ import torch
4
+
5
# Hugging Face Hub checkpoint the app serves predictions from.
my_model_name = "istassiy/ysda_2022_ml2_hw3_distilbert_base_uncased"

# All arXiv primary-category codes that map onto the single "physics" topic.
_PHYSICS_CODES = (
    'astro-ph', 'cond-mat', 'gr-qc', 'hep-ex', 'hep-lat', 'hep-ph',
    'hep-th', 'math-ph', 'nlin', 'nucl-ex', 'nucl-th', 'quant-ph', 'physics',
)

# arXiv category code -> human-readable topic name.
arxiv_code_to_topic = {
    'cs': 'computer science',
    'q-bio': 'biology',
    'q-fin': 'finance',
}
arxiv_code_to_topic.update({code: 'physics' for code in _PHYSICS_CODES})
arxiv_code_to_topic.update({
    'eess': 'electrical engineering',
    'econ': 'economics',
    'math': 'mathematics',
    'stat': 'statistics',
})

# Distinct topic names in alphabetical order — fixes the label ordering
# that model outputs are zipped against downstream.
sorted_arxiv_topics = sorted(set(arxiv_code_to_topic.values()))

# Number of distinct classification labels.
NUM_LABELS = len(sorted_arxiv_topics)
40
+
41
@st.cache(allow_output_mutation=True)
def load_model():
    """Load the tokenizer and fine-tuned classifier from the Hugging Face Hub.

    Cached by Streamlit so the slow download/deserialization runs once per
    session. allow_output_mutation=True is required here: torch models are
    mutated in place and are not hashable, which breaks st.cache's default
    output-identity check.

    Returns:
        (tokenizer, model) tuple for ``my_model_name``.
    """
    # Bug fix: plain AutoModel loads only the bare DistilBERT encoder and
    # silently drops the fine-tuned classification head, so downstream
    # predictions would come from an untrained head. NUM_LABELS above and the
    # checkpoint name indicate this is a sequence-classification fine-tune —
    # presumably; confirm against the checkpoint's config on the Hub.
    from transformers import AutoTokenizer, AutoModelForSequenceClassification
    tokenizer = AutoTokenizer.from_pretrained(my_model_name)
    model = AutoModelForSequenceClassification.from_pretrained(my_model_name)
    model.eval()  # inference only — disable dropout
    return tokenizer, model
47
+
48
def get_top_predictions(predictions, topics=None):
    """Turn raw multi-label logits into a dict of the most probable topics.

    Applies a sigmoid to each logit, normalizes the scores to sum to 1, then
    collects topics in descending-probability order until their cumulative
    mass exceeds 0.95.

    Args:
        predictions: 1-D torch tensor of per-label logits, aligned with
            ``topics`` (length must match).
        topics: label names for each logit position; defaults to the
            module-level ``sorted_arxiv_topics``.

    Returns:
        dict mapping topic name -> normalized probability (float), covering
        just over 95% of the probability mass. Empty dict if all scores are 0.
    """
    if topics is None:
        topics = sorted_arxiv_topics
    # Bug fix: the original thresholded with sigmoid(x) > 0, which is true
    # for EVERY logit (sigmoid is strictly positive), so all labels always
    # received identical weight. Use the sigmoid scores themselves.
    scores = torch.sigmoid(predictions).detach().cpu().numpy().astype(float)
    total = scores.sum()
    if total <= 0:  # guard against division by zero on degenerate input
        return {}
    scores = scores / total

    res = {}
    cumulative = 0.0
    # "Top" predictions: iterate by descending probability (the original
    # walked labels alphabetically, truncating arbitrary topics).
    for topic, prob in sorted(zip(topics, scores), key=lambda tp: tp[1], reverse=True):
        cumulative += prob
        res[topic] = float(prob)
        if cumulative > 0.95:
            break
    return res
60
+
61
# --- Streamlit page: collect a title/abstract and show topic predictions ---
tokenizer, model = load_model()

st.markdown("# Scientific paper classificator")
st.markdown(
    "Fill in paper summary and / or title below:",
    unsafe_allow_html=False
)

paper_title = st.text_area("Paper title")
paper_summary = st.text_area("Paper abstract")

if not paper_title and not paper_summary:
    st.markdown("Must have non-empty title or summary")
else:
    # Bug fix: without return_tensors="pt" the tokenizer returns plain Python
    # lists and model(**inputs) fails; truncation keeps long abstracts within
    # the model's maximum sequence length.
    inputs = tokenizer(
        paper_title + "." + paper_summary,
        return_tensors="pt",
        truncation=True,
    )
    with torch.no_grad():  # pure inference — no gradient tracking needed
        outputs = model(**inputs)
    # Bug fix: the model returns an output object, not a tensor; take the
    # logits and drop the batch dimension before scoring.
    # NOTE(review): assumes load_model returns a sequence-classification
    # model exposing .logits — confirm against load_model's Auto class.
    raw_predictions = outputs.logits[0]
    results = get_top_predictions(raw_predictions)
    st.markdown(f"{results}")