Spaces:
Sleeping
Sleeping
Nick Sorros
committed on
Commit
•
b493a01
1
Parent(s):
cacf814
Tag more grants and implement most common
Browse files
- app.py +18 -11
- preprocess.py +8 -1
- tag.py +2 -2
- tagged_grants.jsonl +0 -0
app.py
CHANGED
@@ -1,39 +1,46 @@
|
|
|
|
1 |
import streamlit as st
|
2 |
import srsly
|
3 |
|
|
|
4 |
def search(query):
|
5 |
results = []
|
6 |
for grant in grants:
|
7 |
if query in grant["tags"]:
|
8 |
-
results.append({
|
9 |
-
"title": grant["title"],
|
10 |
-
"tags": grant["tags"]
|
11 |
-
})
|
12 |
st.session_state["results"] = results
|
13 |
|
|
|
14 |
st.header("Search π grants using MeSH π")
|
15 |
st.sidebar.header("Information βΉ")
|
16 |
-
st.sidebar.write(
|
|
|
|
|
17 |
st.sidebar.write("The grants data can be found https://www.threesixtygiving.org/")
|
18 |
-
st.sidebar.write(
|
|
|
|
|
19 |
|
20 |
if "grants" not in st.session_state:
|
21 |
st.session_state["grants"] = list(srsly.read_jsonl("tagged_grants.jsonl"))
|
22 |
|
23 |
grants = st.session_state["grants"]
|
24 |
|
25 |
-
query = st.text_input("", value="
|
26 |
st.button("Search π", on_click=search, kwargs={"query": query})
|
27 |
|
28 |
if "results" in st.session_state:
|
29 |
st.caption("Related MeSH terms")
|
30 |
-
|
|
|
|
|
|
|
31 |
columns = st.columns(5)
|
32 |
for row_i in range(3):
|
33 |
for col_i, col in enumerate(columns):
|
34 |
with col:
|
35 |
-
tag_i = row_i*5 + col_i
|
36 |
-
if tag_i < len(
|
37 |
-
tag =
|
38 |
st.button(tag, on_click=search, kwargs={"query": tag})
|
39 |
st.table(st.session_state["results"])
|
|
|
1 |
+
from collections import Counter
|
2 |
import streamlit as st
|
3 |
import srsly
|
4 |
|
5 |
+
|
6 |
def search(query):
|
7 |
results = []
|
8 |
for grant in grants:
|
9 |
if query in grant["tags"]:
|
10 |
+
results.append({"title": grant["title"], "tags": grant["tags"]})
|
|
|
|
|
|
|
11 |
st.session_state["results"] = results
|
12 |
|
13 |
+
|
14 |
st.header("Search π grants using MeSH π")
|
15 |
st.sidebar.header("Information βΉ")
|
16 |
+
st.sidebar.write(
|
17 |
+
"A complete list of MeSH tags can be found here https://meshb.nlm.nih.gov/treeView"
|
18 |
+
)
|
19 |
st.sidebar.write("The grants data can be found https://www.threesixtygiving.org/")
|
20 |
+
st.sidebar.write(
|
21 |
+
"The model used to tag grants is https://huggingface.co/Wellcome/WellcomeBertMesh"
|
22 |
+
)
|
23 |
|
24 |
if "grants" not in st.session_state:
|
25 |
st.session_state["grants"] = list(srsly.read_jsonl("tagged_grants.jsonl"))
|
26 |
|
27 |
grants = st.session_state["grants"]
|
28 |
|
29 |
+
query = st.text_input("", value="Malaria")
|
30 |
st.button("Search π", on_click=search, kwargs={"query": query})
|
31 |
|
32 |
if "results" in st.session_state:
|
33 |
st.caption("Related MeSH terms")
|
34 |
+
|
35 |
+
retrieved_tags = [tag for res in st.session_state["results"] for tag in res["tags"]]
|
36 |
+
most_common_tags = [tag for tag, _ in Counter(retrieved_tags).most_common(20)]
|
37 |
+
|
38 |
columns = st.columns(5)
|
39 |
for row_i in range(3):
|
40 |
for col_i, col in enumerate(columns):
|
41 |
with col:
|
42 |
+
tag_i = row_i * 5 + col_i
|
43 |
+
if tag_i < len(most_common_tags):
|
44 |
+
tag = most_common_tags[tag_i]
|
45 |
st.button(tag, on_click=search, kwargs={"query": tag})
|
46 |
st.table(st.session_state["results"])
|
preprocess.py
CHANGED
@@ -3,14 +3,21 @@ import json
|
|
3 |
from tqdm import tqdm
|
4 |
import typer
|
5 |
|
|
|
6 |
def preprocess(data_path, processed_data_path):
|
7 |
with open(data_path) as f:
|
8 |
data = json.loads(f.read())
|
9 |
|
10 |
with open(processed_data_path, "w") as f:
|
11 |
for grant in tqdm(data["grants"]):
|
12 |
-
if any(
|
|
|
|
|
|
|
|
|
|
|
13 |
f.write(json.dumps(grant) + "\n")
|
14 |
|
|
|
15 |
if __name__ == "__main__":
|
16 |
typer.run(preprocess)
|
|
|
3 |
from tqdm import tqdm
|
4 |
import typer
|
5 |
|
6 |
+
|
7 |
def preprocess(data_path, processed_data_path):
|
8 |
with open(data_path) as f:
|
9 |
data = json.loads(f.read())
|
10 |
|
11 |
with open(processed_data_path, "w") as f:
|
12 |
for grant in tqdm(data["grants"]):
|
13 |
+
if any(
|
14 |
+
[
|
15 |
+
org["name"] == "The Wellcome Trust"
|
16 |
+
for org in grant["fundingOrganization"]
|
17 |
+
]
|
18 |
+
):
|
19 |
f.write(json.dumps(grant) + "\n")
|
20 |
|
21 |
+
|
22 |
if __name__ == "__main__":
|
23 |
typer.run(preprocess)
|
tag.py
CHANGED
@@ -24,13 +24,13 @@ def tag(data_path, tagged_data_path, sample_size: int = 10):
|
|
24 |
|
25 |
texts = [grant["title_and_description"] for grant in data]
|
26 |
for batch_index in tqdm(range(0, len(texts), 10)):
|
27 |
-
batch_texts = texts[batch_index:batch_index+10]
|
28 |
|
29 |
inputs = tokenizer(batch_texts, padding="max_length")
|
30 |
labels = model(**inputs, return_labels=True)
|
31 |
|
32 |
for i, tags in enumerate(labels):
|
33 |
-
data[batch_index+i]["tags"] = tags
|
34 |
|
35 |
srsly.write_jsonl(tagged_data_path, data)
|
36 |
|
|
|
24 |
|
25 |
texts = [grant["title_and_description"] for grant in data]
|
26 |
for batch_index in tqdm(range(0, len(texts), 10)):
|
27 |
+
batch_texts = texts[batch_index : batch_index + 10]
|
28 |
|
29 |
inputs = tokenizer(batch_texts, padding="max_length")
|
30 |
labels = model(**inputs, return_labels=True)
|
31 |
|
32 |
for i, tags in enumerate(labels):
|
33 |
+
data[batch_index + i]["tags"] = tags
|
34 |
|
35 |
srsly.write_jsonl(tagged_data_path, data)
|
36 |
|
tagged_grants.jsonl
CHANGED
The diff for this file is too large to render.
See raw diff
|
|