Jan Mühlnikel
committed on
Commit
•
f123b98
1
Parent(s):
fd7cbe7
added matching functionality and viz
Browse files- __pycache__/similarity_page.cpython-310.pyc +0 -0
- functions/__pycache__/calc_matches.cpython-310.pyc +0 -0
- functions/__pycache__/filter_projects.cpython-310.pyc +0 -0
- functions/calc_matches.py +31 -0
- modules/__pycache__/result_table.cpython-310.pyc +0 -0
- modules/result_table.py +106 -51
- similarity_page.py +8 -41
__pycache__/similarity_page.cpython-310.pyc
CHANGED
Binary files a/__pycache__/similarity_page.cpython-310.pyc and b/__pycache__/similarity_page.cpython-310.pyc differ
|
|
functions/__pycache__/calc_matches.cpython-310.pyc
ADDED
Binary file (810 Bytes). View file
|
|
functions/__pycache__/filter_projects.cpython-310.pyc
ADDED
Binary file (983 Bytes). View file
|
|
functions/calc_matches.py
ADDED
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pandas as pd
|
2 |
+
import numpy as np
|
3 |
+
|
4 |
+
def calc_matches(filtered_df, project_df, similarity_matrix, top_n=60):
    """Find the most similar project pairs for a filtered set of projects.

    Rows of the similarity matrix are treated as "project 1" and columns as
    "project 2". Project 1 is restricted to the rows of ``filtered_df``;
    project 2 may be any project in ``project_df``.

    Args:
        filtered_df: Subset of ``project_df``. Its index values are used
            directly as row positions into ``similarity_matrix``.
            NOTE(review): this assumes ``project_df`` carries a default
            0..N-1 index so that labels equal positions -- confirm against
            the caller.
        project_df: All projects; looked up positionally by matrix column.
        similarity_matrix: 2-D array of pairwise similarities. It is NOT
            modified by this function.
        top_n: Maximum number of best-scoring pairs to return.

    Returns:
        A tuple ``(p1_df, p2_df)`` of equal length. Row ``i`` of ``p1_df``
        and row ``i`` of ``p2_df`` form one matched pair, and both carry the
        pair's score in a ``"similarity"`` column. Pairs are ordered from
        lowest to highest similarity, so the best match is the last row.
    """
    # Nothing to match (or nothing requested): return empty frames that still
    # expose the "similarity" column the result table selects.
    if len(filtered_df) == 0 or top_n <= 0:
        no_scores = np.array([], dtype=float)
        return (
            filtered_df.iloc[0:0].assign(similarity=no_scores),
            project_df.iloc[0:0].assign(similarity=no_scores),
        )

    # Keep only the rows that survived the filter. Integer-array indexing
    # returns a copy, so the caller's matrix is left untouched below.
    row_positions = filtered_df.index.to_numpy()
    match_matrix = np.asarray(similarity_matrix)[row_positions]

    # A project must not be matched with itself: zero out the entries that
    # lie on the diagonal of the full matrix (row i, column i).
    local_rows = np.arange(len(row_positions))
    on_diagonal = row_positions < match_matrix.shape[1]
    match_matrix[local_rows[on_diagonal], row_positions[on_diagonal]] = 0

    # Positions (row = project 1, column = project 2) of the highest
    # similarities, in ascending order of similarity.
    flat_order = np.argsort(match_matrix, axis=None)
    top_indices = np.unravel_index(flat_order[-top_n:], match_matrix.shape)

    # The corresponding similarity values.
    top_values = match_matrix[top_indices]

    # ``assign`` returns new frames, so neither input frame is written to.
    p1_df = filtered_df.iloc[top_indices[0]].assign(similarity=top_values)
    p2_df = project_df.iloc[top_indices[1]].assign(similarity=top_values)

    return p1_df, p2_df
|
28 |
+
|
29 |
+
|
30 |
+
|
31 |
+
|
modules/__pycache__/result_table.cpython-310.pyc
CHANGED
Binary files a/modules/__pycache__/result_table.cpython-310.pyc and b/modules/__pycache__/result_table.cpython-310.pyc differ
|
|
modules/result_table.py
CHANGED
@@ -1,53 +1,108 @@
|
|
1 |
import streamlit as st
|
2 |
|
3 |
-
def show_table(
|
4 |
-
|
5 |
-
|
6 |
-
|
7 |
-
|
8 |
-
|
9 |
-
|
10 |
-
|
11 |
-
|
12 |
-
|
13 |
-
|
14 |
-
|
15 |
-
|
16 |
-
|
17 |
-
|
18 |
-
|
19 |
-
|
20 |
-
|
21 |
-
|
22 |
-
|
23 |
-
|
24 |
-
|
25 |
-
|
26 |
-
|
27 |
-
|
28 |
-
|
29 |
-
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
|
34 |
-
|
35 |
-
|
36 |
-
|
37 |
-
|
38 |
-
|
39 |
-
|
40 |
-
|
41 |
-
|
42 |
-
|
43 |
-
|
44 |
-
|
45 |
-
|
46 |
-
|
47 |
-
|
48 |
-
|
49 |
-
|
50 |
-
|
51 |
-
|
52 |
-
|
53 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
import streamlit as st
|
2 |
|
3 |
+
# Columns shown in each match table, in display order.
_MATCH_TABLE_COLUMNS = [
    "title_main",
    "orga_abbreviation",
    "client",
    "description_main",
    "country",
    "sgd_pred_code",
    "crs_3_code",
    "crs_5_code",
    "similarity",
]

# Column key -> (header label, tooltip text) for the configured text columns.
# "similarity" is intentionally absent: it is rendered with Streamlit's
# default column settings.
_MATCH_TABLE_TEXT_COLUMNS = {
    "orga_abbreviation": ("Organization", "Abbreviation of the organization"),
    "client": ("Client", "Client organization of customer"),
    "title_main": (
        "Title",
        "If title not in English, title in other language provided",
    ),
    "description_main": (
        "Description",
        "If description not in English, description in other language provided",
    ),
    "country": ("Country", "Country of project"),
    "sgd_pred_code": ("SDG Prediction", "Prediction of SDG's"),
    "crs_3_code": ("CRS 3", "CRS 3 code given by organization"),
    "crs_5_code": ("CRS 5", "CRS 5 code given by organization"),
}


def _render_match_table(df):
    """Render one side of the match result as a read-only Streamlit table."""
    st.write("------------------")

    st.dataframe(
        df[_MATCH_TABLE_COLUMNS],
        use_container_width=True,
        # Grow the widget with the row count so the whole table is visible;
        # presumably 35 px per row plus one header row -- TODO confirm.
        height=35 + 35 * len(df),
        column_config={
            key: st.column_config.TextColumn(label, help=help_text, disabled=True)
            for key, (label, help_text) in _MATCH_TABLE_TEXT_COLUMNS.items()
        },
        hide_index=True,
    )


def show_table(p1_df, p2_df):
    """Show the two halves of the project matches side by side.

    Args:
        p1_df: Projects to display in the left column.
        p2_df: Projects to display in the right column.

    Both frames must contain every column listed in
    ``_MATCH_TABLE_COLUMNS``; a missing column raises ``KeyError`` from the
    column selection.
    """
    col1, col2 = st.columns([1, 1])

    with col1:
        _render_match_table(p1_df)

    with col2:
        _render_match_table(p2_df)
|
similarity_page.py
CHANGED
@@ -11,9 +11,10 @@ from scipy.sparse import load_npz
|
|
11 |
import pickle
|
12 |
import faiss
|
13 |
from sentence_transformers import SentenceTransformer
|
14 |
-
|
15 |
import modules.semantic_search as semantic_search
|
16 |
from functions.filter_projects import filter_projects
|
|
|
17 |
import psutil
|
18 |
import os
|
19 |
|
@@ -131,45 +132,11 @@ def show_page():
|
|
131 |
# CRS CODE LIST
|
132 |
crs3_list = [i[-3:] for i in crs3_option]
|
133 |
|
134 |
-
|
|
|
135 |
|
136 |
-
|
137 |
-
|
138 |
|
139 |
-
|
140 |
-
|
141 |
-
"""
|
142 |
-
#semantic_search.show_search(model, faiss_index, sentences)
|
143 |
-
|
144 |
-
df_subset = projects_df.head(10)
|
145 |
-
selected_index = st.selectbox('Select an entry', df_subset.index, format_func=lambda x: df_subset.loc[x, 'iati_id'])
|
146 |
-
|
147 |
-
st.write(selected_index)
|
148 |
-
|
149 |
-
# add index and similarity together
|
150 |
-
indecies = range(0, len(sim_matrix))
|
151 |
-
similarities = sim_matrix[selected_index]
|
152 |
-
zipped_sims = list(zip(indecies, similarities))
|
153 |
-
|
154 |
-
# remove all 0 similarities
|
155 |
-
filtered_sims = [(index, similarity) for index, similarity in zipped_sims if similarity != 0]
|
156 |
-
|
157 |
-
# Select and sort top 20 most similar projects
|
158 |
-
sorted_sims = sorted(filtered_sims, key=lambda x: x[1], reverse=True)
|
159 |
-
top_20_sims = sorted_sims[:20]
|
160 |
-
|
161 |
-
# create result data frame
|
162 |
-
index_list = [tup[0] for tup in top_20_sims]
|
163 |
-
print(index_list)
|
164 |
-
result_df = projects_df.iloc[index_list]
|
165 |
-
print(len(result_df))
|
166 |
-
|
167 |
-
print(len(result_df))
|
168 |
-
# add other colums to result df
|
169 |
-
|
170 |
-
similarity_list = [tup[1] for tup in top_20_sims]
|
171 |
-
result_df["similarity"] = similarity_list
|
172 |
-
|
173 |
-
similarity_table.show_table(result_df, similarity_list)
|
174 |
-
|
175 |
-
"""
|
|
|
11 |
import pickle
|
12 |
import faiss
|
13 |
from sentence_transformers import SentenceTransformer
|
14 |
+
from modules.result_table import show_table
|
15 |
import modules.semantic_search as semantic_search
|
16 |
from functions.filter_projects import filter_projects
|
17 |
+
from functions.calc_matches import calc_matches
|
18 |
import psutil
|
19 |
import os
|
20 |
|
|
|
132 |
# CRS CODE LIST
|
133 |
crs3_list = [i[-3:] for i in crs3_option]
|
134 |
|
135 |
+
# FILTER DF WITH SELECTED FILTER OPTIONS
|
136 |
+
filtered_df = filter_projects(projects_df, crs3_list)
|
137 |
|
138 |
+
# FIND MATCHES
|
139 |
+
p1_df, p2_df = calc_matches(filtered_df, projects_df, sim_matrix)
|
140 |
|
141 |
+
# SHOW THE RESULT
|
142 |
+
show_table(p1_df, p2_df)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|