Jan Mühlnikel committed on
Commit
d551fc8
1 Parent(s): 9dcd3f9

added crs filter

Browse files
Files changed (1) hide show
  1. similarity_page.py +175 -0
similarity_page.py ADDED
@@ -0,0 +1,175 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Page for similarities
3
+ """
4
+
5
+ ################
6
+ # DEPENDENCIES #
7
+ ################
8
+ import streamlit as st
9
+ import pandas as pd
10
+ from scipy.sparse import load_npz
11
+ import pickle
12
+ import faiss
13
+ from sentence_transformers import SentenceTransformer
14
+ import modules.result_table as result_table
15
+ import modules.semantic_search as semantic_search
16
+ from functions.filter_projects import filter_projects
17
+ import psutil
18
+ import os
19
+
20
def get_process_memory():
    """Return the resident set size (RSS) of the current process in MiB."""
    bytes_per_mib = 1024 * 1024
    # Look up this very process by PID and read its RSS in bytes.
    rss_bytes = psutil.Process(os.getpid()).memory_info().rss
    return rss_bytes / bytes_per_mib
23
+
24
+ # Cache DATA
25
+ # Load Similarity matrix
26
@st.cache_data
def load_sim_matrix():
    """Load the similarity matrix from ``src/similarities.npz`` as a dense array.

    The file is read with ``scipy.sparse.load_npz`` and converted with
    ``toarray()``; the result is cached through ``st.cache_data``.
    """
    return load_npz("src/similarities.npz").toarray()
32
+
33
+ # Load Projects DFs
34
@st.cache_data
def load_projects():
    """Read the per-topic project CSV files and join them into one DataFrame.

    All five files are read first and then inner-joined on ``iati_id`` in the
    order orgas, region, sector, status, texts, so only projects present in
    every file are kept.
    """
    csv_paths = (
        "src/projects/project_orgas.csv",
        "src/projects/project_region.csv",
        "src/projects/project_sector.csv",
        "src/projects/project_status.csv",
        "src/projects/project_texts.csv",
    )
    frames = [pd.read_csv(path) for path in csv_paths]

    # Fold the remaining frames into the first one, left to right.
    projects_df = frames[0]
    for frame in frames[1:]:
        projects_df = pd.merge(projects_df, frame, on='iati_id', how='inner')

    return projects_df
48
+
49
+ # Load CRS 3 data
50
@st.cache_data
def getCRS3():
    """Build the CRS 3 lookup from ``src/codelists/crs3_codes.csv``.

    Returns a dict that maps the display label ``"<name> - <code>"`` to the
    corresponding value of the ``code`` column.
    """
    # Read the CRS3 codelist
    codelist = pd.read_csv('src/codelists/crs3_codes.csv')
    codes = codelist['code'].tolist()
    names = codelist['name'].tolist()

    label_to_code = {}
    for name, code in zip(names, codes):
        label_to_code[f"{name} - {code}"] = code

    return label_to_code
59
+
60
+ # Load CRS 5 data
61
@st.cache_data
def getCRS5():
    """Build the CRS 5 lookup from ``src/codelists/crs5_codes.csv``.

    Returns a dict that maps each value of the ``code`` column to a
    one-element list holding its display label ``"<name> - <code>"``.
    """
    # Read the CRS5 codelist
    codelist = pd.read_csv('src/codelists/crs5_codes.csv')
    codes = codelist['code'].tolist()
    names = codelist['name'].tolist()

    labels_by_code = {}
    for name, code in zip(names, codes):
        labels_by_code[code] = [f"{name} - {code}"]

    return labels_by_code
70
+
71
+ # Load SDG data
72
@st.cache_data
def getSDG():
    """Return the ``name`` column of ``src/codelists/sdg_goals.csv`` as a list."""
    # Read the SDG codelist and keep only the goal names
    return pd.read_csv('src/codelists/sdg_goals.csv')['name'].tolist()
79
+
80
+ # Load Sentence Transformer Model
81
@st.cache_resource
def load_model():
    """Instantiate the ``all-MiniLM-L6-v2`` SentenceTransformer model.

    Cached with ``st.cache_resource``.
    """
    return SentenceTransformer('all-MiniLM-L6-v2')
85
+
86
+
87
+ # Load Embeddings
88
@st.cache_data
def load_embeddings_and_index():
    """Load the stored sentences and embeddings and build a FAISS index.

    Reads ``src/embeddings.pkl``, takes its ``"sentences"`` and
    ``"embeddings"`` entries, and adds the embeddings to a fresh
    ``faiss.IndexFlatL2`` whose dimension is ``embeddings.shape[1]``.

    Returns:
        A ``(sentences, embeddings, faiss_index)`` tuple.
    """
    # NOTE(review): pickle.load can execute arbitrary code from the file;
    # this presumably only ever reads a trusted, locally produced file.
    with open("src/embeddings.pkl", "rb") as pickle_file:
        payload = pickle.load(pickle_file)

    sentences = payload["sentences"]
    embeddings = payload["embeddings"]

    # The index is always built from scratch; nothing is read from disk here.
    faiss_index = faiss.IndexFlatL2(embeddings.shape[1])
    faiss_index.add(embeddings)

    return sentences, embeddings, faiss_index
102
+
103
+ # USE CACHE FUNCTIONS
104
# Call every cached loader once at import time so the page code below can
# use the results as module-level globals.
sim_matrix = load_sim_matrix()
projects_df = load_projects()

# Codelists (evaluated left to right: CRS 3, CRS 5, SDG)
CRS3_MERGED, CRS5_MERGED, SDG_NAMES = getCRS3(), getCRS5(), getSDG()

# Semantic-search resources
model = load_model()
sentences, embeddings, faiss_index = load_embeddings_and_index()
113
+
114
def show_page():
    """Render the similarities page with a CRS 3 filter.

    Writes the current memory usage of the process and a page title, shows a
    CRS 3 multiselect in the left of two equal columns, then displays the
    list of selected codes and the DataFrame returned by ``filter_projects``
    for that selection.
    """
    st.write(f"Current RAM usage of this app: {get_process_memory():.2f} MB")
    st.write("Similarities")

    col1, col2 = st.columns([1, 1])
    with col1:
        # CRS 3 SELECTION
        crs3_option = st.multiselect(
            'CRS 3',
            CRS3_MERGED,
            placeholder="Select"
        )

    with col2:
        st.write("x")

    # CRS CODE LIST
    # The labels are built as "<name> - <code>" in getCRS3, so the code is
    # whatever follows the last " - ". Splitting there, instead of slicing
    # the last three characters, keeps working for codes of any length.
    crs3_list = [label.rsplit(" - ", 1)[-1] for label in crs3_option]

    st.write(crs3_list)

    result_df = filter_projects(projects_df, crs3_list)
    st.dataframe(result_df)
138
+
139
+
140
+
141
+ """
142
+ #semantic_search.show_search(model, faiss_index, sentences)
143
+
144
+ df_subset = projects_df.head(10)
145
+ selected_index = st.selectbox('Select an entry', df_subset.index, format_func=lambda x: df_subset.loc[x, 'iati_id'])
146
+
147
+ st.write(selected_index)
148
+
149
+ # add index and similarity together
150
+ indecies = range(0, len(sim_matrix))
151
+ similarities = sim_matrix[selected_index]
152
+ zipped_sims = list(zip(indecies, similarities))
153
+
154
+ # remove all 0 similarities
155
+ filtered_sims = [(index, similarity) for index, similarity in zipped_sims if similarity != 0]
156
+
157
+ # Select and sort top 20 most similar projects
158
+ sorted_sims = sorted(filtered_sims, key=lambda x: x[1], reverse=True)
159
+ top_20_sims = sorted_sims[:20]
160
+
161
+ # create result data frame
162
+ index_list = [tup[0] for tup in top_20_sims]
163
+ print(index_list)
164
+ result_df = projects_df.iloc[index_list]
165
+ print(len(result_df))
166
+
167
+ print(len(result_df))
168
+ # add other columns to result df
169
+
170
+ similarity_list = [tup[1] for tup in top_20_sims]
171
+ result_df["similarity"] = similarity_list
172
+
173
+ similarity_table.show_table(result_df, similarity_list)
174
+
175
+ """