Jan Mühlnikel committed on
Commit
4226dcf
1 Parent(s): 4f56571

final initial update

Browse files
__pycache__/app.cpython-310.pyc ADDED
Binary file (664 Bytes). View file
 
__pycache__/crs.cpython-310.pyc ADDED
Binary file (3.71 kB). View file
 
__pycache__/home.cpython-310.pyc ADDED
Binary file (439 Bytes). View file
 
__pycache__/sector.cpython-310.pyc ADDED
Binary file (5.44 kB). View file
 
__pycache__/similarity.cpython-310.pyc ADDED
Binary file (2.53 kB). View file
 
home.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ import streamlit as st
2
+
3
def show_page():
    """Render the home page, which currently only writes a placeholder text."""
    page_text = "home"
    st.write(page_text)
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ numpy==1.26.4
2
+ pandas==2.1.4
3
+ streamlit==1.32.2
4
+ streamlit-option-menu==0.3.12
5
+ scipy==1.12.0
sdg.py ADDED
File without changes
sector.py ADDED
@@ -0,0 +1,222 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Page to analyse the link between crs codes, countries and organizations
3
+ """
4
+
5
+ ################
6
+ # DEPENDENCIES #
7
+ ################
8
+ import streamlit as st
9
+ import pandas as pd
10
+ import utils.crs_table as crs_table
11
+ import utils.sdg_table as sdg_table
12
+ import utils.filter_modules as filter_modules
13
+
14
+ from importlib.machinery import SourceFileLoader
15
+ crs_overlap = SourceFileLoader("crs_overlap", "data/models/crs_overlap.py").load_module()
16
+ sdg_overlap = SourceFileLoader("sdg_overlap", "data/models/sdg_overlap.py").load_module()
17
+ CONSTANTS = SourceFileLoader("CONSTANTS", "config/CONSTANTS.py").load_module()
18
+
19
+ # CACHE DATA
20
+ # FETCH NEEDED DATA AND STORE IN CACHE MEMORY TO SAVE LOADING TIME
21
@st.cache_data
def getCRS3():
    """Build the CRS3 lookup from the CRS3 codelist CSV.

    Returns:
        dict: maps the label ``"<name> - <code>"`` to its code, in the
        row order of the CSV.
    """
    codelist = pd.read_csv('app/src/codelists/crs3_codes.csv')
    codes = codelist['code'].tolist()
    names = codelist['name'].tolist()

    labels_to_codes = {}
    for name, code in zip(names, codes):
        labels_to_codes[f"{name} - {code}"] = code
    return labels_to_codes
30
+
31
@st.cache_data
def getCRS5():
    """Build the CRS5 lookup from the CRS5 codelist CSV.

    Returns:
        dict: maps each code to a one-element list holding its label
        ``"<name> - <code>"``.
    """
    codelist = pd.read_csv('app/src/codelists/crs5_codes.csv')
    codes = codelist['code'].tolist()
    names = codelist['name'].tolist()

    codes_to_labels = {}
    for name, code in zip(names, codes):
        codes_to_labels[code] = [f"{name} - {code}"]
    return codes_to_labels
40
+
41
@st.cache_data
def getSDG():
    """Return the ``name`` column of the SDG goals codelist CSV as a list."""
    return pd.read_csv('app/src/codelists/sdg_goals.csv')['name'].tolist()
48
+
49
@st.cache_data
def getCountry():
    """Load the ISO 3166-1 alpha-2 country codelist.

    Returns:
        tuple: ``(country_df, codes, names)`` where ``codes`` is the
        ``'Alpha-2 code'`` column and ``names`` the ``'Country'`` column,
        both as plain lists.
    """
    table = pd.read_csv('app/src/codelists/country_codes_ISO3166-1alpha-2.csv')
    return (
        table,
        table['Alpha-2 code'].tolist(),
        table['Country'].tolist(),
    )
57
+
58
# Codelists are fetched once when the module is imported.
CRS3_MERGED, CRS5_MERGED, SDG_NAMES = getCRS3(), getCRS5(), getSDG()
country_df, COUNTRY_CODES, COUNTRY_NAMES = getCountry()

# SPECIAL SELECTIONS
# These lists are handed to the country / organization filter widgets
# together with the regular choices.
SPECIAL_COUNTRY_SLECTIONS = ["All"]
SPECIAL_ORGA_SLECTIONS = ["All"]

# Module-level defaults: do not show all countries / organizations in matching.
SHOW_ALL_COUNTRIES = False
SHOW_ALL_ORGAS = False
71
+
72
+ ########
73
+ # PAGE #
74
+ ########
75
def _parse_orga_selection(orga_option):
    """Translate the organization multiselect value into filter arguments.

    Args:
        orga_option: list of labels returned by the organization filter.

    Returns:
        tuple: ``(show_all, selected_orgas)``. ``show_all`` is True when
        "All" was picked, in which case ``selected_orgas`` is empty.

    Raises:
        IndexError: if a label contains no "(".
    """
    if "All" in orga_option:
        return True, []

    # Assumes labels look like "Name (abbr)" - TODO confirm against
    # CONSTANTS.ORGA_SEARCH. The lower-cased text after the first "(" is
    # used as the organization id.
    selected_orgas = [
        str(o).replace(")", "").lower().split("(")[1] for o in orga_option
    ]
    return False, selected_orgas


def _parse_country_selection(country_option):
    """Translate the country multiselect value into filter arguments.

    The widget's list is not modified; "All" is filtered out instead of
    being removed in place.

    Args:
        country_option: list of country names, possibly containing "All".

    Returns:
        tuple: ``(show_all, country_codes)`` where ``country_codes`` holds
        the alpha-2 code of every selected country other than "All".
    """
    show_all = "All" in country_option
    country_names = [str(c) for c in country_option if c != "All"]
    country_codes = [
        country_df[country_df['Country'] == c]['Alpha-2 code'].values[0].replace('"', "").strip(" ")
        for c in country_names
    ]
    return show_all, country_codes


def show_page():
    """Render the sector page: pick CRS or SDG matching and show the overlap table."""

    def show_crs():
        """Render the CRS filters and, once filled in, the CRS overlap table."""
        # SELECTION FIELDS
        col1, col2 = st.columns([1, 1])
        with col1:
            # CRS 3 CODE SELECT
            crs3_option = st.multiselect(
                'CRS 3',
                CRS3_MERGED,
                placeholder="Select"
            )

            # CRS 5 CODE SELECT
            # Only enable the crs5 select field when a crs3 code is selected
            st.session_state.crs5_option_disabled = not crs3_option

            # crs5 labels whose code starts with one of the selected crs3 codes
            crs5_list = [
                txt[0].replace('"', "")
                for crs3_item in crs3_option
                for code, txt in CRS5_MERGED.items()
                if str(code)[:3] == str(crs3_item)[-3:]
            ]

            crs5_option = st.multiselect(
                'CRS 5',
                crs5_list,
                placeholder="Select",
                disabled=st.session_state.crs5_option_disabled
            )

        with col2:
            # COUNTRY SELECTION
            country_option = filter_modules.country_option(SPECIAL_COUNTRY_SLECTIONS, COUNTRY_NAMES)

            # ORGA SELECTION
            orga_option = filter_modules.orga_option(SPECIAL_ORGA_SLECTIONS, CONSTANTS.ORGA_SEARCH)

        # SHOW RESULTS
        show_all_orgas, selected_orgas = _parse_orga_selection(orga_option)

        # Nothing to show until a country and a CRS 3 code are chosen.
        if not country_option or not crs3_option:
            return

        show_all_countries, country_codes = _parse_country_selection(country_option)

        # The last three characters of a CRS 3 label are its code.
        crs3_list = [i[-3:] for i in crs3_option]
        # NOTE(review): this result is discarded when CRS 5 codes are
        # selected; the call is kept because calc_crs3 is not visible here.
        result_df = crs_overlap.calc_crs3(crs3_list, country_codes, selected_orgas, show_all_countries, show_all_orgas)

        if crs5_option:
            # The last five characters of a CRS 5 label are its code.
            crs5_codes = [i[-5:] for i in crs5_option]
            result_df = crs_overlap.calc_crs5(crs5_codes, country_codes, selected_orgas, show_all_countries, show_all_orgas)

        # TABLE FOR CRS OVERLAP
        crs_table.show_table(result_df)

    def show_sdg():
        """Render the SDG filters and, once filled in, the SDG overlap table."""
        # SELECTION
        col1, col2 = st.columns([1, 1])
        with col1:
            # SDG SELECT
            sdg_option = st.selectbox(
                label='SDG',
                index=None,
                placeholder="Select SDG",
                options=SDG_NAMES,
            )

        with col2:
            # COUNTRY SELECTION
            country_option = filter_modules.country_option(SPECIAL_COUNTRY_SLECTIONS, COUNTRY_NAMES)

            # ORGA SELECTION
            orga_option = filter_modules.orga_option(SPECIAL_ORGA_SLECTIONS, CONSTANTS.ORGA_SEARCH)

        # SHOW RESULTS
        if sdg_option is None:
            return

        # The first space-separated token of the label, without dots, is
        # parsed as the goal number.
        sdg_int = int(sdg_option.split(" ")[0].replace(".", ""))
        show_all_orgas, selected_orgas = _parse_orga_selection(orga_option)

        if not country_option:
            return

        show_all_countries, country_codes = _parse_country_selection(country_option)
        result_df = sdg_overlap.calc_crs3(sdg_int, country_codes, selected_orgas, show_all_countries, show_all_orgas)

        # TABLE FOR SDG OVERLAP
        sdg_table.show_table(result_df)

    # SELECT IF CRS or SDG Match
    match_option = st.selectbox(
        label='Matching Method',
        index=0,
        placeholder="Select",
        options=["CRS", "SDG"],
    )

    st.write("------------------")

    if match_option == "CRS":
        show_crs()
    elif match_option == "SDG":
        show_sdg()
222
+
similarity.py ADDED
@@ -0,0 +1,78 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Page for similarities
3
+ """
4
+
5
+ ################
6
+ # DEPENDENCIES #
7
+ ################
8
+ import streamlit as st
9
+ import pandas as pd
10
+ from scipy.sparse import load_npz
11
+ import utils.similarity_table as similarity_table
12
+
13
+ # Cache DATA
14
+ # Load Similarity matrix
15
@st.cache_data
def load_sim_matrix():
    """Load the stored sparse similarity matrix and return it as a dense array."""
    return load_npz("app/src/similarities.npz").toarray()
22
+
23
+
24
@st.cache_data
def load_projects():
    """Read the per-aspect project CSVs and inner-join them on ``iati_id``.

    Returns:
        pandas.DataFrame: orgas, region, sector, status and texts merged
        in that order.
    """
    csv_paths = (
        "app/src/projects/project_orgas.csv",
        "app/src/projects/project_region.csv",
        "app/src/projects/project_sector.csv",
        "app/src/projects/project_status.csv",
        "app/src/projects/project_texts.csv",
    )
    # Read every file first, then fold the frames together left to right.
    frames = [pd.read_csv(path) for path in csv_paths]

    merged = frames[0]
    for frame in frames[1:]:
        merged = pd.merge(merged, frame, on='iati_id', how='inner')
    return merged
38
+
39
+ # LOAD DATA
40
# Both data sets are loaded once when the module is imported.
sim_matrix, projects_df = load_sim_matrix(), load_projects()
42
+
43
def show_page():
    """Render the similarity page.

    Offers the first ten projects for selection and shows up to 20
    projects with the highest non-zero similarity to the selected one,
    together with their similarity values.
    """
    st.write("Similarities")

    df_subset = projects_df.head(10)
    selected_index = st.selectbox(
        'Select an entry',
        df_subset.index,
        format_func=lambda x: df_subset.loc[x, 'iati_id'],
    )

    st.write(selected_index)

    # Without a selection there is no matrix row to read.
    if selected_index is None:
        return

    # NOTE(review): the selected index label is used as a row position in
    # sim_matrix and in projects_df.iloc below - confirm that the rows of
    # both are in the same order.
    similarities = sim_matrix[selected_index]

    # Pair each position with its similarity, dropping zero similarities.
    scored = [
        (position, similarity)
        for position, similarity in zip(range(len(sim_matrix)), similarities)
        if similarity != 0
    ]

    # Select and sort the top 20 most similar projects.
    top_20_sims = sorted(scored, key=lambda pair: pair[1], reverse=True)[:20]
    index_list = [position for position, _ in top_20_sims]
    similarity_list = [similarity for _, similarity in top_20_sims]

    # Copy before adding a column so the assignment never targets a
    # selection of the module-level projects_df.
    result_df = projects_df.iloc[index_list].copy()
    result_df["similarity"] = similarity_list

    similarity_table.show_table(result_df, similarity_list)
76
+
77
+
78
+
src/codelists/country_codes_ISO3166-1alpha-2.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1ff1ad92034a4a593138fcbb7570ec5015c3c28a4476f95015a39d0bf257382a
3
+ size 13113
src/codelists/crs3_codes.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cfd7bf86baf7bbc54c880c098b89b803adfb060c2c9ba55ee976cc47c2be426a
3
+ size 3218
src/codelists/crs5_codes.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:84a522ad573ad1866835cb24efc7984016ef17b9990ac2484345705ac82a0d80
3
+ size 100133
src/codelists/sdg_goals.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d14fbb5ea0582758b80c99d6726406852af2799dc53a3da646192535c2b3a08f
3
+ size 1887
src/codelists/sdg_targets.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7bcfb315fcb778c1503557b6b76bec0159ccc25933a5de1cb6c51542064190e9
3
+ size 36758
src/projects/project_region.csv CHANGED
The diff for this file is too large to render. See raw diff
 
src/projects/project_sector.csv CHANGED
The diff for this file is too large to render. See raw diff
 
src/projects/project_status.csv CHANGED
The diff for this file is too large to render. See raw diff