Tao Wu commited on
Commit
227a7da
·
1 Parent(s): 68071c5
Files changed (31) hide show
  1. .gitattributes +4 -0
  2. .gitignore +5 -0
  3. Dockerfile +23 -0
  4. README copy.md +10 -0
  5. app/app.py +235 -0
  6. app/config.py +26 -0
  7. app/data/EduGBERT_cos_escoai/chroma.sqlite3 +3 -0
  8. app/data/EduGBERT_cos_escoai/facbf936-cb42-45b3-8dd7-c6a84203b62a/data_level0.bin +3 -0
  9. app/data/EduGBERT_cos_escoai/facbf936-cb42-45b3-8dd7-c6a84203b62a/header.bin +3 -0
  10. app/data/EduGBERT_cos_escoai/facbf936-cb42-45b3-8dd7-c6a84203b62a/index_metadata.pickle +3 -0
  11. app/data/EduGBERT_cos_escoai/facbf936-cb42-45b3-8dd7-c6a84203b62a/length.bin +3 -0
  12. app/data/EduGBERT_cos_escoai/facbf936-cb42-45b3-8dd7-c6a84203b62a/link_lists.bin +3 -0
  13. app/data/ba_course_emb/3b822eb4-09a8-4577-a432-58654a23254b/data_level0.bin +3 -0
  14. app/data/ba_course_emb/3b822eb4-09a8-4577-a432-58654a23254b/header.bin +3 -0
  15. app/data/ba_course_emb/3b822eb4-09a8-4577-a432-58654a23254b/index_metadata.pickle +3 -0
  16. app/data/ba_course_emb/3b822eb4-09a8-4577-a432-58654a23254b/length.bin +3 -0
  17. app/data/ba_course_emb/3b822eb4-09a8-4577-a432-58654a23254b/link_lists.bin +3 -0
  18. app/data/ba_course_emb/chroma.sqlite3 +3 -0
  19. app/data/berufe_info.csv +3 -0
  20. app/data/kldb_isco_lookup.csv +3 -0
  21. app/data/multilingual-e5_cos_escoai/c4b78092-a1fa-41fc-b6b0-8fa36ef29576/data_level0.bin +3 -0
  22. app/data/multilingual-e5_cos_escoai/c4b78092-a1fa-41fc-b6b0-8fa36ef29576/header.bin +3 -0
  23. app/data/multilingual-e5_cos_escoai/c4b78092-a1fa-41fc-b6b0-8fa36ef29576/index_metadata.pickle +3 -0
  24. app/data/multilingual-e5_cos_escoai/c4b78092-a1fa-41fc-b6b0-8fa36ef29576/length.bin +3 -0
  25. app/data/multilingual-e5_cos_escoai/c4b78092-a1fa-41fc-b6b0-8fa36ef29576/link_lists.bin +3 -0
  26. app/data/multilingual-e5_cos_escoai/chroma.sqlite3 +3 -0
  27. app/data/occupations_de.csv +3 -0
  28. app/data/redis_data.json +3 -0
  29. app/data_process.py +123 -0
  30. app/embedding_setup.py +204 -0
  31. requirements.txt +0 -0
.gitattributes CHANGED
@@ -33,3 +33,7 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ *.csv filter=lfs diff=lfs merge=lfs -text
37
+ *.sqlite3 filter=lfs diff=lfs merge=lfs -text
38
+ *.rdb filter=lfs diff=lfs merge=lfs -text
39
+ *.json filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ __pycache__/
2
+ app-data/
3
+ redis-data/
4
+ k8s/
5
+ *.rdb
Dockerfile ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
FROM python:3.9-slim

# Build tools needed for wheels that compile native extensions.
RUN apt-get update && apt-get install -y --no-install-recommends \
    build-essential \
    && rm -rf /var/lib/apt/lists/*

COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Fix: `RUN export VAR=1` only affects that single build step and is lost
# afterwards; ENV persists the setting into the running container.
ENV PYTHONUNBUFFERED=1

# Run as an unprivileged user (Hugging Face Spaces convention: uid 1000).
RUN useradd -m -u 1000 user
USER user

WORKDIR /app
COPY app /app

EXPOSE 7860

CMD ["python", "-u", "app.py"]
README copy.md ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Course Rec
3
+ emoji: 🏢
4
+ colorFrom: purple
5
+ colorTo: green
6
+ sdk: docker
7
+ pinned: false
8
+ ---
9
+
10
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app/app.py ADDED
@@ -0,0 +1,235 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import pandas as pd
3
+ import redis
4
+ import json
5
+ import requests
6
+ from config import *
7
+ import functools
8
+ from embedding_setup import retriever, find_similar_occupation, compare_docs_with_context,generate_exp,generate_prompt_exp
9
+ from data_process import get_occupations_from_csv, get_courses_from_BA, get_occupation_detial, build_occupation_query
10
# Load the pre-exported Redis dump (ESCO occupation URI -> JSON-encoded skill
# list) once at import time; the live Redis connection is intentionally
# disabled in this deployment.
with open('/app/data/redis_data.json', 'r') as file:
    data_dict = json.load(file)
#r = redis.Redis(host=REDIS_HOST, port=REDIS_PORT, db=REDIS_DB, decode_responses=True)

# Cache: German skill label -> full skill payload. Filled as the dropdowns are
# populated and read back when building queries.
skill_details_mapping = {}
15
+
16
+
17
+ # Function to retrieve documents based on selected skills
18
def retrieve_documents(occupation, skills):
    """Recommend up to five courses for the selected occupation and skill gap.

    Builds retrieval queries from the occupation description plus the selected
    skills, de-duplicates the candidate courses, ranks them pairwise with the
    LLM comparator and renders the result as an HTML fragment (German UI text).

    Args:
        occupation: Label chosen in the occupation dropdown, resolved through
            the module-level ``occupations`` mapping (ESCO URI str or BA int id).
        skills: List of German skill labels forming the qualification gap.

    Returns:
        str: HTML joined with ``<br>`` for the Gradio HTML widget.
    """
    output = [f"<div style=\"text-align: center; font-size: 24px;\">Empfehlungsergebnisse:</div>"]
    oc_uri = occupations.get(occupation, "")
    skill_query = ''
    candidate_docs = []
    if isinstance(oc_uri, int):
        # BA occupation: details live in the local CSV, keyed by integer id.
        df = pd.read_csv("/app/data/berufe_info.csv")
        target_occupation = df[df['id'] == oc_uri]
        target_occupation_name = target_occupation['short name'].values[0]
        target_occupation_dsp = target_occupation['description'].values[0]
        target_occupation_query = target_occupation_name + ' ' + target_occupation_dsp
    else:
        # ESCO occupation: fetch details from the ESCO REST API.
        target_occupation = get_occupation_detial(oc_uri)
        target_occupation_name, target_occupation_dsp, target_occupation_query = build_occupation_query(target_occupation)
    for german_label in skills:
        skill_query += german_label + ' '
        # Per-skill retrieval keeps queries focused on one skill at a time.
        ocsk_query = target_occupation_name + ' ' + german_label
        skills_docs = retriever.get_relevant_documents(ocsk_query)
        candidate_docs.extend(skills_docs[:2])
    query = target_occupation_query + ' ' + skill_query
    llama_query = 'info:' + target_occupation_name + ' ' + 'Skills gap:' + skill_query
    print(query)
    docs = retriever.get_relevant_documents(query)
    candidate_docs.extend(docs[:5])

    # Remove duplicates: the same course id can surface from several queries.
    seen_course_ids = set()
    candidate_doc_unique = []
    for doc in candidate_docs:
        course_id = doc.metadata.get('id', '')
        if course_id not in seen_course_ids:
            candidate_doc_unique.append(doc)
            seen_course_ids.add(course_id)

    # LLM-driven pairwise ranking of the de-duplicated candidates.
    partial_compare_docs = functools.partial(
        compare_docs_with_context,
        target_occupation_name=target_occupation_name,
        target_occupation_dsp=target_occupation_dsp,
        skill_gap=skill_query,
    )
    sorted_docs = sorted(candidate_doc_unique, key=functools.cmp_to_key(partial_compare_docs), reverse=True)

    # Bug fix: iterate over however many candidates exist (the original
    # `for i in range(5)` raised IndexError with fewer than five).
    top_docs = sorted_docs[:5]
    batch_output = []
    for doc in top_docs:
        doc_name = doc.metadata.get('name', 'Unnamed Document')
        doc_skill = doc.metadata.get('skills', '')
        input_text = f"target occupation: {llama_query}\n Recommended course: name: {doc_name}, learning objectives: {doc_skill[:2000]}"
        prompt = generate_prompt_exp(input_text)
        # Bug fix: `batch_output += <str>` extended the list with the string's
        # individual characters; append keeps one explanation per course.
        batch_output.append(generate_exp(prompt))

    output.append(f"<b>Zielberuf:</b> {target_occupation_name}")
    output.append(f"<b>Qualifikationslücke:</b> {skill_query}")
    output.append(f"<b>Empfohlene Kurse:</b>")
    for i, doc in enumerate(top_docs):
        doc_name = doc.metadata.get('name', 'Unnamed Document')
        doc_url = doc.metadata.get('url', '#')
        output.append(f"<a href='{doc_url}' target='_blank'>{doc_name}</a>")
        output.append(f"<b>Empfehlungsgrund:</b> {batch_output[i]}")
        output.append(f"<br>")
    return "<br>".join(output)
82
+
83
+
84
def get_candidate_courses(occupation, skills):
    """Fetch BA course offers related to the selected skills.

    Pipeline: skills -> related ESCO occupations -> ISCO codes -> KldB codes
    -> similar BA occupations -> BA course offers.

    Args:
        occupation: Label of the target occupation (resolved via ``occupations``).
        skills: German skill labels; each must exist in ``skill_details_mapping``.

    Returns:
        str: HTML (joined with ``<br>``) listing course links.
    """
    output = [f"<div style=\"text-align: center; font-size: 24px;\">Empfehlungsergebnisse:</div>"]
    df_lookup = pd.read_csv('/app/data/kldb_isco_lookup.csv')
    df_berufe = pd.read_csv('/app/data/berufe_info.csv')
    occupation_codes = set()
    kldB_set = set()
    occupation_hrefs = set()
    BA_berufe = set()
    oc_uri = occupations.get(occupation, "")
    target_occupation = get_occupation_detial(oc_uri)
    # Bug fix: build_occupation_query returns (name, description, query); the
    # original passed the whole tuple to the retriever as the query.
    _, _, target_occupation_query = build_occupation_query(target_occupation)

    for german_label in skills:
        skill = skill_details_mapping.get(german_label, {})
        uri = f'https://ec.europa.eu/esco/api/resource/skill?selectedVersion=v1.0.9&language=en&uri={skill["uri"]}'
        try:
            skill_response = requests.get(uri)
            skill_response.raise_for_status()
            skill_json = skill_response.json()

            # Combine essential and optional occupations.
            skill_related_occupations = (skill_json['_links'].get('isEssentialForOccupation', []) +
                                         skill_json['_links'].get('isOptionalForOccupation', []))

            # Bug fix: the loop variable used to shadow the `occupation` parameter.
            for related_occupation in skill_related_occupations:
                href = related_occupation.get('href')
                if href:
                    occupation_hrefs.add(href)
        except requests.RequestException as e:
            print(f"Error while fetching skill details: {e}")

    for href in occupation_hrefs:
        try:
            occupation_response = requests.get(href)
            occupation_response.raise_for_status()
            occupation_details = occupation_response.json()

            code = occupation_details.get('code')
            if code:
                # The prefix before the first dot is the ISCO group code.
                occupation_codes.add(code.split('.')[0])
        except requests.RequestException as e:
            print(f"Error while fetching occupation details: {e}")

    # Translate ISCO-08 group codes to German KldB-2010 codes.
    for isco_code in occupation_codes:
        kldB_codes = df_lookup[df_lookup['isco08'] == int(isco_code)]['kldb2010'].values
        for code in kldB_codes:
            kldB_set.add(str(code))
    dfs = []
    for kldb in kldB_set:
        berufe = df_berufe[df_berufe['KldB codes'] == 'B ' + kldb]
        dfs.append(berufe)

    # Guard: pd.concat raises on an empty list (the original crashed when no
    # related occupation could be resolved).
    if not dfs:
        return "<br>".join(output)
    merged_df = pd.concat(dfs, ignore_index=True)
    top_k_berufe = find_similar_occupation(target_occupation_query, merged_df, 5, 'cosine')
    for beruf in top_k_berufe:
        entry_requirement = beruf.metadata['entry_requirements']
        # Entry requirements are stored with single quotes; normalize to JSON.
        corrected_json_string = entry_requirement.replace("'", '"')
        entry_requirement_json = json.loads(corrected_json_string)
        for js in entry_requirement_json:
            BA_berufe.add(str(js['data_idref']))

    result = get_courses_from_BA(BA_berufe)
    # Guard: get_courses_from_BA returns None when the token request fails.
    if not result:
        return "<br>".join(output)
    for course in result['_embedded']['termine']:
        output.append(f"<a href='{course['angebot']['link']}' target='_blank'>{course['angebot']['titel']}</a>")

    return "<br>".join(output)
152
+
153
+
154
def get_occupation_skills(oc_uri):
    """Return the German skill labels for an ESCO occupation URI.

    Looks the URI up in the pre-dumped Redis export (module-level ``data_dict``)
    and, as a side effect, records each skill's full payload in
    ``skill_details_mapping``. Returns an empty list for unknown URIs.
    """
    labels = []
    raw = data_dict.get(oc_uri, None)
    if not raw:
        return labels
    for skill in json.loads(raw):
        label = skill['preferredLabel']['de']
        skill_details_mapping[label] = skill
        labels.append(label)
    return labels
167
+
168
def get_occupation_skills_BA(oc_uri):
    """Return the combined essential + optional skill labels for a BA occupation.

    Args:
        oc_uri: Integer BA occupation id (a row in berufe_info.csv).

    Returns:
        list[str]: German skill labels; empty for an unknown id. Also records
        each skill payload in ``skill_details_mapping`` as a side effect.
    """
    df = pd.read_csv("/app/data/berufe_info.csv")
    essential_skills = df[df['id'] == oc_uri]['essential skills'].values
    optional_skills = df[df['id'] == oc_uri]['optional skills'].values
    # Guard: unknown id yields no rows (the original raised IndexError here).
    if len(essential_skills) == 0 or len(optional_skills) == 0:
        return []
    # The CSV stores both lists as stringified JSON arrays; splice them into a
    # single array literal, then normalize single quotes for json.loads.
    combined_skills = essential_skills[0][:-1] + ',' + optional_skills[0][1:]
    combined_skills = combined_skills.replace("'", "\"")
    skills = json.loads(combined_skills)
    skill_labels = []
    for skill in skills:
        german_label = skill['skill']
        skill_details_mapping[german_label] = skill
        skill_labels.append(german_label)
    return skill_labels
181
+
182
+ # Function to update the skills dropdown
183
def update_skills(occupation):
    """Refresh the current-skills dropdown when the target occupation changes.

    Args:
        occupation: Selected occupation label.

    Returns:
        gr.Dropdown: Multiselect dropdown populated with the occupation's skills.
    """
    oc_uri = occupations.get(occupation, "")
    # BA occupations are keyed by integer id, ESCO occupations by URI string.
    if isinstance(oc_uri, int):
        skills = get_occupation_skills_BA(oc_uri)
    else:
        skills = get_occupation_skills(oc_uri)
    # Fix: both branches built an identical Dropdown and a trailing bare
    # `return` was unreachable — construct the component once.
    return gr.Dropdown(skills, label="aktuelle Fähigkeiten", multiselect=True,
                       info='Bitte wählen Sie die Fähigkeiten aus, die Sie derzeit besitzen')
192
+
193
def update_skillgap(occupation, current_skills):
    """Compute the skill gap (occupation skills minus already-owned skills).

    Args:
        occupation: Selected occupation label.
        current_skills: Skill labels the user already possesses.

    Returns:
        gr.Dropdown: Multiselect dropdown listing the remaining skills.
    """
    oc_uri = occupations.get(occupation, "")
    # Pick the lookup backend by key type: int id -> BA CSV, str URI -> ESCO.
    fetch_skills = get_occupation_skills_BA if isinstance(oc_uri, int) else get_occupation_skills
    all_skills = fetch_skills(oc_uri)
    owned = set(current_skills)
    skill_gap = [label for label in all_skills if label not in owned]

    return gr.Dropdown(skill_gap, label="Qualifikationslücke", multiselect=True,
                       info='Bitte wählen Sie die Fähigkeiten aus, die Sie lernen möchten.')
202
+
203
+ if __name__ == "__main__":
204
+ # Load occupations from CSV
205
+ occupations_esco = get_occupations_from_csv(CSV_FILE_PATH)
206
+ df = pd.read_csv("/app/data/berufe_info.csv")
207
+ occupations_BA = df[['short name', 'id']].set_index('short name').to_dict()['id']
208
+ occupations = {**occupations_esco, **occupations_BA}
209
+ # Gradio interface
210
+ with gr.Blocks(title="MyEduLife Kursempfehlungssystem") as demo:
211
+ occupation_dropdown = gr.Dropdown(list(occupations.keys()), label="Zielberuf",info='Bitte wählen Sie Ihren Zielberuf aus.')
212
+ currentskill_dropdown = gr.Dropdown([],label="aktuelle Fähigkeiten", multiselect=True,info='Bitte wählen Sie die Fähigkeiten aus, die Sie derzeit besitzen')
213
+ sb_btn = gr.Button("Absenden")
214
+ skillgap_dropdown = gr.Dropdown([],label="Fähigkeiten", multiselect=True,info='Bitte wählen Sie die Fähigkeiten aus, die Sie lernen möchten.')
215
+ # Use gr.HTML to display the HTML content
216
+ button = gr.Button("Kursempfehlungen")
217
+ documents_output = gr.HTML()
218
+
219
+ occupation_dropdown.change(update_skills, inputs=occupation_dropdown, outputs=currentskill_dropdown)
220
+
221
+ sb_btn.click(
222
+ update_skillgap,
223
+ inputs=[occupation_dropdown,currentskill_dropdown],
224
+ outputs=skillgap_dropdown
225
+ )
226
+
227
+ button.click(
228
+ retrieve_documents,
229
+ inputs=[occupation_dropdown,skillgap_dropdown],
230
+ outputs=documents_output
231
+ )
232
+ print('Initialization completed')
233
+ demo.launch(server_name="0.0.0.0", server_port=7860)
234
+
235
+
app/config.py ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os

# Redis configuration. NOTE(review): appears unused at runtime — app.py reads
# a pre-exported JSON dump instead of connecting to Redis.
#REDIS_HOST = os.getenv('REDIS_HOST', 'redis')
REDIS_HOST = os.getenv('REDIS_HOST', '0.0.0.0')
REDIS_PORT = int(os.getenv('REDIS_PORT', 6379))
REDIS_DB = int(os.getenv('REDIS_DB', 0))

# Model and embedding configuration
#MODEL_NAME = os.getenv('MODEL_NAME', "intfloat/multilingual-e5-large-instruct")
MODEL_NAME = os.getenv('MODEL_NAME', "wt3639/EduGBERT_CourseRec")
# Env overrides are string-compared against 'True'; any other value disables.
ENCODE_KWARGS = {
    'normalize_embeddings': os.getenv('NORMALIZE_EMBEDDINGS', 'True') == 'True',
    'convert_to_tensor': os.getenv('CONVERT_TO_TENSOR', 'True') == 'True'
}
#QUERY_INSTRUCTION = os.getenv('QUERY_INSTRUCTION', 'Find the course that relates to the given occupation and cover the skills gap')
QUERY_INSTRUCTION = os.getenv('QUERY_INSTRUCTION', '')
# Other configurations
# Number of documents the course retriever returns per query.
TOP_K = int(os.getenv('TOP_K', 10))
# Persisted Chroma store holding the pre-computed course embeddings.
#PERSIST_DIRECTORY = os.getenv('PERSIST_DIRECTORY', "/app/data/course_emb_db")
PERSIST_DIRECTORY = os.getenv('PERSIST_DIRECTORY', "/app/data/EduGBERT_cos_escoai")
CSV_FILE_PATH = os.getenv('CSV_FILE_PATH', '/app/data/occupations_de.csv')

# LoRA adapters (ranking / explanation) and the base LLM used by embedding_setup.py.
REC_LORA_MODEL = os.getenv('REC_LORA_MODEL', 'wt3639/Llama-3-8B-Instruct_CourseRec_lora')
EXP_LORA_MODEL = os.getenv('EXP_LORA_MODEL', 'wt3639/Lllama-3-8B-instruct-exp-adapter')
LLM_MODEL = os.getenv('LLM_MODEL', 'meta-llama/Meta-Llama-3-8B-Instruct')
app/data/EduGBERT_cos_escoai/chroma.sqlite3 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6a66289aa47eb8a5339cb507db5998aaf0027b7638ee6750d26a1459e7b64a9a
3
+ size 103567360
app/data/EduGBERT_cos_escoai/facbf936-cb42-45b3-8dd7-c6a84203b62a/data_level0.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bfcaa4e2f82fc0426cfa574fc0f5126d913d426925e1ac3cdd3403b0d6d90d57
3
+ size 25416000
app/data/EduGBERT_cos_escoai/facbf936-cb42-45b3-8dd7-c6a84203b62a/header.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7ffd5ef9bb9ca1a421e18216d2997eea01192524736b484b49cd43b46e42e31f
3
+ size 100
app/data/EduGBERT_cos_escoai/facbf936-cb42-45b3-8dd7-c6a84203b62a/index_metadata.pickle ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:56f054e8946b4b38b357da496aca2bebffc0adf8010a12a1bb4218581d1bf8ca
3
+ size 346049
app/data/EduGBERT_cos_escoai/facbf936-cb42-45b3-8dd7-c6a84203b62a/length.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0b3a5a47864c30653e6f7a237ebf2b02ede483e75377c439e672417c42c0c6e0
3
+ size 24000
app/data/EduGBERT_cos_escoai/facbf936-cb42-45b3-8dd7-c6a84203b62a/link_lists.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1c637f61ab8c840f644f6a5d9c6a6290a5374997e1d3973225841d93c3287e2b
3
+ size 51880
app/data/ba_course_emb/3b822eb4-09a8-4577-a432-58654a23254b/data_level0.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:47834eba6ccc20700681289c6198c4c2f6824715763ad8be26953ba7748194c2
3
+ size 4236000
app/data/ba_course_emb/3b822eb4-09a8-4577-a432-58654a23254b/header.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f03662fe47efebf7a7d3d9b8b5341473d199915b17d7cf59ed78ea2bf986a168
3
+ size 100
app/data/ba_course_emb/3b822eb4-09a8-4577-a432-58654a23254b/index_metadata.pickle ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:48c1e4f32bde583bd6a0a03cf194ff0021751797d9ef32b210427f3e75437c2a
3
+ size 55974
app/data/ba_course_emb/3b822eb4-09a8-4577-a432-58654a23254b/length.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:48bff893dbcb9255890ac7dc0d30b26e4f781f6ae5c77debb693fd04c68c2d79
3
+ size 4000
app/data/ba_course_emb/3b822eb4-09a8-4577-a432-58654a23254b/link_lists.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b01c1d32064620009fd560852b48bdb9eebd0ba3050b1b2ca553918602715771
3
+ size 8420
app/data/ba_course_emb/chroma.sqlite3 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:166d64d9df06a4494ad9fff6e9b37f8df33f710514d69c16b9fc1984d6066b6d
3
+ size 38019072
app/data/berufe_info.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aa16cd91e4ae94d8044b4421aa2392b4918245896745677ba471770eeb0f69bc
3
+ size 11483882
app/data/kldb_isco_lookup.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6918ef9566992565deb01b1ca79b31b50c21130e2dcd42b9a21e116b7ddaa7a9
3
+ size 363652
app/data/multilingual-e5_cos_escoai/c4b78092-a1fa-41fc-b6b0-8fa36ef29576/data_level0.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5febe5913f14ef4735dcb9551b6f86e9d183b2fac0a2c1ac980fee33a6443534
3
+ size 25416000
app/data/multilingual-e5_cos_escoai/c4b78092-a1fa-41fc-b6b0-8fa36ef29576/header.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7ffd5ef9bb9ca1a421e18216d2997eea01192524736b484b49cd43b46e42e31f
3
+ size 100
app/data/multilingual-e5_cos_escoai/c4b78092-a1fa-41fc-b6b0-8fa36ef29576/index_metadata.pickle ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9f91c55d437d031ea4094c2a2c3e35e0aec42eb5d5f60821d7b164f714228dbe
3
+ size 346049
app/data/multilingual-e5_cos_escoai/c4b78092-a1fa-41fc-b6b0-8fa36ef29576/length.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1d3b0223a8c3ef4338003b319ca7fe73feb39b300f9dc618e6a9373083b59f6a
3
+ size 24000
app/data/multilingual-e5_cos_escoai/c4b78092-a1fa-41fc-b6b0-8fa36ef29576/link_lists.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:95c0583c98e9b4f9d1694015eb8a2bd782fca6f6ad14d7580ba83b1ced231e26
3
+ size 51880
app/data/multilingual-e5_cos_escoai/chroma.sqlite3 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e5e1bfab026e95b44810b2284b3f82d10871fcd4242aa122e31fa84e27195209
3
+ size 103567360
app/data/occupations_de.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4734be24b6ce55415c5e872dbe91134541189935c776765a4ef96aa35db4b282
3
+ size 2478772
app/data/redis_data.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0a01eb57f05a046beff74f8bdf2126a24a3996c0ffdeaaa444c423aaa40ee2e4
3
+ size 117289041
app/data_process.py ADDED
@@ -0,0 +1,123 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from config import *
2
+ import pandas as pd
3
+ import requests
4
+
5
+
6
+
7
+
8
def build_skill_query(skill):
    """Build one free-text embedding query string for an ESCO skill payload.

    Concatenates the German/English preferred labels and descriptions, then
    appends broader-concept names and alternative labels when present.

    Args:
        skill: ESCO skill dict with 'preferredLabel', 'description',
            'borderConcept' and 'alternativeLabel' entries.

    Returns:
        str: Space-separated query text.
    """
    skill_query = (skill['preferredLabel']['de'] + " " + skill['preferredLabel']['en']
                   + " " + skill['description']['de'] + " " + skill['description']['en'])
    # Optional enrichments; each is a (possibly empty) list of strings.
    # Replaces four copy-pasted if-blocks with dead `else: pass` branches.
    for values in (skill['borderConcept']['broaderHierarchyConcept'],
                   skill['borderConcept']['broaderSkill'],
                   skill['alternativeLabel']['de'],
                   skill['alternativeLabel']['en']):
        if values:
            skill_query += " " + ", ".join(values)
    return skill_query
31
+
32
+
33
+
34
def build_occupation_query(occupation):
    """Extract name, description and a combined query string from an ESCO occupation.

    Args:
        occupation: ESCO occupation JSON dict with 'preferredLabel' and
            'description' entries.

    Returns:
        tuple[str, str, str]: (German name, German description, query text).
        The query concatenates the German name, the English name and the German
        description twice — kept as-is for compatibility with the stored
        embeddings.
    """
    occupation_name_de = occupation['preferredLabel'].get('de', '')
    # Bug fix: the default must be a dict, not '' — calling .get('literal') on
    # a str raised AttributeError whenever the German description was missing.
    occupation_dsp = occupation['description'].get('de', {}).get('literal', '')
    occupation_query = (occupation_name_de + " " + occupation['preferredLabel'].get('en', '')
                        + " " + occupation_dsp + " " + occupation_dsp)
    return occupation_name_de, occupation_dsp, occupation_query
46
+
47
def get_occupations_from_csv(file_path):
    """Load occupation labels from a CSV and map them to ESCO concept URIs.

    Args:
        file_path: Path (or file-like buffer) of a CSV containing at least the
            columns 'preferredLabel' and 'conceptUri'.

    Returns:
        dict: preferredLabel -> conceptUri (for duplicate labels, last wins).
    """
    frame = pd.read_csv(file_path)
    return dict(zip(frame['preferredLabel'], frame['conceptUri']))
51
+
52
+
53
def get_oauth_token():
    """Fetch an OAuth client-credentials token for the Arbeitsagentur API.

    Returns:
        str | None: Access token on success, otherwise None (error printed).
    """
    # API endpoint URL
    token_url = "https://rest.arbeitsagentur.de/oauth/gettoken_cc"

    # SECURITY NOTE: these fallback credentials were committed to the repo;
    # they should be rotated and supplied exclusively via the environment.
    payload = {
        "client_id": os.getenv("BA_CLIENT_ID", "38053956-6618-4953-b670-b4ae7a2360b1"),
        "client_secret": os.getenv("BA_CLIENT_SECRET", "c385073c-3b97-42a9-b916-08fd8a5d1795"),
        "grant_type": "client_credentials",
    }

    # Send request and get response
    response = requests.post(token_url, data=payload)

    if response.status_code == 200:
        return response.json().get("access_token")
    print("Token request failed:", response.text)
    return None
78
+
79
def query_weiterbildungssuche_api(token, params):
    """GET the BA 'bildungsangebot' (course offers) endpoint.

    Args:
        token: OAuth bearer token from get_oauth_token().
        params: Query parameters forwarded verbatim to requests.

    Returns:
        dict | None: Parsed JSON on HTTP 200, otherwise None (error printed).
    """
    endpoint = "https://rest.arbeitsagentur.de/infosysbub/wbsuche/pc/v2/bildungsangebot"
    auth_headers = {"Authorization": f"Bearer {token}"}

    response = requests.get(endpoint, headers=auth_headers, params=params)

    if response.status_code != 200:
        print("API request failed:", response.text)
        return None
    return response.json()
96
+
97
def get_courses_from_BA(ids):
    """Query the BA Weiterbildungssuche API for courses matching occupation ids.

    Args:
        ids: Iterable of BA occupation id strings.

    Returns:
        dict | None: Parsed API response, or None when no token could be
        obtained or the query failed (the original returned None implicitly
        when the token request failed).
    """
    token = get_oauth_token()
    if not token:
        return None
    params = {
        "ids": list(ids)
    }
    # Use the token to query the course-search API.
    return query_weiterbildungssuche_api(token, params)
108
+
109
+
110
def get_occupation_detial(oc_uri):
    """Fetch full occupation details from the ESCO resource API.

    Note: the name keeps the original 'detial' typo because callers import it
    under this name.

    Args:
        oc_uri: ESCO occupation concept URI.

    Returns:
        dict | None: Occupation JSON, or None when the request fails.
    """
    uri = f'https://ec.europa.eu/esco/api/resource/occupation?selectedVersion=v1.0.9&language=en&uri={oc_uri}'
    try:
        occupation_response = requests.get(uri)
        occupation_response.raise_for_status()
        return occupation_response.json()
    except requests.RequestException as e:
        # Bug fix: the message previously said "skill details" (copy-paste).
        print(f"Error while fetching occupation details: {e}")
        return None
120
+
121
+
122
+
123
+
app/embedding_setup.py ADDED
@@ -0,0 +1,204 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langchain_community.vectorstores import Chroma
2
+ from langchain_community.embeddings import HuggingFaceBgeEmbeddings
3
+
4
+ from langchain.docstore.document import Document
5
+ from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig,BitsAndBytesConfig
6
+ from peft import PeftModel
7
+ from config import *
8
+ import os
9
+ import torch
10
+ from predibase import Predibase, FinetuningConfig, DeploymentConfig
11
+ pb_auth = os.environ.get("pb_token")
12
+ pb = Predibase(api_token=pb_auth)
13
+
14
+ if torch.cuda.is_available():
15
+ device = "cuda"
16
+ else:
17
+ device = "cpu"
18
+
19
+ os.environ['CURL_CA_BUNDLE'] = ""
20
+ embedding_int = HuggingFaceBgeEmbeddings(
21
+ model_name=MODEL_NAME,
22
+ encode_kwargs=ENCODE_KWARGS,
23
+ query_instruction=QUERY_INSTRUCTION
24
+ )
25
+
26
+ embedding_sim = HuggingFaceBgeEmbeddings(
27
+ model_name=MODEL_NAME,
28
+ encode_kwargs=ENCODE_KWARGS,
29
+ query_instruction='Retrieve semantically similar text.'
30
+ )
31
+
32
+ db = Chroma(persist_directory=PERSIST_DIRECTORY, embedding_function=embedding_int)
33
+ retriever = db.as_retriever(search_kwargs={"k": TOP_K})
34
+
35
+
36
+
37
+ lora_weights_rec = REC_LORA_MODEL
38
+ lora_weights_exp = EXP_LORA_MODEL
39
+ hf_auth = os.environ.get("hf_token")
40
+
41
+ tokenizer = AutoTokenizer.from_pretrained(LLM_MODEL, token=hf_auth)
42
+
43
+
44
+ first_token = 'First'
45
+ second_token = 'Second'
46
+ # 获取token的ID
47
+ first_id = tokenizer.convert_tokens_to_ids(first_token)
48
+ second_id = tokenizer.convert_tokens_to_ids(second_token)
49
+ model = AutoModelForCausalLM.from_pretrained(
50
+ LLM_MODEL,
51
+ load_in_4bit=True,
52
+ torch_dtype=torch.float16,
53
+ token=hf_auth,
54
+ )
55
+
56
+ rec_adapter = PeftModel.from_pretrained(
57
+ model,
58
+ lora_weights_rec
59
+ )
60
+
61
+
62
+ tokenizer.padding_side = "left"
63
+ # unwind broken decapoda-research config
64
+ #model.half() # seems to fix bugs for some users.
65
+ rec_adapter.eval()
66
+
67
+ rec_adapter.config.pad_token_id = tokenizer.pad_token_id = 0 # unk
68
+ rec_adapter.config.bos_token_id = 1
69
+ rec_adapter.config.eos_token_id = 2
70
+
71
+
72
+
73
def generate_prompt(target_occupation, skill_gap, courses):
    # Prompt template for the pairwise ranking adapter; the model is fine-tuned
    # to answer with the single token 'First' or 'Second'. The template text is
    # part of the fine-tune contract — do not reformat it.
    return f"""
### Instruction:
"As an education expert, you have been provided with a target occupation, a skill gap, and information on two candidate courses. Your task is to determine which course better matches the target occupation and skill gap. Please respond with 'First' or 'Second' to indicate your recommendation.

### Input:
Target Occupation: {target_occupation}
Skill Gap: {skill_gap}
candidate courses: {courses}

### Response:
"""
85
+ '''
86
+ prompt_re = ChatPromptTemplate.from_template(template_re)
87
+ chain_re = (
88
+ runnable
89
+ | prompt_re
90
+ )
91
+ '''
92
def evaluate(
    prompt=None,
    temperature=0,
    top_p=1.0,
    top_k=40,
    num_beams=1,
    max_new_tokens=30,
    batch_size=1,
    **kwargs,
):
    """Run the ranking adapter on a list of prompts.

    Args:
        prompt: List of prompt strings (a batch) passed to the tokenizer.
        temperature/top_p/top_k/num_beams: Generation hyperparameters.
        max_new_tokens: Generation length cap.
        batch_size: Accepted for compatibility; not forwarded (see commented line).

    Returns:
        tuple: (decoded response texts, per-prompt [P(First), P(Second)] after
        softmax over just those two answer-token logits).
    """
    inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True).to(device)
    generation_config = GenerationConfig(
        temperature=temperature,
        top_p=top_p,
        top_k=top_k,
        num_beams=num_beams,
        **kwargs,
    )
    with torch.no_grad():
        rec_adapter.to(device)
        generation_output = rec_adapter.generate(
            **inputs,
            generation_config=generation_config,
            return_dict_in_generate=True,
            output_scores=True,
            max_new_tokens=max_new_tokens,
            # batch_size=batch_size,
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=tokenizer.eos_token_id,
        )
    # Distribution of the FIRST generated token, restricted to the two answer
    # tokens 'First'/'Second' and re-normalized.
    scores = generation_output.scores[0].softmax(dim=-1)
    logits = torch.tensor(scores[:, [first_id, second_id]], dtype=torch.float32).softmax(dim=-1)
    s = generation_output.sequences
    output = tokenizer.batch_decode(s, skip_special_tokens=True)
    # Keep only the text after the '### Response:' marker of the template.
    output = [_.split('Response:\n')[-1] for _ in output]
    return output, logits.tolist()
129
+
130
def compare_docs_with_context(doc_a, doc_b, target_occupation_name, target_occupation_dsp, skill_gap):
    """Pairwise comparator for course documents, decided by the ranking LLM.

    Builds a 'First vs Second' prompt from both courses' names and learning
    outcomes (each truncated to 1500 chars) and reads the model's probability
    for the 'First'/'Second' answer tokens.

    Returns:
        int: 1 if doc_a ranks higher, -1 if doc_b ranks higher, 0 on a tie.
        Intended for use with functools.cmp_to_key.
    """
    # Removed: dead commented-out variant (contained a `Sdoc_b` typo) and a
    # no-op `skill_gap = skill_gap` self-assignment.
    courses = f"First: name: {doc_a.metadata['name']} learning outcomes:{doc_a.metadata['skills'][:1500]} Second: name: {doc_b.metadata['name']} learning outcomes:{doc_b.metadata['skills'][:1500]}"
    target_occupation = f"name: {target_occupation_name} description: {target_occupation_dsp[:1500]}"
    prompt = generate_prompt(target_occupation, skill_gap, courses)
    output, logit = evaluate([prompt])
    print(output, logit)
    # logit[0] holds [P(First), P(Second)] for the single prompt.
    if logit[0][0] > logit[0][1]:
        return 1   # doc_a should come before doc_b
    elif logit[0][0] < logit[0][1]:
        return -1  # doc_a should come after doc_b
    else:
        return 0   # Consider them equal if the response is unclear
147
+
148
+
149
#-----------------------------------------explanation-------------------------------------

# Hosted Predibase deployment serving the base Llama-3-8B-Instruct model;
# the explanation LoRA adapter is attached per request in generate_exp().
lorax_client = pb.deployments.client("llama-3-8b-instruct") # Insert deployment name here
def generate_prompt_exp(input_text):
    # Prompt template for the explanation adapter. The template text is part of
    # the fine-tune contract — do not reformat it.
    return f"""
### Instruction:
As an education expert, you have been provided with information on target occupations and skills gaps, along with recommended course details. Your task is to explain the recommendation in German, focusing on how the course's learning outcomes and target skills relate to the identified skills gaps.

### Input:
{input_text}

### Response:
"""
162
+
163
def generate_exp(
    prompt=None,
    temperature=0.2,
    top_p=1.0,
    top_k=40,
    num_beams=1,
    max_new_tokens=512,
    batch_size=1,
    do_sample=True,
    **kwargs,
):
    """Generate a German recommendation explanation via the hosted Predibase
    deployment, applying the explanation LoRA adapter from the hub.

    Note: num_beams, batch_size and **kwargs are accepted for signature
    compatibility with evaluate() but are not forwarded to the client.

    Returns:
        str: The generated explanation text.
    """
    resp = lorax_client.generate(prompt,adapter_id="wt3639/Llama-3-8B-Instruct_RecExp_lora", adapter_source='hub', max_new_tokens=max_new_tokens, temperature=temperature, top_p=top_p, top_k=top_k, do_sample=do_sample)

    return resp.generated_text
179
+
180
+
181
def find_similar_occupation(target_occupation_query, berufe, top_k, similarity_func):
    """Embed BA occupations on the fly and return the top-k most similar ones.

    Args:
        target_occupation_query: Free-text description of the target occupation.
        berufe: DataFrame of BA occupations with 'short name', 'full name',
            'description', 'id' and 'entry requirements' columns.
        top_k: Number of documents to return.
        similarity_func: Chroma hnsw space name, e.g. 'cosine'.

    Returns:
        list: Top-k langchain Documents, most similar first.
    """
    # One Document per occupation; page_content is the text that gets embedded
    # and searched, the metadata is carried through for downstream use.
    docs = [
        Document(
            page_content=row['short name'] + ' ' + row['full name'] + ' ' + row['description'],
            metadata={
                "id": row["id"],
                "name": row['short name'],
                "description": row["description"],
                "entry_requirements": row["entry requirements"],
            },
        )
        for _, row in berufe.iterrows()
    ]

    # Ad-hoc in-memory vector store using the similarity-tuned embedder.
    temp_store = Chroma.from_documents(documents=docs, embedding=embedding_sim,
                                       collection_metadata={"hnsw:space": similarity_func})
    temp_retriever = temp_store.as_retriever(search_kwargs={"k": top_k})
    return temp_retriever.get_relevant_documents(target_occupation_query)
requirements.txt ADDED
Binary file (348 Bytes). View file