trangannh committed on
Commit
401af4a
1 Parent(s): 31ffe55

Upload Job_Recommendation_System.ipynb

Browse files
Files changed (1) hide show
  1. Job_Recommendation_System.ipynb +884 -0
Job_Recommendation_System.ipynb ADDED
@@ -0,0 +1,884 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "metadata": {
6
+ "id": "_NrjL2ccH3yp"
7
+ },
8
+ "source": [
9
+ "RECOMMENDATION MODEL"
10
+ ]
11
+ },
12
+ {
13
+ "cell_type": "code",
14
+ "execution_count": 1,
15
+ "metadata": {
16
+ "id": "IZfnA6W_GDyf"
17
+ },
18
+ "outputs": [],
19
+ "source": [
20
+ "import numpy as np\n",
21
+ "import pandas as pd\n",
22
+ "from sklearn.feature_extraction.text import TfidfVectorizer\n",
23
+ "from sklearn.metrics.pairwise import cosine_similarity"
24
+ ]
25
+ },
26
+ {
27
+ "cell_type": "code",
28
+ "execution_count": 2,
29
+ "metadata": {
30
+ "id": "MV-7idG1F_NU"
31
+ },
32
+ "outputs": [],
33
+ "source": [
34
+ "# Load the train/test applicant data and the jobs data from CSV files\n",
35
+ "def create_mock_data():\n",
36
+ " users_data = \"1st_train.csv\"\n",
37
+ " # \"/content/sample_data/train_train.csv\"\n",
38
+ " applicants = pd.read_csv(users_data)\n",
39
+ "\n",
40
+ " jobs_data = \"jobs_data.csv\"\n",
41
+ " companies = pd.read_csv(jobs_data)\n",
42
+ "\n",
43
+ " train_applicants = applicants\n",
44
+ " test_data = \"1st_test.csv\"\n",
45
+ " # \"/content/sample_data/test_train.csv\"\n",
46
+ " test_applicants = pd.read_csv(test_data)\n",
47
+ "\n",
48
+ " return train_applicants, test_applicants, companies"
49
+ ]
50
+ },
51
+ {
52
+ "cell_type": "code",
53
+ "execution_count": 6,
54
+ "metadata": {
55
+ "id": "4VTpcXhz-5TN"
56
+ },
57
+ "outputs": [],
58
+ "source": [
59
+ "# @title\n",
60
+ "# # Mock data creation\n",
61
+ "# def create_mock_data():\n",
62
+ "# users_data = \"/content/sample_data/rematch_train_candidate_field.csv\"\n",
63
+ "# applicants = pd.read_csv(users_data)\n",
64
+ "\n",
65
+ "# jobs_data = \"/content/sample_data/jobs_data.csv\"\n",
66
+ "# companies = pd.read_csv(jobs_data)\n",
67
+ "\n",
68
+ "# # train_applicants = applicants\n",
69
+ "# # test_data = \"/content/sample_data/test_data_new.csv\"\n",
70
+ "# # test_applicants = pd.read_csv(test_data)\n",
71
+ "\n",
72
+ "# train_applicants = applicants[:10000]\n",
73
+ "# test_applicants = applicants[10000:]\n",
74
+ "\n",
75
+ "# return train_applicants, test_applicants, companies"
76
+ ]
77
+ },
78
+ {
79
+ "cell_type": "code",
80
+ "execution_count": 3,
81
+ "metadata": {
82
+ "id": "wF1oZ6Ez96BE"
83
+ },
84
+ "outputs": [],
85
+ "source": [
86
+ "train_user, test_user, jobs = create_mock_data()"
87
+ ]
88
+ },
89
+ {
90
+ "cell_type": "code",
91
+ "execution_count": 4,
92
+ "metadata": {
93
+ "colab": {
94
+ "base_uri": "https://localhost:8080/"
95
+ },
96
+ "id": "Gj8tJNrph8Go",
97
+ "outputId": "a44b8cf0-a56f-4cd2-bbda-ca9bcabf35a0"
98
+ },
99
+ "outputs": [
100
+ {
101
+ "name": "stdout",
102
+ "output_type": "stream",
103
+ "text": [
104
+ "Training data size: 18979\n",
105
+ "Test data size: 4745\n"
106
+ ]
107
+ }
108
+ ],
109
+ "source": [
110
+ "print(\"Training data size:\", train_user.shape[0])\n",
111
+ "print(\"Test data size:\", test_user.shape[0])"
112
+ ]
113
+ },
114
+ {
115
+ "cell_type": "code",
116
+ "execution_count": 5,
117
+ "metadata": {
118
+ "id": "d0XY4al7K0UT"
119
+ },
120
+ "outputs": [],
121
+ "source": [
122
+ "list_hard_skill = [test_user[\"hard_skill\"].iloc[i].replace(\"[\", \"\").replace(\"]\", \"\").replace(\"'\", \"\") for i in range(len(test_user))]\n",
123
+ "list_soft_skill = [test_user[\"soft_skill\"].iloc[i].replace(\"[\", \"\").replace(\"]\", \"\").replace(\"'\", \"\") for i in range(len(test_user))]"
124
+ ]
125
+ },
126
+ {
127
+ "cell_type": "code",
128
+ "execution_count": 6,
129
+ "metadata": {
130
+ "colab": {
131
+ "base_uri": "https://localhost:8080/",
132
+ "height": 213
133
+ },
134
+ "id": "JOZ9_NlLK8uS",
135
+ "outputId": "17d09f55-192f-4486-bb47-b56f525d44a3"
136
+ },
137
+ "outputs": [
138
+ {
139
+ "data": {
140
+ "text/html": [
141
+ "<div>\n",
142
+ "<style scoped>\n",
143
+ " .dataframe tbody tr th:only-of-type {\n",
144
+ " vertical-align: middle;\n",
145
+ " }\n",
146
+ "\n",
147
+ " .dataframe tbody tr th {\n",
148
+ " vertical-align: top;\n",
149
+ " }\n",
150
+ "\n",
151
+ " .dataframe thead th {\n",
152
+ " text-align: right;\n",
153
+ " }\n",
154
+ "</style>\n",
155
+ "<table border=\"1\" class=\"dataframe\">\n",
156
+ " <thead>\n",
157
+ " <tr style=\"text-align: right;\">\n",
158
+ " <th></th>\n",
159
+ " <th>User ID</th>\n",
160
+ " <th>candidate_field</th>\n",
161
+ " <th>label</th>\n",
162
+ " <th>hard_skill</th>\n",
163
+ " <th>soft_skill</th>\n",
164
+ " <th>final_hard_skill</th>\n",
165
+ " <th>final_soft_skill</th>\n",
166
+ " </tr>\n",
167
+ " </thead>\n",
168
+ " <tbody>\n",
169
+ " <tr>\n",
170
+ " <th>0</th>\n",
171
+ " <td>14649</td>\n",
172
+ " <td>it jobs</td>\n",
173
+ " <td>1</td>\n",
174
+ " <td>['act', 'advertising sales', 'algorithms', 'bu...</td>\n",
175
+ " <td>['collaboration', 'decision making', 'operatio...</td>\n",
176
+ " <td>act, advertising sales, algorithms, business, ...</td>\n",
177
+ " <td>collaboration, decision making, operations, wr...</td>\n",
178
+ " </tr>\n",
179
+ " <tr>\n",
180
+ " <th>1</th>\n",
181
+ " <td>801</td>\n",
182
+ " <td>marketing</td>\n",
183
+ " <td>0</td>\n",
184
+ " <td>['act', 'brand communication', 'business', 'bu...</td>\n",
185
+ " <td>['collaboration', 'customer service', 'managem...</td>\n",
186
+ " <td>act, brand communication, business, business d...</td>\n",
187
+ " <td>collaboration, customer service, management</td>\n",
188
+ " </tr>\n",
189
+ " <tr>\n",
190
+ " <th>2</th>\n",
191
+ " <td>4393</td>\n",
192
+ " <td>accounting</td>\n",
193
+ " <td>0</td>\n",
194
+ " <td>['application', 'balance sheet', 'finance', 'p...</td>\n",
195
+ " <td>['filing', 'management']</td>\n",
196
+ " <td>application, balance sheet, finance, property ...</td>\n",
197
+ " <td>filing, management</td>\n",
198
+ " </tr>\n",
199
+ " </tbody>\n",
200
+ "</table>\n",
201
+ "</div>"
202
+ ],
203
+ "text/plain": [
204
+ " User ID candidate_field label \\\n",
205
+ "0 14649 it jobs 1 \n",
206
+ "1 801 marketing 0 \n",
207
+ "2 4393 accounting 0 \n",
208
+ "\n",
209
+ " hard_skill \\\n",
210
+ "0 ['act', 'advertising sales', 'algorithms', 'bu... \n",
211
+ "1 ['act', 'brand communication', 'business', 'bu... \n",
212
+ "2 ['application', 'balance sheet', 'finance', 'p... \n",
213
+ "\n",
214
+ " soft_skill \\\n",
215
+ "0 ['collaboration', 'decision making', 'operatio... \n",
216
+ "1 ['collaboration', 'customer service', 'managem... \n",
217
+ "2 ['filing', 'management'] \n",
218
+ "\n",
219
+ " final_hard_skill \\\n",
220
+ "0 act, advertising sales, algorithms, business, ... \n",
221
+ "1 act, brand communication, business, business d... \n",
222
+ "2 application, balance sheet, finance, property ... \n",
223
+ "\n",
224
+ " final_soft_skill \n",
225
+ "0 collaboration, decision making, operations, wr... \n",
226
+ "1 collaboration, customer service, management \n",
227
+ "2 filing, management "
228
+ ]
229
+ },
230
+ "execution_count": 6,
231
+ "metadata": {},
232
+ "output_type": "execute_result"
233
+ }
234
+ ],
235
+ "source": [
236
+ "test_user[\"final_hard_skill\"] = pd.DataFrame(list_hard_skill)\n",
237
+ "test_user[\"final_soft_skill\"] = pd.DataFrame(list_soft_skill)\n",
238
+ "test_user.head(3)"
239
+ ]
240
+ },
241
+ {
242
+ "cell_type": "code",
243
+ "execution_count": 7,
244
+ "metadata": {
245
+ "id": "kYbjYsDjABda"
246
+ },
247
+ "outputs": [],
248
+ "source": [
249
+ "list_hard_skill = [train_user[\"hard_skill\"].iloc[i].replace(\"[\", \"\").replace(\"]\", \"\").replace(\"'\", \"\") for i in range(len(train_user))]\n",
250
+ "list_soft_skill = [train_user[\"soft_skill\"].iloc[i].replace(\"[\", \"\").replace(\"]\", \"\").replace(\"'\", \"\") for i in range(len(train_user))]"
251
+ ]
252
+ },
253
+ {
254
+ "cell_type": "code",
255
+ "execution_count": 8,
256
+ "metadata": {
257
+ "colab": {
258
+ "base_uri": "https://localhost:8080/",
259
+ "height": 213
260
+ },
261
+ "id": "GC8bn3cjB8D5",
262
+ "outputId": "436e843d-425e-4ce2-e551-e4f249bdd10b"
263
+ },
264
+ "outputs": [
265
+ {
266
+ "data": {
267
+ "text/html": [
268
+ "<div>\n",
269
+ "<style scoped>\n",
270
+ " .dataframe tbody tr th:only-of-type {\n",
271
+ " vertical-align: middle;\n",
272
+ " }\n",
273
+ "\n",
274
+ " .dataframe tbody tr th {\n",
275
+ " vertical-align: top;\n",
276
+ " }\n",
277
+ "\n",
278
+ " .dataframe thead th {\n",
279
+ " text-align: right;\n",
280
+ " }\n",
281
+ "</style>\n",
282
+ "<table border=\"1\" class=\"dataframe\">\n",
283
+ " <thead>\n",
284
+ " <tr style=\"text-align: right;\">\n",
285
+ " <th></th>\n",
286
+ " <th>User ID</th>\n",
287
+ " <th>candidate_field</th>\n",
288
+ " <th>label</th>\n",
289
+ " <th>hard_skill</th>\n",
290
+ " <th>soft_skill</th>\n",
291
+ " <th>final_hard_skill</th>\n",
292
+ " <th>final_soft_skill</th>\n",
293
+ " </tr>\n",
294
+ " </thead>\n",
295
+ " <tbody>\n",
296
+ " <tr>\n",
297
+ " <th>0</th>\n",
298
+ " <td>3030</td>\n",
299
+ " <td>sales</td>\n",
300
+ " <td>0</td>\n",
301
+ " <td>['blogs', 'business', 'lead generation', 'mark...</td>\n",
302
+ " <td>['customer service', 'driven personality', 'ma...</td>\n",
303
+ " <td>blogs, business, lead generation, marketing st...</td>\n",
304
+ " <td>customer service, driven personality, manageme...</td>\n",
305
+ " </tr>\n",
306
+ " <tr>\n",
307
+ " <th>1</th>\n",
308
+ " <td>9702</td>\n",
309
+ " <td>administration &amp; office support</td>\n",
310
+ " <td>0</td>\n",
311
+ " <td>['business', 'draft', 'go', 'manufacturing', '...</td>\n",
312
+ " <td>['business acumen', 'communications', 'managem...</td>\n",
313
+ " <td>business, draft, go, manufacturing, office man...</td>\n",
314
+ " <td>business acumen, communications, management, o...</td>\n",
315
+ " </tr>\n",
316
+ " <tr>\n",
317
+ " <th>2</th>\n",
318
+ " <td>8606</td>\n",
319
+ " <td>retail &amp; consumer products</td>\n",
320
+ " <td>0</td>\n",
321
+ " <td>['gross profit', 'inventory', 'inventory manag...</td>\n",
322
+ " <td>['customer service', 'management']</td>\n",
323
+ " <td>gross profit, inventory, inventory management,...</td>\n",
324
+ " <td>customer service, management</td>\n",
325
+ " </tr>\n",
326
+ " </tbody>\n",
327
+ "</table>\n",
328
+ "</div>"
329
+ ],
330
+ "text/plain": [
331
+ " User ID candidate_field label \\\n",
332
+ "0 3030 sales 0 \n",
333
+ "1 9702 administration & office support 0 \n",
334
+ "2 8606 retail & consumer products 0 \n",
335
+ "\n",
336
+ " hard_skill \\\n",
337
+ "0 ['blogs', 'business', 'lead generation', 'mark... \n",
338
+ "1 ['business', 'draft', 'go', 'manufacturing', '... \n",
339
+ "2 ['gross profit', 'inventory', 'inventory manag... \n",
340
+ "\n",
341
+ " soft_skill \\\n",
342
+ "0 ['customer service', 'driven personality', 'ma... \n",
343
+ "1 ['business acumen', 'communications', 'managem... \n",
344
+ "2 ['customer service', 'management'] \n",
345
+ "\n",
346
+ " final_hard_skill \\\n",
347
+ "0 blogs, business, lead generation, marketing st... \n",
348
+ "1 business, draft, go, manufacturing, office man... \n",
349
+ "2 gross profit, inventory, inventory management,... \n",
350
+ "\n",
351
+ " final_soft_skill \n",
352
+ "0 customer service, driven personality, manageme... \n",
353
+ "1 business acumen, communications, management, o... \n",
354
+ "2 customer service, management "
355
+ ]
356
+ },
357
+ "execution_count": 8,
358
+ "metadata": {},
359
+ "output_type": "execute_result"
360
+ }
361
+ ],
362
+ "source": [
363
+ "train_user[\"final_hard_skill\"] = pd.DataFrame(list_hard_skill)\n",
364
+ "train_user[\"final_soft_skill\"] = pd.DataFrame(list_soft_skill)\n",
365
+ "train_user.head(3)"
366
+ ]
367
+ },
368
+ {
369
+ "cell_type": "code",
370
+ "execution_count": 9,
371
+ "metadata": {
372
+ "id": "znBy9q8XDcM7"
373
+ },
374
+ "outputs": [],
375
+ "source": [
376
+ "list_hard_skill = [jobs[\"Hard Skills\"].iloc[i].replace(\"[\", \"\").replace(\"]\", \"\").replace(\"'\", \"\") for i in range(len(jobs))]\n",
377
+ "list_soft_skill = [jobs[\"Soft Skills\"].iloc[i].replace(\"[\", \"\").replace(\"]\", \"\").replace(\"'\", \"\") for i in range(len(jobs))]"
378
+ ]
379
+ },
380
+ {
381
+ "cell_type": "code",
382
+ "execution_count": 10,
383
+ "metadata": {
384
+ "colab": {
385
+ "base_uri": "https://localhost:8080/",
386
+ "height": 213
387
+ },
388
+ "id": "knFii8o3EQmv",
389
+ "outputId": "47afb484-0765-4ad9-8765-d084673450ac"
390
+ },
391
+ "outputs": [
392
+ {
393
+ "data": {
394
+ "text/html": [
395
+ "<div>\n",
396
+ "<style scoped>\n",
397
+ " .dataframe tbody tr th:only-of-type {\n",
398
+ " vertical-align: middle;\n",
399
+ " }\n",
400
+ "\n",
401
+ " .dataframe tbody tr th {\n",
402
+ " vertical-align: top;\n",
403
+ " }\n",
404
+ "\n",
405
+ " .dataframe thead th {\n",
406
+ " text-align: right;\n",
407
+ " }\n",
408
+ "</style>\n",
409
+ "<table border=\"1\" class=\"dataframe\">\n",
410
+ " <thead>\n",
411
+ " <tr style=\"text-align: right;\">\n",
412
+ " <th></th>\n",
413
+ " <th>Job ID</th>\n",
414
+ " <th>Major</th>\n",
415
+ " <th>Hard Skills</th>\n",
416
+ " <th>Soft Skills</th>\n",
417
+ " <th>final_hard_skill</th>\n",
418
+ " <th>final_soft_skill</th>\n",
419
+ " </tr>\n",
420
+ " </thead>\n",
421
+ " <tbody>\n",
422
+ " <tr>\n",
423
+ " <th>0</th>\n",
424
+ " <td>1</td>\n",
425
+ " <td>accounting</td>\n",
426
+ " <td>['business', 'finance', 'excel', 'tax', 'servi...</td>\n",
427
+ " <td>['management', 'planning', 'operations', 'lead...</td>\n",
428
+ " <td>business, finance, excel, tax, service, data, ...</td>\n",
429
+ " <td>management, planning, operations, leadership, ...</td>\n",
430
+ " </tr>\n",
431
+ " <tr>\n",
432
+ " <th>1</th>\n",
433
+ " <td>2</td>\n",
434
+ " <td>administration &amp; office support</td>\n",
435
+ " <td>['service', 'business', 'data', 'excel', 'appl...</td>\n",
436
+ " <td>['management', 'customer service', 'microsoft ...</td>\n",
437
+ " <td>service, business, data, excel, application, s...</td>\n",
438
+ " <td>management, customer service, microsoft office...</td>\n",
439
+ " </tr>\n",
440
+ " <tr>\n",
441
+ " <th>2</th>\n",
442
+ " <td>3</td>\n",
443
+ " <td>advertising, arts &amp; media</td>\n",
444
+ " <td>['business', 'digital', 'sales', 'service', 'a...</td>\n",
445
+ " <td>['management', 'social media', 'writing', 'com...</td>\n",
446
+ " <td>business, digital, sales, service, application...</td>\n",
447
+ " <td>management, social media, writing, communicati...</td>\n",
448
+ " </tr>\n",
449
+ " </tbody>\n",
450
+ "</table>\n",
451
+ "</div>"
452
+ ],
453
+ "text/plain": [
454
+ " Job ID Major \\\n",
455
+ "0 1 accounting \n",
456
+ "1 2 administration & office support \n",
457
+ "2 3 advertising, arts & media \n",
458
+ "\n",
459
+ " Hard Skills \\\n",
460
+ "0 ['business', 'finance', 'excel', 'tax', 'servi... \n",
461
+ "1 ['service', 'business', 'data', 'excel', 'appl... \n",
462
+ "2 ['business', 'digital', 'sales', 'service', 'a... \n",
463
+ "\n",
464
+ " Soft Skills \\\n",
465
+ "0 ['management', 'planning', 'operations', 'lead... \n",
466
+ "1 ['management', 'customer service', 'microsoft ... \n",
467
+ "2 ['management', 'social media', 'writing', 'com... \n",
468
+ "\n",
469
+ " final_hard_skill \\\n",
470
+ "0 business, finance, excel, tax, service, data, ... \n",
471
+ "1 service, business, data, excel, application, s... \n",
472
+ "2 business, digital, sales, service, application... \n",
473
+ "\n",
474
+ " final_soft_skill \n",
475
+ "0 management, planning, operations, leadership, ... \n",
476
+ "1 management, customer service, microsoft office... \n",
477
+ "2 management, social media, writing, communicati... "
478
+ ]
479
+ },
480
+ "execution_count": 10,
481
+ "metadata": {},
482
+ "output_type": "execute_result"
483
+ }
484
+ ],
485
+ "source": [
486
+ "jobs[\"final_hard_skill\"] = pd.DataFrame(list_hard_skill)\n",
487
+ "jobs[\"final_soft_skill\"] = pd.DataFrame(list_soft_skill)\n",
488
+ "jobs.head(3)"
489
+ ]
490
+ },
491
+ {
492
+ "cell_type": "code",
493
+ "execution_count": 14,
494
+ "metadata": {
495
+ "id": "wiDiHL6lStnd"
496
+ },
497
+ "outputs": [],
498
+ "source": [
499
+ "# Feature Engineering\n",
500
+ "def feature_engineering(applicants, companies):\n",
501
+ " # Vectorize skills and majors\n",
502
+ " tfidf_vectorizer_skills = TfidfVectorizer()\n",
503
+ " tfidf_vectorizer_majors = TfidfVectorizer()\n",
504
+ "\n",
505
+ " all_skills = pd.concat([applicants['final_hard_skill'], applicants['final_soft_skill'],\n",
506
+ " companies['final_hard_skill'], companies['final_soft_skill']])\n",
507
+ " all_majors = pd.concat([applicants['candidate_field'], companies['Major']])\n",
508
+ "\n",
509
+ " all_skills_vectorized = tfidf_vectorizer_skills.fit_transform(all_skills)\n",
510
+ " all_majors_vectorized = tfidf_vectorizer_majors.fit_transform(all_majors)\n",
511
+ "\n",
512
+ " num_applicants = len(applicants)\n",
513
+ " num_companies = len(companies)\n",
514
+ "\n",
515
+ " # Split the TF-IDF vectors back into applicants and companies\n",
516
+ " applicants_skills_vectorized = all_skills_vectorized[:num_applicants*2] # because each applicant has 2 skill entries\n",
517
+ " companies_skills_vectorized = all_skills_vectorized[num_applicants*2:]\n",
518
+ "\n",
519
+ " applicants_majors_vectorized = all_majors_vectorized[:num_applicants]\n",
520
+ " companies_majors_vectorized = all_majors_vectorized[num_applicants:]\n",
521
+ "\n",
522
+ " return (applicants_skills_vectorized, applicants_majors_vectorized,\n",
523
+ " companies_skills_vectorized, companies_majors_vectorized, tfidf_vectorizer_skills, tfidf_vectorizer_majors)"
524
+ ]
525
+ },
526
+ {
527
+ "cell_type": "code",
528
+ "execution_count": 15,
529
+ "metadata": {
530
+ "id": "THM0mszQGNyD"
531
+ },
532
+ "outputs": [],
533
+ "source": [
534
+ "def compute_similarity(applicants_skills_vectorized, applicants_majors_vectorized,\n",
535
+ " companies_skills_vectorized, companies_majors_vectorized):\n",
536
+ " # Calculate similarity based on skills (averaging hard and soft skills similarities)\n",
537
+ " applicants_skills = (applicants_skills_vectorized[0::2] + applicants_skills_vectorized[1::2]) / 2\n",
538
+ " companies_skills = (companies_skills_vectorized[0::2] + companies_skills_vectorized[1::2]) / 2\n",
539
+ "\n",
540
+ " skills_similarity = cosine_similarity(applicants_skills, companies_skills)\n",
541
+ "\n",
542
+ " # Calculate similarity based on majors\n",
543
+ " majors_similarity = cosine_similarity(applicants_majors_vectorized, companies_majors_vectorized)\n",
544
+ "\n",
545
+ " # Ensure the number of companies in both similarities is aligned\n",
546
+ " if skills_similarity.shape[1] != majors_similarity.shape[1]:\n",
547
+ " min_dim = min(skills_similarity.shape[1], majors_similarity.shape[1])\n",
548
+ " skills_similarity = skills_similarity[:, :min_dim]\n",
549
+ " majors_similarity = majors_similarity[:, :min_dim]\n",
550
+ "\n",
551
+ " # Combine these similarities (simple average for this example)\n",
552
+ " combined_similarity = (skills_similarity + majors_similarity) / 2\n",
553
+ " return combined_similarity"
554
+ ]
555
+ },
556
+ {
557
+ "cell_type": "code",
558
+ "execution_count": 16,
559
+ "metadata": {
560
+ "id": "ter3YAzxoelD"
561
+ },
562
+ "outputs": [],
563
+ "source": [
564
+ "# Recommendation Function\n",
565
+ "def recommend_jobs(applicants, companies, similarity_scores):\n",
566
+ " recommendations = {}\n",
567
+ " for i, applicant in enumerate(applicants['User ID']):\n",
568
+ " if i < len(similarity_scores):\n",
569
+ " sorted_company_indices = np.argsort(-similarity_scores[i]) # Descending sort of scores\n",
570
+ " recommended_companies = companies.iloc[sorted_company_indices]['Major'].values[:3] # Top 3 recommendations\n",
571
+ " recommendations[applicant] = recommended_companies\n",
572
+ " return recommendations\n",
573
+ "\n",
574
+ "# Testing and Evaluation Function\n",
575
+ "def print_recommendations(applicants, companies, recommendations):\n",
576
+ " # This is a mock function since we don't have ground truth to compare to.\n",
577
+ " # In a real scenario, we would compare against actual matches or use some form of feedback.\n",
578
+ " print(\"Recommendations for each applicant:\")\n",
579
+ " for applicant in recommendations:\n",
580
+ " print(f\"{applicant}: {recommendations[applicant]}\")"
581
+ ]
582
+ },
583
+ {
584
+ "cell_type": "code",
585
+ "execution_count": null,
586
+ "metadata": {
587
+ "colab": {
588
+ "base_uri": "https://localhost:8080/"
589
+ },
590
+ "collapsed": true,
591
+ "id": "Ajxp0xelIrl2",
592
+ "outputId": "08bafc5b-73cc-4695-924a-931840047dd5"
593
+ },
594
+ "outputs": [],
595
+ "source": [
596
+ "# Let's create and process the data, and compute recommendations\n",
597
+ "# train_applicants, test_applicants, companies = create_mock_data()\n",
598
+ "applicants_skills_vec, applicants_majors_vec, companies_skills_vec, companies_majors_vec, tfidf_vectorizer_skills, tfidf_vectorizer_majors = feature_engineering(train_user, jobs)\n",
599
+ "\n",
600
+ "similarity_scores = compute_similarity(applicants_skills_vec, applicants_majors_vec, companies_skills_vec, companies_majors_vec)\n",
601
+ "recommendations = recommend_jobs(test_user, jobs, similarity_scores)\n",
602
+ "\n",
603
+ "# Output the recommendations to observe the results\n",
604
+ "print_recommendations(test_user, jobs, recommendations)"
605
+ ]
606
+ },
607
+ {
608
+ "cell_type": "code",
609
+ "execution_count": 23,
610
+ "metadata": {
611
+ "colab": {
612
+ "base_uri": "https://localhost:8080/"
613
+ },
614
+ "id": "nj-HEdyJlYNY",
615
+ "outputId": "063b84bc-5717-4a0c-8367-939a054657bc"
616
+ },
617
+ "outputs": [
618
+ {
619
+ "name": "stdout",
620
+ "output_type": "stream",
621
+ "text": [
622
+ "Recommended Jobs based on input skills and major:\n",
623
+ "['sales' 'it jobs' 'administration & office support']\n"
624
+ ]
625
+ }
626
+ ],
627
+ "source": [
628
+ "# Process input skills and recommend jobs\n",
629
+ "def recommend_jobs_for_input_skills(input_hard_skills, input_soft_skills, input_major, jobs, tfidf_vectorizer_skills, tfidf_vectorizer_majors, companies_skills_vec, companies_majors_vec):\n",
630
+ " input_hard_skills_vec = tfidf_vectorizer_skills.transform([input_hard_skills])\n",
631
+ " input_soft_skills_vec = tfidf_vectorizer_skills.transform([input_soft_skills])\n",
632
+ " input_major_vec = tfidf_vectorizer_majors.transform([input_major])\n",
633
+ "\n",
634
+ " # Average the vectorized hard and soft skills\n",
635
+ " input_skills_vec = (input_hard_skills_vec + input_soft_skills_vec) / 2\n",
636
+ "\n",
637
+ " # Compute similarities\n",
638
+ " skills_similarity = cosine_similarity(input_skills_vec, companies_skills_vec)\n",
639
+ " major_similarity = cosine_similarity(input_major_vec, companies_majors_vec)\n",
640
+ "\n",
641
+ " # Ensure the number of companies in both similarities is aligned\n",
642
+ " if skills_similarity.shape[1] != major_similarity.shape[1]:\n",
643
+ " min_dim = min(skills_similarity.shape[1], major_similarity.shape[1])\n",
644
+ " skills_similarity = skills_similarity[:, :min_dim]\n",
645
+ " major_similarity = major_similarity[:, :min_dim]\n",
646
+ "\n",
647
+ " # Combine similarities\n",
648
+ " combined_similarity = (skills_similarity + major_similarity) / 2\n",
649
+ "\n",
650
+ " # Get top 3 job recommendations\n",
651
+ " sorted_company_indices = np.argsort(-combined_similarity[0])\n",
652
+ " recommended_companies = jobs.iloc[sorted_company_indices]['Major'].values[:3]\n",
653
+ "\n",
654
+ " return recommended_companies\n",
655
+ "\n",
656
+ "\"\"\"TEST RECOMMENDED SYSTEM\"\"\"\n",
657
+ "\n",
658
+ "input_hard_skills = \"Java, Excel, Python\"\n",
659
+ "input_soft_skills = \"Communication, Teamwork\"\n",
660
+ "input_major = \"Sales\"\n",
661
+ "\n",
662
+ "recommended_jobs = recommend_jobs_for_input_skills(input_hard_skills, input_soft_skills, input_major, jobs, tfidf_vectorizer_skills, tfidf_vectorizer_majors, companies_skills_vec, companies_majors_vec)\n",
663
+ "print(\"Recommended Jobs based on input skills and major:\")\n",
664
+ "print(recommended_jobs)"
665
+ ]
666
+ },
667
+ {
668
+ "cell_type": "markdown",
669
+ "metadata": {
670
+ "id": "IMTilMnQINZC"
671
+ },
672
+ "source": [
673
+ "TEST RECOMMENDATION SYSTEM"
674
+ ]
675
+ },
676
+ {
677
+ "cell_type": "markdown",
678
+ "metadata": {
679
+ "id": "kShd99z_NiTa"
680
+ },
681
+ "source": [
682
+ "Evaluating (PENDING)"
683
+ ]
684
+ },
685
+ {
686
+ "cell_type": "code",
687
+ "execution_count": null,
688
+ "metadata": {
689
+ "id": "WfEgjqw9JE3l"
690
+ },
691
+ "outputs": [],
692
+ "source": [
693
+ "def create_ground_truth(csv_file_path):\n",
694
+ " data = pd.read_csv(csv_file_path)\n",
695
+ "\n",
696
+ " # Build the `ground_truth` dictionary\n",
697
+ " ground_truth = {}\n",
698
+ " for index, row in data.iterrows():\n",
699
+ " user_id = row['User ID']\n",
700
+ " actual_major = row['candidate_field']\n",
701
+ "\n",
702
+ " # Add to the dictionary, assuming each applicant chose only one job\n",
703
+ " ground_truth[user_id] = [actual_major]\n",
704
+ "\n",
705
+ " return ground_truth\n",
706
+ "\n",
707
+ "# Use the function above to build `ground_truth`\n",
708
+ "csv_file_path = '/content/sample_data/1st_test.csv'\n",
709
+ "ground_truth = create_ground_truth(csv_file_path)"
710
+ ]
711
+ },
712
+ {
713
+ "cell_type": "code",
714
+ "execution_count": null,
715
+ "metadata": {
716
+ "colab": {
717
+ "base_uri": "https://localhost:8080/",
718
+ "height": 1000
719
+ },
720
+ "collapsed": true,
721
+ "id": "TRiD4oS-AKFE",
722
+ "outputId": "256fadeb-b250-4602-affb-005cb9c658eb"
723
+ },
724
+ "outputs": [],
725
+ "source": [
726
+ "display(ground_truth)"
727
+ ]
728
+ },
729
+ {
730
+ "cell_type": "code",
731
+ "execution_count": null,
732
+ "metadata": {
733
+ "colab": {
734
+ "base_uri": "https://localhost:8080/"
735
+ },
736
+ "id": "pXsa_wbANjmb",
737
+ "outputId": "9bd4fc1e-781b-439c-fe35-c28769f6714c"
738
+ },
739
+ "outputs": [
740
+ {
741
+ "name": "stdout",
742
+ "output_type": "stream",
743
+ "text": [
744
+ "Average Precision@3 with 18979 trains and 4745 tests: 0.1252546540217773\n"
745
+ ]
746
+ }
747
+ ],
748
+ "source": [
749
+ "def precision_at_k(recommendations, ground_truth, k=3):\n",
750
+ " \"\"\"\n",
751
+ " Calculate the precision at k for recommendation system.\n",
752
+ "\n",
753
+ " Parameters:\n",
754
+ " - recommendations (dict): Dictionary where keys are user IDs and values are lists of recommended majors.\n",
755
+ " - ground_truth (dict): Dictionary where keys are user IDs and values are lists of truly suitable majors.\n",
756
+ " - k (int): The number of top recommendations to consider for calculating precision.\n",
757
+ "\n",
758
+ " Returns:\n",
759
+ " - float: The average precision at k for all users.\n",
760
+ " \"\"\"\n",
761
+ " precision_scores = []\n",
762
+ "\n",
763
+ " for applicant, recommended_major in recommendations.items():\n",
764
+ " if applicant in ground_truth:\n",
765
+ " # Get top k recommendations\n",
766
+ " top_k_recs = recommended_major[:k]\n",
767
+ " # Calculate the number of relevant recommendations\n",
768
+ " relevant_recs = sum(1 for major in top_k_recs if major in ground_truth[applicant])\n",
769
+ " # Precision at k for this user\n",
770
+ " precision = relevant_recs / k\n",
771
+ " precision_scores.append(precision)\n",
772
+ "\n",
773
+ " # Average precision at k over all users\n",
774
+ " average_precision = np.mean(precision_scores) if precision_scores else 0\n",
775
+ " return average_precision\n",
776
+ "\n",
777
+ "avg_precision = precision_at_k(recommendations, ground_truth)\n",
778
+ "print(\"Average Precision@3 with 18979 trains and 4745 tests:\", avg_precision)"
779
+ ]
780
+ },
781
+ {
782
+ "cell_type": "code",
783
+ "execution_count": null,
784
+ "metadata": {
785
+ "colab": {
786
+ "base_uri": "https://localhost:8080/"
787
+ },
788
+ "id": "KAIvtKEaRQml",
789
+ "outputId": "7dd82dc6-0e1b-43d5-bc95-cb457cde5d72"
790
+ },
791
+ "outputs": [
792
+ {
793
+ "name": "stdout",
794
+ "output_type": "stream",
795
+ "text": [
796
+ "Average Recall@3 with 18979 trains and 4745 tests: 0.3757639620653319\n"
797
+ ]
798
+ }
799
+ ],
800
+ "source": [
801
+ "def recall_at_k(recommendations, ground_truth, k=3):\n",
802
+ " recall_scores = []\n",
803
+ "\n",
804
+ " for user_id, recommended_majors in recommendations.items():\n",
805
+ " if user_id in ground_truth:\n",
806
+ " # Get top k recommendations\n",
807
+ " top_k_recs = recommended_majors[:k]\n",
808
+ " # Calculate the number of relevant recommendations\n",
809
+ " relevant_recs = sum(1 for major in top_k_recs if major in ground_truth[user_id])\n",
810
+ " # Calculate the total number of relevant items\n",
811
+ " total_relevant = len(ground_truth[user_id])\n",
812
+ " # Recall at k for this user\n",
813
+ " recall = relevant_recs / total_relevant if total_relevant else 0\n",
814
+ " recall_scores.append(recall)\n",
815
+ "\n",
816
+ " # Average recall at k over all users\n",
817
+ " average_recall = sum(recall_scores) / len(recall_scores) if recall_scores else 0\n",
818
+ " return average_recall\n",
819
+ "\n",
820
+ "# Example usage:\n",
821
+ "avg_recall = recall_at_k(recommendations, ground_truth)\n",
822
+ "print(\"Average Recall@3 with 18979 trains and 4745 tests:\", avg_recall)\n"
823
+ ]
824
+ },
825
+ {
826
+ "cell_type": "code",
827
+ "execution_count": null,
828
+ "metadata": {
829
+ "colab": {
830
+ "base_uri": "https://localhost:8080/"
831
+ },
832
+ "id": "QUHBsQS_-5Eu",
833
+ "outputId": "fdab3075-dab8-458e-e663-2564b20da97c"
834
+ },
835
+ "outputs": [
836
+ {
837
+ "name": "stdout",
838
+ "output_type": "stream",
839
+ "text": [
840
+ "Average F1 Score@3: 0.18788198103266596\n"
841
+ ]
842
+ }
843
+ ],
844
+ "source": [
845
+ "def f1_score_at_k(recommendations, ground_truth, k=3):\n",
846
+ " precision = precision_at_k(recommendations, ground_truth, k)\n",
847
+ " recall = recall_at_k(recommendations, ground_truth, k)\n",
848
+ "\n",
849
+ " if precision + recall == 0:\n",
850
+ " return 0\n",
851
+ "\n",
852
+ " f1_score = 2 * (precision * recall) / (precision + recall)\n",
853
+ " return f1_score\n",
854
+ "\n",
855
+ "avg_f1_score = f1_score_at_k(recommendations, ground_truth)\n",
856
+ "\n",
857
+ "print(\"Average F1 Score@3:\", avg_f1_score)"
858
+ ]
859
+ }
860
+ ],
861
+ "metadata": {
862
+ "colab": {
863
+ "provenance": []
864
+ },
865
+ "kernelspec": {
866
+ "display_name": "Python 3",
867
+ "name": "python3"
868
+ },
869
+ "language_info": {
870
+ "codemirror_mode": {
871
+ "name": "ipython",
872
+ "version": 3
873
+ },
874
+ "file_extension": ".py",
875
+ "mimetype": "text/x-python",
876
+ "name": "python",
877
+ "nbconvert_exporter": "python",
878
+ "pygments_lexer": "ipython3",
879
+ "version": "3.11.2"
880
+ }
881
+ },
882
+ "nbformat": 4,
883
+ "nbformat_minor": 0
884
+ }