{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "_NrjL2ccH3yp"
},
"source": [
"RECOMMENDATION MODEL"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"id": "IZfnA6W_GDyf"
},
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
"from sklearn.metrics.pairwise import cosine_similarity"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"id": "MV-7idG1F_NU"
},
"outputs": [],
"source": [
"# Mock data creation\n",
"def create_mock_data():\n",
" users_data = \"1st_train.csv\"\n",
" # \"/content/sample_data/train_train.csv\"\n",
" applicants = pd.read_csv(users_data)\n",
"\n",
" jobs_data = \"jobs_data.csv\"\n",
" companies = pd.read_csv(jobs_data)\n",
"\n",
" train_applicants = applicants\n",
" test_data = \"1st_test.csv\"\n",
" # \"/content/sample_data/test_train.csv\"\n",
" test_applicants = pd.read_csv(test_data)\n",
"\n",
" return train_applicants, test_applicants, companies"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"id": "4VTpcXhz-5TN"
},
"outputs": [],
"source": [
"# @title\n",
"# # Mock data creation\n",
"# def create_mock_data():\n",
"# users_data = \"/content/sample_data/rematch_train_candidate_field.csv\"\n",
"# applicants = pd.read_csv(users_data)\n",
"\n",
"# jobs_data = \"/content/sample_data/jobs_data.csv\"\n",
"# companies = pd.read_csv(jobs_data)\n",
"\n",
"# # train_applicants = applicants\n",
"# # test_data = \"/content/sample_data/test_data_new.csv\"\n",
"# # test_applicants = pd.read_csv(test_data)\n",
"\n",
"# train_applicants = applicants[:10000]\n",
"# test_applicants = applicants[10000:]\n",
"\n",
"# return train_applicants, test_applicants, companies"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"id": "wF1oZ6Ez96BE"
},
"outputs": [],
"source": [
"train_user, test_user, jobs = create_mock_data()"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "Gj8tJNrph8Go",
"outputId": "a44b8cf0-a56f-4cd2-bbda-ca9bcabf35a0"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Training data size: 18979\n",
"Test data size: 4745\n"
]
}
],
"source": [
"print(\"Training data size:\", train_user.shape[0])\n",
"print(\"Test data size:\", test_user.shape[0])"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"id": "d0XY4al7K0UT"
},
"outputs": [],
"source": [
"list_hard_skill = [test_user[\"hard_skill\"].iloc[i].replace(\"[\", \"\").replace(\"]\", \"\").replace(\"'\", \"\") for i in range(len(test_user))]\n",
"list_soft_skill = [test_user[\"soft_skill\"].iloc[i].replace(\"[\", \"\").replace(\"]\", \"\").replace(\"'\", \"\") for i in range(len(test_user))]"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 213
},
"id": "JOZ9_NlLK8uS",
"outputId": "17d09f55-192f-4486-bb47-b56f525d44a3"
},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" User ID | \n",
" candidate_field | \n",
" label | \n",
" hard_skill | \n",
" soft_skill | \n",
" final_hard_skill | \n",
" final_soft_skill | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 14649 | \n",
" it jobs | \n",
" 1 | \n",
" ['act', 'advertising sales', 'algorithms', 'bu... | \n",
" ['collaboration', 'decision making', 'operatio... | \n",
" act, advertising sales, algorithms, business, ... | \n",
" collaboration, decision making, operations, wr... | \n",
"
\n",
" \n",
" 1 | \n",
" 801 | \n",
" marketing | \n",
" 0 | \n",
" ['act', 'brand communication', 'business', 'bu... | \n",
" ['collaboration', 'customer service', 'managem... | \n",
" act, brand communication, business, business d... | \n",
" collaboration, customer service, management | \n",
"
\n",
" \n",
" 2 | \n",
" 4393 | \n",
" accounting | \n",
" 0 | \n",
" ['application', 'balance sheet', 'finance', 'p... | \n",
" ['filing', 'management'] | \n",
" application, balance sheet, finance, property ... | \n",
" filing, management | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" User ID candidate_field label \\\n",
"0 14649 it jobs 1 \n",
"1 801 marketing 0 \n",
"2 4393 accounting 0 \n",
"\n",
" hard_skill \\\n",
"0 ['act', 'advertising sales', 'algorithms', 'bu... \n",
"1 ['act', 'brand communication', 'business', 'bu... \n",
"2 ['application', 'balance sheet', 'finance', 'p... \n",
"\n",
" soft_skill \\\n",
"0 ['collaboration', 'decision making', 'operatio... \n",
"1 ['collaboration', 'customer service', 'managem... \n",
"2 ['filing', 'management'] \n",
"\n",
" final_hard_skill \\\n",
"0 act, advertising sales, algorithms, business, ... \n",
"1 act, brand communication, business, business d... \n",
"2 application, balance sheet, finance, property ... \n",
"\n",
" final_soft_skill \n",
"0 collaboration, decision making, operations, wr... \n",
"1 collaboration, customer service, management \n",
"2 filing, management "
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"test_user[\"final_hard_skill\"] = pd.DataFrame(list_hard_skill)\n",
"test_user[\"final_soft_skill\"] = pd.DataFrame(list_soft_skill)\n",
"test_user.head(3)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"id": "kYbjYsDjABda"
},
"outputs": [],
"source": [
"list_hard_skill = [train_user[\"hard_skill\"].iloc[i].replace(\"[\", \"\").replace(\"]\", \"\").replace(\"'\", \"\") for i in range(len(train_user))]\n",
"list_soft_skill = [train_user[\"soft_skill\"].iloc[i].replace(\"[\", \"\").replace(\"]\", \"\").replace(\"'\", \"\") for i in range(len(train_user))]"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 213
},
"id": "GC8bn3cjB8D5",
"outputId": "436e843d-425e-4ce2-e551-e4f249bdd10b"
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" User ID | \n",
" candidate_field | \n",
" label | \n",
" hard_skill | \n",
" soft_skill | \n",
" final_hard_skill | \n",
" final_soft_skill | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 3030 | \n",
" sales | \n",
" 0 | \n",
" ['blogs', 'business', 'lead generation', 'mark... | \n",
" ['customer service', 'driven personality', 'ma... | \n",
" blogs, business, lead generation, marketing st... | \n",
" customer service, driven personality, manageme... | \n",
"
\n",
" \n",
" 1 | \n",
" 9702 | \n",
" administration & office support | \n",
" 0 | \n",
" ['business', 'draft', 'go', 'manufacturing', '... | \n",
" ['business acumen', 'communications', 'managem... | \n",
" business, draft, go, manufacturing, office man... | \n",
" business acumen, communications, management, o... | \n",
"
\n",
" \n",
" 2 | \n",
" 8606 | \n",
" retail & consumer products | \n",
" 0 | \n",
" ['gross profit', 'inventory', 'inventory manag... | \n",
" ['customer service', 'management'] | \n",
" gross profit, inventory, inventory management,... | \n",
" customer service, management | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" User ID candidate_field label \\\n",
"0 3030 sales 0 \n",
"1 9702 administration & office support 0 \n",
"2 8606 retail & consumer products 0 \n",
"\n",
" hard_skill \\\n",
"0 ['blogs', 'business', 'lead generation', 'mark... \n",
"1 ['business', 'draft', 'go', 'manufacturing', '... \n",
"2 ['gross profit', 'inventory', 'inventory manag... \n",
"\n",
" soft_skill \\\n",
"0 ['customer service', 'driven personality', 'ma... \n",
"1 ['business acumen', 'communications', 'managem... \n",
"2 ['customer service', 'management'] \n",
"\n",
" final_hard_skill \\\n",
"0 blogs, business, lead generation, marketing st... \n",
"1 business, draft, go, manufacturing, office man... \n",
"2 gross profit, inventory, inventory management,... \n",
"\n",
" final_soft_skill \n",
"0 customer service, driven personality, manageme... \n",
"1 business acumen, communications, management, o... \n",
"2 customer service, management "
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"train_user[\"final_hard_skill\"] = pd.DataFrame(list_hard_skill)\n",
"train_user[\"final_soft_skill\"] = pd.DataFrame(list_soft_skill)\n",
"train_user.head(3)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"id": "znBy9q8XDcM7"
},
"outputs": [],
"source": [
"list_hard_skill = [jobs[\"Hard Skills\"].iloc[i].replace(\"[\", \"\").replace(\"]\", \"\").replace(\"'\", \"\") for i in range(len(jobs))]\n",
"list_soft_skill = [jobs[\"Soft Skills\"].iloc[i].replace(\"[\", \"\").replace(\"]\", \"\").replace(\"'\", \"\") for i in range(len(jobs))]"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 213
},
"id": "knFii8o3EQmv",
"outputId": "47afb484-0765-4ad9-8765-d084673450ac"
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Job ID | \n",
" Major | \n",
" Hard Skills | \n",
" Soft Skills | \n",
" final_hard_skill | \n",
" final_soft_skill | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 1 | \n",
" accounting | \n",
" ['business', 'finance', 'excel', 'tax', 'servi... | \n",
" ['management', 'planning', 'operations', 'lead... | \n",
" business, finance, excel, tax, service, data, ... | \n",
" management, planning, operations, leadership, ... | \n",
"
\n",
" \n",
" 1 | \n",
" 2 | \n",
" administration & office support | \n",
" ['service', 'business', 'data', 'excel', 'appl... | \n",
" ['management', 'customer service', 'microsoft ... | \n",
" service, business, data, excel, application, s... | \n",
" management, customer service, microsoft office... | \n",
"
\n",
" \n",
" 2 | \n",
" 3 | \n",
" advertising, arts & media | \n",
" ['business', 'digital', 'sales', 'service', 'a... | \n",
" ['management', 'social media', 'writing', 'com... | \n",
" business, digital, sales, service, application... | \n",
" management, social media, writing, communicati... | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Job ID Major \\\n",
"0 1 accounting \n",
"1 2 administration & office support \n",
"2 3 advertising, arts & media \n",
"\n",
" Hard Skills \\\n",
"0 ['business', 'finance', 'excel', 'tax', 'servi... \n",
"1 ['service', 'business', 'data', 'excel', 'appl... \n",
"2 ['business', 'digital', 'sales', 'service', 'a... \n",
"\n",
" Soft Skills \\\n",
"0 ['management', 'planning', 'operations', 'lead... \n",
"1 ['management', 'customer service', 'microsoft ... \n",
"2 ['management', 'social media', 'writing', 'com... \n",
"\n",
" final_hard_skill \\\n",
"0 business, finance, excel, tax, service, data, ... \n",
"1 service, business, data, excel, application, s... \n",
"2 business, digital, sales, service, application... \n",
"\n",
" final_soft_skill \n",
"0 management, planning, operations, leadership, ... \n",
"1 management, customer service, microsoft office... \n",
"2 management, social media, writing, communicati... "
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"jobs[\"final_hard_skill\"] = pd.DataFrame(list_hard_skill)\n",
"jobs[\"final_soft_skill\"] = pd.DataFrame(list_soft_skill)\n",
"jobs.head(3)"
]
},
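{
"cell_type": "markdown",
"metadata": {},
"source": [
"The same bracket/quote stripping is applied to the test, train, and jobs frames above. A reusable helper could replace the three list comprehensions; the sketch below (column names taken from the frames already loaded here) keeps its usage lines commented so the pipeline above is unchanged."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sketch: a single helper for the cleaning done in the cells above.\n",
"# Assumes the column holds stringified Python lists such as \"['act', 'business']\".\n",
"import ast\n",
"\n",
"def clean_skill_column(series):\n",
"    \"\"\"Parse a stringified list and join its items into a comma-separated string.\"\"\"\n",
"    return series.apply(lambda s: \", \".join(ast.literal_eval(s)))\n",
"\n",
"# Example usage (equivalent to the manual replace() chains above):\n",
"# jobs[\"final_hard_skill\"] = clean_skill_column(jobs[\"Hard Skills\"])\n",
"# jobs[\"final_soft_skill\"] = clean_skill_column(jobs[\"Soft Skills\"])"
]
},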
{
"cell_type": "code",
"execution_count": 14,
"metadata": {
"id": "wiDiHL6lStnd"
},
"outputs": [],
"source": [
"# Feature Engineering\n",
"def feature_engineering(applicants, companies):\n",
" # Vectorize skills and majors\n",
" tfidf_vectorizer_skills = TfidfVectorizer()\n",
" tfidf_vectorizer_majors = TfidfVectorizer()\n",
"\n",
" all_skills = pd.concat([applicants['final_hard_skill'], applicants['final_soft_skill'],\n",
" companies['final_hard_skill'], companies['final_soft_skill']])\n",
" all_majors = pd.concat([applicants['candidate_field'], companies['Major']])\n",
"\n",
" all_skills_vectorized = tfidf_vectorizer_skills.fit_transform(all_skills)\n",
" all_majors_vectorized = tfidf_vectorizer_majors.fit_transform(all_majors)\n",
"\n",
" num_applicants = len(applicants)\n",
" num_companies = len(companies)\n",
"\n",
" # Split the TF-IDF vectors back into applicants and companies\n",
" applicants_skills_vectorized = all_skills_vectorized[:num_applicants*2] # because each applicant has 2 skill entries\n",
" companies_skills_vectorized = all_skills_vectorized[num_applicants*2:]\n",
"\n",
" applicants_majors_vectorized = all_majors_vectorized[:num_applicants]\n",
" companies_majors_vectorized = all_majors_vectorized[num_applicants:]\n",
"\n",
" return (applicants_skills_vectorized, applicants_majors_vectorized,\n",
" companies_skills_vectorized, companies_majors_vectorized, tfidf_vectorizer_skills, tfidf_vectorizer_majors)"
]
},
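{
"cell_type": "markdown",
"metadata": {},
"source": [
"A quick sanity check (a sketch; it only inspects shapes) that the matrices returned by `feature_engineering` follow the expected layout: all hard-skill rows first, then all soft-skill rows, for applicants and then for companies."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sketch: verify the block layout of the TF-IDF matrices returned above.\n",
"app_skills, app_majors, comp_skills, comp_majors, vec_skills, vec_majors = feature_engineering(train_user, jobs)\n",
"\n",
"assert app_skills.shape[0] == 2 * len(train_user)  # hard-skill block + soft-skill block\n",
"assert comp_skills.shape[0] == 2 * len(jobs)\n",
"assert app_majors.shape[0] == len(train_user)\n",
"assert comp_majors.shape[0] == len(jobs)\n",
"print(\"Skill vocabulary size:\", len(vec_skills.vocabulary_))"
]
},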
{
"cell_type": "code",
"execution_count": 15,
"metadata": {
"id": "THM0mszQGNyD"
},
"outputs": [],
"source": [
"def compute_similarity(applicants_skills_vectorized, applicants_majors_vectorized,\n",
" companies_skills_vectorized, companies_majors_vectorized):\n",
" # Calculate similarity based on skills (averaging hard and soft skills similarities)\n",
" applicants_skills = (applicants_skills_vectorized[0::2] + applicants_skills_vectorized[1::2]) / 2\n",
" companies_skills = (companies_skills_vectorized[0::2] + companies_skills_vectorized[1::2]) / 2\n",
"\n",
" skills_similarity = cosine_similarity(applicants_skills, companies_skills)\n",
"\n",
" # Calculate similarity based on majors\n",
" majors_similarity = cosine_similarity(applicants_majors_vectorized, companies_majors_vectorized)\n",
"\n",
" # Ensure the number of companies in both similarities is aligned\n",
" if skills_similarity.shape[1] != majors_similarity.shape[1]:\n",
" min_dim = min(skills_similarity.shape[1], majors_similarity.shape[1])\n",
" skills_similarity = skills_similarity[:, :min_dim]\n",
" majors_similarity = majors_similarity[:, :min_dim]\n",
"\n",
" # Combine these similarities (simple average for this example)\n",
" combined_similarity = (skills_similarity + majors_similarity) / 2\n",
" return combined_similarity"
]
},
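{
"cell_type": "markdown",
"metadata": {},
"source": [
"`compute_similarity` weights the skill and major similarities equally. A weighted variant is sketched below; `w_skills` is an assumed tuning parameter that is not used anywhere else in this notebook."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sketch: weighted blend of the two similarity matrices instead of a plain average.\n",
"def compute_similarity_weighted(skills_similarity, majors_similarity, w_skills=0.5):\n",
"    \"\"\"Blend skill- and major-based similarity with weight w_skills in [0, 1].\"\"\"\n",
"    return w_skills * skills_similarity + (1 - w_skills) * majors_similarity"
]
},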
{
"cell_type": "code",
"execution_count": 16,
"metadata": {
"id": "ter3YAzxoelD"
},
"outputs": [],
"source": [
"# Recommendation Function\n",
"def recommend_jobs(applicants, companies, similarity_scores):\n",
" recommendations = {}\n",
" for i, applicant in enumerate(applicants['User ID']):\n",
" if i < len(similarity_scores):\n",
" sorted_company_indices = np.argsort(-similarity_scores[i]) # Descending sort of scores\n",
" recommended_companies = companies.iloc[sorted_company_indices]['Major'].values[:3] # Top 3 recommendations\n",
" recommendations[applicant] = recommended_companies\n",
" return recommendations\n",
"\n",
"# Testing and Evaluation Function\n",
"def print_recommendations(applicants, companies, recommendations):\n",
" # This is a mock function since we don't have ground truth to compare to.\n",
" # In a real scenario, we would compare against actual matches or use some form of feedback.\n",
" print(\"Recommendations for each applicant:\")\n",
" for applicant in recommendations:\n",
" print(f\"{applicant}: {recommendations[applicant]}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"collapsed": true,
"id": "Ajxp0xelIrl2",
"outputId": "08bafc5b-73cc-4695-924a-931840047dd5"
},
"outputs": [],
"source": [
"# Let's create and process the data, and compute recommendations\n",
"# train_applicants, test_applicants, companies = create_mock_data()\n",
"applicants_skills_vec, applicants_majors_vec, companies_skills_vec, companies_majors_vec, tfidf_vectorizer_skills, tfidf_vectorizer_majors = feature_engineering(train_user, jobs)\n",
"\n",
"similarity_scores = compute_similarity(applicants_skills_vec, applicants_majors_vec, companies_skills_vec, companies_majors_vec)\n",
"recommendations = recommend_jobs(test_user, jobs, similarity_scores)\n",
"\n",
"# Output the recommendations to observe the results\n",
"print_recommendations(test_user, jobs, recommendations)"
]
},
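{
"cell_type": "markdown",
"metadata": {},
"source": [
"The similarity matrix above is computed from the training applicants. If scores for the test applicants themselves are wanted, the fitted vectorizers can `transform` them directly; in this sketch the final recommendation call is left commented out so the results recorded above are unchanged."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sketch: vectorize the test applicants with the already-fitted vectorizers and\n",
"# score them against the company vectors (same averaging scheme as above).\n",
"test_hard_vec = tfidf_vectorizer_skills.transform(test_user[\"final_hard_skill\"])\n",
"test_soft_vec = tfidf_vectorizer_skills.transform(test_user[\"final_soft_skill\"])\n",
"test_skills_vec = (test_hard_vec + test_soft_vec) / 2\n",
"test_majors_vec = tfidf_vectorizer_majors.transform(test_user[\"candidate_field\"])\n",
"\n",
"n_comp = companies_skills_vec.shape[0] // 2  # hard-skill block + soft-skill block\n",
"comp_skills = (companies_skills_vec[:n_comp] + companies_skills_vec[n_comp:]) / 2\n",
"\n",
"test_similarity = (cosine_similarity(test_skills_vec, comp_skills)\n",
"                   + cosine_similarity(test_majors_vec, companies_majors_vec)) / 2\n",
"# test_recommendations = recommend_jobs(test_user, jobs, test_similarity)"
]
},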
{
"cell_type": "code",
"execution_count": 23,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "nj-HEdyJlYNY",
"outputId": "063b84bc-5717-4a0c-8367-939a054657bc"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Recommended Jobs based on input skills and major:\n",
"['sales' 'it jobs' 'administration & office support']\n"
]
}
],
"source": [
"# Process input skills and recommend jobs\n",
"def recommend_jobs_for_input_skills(input_hard_skills, input_soft_skills, input_major, jobs, tfidf_vectorizer_skills, tfidf_vectorizer_majors, companies_skills_vec, companies_majors_vec):\n",
" input_hard_skills_vec = tfidf_vectorizer_skills.transform([input_hard_skills])\n",
" input_soft_skills_vec = tfidf_vectorizer_skills.transform([input_soft_skills])\n",
" input_major_vec = tfidf_vectorizer_majors.transform([input_major])\n",
"\n",
" # Average the vectorized hard and soft skills\n",
" input_skills_vec = (input_hard_skills_vec + input_soft_skills_vec) / 2\n",
"\n",
" # Compute similarities\n",
" skills_similarity = cosine_similarity(input_skills_vec, companies_skills_vec)\n",
" major_similarity = cosine_similarity(input_major_vec, companies_majors_vec)\n",
"\n",
" # Ensure the number of companies in both similarities is aligned\n",
" if skills_similarity.shape[1] != major_similarity.shape[1]:\n",
" min_dim = min(skills_similarity.shape[1], major_similarity.shape[1])\n",
" skills_similarity = skills_similarity[:, :min_dim]\n",
" major_similarity = major_similarity[:, :min_dim]\n",
"\n",
" # Combine similarities\n",
" combined_similarity = (skills_similarity + major_similarity) / 2\n",
"\n",
" # Get top 3 job recommendations\n",
" sorted_company_indices = np.argsort(-combined_similarity[0])\n",
" recommended_companies = jobs.iloc[sorted_company_indices]['Major'].values[:3]\n",
"\n",
" return recommended_companies\n",
"\n",
"\"\"\"TEST RECOMMENDED SYSTEM\"\"\"\n",
"\n",
"input_hard_skills = \"Java, Excel, Python\"\n",
"input_soft_skills = \"Communication, Teamwork\"\n",
"input_major = \"Sales\"\n",
"\n",
"recommended_jobs = recommend_jobs_for_input_skills(input_hard_skills, input_soft_skills, input_major, jobs, tfidf_vectorizer_skills, tfidf_vectorizer_majors, companies_skills_vec, companies_majors_vec)\n",
"print(\"Recommended Jobs based on input skills and major:\")\n",
"print(recommended_jobs)"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "IMTilMnQINZC"
},
"source": [
"TEST RECOMMENDED SYSTEM"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "kShd99z_NiTa"
},
"source": [
"Evaluating (PENDING)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "WfEgjqw9JE3l"
},
"outputs": [],
"source": [
"def create_ground_truth(csv_file_path):\n",
" data = pd.read_csv(csv_file_path)\n",
"\n",
" # Tạo dictionary `ground_truth`\n",
" ground_truth = {}\n",
" for index, row in data.iterrows():\n",
" user_id = row['User ID']\n",
" actual_major = row['candidate_field']\n",
"\n",
" # Thêm vào dictionary, giả sử mỗi ứng viên chỉ chọn một công việc\n",
" ground_truth[user_id] = [actual_major]\n",
"\n",
" return ground_truth\n",
"\n",
"# Sử dụng hàm trên để tạo `ground_truth`\n",
"csv_file_path = '/content/sample_data/1st_test.csv'\n",
"ground_truth = create_ground_truth(csv_file_path)"
]
},
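{
"cell_type": "markdown",
"metadata": {},
"source": [
"Because `test_user` is already loaded above, the same ground truth can also be built from that DataFrame directly. The sketch below stores the result under a separate name so the `ground_truth` used in the evaluation cells is untouched."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sketch: build the ground-truth dictionary from the already-loaded test_user frame\n",
"# (equivalent to create_ground_truth, without re-reading the CSV from disk).\n",
"ground_truth_from_df = {\n",
"    row[\"User ID\"]: [row[\"candidate_field\"]]\n",
"    for _, row in test_user.iterrows()\n",
"}"
]
},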
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 1000
},
"collapsed": true,
"id": "TRiD4oS-AKFE",
"outputId": "256fadeb-b250-4602-affb-005cb9c658eb"
},
"outputs": [],
"source": [
"display(ground_truth)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "pXsa_wbANjmb",
"outputId": "9bd4fc1e-781b-439c-fe35-c28769f6714c"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Average Precision@3 with 18979 trains and 4745 tests: 0.1252546540217773\n"
]
}
],
"source": [
"def precision_at_k(recommendations, ground_truth, k=3):\n",
" \"\"\"\n",
" Calculate the precision at k for recommendation system.\n",
"\n",
" Parameters:\n",
" - recommendations (dict): Dictionary where keys are user IDs and values are lists of recommended majors.\n",
" - ground_truth (dict): Dictionary where keys are user IDs and values are lists of truly suitable majors.\n",
" - k (int): The number of top recommendations to consider for calculating precision.\n",
"\n",
" Returns:\n",
" - float: The average precision at k for all users.\n",
" \"\"\"\n",
" precision_scores = []\n",
"\n",
" for applicant, recommended_major in recommendations.items():\n",
" if applicant in ground_truth:\n",
" # Get top k recommendations\n",
" top_k_recs = recommended_major[:k]\n",
" # Calculate the number of relevant recommendations\n",
" relevant_recs = sum(1 for major in top_k_recs if major in ground_truth[applicant])\n",
" # Precision at k for this user\n",
" precision = relevant_recs / k\n",
" precision_scores.append(precision)\n",
"\n",
" # Average precision at k over all users\n",
" average_precision = np.mean(precision_scores) if precision_scores else 0\n",
" return average_precision\n",
"\n",
"avg_precision = precision_at_k(recommendations, ground_truth)\n",
"print(\"Average Precision@3 with 18979 trains and 4745 tests:\", avg_precision)"
]
},
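{
"cell_type": "markdown",
"metadata": {},
"source": [
"A tiny worked example of `precision_at_k` on hand-made toy dictionaries (values invented purely for illustration): the first user has one relevant major in the top 3 (1/3), the second has none (0/3), so the average is 1/6."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Toy check of precision_at_k on invented data (not from the datasets above).\n",
"toy_recs = {1: [\"sales\", \"it jobs\", \"marketing\"], 2: [\"accounting\", \"retail & consumer products\", \"sales\"]}\n",
"toy_truth = {1: [\"it jobs\"], 2: [\"marketing\"]}\n",
"print(precision_at_k(toy_recs, toy_truth, k=3))  # expected: 0.1666..."
]
},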
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "KAIvtKEaRQml",
"outputId": "7dd82dc6-0e1b-43d5-bc95-cb457cde5d72"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Average Recall@3 with 18979 trains and 4745 tests: 0.3757639620653319\n"
]
}
],
"source": [
"def recall_at_k(recommendations, ground_truth, k=3):\n",
" recall_scores = []\n",
"\n",
" for user_id, recommended_majors in recommendations.items():\n",
" if user_id in ground_truth:\n",
" # Get top k recommendations\n",
" top_k_recs = recommended_majors[:k]\n",
" # Calculate the number of relevant recommendations\n",
" relevant_recs = sum(1 for major in top_k_recs if major in ground_truth[user_id])\n",
" # Calculate the total number of relevant items\n",
" total_relevant = len(ground_truth[user_id])\n",
" # Recall at k for this user\n",
" recall = relevant_recs / total_relevant if total_relevant else 0\n",
" recall_scores.append(recall)\n",
"\n",
" # Average recall at k over all users\n",
" average_recall = sum(recall_scores) / len(recall_scores) if recall_scores else 0\n",
" return average_recall\n",
"\n",
"# Example usage:\n",
"avg_recall = recall_at_k(recommendations, ground_truth)\n",
"print(\"Average Recall@3 with 18979 trains and 4745 tests:\", avg_recall)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "QUHBsQS_-5Eu",
"outputId": "fdab3075-dab8-458e-e663-2564b20da97c"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Average F1 Score@3: 0.18788198103266596\n"
]
}
],
"source": [
"def f1_score_at_k(recommendations, ground_truth, k=3):\n",
" precision = precision_at_k(recommendations, ground_truth, k)\n",
" recall = recall_at_k(recommendations, ground_truth, k)\n",
"\n",
" if precision + recall == 0:\n",
" return 0\n",
"\n",
" f1_score = 2 * (precision * recall) / (precision + recall)\n",
" return f1_score\n",
"\n",
"avg_f1_score = f1_score_at_k(recommendations, ground_truth)\n",
"\n",
"print(\"Average F1 Score@3:\", avg_f1_score)"
]
}
],
"metadata": {
"colab": {
"provenance": []
},
"kernelspec": {
"display_name": "Python 3",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.2"
}
},
"nbformat": 4,
"nbformat_minor": 0
}