File size: 18,486 Bytes
bac66fb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
081374c
bac66fb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
import gradio as gr
import torch
import spacy
import nltk
import re
import PyPDF2
import numpy as np
import pandas as pd
from transformers import pipeline
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# Fetch the NLTK 'punkt' resource when the module is loaded.
nltk.download('punkt')

# Shared NLP resources: the spaCy English pipeline and the sentence-embedding model.
nlp = spacy.load('en_core_web_sm')
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

# Report whether torch can see a CUDA device; otherwise fall back to the CPU label.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Running on: {device}")

# Career catalogue used for matching. Every entry carries a display "title",
# a one-line "description", and the list of "skills" associated with the role.
CAREER_RECOMMENDATIONS = [
    {
        "title": "Software Engineer",
        "description": "Develops software applications and systems",
        "skills": [
            "Python", "Java", "C++", "JavaScript", "Software Development",
            "Database Management", "Web Development", "Cloud Computing",
            "Data Structures", "Algorithms",
        ],
    },
    {
        "title": "Data Scientist",
        "description": "Analyzes complex data to help make business decisions",
        "skills": [
            "Python", "R", "Statistics", "Machine Learning",
            "Data Visualization", "Data Analysis", "SQL",
        ],
    },
    {
        "title": "Cloud Solutions Architect",
        "description": "Designs and manages cloud computing strategies",
        "skills": [
            "Cloud Computing", "AWS", "Azure", "GCP",
            "Infrastructure as Code", "Networking",
        ],
    },
    {
        "title": "AI/ML Engineer",
        "description": "Creates intelligent systems and machine learning models",
        "skills": [
            "Machine Learning", "Deep Learning", "Neural Networks",
            "TensorFlow", "PyTorch", "Computer Vision",
            "Natural Language Processing",
        ],
    },
    {
        "title": "Database Administrator",
        "description": "Manage databases, ensure data security",
        "skills": [
            "SQL", "Database Management", "Database Security",
            "Database Design", "Database Modeling",
        ],
    },
    {
        "title": "Mechanical Engineer",
        "description": "Designs, develops, and tests mechanical devices and systems",
        "skills": [
            "CAD", "CAM", "Matlab", "Mechanical Design",
            "Manufacturing Engineering", "Quality Control",
            "Thermal Engineering", "Fluid Mechanics", "GD&T",
            "Engineering Drawings", "Blueprint reading", "Product Design",
            "FEA Analysis",
        ],
    },
    {
        "title": "Manufacturing Engineer",
        "description": "Optimizes manufacturing processes for efficiency and quality",
        "skills": [
            "Manufacturing Engineering", "Process Optimization",
            "Lean Manufacturing", "Six Sigma", "Production Planning",
            "Supply Chain Management",
        ],
    },
    {
        "title": "Quality Engineer",
        "description": "Oversees quality assurance activities and ensures products meet standards.",
        "skills": [
            "Quality Control", "Quality Assurance", "ISO Standards",
            "Statistical Process Control", "Inspection", "Testing",
        ],
    },
    {
        "title": "Design Engineer",
        "description": "Creates product designs and technical drawings using CAD software",
        "skills": [
            "CAD", "CAM", "Product Design", "3D Modeling",
            "Engineering Design", "Drafting",
        ],
    },
    {
        "title": "Business Analyst",
        "description": "Identifies business needs and determines solutions",
        "skills": [
            "Business Analysis", "Requirements Gathering", "Data Analysis",
            "Process Improvement", "Project Management",
        ],
    },
    {
        "title": "Marketing Manager",
        "description": "Develops and implements marketing strategies",
        "skills": [
            "Marketing", "Digital Marketing", "Social Media Marketing",
            "Market Research", "Branding", "Advertising", "Content Marketing",
        ],
    },
    {
        "title": "Project Manager",
        "description": "Leads and coordinates project teams and resources",
        "skills": [
            "Project Management", "Project Planning", "Risk Management",
            "Team Management", "Agile Methodologies",
        ],
    },
    {
        "title": "Management Consultant",
        "description": "Advises organizations on improving performance",
        "skills": [
            "Consulting", "Strategy", "Problem Solving", "Business Analysis",
            "Communication",
        ],
    },
    {
        "title": "Graphic Designer",
        "description": "Creates visual concepts using computer software or by hand",
        "skills": [
            "Graphic Design", "Adobe Photoshop", "Adobe Illustrator",
            "UI/UX Design", "Visual Communication", "Branding",
        ],
    },
    {
        "title": "Content Strategist",
        "description": "Develops content plans and marketing strategies",
        "skills": [
            "Content Writing", "Content Strategy", "SEO", "Content Marketing",
            "Copywriting",
        ],
    },
    {
        "title": "UI/UX Designer",
        "description": "Designs user interfaces for digital products",
        "skills": [
            "UI Design", "UX Design", "Wireframing", "Prototyping",
            "User Research", "Interaction Design",
        ],
    },
    {
        "title": "Digital Marketing Specialist",
        "description": "Promotes brands and products through digital channels",
        "skills": [
            "Digital Marketing", "Social Media Marketing", "SEO",
            "PPC Advertising", "Email Marketing", "Content Marketing",
        ],
    },
    {
        "title": "Healthcare Administrator",
        "description": "Manages healthcare facilities and services",
        "skills": [
            "Healthcare Administration", "Healthcare Management",
            "Healthcare Policy", "Healthcare Finance", "Patient Care",
        ],
    },
    {
        "title": "Medical Researcher",
        "description": "Conducts research to improve medical knowledge",
        "skills": [
            "Medical Research", "Data Analysis", "Research Design",
            "Laboratory Techniques", "Scientific Writing",
        ],
    },
    {
        "title": "Healthcare Consultant",
        "description": "Advises healthcare organizations on improvement strategies",
        "skills": [
            "Healthcare Consulting", "Healthcare Strategy",
            "Healthcare Operations", "Healthcare Policy",
        ],
    },
    {
        "title": "Medical Assistant",
        "description": "Assists with patient care and medical administrative tasks.",
        "skills": [
            "Patient Care", "Medical Terminology", "Medical Assisting",
            "Clinical Procedures", "Vital Signs", "Electronic Health Records",
        ],
    },
]

def extract_text_from_pdf(file_path):
    """
    Extract the text of every page of a PDF file.

    Args:
        file_path (str): Path to the PDF file

    Returns:
        str: Text of all pages, each page followed by a newline. An empty
            string is returned (and the error printed) when the file cannot
            be opened or parsed.
    """
    try:
        with open(file_path, 'rb') as file:
            reader = PyPDF2.PdfReader(file)
            # Guard against a page yielding no text object (``or ''``) so one
            # empty page cannot make the whole document fail; join once rather
            # than growing a string page by page.
            return ''.join(
                f"{page.extract_text() or ''}\n" for page in reader.pages
            )
    except Exception as e:
        # Deliberate best-effort: any failure is reported and mapped to "" so
        # the caller can fall back to reading the file as plain text.
        print(f"Error extracting PDF text: {e}")
        return ""

# One row per free-form CV section:
#   (section name, header keywords that open it, keywords that open a later section)
# Both keyword fields are regex alternations matched against lowercased text.
_CV_SECTION_RULES = (
    ('education', 'education|qualification|academic',
     'work|experience|skills|projects|training|hobbies|personal|declaration'),
    ('experience', 'experience|work',
     'education|skills|projects|training|hobbies|personal|declaration'),
    ('skills', 'skills|expertise|technical skills',
     'education|work|projects|training|hobbies|personal|declaration'),
    ('projects', 'projects',
     'education|work|skills|training|hobbies|personal|declaration'),
    ('training', 'training|certification',
     'education|work|skills|projects|hobbies|personal|declaration'),
    ('hobbies', 'hobbies|interests',
     'education|work|skills|projects|training|personal|declaration'),
    ('personal', 'personal details',
     'education|work|skills|projects|training|hobbies|declaration'),
)


def preprocess_cv_text(text):
    """
    Split raw CV text into its recognised sections.

    The text is lowercased, then each section is located by its header
    keyword. A section body runs until the first blank line, the first line
    starting with another section's keyword, or the end of the text.

    Args:
        text (str): Raw CV text

    Returns:
        dict: Maps section name ('contact', 'education', 'experience',
            'skills', 'projects', 'training', 'hobbies', 'personal') to the
            section's text. All matches for a section are stripped and joined
            with single spaces. Sections that were not found are omitted.
    """
    # Normalize text
    text = text.lower()

    # Contact details are single-line "keyword: value" entries.
    sections = {
        'contact': re.findall(r'(email|phone|contact)[:\s]*([^\n]+)', text),
    }

    for name, headers, stops in _CV_SECTION_RULES:
        # ``\Z`` lets the final section of the document match even when it is
        # not followed by a blank line or another header; without it the last
        # section of a CV is silently dropped.
        pattern = rf'({headers})[:\s]*(.*?)(?=\n\n|\n(?:{stops})|\Z)'
        sections[name] = re.findall(pattern, text, re.DOTALL | re.IGNORECASE)

    # Keep only sections that matched; group 2 of each match is the body.
    return {
        name: " ".join(match[1].strip() for match in matches)
        for name, matches in sections.items()
        if matches
    }

# Keyword lists matched (case-insensitively, as substrings) against CV sections.
_SKILL_KEYWORDS = (
    "AutoCAD", "Manufacturing Engineering", "Quality Control", "Thermal Engineering", "Heat Transfer",
    "Machine Design", "Fluid Mechanics", "CAD", "CAM", "Matlab", "GD&T", "Engineering Drawings",
    "Blueprint reading", "Product Design", "FEA Analysis",
    "Project Management", "Marketing", "Business Analysis", "Sales", "Finance", "Consulting", "Market Research",
    "Graphic Design", "Content Writing", "Digital Marketing", "UI/UX Design", "Video Production", "SEO",
    "Social Media Marketing",
    "Patient Care", "Medical Research", "Healthcare Administration", "Medical Technology", "Anatomy",
    "Physiology", "Pharmacology", "Python", "Java", "Machine Learning", "Data Science", "Cloud Computing",
    "Cybersecurity", "Web Development", "Software Development", "Database Management",
    "SQL", "C++", "JavaScript", "AWS", "Azure", "GCP", "Infrastructure as Code", "Networking",
    "Deep Learning", "Neural Networks", "TensorFlow", "PyTorch", "Computer Vision",
    "Natural Language Processing", "R", "Statistics", "Data Visualization", "Data Analysis",
    "Agile Methodologies",
    "Adobe Photoshop", "Adobe Illustrator", "Visual Communication", "Branding", "Copywriting",
    "Wireframing", "Prototyping", "User Research", "Interaction Design", "PPC Advertising",
    "Email Marketing", "Healthcare Management", "Healthcare Policy", "Healthcare Finance",
    "Medical Terminology", "Clinical Procedures", "Vital Signs", "Electronic Health Records",
    "Lean Manufacturing", "Six Sigma", "Production Planning", "Supply Chain Management",
    "ISO Standards", "Statistical Process Control", "Inspection", "Testing",
    "Requirements Gathering", "Process Improvement",
)
_EXPERIENCE_KEYWORDS = (
    "blueprints", "specifications", "production", "inspection", "testing", "measurement", "calipers",
    "gauges", "micrometers", "quality standards", "production process", "finished items",
    "inspection results", "test data", "training", "design", "development", "analysis", "management",
    "research", "consulting",
)
_PROJECT_KEYWORDS = (
    "helicopter", "assembly", "dismantling", "5S methodology", "flow path", "material",
    "productivity", "layout",
)
_TRAINING_KEYWORDS = ("inplant training",)
_QUALIFICATION_KEYWORDS = (
    "engineering", "diploma", "bachelor", "master", "degree", "computer science",
    "information technology", "business administration", "medical", "healthcare",
)

# Sections whose content is pooled into the "experience" term list, with the
# keyword list applied to each.
_EXPERIENCE_SECTIONS = (
    ('experience', _EXPERIENCE_KEYWORDS),
    ('projects', _PROJECT_KEYWORDS),
    ('training', _TRAINING_KEYWORDS),
)


def _tokens_with_pos(text, pos_tags):
    """Return the text of each spaCy token in *text* whose POS tag is in *pos_tags*."""
    return [token.text for token in nlp(text) if token.pos_ in pos_tags]


def _keywords_present(text, keywords):
    """Return the *keywords* that occur in *text* as case-insensitive substrings."""
    # NOTE(review): substring matching also fires inside longer words
    # (e.g. "Java" within "javascript"); kept as-is to preserve scoring.
    lowered = text.lower()
    return [keyword for keyword in keywords if keyword.lower() in lowered]


def _unique_terms(terms):
    """Lowercase *terms*, drop those of 2 characters or fewer, de-duplicate and sort."""
    # Sorting makes the report reproducible; plain set order varies between runs.
    return sorted({term.lower() for term in terms if len(term) > 2})


def _embed_terms(terms):
    """Embed *terms* with the sentence model, or return None when the list is empty."""
    return embedding_model.encode(terms) if terms else None


def _best_similarity(reference_embeddings, cv_embeddings):
    """Return the highest pairwise cosine similarity, or 0 when the CV side is empty."""
    if cv_embeddings is None:
        return 0
    return np.max(cosine_similarity(reference_embeddings, cv_embeddings))


def analyze_cv_skills(cv_text):
    """
    Analyze a CV and recommend career paths based on combined similarity scores.

    The CV is split into sections, terms are extracted from each section
    (spaCy tokens/entities plus keyword lookups), and every career in
    CAREER_RECOMMENDATIONS is scored as
    0.5 * skills + 0.1 * hobbies + 0.2 * qualifications + 0.2 * experience,
    where each component is the maximum cosine similarity between the
    career's skill embeddings and the embeddings of the CV terms.

    Args:
        cv_text (str): Raw CV text (it is sectioned internally)

    Returns:
        str: Markdown report listing the top five careers with their scores,
            followed by the skills, hobbies, qualifications and experience
            terms identified in the CV.
    """
    # Preprocess CV
    cv_info = preprocess_cv_text(cv_text)

    all_skills = []
    all_hobbies = []
    all_qualifications = []
    all_experience = []

    # Skill extraction: named entities, nouns/adjectives and known keywords.
    if 'skills' in cv_info:
        skill_text = cv_info['skills']
        doc = nlp(skill_text)
        all_skills.extend(
            ent.text for ent in doc.ents if ent.label_ in ('SKILL', 'ORG', 'PRODUCT')
        )
        all_skills.extend(token.text for token in doc if token.pos_ in ('NOUN', 'ADJ'))
        all_skills.extend(_keywords_present(skill_text, _SKILL_KEYWORDS))

    # Experience, projects and training all feed the experience term list.
    for section, keywords in _EXPERIENCE_SECTIONS:
        if section in cv_info:
            section_text = cv_info[section]
            all_experience.extend(_tokens_with_pos(section_text, ('NOUN', 'VERB')))
            all_experience.extend(_keywords_present(section_text, keywords))

    # Hobby extraction
    if 'hobbies' in cv_info:
        all_hobbies.extend(_tokens_with_pos(cv_info['hobbies'], ('NOUN', 'VERB', 'ADJ')))

    # Qualification extraction
    if 'education' in cv_info:
        education_text = cv_info['education']
        all_qualifications.extend(_tokens_with_pos(education_text, ('NOUN', 'ADJ')))
        all_qualifications.extend(_keywords_present(education_text, _QUALIFICATION_KEYWORDS))

    # Remove duplicates and convert to lowercase
    all_skills = _unique_terms(all_skills)
    all_hobbies = _unique_terms(all_hobbies)
    all_qualifications = _unique_terms(all_qualifications)
    all_experience = _unique_terms(all_experience)

    # The CV embeddings do not depend on the career, so compute them once
    # instead of once per career; empty term lists are never sent to the model.
    cv_skill_embeddings = _embed_terms(all_skills)
    cv_hobby_embeddings = _embed_terms(all_hobbies)
    cv_qualification_embeddings = _embed_terms(all_qualifications)
    cv_experience_embeddings = _embed_terms(all_experience)

    # Calculate similarity scores for each career recommendation
    career_scores = []
    for career in CAREER_RECOMMENDATIONS:
        # Skills are compared term-by-term; the other sections are compared
        # against a single embedding of the career's whole skill list.
        career_skill_embeddings = embedding_model.encode(career['skills'])
        career_profile_embedding = embedding_model.encode([", ".join(career['skills'])])

        # Max (not average) similarity is used for every component.
        skills_similarity = _best_similarity(career_skill_embeddings, cv_skill_embeddings)
        hobby_similarity = _best_similarity(career_profile_embedding, cv_hobby_embeddings)
        qualification_similarity = _best_similarity(
            career_profile_embedding, cv_qualification_embeddings
        )
        experience_similarity = _best_similarity(
            career_profile_embedding, cv_experience_embeddings
        )

        # Calculate weighted sum of similarities
        total_similarity = (
            (0.5 * skills_similarity)
            + (0.1 * hobby_similarity)
            + (0.2 * qualification_similarity)
            + (0.2 * experience_similarity)
        )
        career_scores.append({
            'title': career['title'],
            'description': career['description'],
            'score': total_similarity,
        })

    # Sort careers by similarity score (stable, so ties keep catalogue order)
    ranked_careers = sorted(career_scores, key=lambda x: x['score'], reverse=True)

    # Prepare recommendation report
    parts = [
        "### Career Recommendation Analysis\n\n",
        "**Top Career Recommendations**:\n",
    ]
    for career in ranked_careers[:5]:  # Display top 5 recommendations
        parts.append(f"- **{career['title']}**\n")
        parts.append(f"  *{career['description']}*\n")
        parts.append(f"  *Similarity Score: {career['score']:.2f}*\n")

    parts.append("\n**Skills Match**:\n")
    parts.append("- Identified Skills: " + ", ".join(all_skills) + "\n\n")

    parts.append("**Hobbies Match**:\n")
    parts.append("- Identified Hobbies: " + ", ".join(all_hobbies) + "\n\n")

    parts.append("**Qualification Match**:\n")
    parts.append("- Identified Qualifications: " + ", ".join(all_qualifications) + "\n\n")

    parts.append("**Experience Match**:\n")
    parts.append("- Identified Experience: " + ", ".join(all_experience) + "\n\n")

    return "".join(parts)

def cv_skill_assessment(cv_file):
    """
    Main function to process uploaded CV and provide skill assessment

    The file is first read as a PDF; if that yields no text it is read as a
    UTF-8 text file instead. Any failure is reported in the returned string
    rather than raised.

    Args:
        cv_file (str): Path to uploaded CV file

    Returns:
        str: Skill assessment and career recommendations, or a message
            starting with "Error processing CV: " when processing fails.
    """
    # Nothing to read when no path was supplied (e.g. the form was submitted
    # without a file); say so instead of surfacing a raw TypeError message.
    if not cv_file:
        return "Error processing CV: no file was uploaded."

    try:
        # Extract text from PDF
        cv_text = extract_text_from_pdf(cv_file)

        # If PDF extraction fails, try direct text processing
        if not cv_text.strip():
            with open(cv_file, 'r', encoding='utf-8') as f:
                cv_text = f.read()

        # Analyze CV and get recommendations
        return analyze_cv_skills(cv_text)

    except Exception as e:
        # Top-level UI boundary: show the failure to the user as text.
        return f"Error processing CV: {str(e)}"

# Create Gradio Interface
def launch_cv_skill_assessment_app():
    """
    Launch the CV Skill Assessment AI Gradio Interface

    Builds a single-function interface (file upload in, Markdown report out)
    around cv_skill_assessment and starts it with debug mode enabled.
    """
    demo = gr.Interface(
        fn=cv_skill_assessment,
        # type="filepath" hands the handler a path string, which is what
        # cv_skill_assessment expects.
        inputs=gr.File(label="Upload Your CV (PDF/Text)", type="filepath"),
        outputs=gr.Markdown(label="Career Recommendation Report"),
        # The title previously contained mojibake ("πŸš€"), i.e. the UTF-8
        # bytes of the rocket emoji decoded with the wrong charset.
        title="🚀 CV Skills Assessment AI",
        description="""
        Discover your ideal career path based on your CV!

        *How to use*:
        1. Upload your CV (PDF or Text file)
        2. Our AI analyzes your skills, experience, and background
        3. Receive personalized career recommendations

        *Features*:
        - Advanced CV parsing
        - Skill extraction
        - Domain-based career matching
        - Detailed recommendation report
        """,
        theme="huggingface"
    )

    demo.launch(debug=True)

# Run the application only when this file is executed as a script, so that
# importing the module (e.g. to reuse or test its functions) does not launch
# the Gradio app.
if __name__ == "__main__":
    launch_cv_skill_assessment_app()