Spaces:
Configuration error
Configuration error
import streamlit as st | |
import os | |
#import sys | |
from PyPDF2 import PdfReader | |
import docx2txt | |
from transformers import pipeline | |
import pandas as pd | |
def fetch_pdf_doc_file(directory):
    """Return the paths of the .pdf and .docx files found directly in *directory*.

    Each path is built as ``directory + "/" + filename``; entries are returned
    in the order ``os.listdir`` yields them. Sub-directories are not searched.
    """
    wanted_suffixes = ('.pdf', '.docx')
    return [
        directory + "/" + entry
        for entry in os.listdir(directory)
        if entry.endswith(wanted_suffixes)
    ]
# extract texts from files
def extract_text(files_list):
    """Return the lower-cased text of every PDF/DOCX document in *files_list*.

    Args:
        files_list: An iterable of documents, or a single document. Each
            document may be a filesystem path or an open binary file-like
            object (anything with ``read``); file-like objects are classified
            by their ``name`` attribute when they have one.

    Returns:
        The text of all documents joined with newlines and lower-cased.
        An empty input yields ``""``.
    """
    # Accept a lone path / file object as well as a collection of them.
    if isinstance(files_list, (str, os.PathLike)) or hasattr(files_list, "read"):
        files_list = [files_list]

    documents = []
    for file in files_list:
        if isinstance(file, (str, os.PathLike)):
            name = str(file)
        else:
            name = str(getattr(file, "name", ""))

        if name.lower().endswith(".docx"):
            documents.append(docx2txt.process(file) or "")
        else:
            # A reader must be opened per document; it cannot be created
            # without a source.
            reader = PdfReader(file)
            # Guard against pages for which no text is returned.
            documents.append(
                "".join(page.extract_text() or "" for page in reader.pages)
            )
    return "\n".join(documents).lower()
# Token-classification pipeline that skill_extract calls to tag skill tokens.
# It is created once, when this module is executed.
pipe = pipeline(
    task="token-classification",
    model="algiraldohe/lm-ner-linkedin-skills-recognition",
)
def skill_extract(text, ner_pipeline=None):
    """Return the set of technical skill words recognised in *text*.

    Args:
        text: The text to analyse.
        ner_pipeline: Optional callable taking the text and returning a list
            of dicts with ``'word'`` and ``'entity'`` keys. Defaults to the
            module-level ``pipe``.

    Returns:
        A set of words tagged B-/I-TECHNICAL or B-/I-TECHNOLOGY, with
        ``##``-prefixed continuation pieces glued onto the preceding word.
    """
    if not text:
        return set()
    if ner_pipeline is None:
        ner_pipeline = pipe

    wanted_tags = {'B-TECHNICAL', 'I-TECHNICAL', 'B-TECHNOLOGY', 'I-TECHNOLOGY'}
    merged = []
    for entry in ner_pipeline(text):
        if entry['entity'] not in wanted_tags:
            continue
        word = entry['word']
        if word.startswith("##"):
            piece = word[2:]
            if merged:
                merged[-1] += piece
            else:
                # No preceding word to attach to: keep the fragment on its
                # own instead of appending it to an unrelated word.
                merged.append(piece)
        else:
            merged.append(word)
    return set(merged)
# function for matching and returning skills
def match(required_skills, resume_skills):
    """Compare required skills with the skills found in a resume.

    Skills are compared case-insensitively and ignoring surrounding
    whitespace, so a required "Python" matches an extracted "python".

    Args:
        required_skills: Iterable of required skill names.
        resume_skills: Iterable of skill names extracted from the resume.

    Returns:
        A ``(missing_skills, score_percentage)`` tuple: the set of required
        skills (in their original spelling) absent from the resume, and the
        percentage of required skills that were found. When no skills are
        required the result is ``(set(), 0.0)``.
    """
    # Normalised spelling -> original spelling, so missing skills can be
    # reported exactly as the caller wrote them.
    required_by_key = {}
    for skill in required_skills:
        required_by_key.setdefault(skill.strip().casefold(), skill)

    # Nothing required: avoid dividing by zero.
    if not required_by_key:
        return set(), 0.0

    resume_keys = {skill.strip().casefold() for skill in resume_skills}
    missing_skills = {
        original
        for key, original in required_by_key.items()
        if key not in resume_keys
    }
    matched_count = len(required_by_key) - len(missing_skills)
    score_percentage = (matched_count / len(required_by_key)) * 100
    return missing_skills, score_percentage
# Skills offered as checkboxes in the UI.
required_skills = ["Python", "Java", "Django", "Machine Learning", "Data Science", "Communication", 'Natural language processing (nlp)']
# Skills the user ticked; resumes are scored against these.
selected_skills = []

# Streamlit UI
st.title("TalentMatch")
st.header("Select the required skills")

# Lay the checkboxes out round-robin across three columns.
columns = st.columns(3)
for i, skill in enumerate(required_skills):
    if columns[i % 3].checkbox(skill):
        selected_skills.append(skill)

# Only the formats the text extraction is written for are accepted.
pdf_docs = st.file_uploader(
    "upload your files and click on process",
    type=["pdf", "docx"],
    accept_multiple_files=True,
)

if selected_skills and pdf_docs and st.button("Process"):
    st.write("Processing...")

    # Score each uploaded file on its own text, against the ticked skills.
    scored = []
    for file in pdf_docs:
        text = extract_text([file])
        resume_skills = skill_extract(text)
        missing_skills, score = match(selected_skills, resume_skills)
        scored.append((score, file.name, missing_skills))

    # Sort on the numeric score before it is formatted as a string.
    scored.sort(key=lambda row: row[0], reverse=True)
    result_data = [
        {
            "File": name,
            "Score": f"{score:.0f}%",
            "Missing Skills": ", ".join(sorted(missing)),
        }
        for score, name, missing in scored
    ]

    # create a dataframe and display the result table
    df = pd.DataFrame(result_data)
    st.subheader("Processing Completed")
    st.subheader("RESULT")
    st.table(df)