File size: 3,687 Bytes
8322a94
 
af2aec4
8322a94
af2aec4
 
8322a94
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
af2aec4
8322a94
 
af2aec4
 
 
 
 
 
 
 
 
 
 
 
 
 
8322a94
 
 
 
 
 
 
 
 
 
 
 
af2aec4
8322a94
 
af2aec4
8322a94
 
 
 
 
 
 
 
 
 
 
 
 
af2aec4
8322a94
 
 
 
af2aec4
 
 
8322a94
af2aec4
 
 
 
 
 
 
 
 
 
 
8322a94
 
af2aec4
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
import streamlit as st
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
from sklearn.preprocessing import MinMaxScaler
import re
from PyPDF2 import PdfReader

def extract_text_from_file(file):
    """Return the text content of an uploaded file.

    Uploads whose ``type`` is ``"application/pdf"`` are handed to
    ``extract_text_from_pdf``; every other upload is read in full and
    decoded as UTF-8.
    """
    is_pdf = file.type == "application/pdf"
    if not is_pdf:
        raw_bytes = file.read()
        return raw_bytes.decode('utf-8')
    return extract_text_from_pdf(file)

def extract_text_from_pdf(file):
    """Return the text of every page of a PDF, concatenated in page order.

    The file is opened with ``PdfReader`` and the per-page text is joined
    with no separator between pages.
    """
    pdf = PdfReader(file)
    return "".join(page.extract_text() for page in pdf.pages)

def clean_text(text):
    """Normalize text for matching.

    Every non-word character (regex ``\\W``) is replaced by a single space,
    and the result is then lowercased.
    """
    non_word = re.compile(r'\W')
    spaced = non_word.sub(' ', text)
    return spaced.lower()

def calculate_similarity_metrics(resumes, keywords):
    """Score every resume against the keyword text with three measures.

    Args:
        resumes: Sequence of resume texts (already cleaned by the caller).
        keywords: The keyword text to compare each resume against.

    Returns:
        A tuple ``(cosine_sim, jaccard_sim, euclidean_sim)`` with one entry
        per resume, in input order. ``cosine_sim`` and ``euclidean_sim`` are
        flattened arrays computed on TF-IDF vectors; ``jaccard_sim`` is a
        list of floats computed on whitespace-separated token sets.
    """
    tfidf_vectorizer = TfidfVectorizer()
    # The keyword text is fitted together with the resumes (as the last
    # document) so that all vectors share one vocabulary.
    tfidf_matrix = tfidf_vectorizer.fit_transform([*resumes, keywords])
    keyword_vector = tfidf_matrix[-1]
    resume_vectors = tfidf_matrix[:-1]

    cosine_sim = cosine_similarity(keyword_vector, resume_vectors).flatten()

    def jaccard_similarity(tokens_a, tokens_b):
        union = tokens_a | tokens_b
        # Two empty token sets (e.g. a text-less PDF and punctuation-only
        # keywords) have no overlap to measure; score 0 instead of dividing
        # by zero.
        if not union:
            return 0.0
        return len(tokens_a & tokens_b) / len(union)

    # The keyword tokens are the same for every resume, so build them once.
    keyword_tokens = set(keywords.split())
    jaccard_sim = [
        jaccard_similarity(keyword_tokens, set(resume.split()))
        for resume in resumes
    ]

    euclidean_dist = euclidean_distances(keyword_vector, resume_vectors).flatten()
    # Map distance [0, inf) onto a similarity in (0, 1]; identical vectors give 1.
    euclidean_sim = 1 / (1 + euclidean_dist)

    return cosine_sim, jaccard_sim, euclidean_sim

# Streamlit page: collect weighted keywords in the sidebar, accept resume
# uploads, score each resume against the keywords and show a ranked table.


def _combine_keywords(keywords_df):
    """Build the weighted keyword string from the sidebar keyword table.

    Rows whose keyword is missing or blank are skipped. A keyword is repeated
    ``Priority`` times when the priority consists only of decimal digits, and
    is used once otherwise (blank, missing, or non-numeric priority).

    Returns:
        The space-separated keywords, or ``""`` when no keyword was entered.
    """
    weighted_keywords = []
    for keyword, priority in zip(keywords_df['Keyword'], keywords_df['Priority']):
        # Cells may hold None/NaN instead of '' (e.g. rows added in the
        # editor), so do not assume every cell is a string.
        if pd.isna(keyword):
            continue
        keyword_text = str(keyword).strip()
        if not keyword_text:
            continue
        priority_text = str(priority).strip()
        # isdecimal() only accepts characters int() can parse; isdigit() also
        # accepts e.g. superscript digits, which make int() raise.
        repeats = int(priority_text) if priority_text.isdecimal() else 1
        weighted_keywords.extend([keyword_text] * repeats)
    return " ".join(weighted_keywords)


st.title("Resume Analyzer")

st.sidebar.subheader("Enter Keywords and Priority")
data = pd.DataFrame({
    'Keyword': [''] * 10,
    'Priority': [''] * 10,
})
keywords_df = st.sidebar.data_editor(data, num_rows="dynamic", key="keyword_table")

if not keywords_df['Keyword'].isnull().all():
    # Empty when every keyword cell is blank, so the analysis below is skipped
    # until at least one keyword has been entered.
    keywords_combined = _combine_keywords(keywords_df)

    # NOTE: the "up to 5" in the label is advisory; the count is not enforced.
    st.subheader("Upload up to 5 resumes (PDF or Text files)")
    uploaded_files = st.file_uploader("Choose Resume Files", accept_multiple_files=True, type=["txt", "pdf"])

    if uploaded_files and keywords_combined:
        with st.spinner("Analyzing Resumes..."):
            # Names and texts are collected together so that a file which
            # fails to parse is dropped from both and the columns of the
            # results table stay the same length.
            resume_names = []
            resumes = []
            for file in uploaded_files:
                try:
                    resume_text = extract_text_from_file(file)
                    clean_resume = clean_text(resume_text)
                except Exception as e:
                    st.error(f"Error processing {file.name}: {str(e)}")
                    continue
                resume_names.append(file.name)
                resumes.append(clean_resume)

            # Nothing to score if every upload failed; the errors above
            # already explain why.
            if resumes:
                clean_keywords = clean_text(keywords_combined)

                cosine_scores, jaccard_scores, euclidean_scores = calculate_similarity_metrics(resumes, clean_keywords)

                st.subheader("Resume Analysis Results")
                results_df = pd.DataFrame({
                    'Resume': resume_names,
                    'Cosine Similarity': cosine_scores,
                    'Jaccard Index': jaccard_scores,
                    'Euclidean Similarity': euclidean_scores,
                })

                # Each metric is rescaled across the uploaded batch before
                # averaging, so the overall score is relative to the other
                # resumes in the same run.
                scaler = MinMaxScaler()
                normalized_scores = scaler.fit_transform(results_df[['Cosine Similarity', 'Jaccard Index', 'Euclidean Similarity']])

                overall_scores = np.mean(normalized_scores, axis=1)
                results_df['Overall Score'] = overall_scores

                # Rank 1 is the best match; ties share the lowest rank.
                results_df['Rank'] = results_df['Overall Score'].rank(ascending=False, method='min').astype(int)

                results_df = results_df.sort_values('Rank')

                st.dataframe(results_df)
else:
    st.info("Please upload resumes and enter keywords with priority.")