import streamlit as st
import os
#import sys
from PyPDF2 import PdfReader
import docx2txt
from transformers import pipeline
import pandas as pd


def fetch_pdf_doc_file(directory):
    """Return the paths of the ``.pdf`` and ``.docx`` entries in *directory*.

    Each path is *directory* joined to the entry name with a forward slash.
    The suffix check is case-sensitive and entries come back in whatever
    order ``os.listdir`` yields them.
    """
    wanted_suffixes = ('.pdf', '.docx')
    return [
        directory + "/" + entry
        for entry in os.listdir(directory)
        if entry.endswith(wanted_suffixes)
    ]

# extract texts from files
def extract_text(files_list):
    """Return the lower-cased text of every PDF/DOCX document in *files_list*.

    Args:
        files_list: Iterable of documents. Each item may be a filesystem path
            or a file-like object (for example an uploaded file exposing a
            ``name`` attribute).

    Returns:
        A single lower-cased string containing the text of all documents,
        with pages/documents separated by newlines. An empty input yields
        an empty string.
    """
    chunks = []
    for file in files_list:
        # File-like objects carry their filename in ``.name``; a plain path
        # is its own name. The name is only used to pick the parser.
        name = str(getattr(file, "name", file)).lower()
        if name.endswith(".docx"):
            chunks.append(docx2txt.process(file) or "")
        else:
            # A reader must be built per document; it cannot be shared.
            reader = PdfReader(file)
            # Guard against pages that yield no text at all.
            chunks.extend(page.extract_text() or "" for page in reader.pages)
    # Lower-case once at the end instead of after every page.
    return "\n".join(chunks).lower()

# Token-classification pipeline that skill_extract() calls to tag skill tokens in text.
# It is built once, when this module is executed, from the model named below.
# NOTE(review): Streamlit presumably re-executes this script on every interaction, which
# would rebuild the pipeline each time -- consider caching it; confirm before changing.
pipe = pipeline("token-classification", model="algiraldohe/lm-ner-linkedin-skills-recognition")
def skill_extract(text):
    """Return the set of technical skill words the NER pipeline finds in *text*.

    Only tokens tagged ``B-/I-TECHNICAL`` or ``B-/I-TECHNOLOGY`` are kept.
    Tokens starting with ``##`` are treated as continuations and are glued
    (without the ``##`` prefix) onto the previously kept word.

    Args:
        text: The text to analyse.

    Returns:
        A set of the merged skill words; empty when *text* is empty.
    """
    if not text:
        # Nothing to analyse; skip the model call entirely.
        return set()

    wanted_entities = {'B-TECHNICAL', 'I-TECHNICAL', 'B-TECHNOLOGY', 'I-TECHNOLOGY'}
    merged = []
    for entry in pipe(text):
        if entry['entity'] not in wanted_entities:
            continue
        word = entry['word']
        if not word.startswith("##"):
            merged.append(word)
            continue
        fragment = word[2:]
        if merged:
            merged[-1] += fragment
        elif fragment:
            # A continuation with no preceding kept word: keep it on its own
            # rather than wrapping around and appending it to the last word.
            merged.append(fragment)
    return set(merged)


# function for matching and returning skills
def match(required_skills, resume_skills):
    """Score how well *resume_skills* cover *required_skills*.

    Skills are compared case-insensitively, because the resume text is
    lower-cased before extraction while the required skills keep their
    display casing.

    Args:
        required_skills: Iterable of skill names that are required.
        resume_skills: Iterable of skill names found in the resume.

    Returns:
        A ``(missing_skills, score_percentage)`` tuple: the set of required
        skills (in their original casing) absent from the resume, and the
        percentage of required skills that were found. When no skills are
        required the result is ``(set(), 0.0)`` instead of a division error.
    """
    required = set(required_skills)
    if not required:
        return set(), 0.0

    resume_folded = {skill.casefold() for skill in resume_skills}
    missing_skills = {
        skill for skill in required if skill.casefold() not in resume_folded
    }
    matched_count = len(required) - len(missing_skills)
    score_percentage = (matched_count / len(required)) * 100
    return missing_skills, score_percentage


# Skills offered for selection; each one is rendered as a checkbox below.
required_skills = ["Python", "Java", "Django", "Machine Learning", "Data Science", "Communication", 'Natural language processing (nlp)']

# Streamlit UI
st.title("TalentMatch")
st.header("Select the required skills")

# Lay the checkboxes out in three side-by-side columns.
col1, col2, col3 = st.columns(3)
columns = (col1, col2, col3)

# Deal the skills out to the columns round-robin and keep the ticked ones.
selected_skills = [
    skill
    for position, skill in enumerate(required_skills)
    if columns[position % 3].checkbox(skill)
]

# Only the two document formats this app works with are accepted.
pdf_docs = st.file_uploader(
    "upload your files and click on process",
    type=["pdf", "docx"],
    accept_multiple_files=True,
)

# All processing is gated behind the button: the script is executed from top
# to bottom, so ungated code would run (and show results) before any click.
if selected_skills and pdf_docs and st.button("Process"):
    st.write("Processing...")

    result_data = []
    # Score every uploaded file on its own.
    for file in pdf_docs:
        # Pass just this file so its score reflects only its own text.
        text = extract_text([file])
        resume_skills = skill_extract(text)
        # Score against the skills the user ticked, not the full catalogue.
        missing_skills, score = match(selected_skills, resume_skills)
        result_data.append({
            "File": file.name,
            "Score": f"{score:.1f}%",
            "Missing Skills": ", ".join(sorted(missing_skills)),
        })

    # Build the result table. Rows stay in upload order; "Score" is a
    # formatted string, so sort on the numeric score if ranking is needed.
    df = pd.DataFrame(result_data)

    # Display the result table.
    st.subheader("Processing Completed")
    st.subheader("RESULT")
    st.table(df)