Souha Ben Hassine committed on
Commit 717f996
1 Parent(s): 22ffb5a
Files changed (1)
  1. app.py +140 -4
app.py CHANGED
@@ -1,7 +1,143 @@
  import gradio as gr
 
- def greet(name):
-     return "Hello " + name + "!!"
 
- iface = gr.Interface(fn=greet, inputs="text", outputs="text")
- iface.launch()
+ import pandas as pd
+ import spacy
+ from spacy.pipeline import EntityRuler
+ from spacy.lang.en import English
+ from spacy.tokens import Doc
+ import gensim
+ from gensim import corpora
+ from spacy import displacy
+ import pyLDAvis.gensim_models
+ from wordcloud import WordCloud
+ import plotly.express as px
+ import matplotlib.pyplot as plt
+ import numpy as np
+ import re
+ import nltk
+ from nltk.corpus import stopwords
+ from nltk.stem import WordNetLemmatizer
+ nltk.download(['stopwords', 'wordnet'])
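+ # Note: depending on the NLTK version, the lemmatizer may also need the
+ # 'omw-1.4' resource; add it to the download list if lemmatization fails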
+
+
+ # Load the CSV file into a DataFrame
+ dataset_path = "Resume.csv"
+ data = pd.read_csv(dataset_path)
+
+ # Load the spaCy English language model with large vocabulary and pre-trained word vectors
+ nlp = spacy.load("en_core_web_lg")
+
+ # Path to the file containing skill patterns in JSONL format
+ skill_pattern_path = "jz_skill_patterns.jsonl"
+
+ # Add an entity ruler to the spaCy pipeline
+ ruler = nlp.add_pipe("entity_ruler")
+
+ # Load skill patterns from disk into the entity ruler
+ ruler.from_disk(skill_pattern_path)
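+ # Each line of the JSONL file is expected to be one pattern entry carrying the
+ # "SKILL" label used below; an illustrative (not verbatim) example line:
+ # {"label": "SKILL", "pattern": [{"LOWER": "machine"}, {"LOWER": "learning"}]}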
+
+
+ def get_unique_skills(text):
+     doc = nlp(text)
+     skills = set()
+     for ent in doc.ents:
+         if ent.label_ == "SKILL":
+             skills.add(ent.text)
+     return list(skills)
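+
+ # Example (illustrative): get_unique_skills("experienced in python and sql")
+ # might return ["python", "sql"], depending on the patterns loaded above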
+
+ def preprocess_resume(resume_str):
+     # Remove special characters, URLs, and Twitter mentions
+     review = re.sub(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+://\S+)|^rt|http\S+", " ", resume_str)
+
+     # Convert to lowercase and tokenize
+     review = review.lower().split()
+
+     # Lemmatize and remove stopwords (build the stopword set once)
+     lm = WordNetLemmatizer()
+     stop_words = set(stopwords.words("english"))
+     review = [lm.lemmatize(word) for word in review if word not in stop_words]
+
+     # Join the words back into a string
+     review = " ".join(review)
+     return review
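+
+ # Example (illustrative): preprocess_resume("Worked at ACME Corp! See http://acme.example")
+ # -> "worked acme corp see" (exact output depends on the NLTK stopword and lemma data)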
+
+ # Job categories present in the dataset, plus an "ALL" option
+ Job_cat = data["Category"].unique()
+ Job_cat = np.append(Job_cat, "ALL")
+ Job_Category = "INFORMATION-TECHNOLOGY"
+
+ def get_skills_distribution(Job_Category):
+     if Job_Category != "ALL":
+         filtered_data = data[data["Category"] == Job_Category]["skills"]
+     else:
+         filtered_data = data["skills"]
+
+     total_skills = [skill for sublist in filtered_data for skill in sublist]
+
+     fig = px.histogram(
+         x=total_skills,
+         labels={"x": "Skills"},
+         title=f"{Job_Category} Distribution of Skills",
+     ).update_xaxes(categoryorder="total descending")
+
+     # Return the Plotly figure (fig.show() would return None)
+     return fig
+
+ # Apply the preprocess_resume function to each resume string and store the result in a new column
+ data["Clean_Resume"] = data["Resume_str"].apply(preprocess_resume)
+
+ # Extract skills from each preprocessed resume and store them in a new column
+ data["skills"] = data["Clean_Resume"].str.lower().apply(get_unique_skills)
+
+ # Now that the "skills" column exists, the distribution can be plotted
+ get_skills_distribution(Job_Category)
+
+ # Register each job category as an entity-ruler pattern
+ patterns = data.Category.unique()
+ for a in patterns:
+     ruler.add_patterns([{"label": "Job-Category", "pattern": a}])
+
+
+ # Keep using the en_core_web_lg pipeline built above: it already contains the
+ # entity ruler, and loading a fresh model here would drop the SKILL and
+ # Job-Category patterns that the highlighting below relies on
+
+ # Define the styles and options for highlighting entities
+ colors = {
+     "Job-Category": "linear-gradient(90deg, #aa9cfc, #fc9ce7)",
+     "SKILL": "linear-gradient(90deg, #9BE15D, #00E3AE)",
+     "ORG": "#ffd966",
+     "PERSON": "#e06666",
+     "GPE": "#9fc5e8",
+     "DATE": "#c27ba0",
+     "ORDINAL": "#674ea7",
+     "PRODUCT": "#f9cb9c",
+ }
+ options = {
+     "ents": [
+         "Job-Category",
+         "SKILL",
+         "ORG",
+         "PERSON",
+         "GPE",
+         "DATE",
+         "ORDINAL",
+         "PRODUCT",
+     ],
+     "colors": colors,
+ }
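+
+ # displacy highlights only the labels listed under "ents"; any other entity
+ # types in the doc are rendered as plain text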
+
+ # Define a function to process the resume text and highlight entities
+ def highlight_entities(resume_text):
+     # Process the resume text with spaCy
+     doc = nlp(resume_text)
+     # Render the entities with displacy and return the HTML
+     html = displacy.render(doc, style="ent", options=options, jupyter=False)
+     return html
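+
+ # Example (illustrative): highlight_entities("John built ETL pipelines at Google")
+ # should return an HTML string with <mark> spans for entities such as PERSON and ORG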
+
+ # Create the Gradio interface
+ iface = gr.Interface(
+     fn=highlight_entities,
+     inputs=gr.Textbox(lines=10, label="Input Resume Text"),
+     outputs=gr.HTML(label="Highlighted Entities"),
+     title="Resume Entity Highlighter",
+     description="Enter your resume text and see entities highlighted.",
+     theme="compact",
+ )
+
+ # Launch the interface
+ iface.launch()