Ammar-Abdelhady-ai committed on
Commit
a16181d
1 Parent(s): 9094907

Add application file

Files changed (4)
  1. Dockerfile +20 -0
  2. functions.py +31 -0
  3. main.py +98 -0
  4. requirements.txt +25 -0
Dockerfile ADDED
@@ -0,0 +1,20 @@
+ FROM python:3.9
+
+ WORKDIR /code
+
+ COPY ./requirements.txt /code/requirements.txt
+
+ RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt
+
+ RUN useradd -m -u 1000 user
+
+ USER user
+
+ ENV HOME=/home/user \
+     PATH=/home/user/.local/bin:$PATH
+
+ WORKDIR $HOME/app
+
+ COPY --chown=user . $HOME/app
+
+ CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
functions.py ADDED
@@ -0,0 +1,31 @@
+ import os
+ import tempfile
+
+ import fitz  # PyMuPDF
+ import numpy as np
+ from sklearn.metrics.pairwise import cosine_similarity
+
+
+ def extract_text_from_pdf(pdf_content):
+     """Write the uploaded PDF bytes to a temporary file and extract its text."""
+     text = ''
+     with tempfile.NamedTemporaryFile(delete=False) as temp_file:
+         temp_file.write(pdf_content)
+         temp_path = temp_file.name
+
+     pdf_document = fitz.open(temp_path)
+     for page_number in range(pdf_document.page_count):
+         page = pdf_document[page_number]
+         text += page.get_text()
+
+     pdf_document.close()  # Close the PDF document explicitly
+     os.remove(temp_path)  # Remove the temporary file after use
+     return text.replace("\xa0", "")
+
+
+ def get_most_similar_job(data, cv_vect, df_vect):
+     """Rank job postings by cosine similarity to the CV vector, most similar first."""
+     # `data` is unused here; the parameter is kept to match the call in main.py.
+     similarities = cosine_similarity(cv_vect, df_vect).flatten()
+     indices = np.argsort(similarities)[::-1]
+     return indices
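
For reference, a minimal sketch (not part of the commit) of how these helpers compose, assuming scikit-learn and PyMuPDF are installed and a cv.pdf exists on disk; the sample postings are illustrative stand-ins for all.csv:

from sklearn.feature_extraction.text import TfidfVectorizer
from functions import extract_text_from_pdf, get_most_similar_job

# Illustrative corpus standing in for the job postings in all.csv.
jobs = ["Python backend developer, FastAPI", "Data analyst, SQL and pandas"]
vectorizer = TfidfVectorizer(stop_words='english')
df_vect = vectorizer.fit_transform(jobs)

# Read any PDF from disk and rank the postings against it.
with open("cv.pdf", "rb") as f:
    cv_text = extract_text_from_pdf(f.read())
cv_vect = vectorizer.transform([cv_text])
ranking = get_most_similar_job(data=cv_text, cv_vect=cv_vect, df_vect=df_vect)
print(ranking[0])  # index of the most similar posting
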
main.py ADDED
@@ -0,0 +1,98 @@
+ import threading
+
+ import pandas as pd
+ from fastapi import FastAPI, HTTPException, UploadFile
+ from sklearn.feature_extraction.text import TfidfVectorizer
+
+ from functions import extract_text_from_pdf, get_most_similar_job
+
+
+ # Load the summarization model in a background thread so the API can
+ # start serving while the model weights are still downloading.
+ summarizer = None
+ def define_summarizer():
+     from transformers import pipeline
+     global summarizer
+     summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
+     print("\n\nDefinition done")
+ define = threading.Thread(target=define_summarizer)
+ define.start()
+
+
+ # Each chunk's summary is written at its own index so the final join
+ # preserves chunk order regardless of thread scheduling.
+ summ_data = ["", "", ""]
+
+ def summarization(text, index):
+     part = summarizer(text, max_length=150, min_length=30, do_sample=False)
+     summ_data[index] = part[0]["summary_text"].replace("\xa0", "")
+
+
+ def fit_threads(text):
+     define.join()  # wait until the summarizer has finished loading
+
+     # Summarize the three CV chunks in parallel.
+     a = threading.Thread(target=summarization, args=(text[0], 0))
+     b = threading.Thread(target=summarization, args=(text[1], 1))
+     c = threading.Thread(target=summarization, args=(text[2], 2))
+
+     # Start all threads
+     a.start()
+     b.start()
+     c.start()
+
+     # Wait for all threads to finish
+     a.join()
+     b.join()
+     c.join()
+     print("Summarization done")
+
+
+ # Build the TF-IDF index over the job postings.
+ df = pd.read_csv("all.csv")
+ df['concatenated_column'] = (
+     df['job_title'].astype(str)
+     + df['job_description'].astype(str)
+     + df['job_requirements'].astype(str)
+     + df['city_name'].astype(str)
+ )
+ x = df['concatenated_column']
+ vectorizer = TfidfVectorizer(stop_words='english')
+ vectorizer.fit(x)
+ df_vect = vectorizer.transform(x)
+ print(df.shape, len(df))
+
+
+ app = FastAPI(title="cv")
+
+ @app.get("/")
+ async def read_root():
+     return {"Hello": "World, Project name is: CV Description"}
+
+ @app.post("/prediction")
+ async def detect(cv: UploadFile, number_of_jobs: int):
+     if number_of_jobs < 1 or number_of_jobs > df.shape[0]:
+         raise HTTPException(
+             status_code=422,
+             detail=f"Please enter the number of jobs you want as an integer from 1 to {df.shape[0]}.",
+         )
+
+     if cv.filename.split(".")[-1].lower() != "pdf":
+         raise HTTPException(
+             status_code=415, detail="Please upload a PDF file."
+         )
+
+     # Split the CV text into three roughly equal chunks and summarize them.
+     cv_data = extract_text_from_pdf(await cv.read())
+     index = len(cv_data) // 3
+     text = [cv_data[:index], cv_data[index:2 * index], cv_data[2 * index:]]
+     fit_threads(text)
+
+     data = ". ".join(summ_data)
+     cv_vect = vectorizer.transform([data])
+     indices = get_most_similar_job(data=data, cv_vect=cv_vect, df_vect=df_vect)
+     print("All done")
+
+     prediction_data = df.iloc[indices[:number_of_jobs]].applymap(str).to_dict(orient='records')
+
+     return {"prediction": prediction_data}
requirements.txt ADDED
@@ -0,0 +1,25 @@
+ DateTime==5.3
+ joblib==1.3.2
+ json5==0.9.14
+ numpy==1.23.5
+ onnxruntime==1.14.1
+ optimum==1.16.1
+ pandas==1.5.3
+ scikit-learn==1.0.2
+ selenium==4.2.0
+ spacy==2.3.5
+ tblib==2.0.0
+ timm==0.9.7
+ torch==2.0.1+cu117
+ transformers==4.34.1
+ ultralytics==8.0.200
+ uri-template==1.3.0
+ uritemplate==4.1.1
+ urllib3==1.26.18
+ urllib3-secure-extra==0.1.0
+ uvicorn==0.18.3
+ webdriver-manager==4.0.1
+ # Imported by main.py/functions.py but missing above; versions left unpinned.
+ fastapi
+ PyMuPDF
+ python-multipart
+ webdriver-manager==4.0.1