|
import gradio as gr |
|
from imports import * |
|
from parse_info import * |
|
|
|
# Log in to the Hugging Face Hub with the token read from the HF_TOKEN
# environment variable (None when the variable is not set).
token = os.getenv("HF_TOKEN")
login(token=token)

# Every model in this file is moved to this device: the GPU when torch
# reports CUDA as available, the CPU otherwise.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Output index of the sentiment classifier -> human-readable label.
dict_ = dict(enumerate(("negative", "positive", "neutral")))

# Tokenizer and 3-class sequence classifier, both from "nam194/sentiment".
tokenizer_sent = AutoTokenizer.from_pretrained("nam194/sentiment", use_fast=False)
model_sent = AutoModelForSequenceClassification.from_pretrained(
    "nam194/sentiment",
    num_labels=3,
    use_auth_token=True,
)
model_sent = model_sent.to(device)
|
def cvt2cls(data):
    """Reduce a sequence of NER tag ids to the topic ids they refer to.

    The ids follow the layout of ``ner_tags``: 0-9 are the ``B-`` tags,
    10-19 are the ``I-`` tags of the same topics in the same order, and 20
    is the ``O`` tag.

    Args:
        data: Iterable of hashable integer tag ids.

    Returns:
        list: One entry per distinct tag id in ``data`` other than 20, with
        ids of 10 or more shifted down by 10. A topic that occurs under both
        its ``B-`` and its ``I-`` id therefore appears twice. The order of
        the entries is not specified.
    """
    distinct_ids = set(data)
    # 20 is the "O" tag and names no topic; discard() is a no-op when absent,
    # so no exception handling is needed.
    distinct_ids.discard(20)
    return [tag_id - 10 if tag_id >= 10 else tag_id for tag_id in distinct_ids]
|
# Tag id -> BIO tag produced by the topic tagger: ids 0-9 are the "B-" tags,
# ids 10-19 the "I-" tags of the same topics, id 20 is "O".
ner_tags = {
    0: 'B-chỗ để xe',
    1: 'B-con người',
    2: 'B-công việc',
    3: 'B-cơ sở vật chất',
    4: 'B-dự án',
    5: 'B-lương',
    6: 'B-môi trường làm việc',
    7: 'B-ot/thời gian',
    8: 'B-văn phòng',
    9: 'B-đãi ngộ',
    10: 'I-chỗ để xe',
    11: 'I-con người',
    12: 'I-công việc',
    13: 'I-cơ sở vật chất',
    14: 'I-dự án',
    15: 'I-lương',
    16: 'I-môi trường làm việc',
    17: 'I-ot/thời gian',
    18: 'I-văn phòng',
    19: 'I-đãi ngộ',
    20: 'O',
}

# Topic id -> topic name without the B-/I- prefix.
topic_tags = {
    0: 'chỗ để xe',
    1: 'con người',
    2: 'công việc',
    3: 'cơ sở vật chất',
    4: 'dự án',
    5: 'lương',
    6: 'môi trường làm việc',
    7: 'ot/thời gian',
    8: 'văn phòng',
    9: 'đãi ngộ',
}

# Topic tagger (PhoBertLstmCrf) and its tokenizer, loaded from "nam194/ner"
# with one output label per entry of ner_tags.
config = RobertaConfig.from_pretrained("nam194/ner", num_labels=len(ner_tags))
tokenizer_topic = AutoTokenizer.from_pretrained("nam194/ner", use_fast=False)
model_topic = PhoBertLstmCrf.from_pretrained("nam194/ner", config=config, from_tf=False)
model_topic = model_topic.to(device)
# Size the embedding matrix to the number of entries in the tokenizer.
model_topic.resize_token_embeddings(len(tokenizer_topic))
|
|
|
|
|
def sentiment(sent: str):
    """Predict the sentiment of a review and list the topics it mentions.

    The review is logged to stdout, classified with ``model_sent`` and then
    tagged word by word with ``model_topic``; the tag ids of the single
    sample are reduced to topic names through ``cvt2cls`` and ``topic_tags``.

    Args:
        sent: Raw review text.

    Returns:
        str: ``"Sentiment: <label>"`` on the first line and
        ``"Topic in sentence: <Topic>. <Topic>..."`` on the second, with the
        capitalized topic names in sorted order.
    """
    print("\n--------------------------------------------------------------------------------------------------------------------------\n")
    print("New review inference at: ", datetime.utcnow())
    print("review: ", sent)
    print("\n--------------------------------------------------------------------------------------------------------------------------\n")

    # --- Sentiment of the whole review -----------------------------------
    normalized = normalize(text=sent)
    # NOTE(review): the encoded review is not truncated; confirm that very
    # long inputs cannot exceed what model_sent accepts.
    input_ids = torch.tensor([tokenizer_sent.encode(normalized)]).to(device)
    with torch.no_grad():
        probabilities = model_sent(input_ids).logits.softmax(dim=-1).tolist()[0]
    pred_sent = dict_[np.argmax(probabilities)]

    # --- Topics mentioned in the review ----------------------------------
    cleaned = replace_all(text=sent)
    # Split on ".", word-segment every piece and flatten into one word list.
    words = [
        word
        for segment in cleaned.split(".")
        for word in underthesea.word_tokenize(segment.strip(), format="text").split()
    ]
    # The feature builder takes [word, tag] pairs; 'O' is only a placeholder
    # because the tags are what is being predicted.
    tagged_words = [[word, 'O'] for word in words]
    dataset = NerDataset(feature_for_phobert([tagged_words], tokenizer=tokenizer_topic, use_crf=True))
    with torch.no_grad():
        for batch in DataLoader(dataset, batch_size=1):
            batch = {name: tensor.to(device) for name, tensor in batch.items()}
            outputs = model_topic(**batch)

    # Sorted so that the same review always produces the same text: the
    # iteration order of a set of strings can differ between interpreter runs.
    topics = sorted({topic_tags[tag_id] for tag_id in cvt2cls(outputs["tags"][0])})
    return "Sentiment: " + pred_sent + "\n" + "Topic in sentence: " + ". ".join(topic.capitalize() for topic in topics)
|
|
|
|
|
# Processor and token classifier used by pred_resume. OCR is switched off in
# the processor because pred_resume passes the words and boxes itself.
processor = transformers.AutoProcessor.from_pretrained(
    "nam194/resume_parsing_layoutlmv3_large_custom_label",
    use_auth_token=True,
    apply_ocr=False,
)
model = transformers.LayoutLMv3ForTokenClassification.from_pretrained("nam194/resume_parsing_layoutlmv3_large_custom_label")
# Dynamic int8 quantization of the Linear layers is applied on CPU only:
# PyTorch's dynamically quantized kernels run on CPU backends, so a quantized
# model moved to a CUDA device cannot process CUDA inputs.
if device.type == "cpu":
    model = torch.quantization.quantize_dynamic(model, {torch.nn.Linear}, dtype=torch.qint8)
model = model.to(device)

# Labels of the token classifier, listed in class-id order.
label_list = [
    'person_name', 'dob_key', 'dob_value', 'gender_key', 'gender_value',
    'phonenumber_key', 'phonenumber_value', 'email_key', 'email_value',
    'address_key', 'address_value', 'socical_address_value', 'education',
    'education_name', 'education_time', 'experience', 'experience_name',
    'experience_time', 'information', 'undefined', 'designation_key',
    'designation_value', 'degree_key', 'degree_value', 'skill_key', 'skill_value',
]
# Class id -> label and its inverse, derived from label_list so the three
# structures cannot drift apart.
id2label = dict(enumerate(label_list))
label2id = {label: class_id for class_id, label in id2label.items()}

# Labels whose text pred_resume collects into its result.
key_list = [
    "person_name", "dob_value", "gender_value", "phonenumber_value",
    "email_value", "address_value", "socical_address_value", "education_name",
    "education_time", "experience_name", "experience_time",
    "designation_value", "degree_value", "skill_value",
]
|
# Number of positions shared by consecutive 512-token chunks of one page. It
# is passed to the processor as ``stride`` and is also the number of leading
# positions dropped from every chunk after the first.
_CHUNK_STRIDE = 256


def _page_words_and_boxes(blocks, image_size, scale):
    """Collect the non-blank text spans of one page and their boxes.

    Args:
        blocks: The ``'blocks'`` list of ``page.get_text('dict')``.
        image_size: Size of the rendered page image, passed on to
            ``normalize_bbox``.
        scale: Factor applied to the integer-truncated span coordinates.

    Returns:
        tuple: ``(words, bboxes)``, two lists of equal length.
    """
    words, bboxes = [], []
    for block in blocks:
        if block['type'] != 0:  # PyMuPDF uses type 0 for text blocks
            continue
        for line in block['lines']:
            for span in line['spans']:
                xmin, ymin, xmax, ymax = [int(coord) * scale for coord in span['bbox']]
                text = span['text'].strip()
                if text.replace(" ", "") == "":
                    continue
                bboxes.append(normalize_bbox([xmin, ymin, xmax, ymax], image_size))
                words.append(decontracted(text))
    return words, bboxes


def _classify_page(image, words, bboxes):
    """Label the words of one page and return the ones that matter.

    Args:
        image: Rendered page image.
        words: Words of the page.
        bboxes: Normalized box of every word, parallel to ``words``.

    Returns:
        list: ``(label, word)`` tuples for every word whose predicted label
        is in ``key_list``.
    """
    # The processor reports a box per token; this maps a box back to its word.
    text_by_box = {str(box): word for box, word in zip(bboxes, words)}

    # Dummy word labels are passed only so that the processor returns a
    # "labels" entry in which -100 marks the positions to ignore.
    encoding = processor(image, words, boxes=bboxes, word_labels=["O"] * len(words), truncation=True,
                         stride=_CHUNK_STRIDE, padding="max_length", max_length=512,
                         return_overflowing_tokens=True, return_offsets_mapping=True)
    chunk_labels = encoding["labels"]
    chunk_boxes = encoding["bbox"]
    # These two entries are not model inputs.
    encoding.pop('offset_mapping')
    encoding.pop('overflow_to_sample_mapping')
    model_inputs = {name: torch.tensor(value).to(device)
                    for name, value in encoding.items() if name != "labels"}

    with torch.no_grad():
        outputs = model(**model_inputs)

    # Keep the chunk axis (no squeeze) so that predictions, labels and boxes
    # have the same nesting whether the page produced one chunk or several.
    chunk_predictions = outputs["logits"].argmax(-1).tolist()

    # Every chunk after the first begins with positions that overlap the
    # previous chunk; drop them. Anything that still repeats is removed when
    # pred_resume de-duplicates its result.
    for index in range(1, len(chunk_labels)):
        chunk_labels[index] = chunk_labels[index][_CHUNK_STRIDE:]
        chunk_predictions[index] = chunk_predictions[index][_CHUNK_STRIDE:]
        chunk_boxes[index] = chunk_boxes[index][_CHUNK_STRIDE:]

    predictions = [item for chunk in chunk_predictions for item in chunk]
    boxes = [item for chunk in chunk_boxes for item in chunk]
    labels = [item for chunk in chunk_labels for item in chunk]

    found = []
    for class_id, label, box in zip(predictions, labels, boxes):
        if label == -100:
            # Ignored position (presumably special, padding and continuation
            # sub-word tokens -- confirm against the processor).
            continue
        field = id2label[class_id]
        if field in key_list:
            found.append((field, text_by_box[str(box)]))
    return found


def pred_resume(pdf_path) -> dict:
    """Extract the labelled fields of a PDF resume.

    Text spans are read from the PDF with PyMuPDF, every page is rendered
    with pdf2image, and the token classifier labels the words of each page.
    The outcome is also printed to stdout.

    Args:
        pdf_path: Object whose ``name`` attribute is the path of the PDF
            (this is what the Gradio file input hands over).

    Returns:
        dict: One entry per name in ``key_list``, each a list of the distinct
        texts predicted for that label (empty when none was found).
    """
    result = {key: [] for key in key_list}
    # Factor applied to PDF span coordinates before they are normalized
    # against the rendered image size.
    # NOTE(review): pdf2image is called without dpi, i.e. at its default of
    # 200, and PDF units are 1/72 inch, so 200/72 would be the expected
    # factor -- confirm that 77 is intentional.
    scale = 200/77

    # Read the text layout first and close the document as soon as it is no
    # longer needed.
    with fitz.open(pdf_path.name) as doc:
        page_blocks = [page.get_text('dict')['blocks'] for page in doc]
    images = pdf2image.convert_from_path(pdf_path.name)

    for page_index, blocks in enumerate(page_blocks):
        image = images[page_index]
        words, bboxes = _page_words_and_boxes(blocks, image.size, scale)
        if not words:
            # No text spans on this page, so there is nothing to classify.
            continue
        for field, text in _classify_page(image, words, bboxes):
            result[field].append(text)

    result = {k: list(set(v)) for k, v in result.items()}
    print("\n--------------------------------------------------------------------------------------------------------------------------\n")
    print("New resume inference at: ", datetime.utcnow())
    print("Pdf name: ", pdf_path.name)
    print("Result: ", result)
    print("\n--------------------------------------------------------------------------------------------------------------------------\n")
    return result
|
def norm(result: dict) -> str:
    """Turn the raw fields of a resume into a normalized, readable summary.

    Args:
        result: The field dictionary produced by ``pred_resume``, either as
            a dict or as the text of its Python literal (the form in which
            the UI hands it over from a textbox).

    Returns:
        str: Eleven lines of the form ``"<label>: <value>"``.

    Raises:
        ValueError, SyntaxError: If ``result`` is a string that is not a
            valid Python literal.
        KeyError: If a field used below is missing.
    """
    if isinstance(result, str):
        # literal_eval only evaluates Python literals, never arbitrary code.
        result = ast.literal_eval(result)
    else:
        # Work on a copy so the caller's dict is left untouched.
        result = dict(result)

    result["person_name"] = " ".join([parse_string(i).capitalize() for i in " ".join(result["person_name"]).split()])
    result["email_value"] = parse_email(result["email_value"])
    # Keep only the digits of all phone-number fragments, concatenated.
    result["phonenumber_value"] = "".join([i for i in "".join(result["phonenumber_value"]) if i.isdigit()])
    result["address_value"] = parse_address(result["address_value"])
    result["designation_value"] = parse_designation(result["designation_value"])
    result["experience_time"] = parse_time(result["experience_time"])
    result["gender_value"] = parse_gender(result["gender_value"])
    result["skill_value"] = parse_skill(result["skill_value"])
    result["education_name"] = parse_designation(result["education_name"])
    result["experience_name"] = parse_designation(result["experience_name"])

    # Everything is concatenated into text below, so flatten the remaining
    # lists and numbers to strings first.
    for key, value in result.items():
        if isinstance(value, list):
            result[key] = ". ".join(value)
        elif isinstance(value, (int, float)):
            result[key] = str(value)

    # (line prefix, field) in output order.
    lines = (
        ("Tên: ", "person_name"),
        ("Ngày sinh: ", "dob_value"),
        ("Giới tính: ", "gender_value"),
        ("Chức danh: ", "designation_value"),
        ("Số điện thoại: ", "phonenumber_value"),
        ("Email: ", "email_value"),
        ("Địa chỉ: ", "address_value"),
        ("Tên công ty/công việc: ", "experience_name"),
        ("Tên trường học: ", "education_name"),
        ("Kỹ năng: ", "skill_value"),
        ("Năm kinh nghiệm: ", "experience_time"),
    )
    return "\n".join(prefix + result[field] for prefix, field in lines)
|
|
|
|
|
# Two-tab Gradio UI: one tab for review analysis, one for resume extraction.
with gr.Blocks() as demo:
    gr.Markdown("DEMO PROJECTS: REVIEW ANALYSIS AND EXTRACT INFORMATION FROM RESUME")
    with gr.Tab("Review analysis"):
        text_input = gr.Textbox(label="Input sentence (ex: Sếp tốt, bảo hiểm đóng full lương bảo hiểm cho nhân viên. Hàng năm tăng lương ổn OT không trả thêm tiền, chỉ cho ngày nghỉ và hỗ trợ ăn tối.):", placeholder="input here...")
        text_output = gr.Textbox(label="Result:")
        text_button = gr.Button("Predict")
    with gr.Tab("Extract information from resume"):
        with gr.Column():
            file_input = gr.File(label="Upload pdf", file_types=[".pdf"])
        with gr.Column():
            cv_output = gr.Textbox(label="Information fields")
            resume_button = gr.Button("Extract")
        with gr.Column():
            normalize_output = gr.Textbox(label="Normalize by rule-based:")
            normalize_button = gr.Button("Normalize")

    # Wiring: the text shown in cv_output by pred_resume is what norm
    # receives as its input.
    text_button.click(sentiment, inputs=text_input, outputs=text_output)
    resume_button.click(pred_resume, inputs=file_input, outputs=cv_output)
    normalize_button.click(norm, inputs=cv_output, outputs=normalize_output)

# Start the web server as soon as this module is executed.
demo.launch()