import gradio as gr

from imports import *
from parse_info import *

token = os.environ.get("HF_TOKEN")
login(token=token)

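# Sentiment classifier: sequence classification over Vietnamese company reviews
# (labels: 0 = negative, 1 = positive, 2 = neutral).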
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

dict_ = {0: "negative", 1: "positive", 2: "neutral"}
tokenizer_sent = AutoTokenizer.from_pretrained("nam194/sentiment", use_fast=False)
model_sent = AutoModelForSequenceClassification.from_pretrained("nam194/sentiment", num_labels=3, use_auth_token=True).to(device)

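# Map raw NER tag ids to topic ids: drop the "O" tag (20) and fold I- tags
# (10-19) onto their B- counterparts (0-9).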
def cvt2cls(data):
    data = list(set(data))
    try:
        data.remove(20)
    except ValueError:
        pass
    for i, num in enumerate(data):
        if num == 20:
            continue
        if num >= 10:
            data[i] -= 10
    return data

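# Aspect/topic tagger: PhoBERT encoder with an LSTM-CRF head (nam194/ner).
# The 21 BIO tags cover 10 workplace aspects (kept in Vietnamese because they
# must match the model's label set): chỗ để xe = parking, con người = people,
# công việc = job, cơ sở vật chất = facilities, dự án = project, lương = salary,
# môi trường làm việc = work environment, ot/thời gian = overtime/hours,
# văn phòng = office, đãi ngộ = benefits; tag 20 is "O".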
ner_tags = {0: 'B-chỗ để xe', 1: 'B-con người', 2: 'B-công việc', 3: 'B-cơ sở vật chất', 4: 'B-dự án', 5: 'B-lương', 6: 'B-môi trường làm việc', 7: 'B-ot/thời gian', 8: 'B-văn phòng', 9: 'B-đãi ngộ', 10: 'I-chỗ để xe', 11: 'I-con người', 12: 'I-công việc', 13: 'I-cơ sở vật chất', 14: 'I-dự án', 15: 'I-lương', 16: 'I-môi trường làm việc', 17: 'I-ot/thời gian', 18: 'I-văn phòng', 19: 'I-đãi ngộ', 20: 'O'}
topic_tags = {0: 'chỗ để xe', 1: 'con người', 2: 'công việc', 3: 'cơ sở vật chất', 4: 'dự án', 5: 'lương', 6: 'môi trường làm việc', 7: 'ot/thời gian', 8: 'văn phòng', 9: 'đãi ngộ'}
config = RobertaConfig.from_pretrained("nam194/ner", num_labels=21)
tokenizer_topic = AutoTokenizer.from_pretrained("nam194/ner", use_fast=False)
model_topic = PhoBertLstmCrf.from_pretrained("nam194/ner", config=config, from_tf=False).to(device)
model_topic.resize_token_embeddings(len(tokenizer_topic))

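# Review analysis endpoint: predict overall sentiment for the review, then run
# the aspect tagger over each "."-separated segment and report the topics found.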
def sentiment(sent: str):
    print("\n--------------------------------------------------------------------------------------------------------------------------\n")
    print("New review inference at: ", datetime.utcnow())
    print("review: ", sent)
    print("\n--------------------------------------------------------------------------------------------------------------------------\n")

    # Overall sentiment.
    sent_ = normalize(text=sent)
    input_sent = torch.tensor([tokenizer_sent.encode(sent_)]).to(device)
    with torch.no_grad():
        out_sent = model_sent(input_sent)
        logits_sent = out_sent.logits.softmax(dim=-1).tolist()[0]
        pred_sent = dict_[np.argmax(logits_sent)]

    # Aspect/topic extraction: word-segment each sentence, build a dummy-labelled
    # example and decode the CRF tags.
    sent = replace_all(text=sent)
    sent_segment = sent.split(".")
    for i, s in enumerate(sent_segment):
        s = s.strip()
        sent_segment[i] = underthesea.word_tokenize(s, format="text").split()
    dump = [[i, 'O'] for s in sent_segment for i in s]
    dump_set = NerDataset(feature_for_phobert([dump], tokenizer=tokenizer_topic, use_crf=True))
    dump_iter = DataLoader(dump_set, batch_size=1)
    with torch.no_grad():
        for idx, batch in enumerate(dump_iter):
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model_topic(**batch)
    pred_topic = list(set([topic_tags[i] for i in cvt2cls(outputs["tags"][0])]))
    return "Sentiment: " + pred_sent + "\n" + "Topic in sentence: " + ". ".join([i.capitalize() for i in pred_topic])

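# Resume parser: LayoutLMv3 token classifier with custom labels. OCR is disabled
# (apply_ocr=False); words and bounding boxes come from the PDF text layer via
# PyMuPDF (fitz), while pdf2image renders the page images for the visual branch.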
processor = transformers.AutoProcessor.from_pretrained("nam194/resume_parsing_layoutlmv3_large_custom_label", use_auth_token=True, apply_ocr=False)
model = transformers.LayoutLMv3ForTokenClassification.from_pretrained("nam194/resume_parsing_layoutlmv3_large_custom_label").to(device)

label_list = ['person_name', 'dob_key', 'dob_value', 'gender_key', 'gender_value', 'phonenumber_key', 'phonenumber_value', 'email_key', 'email_value',
              'address_key', 'address_value', 'socical_address_value', 'education', 'education_name', 'education_time', 'experience', 'experience_name',
              'experience_time', 'information', 'undefined', 'designation_key', 'designation_value', 'degree_key', 'degree_value', 'skill_key', 'skill_value']
id2label = {0: 'person_name', 1: 'dob_key', 2: 'dob_value', 3: 'gender_key', 4: 'gender_value', 5: 'phonenumber_key', 6: 'phonenumber_value',
            7: 'email_key', 8: 'email_value', 9: 'address_key', 10: 'address_value', 11: 'socical_address_value', 12: 'education', 13: 'education_name',
            14: 'education_time', 15: 'experience', 16: 'experience_name', 17: 'experience_time', 18: 'information', 19: 'undefined', 20: 'designation_key',
            21: 'designation_value', 22: 'degree_key', 23: 'degree_value', 24: 'skill_key', 25: 'skill_value'}
key_list = ["person_name", "dob_value", "gender_value", "phonenumber_value", "email_value", "address_value",
            "socical_address_value", "education_name", "education_time", "experience_name", "experience_time",
            "designation_value", "degree_value", "skill_value"]
label2id = {v: k for k, v in id2label.items()}

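# Parse one uploaded PDF: read words/boxes page by page, tag them with
# LayoutLMv3, and collect the text of every box predicted as a value field.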
def pred_resume(pdf_path) -> dict:
    global key_list, device, label_list, id2label, label2id
    result = {}
    for i in key_list:
        result[i] = []
    DPI = 200 / 77  # scale PDF text-layer coordinates to the rendered page images

    doc = fitz.open(pdf_path.name)
    num_pages = len(doc)
    images = pdf2image.convert_from_path(pdf_path.name)
    block_dict = {}

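    # Collect the raw text blocks of every page from the PDF text layer.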
    page_num = 1
    for page in doc:
        file_dict = page.get_text('dict')
        block = file_dict['blocks']
        block_dict[page_num] = block
        page_num += 1

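    # For each page: gather words and normalized boxes, then encode them (with
    # dummy "O" labels) for LayoutLMv3, splitting long pages into overlapping
    # 512-token chunks with a 256-token stride.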
    for page_num, blocks in block_dict.items():
        bboxes, words = [], []
        image = images[page_num - 1]
        for block in blocks:
            if block['type'] == 0:  # text block
                for line in block['lines']:
                    for span in line['spans']:
                        xmin, ymin, xmax, ymax = [int(i) * DPI for i in list(span['bbox'])]
                        text = span['text'].strip()
                        if text.replace(" ", "") != "":
                            bboxes.append(normalize_bbox([xmin, ymin, xmax, ymax], image.size))
                            words.append(decontracted(text))
        text_reverse = {str(bboxes[i]): words[i] for i, _ in enumerate(words)}
        fake_label = ["O"] * len(words)
        encoding = processor(image, words, boxes=bboxes, word_labels=fake_label, truncation=True, stride=256,
                             padding="max_length", max_length=512, return_overflowing_tokens=True, return_offsets_mapping=True)
        labels = encoding["labels"]
        key_box = encoding["bbox"]
        offset_mapping = encoding.pop('offset_mapping')
        overflow_to_sample_mapping = encoding.pop('overflow_to_sample_mapping')
        encoding = {k: torch.tensor(v) for k, v in encoding.items() if k != "labels"}
        x = [encoding['pixel_values'][i] for i in range(len(encoding['pixel_values']))]
        encoding['pixel_values'] = torch.stack(x)

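        # Classify every chunk of this page in one forward pass.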
        with torch.no_grad():
            outputs = model(**{k: v.to(device) for k, v in encoding.items() if k != "labels"})

        # Merge the overlapping chunks: every chunk after the first repeats the
        # previous 256 tokens, so drop that prefix before concatenating.
        predictions = outputs["logits"].argmax(-1).tolist()
        for i in range(1, len(predictions)):
            labels[i] = labels[i][256:]
            predictions[i] = predictions[i][256:]
            key_box[i] = key_box[i][256:]
        predictions = [j for chunk in predictions for j in chunk]
        key_box = [j for chunk in key_box for j in chunk]
        labels = [j for chunk in labels for j in chunk]

        # Keep only real word positions (label -100 marks special/continuation
        # tokens) and collect the text of boxes predicted as value fields.
        true_predictions = [id2label[pred] for pred, label in zip(predictions, labels) if label != -100]
        key_box = [box for box, label in zip(key_box, labels) if label != -100]
        for box, pred in zip(key_box, true_predictions):
            if pred in key_list:
                result[pred].append(text_reverse[str(box)])

    result = {k: list(set(v)) for k, v in result.items()}
    print("\n--------------------------------------------------------------------------------------------------------------------------\n")
    print("New resume inference at: ", datetime.utcnow())
    print("Pdf name: ", pdf_path.name)
    print("Result: ", result)
    print("\n--------------------------------------------------------------------------------------------------------------------------\n")
    return result

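# Turn the raw extraction dict (as echoed in the output textbox) into a readable
# summary, normalizing each field with the rule-based helpers from parse_info.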
def norm(result: str) -> str:
    result = ast.literal_eval(result)  # the textbox holds the dict repr produced by pred_resume
    result["person_name"] = " ".join([parse_string(i).capitalize() for i in " ".join(result["person_name"]).split()])
    result["email_value"] = parse_email(result["email_value"])
    result["phonenumber_value"] = "".join([i for i in "".join(result["phonenumber_value"]) if i.isdigit()])
    result["address_value"] = parse_address(result["address_value"])
    result["designation_value"] = parse_designation(result["designation_value"])
    result["experience_time"] = parse_time(result["experience_time"])
    result["gender_value"] = parse_gender(result["gender_value"])
    result["skill_value"] = parse_skill(result["skill_value"])
    result["education_name"] = parse_designation(result["education_name"])
    result["experience_name"] = parse_designation(result["experience_name"])
    for k, v in result.items():
        if isinstance(v, list):
            result[k] = ". ".join([i for i in result[k]])
        if isinstance(v, (int, float)):
            result[k] = str(result[k])
    return "\n".join([
        "Tên: " + result["person_name"],                          # Name
        "Ngày sinh: " + result["dob_value"],                      # Date of birth
        "Giới tính: " + result["gender_value"],                   # Gender
        "Chức danh: " + result["designation_value"],              # Designation
        "Số điện thoại: " + result["phonenumber_value"],          # Phone number
        "Email: " + result["email_value"],
        "Địa chỉ: " + result["address_value"],                    # Address
        "Tên công ty/công việc: " + result["experience_name"],    # Company / job name
        "Tên trường học: " + result["education_name"],            # School name
        "Kỹ năng: " + result["skill_value"],                      # Skills
        "Năm kinh nghiệm: " + result["experience_time"]])         # Years of experience

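# Gradio UI: one tab for review analysis and one for resume parsing, where the
# extracted fields can then be normalized with the rule-based post-processing.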
with gr.Blocks() as demo:
    with gr.Tab("REVIEW ANALYSIS"):
        text_input = gr.Textbox(label="Input company review sentence (in Vietnamese, ex: Sếp lắng nghe, con người thân thiện, không phải OT):", placeholder="input here...")
        text_output = gr.Textbox(label="Result (entities, sentiments, context extracted, ...):")
        text_button = gr.Button("Predict")
    with gr.Tab("RESUME PARSER"):
        with gr.Column():
            file_input = gr.File(label="Upload .pdf file", file_types=[".pdf"])
        with gr.Column():
            cv_output = gr.Textbox(label="Information fields found:")
            resume_button = gr.Button("Extract")
        with gr.Column():
            normalize_output = gr.Textbox(label="Normalized by rule-based:")
            normalize_button = gr.Button("Normalize")

    text_button.click(sentiment, inputs=text_input, outputs=text_output)
    resume_button.click(pred_resume, inputs=file_input, outputs=cv_output)
    normalize_button.click(norm, inputs=cv_output, outputs=normalize_output)

demo.launch()