File size: 11,471 Bytes
9e5b4bd 988450a f91691d a7b84e5 c218615 012faab 988450a 4e55f8f 988450a 4e55f8f 1984dbe 988450a 4e55f8f 988450a 7bfec6b 460c285 7bfec6b ef85f78 988450a 4e55f8f ef85f78 246d50e 4e55f8f 988450a fe3240a 988450a 9e5b4bd f91691d 438f504 f91691d fe3240a f91691d 36812fc f91691d 44a5c94 f91691d dc354d3 f91691d dc354d3 f91691d dc354d3 f91691d dc354d3 f91691d dc354d3 f91691d dc354d3 90b529f b3fce6b f91691d 32a8609 8dcf015 32a8609 8dcf015 65069a9 9e5b4bd ed4ffac 79a32e1 988450a ed4ffac 36812fc ed4ffac 36812fc ed4ffac f91691d 36812fc ed4ffac f91691d 9e5b4bd 988450a 9e5b4bd 988450a f91691d 9e5b4bd |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 |
import gradio as gr
from imports import *
from parse_info import *
# Disabled start-up step, kept for reference only:
# os.system("apt-get install poppler-utils")

# Read the access token from the HF_TOKEN environment variable and hand it to
# login() (presumably huggingface_hub.login -- it arrives through a star import).
token = os.environ.get("HF_TOKEN")
login(token=token)

# Use the GPU when torch can see one, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Index of the sentiment classifier's output -> human-readable label.
dict_ = dict(enumerate(("negative", "positive", "neutral")))

# Tokenizer and 3-class sequence classifier used by sentiment().
tokenizer_sent = AutoTokenizer.from_pretrained("nam194/sentiment", use_fast=False)
model_sent = AutoModelForSequenceClassification.from_pretrained(
    "nam194/sentiment",
    num_labels=3,
    use_auth_token=True,
)
model_sent = model_sent.to(device)
def cvt2cls(data):
    """Map predicted NER tag ids to topic class ids.

    Tag id 20 (the 'O' tag in ``ner_tags``) is dropped.  Ids of 10 and above
    (the 'I-' tags) are shifted down by 10 so that they land on the same
    class id as the matching 'B-' tag.

    Args:
        data: iterable of hashable tag ids.

    Returns:
        A list with one entry per *distinct* input id other than 20.  A topic
        seen with both its 'B-' and its 'I-' tag therefore appears twice;
        the caller is expected to de-duplicate if it needs to.
    """
    # set() removes repeated tag ids; filtering 20 here replaces the old
    # remove()-inside-bare-except and its unreachable `num == 20` check.
    return [tag - 10 if tag >= 10 else tag for tag in set(data) if tag != 20]
# Review topics, in class-id order (index == topic class id).
_TOPIC_NAMES = (
    'chỗ để xe',
    'con người',
    'công việc',
    'cơ sở vật chất',
    'dự án',
    'lương',
    'môi trường làm việc',
    'ot/thời gian',
    'văn phòng',
    'đãi ngộ',
)

# Topic class id -> topic name.
topic_tags = dict(enumerate(_TOPIC_NAMES))

# NER tag id -> BIO tag: ids 0-9 are 'B-<topic>', ids 10-19 are 'I-<topic>'
# for the same topics in the same order, and id 20 is the outside tag 'O'.
ner_tags = {
    **{idx: 'B-' + name for idx, name in topic_tags.items()},
    **{idx + 10: 'I-' + name for idx, name in topic_tags.items()},
    20: 'O',
}
# Topic tagger used by sentiment(): tokenizer, 21-label config and the
# PhoBertLstmCrf model, all loaded from the "nam194/ner" checkpoint.
tokenizer_topic = AutoTokenizer.from_pretrained("nam194/ner", use_fast=False)
config = RobertaConfig.from_pretrained("nam194/ner", num_labels=21)
model_topic = PhoBertLstmCrf.from_pretrained(
    "nam194/ner",
    config=config,
    from_tf=False,
)
model_topic = model_topic.to(device)
# Size the embedding matrix to the tokenizer's vocabulary length.
model_topic.resize_token_embeddings(len(tokenizer_topic))
def sentiment(sent: str):
    """Predict the sentiment of a review and the topics it talks about.

    The review is logged to stdout, classified with ``model_sent`` and then
    tagged with ``model_topic``; the predicted tag ids are folded into topic
    names through ``cvt2cls`` and ``topic_tags``.

    Args:
        sent: the raw review text.

    Returns:
        A two-line string: ``"Sentiment: <label>"`` followed by
        ``"Topic in sentence: <topics>"`` with the capitalized topic names
        joined by ``". "``.
    """
    separator = "\n--------------------------------------------------------------------------------------------------------------------------\n"
    print(separator)
    print("New review inference at: ", datetime.utcnow())
    print("review: ", sent)
    print(separator)

    # --- sentiment classification -------------------------------------------
    normalized = normalize(text=sent)
    token_ids = tokenizer_sent.encode(normalized)
    input_sent = torch.tensor([token_ids]).to(device)
    with torch.no_grad():
        out_sent = model_sent(input_sent)
    probabilities = out_sent.logits.softmax(dim=-1).tolist()[0]
    pred_sent = dict_[np.argmax(probabilities)]

    # --- topic tagging --------------------------------------------------------
    # Split on '.', word-segment every piece, and give each resulting token a
    # placeholder 'O' tag so it can go through the NER feature pipeline.
    cleaned = replace_all(text=sent)
    dump = [
        [token, 'O']
        for piece in cleaned.split(".")
        for token in underthesea.word_tokenize(piece.strip(), format="text").split()
    ]
    features = feature_for_phobert([dump], tokenizer=tokenizer_topic, use_crf=True)
    loader = DataLoader(NerDataset(features), batch_size=1)
    with torch.no_grad():
        for batch in loader:
            on_device = {name: tensor.to(device) for name, tensor in batch.items()}
            outputs = model_topic(**on_device)

    class_ids = cvt2cls(outputs["tags"][0])
    pred_topic = list({topic_tags[class_id] for class_id in class_ids})
    topics_text = ". ".join([topic.capitalize() for topic in pred_topic])
    return "Sentiment: " + pred_sent + "\n" + "Topic in sentence: " + topics_text
# Resume parser: LayoutLMv3 processor and token-classification model.
# apply_ocr=False because pred_resume() passes the words and boxes itself.
_RESUME_CHECKPOINT = "nam194/resume_parsing_layoutlmv3_large_custom_label"
processor = transformers.AutoProcessor.from_pretrained(
    _RESUME_CHECKPOINT,
    use_auth_token=True,
    apply_ocr=False,
)
model = transformers.LayoutLMv3ForTokenClassification.from_pretrained(_RESUME_CHECKPOINT)
model = model.to(device)
# Dynamic int8 quantization of the linear layers, currently disabled:
# model = torch.quantization.quantize_dynamic(model, {torch.nn.Linear}, dtype=torch.qint8).to(device)
# Labels of the resume token classifier; the position in this list is the
# class id used by id2label / label2id below.
label_list = [
    'person_name',
    'dob_key',
    'dob_value',
    'gender_key',
    'gender_value',
    'phonenumber_key',
    'phonenumber_value',
    'email_key',
    'email_value',
    'address_key',
    'address_value',
    'socical_address_value',
    'education',
    'education_name',
    'education_time',
    'experience',
    'experience_name',
    'experience_time',
    'information',
    'undefined',
    'designation_key',
    'designation_value',
    'degree_key',
    'degree_value',
    'skill_key',
    'skill_value',
]

# Class id -> label name, and its inverse.
id2label = dict(enumerate(label_list))
label2id = {name: idx for idx, name in id2label.items()}

# Labels whose text is collected into the parsing result.
key_list = [
    "person_name",
    "dob_value",
    "gender_value",
    "phonenumber_value",
    "email_value",
    "address_value",
    "socical_address_value",
    "education_name",
    "education_time",
    "experience_name",
    "experience_time",
    "designation_value",
    "degree_value",
    "skill_value",
]
def _page_words_and_boxes(blocks, image_size, scale):
    """Return the non-blank text spans of one page and their normalized boxes."""
    words, bboxes = [], []
    for block in blocks:
        if block['type'] != 0:  # only type-0 blocks are walked for lines/spans
            continue
        for line in block['lines']:
            for span in line['spans']:
                text = span['text'].strip()
                if text.replace(" ", "") == "":
                    continue
                # Truncate the PDF coordinates, then scale them before
                # normalizing against the size of the rendered page image.
                xmin, ymin, xmax, ymax = [int(coord) * scale for coord in span['bbox']]
                bboxes.append(normalize_bbox([xmin, ymin, xmax, ymax], image_size))
                words.append(decontracted(text))
    return words, bboxes


def pred_resume(pdf_path) -> dict:
    """Extract resume fields from a PDF with the LayoutLMv3 token classifier.

    For each page, the text spans and their bounding boxes are read with
    ``fitz``, the page image is rendered with ``pdf2image``, and both are fed
    to ``processor`` / ``model``.  Every word whose predicted label is listed
    in ``key_list`` is collected under that label.

    Args:
        pdf_path: object whose ``.name`` attribute is the path of the PDF
            (this is what the Gradio file input passes in).

    Returns:
        A dict with one entry per label in ``key_list``; each value is a list
        of the distinct words predicted with that label (order not defined,
        because duplicates are removed through a set).
    """
    result = {key: [] for key in key_list}
    scale = 200 / 77  # coordinate scale factor carried over from the original code
    stride = 256      # overlap between consecutive 512-token windows

    # Read the text blocks of every page, then release the document handle.
    doc = fitz.open(pdf_path.name)
    try:
        page_blocks = [page.get_text('dict')['blocks'] for page in doc]
    finally:
        doc.close()
    images = pdf2image.convert_from_path(pdf_path.name)

    # Predict each page of the PDF.
    for page_index, blocks in enumerate(page_blocks):
        image = images[page_index]
        words, bboxes = _page_words_and_boxes(blocks, image.size, scale)
        if not words:
            # Nothing to label on a page without text; skip the model call.
            continue

        # Predictions are mapped back to words through their box.
        text_reverse = {str(box): word for box, word in zip(bboxes, words)}

        # Dummy "O" labels make the tokenizer emit -100 for every token that
        # the loop below must skip; see the `label == -100` check.
        encoding = processor(
            image,
            words,
            boxes=bboxes,
            word_labels=["O"] * len(words),
            truncation=True,
            stride=stride,
            padding="max_length",
            max_length=512,
            return_overflowing_tokens=True,
            return_offsets_mapping=True,
        )
        labels = encoding["labels"]
        key_box = encoding["bbox"]
        # These two entries are not model inputs.
        encoding.pop('offset_mapping')
        encoding.pop('overflow_to_sample_mapping')
        model_inputs = {
            name: torch.tensor(value).to(device)
            for name, value in encoding.items()
            if name != "labels"
        }

        # Forward pass.
        with torch.no_grad():
            outputs = model(**model_inputs)

        # Keep the batch axis (no squeeze): `labels` and `key_box` hold one row
        # per window even when the page fits in a single window, so the
        # predictions must have the same nesting to be zipped with them.
        predictions = outputs["logits"].argmax(-1).tolist()

        for window, (preds, boxes, window_labels) in enumerate(zip(predictions, key_box, labels)):
            # Every window after the first starts with tokens that overlap
            # the previous window; drop the first `stride` positions.
            start = stride if window > 0 else 0
            for pred_id, box, label in zip(preds[start:], boxes[start:], window_labels[start:]):
                if label == -100:
                    continue
                field = id2label[pred_id]
                if field in key_list:
                    result[field].append(text_reverse[str(box)])

    result = {k: list(set(v)) for k, v in result.items()}
    print("\n--------------------------------------------------------------------------------------------------------------------------\n")
    print("New resume inference at: ", datetime.utcnow())
    print("Pdf name: ", pdf_path.name)
    print("Result: ", result)
    print("\n--------------------------------------------------------------------------------------------------------------------------\n")
    return result
def norm(result: dict) -> str:
    """Normalize the extracted resume fields and render them as text.

    Args:
        result: the field dict produced by ``pred_resume`` -- either the dict
            itself or its ``str()`` form, which is what arrives when the value
            is routed through a Gradio textbox.

    Returns:
        A multi-line string with one ``"<label>: <value>"`` line per field.

    Raises:
        ValueError, SyntaxError: if a string is passed that is not a valid
            Python literal.
        KeyError: if an expected field is missing.
    """
    # literal_eval only evaluates Python literals, so text coming from the UI
    # cannot run code.  A real dict is copied so the caller's object is not
    # modified.
    fields = ast.literal_eval(result) if isinstance(result, str) else dict(result)

    # Field-specific clean-up; the parse_* helpers are defined outside this
    # file (they arrive through star imports).
    fields["person_name"] = " ".join(
        parse_string(part).capitalize() for part in " ".join(fields["person_name"]).split()
    )
    fields["email_value"] = parse_email(fields["email_value"])
    fields["phonenumber_value"] = "".join(
        ch for ch in "".join(fields["phonenumber_value"]) if ch.isdigit()
    )
    fields["address_value"] = parse_address(fields["address_value"])
    fields["designation_value"] = parse_designation(fields["designation_value"])
    fields["experience_time"] = parse_time(fields["experience_time"])
    fields["gender_value"] = parse_gender(fields["gender_value"])
    fields["skill_value"] = parse_skill(fields["skill_value"])
    fields["education_name"] = parse_designation(fields["education_name"])
    fields["experience_name"] = parse_designation(fields["experience_name"])

    # Flatten whatever is still a list or a number into plain text.
    for key, value in fields.items():
        if isinstance(value, list):
            fields[key] = ". ".join(value)
        elif isinstance(value, (int, float)):
            fields[key] = str(value)

    layout = (
        ("Tên: ", "person_name"),
        ("Ngày sinh: ", "dob_value"),
        ("Giới tính: ", "gender_value"),
        ("Chức danh: ", "designation_value"),
        ("Số điện thoại: ", "phonenumber_value"),
        ("Email: ", "email_value"),
        ("Địa chỉ: ", "address_value"),
        ("Tên công ty/công việc: ", "experience_name"),
        ("Tên trường học: ", "education_name"),
        ("Kỹ năng: ", "skill_value"),
        ("Năm kinh nghiệm: ", "experience_time"),
    )
    # Plain concatenation on purpose: a non-string value still raises
    # TypeError instead of being silently formatted.
    return "\n".join(label + fields[key] for label, key in layout)
# Gradio UI: one tab for review analysis, one for resume parsing.
with gr.Blocks() as demo:
    with gr.Tab("REVIEW ANALYSIS"):
        text_input = gr.Textbox(label="Input company review sentence (ex: Sếp tốt, bảo hiểm đóng full lương bảo hiểm cho nhân viên. Hàng năm tăng lương ổn OT không trả thêm tiền, chỉ cho ngày nghỉ và hỗ trợ ăn tối.):", placeholder="input here...")
        text_output = gr.Textbox(label="Result:")
        text_button = gr.Button("Predict")
    with gr.Tab("RESUME PARSER"):
        with gr.Column():
            file_input = gr.File(label="Upload .pdf file", file_types=[".pdf"])
        with gr.Column():
            cv_output = gr.Textbox(label="Information fields found:")
            resume_button = gr.Button("Extract")
        with gr.Column():
            normalize_output = gr.Textbox(label="Normalized by rule-based:")
            # Label typo fixed ("Normailze" -> "Normalize").
            normalize_button = gr.Button("Normalize")

    # Wire the buttons to their handlers.  norm() reads the text shown in
    # cv_output, i.e. the string form of the dict returned by pred_resume().
    text_button.click(sentiment, inputs=text_input, outputs=text_output)
    resume_button.click(pred_resume, inputs=file_input, outputs=cv_output)
    normalize_button.click(norm, inputs=cv_output, outputs=normalize_output)

demo.launch()