Spaces:

nam194
/

Review_company_analysis_and_Resume_parsing

Running

Review_company_analysis_and_Resume_parsing

File size: 11,412 Bytes

9e5b4bd
988450a
f91691d
a7b84e5
c218615
 
012faab
 
988450a
4e55f8f
 
 
 
988450a
 
4e55f8f
 
 
 
 
 
 
 
 
 
 
 
 
 
1984dbe
988450a
 
4e55f8f
 
 
988450a
7bfec6b
 
460c285
7bfec6b
ef85f78
988450a
4e55f8f
 
 
 
 
ef85f78
246d50e
 
 
 
4e55f8f
 
 
 
 
 
 
988450a
fe3240a
988450a
9e5b4bd
f91691d
438f504
 
f91691d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fe3240a
f91691d
 
 
36812fc
f91691d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
44a5c94
f91691d
 
 
dc354d3
f91691d
 
 
 
dc354d3
f91691d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
dc354d3
f91691d
dc354d3
f91691d
 
dc354d3
 
f91691d
dc354d3
 
90b529f
 
 
 
 
b3fce6b
 
f91691d
 
 
 
 
 
 
 
 
 
 
32a8609
8dcf015
32a8609
8dcf015
 
65069a9
9e5b4bd
 
 
ed4ffac
1757391
 
988450a
ed4ffac
36812fc
ed4ffac
36812fc
ed4ffac
f91691d
36812fc
ed4ffac
f91691d
9e5b4bd
988450a
 
9e5b4bd
988450a
f91691d
 
9e5b4bd

import gradio as gr
from imports import *
from parse_info import *
#os.system("apt-get install poppler-utils")
# Authenticate with the Hugging Face Hub using the HF_TOKEN environment
# variable (the lookup yields None when the variable is not set).
token = os.getenv("HF_TOKEN")
login(token=token)


# Run every model on the GPU when torch can see one, otherwise on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Sentiment class index -> label text; sentiment() looks the argmax up here.
dict_ = dict(enumerate(("negative", "positive", "neutral")))

# Tokenizer and 3-label sequence classifier for review sentiment.
tokenizer_sent = AutoTokenizer.from_pretrained("nam194/sentiment", use_fast=False)
model_sent = AutoModelForSequenceClassification.from_pretrained(
    "nam194/sentiment",
    num_labels=3,
    use_auth_token=True,
)
model_sent = model_sent.to(device)
def cvt2cls(data):
    """Collapse predicted NER tag ids into the distinct topic ids they refer to.

    In ``ner_tags`` ids 0-9 are the ``B-`` tags, ids 10-19 are the matching
    ``I-`` tags and id 20 is the ``O`` tag. This drops every ``O`` tag and
    folds each ``I-`` id onto its ``B-`` id by subtracting 10.

    Args:
        data: Iterable of integer tag ids (may be empty, may repeat ids).

    Returns:
        Sorted list of unique ids, so a topic tagged with both its ``B-`` and
        ``I-`` form is reported once and the order is reproducible.
    """
    # A set comprehension de-duplicates *after* the I- -> B- folding; the
    # earlier implementation de-duplicated before it and could return repeats.
    return sorted({tag - 10 if tag >= 10 else tag for tag in data if tag != 20})
# Topic id -> topic name for the ten review aspects.
topic_tags = {
    0: 'chỗ để xe',
    1: 'con người',
    2: 'công việc',
    3: 'cơ sở vật chất',
    4: 'dự án',
    5: 'lương',
    6: 'môi trường làm việc',
    7: 'ot/thời gian',
    8: 'văn phòng',
    9: 'đãi ngộ',
}
# NER tag id -> BIO tag: ids 0-9 are "B-<topic>", ids 10-19 are the matching
# "I-<topic>", and id 20 is the "O" tag.
ner_tags = {idx: "B-" + name for idx, name in topic_tags.items()}
ner_tags.update({idx + 10: "I-" + name for idx, name in topic_tags.items()})
ner_tags[20] = 'O'
# Load the topic-tagging (NER) checkpoint: tokenizer, a config with 21 output
# labels (one per entry of ner_tags) and the PhoBertLstmCrf model.
tokenizer_topic = AutoTokenizer.from_pretrained("nam194/ner", use_fast=False)
config = RobertaConfig.from_pretrained("nam194/ner", num_labels=21)
model_topic = PhoBertLstmCrf.from_pretrained(
    "nam194/ner",
    config=config,
    from_tf=False,
)
model_topic = model_topic.to(device)
# Size the embedding matrix to the tokenizer's vocabulary.
model_topic.resize_token_embeddings(len(tokenizer_topic))


def sentiment(sent: str):
    """Predict the overall sentiment and the topics mentioned in a review.

    The text is classified as negative/positive/neutral by ``model_sent`` and
    tagged by ``model_topic``; the predicted tags are reduced to topic names
    through ``cvt2cls`` and ``topic_tags``.

    Args:
        sent: Raw review text as typed into the UI.

    Returns:
        A two-line string ``"Sentiment: <label>\\nTopic in sentence: <topics>"``
        where the topics are capitalised, joined with ``". "`` and listed in
        ascending topic-id order so the output is the same on every run.
    """
    print("\n--------------------------------------------------------------------------------------------------------------------------\n")
    print("New review inference at: ", datetime.utcnow())
    print("review: ", sent)
    print("\n--------------------------------------------------------------------------------------------------------------------------\n")

    # --- sentiment classification -------------------------------------------
    normalized = normalize(text=sent)
    input_ids = torch.tensor([tokenizer_sent.encode(normalized)]).to(device)
    with torch.no_grad():
        probs = model_sent(input_ids).logits.softmax(dim=-1).tolist()[0]
    pred_sent = dict_[int(np.argmax(probs))]

    # --- topic tagging --------------------------------------------------------
    cleaned = replace_all(text=sent)
    # Word-segment each "."-separated piece, then flatten into one token list;
    # every token gets the placeholder tag 'O' because only predictions matter.
    dump = [
        [word, 'O']
        for segment in cleaned.split(".")
        for word in underthesea.word_tokenize(segment.strip(), format="text").split()
    ]
    dump_set = NerDataset(feature_for_phobert([dump], tokenizer=tokenizer_topic, use_crf=True))
    dump_iter = DataLoader(dump_set, batch_size=1)

    outputs = None
    with torch.no_grad():
        for batch in dump_iter:
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model_topic(**batch)

    # No batch produced (nothing to tag) means no topics rather than a NameError.
    topic_ids = [] if outputs is None else sorted(set(cvt2cls(outputs["tags"][0])))
    pred_topic = [topic_tags[i] for i in topic_ids]
    return "Sentiment: " + pred_sent + "\n" + "Topic in sentence: " + ". ".join([i.capitalize() for i in pred_topic])
    

# Resume-parsing checkpoint: LayoutLMv3 processor with its built-in OCR turned
# off (pred_resume passes the words and boxes itself) and the matching
# token-classification model.
processor = transformers.AutoProcessor.from_pretrained(
    "nam194/resume_parsing_layoutlmv3_large_custom_label",
    use_auth_token=True,
    apply_ocr=False,
)
model = transformers.LayoutLMv3ForTokenClassification.from_pretrained(
    "nam194/resume_parsing_layoutlmv3_large_custom_label"
)
model = model.to(device)
# model = torch.quantization.quantize_dynamic(model, {torch.nn.Linear}, dtype=torch.qint8).to(device)
# Every label of the resume model; a label's position in the list is its id.
label_list = [
    'person_name',
    'dob_key', 'dob_value',
    'gender_key', 'gender_value',
    'phonenumber_key', 'phonenumber_value',
    'email_key', 'email_value',
    'address_key', 'address_value',
    'socical_address_value',
    'education', 'education_name', 'education_time',
    'experience', 'experience_name', 'experience_time',
    'information',
    'undefined',
    'designation_key', 'designation_value',
    'degree_key', 'degree_value',
    'skill_key', 'skill_value',
]
# id <-> label lookups derived from the list above.
id2label = dict(enumerate(label_list))
label2id = {name: idx for idx, name in id2label.items()}
# Labels whose text pred_resume collects; predictions with any other label
# are ignored there.
key_list = [
    "person_name",
    "dob_value",
    "gender_value",
    "phonenumber_value",
    "email_value",
    "address_value",
    "socical_address_value",
    "education_name",
    "education_time",
    "experience_name",
    "experience_time",
    "designation_value",
    "degree_value",
    "skill_value",
]
def pred_resume(pdf_path) -> dict:
    """Extract labelled resume fields from every page of a PDF.

    Each page is rendered to an image with ``pdf2image`` and its text spans
    (text plus bounding box) are read with ``fitz``. The spans are fed to the
    LayoutLMv3 ``processor``/``model`` pair, and the text of every span whose
    predicted label is in ``key_list`` is collected.

    Args:
        pdf_path: Uploaded-file object; only its ``.name`` attribute (the path
            of the PDF on disk) is used.

    Returns:
        Dict mapping every label in ``key_list`` to the list of distinct span
        texts predicted with that label, in order of first appearance (empty
        list when nothing was found).
    """
    result = {key: [] for key in key_list}
    # Factor applied to PDF span coordinates before normalising them against
    # the rendered page image.
    # NOTE(review): pdf2image renders at 200 dpi by default and PDF units are
    # 1/72 inch, so 200/72 might be expected here — confirm 200/77 is intended.
    DPI = 200/77

    # Render pages to images and read the per-page text layout.
    images = pdf2image.convert_from_path(pdf_path.name)
    # The context manager closes the document once its blocks are extracted.
    with fitz.open(pdf_path.name) as doc:
        page_blocks = [page.get_text('dict')['blocks'] for page in doc]

    # Predict each page independently.
    for page_index, blocks in enumerate(page_blocks):
        image = images[page_index]
        bboxes, words = [], []  # bounding boxes and texts of this page's spans
        for block in blocks:
            # Only type-0 blocks are read for 'lines'/'spans' (text blocks in
            # PyMuPDF's dict output).
            if block['type'] != 0:
                continue
            for line in block['lines']:
                for span in line['spans']:
                    text = span['text'].strip()
                    if text.replace(" ", "") == "":
                        continue
                    xmin, ymin, xmax, ymax = [int(i)*DPI for i in span['bbox']]
                    bboxes.append(normalize_bbox([xmin, ymin, xmax, ymax], image.size))
                    words.append(decontracted(text))

        # Pages without extractable text (e.g. image-only pages) have nothing
        # to classify; skip them instead of calling the processor with no words.
        if not words:
            continue

        # Predictions come back per box, so keep a box -> text lookup.
        text_reverse = {str(box): word for box, word in zip(bboxes, words)}
        # Placeholder word labels: the processor then marks the tokens to
        # ignore with -100, which is how they are filtered out below.
        fake_label = ["O"] * len(words)
        encoding = processor(image, words, boxes=bboxes, word_labels=fake_label, truncation=True, stride=256,
                      padding="max_length", max_length=512, return_overflowing_tokens=True, return_offsets_mapping=True)
        labels = encoding["labels"]
        key_box = encoding["bbox"]
        # These two entries are not passed to the model.
        encoding.pop('offset_mapping')
        encoding.pop('overflow_to_sample_mapping')
        model_inputs = {k: torch.tensor(v).to(device) for k, v in encoding.items() if k != "labels"}

        # Forward pass.
        with torch.no_grad():
            outputs = model(**model_inputs)

        # One row of predictions per 512-token chunk.
        predictions = outputs["logits"].argmax(-1).tolist()

        # Chunks after the first were produced with stride=256; drop their
        # first 256 positions before concatenating all chunks.
        flat_preds, flat_labels, flat_boxes = [], [], []
        for chunk_index, (chunk_preds, chunk_labels, chunk_boxes) in enumerate(zip(predictions, labels, key_box)):
            start = 256 if chunk_index > 0 else 0
            flat_preds.extend(chunk_preds[start:])
            flat_labels.extend(chunk_labels[start:])
            flat_boxes.extend(chunk_boxes[start:])

        for pred, label, box in zip(flat_preds, flat_labels, flat_boxes):
            if label == -100:  # position ignored by the processor
                continue
            name = id2label[pred]
            if name in result:
                result[name].append(text_reverse[str(box)])

    # De-duplicate while keeping document order, so multi-span values such as
    # a full name are not shuffled.
    result = {k: list(dict.fromkeys(v)) for k, v in result.items()}
    print("\n--------------------------------------------------------------------------------------------------------------------------\n")
    print("New resume inference at: ", datetime.utcnow())
    print("Pdf name: ", pdf_path.name)
    print("Result: ", result)
    print("\n--------------------------------------------------------------------------------------------------------------------------\n")
    return result
def norm(result: "str | dict") -> str:
    """Normalise the extracted resume fields and format them as display text.

    Args:
        result: The fields produced by ``pred_resume``, either as the dict
            itself or as its literal string form (which is what arrives from
            the UI textbox).

    Returns:
        A multi-line string with one ``"<label>: <value>"`` line per field, or
        an empty string when ``result`` is a blank string (nothing extracted
        yet).

    Raises:
        ValueError, SyntaxError: If a non-blank string is not a valid Python
            literal.
    """
    if isinstance(result, str):
        # Nothing to normalise, e.g. the button was pressed before extraction.
        if not result.strip():
            return ""
        result = ast.literal_eval(result)
    else:
        # Work on a copy so the caller's dict is left untouched.
        result = dict(result)

    result["person_name"] = " ".join([parse_string(i).capitalize() for i in " ".join(result["person_name"]).split()])
    result["email_value"] = parse_email(result["email_value"])
    # Keep only the digits of the phone number.
    result["phonenumber_value"] = "".join([i for i in "".join(result["phonenumber_value"]) if i.isdigit()])
    result["address_value"] = parse_address(result["address_value"])
    result["designation_value"] = parse_designation(result["designation_value"])
    result["experience_time"] = parse_time(result["experience_time"])
    result["gender_value"] = parse_gender(result["gender_value"])
    result["skill_value"] = parse_skill(result["skill_value"])
    result["education_name"] = parse_designation(result["education_name"])
    result["experience_name"] = parse_designation(result["experience_name"])

    # Turn every remaining list or number into a string so it can be rendered.
    for key, value in result.items():
        if isinstance(value, list):
            result[key] = ". ".join(value)
        elif isinstance(value, (int, float)):
            result[key] = str(value)

    # Display label -> field key, in output order.
    layout = [
        ("Tên: ", "person_name"),
        ("Ngày sinh: ", "dob_value"),
        ("Giới tính: ", "gender_value"),
        ("Chức danh: ", "designation_value"),
        ("Số điện thoại: ", "phonenumber_value"),
        ("Email: ", "email_value"),
        ("Địa chỉ: ", "address_value"),
        ("Tên công ty/công việc: ", "experience_name"),
        ("Tên trường học: ", "education_name"),
        ("Kỹ năng: ", "skill_value"),
        ("Năm kinh nghiệm: ", "experience_time"),
    ]
    return "\n".join(label + result[key] for label, key in layout)


# Gradio UI: one tab for review analysis, one for resume parsing.
with gr.Blocks() as demo:
    with gr.Tab("REVIEW ANALYSIS"):
        text_input = gr.Textbox(label="Input company review sentence (in Vietnamese, ex: Sếp lắng nghe, con người thân thiện, không phải OT):", placeholder="input here...")
        text_output = gr.Textbox(label="Result (entities, sentiments, context extracted, ...):")
        text_button = gr.Button("Predict")
    with gr.Tab("RESUME PARSER"):
        with gr.Column():
            file_input = gr.File(label="Upload .pdf file", file_types=[".pdf"])
        with gr.Column():
            cv_output = gr.Textbox(label="Information fields found:")
            resume_button = gr.Button("Extract")
        with gr.Column():
            normalize_output = gr.Textbox(label="Normalized by rule-based:")
            # Button label spelling fixed (was "Normailze").
            normalize_button = gr.Button("Normalize")

    # with gr.Accordion("Open for More!"):
    #     gr.Markdown("Look at me...")

    # Wire each button to its handler. norm reads the text that pred_resume
    # wrote into cv_output, so "Extract" has to be run before "Normalize".
    text_button.click(sentiment, inputs=text_input, outputs=text_output)
    resume_button.click(pred_resume, inputs=file_input, outputs=cv_output)
    normalize_button.click(norm, inputs=cv_output, outputs=normalize_output)

demo.launch()