Disfluency-large

Sleeping

App Files Files Community

Disfluency-large / app.py

DD0101

remove (" ", None)

5bd29a6 over 1 year ago

raw

history blame

8.59 kB

	import os

	import transformers
	from transformers import pipeline
	from transformers.pipelines.token_classification import TokenClassificationPipeline
	import py_vncorenlp

	# Directory that holds the downloaded VnCoreNLP files; it is passed both to
	# the downloader and to the segmenter below.
	VNCORENLP_DIR = '/home/user/app/vncorenlp'

	# Shell commands whose output goes to the process log.
	os.system('pwd')
	os.system('sudo update-alternatives --config java')

	# makedirs(exist_ok=True) instead of mkdir: a plain mkdir raises
	# FileExistsError when the script is started again in the same filesystem,
	# and fails if a parent directory is missing.
	os.makedirs(VNCORENLP_DIR, exist_ok=True)
	py_vncorenlp.download_model(save_dir=VNCORENLP_DIR)

	# Word segmenter ("wseg" annotator only) used by ner() before classification.
	rdrsegmenter = py_vncorenlp.VnCoreNLP(annotators=["wseg"], save_dir=VNCORENLP_DIR)

	# I have to make some changes to the preprocess() method since they (Hugging Face) had changed some attributes
	class MyPipeline(TokenClassificationPipeline):
	    """Token-classification pipeline with a hand-built offset mapping.

	    ``preprocess`` tokenizes the sentence as usual, but replaces the
	    ``offset_mapping`` entry of the model inputs with character offsets
	    computed from the token strings themselves.
	    """

	    def preprocess(self, sentence, offset_mapping=None, **preprocess_params):
	        """Tokenize ``sentence`` and yield one model-input dict per chunk.

	        Args:
	            sentence: The input text. Offsets are computed on the assumption
	                that tokens are separated by exactly one character in it
	                (NOTE(review): presumably single spaces after word
	                segmentation -- confirm against the caller).
	            offset_mapping: Accepted for signature compatibility; not used.
	            **preprocess_params: May contain ``tokenizer_params``, a dict of
	                extra keyword arguments forwarded to the tokenizer.

	        Yields:
	            dict: The tokenizer outputs for chunk ``i`` with a leading batch
	            dimension added, plus ``offset_mapping``, ``sentence`` (only on
	            the first chunk, otherwise ``None``) and ``is_last``.
	        """
	        tokenizer_params = preprocess_params.pop("tokenizer_params", {})
	        max_length = self.tokenizer.model_max_length
	        truncation = bool(max_length and max_length > 0)
	        inputs = self.tokenizer(
	            sentence,
	            return_tensors=self.framework,
	            truncation=truncation,
	            return_special_tokens_mask=True,
	            return_offsets_mapping=self.tokenizer.is_fast,
	            **tokenizer_params,
	        )
	        inputs.pop("overflow_to_sample_mapping", None)
	        num_chunks = len(inputs["input_ids"])

	        # Build the offset mapping by hand. The first and last input ids are
	        # given the (0, 0) offset, so only the ids in between are mapped to
	        # token strings.
	        length = len(inputs['input_ids'][0]) - 2
	        tokens = self.tokenizer.tokenize(sentence)
	        seek = 0
	        offsets = [(0, 0)]
	        for token in tokens[:length]:
	            if token.endswith('@@'):
	                # A trailing '@@' is not counted as sentence text, and the
	                # next token starts right after this one (presumably the
	                # tokenizer's sub-word continuation marker -- confirm).
	                width = len(token) - 2
	                offsets.append((seek, seek + width))
	                seek += width
	            else:
	                width = len(token)
	                offsets.append((seek, seek + width))
	                # Skip the single separator character after the token.
	                seek += width + 1
	        offsets.append((0, 0))
	        offset_mapping_list = [offsets]

	        if self.framework == "tf":
	            # `tf` was referenced here without ever being imported, which
	            # raised NameError on the TensorFlow path. It is imported lazily
	            # so that PyTorch-only installs are unaffected.
	            import tensorflow as tf

	        for i in range(num_chunks):
	            if self.framework == "tf":
	                model_inputs = {k: tf.expand_dims(v[i], 0) for k, v in inputs.items()}
	            else:
	                model_inputs = {k: v[i].unsqueeze(0) for k, v in inputs.items()}

	            # The same hand-built mapping is attached to every chunk.
	            model_inputs['offset_mapping'] = offset_mapping_list
	            model_inputs["sentence"] = sentence if i == 0 else None
	            model_inputs["is_last"] = i == num_chunks - 1

	            yield model_inputs

	# Checkpoint identifier handed to the pipeline factory.
	model_checkpoint = "DD0101/disfluency-large"

	# Token-classification pipeline built on the customized MyPipeline class,
	# with the "simple" aggregation strategy.
	my_classifier = pipeline(
	    "token-classification",
	    model=model_checkpoint,
	    aggregation_strategy="simple",
	    pipeline_class=MyPipeline,
	)


	#################### IDSF #######################
	from Customized_IDSF.utils import get_intent_labels, get_slot_labels, load_tokenizer
	import argparse
	import Customized_IDSF.load_model as lm

	# Command-line options consumed by the Customized_IDSF loading helpers.
	parser = argparse.ArgumentParser()

	# parser.add_argument("--input_file", default="sample_pred_in.txt", type=str, help="Input file for prediction")
	# parser.add_argument("--output_file", default="sample_pred_out.txt", type=str, help="Output file for prediction")
	parser.add_argument(
	    "--model_dir",
	    default="/home/user/app/Customized_IDSF/JointBERT-CRF_PhoBERTencoder",
	    type=str,
	    help="Path to save, load model",
	)
	parser.add_argument(
	    "--batch_size",
	    default=32,
	    type=int,
	    help="Batch size for prediction",
	)
	parser.add_argument(
	    "--no_cuda",
	    action="store_true",
	    help="Avoid using CUDA when available",
	)

	pred_config = parser.parse_args()

	# Training-time arguments, target device and the model itself, all obtained
	# from the load_model helper module.
	args = lm.get_args(pred_config)
	device = lm.get_device(pred_config)
	model = lm.load_model(pred_config, args, device)

	# Label vocabularies; intent_label_lst is indexed by the predicted intent in ner().
	intent_label_lst = get_intent_labels(args)
	slot_label_lst = get_slot_labels(args)

	# Remaining objects that ner() passes to lm.predict().
	pad_token_label_id = args.ignore_index
	tokenizer = load_tokenizer(args)

	#################### END IDSF #######################



	def remove_disfluency(example, prediction):
	    """Return ``example`` with every predicted span removed.

	    Args:
	        example: The sentence the spans refer to.
	        prediction: Iterable of dicts, each with ``'start'`` and ``'end'``
	            character indices into ``example`` (used as slice bounds).

	    Returns:
	        str: The remaining text with runs of whitespace collapsed to single
	        spaces and leading/trailing whitespace stripped.
	    """
	    # Mark characters for removal instead of deleting slices one by one.
	    # Sequential deletion shifts later indices, so it only works when the
	    # spans arrive sorted and non-overlapping; the mask gives the same
	    # result in that case and stays correct otherwise.
	    keep = [True] * len(example)
	    for entity in prediction:
	        span = slice(entity['start'], entity['end'])
	        keep[span] = [False] * len(keep[span])

	    remaining = "".join(ch for ch, kept in zip(example, keep) if kept)
	    return " ".join(remaining.split())


	import gradio as gr

	def _merge_slot_tokens(words, slot_preds):
	    """Turn per-word slot tags into a list of ``(text, label)`` tuples.

	    Underscores in each word are replaced by spaces. ``'O'`` words get the
	    label ``None``; ``'B...'`` words start a new tuple labelled with the tag
	    minus its first two characters; ``'I...'`` words are appended to the text
	    of the previous tuple; any other tag is kept as the label unchanged.
	    """
	    slot_tokens = []
	    for word, pred in zip(words, slot_preds):
	        word = word.replace("_", " ")
	        if pred == 'O':
	            slot_tokens.append((word, None))
	        elif pred.startswith('I'):
	            if slot_tokens:
	                # Each word now contributes exactly one tuple (the separate
	                # (" ", None) spacer tuples are gone), so the span being
	                # continued is the last element, not the second to last.
	                prev_text, prev_label = slot_tokens[-1]
	                slot_tokens[-1] = (f'{prev_text} {word}', prev_label)
	            else:
	                # An "I-" tag with nothing before it: start a new span
	                # rather than failing on an empty list.
	                slot_tokens.append((word, pred[2:]))
	        elif pred.startswith('B'):
	            slot_tokens.append((word, pred[2:]))
	        else:  # PAD or UNK tags
	            slot_tokens.append((word, pred))
	    return slot_tokens


	def ner(text):
	    """Run disfluency detection, then slot filling and intent detection.

	    Args:
	        text: Raw input sentence.

	    Returns:
	        tuple: ``({'text': ..., 'entities': ...}, fluency_sentence,
	        slot_tokens, intent_label)`` where the dict holds the segmented text
	        and the classifier entities, ``fluency_sentence`` is the text with
	        the entity spans removed, ``slot_tokens`` is a list of
	        ``(text, label)`` tuples and ``intent_label`` is the predicted
	        intent. When nothing is left after removing the entity spans, the
	        last three items are ``""``, ``[]`` and ``""``.
	    """
	    text = " ".join(rdrsegmenter.word_segment(text))

	    # Some words in lowercase like "đà nẵng" will get error (due to vncorenlp):
	    # move the joining underscore from before "đà"/"Đà" to after it.
	    text = text.replace("_đà ", " đà_").replace("_Đà ", " Đà_")

	    output = my_classifier(text)
	    for entity in output:
	        entity['entity'] = entity.pop('entity_group')

	    # Remove Disfluency-entities to return a sentence with "Fluency" version
	    fluency_sentence = remove_disfluency(text, output)

	    if not fluency_sentence:
	        # Nothing left to analyse (empty input or everything removed);
	        # indexing the first word/character below would raise IndexError.
	        return {'text': text.replace("_", " "), 'entities': output}, "", [], ""

	    #################### IDSF #######################
	    prediction = lm.predict(
	        [fluency_sentence.strip().split()],
	        pred_config,
	        args,
	        tokenizer,
	        pad_token_label_id,
	        model,
	        device,
	        intent_label_lst,
	        slot_label_lst,
	    )
	    words, slot_preds, intent_pred = prediction[0][0], prediction[1][0], prediction[2][0]

	    if words:
	        # capitalize the first word of sentence
	        words[0] = words[0][0].upper() + words[0][1:]
	    slot_tokens = _merge_slot_tokens(words, slot_preds)

	    intent_label = intent_label_lst[intent_pred]
	    #################### END IDSF #######################

	    # Upper-case only the first character; str.capitalize() would also
	    # lower-case the rest of the sentence.
	    fluency_sentence = fluency_sentence[0].upper() + fluency_sentence[1:]

	    # Replace words like "Đà_Nẵng" to "Đà Nẵng"
	    text = text.replace("_", " ")
	    fluency_sentence = fluency_sentence.replace("_", " ")

	    return {'text': text, 'entities': output}, fluency_sentence, slot_tokens, intent_label


	################################### Gradio Demo ####################################

	# Sample sentences passed to gr.Interface(examples=...) below.
	examples = ['Tôi cần thuê à tôi muốn bay một chuyến khứ hồi từ Đà Nẵng đến Đà Lạt',
	'Giá vé một chiều à không khứ hồi từ Đà Nẵng đến Vinh dưới 2 triệu đồng giá vé khứ hồi từ Quy Nhơn đến Vinh dưới 3 triệu đồng giá vé khứ hồi từ Buôn Ma Thuột đến Quy Nhơn à đến Vinh dưới 4 triệu rưỡi',
	'Cho tôi biết các chuyến bay đến Đà Nẵng vào ngày 12 mà không ngày 14 tháng sáu',
	'Những chuyến bay nào khởi hành từ Thành phố Hồ Chí Minh bay đến Frankfurt mà nối chuyến ở Singapore và hạ cánh trước 10 giờ ý tôi là 9 giờ tối',
	'Thành Phố nào có VNA ừm thôi cho tôi xem tất cả các chuyến bay từ Thanh Hóa hay Nghệ An nhỉ à Thanh Hóa đến Đà Lạt vào Thứ ba à thôi tôi cần vào Thứ hai',
	'Thông tin về hạng ghế à thôi những bữa ăn nào được phục vụ trên chuyến bay 1490 ơ đâu 1409 của Vietnam Airlines từ Quảng Bình à đâu Quảng Nam đến Cần Thơ',
	'Thời gian khởi hành của chuyến bay từ Huế đến Cam Ranh à ừm tôi nhầm đến Phú Quốc mới đúng là mấy giờ',
	'lịch trình xe bus ấy quên giá vé cho các phương tiện giao thông đường bộ ở quảng nam không ý tôi là đà nẵng là bao nhiêu',
	'mã giá vé ờ không chính xác là mã bữa ăn sb có nghĩa là gì',
	'liệt kê giúp tôi tất cả các chuyến bay và giá vé của chúng từ thanh hóa đến tuy hòa vào chiều à tôi nhầm sáng thứ hai'
	]

	# Gradio UI: one text input wired to ner(), whose four return values feed
	# the four output components in order.
	demo = gr.Interface(
	    ner,
	    gr.Textbox(label='Sentence', placeholder="Enter your sentence here..."),
	    outputs=[
	        gr.HighlightedText(label='Disfluency Highlighted'),
	        gr.Textbox(label='"Fluency" version'),
	        gr.HighlightedText(label='Slot Filling Highlighted'),
	        gr.Textbox(label='Intent Label'),
	    ],
	    examples=examples,
	    title="Disfluency Detection for Slot Filling and Intent Detection task",
	    # Adjacent string literals instead of a backslash continuation inside the
	    # string: the continuation pulled the next source line's leading
	    # whitespace into the displayed description.
	    description=(
	        "This is an easy-to-use built in Gradio for demonstrating a NER System that identifies disfluency-entities in "
	        "Vietnamese utterances, then using the 'fluency' version for Slot Filling and Intent Detection task"
	    ),
	    theme=gr.themes.Soft(),
	)

	demo.launch()