Spaces:

DD0101
/

IDSF-JointBERT_CRF

Sleeping

App Files Files Community

IDSF-JointBERT_CRF / data_loader.py

DD0101

first commit

c938124 over 1 year ago

raw

history blame

10.5 kB

	import copy
	import json
	import logging
	import os

	import torch
	from torch.utils.data import TensorDataset
	from utils import get_intent_labels, get_slot_labels


	# Module-level logger, named after this module.
	logger = logging.getLogger(__name__)


	class InputExample:
	    """
	    One raw example: the words of an utterance plus its optional annotations.

	    Args:
	        guid: Unique id for the example.
	        words: list. The words of the sequence.
	        intent_label: (Optional) The intent label of the example.
	        slot_labels: (Optional) list. The slot labels of the example.
	    """

	    def __init__(self, guid, words, intent_label=None, slot_labels=None):
	        self.guid, self.words = guid, words
	        self.intent_label, self.slot_labels = intent_label, slot_labels

	    def __repr__(self):
	        # The printable form is simply the JSON dump of the instance.
	        return str(self.to_json_string())

	    def to_dict(self):
	        """Return a deep copy of the instance attributes as a plain dict."""
	        return copy.deepcopy(vars(self))

	    def to_json_string(self):
	        """Return the attributes as indented, key-sorted JSON ending in a newline."""
	        payload = json.dumps(self.to_dict(), sort_keys=True, indent=2)
	        return payload + "\n"


	class InputFeatures:
	    """Model-ready features for one example (ids, masks and label ids)."""

	    def __init__(self, input_ids, attention_mask, token_type_ids, intent_label_id, slot_labels_ids):
	        self.input_ids, self.attention_mask = input_ids, attention_mask
	        self.token_type_ids = token_type_ids
	        self.intent_label_id, self.slot_labels_ids = intent_label_id, slot_labels_ids

	    def __repr__(self):
	        # The printable form is simply the JSON dump of the instance.
	        return str(self.to_json_string())

	    def to_dict(self):
	        """Return a deep copy of the instance attributes as a plain dict."""
	        return copy.deepcopy(vars(self))

	    def to_json_string(self):
	        """Return the attributes as indented, key-sorted JSON ending in a newline."""
	        payload = json.dumps(self.to_dict(), sort_keys=True, indent=2)
	        return payload + "\n"


	class JointProcessor(object):
	    """Processor for the JointBERT data set.

	    A split is read from three text files that are consumed line by line in
	    lockstep: ``seq.in`` (whitespace-separated words), ``label`` (one intent
	    per line) and ``seq.out`` (whitespace-separated slot tags).
	    """

	    def __init__(self, args):
	        self.args = args
	        # Label vocabularies come from the project helpers in ``utils``.
	        self.intent_labels = get_intent_labels(args)
	        self.slot_labels = get_slot_labels(args)

	        self.input_text_file = "seq.in"
	        self.intent_label_file = "label"
	        self.slot_labels_file = "seq.out"

	    @classmethod
	    def _read_file(cls, input_file, quotechar=None):
	        """Return the lines of a UTF-8 text file, stripped of surrounding whitespace.

	        Args:
	            input_file: Path of the file to read.
	            quotechar: Unused; kept so existing callers keep working.

	        Returns:
	            list of str, one entry per line (blank lines become ``""``).
	        """
	        with open(input_file, "r", encoding="utf-8") as f:
	            return [line.strip() for line in f]

	    @staticmethod
	    def _make_label_lookup(labels):
	        """Return a function mapping a label to its id in ``labels``.

	        The id is the position of the label's first occurrence, matching
	        ``list.index``. Labels that are not present map to the id of
	        ``"UNK"``; a ValueError is raised if ``"UNK"`` is missing as well.
	        The mapping is built once so every lookup is a dict access instead
	        of a scan over the whole label list.
	        """
	        first_index = {}
	        for idx, label in enumerate(labels):
	            # setdefault keeps the first position when a label is repeated.
	            first_index.setdefault(label, idx)

	        def lookup(label):
	            idx = first_index.get(label)
	            if idx is None:
	                idx = first_index.get("UNK")
	            if idx is None:
	                raise ValueError("label %r is unknown and the label list has no 'UNK' entry" % (label,))
	            return idx

	        return lookup

	    def _create_examples(self, texts, intents, slots, set_type):
	        """Build one InputExample per aligned (text, intent, slot) line.

	        Args:
	            texts: Iterable of utterances (whitespace-separated words).
	            intents: Iterable of intent label strings.
	            slots: Iterable of whitespace-separated slot tag strings.
	            set_type: Prefix used for the example guids (e.g. the split name).

	        Returns:
	            list of InputExample whose intent and slot labels are already
	            converted to integer ids.

	        Raises:
	            AssertionError: if a line has a different number of words and slot tags.
	            ValueError: if an unknown label is met and the label list has no "UNK".
	        """
	        intent_id = self._make_label_lookup(self.intent_labels)
	        slot_id = self._make_label_lookup(self.slot_labels)

	        examples = []
	        for i, (text, intent, slot) in enumerate(zip(texts, intents, slots)):
	            guid = "%s-%s" % (set_type, i)
	            # 1. input_text
	            words = text.split()  # Some are spaced twice
	            # 2. intent
	            intent_label = intent_id(intent)
	            # 3. slot
	            slot_labels = [slot_id(s) for s in slot.split()]

	            if len(words) != len(slot_labels):
	                # Raised explicitly instead of via `assert` so the check is not
	                # stripped under `python -O`; the exception type is unchanged.
	                raise AssertionError(
	                    "%s: %d words but %d slot labels" % (guid, len(words), len(slot_labels))
	                )
	            examples.append(InputExample(guid=guid, words=words, intent_label=intent_label, slot_labels=slot_labels))
	        return examples

	    def get_examples(self, mode):
	        """Read the three files of one split and convert them to examples.

	        Args:
	            mode: train, dev, test (name of the split sub-directory under
	                ``<data_dir>/<token_level>/``).
	        """
	        data_path = os.path.join(self.args.data_dir, self.args.token_level, mode)
	        logger.info("LOOKING AT {}".format(data_path))
	        return self._create_examples(
	            texts=self._read_file(os.path.join(data_path, self.input_text_file)),
	            intents=self._read_file(os.path.join(data_path, self.intent_label_file)),
	            slots=self._read_file(os.path.join(data_path, self.slot_labels_file)),
	            set_type=mode,
	        )


	# Registry of processor classes keyed by token level; both keys map to JointProcessor.
	processors = {"syllable-level": JointProcessor, "word-level": JointProcessor}


	def convert_examples_to_features(
	    examples,
	    max_seq_len,
	    tokenizer,
	    pad_token_label_id=-100,
	    cls_token_segment_id=0,
	    pad_token_segment_id=0,
	    sequence_a_segment_id=0,
	    mask_padding_with_zero=True,
	):
	    """Convert examples into fixed-length InputFeatures.

	    Each word is tokenized separately. The first sub-token of a word keeps
	    the word's slot label id, every other position (extra sub-tokens, the
	    CLS/SEP tokens and padding) gets ``pad_token_label_id``. Sequences are
	    truncated to leave room for the two special tokens and then padded up to
	    ``max_seq_len``.

	    Args:
	        examples: Sequence of examples exposing ``guid``, ``words``,
	            ``slot_labels`` and ``intent_label``.
	        max_seq_len: Length of every produced sequence.
	        tokenizer: Object providing ``cls_token``, ``sep_token``,
	            ``unk_token``, ``pad_token_id``, ``tokenize`` and
	            ``convert_tokens_to_ids``.
	        pad_token_label_id: Slot label id used for positions without a real label.
	        cls_token_segment_id: Segment id of the CLS position.
	        pad_token_segment_id: Segment id of the padding positions.
	        sequence_a_segment_id: Segment id of the word pieces and SEP.
	        mask_padding_with_zero: If true, real tokens are masked with 1 and
	            padding with 0; otherwise the two values are swapped.

	    Returns:
	        list of InputFeatures, one per example.
	    """
	    # Special tokens / ids depend on the tokenizer of the current model type.
	    cls_token = tokenizer.cls_token
	    sep_token = tokenizer.sep_token
	    unk_token = tokenizer.unk_token
	    pad_token_id = tokenizer.pad_token_id

	    # Mask value for real tokens and for padding positions.
	    real_flag, pad_flag = (1, 0) if mask_padding_with_zero else (0, 1)

	    features = []
	    for ex_index, example in enumerate(examples):
	        if ex_index % 5000 == 0:
	            logger.info("Writing example %d of %d" % (ex_index, len(examples)))

	        # Tokenize word by word so labels stay aligned with words (as for NER).
	        tokens, slot_labels_ids = [], []
	        for word, slot_label in zip(example.words, example.slot_labels):
	            # A word the tokenizer cannot encode is replaced by the unknown token.
	            pieces = tokenizer.tokenize(word) or [unk_token]
	            tokens.extend(pieces)
	            # Real label id on the first piece, padding label id on the others.
	            slot_labels_ids.append(int(slot_label))
	            slot_labels_ids.extend([pad_token_label_id] * (len(pieces) - 1))

	        # Keep room for [CLS] and [SEP].
	        special_tokens_count = 2
	        budget = max_seq_len - special_tokens_count
	        if len(tokens) > budget:
	            tokens = tokens[:budget]
	            slot_labels_ids = slot_labels_ids[:budget]

	        # Wrap the sequence as [CLS] pieces... [SEP].
	        tokens = [cls_token, *tokens, sep_token]
	        slot_labels_ids = [pad_token_label_id, *slot_labels_ids, pad_token_label_id]
	        token_type_ids = [cls_token_segment_id] + [sequence_a_segment_id] * (len(tokens) - 1)

	        input_ids = tokenizer.convert_tokens_to_ids(tokens)

	        # Only real tokens are attended to; the mask is extended for padding below.
	        attention_mask = [real_flag] * len(input_ids)

	        # Pad every sequence up to max_seq_len.
	        padding_length = max_seq_len - len(input_ids)
	        input_ids = input_ids + [pad_token_id] * padding_length
	        attention_mask = attention_mask + [pad_flag] * padding_length
	        token_type_ids = token_type_ids + [pad_token_segment_id] * padding_length
	        slot_labels_ids = slot_labels_ids + [pad_token_label_id] * padding_length

	        # Sanity-check that all four sequences ended up with the target length.
	        length_checks = (
	            (input_ids, "Error with input length {} vs {}"),
	            (attention_mask, "Error with attention mask length {} vs {}"),
	            (token_type_ids, "Error with token type length {} vs {}"),
	            (slot_labels_ids, "Error with slot labels length {} vs {}"),
	        )
	        for sequence, message in length_checks:
	            assert len(sequence) == max_seq_len, message.format(len(sequence), max_seq_len)

	        intent_label_id = int(example.intent_label)

	        # Dump the first few converted examples for inspection.
	        if ex_index < 5:
	            logger.info("* Example *")
	            logger.info("guid: %s" % example.guid)
	            logger.info("tokens: %s" % " ".join(map(str, tokens)))
	            logger.info("input_ids: %s" % " ".join(map(str, input_ids)))
	            logger.info("attention_mask: %s" % " ".join(map(str, attention_mask)))
	            logger.info("token_type_ids: %s" % " ".join(map(str, token_type_ids)))
	            logger.info("intent_label: %s (id = %d)" % (example.intent_label, intent_label_id))
	            logger.info("slot_labels: %s" % " ".join(map(str, slot_labels_ids)))

	        converted = InputFeatures(
	            input_ids=input_ids,
	            attention_mask=attention_mask,
	            token_type_ids=token_type_ids,
	            intent_label_id=intent_label_id,
	            slot_labels_ids=slot_labels_ids,
	        )
	        features.append(converted)

	    return features


	def load_and_cache_examples(args, tokenizer, mode):
	    """Return the TensorDataset of one split, caching its features on disk.

	    Features are loaded from a cache file inside ``args.data_dir`` when it
	    exists; otherwise they are built from the split's data files and saved
	    to that cache file.

	    Args:
	        args: Namespace providing ``token_level``, ``data_dir``,
	            ``model_name_or_path``, ``max_seq_len`` and ``ignore_index``.
	        tokenizer: Tokenizer handed to ``convert_examples_to_features``.
	        mode: One of "train", "dev" or "test".

	    Returns:
	        TensorDataset of (input_ids, attention_mask, token_type_ids,
	        intent_label_ids, slot_labels_ids), all of dtype ``torch.long``.

	    Raises:
	        Exception: if no cache file exists and ``mode`` is not a known split.
	    """
	    processor = processors[args.token_level](args)

	    # The cache name encodes the split, the token level, the last non-empty
	    # component of the model name/path and the maximum sequence length.
	    model_tag = [part for part in args.model_name_or_path.split("/") if part].pop()
	    cache_name = "cached_{}_{}_{}_{}".format(mode, args.token_level, model_tag, args.max_seq_len)
	    cached_features_file = os.path.join(args.data_dir, cache_name)

	    if not os.path.exists(cached_features_file):
	        # No cache yet: build the features from the dataset files.
	        logger.info("Creating features from dataset file at %s", args.data_dir)
	        if mode not in ("train", "dev", "test"):
	            raise Exception("For mode, Only train, dev, test is available")
	        examples = processor.get_examples(mode)

	        # The loss ignore index doubles as the padding label id, so only
	        # positions with a real label carry a non-ignored id.
	        features = convert_examples_to_features(
	            examples, args.max_seq_len, tokenizer, pad_token_label_id=args.ignore_index
	        )
	        logger.info("Saving features into cached file %s", cached_features_file)
	        torch.save(features, cached_features_file)
	    else:
	        logger.info("Loading features from cached file %s", cached_features_file)
	        # NOTE(review): torch.load unpickles the cached object, so only trusted
	        # cache files should live in data_dir. Recent PyTorch releases may
	        # default to weights_only=True and refuse this list of Python
	        # objects - confirm against the installed version.
	        features = torch.load(cached_features_file)

	    # Stack each feature field into one long tensor and bundle them as a dataset.
	    field_names = ("input_ids", "attention_mask", "token_type_ids", "intent_label_id", "slot_labels_ids")
	    tensors = [
	        torch.tensor([getattr(feature, name) for feature in features], dtype=torch.long)
	        for name in field_names
	    ]
	    return TensorDataset(*tensors)