oh201516
/

en_setec_mk_tv

Token Classification

Model card Files Files and versions Community

en_setec_mk_tv / feature_aggregator_component.py

oh201516's picture

Update spaCy pipeline

ecea783 verified 4 months ago

history blame contribute delete

3.74 kB

	from spacy.tokens import Doc, Span, Token
	from spacy.language import Language
	import pandas as pd

	# Default modes for feature extraction for the labels
	#
	# There are three methods:
	# - `first`: Which gets the first occurrence and stops; this is nice for features like resolution,
	# if the resolution is repeated we just want the first occurrence.
	# - `expand`: This effectively does OneHot encoding where the feature value names
	# become columns and 1 is put if the feature is there.
	# - `join`: This concatenates the feature values (comma-separated) under the feature label.
	# Default aggregation method per entity label. Built from (label, method)
	# pairs so the table reads as one row per label; every label gets its own
	# inner dict, and the key order matches the order of the pairs below.
	default_feature_aggregation_config = {
	    label: {'method': method}
	    for label, method in (
	        ('INCH', 'first'),
	        ('MOUNTING_FEATURE', 'join'),
	        ('OS', 'first'),
	        ('REFRESH_RATE', 'first'),
	        ('RESOLUTION', 'first'),
	        ('SOFTWARE_FEATURE', 'expand'),
	        ('VIDEO_FEATURE', 'expand'),
	        ('AUDIO_FEATURE', 'expand'),
	        ('COLOR', 'join'),
	        ('WIRELESS_FEATURE', 'expand'),
	    )
	}

	@Language.factory("feature_aggregator_component")
	class FeatureAggregatorComponent:
	    """Pipeline component that exposes aggregated entity features on ``Doc._``.

	    The component does no work in ``__call__``; instead it registers these
	    ``Doc`` extensions, which are evaluated lazily when accessed:

	    - ``doc._.raw_features``: entity values grouped per label (see
	      ``raw_features``).
	    - ``doc._.features``: the raw features aggregated according to the
	      per-label ``method`` in the config (see ``features``).
	    - ``doc._.add_to_dataframe(df)``: append the features as one row.
	    - ``doc._.feature_aggregation_config``: read or replace the config.

	    Entities are read through ``ent._.text`` and ``ent._.count``; those
	    extensions are not registered here.
	    NOTE(review): presumably another pipeline component registers them - confirm.
	    """

	    def __init__(self, nlp, name, config=default_feature_aggregation_config):
	        """Store the config and register the ``Doc`` extensions.

	        Args:
	            nlp: The pipeline object passed by the factory (unused).
	            name: The component instance name passed by the factory (unused).
	            config: Mapping of entity label to ``{'method': ...}`` where the
	                method is one of ``'first'``, ``'join'`` or ``'expand'``.
	        """
	        self.config = config
	        # The extensions are bound to this instance's methods; force=True
	        # replaces any extension already registered under the same name.
	        Doc.set_extension("raw_features", getter=self.raw_features, force=True)
	        Doc.set_extension("features", getter=self.features, force=True)
	        Doc.set_extension("add_to_dataframe", method=self.add_to_dataframe, force=True)
	        Doc.set_extension(
	            "feature_aggregation_config",
	            getter=self.get_feature_aggregation_config,
	            setter=self.set_feature_aggregation_config,
	            force=True,
	        )

	    def __call__(self, doc):
	        """Return ``doc`` unchanged; features are computed by the getters."""
	        return doc

	    def get_feature_aggregation_config(self, doc):
	        """Return the component's current aggregation config."""
	        return self.config

	    def set_feature_aggregation_config(self, doc, config):
	        """Replace the aggregation config.

	        The config is stored on the component, not on ``doc``, so the new
	        value applies to every document served by this component.
	        """
	        self.config = config

	    def _collect_features(self, doc):
	        """Group entity values per label, keeping first-seen document order.

	        Returns a dict with two kinds of entries:
	        - ``label -> dict`` whose keys are the distinct ``ent._.text`` values
	          in the order they appear in ``doc.ents`` (entities without a count);
	        - ``text -> number`` holding the accumulated ``ent._.count`` for
	          entities that carry a count.
	        """
	        collected = {}
	        for ent in doc.ents:
	            text = ent._.text
	            count = ent._.count
	            if count is None:
	                # A dict is used as an insertion-ordered set so that 'first'
	                # and 'join' are deterministic and follow document order.
	                collected.setdefault(ent.label_, {})[text] = None
	            else:
	                # If it has a count we put it in a separate column and
	                # accumulate the counts.
	                collected[text] = collected.get(text, 0) + count
	        return collected

	    def raw_features(self, doc):
	        """Return un-aggregated features for ``doc``.

	        Returns:
	            A dict mapping each entity label to the ``set`` of its distinct
	            ``ent._.text`` values, plus one ``text -> accumulated count``
	            entry for every counted entity.
	        """
	        return {
	            name: set(values) if isinstance(values, dict) else values
	            for name, values in self._collect_features(doc).items()
	        }

	    def features(self, doc):
	        """Return features aggregated according to ``self.config``.

	        Per configured method:
	        - ``'first'``: the first value in document order (NaN if none);
	        - ``'join'``: the distinct values, comma-separated, in document order;
	        - ``'expand'``: one ``value -> 1`` entry per distinct value.

	        Labels without a config entry or without a ``'method'`` key, and
	        labels with an unknown method, are passed through as a ``set``.
	        Accumulated counts are always passed through unchanged.
	        """
	        features = {}
	        for name, values in self._collect_features(doc).items():
	            if not isinstance(values, dict):
	                # Accumulated count: never aggregated, even if the entity
	                # text happens to equal a configured label.
	                features[name] = values
	                continue

	            if name not in self.config or 'method' not in self.config[name]:
	                features[name] = set(values)
	                continue

	            ordered = list(values)
	            method = self.config[name]["method"]
	            if method == 'first':
	                features[name] = ordered[0] if ordered else float('nan')
	            elif method == 'join':
	                features[name] = ','.join(ordered)
	            elif method == 'expand':
	                for value in ordered:
	                    features[value] = 1
	            else:
	                print(f"unknown feature aggregation method: {method}, skipping...")
	                features[name] = set(values)
	        return features

	    def add_to_dataframe(self, doc, df):
	        """Return ``df`` with the features of ``doc`` appended as one row.

	        ``df`` itself is not modified; the concatenated frame is returned.
	        """
	        # Each value is wrapped in a list so pandas builds a single-row frame.
	        row = {name: [feature] for name, feature in self.features(doc).items()}
	        return pd.concat([df, pd.DataFrame(row)])