en_setec_mk_tv / feature_aggregator_component.py
oh201516's picture
Update spaCy pipeline
ecea783 verified
from spacy.tokens import Doc, Span, Token
from spacy.language import Language
import pandas as pd
# Default modes for feature extraction for the labels
#
# There are three methods:
# - `first`: Takes the first occurrence and stops. This is useful for features like resolution:
# if the resolution is repeated we only want the first occurrence.
# - `expand`: This effectively does one-hot encoding, where the feature value names
# become columns and 1 is put if the feature is there.
# - `join`: This concatenates the feature values under the feature label.
# Maps an entity label to its aggregation settings. Only the 'method' key is
# read by the aggregator; labels missing from this mapping are passed through
# without aggregation.
default_feature_aggregation_config = {
    'INCH': {'method': 'first'},
    'MOUNTING_FEATURE': {'method': 'join'},
    'OS': {'method': 'first'},
    'REFRESH_RATE': {'method': 'first'},
    'RESOLUTION': {'method': 'first'},
    'SOFTWARE_FEATURE': {'method': 'expand'},
    'VIDEO_FEATURE': {'method': 'expand'},
    'AUDIO_FEATURE': {'method': 'expand'},
    'COLOR': {'method': 'join'},
    'WIRELESS_FEATURE': {'method': 'expand'},
}
@Language.factory("feature_aggregator_component")
class FeatureAggregatorComponent(object):
    """spaCy pipeline component that turns a doc's entities into features.

    The component does not modify the doc when called. Instead it registers
    the following ``Doc`` extensions (under ``doc._``):

    - ``raw_features``: entity values grouped by label, before aggregation.
    - ``features``: the grouped values aggregated per the configuration.
    - ``add_to_dataframe(df)``: append the features as a one-row frame.
    - ``feature_aggregation_config``: get/set the active configuration.

    The extensions are registered with ``force=True``, so constructing a new
    instance replaces the getters/methods registered by a previous one.

    Entities are read through the custom ``ent._.text`` and ``ent._.count``
    extensions, which this class does not register itself (presumably another
    component in the pipeline does -- confirm against the pipeline config).
    """

    def __init__(self, nlp, name, config=default_feature_aggregation_config):
        """Store the aggregation config and register the Doc extensions.

        Args:
            nlp: The pipeline object passed by the spaCy factory (unused).
            name: The component instance name passed by spaCy (unused).
            config: Mapping of entity label -> ``{'method': ...}`` where the
                method is one of ``'first'``, ``'join'`` or ``'expand'``.
        """
        self.config = config
        Doc.set_extension("raw_features", getter=self.raw_features, force=True)
        Doc.set_extension("features", getter=self.features, force=True)
        Doc.set_extension("add_to_dataframe", method=self.add_to_dataframe, force=True)
        Doc.set_extension(
            "feature_aggregation_config",
            getter=self.get_feature_aggregation_config,
            setter=self.set_feature_aggregation_config,
            force=True,
        )

    def __call__(self, doc):
        """Return the doc unchanged; all work happens lazily in the getters."""
        return doc

    def get_feature_aggregation_config(self, doc):
        """Return the component-wide aggregation config (``doc`` is ignored)."""
        return self.config

    def set_feature_aggregation_config(self, doc, config):
        """Replace the component-wide aggregation config (``doc`` is ignored).

        The config lives on the component, not on the doc, so the new value
        applies to every doc processed afterwards.
        """
        self.config = config

    def _collect_features(self, doc):
        """Group entity values by label, keeping first-occurrence order.

        Returns:
            A dict with two kinds of entries:
            - label -> dict whose keys are the distinct entity texts in the
              order they first appear (the dict is used as an ordered set);
            - entity text -> accumulated count, for entities whose
              ``ent._.count`` is not ``None``.
        """
        collected = {}
        for ent in doc.ents:
            text = ent._.text
            count = ent._.count
            if count is None:
                # A plain set would lose the order of appearance, which the
                # 'first' and 'join' aggregation methods rely on.
                collected.setdefault(ent.label_, {})[text] = None
            else:
                # Counted entities get their own column and the counts are
                # accumulated across occurrences.
                collected[text] = collected.get(text, 0) + count
        return collected

    def raw_features(self, doc):
        """Return the un-aggregated features of ``doc``.

        Returns:
            A dict mapping each entity label to the ``set`` of its distinct
            texts, plus one ``text -> total count`` entry per counted entity.
        """
        return {
            name: set(values) if isinstance(values, dict) else values
            for name, values in self._collect_features(doc).items()
        }

    def features(self, doc):
        """Return the features of ``doc`` aggregated per ``self.config``.

        For each label with a configured method:

        - ``'first'``: the first text seen for the label in document order
          (NaN if there is none);
        - ``'join'``: the distinct texts joined with ``','`` in order of
          first appearance;
        - ``'expand'``: one ``text -> 1`` entry per distinct text.

        Labels without a configured method, and labels with an unknown method
        (which is reported on stdout), keep their raw set of texts. Counted
        features are passed through as their accumulated count.
        """
        features = {}
        for name, values in self._collect_features(doc).items():
            if not isinstance(values, dict):
                # Accumulated count: there is nothing to aggregate.
                features[name] = values
                continue
            if name not in self.config or 'method' not in self.config[name]:
                features[name] = set(values)
                continue

            method = self.config[name]["method"]
            if method == 'first':
                # next(iter(...)) yields the first inserted text; set.pop()
                # would return an arbitrary element instead.
                features[name] = next(iter(values), float('nan'))
            elif method == 'join':
                features[name] = ','.join(values)
            elif method == 'expand':
                for value in values:
                    features[value] = 1
            else:
                print(f"unknown feature aggregation method: {method}, skipping...")
                features[name] = set(values)
        return features

    def add_to_dataframe(self, doc, df):
        """Return ``df`` with the features of ``doc`` appended as one row.

        ``df`` itself is not modified. The result comes from ``pd.concat``
        without ``ignore_index``, so the appended row keeps the index label of
        the freshly built one-row frame; reset the index afterwards if unique
        labels are needed.
        """
        # Wrap every value in a list so pandas builds exactly one row, even
        # for set-valued features.
        row = {name: [value] for name, value in self.features(doc).items()}
        return pd.concat([df, pd.DataFrame(row)])