from spacy.tokens import Doc, Span, Token
from spacy.language import Language
import pandas as pd

# Default aggregation modes used when turning entity labels into features.
#
# There are three methods:
# - `first`: keeps only the first occurrence and ignores the rest. This is
#   useful for features like resolution: if the resolution is repeated we
#   only want the first occurrence.
# - `expand`: effectively one-hot encoding; each feature value becomes its own
#   column and 1 is stored when the value is present.
# - `join`: concatenates the feature values (comma-separated) under the
#   feature label.
default_feature_aggregation_config = {
    'INCH': {
        'method': 'first',
    },
    'MOUNTING_FEATURE': {
        'method': 'join',
    },
    'OS': {
        'method': 'first',
    },
    'REFRESH_RATE': {
        'method': 'first',
    },
    'RESOLUTION': {
        'method': 'first',
    },
    'SOFTWARE_FEATURE': {
        'method': 'expand',
    },
    'VIDEO_FEATURE': {
        'method': 'expand',
    },
    'AUDIO_FEATURE': {
        'method': 'expand',
    },
    'COLOR': {
        'method': 'join',
    },
    'WIRELESS_FEATURE': {
        'method': 'expand',
    },
}


@Language.factory("feature_aggregator_component")
class FeatureAggregatorComponent:
    """Pipeline component exposing aggregated entity features on a ``Doc``.

    The component itself leaves the document untouched; all of the work is
    done lazily through the ``Doc`` extensions registered in ``__init__``:

    - ``doc._.raw_features``: unaggregated features (see ``raw_features``).
    - ``doc._.features``: features aggregated according to the config
      (see ``features``).
    - ``doc._.add_to_dataframe(df)``: append the features as a row to ``df``.
    - ``doc._.feature_aggregation_config``: read or replace the config.

    Entities are read through ``ent._.count`` and ``ent._.text``; these
    extensions are not defined here and are presumably registered by an
    earlier component -- TODO confirm.
    """

    def __init__(self, nlp, name, config=default_feature_aggregation_config):
        """Store the aggregation config and register the ``Doc`` extensions.

        Args:
            nlp: The pipeline object handed over by the factory (unused).
            name: The component name handed over by the factory (unused).
            config: Mapping of feature label to a dict with a ``'method'``
                key (``'first'``, ``'join'`` or ``'expand'``). The default is
                the shared module-level dict; this class only reads it or
                replaces the reference, it never mutates it in place.
        """
        self.config = config

        # force=True overwrites any extension previously registered under the
        # same name, so the extensions are bound to the most recently created
        # component instance.
        Doc.set_extension("raw_features", getter=self.raw_features, force=True)
        Doc.set_extension("features", getter=self.features, force=True)
        Doc.set_extension(
            "add_to_dataframe", method=self.add_to_dataframe, force=True
        )
        Doc.set_extension(
            "feature_aggregation_config",
            getter=self.get_feature_aggregation_config,
            setter=self.set_feature_aggregation_config,
            force=True,
        )

    def __call__(self, doc):
        """Return ``doc`` unchanged; features are computed on attribute access."""
        return doc

    def get_feature_aggregation_config(self, doc):
        """Return the aggregation config currently used by this component."""
        return self.config

    def set_feature_aggregation_config(self, doc, config):
        """Replace the aggregation config used by this component."""
        self.config = config

    def _ordered_raw_features(self, doc):
        """Collect entity features while preserving the order of appearance.

        Returns:
            A dict whose values are either
            - a dict used as an insertion-ordered set of entity texts (keyed
              by entity label) for entities without a count, or
            - an accumulated number (keyed by entity text) for entities that
              carry a count.
        """
        collected = {}
        for ent in doc.ents:
            count = ent._.count
            text = ent._.text
            if count is None:
                # dict keys give set semantics without losing document order.
                collected.setdefault(ent.label_, {})[text] = None
            else:
                # Counted entities get their own column, keyed by their text,
                # and the counts are accumulated.
                collected[text] = collected.get(text, 0) + count
        return collected

    def raw_features(self, doc):
        """Return the unaggregated features of ``doc``.

        Returns:
            A dict mapping each entity label to the ``set`` of texts found
            for it, plus one entry per counted entity text mapping to the
            sum of its counts.
        """
        return {
            name: set(values) if isinstance(values, dict) else values
            for name, values in self._ordered_raw_features(doc).items()
        }

    def features(self, doc):
        """Return the features of ``doc`` aggregated according to the config.

        Labels without a config entry, without a ``'method'`` key, or with an
        unknown method keep their raw ``set`` of values. Accumulated counts
        are always passed through unchanged.

        Returns:
            A dict mapping column name to feature value.
        """
        features = {}
        for name, values in self._ordered_raw_features(doc).items():
            if not isinstance(values, dict):
                # Accumulated count: the aggregation methods only make sense
                # for collections of values, so pass the number through.
                features[name] = values
                continue

            if name not in self.config or 'method' not in self.config[name]:
                features[name] = set(values)
                continue

            method = self.config[name]["method"]
            if method == 'first':
                # First value in document order; the previous set.pop()
                # returned an arbitrary element.
                features[name] = next(iter(values), float('nan'))
            elif method == 'join':
                # Sorted so that the same values always produce the same
                # string, independent of set/hash iteration order.
                features[name] = ','.join(sorted(values))
            elif method == 'expand':
                for value in values:
                    features[value] = 1
            else:
                print(f"unknown feature aggregation method: {method}, skipping...")
                features[name] = set(values)
        return features

    def add_to_dataframe(self, doc, df):
        """Append the features of ``doc`` to ``df`` and return the result.

        Args:
            doc: The document whose aggregated features are appended.
            df: The DataFrame to extend; it is not modified in place.

        Returns:
            A new DataFrame: ``df`` concatenated with a single-row frame
            holding the features (nothing is added when there are none).
        """
        # Wrap every value in a list so that each feature becomes one cell of
        # a single row, even when the value itself is a collection.
        row = {name: [feature] for name, feature in self.features(doc).items()}
        return pd.concat([df, pd.DataFrame(row)])