File size: 3,742 Bytes

from spacy.tokens import Doc, Span, Token
from spacy.language import Language
import pandas as pd

# Default modes for feature extraction for the labels
#
# There are three methods:
# - `first`: Takes the first occurrence and stops. This is nice for features like resolution:
#            if the resolution is repeated, we just want the first occurrence.
# - `expand`: This effectively does OneHot encoding where the feature value names
#             become columns and 1 is put if the feature is there.
# - `join`: This concatenates the feature values under the feature label.
# Maps each entity label to its aggregation settings. Every label gets its own
# `{'method': ...}` dict, so editing one entry never affects another.
default_feature_aggregation_config = {
    label: {'method': method}
    for label, method in (
        ('INCH', 'first'),
        ('MOUNTING_FEATURE', 'join'),
        ('OS', 'first'),
        ('REFRESH_RATE', 'first'),
        ('RESOLUTION', 'first'),
        ('SOFTWARE_FEATURE', 'expand'),
        ('VIDEO_FEATURE', 'expand'),
        ('AUDIO_FEATURE', 'expand'),
        ('COLOR', 'join'),
        ('WIRELESS_FEATURE', 'expand'),
    )
}

@Language.factory("feature_aggregator_component")
class FeatureAggregatorComponent:
    """Pipeline component that exposes aggregated entity features on a Doc.

    The component does not modify the document in ``__call__``; instead it
    registers the following custom ``Doc`` extensions when it is constructed:

    - ``doc._.raw_features``: per-label sets of entity texts, plus summed
      counts for entities that carry a count.
    - ``doc._.features``: ``raw_features`` aggregated according to the
      configured method for each label (``first``, ``join`` or ``expand``).
    - ``doc._.add_to_dataframe(df)``: append the aggregated features as one
      row to a pandas DataFrame.
    - ``doc._.feature_aggregation_config``: read or replace the component's
      aggregation config.

    The entities are read through ``ent._.text`` and ``ent._.count``, which
    this class does not register itself.
    # NOTE(review): presumably another pipeline component registers those
    # Span extensions -- confirm it runs before these getters are used.
    """

    def __init__(self, nlp, name, config=default_feature_aggregation_config):
        """Store the aggregation config and register the Doc extensions.

        Args:
            nlp: The pipeline object passed in by the factory (unused here).
            name: The component name passed in by the factory (unused here).
            config: Mapping of entity label to a dict with a ``'method'`` key.
        """
        self.config = config
        # The extensions are registered with force=True, so constructing the
        # component again overwrites any previously registered extension of
        # the same name and rebinds it to the new instance.
        Doc.set_extension("raw_features", getter=self.raw_features, force=True)
        Doc.set_extension("features", getter=self.features, force=True)
        Doc.set_extension("add_to_dataframe", method=self.add_to_dataframe, force=True)
        Doc.set_extension(
            "feature_aggregation_config",
            getter=self.get_feature_aggregation_config,
            setter=self.set_feature_aggregation_config,
            force=True,
        )

    def __call__(self, doc):
        """Return ``doc`` unchanged; all work happens lazily in the getters."""
        return doc

    def get_feature_aggregation_config(self, doc):
        """Return the component's config (the same object for every doc)."""
        return self.config

    def set_feature_aggregation_config(self, doc, config):
        """Replace the component's config; this affects every doc, not just ``doc``."""
        self.config = config

    def _collect_features(self, doc):
        """Gather entity values from ``doc.ents`` in order of appearance.

        Returns:
            A dict where an entity without a count contributes its text under
            its label (stored as the keys of an inner dict, i.e. de-duplicated
            but in first-seen order), and an entity with a count adds that
            count to an integer stored under the entity's own text.
        """
        collected = {}
        for ent in doc.ents:
            text = ent._.text
            count = ent._.count
            if count is None:
                # Dict keys act as an insertion-ordered set of texts.
                collected.setdefault(ent.label_, {})[text] = None
            else:
                # If it has a count we put it in a separate column and
                # accumulate the counts.
                collected[text] = collected.get(text, 0) + count
        return collected

    def raw_features(self, doc):
        """Return the un-aggregated features of ``doc``.

        Returns:
            A dict mapping each label to the set of its entity texts, and each
            counted entity text to the sum of its counts.
        """
        return {
            name: set(values) if isinstance(values, dict) else values
            for name, values in self._collect_features(doc).items()
        }

    def features(self, doc):
        """Return the features of ``doc`` aggregated per the config.

        For every label that has a ``'method'`` configured:

        - ``first``: the text of the first entity seen for that label.
        - ``join``: all distinct texts joined with ``','`` in first-seen order.
        - ``expand``: one key per distinct text, each with the value ``1``.

        Labels without a configured method (and labels whose method is not
        recognised) keep their raw set of texts. Accumulated counts are always
        passed through unchanged, even if their name matches a configured
        label.
        """
        features = {}
        for name, values in self._collect_features(doc).items():
            if not isinstance(values, dict):
                # An accumulated count; the aggregation methods only apply to
                # collections of texts, so keep the number as it is.
                features[name] = values
                continue

            settings = self.config.get(name)
            if not settings or 'method' not in settings:
                features[name] = set(values)
                continue

            method = settings["method"]
            if method == 'first':
                # next(iter(...)) yields the first-seen text; a set's pop()
                # would return an arbitrary one.
                features[name] = next(iter(values)) if values else float('nan')
            elif method == 'join':
                features[name] = ','.join(values)
            elif method == 'expand':
                for value in values:
                    features[value] = 1
            else:
                print(f"unknown feature aggregation method: {method}, skipping...")
                features[name] = set(values)
        return features

    def add_to_dataframe(self, doc, df):
        """Return ``df`` with the aggregated features of ``doc`` appended.

        Each feature becomes a column of a one-row DataFrame which is then
        concatenated to ``df``. The input ``df`` is not modified.
        """
        # Wrap every value in a list so that each feature forms a single-row
        # column, whatever its type.
        row = {name: [feature] for name, feature in self.features(doc).items()}
        return pd.concat([df, pd.DataFrame(row)])