|
from spacy.tokens import Doc, Span, Token |
|
from spacy.language import Language |
|
import pandas as pd |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Default aggregation settings, keyed by entity label.  Every entry has the
# shape ``{'method': <name>}``; FeatureAggregatorComponent.features reads the
# method name to decide how the values collected for that label are reduced
# ('first', 'join' or 'expand').  Each label gets its own settings dict.
default_feature_aggregation_config = {
    label: {'method': method}
    for label, method in (
        ('INCH', 'first'),
        ('MOUNTING_FEATURE', 'join'),
        ('OS', 'first'),
        ('REFRESH_RATE', 'first'),
        ('RESOLUTION', 'first'),
        ('SOFTWARE_FEATURE', 'expand'),
        ('VIDEO_FEATURE', 'expand'),
        ('AUDIO_FEATURE', 'expand'),
        ('COLOR', 'join'),
        ('WIRELESS_FEATURE', 'expand'),
    )
}
|
|
|
@Language.factory("feature_aggregator_component")
class FeatureAggregatorComponent(object):
    """Pipeline component exposing aggregated entity features on a ``Doc``.

    The component does not modify the document when called.  Instead it
    registers four ``Doc`` extensions (with ``force=True``, so an earlier
    registration under the same name is replaced):

    * ``raw_features``  - getter, see :meth:`raw_features`
    * ``features``      - getter, see :meth:`features`
    * ``add_to_dataframe`` - method, see :meth:`add_to_dataframe`
    * ``feature_aggregation_config`` - getter/setter for the component config

    The entities are expected to carry the custom attributes ``ent._.text``
    and ``ent._.count``; those are read here but registered elsewhere.
    """

    # Aggregation methods that operate on the collected entity texts.
    _TEXT_METHODS = ('first', 'join', 'expand')

    def __init__(self, nlp, name, config=default_feature_aggregation_config):
        """Store the aggregation config and register the ``Doc`` extensions.

        Args:
            nlp: The pipeline object passed in by the factory (unused here).
            name: The component name passed in by the factory (unused here).
            config: Mapping ``label -> {'method': ...}``.  ``None`` falls back
                to ``default_feature_aggregation_config``.
        """
        self.config = default_feature_aggregation_config if config is None else config
        Doc.set_extension("raw_features", getter=self.raw_features, force=True)
        Doc.set_extension("features", getter=self.features, force=True)
        Doc.set_extension("add_to_dataframe", method=self.add_to_dataframe, force=True)
        Doc.set_extension(
            "feature_aggregation_config",
            getter=self.get_feature_aggregation_config,
            setter=self.set_feature_aggregation_config,
            force=True,
        )

    def __call__(self, doc):
        """Return ``doc`` unchanged; all results are computed by the getters."""
        return doc

    def get_feature_aggregation_config(self, doc):
        """Return the component's current aggregation config (``doc`` is unused)."""
        return self.config

    def set_feature_aggregation_config(self, doc, config):
        """Replace the component's aggregation config (``doc`` is unused)."""
        self.config = config

    def _collect_ordered(self, doc):
        """Collect entity features while remembering first-occurrence order.

        Returns:
            A dict with two kinds of entries:

            * ``label -> dict`` for entities whose ``ent._.count`` is ``None``;
              the inner dict's keys are the distinct ``ent._.text`` values in
              document order (the dict is used as an ordered set).
            * ``text -> number`` for entities with a count; counts of entities
              sharing the same ``ent._.text`` are summed.
        """
        collected = {}
        for ent in doc.ents:
            count = ent._.count
            text = ent._.text
            if count is None:
                # dict keys give de-duplication plus insertion order
                collected.setdefault(ent.label_, {})[text] = None
            else:
                collected[text] = collected.get(text, 0) + count
        return collected

    def raw_features(self, doc):
        """Return the un-aggregated features of ``doc``.

        Returns:
            A dict mapping each entity label to the ``set`` of its texts and
            each counted entity text to its summed count.
        """
        return {
            name: set(values) if isinstance(values, dict) else values
            for name, values in self._collect_ordered(doc).items()
        }

    def features(self, doc):
        """Return the features of ``doc`` aggregated according to the config.

        For a label whose config entry has a ``'method'``:

        * ``'first'``  - the first text seen in the document for that label
          (``nan`` if there is none).  A set's ``pop()`` would return an
          arbitrary element, so document order is tracked explicitly.
        * ``'join'``   - the distinct texts joined with ``','`` in document
          order.
        * ``'expand'`` - one ``text -> 1`` entry per distinct text.
        * anything else - a message is printed and the raw set is kept.

        Labels without a config entry or without a ``'method'`` keep their raw
        set.  Count-valued features are never aggregated by the text methods;
        they are passed through unchanged instead of raising.
        """
        features = {}
        for name, values in self._collect_ordered(doc).items():
            is_count = not isinstance(values, dict)

            if name not in self.config or 'method' not in self.config[name]:
                features[name] = values if is_count else set(values)
                continue

            method = self.config[name]["method"]
            if method not in self._TEXT_METHODS:
                print(f"unknown feature aggregation method: {method}, skipping...")
                features[name] = values if is_count else set(values)
            elif is_count:
                # a number cannot be popped, joined or iterated
                features[name] = values
            elif method == 'first':
                ordered = list(values)
                features[name] = ordered[0] if ordered else float('nan')
            elif method == 'join':
                features[name] = ','.join(values)
            else:  # 'expand'
                for value in values:
                    features[value] = 1
        return features

    def add_to_dataframe(self, doc, df):
        """Return ``df`` with the features of ``doc`` concatenated as a row.

        Each feature becomes a column holding a single value.  ``df`` itself
        is not modified and the index of the result is not reset.  When no
        features were extracted the frame built from them is empty, so
        (presumably) no row is added.
        """
        single_row = {name: [value] for name, value in self.features(doc).items()}
        return pd.concat([df, pd.DataFrame(single_row)])
|
|