import re

from spacy.language import Language
from spacy.tokens import Doc, Span


# https://spacy.io/usage/processing-pipelines#custom-components
@Language.factory("count_extraction_component", default_config={"label": "CONNECTION"})
class CountExtractorComponent:
    # By default this only extracts counts from CONNECTION entities, but the
    # target label can be changed via the factory config.
    def __init__(self, nlp: Language, name: str, label: str):
        self.label = label
        # "3 x Widget" / "3x Widget"
        self.reg_left = re.compile(r"^(?P<count>\d+)\s*[xX]\s*(?P<name>.+)$")
        # "Widget x 3" / "Widget x3"
        self.reg_right = re.compile(r"^(?P<name>.+)\s*[xX]\s*(?P<count>\d+)$")
        # "Widget 3 x" / "Widget 3x"
        self.reg_right_inverted = re.compile(r"^(?P<name>.+)\s*(?P<count>\d+)\s*[xX]$")
        # Register custom attributes on spans.
        Span.set_extension("count", default=None, force=True)
        Span.set_extension("text", default=None, force=True)

    def __call__(self, doc: Doc) -> Doc:
        for ent in doc.ents:
            text = ent.text.strip()
            if ent.label_ != self.label:
                ent._.text = text
                continue
            # Try each pattern in turn; the first match wins.
            for pattern in (self.reg_left, self.reg_right, self.reg_right_inverted):
                m = pattern.match(text)
                if m is not None:
                    groups = m.groupdict()
                    ent._.text = groups["name"].strip()
                    ent._.count = int(groups["count"])
                    break
            else:
                # No explicit count found: keep the text and default the count to 1.
                ent._.text = text
                ent._.count = 1
        return doc
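# --- Usage sketch (illustrative assumptions) --------------------------------
# A minimal end-to-end run of the component above. The blank English
# pipeline, the entity_ruler pattern, and the sample text are assumptions
# made for this example only; any pipeline that produces CONNECTION
# entities will work the same way.
import spacy

nlp = spacy.blank("en")

# Tag "2 x HDMI" as a CONNECTION entity so the component has input to parse.
ruler = nlp.add_pipe("entity_ruler")
ruler.add_patterns([
    {"label": "CONNECTION",
     "pattern": [{"TEXT": "2"}, {"LOWER": "x"}, {"LOWER": "hdmi"}]},
])

# The factory is registered by the decorator above, so it can be added by name.
nlp.add_pipe("count_extraction_component", last=True)

doc = nlp("Ports: 2 x HDMI")
for ent in doc.ents:
    print(ent.text, "->", ent._.text, ent._.count)  # 2 x HDMI -> HDMI 2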