import json

from spacy.language import Language
from spacy.matcher import PhraseMatcher

# Default normalization table. Each key is the canonical label; its value is
# the list of surface forms that are rewritten to that label. A different
# table can be supplied to the component through its ``norms`` parameter.
#
# NOTE(review): the lower-cased form "android" is registered under both
# "Android" and "Android TV"; which label wins depends on the order in which
# the matcher reports matches -- confirm this is intended.
# NOTE(review): the key "Blutooth" looks like a typo for "Bluetooth". It is
# left unchanged because it is emitted at runtime and callers may rely on it.
default_normalization_table = {
    "Dolby Atmos": ["Dolby Atmos", "Dolby Audio Atmos", "Dolby Atmos Audio"],
    "Ethernet": [
        "Ethernet LAN",
        "Ethernet port RJ-45",
        "Ethernet RJ-45",
        "Ethernet RJ45",
        "Ethernet-LAN RJ-45",
        "LAN RJ45",
        "Ethernet R45",
    ],
    "CI+ Slot": [
        "CI+ Card Slot",
        "Common Interface Plus (CI+)",
        "Common Interface Plus",
        "Card Slot CI +",
    ],
    "Scart": ["SCART", "Scart Input"],
    "Component In": [
        "Component In",
        "Component in(YPbPr)",
        "Component Input",
        "Component (Y/Pb/Pr)",
        "Component In (Y/Pb/Pr)",
    ],
    "USB 2.0": ["USB2.0"],
    "Digital Audio": [
        "Digital Audio Out",
        "Digital Audio Output",
        "Digital Audio Output(Coaxial and Optic)",
    ],
    "Composite In": ["Composite", "AV Composite In"],
    "3.5mm Headphone jack": ["3.5mm Headphone jack", "Headphone 3.5mm jack"],
    "Optical Audio Out": ["Optical Audio Out", "Optical Out"],
    "Android": ["ANDROID"],
    "Android 7.1": ["Android Nougat"],
    "Google TV": ["GoogleTV", "Google LED TV", "Google miniLED TV", "Google OLED TV"],
    "VIDAA U4": ["VIDAA U4.0"],
    "Android TV": ["Android TV", "AndroidTV", "Android"],
    "Titan OS": ["TITAN OS"],
    "7680x4320": ["8K"],
    "3840x2160": ["4K", "4K UltraHD", "4K Ultra HD", "UltraHD", "Ultra HD"],
    "1920x1080": ["FullHD", "Full HD"],
    "1366x768": ["HD Ready", "HDReady"],
    "1280x720": ["HD"],
    "640x480": ["SD"],
    "Wifi": ["Wifi", "Wi-Fi", "Wifi built in", "built in Wifi", "WiFi integrated"],
    "Blutooth": ["BLUETOOTH"],
}


@Language.factory("normalizer_component")
class NormalizerComponent:
    """Pipeline component that rewrites entity text to a canonical label.

    A ``PhraseMatcher`` keyed on the ``LOWER`` token attribute is built from
    a normalization table (canonical label -> list of surface forms). When
    the component is called, each entity's ``ent._.text`` value is run
    through the matcher and, on a match, replaced by the matching label.
    """

    def __init__(self, nlp, name, norms=None):
        """Build the phrase matcher from the normalization table.

        Args:
            nlp: The pipeline object; its vocab backs the matcher and its
                tokenizer (``make_doc``) is used to build the patterns.
            name: Component name supplied by the factory (not stored).
            norms: ``None`` to use ``default_normalization_table``, a string
                path to a JSON file containing the table, or the table
                itself (mapping of label -> list of pattern strings).
        """
        if norms is None:
            self.norm_table = default_normalization_table
        elif isinstance(norms, str):
            # Use a context manager so the file handle is always closed.
            # JSON interchange text is UTF-8, so decode explicitly rather
            # than relying on the platform's locale encoding.
            with open(norms, encoding="utf-8") as table_file:
                self.norm_table = json.load(table_file)
        else:
            self.norm_table = norms

        self.matcher = PhraseMatcher(nlp.vocab, attr="LOWER")
        self.nlp = nlp

        # ``label`` rather than ``name`` so the factory argument is not
        # shadowed by the loop variable.
        for label, patterns in self.norm_table.items():
            self.matcher.add(label, [nlp.make_doc(pattern) for pattern in patterns])

    def __call__(self, doc):
        """Normalize ``ent._.text`` for every entity in ``doc``.

        The matcher runs once per entity over the tokenized ``ent._.text``.
        A pattern only has to occur somewhere inside that text; the whole
        value is then replaced by the canonical label. If several patterns
        match, the label of the last reported match is the one kept.

        NOTE(review): relies on a custom ``text`` extension on entity spans
        that is not registered in this file -- presumably set up elsewhere;
        confirm.

        Args:
            doc: The document whose entities are normalized in place.

        Returns:
            The same ``doc`` object.
        """
        for ent in doc.ents:
            for match_id, start, end in self.matcher(self.nlp.make_doc(ent._.text)):
                # Resolve the hash back to the label the patterns were
                # registered under.
                match_id_string = self.nlp.vocab.strings[match_id]
                ent._.text = match_id_string.strip()
        return doc