|
import json |
|
|
|
from spacy.language import Language |
|
from spacy.matcher import PhraseMatcher |
|
|
|
|
|
# Default normalization table: maps a canonical label (the key) to the list of
# surface variants that should be rewritten to it. NormalizerComponent feeds
# these variants to a PhraseMatcher built with attr="LOWER", so the variants
# are matched on their lowercase form and casing differences need not be listed.
default_normalization_table = {
    # --- Audio / connectivity ports ---
    "Dolby Atmos": ["Dolby Atmos", "Dolby Audio Atmos", "Dolby Atmos Audio"],
    "Ethernet": [
        "Ethernet LAN",
        "Ethernet port RJ-45",
        "Ethernet RJ-45",
        "Ethernet RJ45",
        "Ethernet-LAN RJ-45",
        "LAN RJ45",
        "Ethernet R45",
    ],
    "CI+ Slot": [
        "CI+ Card Slot",
        "Common Interface Plus (CI+)",
        "Common Interface Plus",
        "Card Slot CI +",
    ],
    "Scart": ["SCART", "Scart Input"],
    "Component In": [
        "Component In",
        "Component in(YPbPr)",
        "Component Input",
        "Component (Y/Pb/Pr)",
        "Component In (Y/Pb/Pr)",
    ],
    "USB 2.0": ["USB2.0"],
    "Digital Audio": [
        "Digital Audio Out",
        "Digital Audio Output",
        "Digital Audio Output(Coaxial and Optic)",
    ],
    "Composite In": ["Composite", "AV Composite In"],
    "3.5mm Headphone jack": ["3.5mm Headphone jack", "Headphone 3.5mm jack"],
    "Optical Audio Out": ["Optical Audio Out", "Optical Out"],
    # --- Operating systems / platforms ---
    # NOTE(review): "Android" is listed as a variant of both "Android" and
    # "Android TV" (the latter a few entries below). With lowercase matching the
    # same text matches both labels - confirm which canonical form is intended.
    "Android": ["ANDROID"],
    "Android 7.1": ["Android Nougat"],
    "Google TV": ["GoogleTV", "Google LED TV", "Google miniLED TV", "Google OLED TV"],
    "VIDAA U4": ["VIDAA U4.0"],
    "Android TV": ["Android TV", "AndroidTV", "Android"],
    "Titan OS": ["TITAN OS"],
    # --- Marketing resolution names -> pixel dimensions ---
    "7680x4320": ["8K"],
    "3840x2160": ["4K", "4K UltraHD", "4K Ultra HD", "UltraHD", "Ultra HD"],
    "1920x1080": ["FullHD", "Full HD"],
    "1366x768": ["HD Ready", "HDReady"],
    "1280x720": ["HD"],
    "640x480": ["SD"],
    # --- Wireless ---
    "Wifi": ["Wifi", "Wi-Fi", "Wifi built in", "built in Wifi", "WiFi integrated"],
    # Canonical label was previously misspelled "Blutooth".
    "Bluetooth": ["BLUETOOTH"],
}
|
|
|
|
|
@Language.factory("normalizer_component")
class NormalizerComponent:
    """Pipeline component that rewrites entity text to a canonical label.

    A normalization table maps each canonical label to a list of surface
    variants. Every variant is registered with a ``PhraseMatcher`` keyed on
    the lowercase token form, so matching ignores case. When the component
    is called, the ``ent._.text`` value of each entity in ``doc.ents`` is
    matched against the table and replaced by the matching label.

    NOTE(review): ``ent._.text`` is a custom extension attribute; its
    registration is not visible in this block and is presumably done
    elsewhere - confirm.
    """

    def __init__(self, nlp, name, norms=None):
        """Build the phrase matcher from the normalization table.

        Args:
            nlp: The pipeline object; its vocab and tokenizer (``make_doc``)
                are used to build and apply the patterns.
            name: Component name supplied by the factory (unused here).
            norms: ``None`` to use ``default_normalization_table``, a ``str``
                path to a JSON file holding the table, or the table itself
                (a mapping of label -> list of variant strings).
        """
        if norms is None:
            self.norm_table = default_normalization_table
        elif isinstance(norms, str):
            # Use a context manager so the file handle is closed promptly
            # instead of being left to the garbage collector.
            with open(norms, encoding="utf-8") as table_file:
                self.norm_table = json.load(table_file)
        else:
            self.norm_table = norms

        self.matcher = PhraseMatcher(nlp.vocab, attr="LOWER")
        self.nlp = nlp

        # ``label`` rather than ``name`` so the constructor parameter is not
        # shadowed by the loop variable.
        for label, patterns in self.norm_table.items():
            self.matcher.add(label, [nlp.make_doc(pattern) for pattern in patterns])

    def __call__(self, doc):
        """Normalize ``ent._.text`` for every entity in ``doc`` and return ``doc``.

        If several table entries match the same entity text, each match
        overwrites the previous one, so the last match yielded by the
        matcher determines the final value.
        """
        for ent in doc.ents:
            # The matcher runs once on the entity's current text; assignments
            # inside the loop do not affect the matches being iterated.
            matches = self.matcher(self.nlp.make_doc(ent._.text))
            for match_id, _start, _end in matches:
                # match_id is the hash of the canonical label; resolve it
                # back to the string through the vocab's string store.
                canonical = self.nlp.vocab.strings[match_id]
                ent._.text = canonical.strip()
        return doc
|
|