en_setec_mk_tv / normalizer_component.py
oh201516's picture
Update spaCy pipeline
ecea783 verified
import json
from spacy.language import Language
from spacy.matcher import PhraseMatcher
# Default normalization table. Each key is the canonical name written back to
# an entity; the list holds the spelling variants that map to it. Variants are
# matched case-insensitively by the component below, so listing an upper-case
# or lower-case duplicate of a variant is redundant. A custom table with the
# same shape can be passed to the component through its `norms` parameter.
default_normalization_table = {
    "Dolby Atmos": ["Dolby Atmos", "Dolby Audio Atmos", "Dolby Atmos Audio"],
    "Ethernet": [
        "Ethernet LAN",
        "Ethernet port RJ-45",
        "Ethernet RJ-45",
        "Ethernet RJ45",
        "Ethernet-LAN RJ-45",
        "LAN RJ45",
        "Ethernet R45",
    ],
    "CI+ Slot": [
        "CI+ Card Slot",
        "Common Interface Plus (CI+)",
        "Common Interface Plus",
        "Card Slot CI +",
    ],
    "Scart": ["SCART", "Scart Input"],
    "Component In": [
        "Component In",
        "Component in(YPbPr)",
        "Component Input",
        "Component (Y/Pb/Pr)",
        "Component In (Y/Pb/Pr)",
    ],
    "USB 2.0": ["USB2.0"],
    "Digital Audio": [
        "Digital Audio Out",
        "Digital Audio Output",
        "Digital Audio Output(Coaxial and Optic)",
    ],
    "Composite In": ["Composite", "AV Composite In"],
    "3.5mm Headphone jack": ["3.5mm Headphone jack", "Headphone 3.5mm jack"],
    "Optical Audio Out": ["Optical Audio Out", "Optical Out"],
    # NOTE(review): "ANDROID" here and "Android" under "Android TV" below are
    # the same pattern once lower-cased, so a bare "Android" matches both
    # keys. Which canonical name is intended should be confirmed.
    "Android": ["ANDROID"],
    "Android 7.1": ["Android Nougat"],
    "Google TV": ["GoogleTV", "Google LED TV", "Google miniLED TV", "Google OLED TV"],
    "VIDAA U4": ["VIDAA U4.0"],
    "Android TV": ["Android TV", "AndroidTV", "Android"],
    "Titan OS": ["TITAN OS"],
    "7680x4320": ["8K"],
    "3840x2160": ["4K", "4K UltraHD", "4K Ultra HD", "UltraHD", "Ultra HD"],
    "1920x1080": ["FullHD", "Full HD"],
    "1366x768": ["HD Ready", "HDReady"],
    "1280x720": ["HD"],
    "640x480": ["SD"],
    "Wifi": ["Wifi", "Wi-Fi", "Wifi built in", "built in Wifi", "WiFi integrated"],
    # The canonical key used to be misspelled "Blutooth"; the old spelling is
    # kept as a variant so text normalized by the previous table still maps
    # to the corrected name.
    "Bluetooth": ["BLUETOOTH", "Blutooth"],
}
@Language.factory("normalizer_component")
class NormalizerComponent:
    """spaCy pipeline component that rewrites entity text to a canonical name.

    For every entity in ``doc.ents`` the component reads the custom
    ``ent._.text`` attribute, searches it for known spelling variants with a
    ``PhraseMatcher`` working on lower-cased tokens, and on a match
    overwrites ``ent._.text`` with the canonical name (the key of the
    normalization table).

    NOTE(review): the ``text`` extension on entities is not registered in
    this block; presumably another part of the pipeline does so -- confirm.
    """

    def __init__(self, nlp, name, norms=None):
        """Build the phrase matcher from a normalization table.

        Args:
            nlp: The pipeline object; its vocab backs the matcher and its
                tokenizer (``make_doc``) is used for patterns and lookups.
            name: Component instance name passed in by the factory; unused.
            norms: ``None`` to use ``default_normalization_table``, a string
                path to a JSON file holding the table, or the table itself
                (mapping of canonical name to a list of variants).
        """
        if norms is None:
            self.norm_table = default_normalization_table
        elif isinstance(norms, str):
            # Context manager so the file handle is closed deterministically.
            with open(norms, encoding="utf-8") as table_file:
                self.norm_table = json.load(table_file)
        else:
            self.norm_table = norms
        self.matcher = PhraseMatcher(nlp.vocab, attr="LOWER")
        self.nlp = nlp
        # Register each canonical name as the match key for its variants.
        for canonical, variants in self.norm_table.items():
            self.matcher.add(
                canonical, [nlp.make_doc(variant) for variant in variants]
            )

    def __call__(self, doc):
        """Normalize ``ent._.text`` of every entity in ``doc`` and return it.

        Args:
            doc: The document whose entities are inspected.

        Returns:
            The same ``doc`` object; only ``ent._.text`` values are changed.
        """
        strings = self.nlp.vocab.strings
        for ent in doc.ents:
            matches = self.matcher(self.nlp.make_doc(ent._.text))
            # Every match is applied in the order the matcher reports them,
            # so when several variants match, the last one determines the
            # final value.
            for match_id, _start, _end in matches:
                ent._.text = strings[match_id].strip()
        return doc