oh201516
/

en_setec_mk_tv

Token Classification

Model card Files Files and versions Community

en_setec_mk_tv / normalizer_component.py

oh201516's picture

Update spaCy pipeline

ecea783 verified 7 months ago

history blame contribute delete

2.71 kB

	import json

	from spacy.language import Language
	from spacy.matcher import PhraseMatcher

	# Default normalization table. It is used when the component receives no
	# `norms` argument and can be replaced by passing a custom table (or a path
	# to a JSON file holding one) to the component.
	#
	# Each key is the canonical form written back onto a matching entity; each
	# value lists the surface variants that trigger that canonical form. The
	# component matches variants on the lowercase token form, so casing of the
	# variants does not matter.
	#
	# NOTE(review): "Android" is a variant of both "Android" (via "ANDROID") and
	# "Android TV"; because matching is case-insensitive these two entries
	# compete for the same text -- confirm which canonical form is intended.
	default_normalization_table = {
	    "Dolby Atmos": ["Dolby Atmos", "Dolby Audio Atmos", "Dolby Atmos Audio"],
	    "Ethernet": [
	        "Ethernet LAN",
	        "Ethernet port RJ-45",
	        "Ethernet RJ-45",
	        "Ethernet RJ45",
	        "Ethernet-LAN RJ-45",
	        "LAN RJ45",
	        "Ethernet R45",
	    ],
	    "CI+ Slot": [
	        "CI+ Card Slot",
	        "Common Interface Plus (CI+)",
	        "Common Interface Plus",
	        "Card Slot CI +",
	    ],
	    "Scart": ["SCART", "Scart Input"],
	    "Component In": [
	        "Component In",
	        "Component in(YPbPr)",
	        "Component Input",
	        "Component (Y/Pb/Pr)",
	        "Component In (Y/Pb/Pr)",
	    ],
	    "USB 2.0": ["USB2.0"],
	    "Digital Audio": [
	        "Digital Audio Out",
	        "Digital Audio Output",
	        "Digital Audio Output(Coaxial and Optic)",
	    ],
	    "Composite In": ["Composite", "AV Composite In"],
	    "3.5mm Headphone jack": ["3.5mm Headphone jack", "Headphone 3.5mm jack"],
	    "Optical Audio Out": ["Optical Audio Out", "Optical Out"],
	    "Android": ["ANDROID"],
	    "Android 7.1": ["Android Nougat"],
	    "Google TV": ["GoogleTV", "Google LED TV", "Google miniLED TV", "Google OLED TV"],
	    "VIDAA U4": ["VIDAA U4.0"],
	    "Android TV": ["Android TV", "AndroidTV", "Android"],
	    "Titan OS": ["TITAN OS"],
	    "7680x4320": ["8K"],
	    "3840x2160": ["4K", "4K UltraHD", "4K Ultra HD", "UltraHD", "Ultra HD"],
	    "1920x1080": ["FullHD", "Full HD"],
	    "1366x768": ["HD Ready", "HDReady"],
	    "1280x720": ["HD"],
	    "640x480": ["SD"],
	    "Wifi": ["Wifi", "Wi-Fi", "Wifi built in", "built in Wifi", "WiFi integrated"],
	    "Bluetooth": ["BLUETOOTH"],
	}


	@Language.factory("normalizer_component")
	class NormalizerComponent:
	    """Pipeline component that rewrites entity text to a canonical form.

	    A normalization table maps each canonical name to a list of surface
	    variants. Every variant is registered with a case-insensitive
	    ``PhraseMatcher``; at call time the ``ent._.text`` value of each entity
	    is matched against the table and replaced by the canonical name.

	    NOTE(review): ``ent._.text`` is a custom extension attribute that this
	    class reads and writes but does not register -- presumably it is set up
	    elsewhere in the pipeline; confirm.
	    """

	    def __init__(self, nlp, name, norms=None):
	        """Build the matcher from the normalization table.

	        Args:
	            nlp: The pipeline object; its vocab and tokenizer are used to
	                build and apply the patterns.
	            name: The name of this component instance in the pipeline.
	            norms: ``None`` to use ``default_normalization_table``, a string
	                path to a JSON file containing the table, or the table
	                itself (mapping canonical name -> list of variants).
	        """
	        if norms is None:
	            norm_table = default_normalization_table
	        elif isinstance(norms, str):
	            # Context manager so the file handle is closed deterministically.
	            with open(norms, encoding="utf-8") as table_file:
	                norm_table = json.load(table_file)
	        else:
	            norm_table = norms

	        self.norm_table = norm_table
	        self.name = name
	        self.nlp = nlp
	        # attr="LOWER" makes matching insensitive to letter case.
	        self.matcher = PhraseMatcher(nlp.vocab, attr="LOWER")

	        # The canonical name doubles as the match key, so a match id can be
	        # turned back into the replacement text through the string store.
	        for canonical, variants in norm_table.items():
	            self.matcher.add(
	                canonical, [nlp.make_doc(variant) for variant in variants]
	            )

	    def __call__(self, doc):
	        """Normalize ``ent._.text`` for every entity in ``doc``.

	        When several table entries match the same entity text, the longest
	        match (in tokens) wins so that a short pattern contained in a longer
	        one (e.g. "HD" inside "Full HD") cannot override it. Among matches
	        of equal length the later one wins.

	        Args:
	            doc: The document whose entities are normalized in place.

	        Returns:
	            The same ``doc`` object.
	        """
	        strings = self.nlp.vocab.strings
	        for ent in doc.ents:
	            text = ent._.text
	            if not text:
	                # Nothing to normalize (empty or unset text).
	                continue

	            best_id = None
	            best_length = -1
	            for match_id, start, end in self.matcher(self.nlp.make_doc(text)):
	                # ">=" keeps the later match on ties.
	                if end - start >= best_length:
	                    best_id = match_id
	                    best_length = end - start

	            if best_id is not None:
	                ent._.text = strings[best_id].strip()
	        return doc