oh201516 committed on
Commit
03f139a
1 Parent(s): b250531

Update spaCy pipeline

Browse files
README.md CHANGED
@@ -26,8 +26,8 @@ model-index:
26
  | **Name** | `en_setec_mk_tv` |
27
  | **Version** | `0.0.0` |
28
  | **spaCy** | `>=3.7.5,<3.8.0` |
29
- | **Default Pipeline** | `tok2vec`, `ner` |
30
- | **Components** | `tok2vec`, `ner` |
31
  | **Vectors** | 0 keys, 0 unique vectors (0 dimensions) |
32
  | **Sources** | n/a |
33
  | **License** | n/a |
 
26
  | **Name** | `en_setec_mk_tv` |
27
  | **Version** | `0.0.0` |
28
  | **spaCy** | `>=3.7.5,<3.8.0` |
29
+ | **Default Pipeline** | `tok2vec`, `ner`, `count_extraction_component`, `normalizer_component` |
30
+ | **Components** | `tok2vec`, `ner`, `count_extraction_component`, `normalizer_component` |
31
  | **Vectors** | 0 keys, 0 unique vectors (0 dimensions) |
32
  | **Sources** | n/a |
33
  | **License** | n/a |
config.cfg CHANGED
@@ -10,7 +10,7 @@ seed = 0
10
 
11
  [nlp]
12
  lang = "en"
13
- pipeline = ["tok2vec","ner","count_extraction_component"]
14
  batch_size = 1000
15
  disabled = []
16
  before_creation = null
@@ -46,6 +46,10 @@ nO = null
46
  width = ${components.tok2vec.model.encode.width}
47
  upstream = "*"
48
 
 
 
 
 
49
  [components.tok2vec]
50
  factory = "tok2vec"
51
 
 
10
 
11
  [nlp]
12
  lang = "en"
13
+ pipeline = ["tok2vec","ner","count_extraction_component","normalizer_component"]
14
  batch_size = 1000
15
  disabled = []
16
  before_creation = null
 
46
  width = ${components.tok2vec.model.encode.width}
47
  upstream = "*"
48
 
49
+ [components.normalizer_component]
50
+ factory = "normalizer_component"
51
+ norm_file = "normilization.json"
52
+
53
  [components.tok2vec]
54
  factory = "tok2vec"
55
 
en_setec_mk_tv-any-py3-none-any.whl CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:486bf4e15ccf77ad023ba0362bf1142e833b37e7506d78a7141975d4480ebdd3
3
- size 5706265
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:32b24f9fbb068cdd0818a7286c4afdd871eef0d64a5bb0ec99824e8a66f5d6b0
3
+ size 5707744
meta.json CHANGED
@@ -36,12 +36,14 @@
36
  "pipeline":[
37
  "tok2vec",
38
  "ner",
39
- "count_extraction_component"
 
40
  ],
41
  "components":[
42
  "tok2vec",
43
  "ner",
44
- "count_extraction_component"
 
45
  ],
46
  "disabled":[
47
 
 
36
  "pipeline":[
37
  "tok2vec",
38
  "ner",
39
+ "count_extraction_component",
40
+ "normalizer_component"
41
  ],
42
  "components":[
43
  "tok2vec",
44
  "ner",
45
+ "count_extraction_component",
46
+ "normalizer_component"
47
  ],
48
  "disabled":[
49
 
normalizer_component.py ADDED
@@ -0,0 +1,78 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+
3
+ from spacy.language import Language
4
+ from spacy.matcher import PhraseMatcher
5
+
6
+ default_normalization_table = {
7
+ "Dolby Atmos": ["Dolby Atmos", "Dolby Audio Atmos", "Dolby Atmos Audio"],
8
+ "Ethernet": [
9
+ "Ethernet LAN",
10
+ "Ethernet port RJ-45",
11
+ "Ethernet RJ-45",
12
+ "Ethernet RJ45",
13
+ "Ethernet-LAN RJ-45",
14
+ "LAN RJ45",
15
+ "Ethernet R45",
16
+ ],
17
+ "CI+ Slot": [
18
+ "CI+ Card Slot",
19
+ "Common Interface Plus (CI+)",
20
+ "Common Interface Plus",
21
+ "Card Slot CI +",
22
+ ],
23
+ "Scart": ["SCART", "Scart Input"],
24
+ "Component In": [
25
+ "Component In",
26
+ "Component in(YPbPr)",
27
+ "Component Input",
28
+ "Component (Y/Pb/Pr)",
29
+ "Component In (Y/Pb/Pr)",
30
+ ],
31
+ "USB 2.0": ["USB2.0"],
32
+ "Digital Audio": [
33
+ "Digital Audio Out",
34
+ "Digital Audio Output",
35
+ "Digital Audio Output(Coaxial and Optic)",
36
+ ],
37
+ "Composite In": ["Composite", "AV Composite In"],
38
+ "3.5mm Headphone jack": ["3.5mm Headphone jack", "Headphone 3.5mm jack"],
39
+ "Optical Audio Out": ["Optical Audio Out", "Optical Out"],
40
+ "Android": ["ANDROID"],
41
+ "Android 7.1": ["Android Nougat"],
42
+ "Google TV": ["GoogleTV", "Google LED TV", "Google miniLED TV", "Google OLED TV"],
43
+ "VIDAA U4": ["VIDAA U4.0"],
44
+ "Android TV": ["Android TV", "AndroidTV", "Android"],
45
+ "Titan OS": ["TITAN OS"],
46
+ "7680x4320": ["8K"],
47
+ "3840x2160": ["4K", "4K UltraHD", "4K Ultra HD", "UltraHD", "Ultra HD"],
48
+ "1920x1080": ["FullHD", "Full HD"],
49
+ "1366x768": ["HD Ready", "HDReady"],
50
+ "1280x720": ["HD"],
51
+ "640x480": ["SD"],
52
+ "Wifi": ["Wifi", "Wi-Fi", "Wifi built in", "built in Wifi", "WiFi integrated"],
53
+ "BLUETOOTH": ["BLUETOOTH", "Blutooth"],
54
+ }
55
+
56
+
57
@Language.factory("normalizer_component")
class NormalizerComponent(object):
    """Pipeline component that rewrites entity text to a canonical name.

    Every alias listed in the normalization table is registered with a
    case-insensitive ``PhraseMatcher`` under its canonical name. When the
    component runs, each entity's ``ent._.text`` value is matched against
    those aliases and, on a hit, replaced by the canonical name.

    NOTE(review): this relies on a custom ``text`` extension being registered
    on spans before this component runs; it is not registered in this file.
    Presumably an earlier component does it -- TODO confirm.
    """

    def __init__(self, nlp, name, norm_file=None):
        """Build the alias matcher from the built-in normalization table.

        Args:
            nlp: The pipeline object. Its vocab backs the matcher and its
                ``make_doc`` tokenizes both the aliases and the entity text.
            name: Component instance name passed in by the factory mechanism.
            norm_file: Accepted so existing configs keep loading, but
                currently ignored; ``default_normalization_table`` is always
                used.
        """
        self.nlp = nlp
        self.norm_table = default_normalization_table
        # attr="LOWER" makes alias matching case-insensitive.
        self.matcher = PhraseMatcher(nlp.vocab, attr="LOWER")

        # The loop variable is deliberately not called ``name`` so that the
        # ``name`` parameter is not shadowed.
        for canonical, aliases in self.norm_table.items():
            self.matcher.add(canonical, [nlp.make_doc(alias) for alias in aliases])

    def __call__(self, doc):
        """Normalize ``ent._.text`` for every entity in ``doc``.

        Args:
            doc: The document being processed by the pipeline.

        Returns:
            The same ``doc``; entities whose text matched an alias have
            ``ent._.text`` set to the canonical name, others are untouched.
        """
        vocab_strings = self.nlp.vocab.strings
        for ent in doc.ents:
            matches = self.matcher(self.nlp.make_doc(ent._.text))
            if not matches:
                continue
            # Short aliases can sit inside longer ones (e.g. "HD" inside
            # "Full HD"), so prefer the match covering the most tokens rather
            # than whichever match the matcher happens to yield last.
            match_id, _start, _end = max(matches, key=lambda m: m[2] - m[1])
            # Look the key up through self.nlp: there is no module-level
            # ``nlp`` here, so the bare name raised NameError on any match.
            ent._.text = vocab_strings[match_id].strip()
        return doc
vocab/strings.json CHANGED
@@ -3,6 +3,10 @@
3
  "\n",
4
  " ",
5
  " ",
 
 
 
 
6
  "\"",
7
  "%",
8
  "&",
@@ -167,6 +171,7 @@
167
  "126",
168
  "127",
169
  "1280",
 
170
  "12a.m",
171
  "12a.m.",
172
  "12p.m",
@@ -275,6 +280,8 @@
275
  "3.0",
276
  "3.1.2",
277
  "3.5",
 
 
278
  "3.840",
279
  "30\"-100",
280
  "300",
@@ -340,6 +347,7 @@
340
  "43pus8507/12",
341
  "4500",
342
  "48",
 
343
  "4800",
344
  "4CH",
345
  "4Hz",
@@ -421,6 +429,7 @@
421
  "60",
422
  "600",
423
  "632",
 
424
  "65",
425
  "65OLED707/12",
426
  "65OLED907/12",
@@ -472,6 +481,7 @@
472
  "75pus8807/12",
473
  "768",
474
  "7680",
 
475
  "768p",
476
  "77",
477
  "7a.m",
@@ -660,6 +670,8 @@
660
  "Ambilight+hue",
661
  "Amlogic",
662
  "Android",
 
 
663
  "AndroidTV",
664
  "Antenna",
665
  "Anyview",
@@ -714,6 +726,7 @@
714
  "CHROMECAST",
715
  "CI",
716
  "CI+",
 
717
  "CMP",
718
  "COLOR",
719
  "CONNECTION",
@@ -752,7 +765,9 @@
752
  "Comfort",
753
  "Common",
754
  "Component",
 
755
  "Composite",
 
756
  "Computer",
757
  "Conn",
758
  "Conn.",
@@ -803,6 +818,7 @@
803
  "Design",
804
  "Did",
805
  "Digital",
 
806
  "Diming",
807
  "Dimming",
808
  "Dinamic",
@@ -813,6 +829,7 @@
813
  "Doin'",
814
  "Doin\u2019",
815
  "Dolby",
 
816
  "Dot",
817
  "Dots",
818
  "Dr",
@@ -880,6 +897,7 @@
880
  "Goin\u2019",
881
  "Gon",
882
  "Google",
 
883
  "GoogleTV",
884
  "Got",
885
  "Gov",
@@ -1156,6 +1174,7 @@
1156
  "Operating",
1157
  "Optic",
1158
  "Optical",
 
1159
  "Ore",
1160
  "Ore.",
1161
  "Oregon",
@@ -1267,6 +1286,7 @@
1267
  "S506",
1268
  "S905w",
1269
  "SCART",
 
1270
  "SK",
1271
  "SMART",
1272
  "SOFTWARE_FEATURE",
@@ -1343,6 +1363,7 @@
1343
  "This\u2019s",
1344
  "Those",
1345
  "Titan",
 
1346
  "Total",
1347
  "Triluminos",
1348
  "Triple",
@@ -1365,6 +1386,7 @@
1365
  "ULTRAHD",
1366
  "URE",
1367
  "USB",
 
1368
  "USB2.0",
1369
  "Ultimate",
1370
  "Ultra",
@@ -1380,6 +1402,7 @@
1380
  "VGA(PC",
1381
  "VIA",
1382
  "VIDAA",
 
1383
  "VIDEO_FEATURE",
1384
  "VRR",
1385
  "VTE",
@@ -1448,13 +1471,16 @@
1448
  "XR400",
1449
  "XX",
1450
  "XX+",
 
1451
  "XX-dd",
1452
  "XXH",
1453
  "XXX",
 
1454
  "XXX(XX",
1455
  "XXX+",
1456
  "XXX-",
1457
  "XXXX",
 
1458
  "XXXX+",
1459
  "XXXX_XXXX",
1460
  "XXXXd",
@@ -1507,6 +1533,11 @@
1507
  "XxxxXxxxx",
1508
  "XxxxdddXXxd",
1509
  "Xxxxx",
 
 
 
 
 
1510
  "Xxxxx&Xxxxx",
1511
  "Xxxxx'",
1512
  "Xxxxx'x",
@@ -1601,6 +1632,8 @@
1601
  "and",
1602
  "and/or",
1603
  "android",
 
 
1604
  "androidtv",
1605
  "ano",
1606
  "ans",
@@ -1692,6 +1725,7 @@
1692
  "chromecast",
1693
  "ci",
1694
  "ci+",
 
1695
  "cks",
1696
  "clarity",
1697
  "clariy",
@@ -1712,7 +1746,9 @@
1712
  "comfort",
1713
  "common",
1714
  "component",
 
1715
  "composite",
 
1716
  "computer",
1717
  "conn",
1718
  "conn.",
@@ -1759,6 +1795,7 @@
1759
  "d.dXX",
1760
  "d.ddd",
1761
  "d.dxx",
 
1762
  "d.x",
1763
  "dTV",
1764
  "dX",
@@ -1793,6 +1830,7 @@
1793
  "ddddxddd",
1794
  "ddddxdddd",
1795
  "dddx",
 
1796
  "dddxx",
1797
  "dddxxddd",
1798
  "dddxxx",
@@ -1817,6 +1855,7 @@
1817
  "dex",
1818
  "did",
1819
  "digital",
 
1820
  "diming",
1821
  "dimming",
1822
  "dinamic",
@@ -1835,6 +1874,7 @@
1835
  "doing",
1836
  "doin\u2019",
1837
  "dolby",
 
1838
  "dot",
1839
  "dots",
1840
  "dr",
@@ -1959,6 +1999,7 @@
1959
  "gon",
1960
  "gonna",
1961
  "google",
 
1962
  "googletv",
1963
  "got",
1964
  "gov",
@@ -2330,6 +2371,7 @@
2330
  "operating",
2331
  "optic",
2332
  "optical",
 
2333
  "option",
2334
  "or",
2335
  "ore",
@@ -2488,6 +2530,7 @@
2488
  "scaling",
2489
  "scart",
2490
  "screen",
 
2491
  "sen",
2492
  "sen.",
2493
  "sep",
@@ -2575,6 +2618,7 @@
2575
  "those",
2576
  "tic",
2577
  "titan",
 
2578
  "titanium",
2579
  "to",
2580
  "tor",
@@ -2623,6 +2667,7 @@
2623
  "urn",
2624
  "us",
2625
  "usb",
 
2626
  "usb2.0",
2627
  "use",
2628
  "ust",
@@ -2644,6 +2689,7 @@
2644
  "vga(pc",
2645
  "via",
2646
  "vidaa",
 
2647
  "video",
2648
  "video_feature",
2649
  "videos",
 
3
  "\n",
4
  " ",
5
  " ",
6
+ " In",
7
+ " OS",
8
+ " TV",
9
+ " U4",
10
  "\"",
11
  "%",
12
  "&",
 
171
  "126",
172
  "127",
173
  "1280",
174
+ "1280x720",
175
  "12a.m",
176
  "12a.m.",
177
  "12p.m",
 
280
  "3.0",
281
  "3.1.2",
282
  "3.5",
283
+ "3.5mm Headphone jack",
284
+ "3.5mm headphone jack",
285
  "3.840",
286
  "30\"-100",
287
  "300",
 
347
  "43pus8507/12",
348
  "4500",
349
  "48",
350
+ "480",
351
  "4800",
352
  "4CH",
353
  "4Hz",
 
429
  "60",
430
  "600",
431
  "632",
432
+ "640x480",
433
  "65",
434
  "65OLED707/12",
435
  "65OLED907/12",
 
481
  "75pus8807/12",
482
  "768",
483
  "7680",
484
+ "7680x4320",
485
  "768p",
486
  "77",
487
  "7a.m",
 
670
  "Ambilight+hue",
671
  "Amlogic",
672
  "Android",
673
+ "Android 7.1",
674
+ "Android TV",
675
  "AndroidTV",
676
  "Antenna",
677
  "Anyview",
 
726
  "CHROMECAST",
727
  "CI",
728
  "CI+",
729
+ "CI+ Slot",
730
  "CMP",
731
  "COLOR",
732
  "CONNECTION",
 
765
  "Comfort",
766
  "Common",
767
  "Component",
768
+ "Component In",
769
  "Composite",
770
+ "Composite In",
771
  "Computer",
772
  "Conn",
773
  "Conn.",
 
818
  "Design",
819
  "Did",
820
  "Digital",
821
+ "Digital Audio",
822
  "Diming",
823
  "Dimming",
824
  "Dinamic",
 
829
  "Doin'",
830
  "Doin\u2019",
831
  "Dolby",
832
+ "Dolby Atmos",
833
  "Dot",
834
  "Dots",
835
  "Dr",
 
897
  "Goin\u2019",
898
  "Gon",
899
  "Google",
900
+ "Google TV",
901
  "GoogleTV",
902
  "Got",
903
  "Gov",
 
1174
  "Operating",
1175
  "Optic",
1176
  "Optical",
1177
+ "Optical Audio Out",
1178
  "Ore",
1179
  "Ore.",
1180
  "Oregon",
 
1286
  "S506",
1287
  "S905w",
1288
  "SCART",
1289
+ "SD",
1290
  "SK",
1291
  "SMART",
1292
  "SOFTWARE_FEATURE",
 
1363
  "This\u2019s",
1364
  "Those",
1365
  "Titan",
1366
+ "Titan OS",
1367
  "Total",
1368
  "Triluminos",
1369
  "Triple",
 
1386
  "ULTRAHD",
1387
  "URE",
1388
  "USB",
1389
+ "USB 2.0",
1390
  "USB2.0",
1391
  "Ultimate",
1392
  "Ultra",
 
1402
  "VGA(PC",
1403
  "VIA",
1404
  "VIDAA",
1405
+ "VIDAA U4",
1406
  "VIDEO_FEATURE",
1407
  "VRR",
1408
  "VTE",
 
1471
  "XR400",
1472
  "XX",
1473
  "XX+",
1474
+ "XX+ Xxxx",
1475
  "XX-dd",
1476
  "XXH",
1477
  "XXX",
1478
+ "XXX d.d",
1479
  "XXX(XX",
1480
  "XXX+",
1481
  "XXX-",
1482
  "XXXX",
1483
+ "XXXX Xd",
1484
  "XXXX+",
1485
  "XXXX_XXXX",
1486
  "XXXXd",
 
1533
  "XxxxXxxxx",
1534
  "XxxxdddXXxd",
1535
  "Xxxxx",
1536
+ "Xxxxx XX",
1537
+ "Xxxxx Xx",
1538
+ "Xxxxx Xxxxx",
1539
+ "Xxxxx Xxxxx Xxx",
1540
+ "Xxxxx d.d",
1541
  "Xxxxx&Xxxxx",
1542
  "Xxxxx'",
1543
  "Xxxxx'x",
 
1632
  "and",
1633
  "and/or",
1634
  "android",
1635
+ "android 7.1",
1636
+ "android tv",
1637
  "androidtv",
1638
  "ano",
1639
  "ans",
 
1725
  "chromecast",
1726
  "ci",
1727
  "ci+",
1728
+ "ci+ slot",
1729
  "cks",
1730
  "clarity",
1731
  "clariy",
 
1746
  "comfort",
1747
  "common",
1748
  "component",
1749
+ "component in",
1750
  "composite",
1751
+ "composite in",
1752
  "computer",
1753
  "conn",
1754
  "conn.",
 
1795
  "d.dXX",
1796
  "d.ddd",
1797
  "d.dxx",
1798
+ "d.dxx Xxxxx xxxx",
1799
  "d.x",
1800
  "dTV",
1801
  "dX",
 
1830
  "ddddxddd",
1831
  "ddddxdddd",
1832
  "dddx",
1833
+ "dddxddd",
1834
  "dddxx",
1835
  "dddxxddd",
1836
  "dddxxx",
 
1855
  "dex",
1856
  "did",
1857
  "digital",
1858
+ "digital audio",
1859
  "diming",
1860
  "dimming",
1861
  "dinamic",
 
1874
  "doing",
1875
  "doin\u2019",
1876
  "dolby",
1877
+ "dolby atmos",
1878
  "dot",
1879
  "dots",
1880
  "dr",
 
1999
  "gon",
2000
  "gonna",
2001
  "google",
2002
+ "google tv",
2003
  "googletv",
2004
  "got",
2005
  "gov",
 
2371
  "operating",
2372
  "optic",
2373
  "optical",
2374
+ "optical audio out",
2375
  "option",
2376
  "or",
2377
  "ore",
 
2530
  "scaling",
2531
  "scart",
2532
  "screen",
2533
+ "sd",
2534
  "sen",
2535
  "sen.",
2536
  "sep",
 
2618
  "those",
2619
  "tic",
2620
  "titan",
2621
+ "titan os",
2622
  "titanium",
2623
  "to",
2624
  "tor",
 
2667
  "urn",
2668
  "us",
2669
  "usb",
2670
+ "usb 2.0",
2671
  "usb2.0",
2672
  "use",
2673
  "ust",
 
2689
  "vga(pc",
2690
  "via",
2691
  "vidaa",
2692
+ "vidaa u4",
2693
  "video",
2694
  "video_feature",
2695
  "videos",