Update spaCy pipeline
Browse files
count_extraction_component.py
ADDED
@@ -0,0 +1,52 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from spacy.tokens import Doc, Span, Token
|
2 |
+
from spacy.matcher import PhraseMatcher
|
3 |
+
from spacy.util import filter_spans
|
4 |
+
from spacy.language import Language
|
5 |
+
import re
|
6 |
+
|
7 |
+
# https://spacy.io/usage/processing-pipelines#custom-components
|
8 |
+
@Language.factory("count_extraction")
class CountExtractorComponent:
    """Pipeline component that separates a quantity marker from entity text.

    For every entity whose label equals ``label`` the component looks for a
    count written as ``<n>x <name>``, ``<name> x<n>`` or ``<name> <n>x``
    (``x`` is case-insensitive, surrounding whitespace is optional) and
    stores the result in two custom ``Span`` extensions:

    * ``span._.text``  -- the entity text with the count marker removed
    * ``span._.count`` -- the parsed count, or ``1`` if no marker was found

    Entities with any other label only get ``span._.text`` (their stripped
    text); their ``span._.count`` is left untouched.

    A ``doc._.connections`` getter is registered as well; see
    :meth:`connections`.
    """

    def __init__(self, nlp, name, label="CONNECTION"):
        """Compile the count patterns and register the custom extensions.

        Args:
            nlp: The pipeline object passed in by the spaCy factory (unused).
            name: The component instance name passed in by spaCy (unused).
            label: Entity label whose spans should be scanned for a count.
        """
        self.label = label
        # "<count> x <name>", e.g. "2x HDMI"
        self.reg_left = re.compile(r"^(?P<count>\d+)\s*[xX]\s*(?P<name>.+)$")
        # "<name> x <count>", e.g. "HDMI x2"
        self.reg_right = re.compile(r"^(?P<name>.+)\s*[xX]\s*(?P<count>\d+)$")
        # "<name> <count> x", e.g. "HDMI 2x". The name group is lazy so that
        # it cannot swallow the leading digits of a multi-digit count
        # ("HDMI 12x" must yield 12, not name "HDMI 1" with count 2).
        self.reg_right_inverted = re.compile(
            r"^(?P<name>.+?)\s*(?P<count>\d+)\s*[xX]$"
        )

        # force=True keeps re-creating the component (e.g. reloading the
        # pipeline in the same process) from failing on an existing extension.
        Span.set_extension("count", default=None, force=True)
        Span.set_extension("text", default=None, force=True)

        Doc.set_extension("connections", getter=self.connections, force=True)

    def connections(self, doc):
        """Return the entities of ``doc`` that carry this component's label.

        Used as the getter behind ``doc._.connections``.

        NOTE(review): the original class referenced ``self.connections``
        without defining it, which made construction fail. Returning the
        matching entity spans is the conservative choice -- confirm the
        shape callers expect.
        """
        return [ent for ent in doc.ents if ent.label_ == self.label]

    def __call__(self, doc):
        """Annotate the entities of ``doc`` with ``_.text`` and ``_.count``.

        Args:
            doc: The document being processed.

        Returns:
            The same ``doc``, with the extensions filled in on its entities.
        """
        # Order matters: the first pattern that matches wins.
        patterns = (self.reg_left, self.reg_right, self.reg_right_inverted)

        for ent in doc.ents:
            text = ent.text.strip()
            if ent.label_ != self.label:
                ent._.text = text
                continue

            for pattern in patterns:
                found = pattern.match(text)
                if found is not None:
                    ent._.text = found.group("name").strip()
                    ent._.count = int(found.group("count"))
                    break
            else:
                # No count marker present: a single item is implied.
                ent._.text = text
                ent._.count = 1
        return doc
|
en_setec_mk_tv-any-py3-none-any.whl
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:edda8abe91cb2274ca61da460b6ccc5f3f80d155d1dd7bae59914e179a475699
|
3 |
+
size 5706128
|