oh201516 committed on
Commit
78bb401
1 Parent(s): 7ee8264

Update spaCy pipeline

Browse files
count_extraction_component.py ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from spacy.tokens import Doc, Span, Token
2
+ from spacy.matcher import PhraseMatcher
3
+ from spacy.util import filter_spans
4
+ from spacy.language import Language
5
+ import re
6
+
7
+ # https://spacy.io/usage/processing-pipelines#custom-components
8
@Language.factory("count_extraction")
class CountExtractorComponent(object):
    """spaCy pipeline component that splits a quantity off entity text.

    For every entity whose label equals ``label``, the entity text is matched
    against three "count x name" notations and the result is stored in the
    custom span attributes ``ent._.text`` (the name without the count) and
    ``ent._.count`` (the integer count, ``1`` when no notation matches):

    * ``"2x cable"``  / ``"2 X cable"``  -> count on the left
    * ``"cable x2"``  / ``"cable X 2"``  -> count on the right
    * ``"cable 2x"``                     -> count on the right, "x" last

    Entities with any other label only get ``ent._.text`` (stripped text);
    their ``ent._.count`` keeps the extension default ``None``.
    """

    def __init__(self, nlp, name, label="CONNECTION"):
        """Compile the patterns and register the custom extensions.

        Args:
            nlp: The pipeline the component is added to (unused here, but
                part of the factory signature).
            name: The component instance name (unused here).
            label: Entity label whose spans are parsed for a count.
        """
        self.label = label
        self.reg_left = re.compile(r"^(?P<count>\d+)\s*[xX]\s*(?P<name>.+)$")
        self.reg_right = re.compile(r"^(?P<name>.+)\s*[xX]\s*(?P<count>\d+)$")
        # The name group is lazy on purpose: a greedy ".+" swallows every
        # digit of the count except the last one ("cable 12x" -> count 2).
        self.reg_right_inverted = re.compile(
            r"^(?P<name>.+?)\s*(?P<count>\d+)\s*[xX]$"
        )
        # NOTE(review): reg_left / reg_right accept an "x" that is part of a
        # word (e.g. "2 XLR" -> count 2, name "LR"). Confirm whether the NER
        # output can contain such spans before tightening the patterns.

        # force=True overwrites an existing registration, so creating the
        # component more than once does not raise.
        Span.set_extension("count", default=None, force=True)
        Span.set_extension("text", default=None, force=True)

        # The getter is bound to this instance; with force=True the most
        # recently constructed component decides which label is reported.
        Doc.set_extension("connections", getter=self.connections, force=True)

    def connections(self, doc):
        """Getter for ``doc._.connections``.

        Args:
            doc: The document whose entities are inspected.

        Returns:
            A list of the entity spans in ``doc.ents`` whose label equals
            ``self.label``, in document order.
        """
        return [ent for ent in doc.ents if ent.label_ == self.label]

    def __call__(self, doc):
        """Annotate ``doc.ents`` with ``_.text`` and ``_.count``.

        Args:
            doc: The document to process; its entities must already be set.

        Returns:
            The same ``doc`` object, as required for pipeline components.
        """
        # Tried in this order; the first pattern that matches wins.
        patterns = (self.reg_left, self.reg_right, self.reg_right_inverted)

        for ent in doc.ents:
            text = ent.text.strip()
            if ent.label_ != self.label:
                ent._.text = text
                continue

            for pattern in patterns:
                match = pattern.match(text)
                if match is not None:
                    ent._.text = match.group("name").strip()
                    ent._.count = int(match.group("count"))
                    break
            else:
                # No explicit quantity in the text: treat it as a single item.
                ent._.text = text
                ent._.count = 1
        return doc
en_setec_mk_tv-any-py3-none-any.whl CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:25ef56110fc9e2fbfbf16f3ef7291197337d6c1996ea079360bfea2d2426f725
3
- size 5705298
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:edda8abe91cb2274ca61da460b6ccc5f3f80d155d1dd7bae59914e179a475699
3
+ size 5706128