import spacy
__all__ = ('SpacyPipeline',)
MODELS = {
    "en": "en_core_web_sm",
    "ja": "ja_core_news_sm",
    "zh": "zh_core_web_sm",
    "de": "de_core_news_sm",
    "es": "es_core_news_sm",
    "it": "it_core_news_sm",
    "ko": "ko_core_news_sm",
    "ru": "ru_core_news_sm",
    "fr": "fr_core_news_sm",
    "vi": "vi_core_news_lg"
}
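# NOTE (assumption): each entry above is a separately installed spaCy model package,
# e.g. `python -m spacy download en_core_web_sm`; spacy.load() raises OSError if the
# requested model is not available in the environment.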
VALID_METHODS = ['yake', 'textrank', 'biasedtextrank', 'positionrank', 'ner']


class SpacyPipeline:

    def __init__(self, language, algorithm: str = None):
        # Fall back to the Vietnamese model for language codes not listed in MODELS.
        model = "vi_core_news_lg" if language not in MODELS else MODELS[language]
        self.nlp = spacy.load(model)
        self.nlp.add_pipe("sentencizer")
        self.algorithm = algorithm
        self.library = None
        if self.algorithm is not None and self.algorithm != 'ner':
            assert algorithm in VALID_METHODS, f'invalid algorithm {algorithm}\n- valid list: {VALID_METHODS}'
            if self.algorithm == 'yake':
                import spacy_ke  # noqa: F401 (import registers the "yake" pipeline factory)
                self.nlp.add_pipe("yake")
                self.library = 'spacy_ke'
            elif self.algorithm in ['textrank', 'biasedtextrank', 'positionrank']:
                import pytextrank  # noqa: F401 (import registers the pytextrank pipeline factories)
                self.nlp.add_pipe(algorithm)
                self.library = 'pytextrank'
            else:
                raise ValueError(f'unknown algorithm: {self.algorithm}')

    def _get_keyword(self, output, original_document=None, n=None):
        if self.algorithm == 'ner':
            return [str(i) for i in output.ents]
        assert original_document is not None
        assert n is not None
        if self.library == 'spacy_ke':
            return [str(term) for term, score in output._.extract_keywords(n) if str(term) in original_document]
        # pytextrank: top-n ranked phrases that appear verbatim in the original document
        return [str(i.text) for i in output._.phrases[:n] if str(i.text) in original_document]

    def sentence_keyword(self, string: str, n: int = 10):
        out = self.nlp(string)
        sentence = [str(i) for i in out.sents if len(i) > 0]
        keyword = self._get_keyword(out, string, n)
        return sentence, keyword

    def sentence(self, string: str):
        return [str(i) for i in self.nlp(string).sents if len(i) > 0]

    def token(self, string: str):
        return [str(i) for i in self.nlp.tokenizer(string)]

    def keyword(self, string: str, n: int = 10):
        return self._get_keyword(self.nlp(string), string, n)

    def ner(self, string: str, n: int = None):
        keywords = self.nlp(string).ents
        return keywords[:n] if n is not None else keywords

    @property
    def language(self):
        return self.nlp.lang
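

if __name__ == "__main__":
    # Minimal usage sketch (illustrative only, not part of the module): assumes the
    # English model and pytextrank are installed locally, e.g.
    #   pip install pytextrank && python -m spacy download en_core_web_sm
    pipeline = SpacyPipeline(language="en", algorithm="positionrank")
    text = "spaCy is an open-source library for advanced natural language processing in Python."
    sentences, keywords = pipeline.sentence_keyword(text, n=5)
    print(sentences)  # list of sentence strings
    print(keywords)   # up to 5 ranked key phrases found verbatim in the text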