"""Thin wrapper around a spaCy pipeline for sentence splitting, tokenization,
named-entity extraction and keyword extraction."""
import spacy

# Must be a sequence of names: a bare string would be iterated character by
# character by `from <module> import *`.
__all__ = ['SpacyPipeline']

# Language code -> spaCy model name passed to `spacy.load`.
MODELS = {
    "en": "en_core_web_sm",
    "ja": "ja_core_news_sm",
    "zh": "zh_core_web_sm",
    "de": "de_core_news_sm",
    "es": "es_core_news_sm",
    "it": "it_core_news_sm",
    "ko": "ko_core_news_sm",
    "ru": "ru_core_news_sm",
    "fr": "fr_core_news_sm",
    "vi": "vi_core_news_lg",
}

# Every algorithm name that `SpacyPipeline.__init__` has a branch for.
VALID_METHODS = ['positionrank', 'textrank', 'biasedtextrank', 'yake', 'ner']


class SpacyPipeline:
    """spaCy pipeline with an optional keyword-extraction component.

    Attributes are set in ``__init__``:
        nlp: the loaded spaCy pipeline, with a "sentencizer" pipe added.
        algorithm: the requested keyword algorithm name, or ``None``.
        library: ``'spacy_ke'``, ``'pytextrank'`` or ``None`` depending on
            which extension package backs ``algorithm``.
    """

    def __init__(self, language, algorithm: str = None):
        """Load the model for ``language`` and attach the keyword component.

        Args:
            language: key into ``MODELS``; any value not found there falls
                back to the ``"vi_core_news_lg"`` model.
            algorithm: one of ``VALID_METHODS`` or ``None``. ``None`` and
                ``'ner'`` add no extra pipe.

        Raises:
            AssertionError: if ``algorithm`` is not in ``VALID_METHODS``.
            ValueError: if ``algorithm`` passes validation (or assertions are
                disabled) but no branch handles it.
        """
        model = MODELS.get(language, "vi_core_news_lg")
        self.nlp = spacy.load(model)
        self.nlp.add_pipe("sentencizer")
        self.algorithm = algorithm
        self.library = None

        # `None` and 'ner' rely only on the base pipeline.
        if self.algorithm is None or self.algorithm == 'ner':
            return

        assert algorithm in VALID_METHODS, f'invalid algorithm {algorithm}\n- valid list: {VALID_METHODS}'
        if self.algorithm == 'yake':
            import spacy_ke  # need to load yake
            self.nlp.add_pipe("yake")
            self.library = 'spacy_ke'
        elif self.algorithm in ['textrank', 'biasedtextrank', 'positionrank']:
            # Imported for its side effect before adding the pipe by name.
            import pytextrank
            self.nlp.add_pipe(algorithm)
            self.library = 'pytextrank'
        else:
            # Still reachable when assertions are stripped (`python -O`).
            raise ValueError(f'unknown algorithm: {self.algorithm}')

    def _get_keyword(self, output, original_document=None, n=None):
        """Extract keyword strings from a processed document.

        Args:
            output: the object returned by ``self.nlp(...)``.
            original_document: the raw input text; required unless the
                algorithm is ``'ner'``.
            n: maximum number of candidates to consider; required unless the
                algorithm is ``'ner'``.

        Returns:
            A list of strings. For ``'ner'`` these are all entities in
            ``output.ents``; otherwise they are the candidates that occur
            verbatim in ``original_document``.
        """
        if self.algorithm == 'ner':
            return [str(entity) for entity in output.ents]

        assert original_document is not None
        assert n is not None
        if self.library == 'spacy_ke':
            candidates = (str(term) for term, _score in output._.extract_keywords(n))
        else:
            candidates = (str(phrase.text) for phrase in output._.phrases[:n])
        # Keep only candidates that appear as-is in the source text.
        return [text for text in candidates if text in original_document]

    def sentence_keyword(self, string: str, n: int = 10) -> tuple:
        """Return ``(sentences, keywords)`` from a single pipeline pass."""
        doc = self.nlp(string)
        sentences = [str(sent) for sent in doc.sents if len(sent) > 0]
        keywords = self._get_keyword(doc, string, n)
        return sentences, keywords

    def sentence(self, string: str) -> list:
        """Split ``string`` into a list of non-empty sentence strings."""
        return [str(sent) for sent in self.nlp(string).sents if len(sent) > 0]

    def token(self, string: str) -> list:
        """Tokenize ``string`` using only the pipeline's tokenizer."""
        return [str(tok) for tok in self.nlp.tokenizer(string)]

    def keyword(self, string: str, n: int = 10) -> list:
        """Return keyword strings for ``string`` (see ``_get_keyword``)."""
        return self._get_keyword(self.nlp(string), string, n)

    def ner(self, string: str, n: int = None):
        """Return the entities found in ``string``, sliced to ``n`` if given.

        The elements are whatever ``doc.ents`` holds (not converted to str).
        """
        entities = self.nlp(string).ents
        if n is None:
            return entities
        # Slicing already clamps to the sequence length.
        return entities[:n]

    @property
    def language(self):
        """Language code reported by the loaded pipeline (``nlp.lang``)."""
        return self.nlp.lang