# -*- coding: utf-8 -*- """ Лемматизатор для R&D прототипирования NLP задач в Питоне 25.03.2020 добавлена ефикация в get_lemma2 05.04.2020 добавлено декодирование для частей речи CONJ, PART и PUNCT """ from __future__ import division from __future__ import print_function import os import pickle import pathlib import gzip def decode_pos(pos): if pos in [u'ДЕЕПРИЧАСТИЕ', u'ГЛАГОЛ', u'ИНФИНИТИВ']: return u'ГЛАГОЛ' else: return pos class Lemmatizer(object): def __init__(self): pass def load(self, dict_path=None): """ Загружаем модель лемматизации, созданную отдельным скриптом builder.py """ dict_filename = 'rulemma.dat' if dict_path is None: module_folder = str(pathlib.Path(__file__).resolve().parent) p = os.path.join(module_folder, '../tmp', dict_filename) if not os.path.exists(p): p = os.path.join(module_folder, dict_filename) else: p = dict_path with gzip.open(p, 'r') as f: self.forms, self.forms2, self.special_lemmas, self.key2transducer = pickle.load(f) def get_lemma(self, word): if word in self.forms: return self.forms[word] elif word in self.forms2: return self.forms2[word][0] elif word in self.special_lemmas: return self.special_lemmas[word] else: return word def decode_pos_tags(self, pos_tags): stags1 = [] part_of_speech = u'unk' short_tag_index = -1 for tag in pos_tags.split('|'): if tag == 'NOUN': part_of_speech = u'СУЩЕСТВИТЕЛЬНОЕ' elif tag == 'VERB': part_of_speech = u'ГЛАГОЛ' elif tag == 'ADJ': part_of_speech = u'ПРИЛАГАТЕЛЬНОЕ' stags1.append((u'КРАТКИЙ', u'0')) short_tag_index = 0 elif tag == 'ADV': part_of_speech = u'НАРЕЧИЕ' elif tag == 'PRON': part_of_speech = u'МЕСТОИМЕНИЕ' elif tag == 'ADP': part_of_speech = u'ПРЕДЛОГ' elif tag == 'CONJ': part_of_speech = u'СОЮЗ' elif tag == 'PART': part_of_speech = u'ЧАСТИЦА' elif tag == 'PUNCT': part_of_speech = u'ПУНКТУАТОР' elif '=' in tag: if part_of_speech == u'СУЩЕСТВИТЕЛЬНОЕ': if tag == u'Case=Nom': stags1.append((u'ПАДЕЖ', u'ИМ')) elif tag == u'Case=Acc': stags1.append((u'ПАДЕЖ', u'ВИН')) elif tag == u'Case=Dat': stags1.append((u'ПАДЕЖ', u'ДАТ')) elif tag == u'Case=Ins': stags1.append((u'ПАДЕЖ', u'ТВОР')) elif tag == u'Case=Prep': stags1.append((u'ПАДЕЖ', u'ПРЕДЛ')) elif tag == u'Case=Loc': stags1.append((u'ПАДЕЖ', u'ПРЕДЛ')) # 03-02-2020 u'МЕСТ' elif tag == u'Case=Gen': stags1.append((u'ПАДЕЖ', u'РОД')) elif tag == u'Case=Voc': stags1.append((u'ПАДЕЖ', u'ЗВАТ')) elif tag == u'Number=Sing': stags1.append((u'ЧИСЛО', u'ЕД')) elif tag == u'Number=Plur': stags1.append((u'ЧИСЛО', u'МН')) elif tag == u'Gender=Masc': stags1.append((u'РОД', u'МУЖ')) elif tag == u'Gender=Fem': stags1.append((u'РОД', u'ЖЕН')) elif tag == u'Gender=Neut': stags1.append((u'РОД', u'СР')) else: print(u'неизвестный тэг "{}"'.format(tag)) raise NotImplementedError() elif part_of_speech == u'ПРИЛАГАТЕЛЬНОЕ': if tag == u'Case=Nom': stags1.append((u'ПАДЕЖ', u'ИМ')) elif tag == u'Case=Acc': stags1.append((u'ПАДЕЖ', u'ВИН')) elif tag == u'Case=Dat': stags1.append((u'ПАДЕЖ', u'ДАТ')) elif tag == u'Case=Ins': stags1.append((u'ПАДЕЖ', u'ТВОР')) elif tag == u'Case=Prep': stags1.append((u'ПАДЕЖ', u'ПРЕДЛ')) elif tag == u'Case=Loc': stags1.append((u'ПАДЕЖ', u'ПРЕДЛ')) # 03-02-2020 u'МЕСТ' elif tag == u'Case=Gen': stags1.append((u'ПАДЕЖ', u'РОД')) elif tag == u'Number=Sing': stags1.append((u'ЧИСЛО', u'ЕД')) elif tag == u'Number=Plur': stags1.append((u'ЧИСЛО', u'МН')) elif tag == u'Gender=Masc': stags1.append((u'РОД', u'МУЖ')) elif tag == u'Gender=Fem': stags1.append((u'РОД', u'ЖЕН')) elif tag == u'Gender=Neut': stags1.append((u'РОД', u'СР')) elif tag == u'Degree=Cmp': stags1.append((u'СТЕПЕНЬ', u'СРАВН')) elif tag == u'Degree=Pos': stags1.append((u'СТЕПЕНЬ', u'АТРИБ')) elif tag in (u'Variant=Short', u'Variant=Brev'): stags1[short_tag_index] = (u'КРАТКИЙ', u'1') else: print(u'неизвестный тэг "{}"'.format(tag)) raise NotImplementedError() elif part_of_speech == u'ГЛАГОЛ': if tag == u'Number=Sing': stags1.append((u'ЧИСЛО', u'ЕД')) elif tag == u'Number=Plur': stags1.append((u'ЧИСЛО', u'МН')) elif tag == u'Gender=Masc': stags1.append((u'РОД', u'МУЖ')) elif tag == u'Gender=Fem': stags1.append((u'РОД', u'ЖЕН')) elif tag == u'Gender=Neut': stags1.append((u'РОД', u'СР')) elif tag == u'Mood=Ind': stags1.append((u'НАКЛОНЕНИЕ', u'ИЗЪЯВ')) elif tag == u'Mood=Imp': stags1.append((u'НАКЛОНЕНИЕ', u'ПОБУД')) elif tag == u'Tense=Past': stags1.append((u'ВРЕМЯ', u'ПРОШЕДШЕЕ')) elif tag == u'Tense=Fut': stags1.append((u'ВРЕМЯ', u'БУДУЩЕЕ')) elif tag == u'Tense=Notpast': stags1.append((u'ВРЕМЯ', u'НАСТОЯЩЕЕ')) elif tag == u'Tense=Pres': stags1.append((u'ВРЕМЯ', u'НАСТОЯЩЕЕ')) elif tag == u'Person=1': stags1.append((u'ЛИЦО', u'1')) elif tag == u'Person=2': stags1.append((u'ЛИЦО', u'2')) elif tag == u'Person=3': stags1.append((u'ЛИЦО', u'3')) elif tag == u'VerbForm=Fin': pass elif tag == u'VerbForm=Inf': pass elif tag == u'VerbForm=Conv': pass else: msg = u'неизвестный тэг "{}"'.format(tag) raise RuntimeError(msg) elif part_of_speech == u'НАРЕЧИЕ': if tag == u'Degree=Pos': stags1.append((u'СТЕПЕНЬ', u'АТРИБ')) elif tag == u'Degree=Cmp': stags1.append((u'СТЕПЕНЬ', u'СРАВН')) else: raise NotImplementedError() else: pass return part_of_speech, stags1 def get_lemma2(self, word, pos_tags): part_of_speech, decoded_tags = self.decode_pos_tags(pos_tags) nword = word.lower().replace('ё', 'е') if nword in self.special_lemmas: return self.special_lemmas[nword], part_of_speech, decoded_tags if nword in self.forms: lemma = self.forms[nword] return lemma, part_of_speech, decoded_tags elif nword in self.forms2: if part_of_speech == 'СУЩЕСТВИТЕЛЬНОЕ': # Для существительных учитываем падеж. required_case = None for tag in decoded_tags: if tag[0] == 'ПАДЕЖ': required_case = tag[1] break for lemma, lemma_part_of_speech, tag in self.forms2[nword]: if lemma_part_of_speech == part_of_speech and tag == required_case: return lemma, part_of_speech, decoded_tags else: for lemma, lemma_part_of_speech, tags in self.forms2[nword]: if lemma_part_of_speech == part_of_speech: return lemma, part_of_speech, decoded_tags elif len(word) > 4: # используем модель лемматизации для OV-слов ending = nword[-4:] key = ending + u'|' + part_of_speech if key in self.key2transducer: transducer = self.key2transducer[key] if transducer[0] > 0: lemma = word[:-transducer[0]] + transducer[1] else: lemma = word + transducer[1] return lemma.lower(), part_of_speech, decoded_tags # fallback-вариант - возвращаем исходное слово в нижнем регистре в качестве леммы return nword, part_of_speech, decoded_tags def lemmatize(self, tagged_words): """Для результата работы rupostagger'а добавляем лемму и извлеченный код части речи""" return [(word, tags,)+tuple(self.get_lemma2(word, tags)) for (word, tags) in tagged_words]