# NOTE: the following lines are page chrome captured with the file, not module code.
# Den4ikAI's picture
# Upload 66 files
# 153c03b verified
# raw
# history blame
# 11.2 kB
# -*- coding: utf-8 -*-
"""
Лемматизатор для R&D прототипирования NLP задач в Питоне
25.03.2020 добавлена ефикация в get_lemma2
05.04.2020 добавлено декодирование для частей речи CONJ, PART и PUNCT
"""
from __future__ import division
from __future__ import print_function
import os
import pickle
import pathlib
import gzip
def decode_pos(pos):
    """Collapse verb-like part-of-speech labels into a single verb label.

    The labels u'ДЕЕПРИЧАСТИЕ', u'ГЛАГОЛ' and u'ИНФИНИТИВ' all map to
    u'ГЛАГОЛ'; any other value is returned unchanged.
    """
    verb_like_labels = (u'ДЕЕПРИЧАСТИЕ', u'ГЛАГОЛ', u'ИНФИНИТИВ')
    return u'ГЛАГОЛ' if pos in verb_like_labels else pos
class Lemmatizer(object):
    """Dictionary-based lemmatizer that uses part-of-speech tags.

    After ``load()`` the instance holds four lookup structures:

    * ``forms`` - word form -> lemma (unambiguous forms);
    * ``forms2`` - word form -> list of ``(lemma, part_of_speech, tag)``
      triples (ambiguous forms; for nouns ``tag`` is compared with the case);
    * ``special_lemmas`` - word form -> lemma, checked first in ``get_lemma2``;
    * ``key2transducer`` - ``"<last 4 chars>|<part of speech>"`` -> pair
      ``(chars_to_strip, suffix_to_append)`` used for out-of-vocabulary words.
    """

    # Part-of-speech tag -> part-of-speech label used by the dictionary.
    _POS_NAMES = {
        'NOUN': u'СУЩЕСТВИТЕЛЬНОЕ',
        'VERB': u'ГЛАГОЛ',
        'ADJ': u'ПРИЛАГАТЕЛЬНОЕ',
        'ADV': u'НАРЕЧИЕ',
        'PRON': u'МЕСТОИМЕНИЕ',
        'ADP': u'ПРЕДЛОГ',
        'CONJ': u'СОЮЗ',
        'PART': u'ЧАСТИЦА',
        'PUNCT': u'ПУНКТУАТОР',
    }

    # Feature groups shared by several parts of speech.
    _NUMBER_GENDER_FEATURES = {
        u'Number=Sing': (u'ЧИСЛО', u'ЕД'),
        u'Number=Plur': (u'ЧИСЛО', u'МН'),
        u'Gender=Masc': (u'РОД', u'МУЖ'),
        u'Gender=Fem': (u'РОД', u'ЖЕН'),
        u'Gender=Neut': (u'РОД', u'СР'),
    }
    _CASE_FEATURES = {
        u'Case=Nom': (u'ПАДЕЖ', u'ИМ'),
        u'Case=Acc': (u'ПАДЕЖ', u'ВИН'),
        u'Case=Dat': (u'ПАДЕЖ', u'ДАТ'),
        u'Case=Ins': (u'ПАДЕЖ', u'ТВОР'),
        u'Case=Prep': (u'ПАДЕЖ', u'ПРЕДЛ'),
        # Locative is folded into the prepositional case
        # (it was u'МЕСТ' before 03-02-2020).
        u'Case=Loc': (u'ПАДЕЖ', u'ПРЕДЛ'),
        u'Case=Gen': (u'ПАДЕЖ', u'РОД'),
    }
    _DEGREE_FEATURES = {
        u'Degree=Pos': (u'СТЕПЕНЬ', u'АТРИБ'),
        u'Degree=Cmp': (u'СТЕПЕНЬ', u'СРАВН'),
    }

    _NOUN_FEATURES = {}
    _NOUN_FEATURES.update(_CASE_FEATURES)
    _NOUN_FEATURES.update(_NUMBER_GENDER_FEATURES)
    _NOUN_FEATURES[u'Case=Voc'] = (u'ПАДЕЖ', u'ЗВАТ')

    # Variant=Short / Variant=Brev are handled separately because they
    # overwrite the placeholder tag instead of appending a new one.
    _ADJ_FEATURES = {}
    _ADJ_FEATURES.update(_CASE_FEATURES)
    _ADJ_FEATURES.update(_NUMBER_GENDER_FEATURES)
    _ADJ_FEATURES.update(_DEGREE_FEATURES)

    # A value of None marks a tag that is recognised but produces no output.
    _VERB_FEATURES = {
        u'Mood=Ind': (u'НАКЛОНЕНИЕ', u'ИЗЪЯВ'),
        u'Mood=Imp': (u'НАКЛОНЕНИЕ', u'ПОБУД'),
        u'Tense=Past': (u'ВРЕМЯ', u'ПРОШЕДШЕЕ'),
        u'Tense=Fut': (u'ВРЕМЯ', u'БУДУЩЕЕ'),
        u'Tense=Notpast': (u'ВРЕМЯ', u'НАСТОЯЩЕЕ'),
        u'Tense=Pres': (u'ВРЕМЯ', u'НАСТОЯЩЕЕ'),
        u'Person=1': (u'ЛИЦО', u'1'),
        u'Person=2': (u'ЛИЦО', u'2'),
        u'Person=3': (u'ЛИЦО', u'3'),
        u'VerbForm=Fin': None,
        u'VerbForm=Inf': None,
        u'VerbForm=Conv': None,
    }
    _VERB_FEATURES.update(_NUMBER_GENDER_FEATURES)

    # Parts of speech that are absent here have their features ignored.
    _FEATURES_BY_POS = {
        u'СУЩЕСТВИТЕЛЬНОЕ': _NOUN_FEATURES,
        u'ПРИЛАГАТЕЛЬНОЕ': _ADJ_FEATURES,
        u'ГЛАГОЛ': _VERB_FEATURES,
        u'НАРЕЧИЕ': _DEGREE_FEATURES,
    }

    def __init__(self):
        """Create an empty lemmatizer; call ``load()`` before using it."""
        pass

    def load(self, dict_path=None):
        """Load the lemmatization model produced by the separate builder.py script.

        :param dict_path: path to the gzip-compressed pickle file. When None,
            ``rulemma.dat`` is looked up in ``../tmp`` relative to this
            module's folder and, if it is not there, in the module's folder.
        """
        dict_filename = 'rulemma.dat'
        if dict_path is None:
            module_folder = str(pathlib.Path(__file__).resolve().parent)
            p = os.path.join(module_folder, '../tmp', dict_filename)
            if not os.path.exists(p):
                p = os.path.join(module_folder, dict_filename)
        else:
            p = dict_path

        # SECURITY: pickle can execute arbitrary code while loading;
        # only load dictionary files that come from a trusted source.
        with gzip.open(p, 'r') as f:
            self.forms, self.forms2, self.special_lemmas, self.key2transducer = pickle.load(f)

    def get_lemma(self, word):
        """Return the lemma of ``word`` without using part-of-speech tags.

        The word is looked up as given (no case folding) in ``forms``, then
        ``forms2`` (the first listed variant wins), then ``special_lemmas``.
        If it is found nowhere, the word itself is returned.
        """
        if word in self.forms:
            return self.forms[word]
        elif word in self.forms2:
            # forms2 stores (lemma, part_of_speech, tag) triples; return the
            # lemma string of the first variant rather than the whole triple.
            return self.forms2[word][0][0]
        elif word in self.special_lemmas:
            return self.special_lemmas[word]
        else:
            return word

    def decode_pos_tags(self, pos_tags):
        """Decode a '|'-separated tag string into a part of speech and tags.

        :param pos_tags: string such as ``'NOUN|Case=Nom|Number=Sing'``.
        :return: tuple ``(part_of_speech, tags)`` where ``part_of_speech`` is
            the decoded label (``u'unk'`` if no known part-of-speech tag was
            seen) and ``tags`` is a list of ``(name, value)`` pairs. For
            adjectives the list always contains a ``(u'КРАТКИЙ', ...)`` pair.
        :raises NotImplementedError: unknown feature for a noun, adjective
            or adverb.
        :raises RuntimeError: unknown feature for a verb.
        """
        stags1 = []
        part_of_speech = u'unk'
        short_tag_index = -1

        for tag in pos_tags.split('|'):
            if tag in self._POS_NAMES:
                part_of_speech = self._POS_NAMES[tag]
                if tag == 'ADJ':
                    # Remember where the "short form" placeholder lives so
                    # that Variant=Short/Brev overwrites exactly this entry.
                    short_tag_index = len(stags1)
                    stags1.append((u'КРАТКИЙ', u'0'))
            elif '=' in tag:
                features = self._FEATURES_BY_POS.get(part_of_speech)
                if features is None:
                    # Features of other parts of speech are not decoded.
                    continue

                if (part_of_speech == u'ПРИЛАГАТЕЛЬНОЕ'
                        and tag in (u'Variant=Short', u'Variant=Brev')):
                    stags1[short_tag_index] = (u'КРАТКИЙ', u'1')
                elif tag in features:
                    decoded = features[tag]
                    if decoded is not None:
                        stags1.append(decoded)
                else:
                    msg = u'неизвестный тэг "{}"'.format(tag)
                    # Exception types are kept per part of speech for
                    # compatibility with existing callers.
                    if part_of_speech == u'ГЛАГОЛ':
                        raise RuntimeError(msg)
                    raise NotImplementedError(msg)

        return part_of_speech, stags1

    def get_lemma2(self, word, pos_tags):
        """Return the lemma of ``word`` using its part-of-speech tags.

        The word is lowercased and u'ё' is replaced with u'е' before lookup.

        :param word: surface word form.
        :param pos_tags: '|'-separated tag string, see ``decode_pos_tags``.
        :return: tuple ``(lemma, part_of_speech, decoded_tags)``.
        """
        part_of_speech, decoded_tags = self.decode_pos_tags(pos_tags)
        nword = word.lower().replace(u'ё', u'е')

        if nword in self.special_lemmas:
            return self.special_lemmas[nword], part_of_speech, decoded_tags

        if nword in self.forms:
            lemma = self.forms[nword]
            return lemma, part_of_speech, decoded_tags
        elif nword in self.forms2:
            if part_of_speech == u'СУЩЕСТВИТЕЛЬНОЕ':
                # For nouns the grammatical case is taken into account.
                required_case = None
                for tag in decoded_tags:
                    if tag[0] == u'ПАДЕЖ':
                        required_case = tag[1]
                        break

                for lemma, lemma_part_of_speech, tag in self.forms2[nword]:
                    if lemma_part_of_speech == part_of_speech and tag == required_case:
                        return lemma, part_of_speech, decoded_tags
            else:
                for lemma, lemma_part_of_speech, _tags in self.forms2[nword]:
                    if lemma_part_of_speech == part_of_speech:
                        return lemma, part_of_speech, decoded_tags
        elif len(word) > 4:
            # Use the suffix model for out-of-vocabulary words.
            ending = nword[-4:]
            key = ending + u'|' + part_of_speech
            if key in self.key2transducer:
                transducer = self.key2transducer[key]
                # transducer[0] is the number of trailing characters to drop,
                # transducer[1] is the suffix to append.
                if transducer[0] > 0:
                    lemma = word[:-transducer[0]] + transducer[1]
                else:
                    lemma = word + transducer[1]
                return lemma.lower(), part_of_speech, decoded_tags

        # Fallback: return the normalized (lowercased) word as its own lemma.
        return nword, part_of_speech, decoded_tags

    def lemmatize(self, tagged_words):
        """Add the lemma and decoded part of speech to tagger output.

        :param tagged_words: iterable of ``(word, tags)`` pairs.
        :return: list of ``(word, tags, lemma, part_of_speech, decoded_tags)``.
        """
        return [(word, tags,) + tuple(self.get_lemma2(word, tags)) for (word, tags) in tagged_words]