Spaces:

vanessbut
/

tldr_keywords

Build error

App Files Files Community

tldr_keywords / utils /utils.py

vanessbut

Исправлен импорт.

12b4943 over 2 years ago

raw

history blame

4.82 kB

	import re

	import numpy as np
	from scipy.special import softmax
	from sklearn.feature_extraction.text import CountVectorizer
	from sklearn.metrics.pairwise import cosine_similarity
	from sklearn.metrics.pairwise import euclidean_distances

	def preprocess(strings):
	    """
	    Normalize whitespace in every string of a list, in place.

	    Each newline character is turned into a space, after which every run
	    of consecutive spaces is collapsed into a single space.

	    strings - list of strings; its elements are overwritten.

	    Returns the very same list object that was passed in.
	    """
	    for position, raw in enumerate(strings):
	        flattened = raw.replace('\n', ' ')
	        strings[position] = re.sub(' +', ' ', flattened)

	    return strings


	def get_candidates(text, nlp, min_df=0.0, ngram_range=(1, 3), max_words=None):
	    """
	    Extract keyword candidates from a text.

	    N-grams are collected with CountVectorizer (English stop words removed)
	    and only those that coincide with a noun, a noun lemma or a noun phrase
	    reported by nlp are kept.

	    text - input text.
	    nlp - language analysis tool (see spacy). It is called as nlp(text);
	          the result is iterated token by token (pos_, lemma_ and text are
	          read) and its noun_chunks are used.
	    min_df - minimum document frequency handed to CountVectorizer.
	    ngram_range - (min_n, max_n) sizes of the n-grams to consider.
	    max_words - passed to CountVectorizer as max_features, so the result
	                has at most this many entries; None means no limit.

	    Returns a list of candidate strings in the vectorizer's feature order.
	    """
	    # Collect the raw n-gram vocabulary of the text.
	    vectorizer = CountVectorizer(ngram_range=ngram_range,
	                                 stop_words="english",
	                                 min_df=min_df,
	                                 max_features=max_words).fit([text])

	    # get_feature_names() was removed in scikit-learn 1.2 in favour of
	    # get_feature_names_out(); keep the old call only as a fallback for
	    # installations that predate the new method.
	    if hasattr(vectorizer, "get_feature_names_out"):
	        candidates = vectorizer.get_feature_names_out()
	    else:
	        candidates = vectorizer.get_feature_names()

	    nlp_result = nlp(text)

	    # Phrases built around nouns.
	    noun_phrases = {chunk.text.strip().lower() for chunk in nlp_result.noun_chunks}

	    # Single nouns: both the surface form and the lemma are accepted.
	    # CountVectorizer lower-cases its features by default, so the noun
	    # forms are lower-cased too; otherwise a capitalised noun (for
	    # example at the start of a sentence) could never match a candidate.
	    nouns = set()
	    for token in nlp_result:
	        if token.pos_ == "NOUN":
	            nouns.add(token.text.lower())
	            nouns.add(token.lemma_.lower())

	    with_nouns = nouns | noun_phrases

	    # Keep only the n-grams that are nouns or noun phrases.
	    return [candidate for candidate in candidates if candidate in with_nouns]


	def get_embedding(texts, model, tokenizer, chunk_size=128):
	    """
	    Encode a sequence of texts as one array of embeddings.

	    The texts are processed in consecutive batches of at most chunk_size
	    items and the per-batch outputs are stacked row-wise in input order.

	    texts - sliceable sequence of texts. With no texts there is nothing
	            to stack and np.vstack raises.
	    model - callable invoked as model(**tokens); its result is indexed
	            with "pooler_output".
	    tokenizer - callable invoked on each batch with padding=True,
	                truncation=True and return_tensors="pt".
	    chunk_size - maximum number of texts per batch.

	    Returns the result of np.vstack over all batch outputs.
	    """
	    # Ceiling division: number of batches needed to cover every text.
	    batch_count = -(-len(texts) // chunk_size)

	    def encode(batch):
	        tokens = tokenizer(batch, padding=True, truncation=True, return_tensors="pt")
	        pooled = model(**tokens)["pooler_output"]
	        return pooled.detach().numpy()

	    stacked = []
	    for number in range(batch_count):
	        offset = number * chunk_size
	        stacked.append(encode(texts[offset:offset + chunk_size]))

	    return np.vstack(stacked)


	def score_candidates(text, candidates, model, tokenizer):
	    """
	    Rank keyword candidates by their similarity to the whole text.

	    The text and every candidate are embedded with get_embedding, the
	    cosine similarity between the text and each candidate is standardised
	    (mean subtracted, divided by the standard deviation) and passed
	    through softmax, so the scores sum to one.

	    text - the text the candidates were taken from.
	    candidates - sequence of candidate strings.
	    model, tokenizer - forwarded unchanged to get_embedding.

	    Returns a 1-D numpy array with one score per candidate, in the order
	    of candidates: empty for no candidates, [1.0] for a single one.
	    """
	    # Trivial cases need no embeddings at all.
	    if len(candidates) == 0:
	        return np.array([])
	    if len(candidates) == 1:
	        return np.array([1.0])

	    # Embedding of the full text.
	    text_embedding = get_embedding([text], model, tokenizer)

	    # Embeddings of the candidates.
	    candidate_embeddings = get_embedding(candidates, model, tokenizer)

	    # Shape (1, number of candidates): similarity of the text to each one.
	    similarities = cosine_similarity(text_embedding, candidate_embeddings)

	    # When every candidate is equally similar the standard deviation is
	    # zero and the standardisation below would divide by zero and yield
	    # NaN scores; all candidates are equally good, so score them evenly.
	    spread = np.std(similarities)
	    if spread == 0:
	        return np.full(len(candidates), 1.0 / len(candidates))

	    # Softmax of the standardised similarities; [0] drops the row axis.
	    return softmax((similarities - np.mean(similarities)) / spread)[0]


	def get_keywords(text, nlp, model, tokenizer, top=0.95, max_words=None):
	    """
	    Select the best-scoring keywords of a text.

	    Candidates come from get_candidates (called with its default
	    settings) and are scored by score_candidates. They are then taken in
	    order of decreasing score until the accumulated score exceeds top or
	    max_words keywords have been collected.

	    text - input text.
	    nlp - language analysis tool forwarded to get_candidates.
	    model, tokenizer - forwarded to score_candidates.
	    top - accumulated-score threshold; selection stops once the running
	          sum is greater than this value.
	    max_words - maximum number of keywords to return; None means no limit.

	    Returns a list of (keyword, score) pairs, highest score first.
	    """
	    candidates = get_candidates(text, nlp)
	    score = score_candidates(text, candidates, model, tokenizer)

	    # Candidate positions ordered from the highest score to the lowest.
	    ranking = score.argsort()[::-1]

	    limit = len(ranking)
	    if max_words is not None:
	        limit = min(limit, max_words)

	    selected = []
	    accumulated = 0.0
	    for rank, position in enumerate(ranking):
	        if rank >= limit or accumulated > top:
	            break

	        selected.append((candidates[position], score[position]))
	        accumulated += score[position]

	    return selected