# Wordifier helpers: TF-IDF vectorisation plus repeated L1-penalised logistic
# regression to score which words are indicative of each label, with a
# Streamlit progress bar for the iterative fitting loop.
from typing import Dict, List, Tuple

import numpy as np
import pandas as pd
import streamlit as st
from pandas.core.frame import DataFrame
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import resample

from .configs import InputTransformConfigs, ModelConfigs
def input_transform(
    text: pd.Series, labels: pd.Series, configs=InputTransformConfigs
) -> Dict[str, np.ndarray]:
    """
    Encode raw text and labels into objects amenable to the training algorithm.

    Fits a TF-IDF vectorizer on `text` and a LabelEncoder on `labels`, and
    returns the feature matrix, encoded targets, and the name arrays needed
    to map coefficients back to words/classes.
    """
    # All preprocessing is deliberately disabled (no lowercasing, no accent
    # stripping, no stop words): the input is used as-is.
    vectorizer = TfidfVectorizer(
        input="content",  # default: file already in memory
        encoding="utf-8",  # default
        decode_error="strict",  # default
        strip_accents=None,  # do nothing
        lowercase=False,  # do nothing
        preprocessor=None,  # do nothing - default
        tokenizer=None,  # default
        stop_words=None,  # do nothing
        analyzer="word",
        ngram_range=configs.NGRAM_RANGE.value,  # maximum 3-ngrams
        min_df=configs.MIN_DF.value,
        max_df=configs.MAX_DF.value,
        sublinear_tf=configs.SUBLINEAR.value,
    )
    encoder = LabelEncoder()

    features = vectorizer.fit_transform(text.values)
    targets = encoder.fit_transform(labels.values)
    feature_names = np.array(vectorizer.get_feature_names_out())

    return {
        "X": features,
        "y": targets,
        "X_names": feature_names,
        "y_names": encoder.classes_,
    }
def wordifier(
    X: np.ndarray,
    y: np.ndarray,
    X_names: List[str],
    y_names: List[str],
    configs=ModelConfigs,
) -> Tuple[List[Tuple[str, float, str]], List[Tuple[str, float, str]]]:
    """
    Score words by how consistently they predict each label.

    Repeatedly fits an L1-penalised logistic regression (random penalty,
    stratified bootstrap subsample) and counts, per (class, feature) pair,
    how often the coefficient is positive / negative. Counts are normalised
    by the number of iterations and thresholded.

    Parameters
    ----------
    X : sparse/dense matrix of shape (n_instances, n_features)
    y : encoded labels, shape (n_instances,)
    X_names : feature (word) names, indexable by feature position
    y_names : class names, indexable by class position
    configs : enum-like config with MAX_SELECTION, MIN_SELECTION, NUM_ITERS,
        PENALTIES, SELECTION_THRESHOLD members (each exposing `.value`)

    Returns
    -------
    (pos, neg) : two lists of (word, score, label) tuples for features
        positively / negatively associated with each label.
        NOTE: the original annotation claimed a single list; the function
        has always returned a 2-tuple.
    """
    n_instances, n_features = X.shape
    n_classes = len(y_names)

    # NOTE: the * 10 / 10 trick is to have "nice" round-ups
    sample_fraction = np.ceil((n_features / n_instances) * 10) / 10

    sample_size = min(
        # this is the maximum supported
        configs.MAX_SELECTION.value,
        # at minimum you want MIN_SELECTION but in general you want
        # n_instances * sample_fraction
        max(configs.MIN_SELECTION.value, int(n_instances * sample_fraction)),
        # however if the previous one is bigger than the available instances
        # take the number of available instances
        n_instances,
    )

    # TODO: might want to try out something to subsample features at each iteration

    # initialize coefficient sign-count matrices
    pos_scores = np.zeros((n_classes, n_features), dtype=int)
    neg_scores = np.zeros((n_classes, n_features), dtype=int)

    pbar = st.progress(0)
    for i in range(configs.NUM_ITERS.value):
        # run randomized regression: penalty strength drawn at random
        clf = LogisticRegression(
            penalty="l1",
            C=configs.PENALTIES.value[np.random.randint(len(configs.PENALTIES.value))],
            solver="liblinear",
            multi_class="auto",
            max_iter=500,
            class_weight="balanced",
            random_state=42,
        )

        # sample indices to subsample matrix (stratified bootstrap)
        selection = resample(
            np.arange(n_instances), replace=True, stratify=y, n_samples=sample_size
        )

        # fit; a degenerate subsample can make fitting fail — skip that round
        try:
            clf.fit(X[selection], y[selection])
        except ValueError:
            continue

        # record coefficient signs
        if n_classes == 2:
            # liblinear gives a single coefficient row for binary problems:
            # positive weights point to class 1, negative weights to class 0
            pos_scores[1] = pos_scores[1] + (clf.coef_ > 0.0)
            neg_scores[1] = neg_scores[1] + (clf.coef_ < 0.0)
            pos_scores[0] = pos_scores[0] + (clf.coef_ < 0.0)
            neg_scores[0] = neg_scores[0] + (clf.coef_ > 0.0)
        else:
            pos_scores += clf.coef_ > 0
            neg_scores += clf.coef_ < 0

        # fix: use (i + 1) so the bar reaches 1.0 on the last iteration
        # (previously round(i / N, 1) was coarse and never showed completion)
        pbar.progress((i + 1) / configs.NUM_ITERS.value)

    # normalize counts to frequencies in [0, 1]
    pos_scores = pos_scores / configs.NUM_ITERS.value
    neg_scores = neg_scores / configs.NUM_ITERS.value

    # keep only features selected at least SELECTION_THRESHOLD of the time
    pos_positions = np.where(
        pos_scores >= configs.SELECTION_THRESHOLD.value, pos_scores, 0
    )
    neg_positions = np.where(
        neg_scores >= configs.SELECTION_THRESHOLD.value, neg_scores, 0
    )

    # prepare (word, score, label) records
    pos = [
        (X_names[i], pos_scores[c, i], y_names[c])
        for c, i in zip(*pos_positions.nonzero())
    ]
    neg = [
        (X_names[i], neg_scores[c, i], y_names[c])
        for c, i in zip(*neg_positions.nonzero())
    ]

    return pos, neg
def output_transform(
    pos: List[Tuple[str, float, str]], neg: List[Tuple[str, float, str]]
) -> DataFrame:
    """
    Combine positive and negative (word, score, label) records into one frame.

    Each side is sorted by label then score (both descending), tagged with a
    `correlation` column ("positive"/"negative"), concatenated, and the column
    names are title-cased: Word, Score, Label, Correlation.
    """
    frames = []
    for records, direction in ((pos, "positive"), (neg, "negative")):
        frame = pd.DataFrame(records, columns=["word", "score", "label"])
        frame = frame.sort_values(["label", "score"], ascending=False)
        frame["correlation"] = direction
        frames.append(frame)

    # keep original indices (ignore_index=False), positives stacked first
    combined = pd.concat(frames, ignore_index=False, axis=0)
    combined.columns = combined.columns.str.title()
    return combined