from transformers import AutoModel, AutoTokenizer
import torch
import joblib

# Load RuBERT model and tokenizer
rubert_model_name = "cointegrated/rubert-tiny2"  # Example model name, adjust as needed
tokenizer = AutoTokenizer.from_pretrained(rubert_model_name)
model = AutoModel.from_pretrained(rubert_model_name)

# Load the pre-trained Logistic Regression classifier
logreg_model_path = "/home/owly/ds_bootcamp/phase_2/NLP_processing/misc/logreg_model_v2.joblib"
logreg_model = joblib.load(logreg_model_path)


def embed_bert_cls(text, model, tokenizer):
    """Generate a normalized [CLS] embedding for the input text using RuBERT."""
    inputs = tokenizer(text, padding=True, truncation=True, max_length=512, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)
    # Take the hidden state of the [CLS] token (first position) as the sentence embedding
    embeddings = outputs.last_hidden_state[:, 0, :]
    # L2-normalize so the classifier sees unit-length vectors
    embeddings = torch.nn.functional.normalize(embeddings)
    return embeddings.cpu().numpy()


def classify_text(text, model=model, tokenizer=tokenizer, classifier=logreg_model):
    """Classify text as Good, Neutral, or Bad using RuBERT embeddings and Logistic Regression."""
    embeddings = embed_bert_cls(text, model, tokenizer)
    prediction = classifier.predict(embeddings)
    class_labels = {0: "Good", 1: "Neutral", 2: "Bad"}
    return class_labels[prediction[0]]
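

# Example usage: a minimal sketch of how classify_text could be called.
# The Russian sample sentences below are illustrative only (not from any
# training data), and this assumes the joblib classifier above loaded
# successfully and was trained on the same kind of RuBERT embeddings.
if __name__ == "__main__":
    samples = [
        "Отличный сервис, всем рекомендую!",  # "Great service, highly recommend!"
        "Ужасное обслуживание, больше не приду.",  # "Terrible service, never coming back."
    ]
    for sample in samples:
        print(f"{sample} -> {classify_text(sample)}")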