from transformers import AutoModel, AutoTokenizer
import torch
import joblib

# Load RuBERT model and tokenizer
rubert_model_name = "cointegrated/rubert-tiny2"  # Example model name, adjust as needed
tokenizer = AutoTokenizer.from_pretrained(rubert_model_name)
model = AutoModel.from_pretrained(rubert_model_name)

# Load the pre-trained Logistic Regression classifier
logreg_model_path = "/home/owly/ds_bootcamp/phase_2/NLP_processing/misc/logreg_model_v2.joblib"
logreg_model = joblib.load(logreg_model_path)


def embed_bert_cls(text, model, tokenizer):
    """Generate a normalized [CLS] embedding for the input text using RuBERT."""
    inputs = tokenizer(text, padding=True, truncation=True, max_length=512, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)
    # Take the hidden state of the [CLS] token (first position) as the sentence embedding
    embeddings = outputs.last_hidden_state[:, 0, :]
    # L2-normalize so the classifier sees unit-length vectors
    embeddings = torch.nn.functional.normalize(embeddings)
    return embeddings.cpu().numpy()


def classify_text(text, model=model, tokenizer=tokenizer, classifier=logreg_model):
    """Classify text as Good, Neutral, or Bad using RuBERT embeddings and Logistic Regression."""
    embeddings = embed_bert_cls(text, model, tokenizer)
    prediction = classifier.predict(embeddings)
    class_labels = {0: "Good", 1: "Neutral", 2: "Bad"}
    return class_labels[prediction[0]]
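

# Example usage: a minimal sketch of how classify_text could be called.
# The Russian sample sentences below are illustrative only (not from any
# training data), and this assumes the joblib classifier above loaded
# successfully and was trained on the same kind of RuBERT embeddings.
if __name__ == "__main__":
    samples = [
        "Отличный сервис, всем рекомендую!",  # "Great service, highly recommend!"
        "Ужасное обслуживание, больше не приду.",  # "Terrible service, never coming back."
    ]
    for sample in samples:
        print(f"{sample} -> {classify_text(sample)}")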