|
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
|
|
|
class LanguageDetector:
    """Detect the language of a text with a fine-tuned XLM-RoBERTa classifier.

    Wraps the Hugging Face ``papluca/xlm-roberta-base-language-detection``
    model (by default) and exposes a single-call prediction API that maps a
    raw string to an ISO language code via the model's ``id2label`` mapping.
    """

    # Default checkpoint; kept as a class constant so callers and subclasses
    # can reference it without instantiating.
    DEFAULT_MODEL = "papluca/xlm-roberta-base-language-detection"

    def __init__(self, model_name: str = DEFAULT_MODEL):
        """Load the tokenizer and classification model.

        Args:
            model_name: Hugging Face model id or local path of a sequence
                classification checkpoint. Defaults to the XLM-RoBERTa
                language-detection model, preserving the original behavior.
        """
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForSequenceClassification.from_pretrained(model_name)
        # Explicitly switch to inference mode (disables dropout etc.).
        self.model.eval()

    def predict_language(self, text: str) -> str:
        """Return the predicted language label for *text*.

        Args:
            text: Arbitrary input string; inputs longer than the model's
                maximum sequence length are truncated rather than raising.

        Returns:
            The label string from ``model.config.id2label`` (e.g. ``"en"``)
            for the highest-scoring class.
        """
        # truncation=True: without it, texts beyond the model max length
        # (512 tokens for XLM-RoBERTa) crash the forward pass.
        inputs = self.tokenizer(text, return_tensors="pt", truncation=True)

        # Inference only — skip autograd graph construction to save
        # memory and compute.
        with torch.no_grad():
            outputs = self.model(**inputs)

        # Highest-scoring class index over the label dimension.
        prediction_idx = outputs.logits.argmax(dim=-1).item()

        # Map index -> human-readable language code.
        return self.model.config.id2label[prediction_idx]
|
|
|
|