IndexError: index out of range in self
Thank you so much for this upload, I can't understand why unbabel didn't enable this format yet.
I am trying to use your model for evaluating translation quality for product descriptions.
I'm running generations from the same model in a loop and always get to an IndexError after 20 generations
translation_evaluator.py - it's the most simple basic wrapper around the model to run generations on the same model loaded to memory
from transformers import XLMRobertaTokenizerFast, AutoModel
import logging as log
log.basicConfig(level=log.INFO)
model_id = "vince62s/wmt23-cometkiwi-da-roberta-xl" # it's a modified unbabel model to use default Roberta Architecture
class TranslationEvaluator:
def __init__(self):
log.info(f"Preparing Translation Evaluation model: {model_id}")
self.tokenizer = XLMRobertaTokenizerFast.from_pretrained(model_id, trust_remote_code=True)
self.model = AutoModel.from_pretrained(model_id, trust_remote_code=True)
log.info(f"Model loaded.")
def _generate_evaluation(self, text:str, translation:str):
input_text = f"{text}</s></s>{translation}"
encoded_text = self.tokenizer(input_text, return_tensors='pt')
output = self.model(**encoded_text)
return output
def evaluate(self, text:str, translation:str) -> float:
"""
Evaluates translation between two languages.
Outputs a score between 0 and 1, where 0 means very bad and 1 perfect translation.
"""
output = self._generate_evaluation(text=text, translation=translation)
return float(output[0][0][0])
My run script:
from evaluator import TranslationEvaluator
from data import preparer as prep
from data import extracted_samples_file as samples_file
from data import LanguageKeys as LK
import torch
import logging as log
log.basicConfig(level=log.INFO)
samples_file = prep.load_json(samples_file)
trans_eval = TranslationEvaluator()
log_file = './scoring/scores.json'
eval_n_samples = 21
start_at = 19
scores = {}
for i in range(len(samples_file[start_at:start_at+eval_n_samples])):
en = samples_file[i][LK.en_ie]
pl = samples_file[i][LK.pl_pl]
score = trans_eval.evaluate(
text=en,
translation=pl
)
scores[i] = score
log.info(f"Sample {i}: {score}")
torch.clear_autocast_cache()
prep.save_json(
data=scores,
save_path=log_file
)
The outputs
(base) [lael.alhalawani@potatoai finetuning_v2]$ /home/lael.alhalawani/miniconda3/envs/p311_phi3_finetuneing/bin/python /home/lael.alhalawani/python/finetuning_v2/run.py
INFO:root:Preparing Translation Evaluation model: vince62s/wmt23-cometkiwi-da-roberta-xl
INFO:root:Model loaded.
INFO:root:Sample 0: 0.5147500038146973
INFO:root:Sample 1: 0.49919557571411133
INFO:root:Sample 2: 0.606634259223938
INFO:root:Sample 3: 0.6328783631324768
INFO:root:Sample 4: 0.5938654541969299
INFO:root:Sample 5: 0.289816290140152
INFO:root:Sample 6: 0.2865508496761322
INFO:root:Sample 7: 0.6063771843910217
INFO:root:Sample 8: 0.6033931970596313
INFO:root:Sample 9: 0.5737144947052002
INFO:root:Sample 10: 0.6372798681259155
INFO:root:Sample 11: 0.5346137881278992
INFO:root:Sample 12: 0.5788921117782593
INFO:root:Sample 13: 0.5837757587432861
INFO:root:Sample 14: 0.612902820110321
INFO:root:Sample 15: 0.6258988380432129
INFO:root:Sample 16: 0.6790519952774048
INFO:root:Sample 17: 0.6715947389602661
INFO:root:Sample 18: 0.5604020953178406
INFO:root:Sample 19: 0.425453782081604
Traceback (most recent call last):
File "/home/lael.alhalawani/python/finetuning_v2/run.py", line 20, in <module>
score = trans_eval.evaluate(
^^^^^^^^^^^^^^^^^^^^
File "/home/lael.alhalawani/python/finetuning_v2/evaluator/translation_evaluator.py", line 29, in evaluate
output = self._generate_evaluation(text=text, translation=translation)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/lael.alhalawani/python/finetuning_v2/evaluator/translation_evaluator.py", line 20, in _generate_evaluation
output = self.model(**encoded_text)
^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/lael.alhalawani/miniconda3/envs/p311_phi3_finetuneing/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/lael.alhalawani/miniconda3/envs/p311_phi3_finetuneing/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
return forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/lael.alhalawani/.cache/huggingface/modules/transformers_modules/vince62s/wmt23-cometkiwi-da-roberta-xl/7db83b33c1432adcd0d4ea75dc39246ffeef8e1a/modeling_xlm_roberta_xl.py", line 1514, in forward
outputs = self.roberta(
^^^^^^^^^^^^^
File "/home/lael.alhalawani/miniconda3/envs/p311_phi3_finetuneing/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/lael.alhalawani/miniconda3/envs/p311_phi3_finetuneing/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
return forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/lael.alhalawani/.cache/huggingface/modules/transformers_modules/vince62s/wmt23-cometkiwi-da-roberta-xl/7db83b33c1432adcd0d4ea75dc39246ffeef8e1a/modeling_xlm_roberta_xl.py", line 800, in forward
embedding_output = self.embeddings(
^^^^^^^^^^^^^^^^
File "/home/lael.alhalawani/miniconda3/envs/p311_phi3_finetuneing/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/lael.alhalawani/miniconda3/envs/p311_phi3_finetuneing/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
return forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/lael.alhalawani/.cache/huggingface/modules/transformers_modules/vince62s/wmt23-cometkiwi-da-roberta-xl/7db83b33c1432adcd0d4ea75dc39246ffeef8e1a/modeling_xlm_roberta_xl.py", line 121, in forward
position_embeddings = self.position_embeddings(position_ids)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/lael.alhalawani/miniconda3/envs/p311_phi3_finetuneing/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/lael.alhalawani/miniconda3/envs/p311_phi3_finetuneing/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
return forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/lael.alhalawani/miniconda3/envs/p311_phi3_finetuneing/lib/python3.11/site-packages/torch/nn/modules/sparse.py", line 163, in forward
return F.embedding(
^^^^^^^^^^^^
File "/home/lael.alhalawani/miniconda3/envs/p311_phi3_finetuneing/lib/python3.11/site-packages/torch/nn/functional.py", line 2264, in embedding
return torch.embedding(weight, input, padding_idx, scale_grad_by_freq, sparse)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
IndexError: index out of range in self
Do you have any idea what might be causing these errors? How do I get to a point when lengths are not matching?
Is there some cache that I need to clear like pytorch cache or something else is causng this issue?
It always happens after 20 samples, doesn't matter if i test 0-19 or 100-119.
Thank you and kind regards!
If I use XLMRobertaXLForSequenceClassification i get this error
Traceback (most recent call last):
File "/home/lael.alhalawani/python/finetuning_v2/run.py", line 21, in
score = trans_eval.evaluate(
^^^^^^^^^^^^^^^^^^^^
File "/home/lael.alhalawani/python/finetuning_v2/evaluator/translation_evaluator.py", line 29, in evaluate
output = self._generate_evaluation(text=text, translation=translation)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/lael.alhalawani/python/finetuning_v2/evaluator/translation_evaluator.py", line 20, in _generate_evaluation
output = self.model(**encoded_text)
^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/lael.alhalawani/miniconda3/envs/p311_phi3_finetuneing/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/lael.alhalawani/miniconda3/envs/p311_phi3_finetuneing/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
return forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/lael.alhalawani/miniconda3/envs/p311_phi3_finetuneing/lib/python3.11/site-packages/transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py", line 1155, in forward
outputs = self.roberta(
^^^^^^^^^^^^^
File "/home/lael.alhalawani/miniconda3/envs/p311_phi3_finetuneing/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/lael.alhalawani/miniconda3/envs/p311_phi3_finetuneing/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
return forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/lael.alhalawani/miniconda3/envs/p311_phi3_finetuneing/lib/python3.11/site-packages/transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py", line 767, in forward
buffered_token_type_ids_expanded = buffered_token_type_ids.expand(batch_size, seq_length)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
RuntimeError: The expanded size of the tensor (519) must match the existing size (514) at non-singleton dimension 1. Target sizes: [1, 519]. Tensor sizes: [1, 514]
I thought it could be the cuda chache that leaves some residuals, but clearing it didn't help. Running reload_model method also doesn't help.
from transformers import XLMRobertaTokenizerFast, AutoModel
import logging as log
import torch
import gc
log.basicConfig(level=log.INFO)
model_id = "vince62s/wmt23-cometkiwi-da-roberta-xl" # it's a modified unbabel model to use default Roberta Architecture
class TranslationEvaluator:
def __init__(self):
log.info(f"Preparing Translation Evaluation model: {model_id}")
self.model_id = model_id
self.tokenizer = XLMRobertaTokenizerFast.from_pretrained(self.model_id, trust_remote_code=True)
self.model = AutoModel.from_pretrained(self.model_id, trust_remote_code=True)
log.info(f"Model loaded.")
def _generate_evaluation(self, text:str, translation:str):
input_text = f"{text}</s></s>{translation}"
encoded_text = self.tokenizer(input_text, return_tensors='pt')
output = self.model(**encoded_text)
return output
def evaluate(self, text:str, translation:str) -> float:
"""
Evaluates translation between two languages.
Outputs a score between 0 and 1, where 0 means very bad and 1 perfect translation.
Over 0.8 seems like a good translation.
"""
output = self._generate_evaluation(text=text, translation=translation)
return float(output[0][0][0])
def reload_model(self):
log.info(f"Reloading model {self.model_id}")
self.model = None
self.tokenizer = None
gc.collect()
torch.cuda.empty_cache()
self.model = AutoModel.from_pretrained(self.model_id, trust_remote_code=True)
self.tokenizer = XLMRobertaTokenizerFast.from_pretrained(self.model_id, trust_remote_code=True)
log.info(f"Model reloaded.")
and here's the latest updated run.py
from evaluator import TranslationEvaluator
from data import preparer as prep
from data import extracted_samples_file as samples_file
from data import LanguageKeys as LK
import torch
import logging as log
log.basicConfig(level=log.INFO)
samples_file = prep.load_json(samples_file)
trans_eval = TranslationEvaluator()
log_file = './scoring/scores.json'
eval_n_samples = 21
start_at = 19
scores = {}
for i in range(len(samples_file[start_at:start_at+eval_n_samples])):
en = samples_file[i][LK.en_ie]
pl = samples_file[i][LK.pl_pl]
try:
score = trans_eval.evaluate(
text=en,
translation=pl
)
except IndexError as e:
log.warn("Error during evaluation. Reloading model and retrying.")
trans_eval.reload_model()
score = trans_eval.evaluate(
text=en,
translation=pl
)
scores[i] = score
log.info(f"Sample {i}: {score}")
prep.save_json(
data=scores,
save_path=log_file
)
Still the same error of IndexError index out of range in self
Sorry I don't have time to debug.
Instead I implemented (and am using) this: https://github.com/eole-nlp/eole/tree/main/recipes/cometkiwi
it is faster more versatile.