import os

# Point the Hugging Face caches at local folders before transformers is imported.
os.environ["TRANSFORMERS_CACHE"] = "./cache/transformersCache/"
os.environ["HF_HOME"] = "./cache/hgCache/"

import random
import time

import pandas as pd
import torch
from tqdm import tqdm
from transformers import AutoModelForMaskedLM, AutoTokenizer

random.seed(42)
# wget https://huggingface.co/datasets/Gliscor/turkishReviews-ds-mini/resolve/main/data/train-00000-of-00001.parquet
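# (The downloaded parquet file is expected to be saved/renamed as reviews.parquet.)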
df = pd.read_parquet("reviews.parquet")
print(df)
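# Load the Turkish ModernBERT tokenizer and masked-LM checkpoint; the text is lowercased
# manually in the loop below, so the tokenizer's own lowercasing stays disabled.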
tokenizer = AutoTokenizer.from_pretrained(
    "99eren99/ModernBERT-base-Turkish-uncased-mlm", do_lower_case=False
)
tokenizer.truncation_side = "right"

modernBert = AutoModelForMaskedLM.from_pretrained(
    "99eren99/ModernBERT-base-Turkish-uncased-mlm",
)
modernBert.eval()
modernBert.to("cuda", dtype=torch.float16)
print(modernBert.dtype)
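# Running totals: correctly recovered masked tokens and cumulative inference time.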
modernBertTrueTokenCount = 0
modernBertElapsedTime = 0
def mask_tokens(inputs):
    """Randomly mask 15% of the non-special tokens of a single-sequence batch."""
    inputsCopy = inputs.clone()
    # Candidate positions exclude the first and last token (the special tokens).
    s = list(range(1, len(inputs[0]) - 1))
    random.shuffle(s)
    masked_indices = s[: int(len(s) * 0.15)]  # mask ratio
    # 4 is assumed to be this tokenizer's [MASK] id; tokenizer.mask_token_id is the safer lookup.
    inputsCopy[0][masked_indices] = 4
    return inputsCopy, masked_indices
def getTrueTokenCountAndElapsedTime(model, inputs, masked_input_ids, masked_indices):
    """Run the model on the masked ids and count correctly recovered masked tokens."""
    start = time.time()
    with torch.no_grad():
        outputs = model(masked_input_ids)
        predictions = outputs.logits.cpu()
    # Get the predicted token id at every position.
    predicted_index = torch.argmax(predictions[0], dim=-1)
    # Compare predictions at the masked positions against the original token ids.
    trueTokenCount = (
        (inputs.input_ids[0, masked_indices] == predicted_index[masked_indices]) * 1
    ).sum()
    end = time.time()
    elapsedTime = end - start
    return trueTokenCount, elapsedTime, predicted_index
totalMaskedTokens = 0
concatenatedText = ""
for row in tqdm(df.review.values):
    # Turkish-aware lowercasing: map the dotted capital "I" to dotless "ı" before lower().
    text = row.replace("I", "ı").lower()
    concatenatedText += text
    # Once the accumulated chunk is long enough, tokenize it up to the 8192-token context.
    if len(concatenatedText.split()) > 6000:
        inputs = tokenizer(
            concatenatedText, return_tensors="pt", max_length=8192, truncation=True
        )
        masked_input_ids, masked_indices = mask_tokens(inputs.input_ids)
        masked_input_ids = masked_input_ids.to("cuda")
        """ print("Original Text:", text)
        print(len(masked_input_ids[0]))
        print(
            "Masked Text:",
            " ".join(tokenizer.convert_ids_to_tokens(masked_input_ids[0].tolist())),
        ) """
        # modernBert
        trueTokenCount, elapsedTime, predicted_index = getTrueTokenCountAndElapsedTime(
            modernBert, inputs, masked_input_ids, masked_indices
        )
        modernBertTrueTokenCount += trueTokenCount
        modernBertElapsedTime += elapsedTime
        # print("Predicted Text ModernBERT:", tokenizer.decode(predicted_index))
        totalMaskedTokens += len(masked_indices)
        concatenatedText = ""
print("Total masked tokens:", totalMaskedTokens)
print("Correct predictions / elapsed seconds:", modernBertTrueTokenCount, modernBertElapsedTime)
print("Masked-token accuracy:", modernBertTrueTokenCount / totalMaskedTokens)