Upload 8 files
Browse files- assets/cover.jpg +0 -0
- assets/eval_qa.py +134 -0
- assets/eval_qa_long_context.py +111 -0
- assets/eval_recipes.py +134 -0
- assets/eval_recipes_long_context.py +111 -0
- assets/eval_results.jpg +0 -0
- assets/eval_reviews.py +134 -0
- assets/eval_reviews_long_context.py +111 -0
assets/cover.jpg
ADDED
![]() |
assets/eval_qa.py
ADDED
@@ -0,0 +1,134 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import os

# Cache locations must be set BEFORE transformers/HF libraries are
# imported, otherwise the default cache directories are used.
os.environ["TRANSFORMERS_CACHE"] = "./cache/transformersCache/"
os.environ["HF_HOME"] = "./cache/hgCache/"

import torch
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForMaskedLM, BertForMaskedLM
import pandas as pd
import time
import random
import torch

# Fixed seed so the randomly chosen mask positions are reproducible.
random.seed(42)

# Turkish QA dataset (single parquet file).
# wget https://huggingface.co/datasets/blackerx/turkish_v2/resolve/main/data/train-00000-of-00001.parquet
df = pd.read_parquet("qa.parquet")
print(df)

# A single tokenizer produces the token ids fed to all three models.
# NOTE(review): this assumes the three checkpoints share compatible
# vocabularies/ids -- TODO confirm; otherwise per-model tokenization
# would be required.
tokenizer = AutoTokenizer.from_pretrained(
    "99eren99/ModernBERT-base-Turkish-uncased-mlm", do_lower_case=False
)
tokenizer.truncation_side = "right"

modernBert = AutoModelForMaskedLM.from_pretrained(
    "99eren99/ModernBERT-base-Turkish-uncased-mlm",
)

cosmos = AutoModelForMaskedLM.from_pretrained("ytu-ce-cosmos/turkish-base-bert-uncased")

dbmdz = AutoModelForMaskedLM.from_pretrained("dbmdz/bert-base-turkish-uncased")

# Evaluation only -- switch off dropout etc.
modernBert.eval()
cosmos.eval()
dbmdz.eval()

# ModernBERT runs in fp16 on GPU; the two BERT baselines stay in fp32.
modernBert.to("cuda", dtype=torch.float16)
print(modernBert.dtype)
cosmos.to("cuda")
dbmdz.to("cuda")


# Per-model accumulators: number of correctly reconstructed masked tokens...
modernBertTrueTokenCount = 0
cosmosTrueTokenCount = 0
dbmdzTrueTokenCount = 0

# ...and total forward-pass wall-clock seconds.
modernBertElapsedTime = 0
cosmosElapsedTime = 0
dbmdzElapsedTime = 0
def mask_tokens(inputs, mask_ratio=0.05, mask_token_id=4):
    """Randomly mask a fraction of the non-special tokens of a tokenized text.

    Parameters
    ----------
    inputs : torch.Tensor
        Token-id tensor of shape (1, seq_len). Position 0 and the last
        position are never masked (assumed to be special tokens --
        TODO confirm for tokenizers that add a different number of
        special tokens).
    mask_ratio : float, optional
        Fraction of maskable positions to replace. Default 0.05,
        matching the previously hard-coded ratio.
    mask_token_id : int, optional
        Token id written into masked positions. Default 4, the mask id
        used throughout this script.

    Returns
    -------
    (torch.Tensor, list[int])
        A masked copy of ``inputs`` and the list of masked positions.
        ``inputs`` itself is left untouched.
    """
    masked = inputs.clone()

    # Candidate positions exclude the first and last token.
    candidates = list(range(1, len(inputs[0]) - 1))
    random.shuffle(candidates)

    masked_indices = candidates[: int(len(candidates) * mask_ratio)]

    masked[0][masked_indices] = mask_token_id

    return masked, masked_indices
63 |
+
|
64 |
+
|
65 |
+
def getTrueTokenCountAndElapsedTime(model, inputs, masked_input_ids, masked_indices):
    """Run ``model`` on the masked ids and score its masked-token guesses.

    Returns a ``(trueTokenCount, elapsedTime, predicted_index)`` tuple:
    the number of masked positions whose argmax prediction equals the
    original token, the wall-clock seconds spent (forward pass plus
    scoring), and the argmax token id for every position.
    """
    started_at = time.time()

    with torch.no_grad():
        logits = model(masked_input_ids).logits.cpu()

    # Greedy (argmax) prediction for each position of the single sequence.
    predicted_index = torch.argmax(logits[0], dim=-1)

    original_tokens = inputs.input_ids[0, masked_indices]
    guessed_tokens = predicted_index[masked_indices]
    trueTokenCount = (original_tokens == guessed_tokens).sum()

    elapsedTime = time.time() - started_at

    return trueTokenCount, elapsedTime, predicted_index
82 |
+
|
83 |
+
|
84 |
+
totalMaskedTokens = 0

from tqdm import tqdm

# For every QA text: mask ~5% of its tokens, then let each model
# reconstruct them in a single forward pass.
for row in tqdm(df.output.values):
    # Turkish-aware lowercasing: map uppercase "I" to dotless "ı"
    # before lower(), since str.lower() maps "I" -> "i".
    text = row.replace("I", "ı").lower()
    inputs = tokenizer(text, return_tensors="pt", max_length=512, truncation=True)
    masked_input_ids, masked_indices = mask_tokens(inputs.input_ids)

    masked_input_ids = masked_input_ids.to("cuda")

    """ print("Original Text:", text)
    print(
        "Masked Text:",
        " ".join(tokenizer.convert_ids_to_tokens(masked_input_ids[0].tolist())),
    ) """

    # modernBert
    trueTokenCount, elapsedTime, predicted_index = getTrueTokenCountAndElapsedTime(
        modernBert, inputs, masked_input_ids, masked_indices
    )
    modernBertTrueTokenCount += trueTokenCount
    modernBertElapsedTime += elapsedTime
    # print("Predicted Text ModernBERT:", tokenizer.decode(predicted_index))

    # cosmos
    trueTokenCount, elapsedTime, predicted_index = getTrueTokenCountAndElapsedTime(
        cosmos, inputs, masked_input_ids, masked_indices
    )
    cosmosTrueTokenCount += trueTokenCount
    cosmosElapsedTime += elapsedTime
    # print("Predicted Text Cosmos BERT:", tokenizer.decode(predicted_index))

    # dbmdz
    trueTokenCount, elapsedTime, predicted_index = getTrueTokenCountAndElapsedTime(
        dbmdz, inputs, masked_input_ids, masked_indices
    )
    dbmdzTrueTokenCount += trueTokenCount
    dbmdzElapsedTime += elapsedTime
    # print("Predicted Text BERTurk:", tokenizer.decode(predicted_index))

    totalMaskedTokens += len(masked_indices)

# Raw totals: masked-token count, then per-model (correct, seconds).
print(totalMaskedTokens)
print(modernBertTrueTokenCount, modernBertElapsedTime)
print(cosmosTrueTokenCount, cosmosElapsedTime)
print(dbmdzTrueTokenCount, dbmdzElapsedTime)

# Masked-token prediction accuracy per model.
print(modernBertTrueTokenCount / totalMaskedTokens)
print(cosmosTrueTokenCount / totalMaskedTokens)
print(dbmdzTrueTokenCount / totalMaskedTokens)
assets/eval_qa_long_context.py
ADDED
@@ -0,0 +1,111 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import os

# Cache locations must be set BEFORE transformers/HF libraries are
# imported, otherwise the default cache directories are used.
os.environ["TRANSFORMERS_CACHE"] = "./cache/transformersCache/"
os.environ["HF_HOME"] = "./cache/hgCache/"

import torch
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForMaskedLM, BertForMaskedLM
import pandas as pd
import time
import random
import torch

# Fixed seed so the randomly chosen mask positions are reproducible.
random.seed(42)

# Turkish QA dataset (single parquet file).
# wget https://huggingface.co/datasets/blackerx/turkish_v2/resolve/main/data/train-00000-of-00001.parquet
df = pd.read_parquet("qa.parquet")
print(df)

tokenizer = AutoTokenizer.from_pretrained(
    "99eren99/ModernBERT-base-Turkish-uncased-mlm", do_lower_case=False
)
tokenizer.truncation_side = "right"

# Only ModernBERT is evaluated in the long-context script -- presumably
# because the 8192-token inputs exceed the 512-token BERT baselines'
# context window (confirm against the short-context scripts).
modernBert = AutoModelForMaskedLM.from_pretrained(
    "99eren99/ModernBERT-base-Turkish-uncased-mlm",
)


# Evaluation only -- switch off dropout etc.
modernBert.eval()

# fp16 inference on GPU.
modernBert.to("cuda", dtype=torch.float16)
print(modernBert.dtype)


# Accumulators: correct masked-token predictions...
modernBertTrueTokenCount = 0

# ...and total forward-pass wall-clock seconds.
modernBertElapsedTime = 0
def mask_tokens(inputs, mask_ratio=0.05, mask_token_id=4):
    """Randomly mask a fraction of the non-special tokens of a tokenized text.

    Parameters
    ----------
    inputs : torch.Tensor
        Token-id tensor of shape (1, seq_len). Position 0 and the last
        position are never masked (assumed to be special tokens --
        TODO confirm for tokenizers that add a different number of
        special tokens).
    mask_ratio : float, optional
        Fraction of maskable positions to replace. Default 0.05,
        matching the previously hard-coded ratio.
    mask_token_id : int, optional
        Token id written into masked positions. Default 4, the mask id
        used throughout this script.

    Returns
    -------
    (torch.Tensor, list[int])
        A masked copy of ``inputs`` and the list of masked positions.
        ``inputs`` itself is left untouched.
    """
    masked = inputs.clone()

    # Candidate positions exclude the first and last token.
    candidates = list(range(1, len(inputs[0]) - 1))
    random.shuffle(candidates)

    masked_indices = candidates[: int(len(candidates) * mask_ratio)]

    masked[0][masked_indices] = mask_token_id

    return masked, masked_indices
52 |
+
|
53 |
+
|
54 |
+
def getTrueTokenCountAndElapsedTime(model, inputs, masked_input_ids, masked_indices):
    """Run ``model`` on the masked ids and score its masked-token guesses.

    Returns a ``(trueTokenCount, elapsedTime, predicted_index)`` tuple:
    the number of masked positions whose argmax prediction equals the
    original token, the wall-clock seconds spent (forward pass plus
    scoring), and the argmax token id for every position.
    """
    started_at = time.time()

    with torch.no_grad():
        logits = model(masked_input_ids).logits.cpu()

    # Greedy (argmax) prediction for each position of the single sequence.
    predicted_index = torch.argmax(logits[0], dim=-1)

    original_tokens = inputs.input_ids[0, masked_indices]
    guessed_tokens = predicted_index[masked_indices]
    trueTokenCount = (original_tokens == guessed_tokens).sum()

    elapsedTime = time.time() - started_at

    return trueTokenCount, elapsedTime, predicted_index
71 |
+
|
72 |
+
|
73 |
+
totalMaskedTokens = 0

from tqdm import tqdm

# Long-context evaluation: texts are concatenated until they exceed
# ~6000 whitespace-separated words, then ~5% of the tokens of the whole
# chunk (up to 8192 tokens) are masked and predicted at once.
concatenatedText = ""
for row in tqdm(df.output.values):
    # Turkish-aware lowercasing: map uppercase "I" to dotless "ı"
    # before lower(), since str.lower() maps "I" -> "i".
    text = row.replace("I", "ı").lower()
    concatenatedText += text

    if len(concatenatedText.split()) > 6000:
        inputs = tokenizer(
            concatenatedText, return_tensors="pt", max_length=8192, truncation=True
        )
        masked_input_ids, masked_indices = mask_tokens(inputs.input_ids)

        masked_input_ids = masked_input_ids.to("cuda")

        """ print("Original Text:", text)
        print(len(masked_input_ids[0]))
        print(
            "Masked Text:",
            " ".join(tokenizer.convert_ids_to_tokens(masked_input_ids[0].tolist())),
        ) """

        # modernBert
        trueTokenCount, elapsedTime, predicted_index = getTrueTokenCountAndElapsedTime(
            modernBert, inputs, masked_input_ids, masked_indices
        )
        modernBertTrueTokenCount += trueTokenCount
        modernBertElapsedTime += elapsedTime
        # print("Predicted Text ModernBERT:", tokenizer.decode(predicted_index))

        totalMaskedTokens += len(masked_indices)
        # Start a fresh chunk.
        # NOTE(review): a trailing chunk shorter than 6000 words is never
        # evaluated -- presumably acceptable for benchmarking; confirm.
        concatenatedText = ""

# Raw totals: masked-token count, then (correct, seconds).
print(totalMaskedTokens)
print(modernBertTrueTokenCount, modernBertElapsedTime)

# Masked-token prediction accuracy.
print(modernBertTrueTokenCount / totalMaskedTokens)
|
assets/eval_recipes.py
ADDED
@@ -0,0 +1,134 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import os

# Cache locations must be set BEFORE transformers/HF libraries are
# imported, otherwise the default cache directories are used.
os.environ["TRANSFORMERS_CACHE"] = "./cache/transformersCache/"
os.environ["HF_HOME"] = "./cache/hgCache/"

import torch
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForMaskedLM, BertForMaskedLM
import pandas as pd
import time
import random
import torch

# Fixed seed so the randomly chosen mask positions are reproducible.
random.seed(42)

# Turkish recipe dataset.
# hg dataset name: SedatAl/Turkish_Recipe
df = pd.read_parquet("recipe.parquet")
print(df)

# A single tokenizer produces the token ids fed to all three models.
# NOTE(review): this assumes the three checkpoints share compatible
# vocabularies/ids -- TODO confirm; otherwise per-model tokenization
# would be required.
tokenizer = AutoTokenizer.from_pretrained(
    "99eren99/ModernBERT-base-Turkish-uncased-mlm", do_lower_case=False
)
tokenizer.truncation_side = "right"

modernBert = AutoModelForMaskedLM.from_pretrained(
    "99eren99/ModernBERT-base-Turkish-uncased-mlm",
)

cosmos = AutoModelForMaskedLM.from_pretrained("ytu-ce-cosmos/turkish-base-bert-uncased")

dbmdz = AutoModelForMaskedLM.from_pretrained("dbmdz/bert-base-turkish-uncased")

# Evaluation only -- switch off dropout etc.
modernBert.eval()
cosmos.eval()
dbmdz.eval()

# ModernBERT runs in fp16 on GPU; the two BERT baselines stay in fp32.
modernBert.to("cuda", dtype=torch.float16)
print(modernBert.dtype)
cosmos.to("cuda")
dbmdz.to("cuda")


# Per-model accumulators: number of correctly reconstructed masked tokens...
modernBertTrueTokenCount = 0
cosmosTrueTokenCount = 0
dbmdzTrueTokenCount = 0

# ...and total forward-pass wall-clock seconds.
modernBertElapsedTime = 0
cosmosElapsedTime = 0
dbmdzElapsedTime = 0
def mask_tokens(inputs, mask_ratio=0.1, mask_token_id=4):
    """Randomly mask a fraction of the non-special tokens of a tokenized text.

    Parameters
    ----------
    inputs : torch.Tensor
        Token-id tensor of shape (1, seq_len). Position 0 and the last
        position are never masked (assumed to be special tokens --
        TODO confirm for tokenizers that add a different number of
        special tokens).
    mask_ratio : float, optional
        Fraction of maskable positions to replace. Default 0.1,
        matching the previously hard-coded ratio.
    mask_token_id : int, optional
        Token id written into masked positions. Default 4, the mask id
        used throughout this script.

    Returns
    -------
    (torch.Tensor, list[int])
        A masked copy of ``inputs`` and the list of masked positions.
        ``inputs`` itself is left untouched.
    """
    masked = inputs.clone()

    # Candidate positions exclude the first and last token.
    candidates = list(range(1, len(inputs[0]) - 1))
    random.shuffle(candidates)

    masked_indices = candidates[: int(len(candidates) * mask_ratio)]

    masked[0][masked_indices] = mask_token_id

    return masked, masked_indices
63 |
+
|
64 |
+
|
65 |
+
def getTrueTokenCountAndElapsedTime(model, inputs, masked_input_ids, masked_indices):
    """Run ``model`` on the masked ids and score its masked-token guesses.

    Returns a ``(trueTokenCount, elapsedTime, predicted_index)`` tuple:
    the number of masked positions whose argmax prediction equals the
    original token, the wall-clock seconds spent (forward pass plus
    scoring), and the argmax token id for every position.
    """
    started_at = time.time()

    with torch.no_grad():
        logits = model(masked_input_ids).logits.cpu()

    # Greedy (argmax) prediction for each position of the single sequence.
    predicted_index = torch.argmax(logits[0], dim=-1)

    original_tokens = inputs.input_ids[0, masked_indices]
    guessed_tokens = predicted_index[masked_indices]
    trueTokenCount = (original_tokens == guessed_tokens).sum()

    elapsedTime = time.time() - started_at

    return trueTokenCount, elapsedTime, predicted_index
82 |
+
|
83 |
+
|
84 |
+
totalMaskedTokens = 0

from tqdm import tqdm

# For every recipe text: mask ~10% of its tokens, then let each model
# reconstruct them in a single forward pass.
for row in tqdm(df.tarif.values):
    # Turkish-aware lowercasing: map uppercase "I" to dotless "ı"
    # before lower(), since str.lower() maps "I" -> "i".
    text = row.replace("I", "ı").lower()
    inputs = tokenizer(text, return_tensors="pt", max_length=512, truncation=True)
    masked_input_ids, masked_indices = mask_tokens(inputs.input_ids)

    masked_input_ids = masked_input_ids.to("cuda")

    """ print("Original Text:", text)
    print(
        "Masked Text:",
        " ".join(tokenizer.convert_ids_to_tokens(masked_input_ids[0].tolist())),
    ) """

    # modernBert
    trueTokenCount, elapsedTime, predicted_index = getTrueTokenCountAndElapsedTime(
        modernBert, inputs, masked_input_ids, masked_indices
    )
    modernBertTrueTokenCount += trueTokenCount
    modernBertElapsedTime += elapsedTime
    # print("Predicted Text ModernBERT:", tokenizer.decode(predicted_index))

    # cosmos
    trueTokenCount, elapsedTime, predicted_index = getTrueTokenCountAndElapsedTime(
        cosmos, inputs, masked_input_ids, masked_indices
    )
    cosmosTrueTokenCount += trueTokenCount
    cosmosElapsedTime += elapsedTime
    # print("Predicted Text Cosmos BERT:", tokenizer.decode(predicted_index))

    # dbmdz
    trueTokenCount, elapsedTime, predicted_index = getTrueTokenCountAndElapsedTime(
        dbmdz, inputs, masked_input_ids, masked_indices
    )
    dbmdzTrueTokenCount += trueTokenCount
    dbmdzElapsedTime += elapsedTime
    # print("Predicted Text BERTurk:", tokenizer.decode(predicted_index))

    totalMaskedTokens += len(masked_indices)

# Raw totals: masked-token count, then per-model (correct, seconds).
print(totalMaskedTokens)
print(modernBertTrueTokenCount, modernBertElapsedTime)
print(cosmosTrueTokenCount, cosmosElapsedTime)
print(dbmdzTrueTokenCount, dbmdzElapsedTime)

# Masked-token prediction accuracy per model.
print(modernBertTrueTokenCount / totalMaskedTokens)
print(cosmosTrueTokenCount / totalMaskedTokens)
print(dbmdzTrueTokenCount / totalMaskedTokens)
|
assets/eval_recipes_long_context.py
ADDED
@@ -0,0 +1,111 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import os

# Cache locations must be set BEFORE transformers/HF libraries are
# imported, otherwise the default cache directories are used.
os.environ["TRANSFORMERS_CACHE"] = "./cache/transformersCache/"
os.environ["HF_HOME"] = "./cache/hgCache/"

import torch
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForMaskedLM, BertForMaskedLM
import pandas as pd
import time
import random
import torch

# Fixed seed so the randomly chosen mask positions are reproducible.
random.seed(42)

# Turkish recipe dataset.
# hg dataset name: SedatAl/Turkish_Recipe
df = pd.read_parquet("recipe.parquet")
print(df)

tokenizer = AutoTokenizer.from_pretrained(
    "99eren99/ModernBERT-base-Turkish-uncased-mlm", do_lower_case=False
)
tokenizer.truncation_side = "right"

# Only ModernBERT is evaluated in the long-context script -- presumably
# because the 8192-token inputs exceed the 512-token BERT baselines'
# context window (confirm against the short-context scripts).
modernBert = AutoModelForMaskedLM.from_pretrained(
    "99eren99/ModernBERT-base-Turkish-uncased-mlm",
)


# Evaluation only -- switch off dropout etc.
modernBert.eval()

# fp16 inference on GPU.
modernBert.to("cuda", dtype=torch.float16)
print(modernBert.dtype)


# Accumulators: correct masked-token predictions...
modernBertTrueTokenCount = 0

# ...and total forward-pass wall-clock seconds.
modernBertElapsedTime = 0
def mask_tokens(inputs, mask_ratio=0.15, mask_token_id=4):
    """Randomly mask a fraction of the non-special tokens of a tokenized text.

    Parameters
    ----------
    inputs : torch.Tensor
        Token-id tensor of shape (1, seq_len). Position 0 and the last
        position are never masked (assumed to be special tokens --
        TODO confirm for tokenizers that add a different number of
        special tokens).
    mask_ratio : float, optional
        Fraction of maskable positions to replace. Default 0.15,
        matching the previously hard-coded ratio.
    mask_token_id : int, optional
        Token id written into masked positions. Default 4, the mask id
        used throughout this script.

    Returns
    -------
    (torch.Tensor, list[int])
        A masked copy of ``inputs`` and the list of masked positions.
        ``inputs`` itself is left untouched.
    """
    masked = inputs.clone()

    # Candidate positions exclude the first and last token.
    candidates = list(range(1, len(inputs[0]) - 1))
    random.shuffle(candidates)

    masked_indices = candidates[: int(len(candidates) * mask_ratio)]

    masked[0][masked_indices] = mask_token_id

    return masked, masked_indices
52 |
+
|
53 |
+
|
54 |
+
def getTrueTokenCountAndElapsedTime(model, inputs, masked_input_ids, masked_indices):
    """Run ``model`` on the masked ids and score its masked-token guesses.

    Returns a ``(trueTokenCount, elapsedTime, predicted_index)`` tuple:
    the number of masked positions whose argmax prediction equals the
    original token, the wall-clock seconds spent (forward pass plus
    scoring), and the argmax token id for every position.
    """
    started_at = time.time()

    with torch.no_grad():
        logits = model(masked_input_ids).logits.cpu()

    # Greedy (argmax) prediction for each position of the single sequence.
    predicted_index = torch.argmax(logits[0], dim=-1)

    original_tokens = inputs.input_ids[0, masked_indices]
    guessed_tokens = predicted_index[masked_indices]
    trueTokenCount = (original_tokens == guessed_tokens).sum()

    elapsedTime = time.time() - started_at

    return trueTokenCount, elapsedTime, predicted_index
71 |
+
|
72 |
+
|
73 |
+
totalMaskedTokens = 0

from tqdm import tqdm

# Long-context evaluation: recipes are concatenated until they exceed
# ~6000 whitespace-separated words, then ~15% of the tokens of the whole
# chunk (up to 8192 tokens) are masked and predicted at once.
concatenatedText = ""
for row in tqdm(df.tarif.values):
    # Turkish-aware lowercasing: map uppercase "I" to dotless "ı"
    # before lower(), since str.lower() maps "I" -> "i".
    text = row.replace("I", "ı").lower()
    concatenatedText += text

    if len(concatenatedText.split()) > 6000:
        inputs = tokenizer(
            concatenatedText, return_tensors="pt", max_length=8192, truncation=True
        )
        masked_input_ids, masked_indices = mask_tokens(inputs.input_ids)

        masked_input_ids = masked_input_ids.to("cuda")

        """ print("Original Text:", text)
        print(len(masked_input_ids[0]))
        print(
            "Masked Text:",
            " ".join(tokenizer.convert_ids_to_tokens(masked_input_ids[0].tolist())),
        ) """

        # modernBert
        trueTokenCount, elapsedTime, predicted_index = getTrueTokenCountAndElapsedTime(
            modernBert, inputs, masked_input_ids, masked_indices
        )
        modernBertTrueTokenCount += trueTokenCount
        modernBertElapsedTime += elapsedTime
        # print("Predicted Text ModernBERT:", tokenizer.decode(predicted_index))

        totalMaskedTokens += len(masked_indices)
        # Start a fresh chunk.
        # NOTE(review): a trailing chunk shorter than 6000 words is never
        # evaluated -- presumably acceptable for benchmarking; confirm.
        concatenatedText = ""

# Raw totals: masked-token count, then (correct, seconds).
print(totalMaskedTokens)
print(modernBertTrueTokenCount, modernBertElapsedTime)

# Masked-token prediction accuracy.
print(modernBertTrueTokenCount / totalMaskedTokens)
|
assets/eval_results.jpg
ADDED
![]() |
assets/eval_reviews.py
ADDED
@@ -0,0 +1,134 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import os

# Cache locations must be set BEFORE transformers/HF libraries are
# imported, otherwise the default cache directories are used.
os.environ["TRANSFORMERS_CACHE"] = "./cache/transformersCache/"
os.environ["HF_HOME"] = "./cache/hgCache/"

import torch
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForMaskedLM, BertForMaskedLM
import pandas as pd
import time
import random
import torch

# Fixed seed so the randomly chosen mask positions are reproducible.
random.seed(42)

# Turkish product-review dataset (single parquet file).
# wget https://huggingface.co/datasets/Gliscor/turkishReviews-ds-mini/resolve/main/data/train-00000-of-00001.parquet
df = pd.read_parquet("reviews.parquet")
print(df)

# A single tokenizer produces the token ids fed to all three models.
# NOTE(review): this assumes the three checkpoints share compatible
# vocabularies/ids -- TODO confirm; otherwise per-model tokenization
# would be required.
tokenizer = AutoTokenizer.from_pretrained(
    "99eren99/ModernBERT-base-Turkish-uncased-mlm", do_lower_case=False
)
tokenizer.truncation_side = "right"

modernBert = AutoModelForMaskedLM.from_pretrained(
    "99eren99/ModernBERT-base-Turkish-uncased-mlm",
)

cosmos = AutoModelForMaskedLM.from_pretrained("ytu-ce-cosmos/turkish-base-bert-uncased")

dbmdz = AutoModelForMaskedLM.from_pretrained("dbmdz/bert-base-turkish-uncased")

# Evaluation only -- switch off dropout etc.
modernBert.eval()
cosmos.eval()
dbmdz.eval()

# ModernBERT runs in fp16 on GPU; the two BERT baselines stay in fp32.
modernBert.to("cuda", dtype=torch.float16)
print(modernBert.dtype)
cosmos.to("cuda")
dbmdz.to("cuda")


# Per-model accumulators: number of correctly reconstructed masked tokens...
modernBertTrueTokenCount = 0
cosmosTrueTokenCount = 0
dbmdzTrueTokenCount = 0

# ...and total forward-pass wall-clock seconds.
modernBertElapsedTime = 0
cosmosElapsedTime = 0
dbmdzElapsedTime = 0
def mask_tokens(inputs, mask_ratio=0.15, mask_token_id=4):
    """Randomly mask a fraction of the non-special tokens of a tokenized text.

    Parameters
    ----------
    inputs : torch.Tensor
        Token-id tensor of shape (1, seq_len). Position 0 and the last
        position are never masked (assumed to be special tokens --
        TODO confirm for tokenizers that add a different number of
        special tokens).
    mask_ratio : float, optional
        Fraction of maskable positions to replace. Default 0.15,
        matching the previously hard-coded ratio.
    mask_token_id : int, optional
        Token id written into masked positions. Default 4, the mask id
        used throughout this script.

    Returns
    -------
    (torch.Tensor, list[int])
        A masked copy of ``inputs`` and the list of masked positions.
        ``inputs`` itself is left untouched.
    """
    masked = inputs.clone()

    # Candidate positions exclude the first and last token.
    candidates = list(range(1, len(inputs[0]) - 1))
    random.shuffle(candidates)

    masked_indices = candidates[: int(len(candidates) * mask_ratio)]

    masked[0][masked_indices] = mask_token_id

    return masked, masked_indices
63 |
+
|
64 |
+
|
65 |
+
def getTrueTokenCountAndElapsedTime(model, inputs, masked_input_ids, masked_indices):
    """Run ``model`` on the masked ids and score its masked-token guesses.

    Returns a ``(trueTokenCount, elapsedTime, predicted_index)`` tuple:
    the number of masked positions whose argmax prediction equals the
    original token, the wall-clock seconds spent (forward pass plus
    scoring), and the argmax token id for every position.
    """
    started_at = time.time()

    with torch.no_grad():
        logits = model(masked_input_ids).logits.cpu()

    # Greedy (argmax) prediction for each position of the single sequence.
    predicted_index = torch.argmax(logits[0], dim=-1)

    original_tokens = inputs.input_ids[0, masked_indices]
    guessed_tokens = predicted_index[masked_indices]
    trueTokenCount = (original_tokens == guessed_tokens).sum()

    elapsedTime = time.time() - started_at

    return trueTokenCount, elapsedTime, predicted_index
82 |
+
|
83 |
+
|
84 |
+
totalMaskedTokens = 0

from tqdm import tqdm

# For every review text: mask ~15% of its tokens, then let each model
# reconstruct them in a single forward pass.
for row in tqdm(df.review.values):
    # Turkish-aware lowercasing: map uppercase "I" to dotless "ı"
    # before lower(), since str.lower() maps "I" -> "i".
    text = row.replace("I", "ı").lower()
    inputs = tokenizer(text, return_tensors="pt", max_length=512, truncation=True)
    masked_input_ids, masked_indices = mask_tokens(inputs.input_ids)

    masked_input_ids = masked_input_ids.to("cuda")

    """ print("Original Text:", text)
    print(
        "Masked Text:",
        " ".join(tokenizer.convert_ids_to_tokens(masked_input_ids[0].tolist())),
    ) """

    # modernBert
    trueTokenCount, elapsedTime, predicted_index = getTrueTokenCountAndElapsedTime(
        modernBert, inputs, masked_input_ids, masked_indices
    )
    modernBertTrueTokenCount += trueTokenCount
    modernBertElapsedTime += elapsedTime
    # print("Predicted Text ModernBERT:", tokenizer.decode(predicted_index))

    # cosmos
    trueTokenCount, elapsedTime, predicted_index = getTrueTokenCountAndElapsedTime(
        cosmos, inputs, masked_input_ids, masked_indices
    )
    cosmosTrueTokenCount += trueTokenCount
    cosmosElapsedTime += elapsedTime
    # print("Predicted Text Cosmos BERT:", tokenizer.decode(predicted_index))

    # dbmdz
    trueTokenCount, elapsedTime, predicted_index = getTrueTokenCountAndElapsedTime(
        dbmdz, inputs, masked_input_ids, masked_indices
    )
    dbmdzTrueTokenCount += trueTokenCount
    dbmdzElapsedTime += elapsedTime
    # print("Predicted Text BERTurk:", tokenizer.decode(predicted_index))

    totalMaskedTokens += len(masked_indices)

# Raw totals: masked-token count, then per-model (correct, seconds).
print(totalMaskedTokens)
print(modernBertTrueTokenCount, modernBertElapsedTime)
print(cosmosTrueTokenCount, cosmosElapsedTime)
print(dbmdzTrueTokenCount, dbmdzElapsedTime)

# Masked-token prediction accuracy per model.
print(modernBertTrueTokenCount / totalMaskedTokens)
print(cosmosTrueTokenCount / totalMaskedTokens)
print(dbmdzTrueTokenCount / totalMaskedTokens)
|
assets/eval_reviews_long_context.py
ADDED
@@ -0,0 +1,111 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import os

# Cache locations must be set BEFORE transformers/HF libraries are
# imported, otherwise the default cache directories are used.
os.environ["TRANSFORMERS_CACHE"] = "./cache/transformersCache/"
os.environ["HF_HOME"] = "./cache/hgCache/"

import torch
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForMaskedLM, BertForMaskedLM
import pandas as pd
import time
import random
import torch

# Fixed seed so the randomly chosen mask positions are reproducible.
random.seed(42)

# Turkish product-review dataset (single parquet file).
# wget https://huggingface.co/datasets/Gliscor/turkishReviews-ds-mini/resolve/main/data/train-00000-of-00001.parquet
df = pd.read_parquet("reviews.parquet")
print(df)

tokenizer = AutoTokenizer.from_pretrained(
    "99eren99/ModernBERT-base-Turkish-uncased-mlm", do_lower_case=False
)
tokenizer.truncation_side = "right"

# Only ModernBERT is evaluated in the long-context script -- presumably
# because the 8192-token inputs exceed the 512-token BERT baselines'
# context window (confirm against the short-context scripts).
modernBert = AutoModelForMaskedLM.from_pretrained(
    "99eren99/ModernBERT-base-Turkish-uncased-mlm",
)


# Evaluation only -- switch off dropout etc.
modernBert.eval()

# fp16 inference on GPU.
modernBert.to("cuda", dtype=torch.float16)
print(modernBert.dtype)


# Accumulators: correct masked-token predictions...
modernBertTrueTokenCount = 0

# ...and total forward-pass wall-clock seconds.
modernBertElapsedTime = 0
def mask_tokens(inputs, mask_ratio=0.15, mask_token_id=4):
    """Randomly mask a fraction of the non-special tokens of a tokenized text.

    Parameters
    ----------
    inputs : torch.Tensor
        Token-id tensor of shape (1, seq_len). Position 0 and the last
        position are never masked (assumed to be special tokens --
        TODO confirm for tokenizers that add a different number of
        special tokens).
    mask_ratio : float, optional
        Fraction of maskable positions to replace. Default 0.15,
        matching the previously hard-coded ratio.
    mask_token_id : int, optional
        Token id written into masked positions. Default 4, the mask id
        used throughout this script.

    Returns
    -------
    (torch.Tensor, list[int])
        A masked copy of ``inputs`` and the list of masked positions.
        ``inputs`` itself is left untouched.
    """
    masked = inputs.clone()

    # Candidate positions exclude the first and last token.
    candidates = list(range(1, len(inputs[0]) - 1))
    random.shuffle(candidates)

    masked_indices = candidates[: int(len(candidates) * mask_ratio)]

    masked[0][masked_indices] = mask_token_id

    return masked, masked_indices
52 |
+
|
53 |
+
|
54 |
+
def getTrueTokenCountAndElapsedTime(model, inputs, masked_input_ids, masked_indices):
    """Run ``model`` on the masked ids and score its masked-token guesses.

    Returns a ``(trueTokenCount, elapsedTime, predicted_index)`` tuple:
    the number of masked positions whose argmax prediction equals the
    original token, the wall-clock seconds spent (forward pass plus
    scoring), and the argmax token id for every position.
    """
    started_at = time.time()

    with torch.no_grad():
        logits = model(masked_input_ids).logits.cpu()

    # Greedy (argmax) prediction for each position of the single sequence.
    predicted_index = torch.argmax(logits[0], dim=-1)

    original_tokens = inputs.input_ids[0, masked_indices]
    guessed_tokens = predicted_index[masked_indices]
    trueTokenCount = (original_tokens == guessed_tokens).sum()

    elapsedTime = time.time() - started_at

    return trueTokenCount, elapsedTime, predicted_index
71 |
+
|
72 |
+
|
73 |
+
totalMaskedTokens = 0

from tqdm import tqdm

# Long-context evaluation: reviews are concatenated until they exceed
# ~6000 whitespace-separated words, then ~15% of the tokens of the whole
# chunk (up to 8192 tokens) are masked and predicted at once.
concatenatedText = ""
for row in tqdm(df.review.values):
    # Turkish-aware lowercasing: map uppercase "I" to dotless "ı"
    # before lower(), since str.lower() maps "I" -> "i".
    text = row.replace("I", "ı").lower()
    concatenatedText += text

    if len(concatenatedText.split()) > 6000:
        inputs = tokenizer(
            concatenatedText, return_tensors="pt", max_length=8192, truncation=True
        )
        masked_input_ids, masked_indices = mask_tokens(inputs.input_ids)

        masked_input_ids = masked_input_ids.to("cuda")

        """ print("Original Text:", text)
        print(len(masked_input_ids[0]))
        print(
            "Masked Text:",
            " ".join(tokenizer.convert_ids_to_tokens(masked_input_ids[0].tolist())),
        ) """

        # modernBert
        trueTokenCount, elapsedTime, predicted_index = getTrueTokenCountAndElapsedTime(
            modernBert, inputs, masked_input_ids, masked_indices
        )
        modernBertTrueTokenCount += trueTokenCount
        modernBertElapsedTime += elapsedTime
        # print("Predicted Text ModernBERT:", tokenizer.decode(predicted_index))

        totalMaskedTokens += len(masked_indices)
        # Start a fresh chunk.
        # NOTE(review): a trailing chunk shorter than 6000 words is never
        # evaluated -- presumably acceptable for benchmarking; confirm.
        concatenatedText = ""

# Raw totals: masked-token count, then (correct, seconds).
print(totalMaskedTokens)
print(modernBertTrueTokenCount, modernBertElapsedTime)

# Masked-token prediction accuracy.
print(modernBertTrueTokenCount / totalMaskedTokens)
|