99eren99 commited on
Commit
e555415
·
verified ·
1 Parent(s): 1f0a03b

Upload 8 files

Browse files
assets/cover.jpg ADDED
assets/eval_qa.py ADDED
@@ -0,0 +1,134 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# --- eval_qa.py: setup ------------------------------------------------------
# Compare ModernBERT-base-Turkish against two Turkish BERT baselines on a
# masked-token-recovery task over a QA-derived corpus.
import os

# Cache locations must be exported before transformers reads them on import.
# NOTE(review): TRANSFORMERS_CACHE is deprecated in newer transformers
# releases in favor of HF_HOME (also set below) — confirm against the
# installed version.
os.environ["TRANSFORMERS_CACHE"] = "./cache/transformersCache/"
os.environ["HF_HOME"] = "./cache/hgCache/"

import torch  # fix: was imported twice; the duplicate import is removed
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForMaskedLM, BertForMaskedLM
import pandas as pd
import time
import random

random.seed(42)  # make the random mask positions reproducible

# wget https://huggingface.co/datasets/blackerx/turkish_v2/resolve/main/data/train-00000-of-00001.parquet
df = pd.read_parquet("qa.parquet")
print(df)

# One shared tokenizer (ModernBERT's); truncate on the right when a text
# exceeds the model limit.
tokenizer = AutoTokenizer.from_pretrained(
    "99eren99/ModernBERT-base-Turkish-uncased-mlm", do_lower_case=False
)
tokenizer.truncation_side = "right"

modernBert = AutoModelForMaskedLM.from_pretrained(
    "99eren99/ModernBERT-base-Turkish-uncased-mlm",
)

cosmos = AutoModelForMaskedLM.from_pretrained("ytu-ce-cosmos/turkish-base-bert-uncased")

dbmdz = AutoModelForMaskedLM.from_pretrained("dbmdz/bert-base-turkish-uncased")

# Inference only — disable dropout etc.
modernBert.eval()
cosmos.eval()
dbmdz.eval()

modernBert.to("cuda", dtype=torch.float16)  # ModernBERT runs in fp16 on GPU
print(modernBert.dtype)
cosmos.to("cuda")
dbmdz.to("cuda")


# Per-model accumulators: correctly recovered masked tokens and wall time.
modernBertTrueTokenCount = 0
cosmosTrueTokenCount = 0
dbmdzTrueTokenCount = 0

modernBertElapsedTime = 0
cosmosElapsedTime = 0
dbmdzElapsedTime = 0
50
+
51
+
def mask_tokens(inputs, mask_ratio=0.05, mask_token_id=4):
    """Mask a random fraction of the inner tokens of a (1, seq_len) id tensor.

    Positions 0 and seq_len-1 are never masked (assumed to be the special
    begin/end tokens — TODO confirm against the tokenizer). Selection uses
    the module-level `random` RNG, so results are reproducible under
    random.seed().

    Args:
        inputs: LongTensor of shape (1, seq_len) with token ids; not modified.
        mask_ratio: fraction of maskable positions to overwrite (default 0.05,
            the ratio previously hard-coded here).
        mask_token_id: id written at the chosen positions (default 4 —
            presumably this tokenizer's [MASK] id; verify via
            tokenizer.mask_token_id).

    Returns:
        (masked_copy, masked_indices): a clone of `inputs` with the chosen
        positions overwritten, and the list of those positions (possibly
        empty for short sequences).
    """
    masked = inputs.clone()

    # Candidate positions exclude the first and last token.
    candidates = list(range(1, len(inputs[0]) - 1))
    random.shuffle(candidates)

    masked_indices = candidates[: int(len(candidates) * mask_ratio)]
    masked[0][masked_indices] = mask_token_id

    return masked, masked_indices
63
+
64
+
def getTrueTokenCountAndElapsedTime(model, inputs, masked_input_ids, masked_indices):
    """Run one forward pass and score masked-token recovery.

    Returns (trueTokenCount, elapsedTime, predicted_index): the number of
    masked positions whose argmax prediction matches the original token (as a
    0-d tensor), the wall-clock seconds spent, and the argmax token id for
    every position. NOTE(review): the timer covers the .cpu() transfer and
    the argmax/compare as well, not the forward pass alone.
    """
    t0 = time.time()
    with torch.no_grad():
        logits = model(masked_input_ids).logits.cpu()

    # Greedy prediction for every position in the sequence.
    predicted_index = logits[0].argmax(dim=-1)

    # Count masked positions recovered exactly.
    gold = inputs.input_ids[0, masked_indices]
    trueTokenCount = (gold == predicted_index[masked_indices]).sum()

    elapsedTime = time.time() - t0

    return trueTokenCount, elapsedTime, predicted_index
82
+
83
+
# Evaluation driver: mask ~5% of each text's tokens and measure how often
# every model recovers the original token, plus cumulative wall time.
# NOTE(review): the same ModernBERT token ids are fed to cosmos/dbmdz — that
# is only meaningful if all three models share a vocabulary; confirm.
totalMaskedTokens = 0

from tqdm import tqdm

for raw in tqdm(df.output.values):
    # Turkish-aware lowercasing: fold dotless I before calling .lower().
    normalized = raw.replace("I", "ı").lower()
    encoded = tokenizer(normalized, return_tensors="pt", max_length=512, truncation=True)
    model_input, mask_positions = mask_tokens(encoded.input_ids)

    model_input = model_input.to("cuda")

    # Debug aid (disabled):
    # print("Original Text:", normalized)
    # print("Masked Text:",
    #       " ".join(tokenizer.convert_ids_to_tokens(model_input[0].tolist())))

    # ModernBERT
    hits, took, preds = getTrueTokenCountAndElapsedTime(
        modernBert, encoded, model_input, mask_positions
    )
    modernBertTrueTokenCount += hits
    modernBertElapsedTime += took
    # print("Predicted Text ModernBERT:", tokenizer.decode(preds))

    # Cosmos BERT
    hits, took, preds = getTrueTokenCountAndElapsedTime(
        cosmos, encoded, model_input, mask_positions
    )
    cosmosTrueTokenCount += hits
    cosmosElapsedTime += took
    # print("Predicted Text Cosmos BERT:", tokenizer.decode(preds))

    # BERTurk (dbmdz)
    hits, took, preds = getTrueTokenCountAndElapsedTime(
        dbmdz, encoded, model_input, mask_positions
    )
    dbmdzTrueTokenCount += hits
    dbmdzElapsedTime += took
    # print("Predicted Text BERTurk:", tokenizer.decode(preds))

    totalMaskedTokens += len(mask_positions)

# Raw totals, then per-model masked-token accuracy.
print(totalMaskedTokens)
print(modernBertTrueTokenCount, modernBertElapsedTime)
print(cosmosTrueTokenCount, cosmosElapsedTime)
print(dbmdzTrueTokenCount, dbmdzElapsedTime)

print(modernBertTrueTokenCount / totalMaskedTokens)
print(cosmosTrueTokenCount / totalMaskedTokens)
print(dbmdzTrueTokenCount / totalMaskedTokens)
assets/eval_qa_long_context.py ADDED
@@ -0,0 +1,111 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# --- eval_qa_long_context.py: setup ----------------------------------------
# Long-context (8192-token) masked-token evaluation for
# ModernBERT-base-Turkish only.
import os

# Cache locations must be exported before transformers reads them on import.
# NOTE(review): TRANSFORMERS_CACHE is deprecated in newer transformers
# releases in favor of HF_HOME (also set below) — confirm.
os.environ["TRANSFORMERS_CACHE"] = "./cache/transformersCache/"
os.environ["HF_HOME"] = "./cache/hgCache/"

import torch  # fix: was imported twice; the duplicate import is removed
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForMaskedLM, BertForMaskedLM
import pandas as pd
import time
import random

random.seed(42)  # make the random mask positions reproducible

# wget https://huggingface.co/datasets/blackerx/turkish_v2/resolve/main/data/train-00000-of-00001.parquet
df = pd.read_parquet("qa.parquet")
print(df)

tokenizer = AutoTokenizer.from_pretrained(
    "99eren99/ModernBERT-base-Turkish-uncased-mlm", do_lower_case=False
)
tokenizer.truncation_side = "right"

modernBert = AutoModelForMaskedLM.from_pretrained(
    "99eren99/ModernBERT-base-Turkish-uncased-mlm",
)


modernBert.eval()  # inference only

modernBert.to("cuda", dtype=torch.float16)  # fp16 on GPU
print(modernBert.dtype)


# Accumulators: correctly recovered masked tokens and wall time.
modernBertTrueTokenCount = 0

modernBertElapsedTime = 0
39
+
40
+
def mask_tokens(inputs, mask_ratio=0.05, mask_token_id=4):
    """Mask a random fraction of the inner tokens of a (1, seq_len) id tensor.

    Positions 0 and seq_len-1 are never masked (assumed to be the special
    begin/end tokens — TODO confirm against the tokenizer). Selection uses
    the module-level `random` RNG, so results are reproducible under
    random.seed().

    Args:
        inputs: LongTensor of shape (1, seq_len) with token ids; not modified.
        mask_ratio: fraction of maskable positions to overwrite (default 0.05,
            the ratio previously hard-coded here).
        mask_token_id: id written at the chosen positions (default 4 —
            presumably this tokenizer's [MASK] id; verify via
            tokenizer.mask_token_id).

    Returns:
        (masked_copy, masked_indices): a clone of `inputs` with the chosen
        positions overwritten, and the list of those positions (possibly
        empty for short sequences).
    """
    masked = inputs.clone()

    # Candidate positions exclude the first and last token.
    candidates = list(range(1, len(inputs[0]) - 1))
    random.shuffle(candidates)

    masked_indices = candidates[: int(len(candidates) * mask_ratio)]
    masked[0][masked_indices] = mask_token_id

    return masked, masked_indices
52
+
53
+
def getTrueTokenCountAndElapsedTime(model, inputs, masked_input_ids, masked_indices):
    """Run one forward pass and score masked-token recovery.

    Returns (trueTokenCount, elapsedTime, predicted_index): the number of
    masked positions whose argmax prediction matches the original token (as a
    0-d tensor), the wall-clock seconds spent, and the argmax token id for
    every position. NOTE(review): the timer covers the .cpu() transfer and
    the argmax/compare as well, not the forward pass alone.
    """
    t0 = time.time()
    with torch.no_grad():
        logits = model(masked_input_ids).logits.cpu()

    # Greedy prediction for every position in the sequence.
    predicted_index = logits[0].argmax(dim=-1)

    # Count masked positions recovered exactly.
    gold = inputs.input_ids[0, masked_indices]
    trueTokenCount = (gold == predicted_index[masked_indices]).sum()

    elapsedTime = time.time() - t0

    return trueTokenCount, elapsedTime, predicted_index
71
+
72
+
# Long-context driver: concatenate rows until the buffer exceeds ~6000
# whitespace-separated words, then score one 8192-token window and reset.
totalMaskedTokens = 0

from tqdm import tqdm

buffer = ""
for raw in tqdm(df.output.values):
    # Turkish-aware lowercasing: fold dotless I before calling .lower().
    buffer += raw.replace("I", "ı").lower()

    if len(buffer.split()) > 6000:
        encoded = tokenizer(
            buffer, return_tensors="pt", max_length=8192, truncation=True
        )
        model_input, mask_positions = mask_tokens(encoded.input_ids)

        model_input = model_input.to("cuda")

        # Debug aid (disabled):
        # print(len(model_input[0]))
        # print("Masked Text:",
        #       " ".join(tokenizer.convert_ids_to_tokens(model_input[0].tolist())))

        # ModernBERT
        hits, took, preds = getTrueTokenCountAndElapsedTime(
            modernBert, encoded, model_input, mask_positions
        )
        modernBertTrueTokenCount += hits
        modernBertElapsedTime += took
        # print("Predicted Text ModernBERT:", tokenizer.decode(preds))

        totalMaskedTokens += len(mask_positions)
        buffer = ""

# Raw totals, then masked-token accuracy.
print(totalMaskedTokens)
print(modernBertTrueTokenCount, modernBertElapsedTime)

print(modernBertTrueTokenCount / totalMaskedTokens)
assets/eval_recipes.py ADDED
@@ -0,0 +1,134 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# --- eval_recipes.py: setup -------------------------------------------------
# Compare ModernBERT-base-Turkish against two Turkish BERT baselines on a
# masked-token-recovery task over a recipe corpus.
import os

# Cache locations must be exported before transformers reads them on import.
# NOTE(review): TRANSFORMERS_CACHE is deprecated in newer transformers
# releases in favor of HF_HOME (also set below) — confirm.
os.environ["TRANSFORMERS_CACHE"] = "./cache/transformersCache/"
os.environ["HF_HOME"] = "./cache/hgCache/"

import torch  # fix: was imported twice; the duplicate import is removed
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForMaskedLM, BertForMaskedLM
import pandas as pd
import time
import random

random.seed(42)  # make the random mask positions reproducible

# hg dataset name: SedatAl/Turkish_Recipe
df = pd.read_parquet("recipe.parquet")
print(df)

# One shared tokenizer (ModernBERT's); truncate on the right.
tokenizer = AutoTokenizer.from_pretrained(
    "99eren99/ModernBERT-base-Turkish-uncased-mlm", do_lower_case=False
)
tokenizer.truncation_side = "right"

modernBert = AutoModelForMaskedLM.from_pretrained(
    "99eren99/ModernBERT-base-Turkish-uncased-mlm",
)

cosmos = AutoModelForMaskedLM.from_pretrained("ytu-ce-cosmos/turkish-base-bert-uncased")

dbmdz = AutoModelForMaskedLM.from_pretrained("dbmdz/bert-base-turkish-uncased")

# Inference only — disable dropout etc.
modernBert.eval()
cosmos.eval()
dbmdz.eval()

modernBert.to("cuda", dtype=torch.float16)  # ModernBERT runs in fp16 on GPU
print(modernBert.dtype)
cosmos.to("cuda")
dbmdz.to("cuda")


# Per-model accumulators: correctly recovered masked tokens and wall time.
modernBertTrueTokenCount = 0
cosmosTrueTokenCount = 0
dbmdzTrueTokenCount = 0

modernBertElapsedTime = 0
cosmosElapsedTime = 0
dbmdzElapsedTime = 0
50
+
51
+
def mask_tokens(inputs, mask_ratio=0.1, mask_token_id=4):
    """Mask a random fraction of the inner tokens of a (1, seq_len) id tensor.

    Positions 0 and seq_len-1 are never masked (assumed to be the special
    begin/end tokens — TODO confirm against the tokenizer). Selection uses
    the module-level `random` RNG, so results are reproducible under
    random.seed().

    Args:
        inputs: LongTensor of shape (1, seq_len) with token ids; not modified.
        mask_ratio: fraction of maskable positions to overwrite (default 0.1,
            the ratio previously hard-coded here).
        mask_token_id: id written at the chosen positions (default 4 —
            presumably this tokenizer's [MASK] id; verify via
            tokenizer.mask_token_id).

    Returns:
        (masked_copy, masked_indices): a clone of `inputs` with the chosen
        positions overwritten, and the list of those positions (possibly
        empty for short sequences).
    """
    masked = inputs.clone()

    # Candidate positions exclude the first and last token.
    candidates = list(range(1, len(inputs[0]) - 1))
    random.shuffle(candidates)

    masked_indices = candidates[: int(len(candidates) * mask_ratio)]
    masked[0][masked_indices] = mask_token_id

    return masked, masked_indices
63
+
64
+
def getTrueTokenCountAndElapsedTime(model, inputs, masked_input_ids, masked_indices):
    """Run one forward pass and score masked-token recovery.

    Returns (trueTokenCount, elapsedTime, predicted_index): the number of
    masked positions whose argmax prediction matches the original token (as a
    0-d tensor), the wall-clock seconds spent, and the argmax token id for
    every position. NOTE(review): the timer covers the .cpu() transfer and
    the argmax/compare as well, not the forward pass alone.
    """
    t0 = time.time()
    with torch.no_grad():
        logits = model(masked_input_ids).logits.cpu()

    # Greedy prediction for every position in the sequence.
    predicted_index = logits[0].argmax(dim=-1)

    # Count masked positions recovered exactly.
    gold = inputs.input_ids[0, masked_indices]
    trueTokenCount = (gold == predicted_index[masked_indices]).sum()

    elapsedTime = time.time() - t0

    return trueTokenCount, elapsedTime, predicted_index
82
+
83
+
# Evaluation driver: mask ~10% of each recipe's tokens and measure how often
# every model recovers the original token, plus cumulative wall time.
# NOTE(review): the same ModernBERT token ids are fed to cosmos/dbmdz — that
# is only meaningful if all three models share a vocabulary; confirm.
totalMaskedTokens = 0

from tqdm import tqdm

for raw in tqdm(df.tarif.values):
    # Turkish-aware lowercasing: fold dotless I before calling .lower().
    normalized = raw.replace("I", "ı").lower()
    encoded = tokenizer(normalized, return_tensors="pt", max_length=512, truncation=True)
    model_input, mask_positions = mask_tokens(encoded.input_ids)

    model_input = model_input.to("cuda")

    # Debug aid (disabled):
    # print("Original Text:", normalized)
    # print("Masked Text:",
    #       " ".join(tokenizer.convert_ids_to_tokens(model_input[0].tolist())))

    # ModernBERT
    hits, took, preds = getTrueTokenCountAndElapsedTime(
        modernBert, encoded, model_input, mask_positions
    )
    modernBertTrueTokenCount += hits
    modernBertElapsedTime += took
    # print("Predicted Text ModernBERT:", tokenizer.decode(preds))

    # Cosmos BERT
    hits, took, preds = getTrueTokenCountAndElapsedTime(
        cosmos, encoded, model_input, mask_positions
    )
    cosmosTrueTokenCount += hits
    cosmosElapsedTime += took
    # print("Predicted Text Cosmos BERT:", tokenizer.decode(preds))

    # BERTurk (dbmdz)
    hits, took, preds = getTrueTokenCountAndElapsedTime(
        dbmdz, encoded, model_input, mask_positions
    )
    dbmdzTrueTokenCount += hits
    dbmdzElapsedTime += took
    # print("Predicted Text BERTurk:", tokenizer.decode(preds))

    totalMaskedTokens += len(mask_positions)

# Raw totals, then per-model masked-token accuracy.
print(totalMaskedTokens)
print(modernBertTrueTokenCount, modernBertElapsedTime)
print(cosmosTrueTokenCount, cosmosElapsedTime)
print(dbmdzTrueTokenCount, dbmdzElapsedTime)

print(modernBertTrueTokenCount / totalMaskedTokens)
print(cosmosTrueTokenCount / totalMaskedTokens)
print(dbmdzTrueTokenCount / totalMaskedTokens)
assets/eval_recipes_long_context.py ADDED
@@ -0,0 +1,111 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# --- eval_recipes_long_context.py: setup -----------------------------------
# Long-context (8192-token) masked-token evaluation for
# ModernBERT-base-Turkish only, over a recipe corpus.
import os

# Cache locations must be exported before transformers reads them on import.
# NOTE(review): TRANSFORMERS_CACHE is deprecated in newer transformers
# releases in favor of HF_HOME (also set below) — confirm.
os.environ["TRANSFORMERS_CACHE"] = "./cache/transformersCache/"
os.environ["HF_HOME"] = "./cache/hgCache/"

import torch  # fix: was imported twice; the duplicate import is removed
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForMaskedLM, BertForMaskedLM
import pandas as pd
import time
import random

random.seed(42)  # make the random mask positions reproducible

# hg dataset name: SedatAl/Turkish_Recipe
df = pd.read_parquet("recipe.parquet")
print(df)

tokenizer = AutoTokenizer.from_pretrained(
    "99eren99/ModernBERT-base-Turkish-uncased-mlm", do_lower_case=False
)
tokenizer.truncation_side = "right"

modernBert = AutoModelForMaskedLM.from_pretrained(
    "99eren99/ModernBERT-base-Turkish-uncased-mlm",
)


modernBert.eval()  # inference only

modernBert.to("cuda", dtype=torch.float16)  # fp16 on GPU
print(modernBert.dtype)


# Accumulators: correctly recovered masked tokens and wall time.
modernBertTrueTokenCount = 0

modernBertElapsedTime = 0
39
+
40
+
def mask_tokens(inputs, mask_ratio=0.15, mask_token_id=4):
    """Mask a random fraction of the inner tokens of a (1, seq_len) id tensor.

    Positions 0 and seq_len-1 are never masked (assumed to be the special
    begin/end tokens — TODO confirm against the tokenizer). Selection uses
    the module-level `random` RNG, so results are reproducible under
    random.seed().

    Args:
        inputs: LongTensor of shape (1, seq_len) with token ids; not modified.
        mask_ratio: fraction of maskable positions to overwrite (default 0.15,
            the ratio previously hard-coded here).
        mask_token_id: id written at the chosen positions (default 4 —
            presumably this tokenizer's [MASK] id; verify via
            tokenizer.mask_token_id).

    Returns:
        (masked_copy, masked_indices): a clone of `inputs` with the chosen
        positions overwritten, and the list of those positions (possibly
        empty for short sequences).
    """
    masked = inputs.clone()

    # Candidate positions exclude the first and last token.
    candidates = list(range(1, len(inputs[0]) - 1))
    random.shuffle(candidates)

    masked_indices = candidates[: int(len(candidates) * mask_ratio)]
    masked[0][masked_indices] = mask_token_id

    return masked, masked_indices
52
+
53
+
def getTrueTokenCountAndElapsedTime(model, inputs, masked_input_ids, masked_indices):
    """Run one forward pass and score masked-token recovery.

    Returns (trueTokenCount, elapsedTime, predicted_index): the number of
    masked positions whose argmax prediction matches the original token (as a
    0-d tensor), the wall-clock seconds spent, and the argmax token id for
    every position. NOTE(review): the timer covers the .cpu() transfer and
    the argmax/compare as well, not the forward pass alone.
    """
    t0 = time.time()
    with torch.no_grad():
        logits = model(masked_input_ids).logits.cpu()

    # Greedy prediction for every position in the sequence.
    predicted_index = logits[0].argmax(dim=-1)

    # Count masked positions recovered exactly.
    gold = inputs.input_ids[0, masked_indices]
    trueTokenCount = (gold == predicted_index[masked_indices]).sum()

    elapsedTime = time.time() - t0

    return trueTokenCount, elapsedTime, predicted_index
71
+
72
+
# Long-context driver: concatenate recipes until the buffer exceeds ~6000
# whitespace-separated words, then score one 8192-token window and reset.
totalMaskedTokens = 0

from tqdm import tqdm

buffer = ""
for raw in tqdm(df.tarif.values):
    # Turkish-aware lowercasing: fold dotless I before calling .lower().
    buffer += raw.replace("I", "ı").lower()

    if len(buffer.split()) > 6000:
        encoded = tokenizer(
            buffer, return_tensors="pt", max_length=8192, truncation=True
        )
        model_input, mask_positions = mask_tokens(encoded.input_ids)

        model_input = model_input.to("cuda")

        # Debug aid (disabled):
        # print(len(model_input[0]))
        # print("Masked Text:",
        #       " ".join(tokenizer.convert_ids_to_tokens(model_input[0].tolist())))

        # ModernBERT
        hits, took, preds = getTrueTokenCountAndElapsedTime(
            modernBert, encoded, model_input, mask_positions
        )
        modernBertTrueTokenCount += hits
        modernBertElapsedTime += took
        # print("Predicted Text ModernBERT:", tokenizer.decode(preds))

        totalMaskedTokens += len(mask_positions)
        buffer = ""

# Raw totals, then masked-token accuracy.
print(totalMaskedTokens)
print(modernBertTrueTokenCount, modernBertElapsedTime)

print(modernBertTrueTokenCount / totalMaskedTokens)
assets/eval_results.jpg ADDED
assets/eval_reviews.py ADDED
@@ -0,0 +1,134 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# --- eval_reviews.py: setup -------------------------------------------------
# Compare ModernBERT-base-Turkish against two Turkish BERT baselines on a
# masked-token-recovery task over a product-review corpus.
import os

# Cache locations must be exported before transformers reads them on import.
# NOTE(review): TRANSFORMERS_CACHE is deprecated in newer transformers
# releases in favor of HF_HOME (also set below) — confirm.
os.environ["TRANSFORMERS_CACHE"] = "./cache/transformersCache/"
os.environ["HF_HOME"] = "./cache/hgCache/"

import torch  # fix: was imported twice; the duplicate import is removed
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForMaskedLM, BertForMaskedLM
import pandas as pd
import time
import random

random.seed(42)  # make the random mask positions reproducible

# wget https://huggingface.co/datasets/Gliscor/turkishReviews-ds-mini/resolve/main/data/train-00000-of-00001.parquet
df = pd.read_parquet("reviews.parquet")
print(df)

# One shared tokenizer (ModernBERT's); truncate on the right.
tokenizer = AutoTokenizer.from_pretrained(
    "99eren99/ModernBERT-base-Turkish-uncased-mlm", do_lower_case=False
)
tokenizer.truncation_side = "right"

modernBert = AutoModelForMaskedLM.from_pretrained(
    "99eren99/ModernBERT-base-Turkish-uncased-mlm",
)

cosmos = AutoModelForMaskedLM.from_pretrained("ytu-ce-cosmos/turkish-base-bert-uncased")

dbmdz = AutoModelForMaskedLM.from_pretrained("dbmdz/bert-base-turkish-uncased")

# Inference only — disable dropout etc.
modernBert.eval()
cosmos.eval()
dbmdz.eval()

modernBert.to("cuda", dtype=torch.float16)  # ModernBERT runs in fp16 on GPU
print(modernBert.dtype)
cosmos.to("cuda")
dbmdz.to("cuda")


# Per-model accumulators: correctly recovered masked tokens and wall time.
modernBertTrueTokenCount = 0
cosmosTrueTokenCount = 0
dbmdzTrueTokenCount = 0

modernBertElapsedTime = 0
cosmosElapsedTime = 0
dbmdzElapsedTime = 0
50
+
51
+
def mask_tokens(inputs, mask_ratio=0.15, mask_token_id=4):
    """Mask a random fraction of the inner tokens of a (1, seq_len) id tensor.

    Positions 0 and seq_len-1 are never masked (assumed to be the special
    begin/end tokens — TODO confirm against the tokenizer). Selection uses
    the module-level `random` RNG, so results are reproducible under
    random.seed().

    Args:
        inputs: LongTensor of shape (1, seq_len) with token ids; not modified.
        mask_ratio: fraction of maskable positions to overwrite (default 0.15,
            the ratio previously hard-coded here).
        mask_token_id: id written at the chosen positions (default 4 —
            presumably this tokenizer's [MASK] id; verify via
            tokenizer.mask_token_id).

    Returns:
        (masked_copy, masked_indices): a clone of `inputs` with the chosen
        positions overwritten, and the list of those positions (possibly
        empty for short sequences).
    """
    masked = inputs.clone()

    # Candidate positions exclude the first and last token.
    candidates = list(range(1, len(inputs[0]) - 1))
    random.shuffle(candidates)

    masked_indices = candidates[: int(len(candidates) * mask_ratio)]
    masked[0][masked_indices] = mask_token_id

    return masked, masked_indices
63
+
64
+
def getTrueTokenCountAndElapsedTime(model, inputs, masked_input_ids, masked_indices):
    """Run one forward pass and score masked-token recovery.

    Returns (trueTokenCount, elapsedTime, predicted_index): the number of
    masked positions whose argmax prediction matches the original token (as a
    0-d tensor), the wall-clock seconds spent, and the argmax token id for
    every position. NOTE(review): the timer covers the .cpu() transfer and
    the argmax/compare as well, not the forward pass alone.
    """
    t0 = time.time()
    with torch.no_grad():
        logits = model(masked_input_ids).logits.cpu()

    # Greedy prediction for every position in the sequence.
    predicted_index = logits[0].argmax(dim=-1)

    # Count masked positions recovered exactly.
    gold = inputs.input_ids[0, masked_indices]
    trueTokenCount = (gold == predicted_index[masked_indices]).sum()

    elapsedTime = time.time() - t0

    return trueTokenCount, elapsedTime, predicted_index
82
+
83
+
# Evaluation driver: mask ~15% of each review's tokens and measure how often
# every model recovers the original token, plus cumulative wall time.
# NOTE(review): the same ModernBERT token ids are fed to cosmos/dbmdz — that
# is only meaningful if all three models share a vocabulary; confirm.
totalMaskedTokens = 0

from tqdm import tqdm

for raw in tqdm(df.review.values):
    # Turkish-aware lowercasing: fold dotless I before calling .lower().
    normalized = raw.replace("I", "ı").lower()
    encoded = tokenizer(normalized, return_tensors="pt", max_length=512, truncation=True)
    model_input, mask_positions = mask_tokens(encoded.input_ids)

    model_input = model_input.to("cuda")

    # Debug aid (disabled):
    # print("Original Text:", normalized)
    # print("Masked Text:",
    #       " ".join(tokenizer.convert_ids_to_tokens(model_input[0].tolist())))

    # ModernBERT
    hits, took, preds = getTrueTokenCountAndElapsedTime(
        modernBert, encoded, model_input, mask_positions
    )
    modernBertTrueTokenCount += hits
    modernBertElapsedTime += took
    # print("Predicted Text ModernBERT:", tokenizer.decode(preds))

    # Cosmos BERT
    hits, took, preds = getTrueTokenCountAndElapsedTime(
        cosmos, encoded, model_input, mask_positions
    )
    cosmosTrueTokenCount += hits
    cosmosElapsedTime += took
    # print("Predicted Text Cosmos BERT:", tokenizer.decode(preds))

    # BERTurk (dbmdz)
    hits, took, preds = getTrueTokenCountAndElapsedTime(
        dbmdz, encoded, model_input, mask_positions
    )
    dbmdzTrueTokenCount += hits
    dbmdzElapsedTime += took
    # print("Predicted Text BERTurk:", tokenizer.decode(preds))

    totalMaskedTokens += len(mask_positions)

# Raw totals, then per-model masked-token accuracy.
print(totalMaskedTokens)
print(modernBertTrueTokenCount, modernBertElapsedTime)
print(cosmosTrueTokenCount, cosmosElapsedTime)
print(dbmdzTrueTokenCount, dbmdzElapsedTime)

print(modernBertTrueTokenCount / totalMaskedTokens)
print(cosmosTrueTokenCount / totalMaskedTokens)
print(dbmdzTrueTokenCount / totalMaskedTokens)
assets/eval_reviews_long_context.py ADDED
@@ -0,0 +1,111 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# --- eval_reviews_long_context.py: setup -----------------------------------
# Long-context (8192-token) masked-token evaluation for
# ModernBERT-base-Turkish only, over a product-review corpus.
import os

# Cache locations must be exported before transformers reads them on import.
# NOTE(review): TRANSFORMERS_CACHE is deprecated in newer transformers
# releases in favor of HF_HOME (also set below) — confirm.
os.environ["TRANSFORMERS_CACHE"] = "./cache/transformersCache/"
os.environ["HF_HOME"] = "./cache/hgCache/"

import torch  # fix: was imported twice; the duplicate import is removed
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForMaskedLM, BertForMaskedLM
import pandas as pd
import time
import random

random.seed(42)  # make the random mask positions reproducible

# wget https://huggingface.co/datasets/Gliscor/turkishReviews-ds-mini/resolve/main/data/train-00000-of-00001.parquet
df = pd.read_parquet("reviews.parquet")
print(df)

tokenizer = AutoTokenizer.from_pretrained(
    "99eren99/ModernBERT-base-Turkish-uncased-mlm", do_lower_case=False
)
tokenizer.truncation_side = "right"

modernBert = AutoModelForMaskedLM.from_pretrained(
    "99eren99/ModernBERT-base-Turkish-uncased-mlm",
)


modernBert.eval()  # inference only

modernBert.to("cuda", dtype=torch.float16)  # fp16 on GPU
print(modernBert.dtype)


# Accumulators: correctly recovered masked tokens and wall time.
modernBertTrueTokenCount = 0

modernBertElapsedTime = 0
39
+
40
+
def mask_tokens(inputs, mask_ratio=0.15, mask_token_id=4):
    """Mask a random fraction of the inner tokens of a (1, seq_len) id tensor.

    Positions 0 and seq_len-1 are never masked (assumed to be the special
    begin/end tokens — TODO confirm against the tokenizer). Selection uses
    the module-level `random` RNG, so results are reproducible under
    random.seed().

    Args:
        inputs: LongTensor of shape (1, seq_len) with token ids; not modified.
        mask_ratio: fraction of maskable positions to overwrite (default 0.15,
            the ratio previously hard-coded here).
        mask_token_id: id written at the chosen positions (default 4 —
            presumably this tokenizer's [MASK] id; verify via
            tokenizer.mask_token_id).

    Returns:
        (masked_copy, masked_indices): a clone of `inputs` with the chosen
        positions overwritten, and the list of those positions (possibly
        empty for short sequences).
    """
    masked = inputs.clone()

    # Candidate positions exclude the first and last token.
    candidates = list(range(1, len(inputs[0]) - 1))
    random.shuffle(candidates)

    masked_indices = candidates[: int(len(candidates) * mask_ratio)]
    masked[0][masked_indices] = mask_token_id

    return masked, masked_indices
52
+
53
+
def getTrueTokenCountAndElapsedTime(model, inputs, masked_input_ids, masked_indices):
    """Run one forward pass and score masked-token recovery.

    Returns (trueTokenCount, elapsedTime, predicted_index): the number of
    masked positions whose argmax prediction matches the original token (as a
    0-d tensor), the wall-clock seconds spent, and the argmax token id for
    every position. NOTE(review): the timer covers the .cpu() transfer and
    the argmax/compare as well, not the forward pass alone.
    """
    t0 = time.time()
    with torch.no_grad():
        logits = model(masked_input_ids).logits.cpu()

    # Greedy prediction for every position in the sequence.
    predicted_index = logits[0].argmax(dim=-1)

    # Count masked positions recovered exactly.
    gold = inputs.input_ids[0, masked_indices]
    trueTokenCount = (gold == predicted_index[masked_indices]).sum()

    elapsedTime = time.time() - t0

    return trueTokenCount, elapsedTime, predicted_index
71
+
72
+
# Long-context driver: concatenate reviews until the buffer exceeds ~6000
# whitespace-separated words, then score one 8192-token window and reset.
totalMaskedTokens = 0

from tqdm import tqdm

buffer = ""
for raw in tqdm(df.review.values):
    # Turkish-aware lowercasing: fold dotless I before calling .lower().
    buffer += raw.replace("I", "ı").lower()

    if len(buffer.split()) > 6000:
        encoded = tokenizer(
            buffer, return_tensors="pt", max_length=8192, truncation=True
        )
        model_input, mask_positions = mask_tokens(encoded.input_ids)

        model_input = model_input.to("cuda")

        # Debug aid (disabled):
        # print(len(model_input[0]))
        # print("Masked Text:",
        #       " ".join(tokenizer.convert_ids_to_tokens(model_input[0].tolist())))

        # ModernBERT
        hits, took, preds = getTrueTokenCountAndElapsedTime(
            modernBert, encoded, model_input, mask_positions
        )
        modernBertTrueTokenCount += hits
        modernBertElapsedTime += took
        # print("Predicted Text ModernBERT:", tokenizer.decode(preds))

        totalMaskedTokens += len(mask_positions)
        buffer = ""

# Raw totals, then masked-token accuracy.
print(totalMaskedTokens)
print(modernBertTrueTokenCount, modernBertElapsedTime)

print(modernBertTrueTokenCount / totalMaskedTokens)