strongpear committed on
Commit
5ad5256
1 Parent(s): f149182

Add new SentenceTransformer model.

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
1_Pooling/config.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "word_embedding_dimension": 1024,
3
+ "pooling_mode_cls_token": false,
4
+ "pooling_mode_mean_tokens": true,
5
+ "pooling_mode_max_tokens": false,
6
+ "pooling_mode_mean_sqrt_len_tokens": false,
7
+ "pooling_mode_weightedmean_tokens": false,
8
+ "pooling_mode_lasttoken": false,
9
+ "include_prompt": true
10
+ }
README.md ADDED
@@ -0,0 +1,452 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: strongpear/M3-retriever-Vi-Text2SQL
3
+ datasets: []
4
+ language: []
5
+ library_name: sentence-transformers
6
+ pipeline_tag: sentence-similarity
7
+ tags:
8
+ - sentence-transformers
9
+ - sentence-similarity
10
+ - feature-extraction
11
+ - generated_from_trainer
12
+ - dataset_size:1312839
13
+ - loss:TripletLoss
14
+ widget:
15
+ - source_sentence: Đã bao nhiêu giờ trôi qua kể từ khi bệnh nhân 006-172277 nhập viện?
16
+ sentences:
17
+ - CREATE TABLE chi phí(chi phí number,duy nhất text,bệnh nhânhệ thống sức khỏelưu
18
+ trúid number,loại sự kiện text,id sự kiện number,thời gian tính phí time,chi phí
19
+ number)
20
+ - CREATE TABLE chuyến bay fare(flight id int,fare id int)
21
+ - CREATE TABLE disease(uniquepid text, Patienthealthsystemstayid number, bệnh nhân
22
+ đơn vị lưu trú number, giới tính text, tuổi text, dân tộc text, bệnh viện number,
23
+ khu bệnh number, chiều cao nhập viện number, cân nặng nhập viện number, cân nặng
24
+ xuất viện number, thời gian nhập viện time, nguồn nhập viện text, thời gian xuất
25
+ viện time, trạng thái xuất viện text)
26
+ - source_sentence: tôi cần một chuyến bay từ NEWARK đến LOS ANGELES khởi hành vào
27
+ tối mai
28
+ sentences:
29
+ - CREATE TABLE time zone(time zone code text,time zone name text,hours from gmt
30
+ int)
31
+ - CREATE TABLE city(city code varchar,city name varchar,state code varchar,tên quốc
32
+ gia varchar,múi thời gian varchar);
33
+ - CREATE TABLE sân bay(airport code varchar,airport name text,airport location text,state
34
+ code varchar,country name varchar,time zone code varchar,minimum connect time
35
+ int);
36
+ - source_sentence: có bao nhiêu giám đốc từ năm 2000 đến năm 2009?
37
+ sentences:
38
+ - CREATE TABLE Bài viết(Id number,PostTypeId number,AcceptedAnswerId number,ParentId
39
+ number,CreationDate time,DeletionDate time,Score number,ViewCount number,Body
40
+ text,OwnerUserId number,OwnerDisplayName text,LastEditorUserId number,LastEditorDisplayName
41
+ text,LastEditDate time,LastActivityDate time,Tiêu đề text,Thẻ text,Đếm trả lời
42
+ number,Đếm bình luận number,Đếm yêu thích number,Ngày đóng time,Ngày sở hữu cộng
43
+ đồng time,Giấy phép nội dung text)
44
+ - CREATE TABLE table 11239("Ngày" text,"Đội thăm quan" text,"Tỷ số cuối cùng" text,"Đội
45
+ đăng cai" text,"Sân vận động" text)
46
+ - CREATE TABLE table 203 141(id number,"sr.no." number,"tên" text,"từ" text,"cho
47
+ đến" text,"nghề nghiệp" text)
48
+ - source_sentence: Tên đầy đủ, các phòng ban, thành phố, tỉnh của từng nhân viên là
49
+ gì?
50
+ sentences:
51
+ - CREATE TABLE phòng ban(department id number,department name text,manager id number,location
52
+ id number)
53
+ - CREATE TABLE khu vực(khu vực id number,tên khu vực text)
54
+ - CREATE TABLE tuyển sinh(row id number,subject id number,hadm id number,admittime
55
+ time,dischtime time,admission type text,admission location text,discharge location
56
+ text,bảo hiểm text,ngôn ngữ text,hôn nhân status text,dân tộc text,age number);
57
+ - source_sentence: Năm 2011 là bao nhiêu khi năm 2009 là 'MỘT'?
58
+ sentences:
59
+ - CREATE TABLE table name 27(điểm VARCHAR,đội khách VARCHAR,date VARCHAR)
60
+ - CREATE TABLE Bài viết(Id number,PostTypeId number,AcceptedAnswerId number,ParentId
61
+ number,CreationDate time,DeletionDate time,Score number,ViewCount number,Body
62
+ text,OwnerUserId number,OwnerDisplayName text,LastEditorUserId number,LastEditorDisplayName
63
+ text,LastEditDate time,LastActivityDate time,Title text,Tags text,AnswerCount
64
+ number,CommentCount number,FavoriteCount number,ClosedDate time,CommunityOwnedDate
65
+ time,ContentLicen text)
66
+ - CREATE TABLE table 61807("Giải đấu" text,"2009" text,"2010" text,"2011" text,"2012"
67
+ text)
68
+ ---
69
+
70
+ # SentenceTransformer based on strongpear/M3-retriever-Vi-Text2SQL
71
+
72
+ This is a [sentence-transformers](https://www.SBERT.net) model finetuned from [strongpear/M3-retriever-Vi-Text2SQL](https://huggingface.co/strongpear/M3-retriever-Vi-Text2SQL). It maps sentences & paragraphs to a 1024-dimensional dense vector space and can be used for semantic textual similarity, semantic search, paraphrase mining, text classification, clustering, and more.
73
+
74
+ ## Model Details
75
+
76
+ ### Model Description
77
+ - **Model Type:** Sentence Transformer
78
+ - **Base model:** [strongpear/M3-retriever-Vi-Text2SQL](https://huggingface.co/strongpear/M3-retriever-Vi-Text2SQL) <!-- at revision 667aec7aa21d1757ba3ac3147790eb39997eedc7 -->
79
+ - **Maximum Sequence Length:** 8192 tokens
80
+ - **Output Dimensionality:** 1024 dimensions
81
+ - **Similarity Function:** Cosine Similarity
82
+ <!-- - **Training Dataset:** Unknown -->
83
+ <!-- - **Language:** Unknown -->
84
+ <!-- - **License:** Unknown -->
85
+
86
+ ### Model Sources
87
+
88
+ - **Documentation:** [Sentence Transformers Documentation](https://sbert.net)
89
+ - **Repository:** [Sentence Transformers on GitHub](https://github.com/UKPLab/sentence-transformers)
90
+ - **Hugging Face:** [Sentence Transformers on Hugging Face](https://huggingface.co/models?library=sentence-transformers)
91
+
92
+ ### Full Model Architecture
93
+
94
+ ```
95
+ SentenceTransformer(
96
+ (0): Transformer({'max_seq_length': 8192, 'do_lower_case': False}) with Transformer model: XLMRobertaModel
97
+ (1): Pooling({'word_embedding_dimension': 1024, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
98
+ )
99
+ ```
100
+
101
+ ## Usage
102
+
103
+ ### Direct Usage (Sentence Transformers)
104
+
105
+ First install the Sentence Transformers library:
106
+
107
+ ```bash
108
+ pip install -U sentence-transformers
109
+ ```
110
+
111
+ Then you can load this model and run inference.
112
+ ```python
113
+ from sentence_transformers import SentenceTransformer
114
+
115
+ # Download from the 🤗 Hub
116
+ model = SentenceTransformer("strongpear/M3-retriever-Vi-Text2SQL_ver2")
117
+ # Run inference
118
+ sentences = [
119
+ "Năm 2011 là bao nhiêu khi năm 2009 là 'MỘT'?",
120
+ 'CREATE TABLE table 61807("Giải đấu" text,"2009" text,"2010" text,"2011" text,"2012" text)',
121
+ 'CREATE TABLE table name 27(điểm VARCHAR,đội khách VARCHAR,date VARCHAR)',
122
+ ]
123
+ embeddings = model.encode(sentences)
124
+ print(embeddings.shape)
125
+ # [3, 1024]
126
+
127
+ # Get the similarity scores for the embeddings
128
+ similarities = model.similarity(embeddings, embeddings)
129
+ print(similarities.shape)
130
+ # [3, 3]
131
+ ```
132
+
133
+ <!--
134
+ ### Direct Usage (Transformers)
135
+
136
+ <details><summary>Click to see the direct usage in Transformers</summary>
137
+
138
+ </details>
139
+ -->
140
+
141
+ <!--
142
+ ### Downstream Usage (Sentence Transformers)
143
+
144
+ You can finetune this model on your own dataset.
145
+
146
+ <details><summary>Click to expand</summary>
147
+
148
+ </details>
149
+ -->
150
+
151
+ <!--
152
+ ### Out-of-Scope Use
153
+
154
+ *List how the model may foreseeably be misused and address what users ought not to do with the model.*
155
+ -->
156
+
157
+ <!--
158
+ ## Bias, Risks and Limitations
159
+
160
+ *What are the known or foreseeable issues stemming from this model? You could also flag here known failure cases or weaknesses of the model.*
161
+ -->
162
+
163
+ <!--
164
+ ### Recommendations
165
+
166
+ *What are recommendations with respect to the foreseeable issues? For example, filtering explicit content.*
167
+ -->
168
+
169
+ ## Training Details
170
+
171
+ ### Training Dataset
172
+
173
+ #### Unnamed Dataset
174
+
175
+
176
+ * Size: 1,312,839 training samples
177
+ * Columns: <code>anchor</code>, <code>positive</code>, and <code>negative</code>
178
+ * Approximate statistics based on the first 1000 samples:
179
+ | | anchor | positive | negative |
180
+ |:--------|:-----------------------------------------------------------------------------------|:------------------------------------------------------------------------------------|:------------------------------------------------------------------------------------|
181
+ | type | string | string | string |
182
+ | details | <ul><li>min: 6 tokens</li><li>mean: 22.66 tokens</li><li>max: 323 tokens</li></ul> | <ul><li>min: 12 tokens</li><li>mean: 56.47 tokens</li><li>max: 159 tokens</li></ul> | <ul><li>min: 11 tokens</li><li>mean: 41.05 tokens</li><li>max: 621 tokens</li></ul> |
183
+ * Samples:
184
+ | anchor | positive | negative |
185
+ |:------------------------------------------------------------------------------------------------------|:------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|:---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
186
+ | <code>Chủ đề StackOverflow vào tháng 5 năm 2010. Sẽ thật tuyệt nếu có một biểu đồ, tôi biết :)</code> | <code>CREATE TABLE Bài viết(Id number,PostTypeId number,AcceptedAnswerId number,ParentId number,CreationDate time,DeletionDate time,Score number,ViewCount number,Nội dung text,OwnerUserId number,OwnerDisplayName text,LastEditorUserId number,LastEditorDisplayName text,LastEditDate time,LastActivityDate time,Title text,Tags text,AnswerCount number,CommentCount number,FavoriteCount number,ClosedDate time,CommunityOwnedDate time,ContentLince text);</code> | <code>CREATE TABLE PostTypes(Id number,Name text)</code> |
187
+ | <code>sao2 của bệnh nhân 31854 trong lần khám tại bệnh viện hiện tại có bình thường không?</code> | <code>CREATE TABLE icustays(row id number,subject id number,hadm id number,icustay id number,first careunit text,last careunit text,first wardid number,last wardid number,intime time,outtime time);</code> | <code>CREATE TABLE inputevents cv(row id number,subject id number,hadm id number,icustay id number,charttime time,itemid number,amount number)</code> |
188
+ | <code>chuyến bay nào từ PITTSBURGH đến ATLANTA vào sáng thứ Tư phục vụ BỮA SÁNG</code> | <code>CREATE TABLE sân bay(airport code varchar,airport name text,airport location text,state code varchar,country name varchar,time zone code varchar,minimum connect time int);</code> | <code>CREATE TABLE máy bay(aircraft code varchar,aircraft description varchar,nhà sản xuất varchar,basic type varchar,engines int,động cơ varchar,thân rộng varchar,sải cánh int,chiều dài int,trọng lượng int,công suất int,tải trả int,tốc độ bay int,phạm vi dặm int,có áp suất varchar)</code> |
189
+ * Loss: [<code>TripletLoss</code>](https://sbert.net/docs/package_reference/sentence_transformer/losses.html#tripletloss) with these parameters:
190
+ ```json
191
+ {
192
+ "distance_metric": "TripletDistanceMetric.EUCLIDEAN",
193
+ "triplet_margin": 5
194
+ }
195
+ ```
196
+
197
+ ### Evaluation Dataset
198
+
199
+ #### Unnamed Dataset
200
+
201
+
202
+ * Size: 69,098 evaluation samples
203
+ * Columns: <code>anchor</code>, <code>positive</code>, and <code>negative</code>
204
+ * Approximate statistics based on the first 1000 samples:
205
+ | | anchor | positive | negative |
206
+ |:--------|:-----------------------------------------------------------------------------------|:-----------------------------------------------------------------------------------|:------------------------------------------------------------------------------------|
207
+ | type | string | string | string |
208
+ | details | <ul><li>min: 8 tokens</li><li>mean: 23.11 tokens</li><li>max: 323 tokens</li></ul> | <ul><li>min: 7 tokens</li><li>mean: 57.77 tokens</li><li>max: 181 tokens</li></ul> | <ul><li>min: 12 tokens</li><li>mean: 42.41 tokens</li><li>max: 207 tokens</li></ul> |
209
+ * Samples:
210
+ | anchor | positive | negative |
211
+ |:----------------------------------------------------------------------------------------------------------------------------------------------------------|:---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|:-----------------------------------------------------------------------------------------------------------------------------------------------------------------|
212
+ | <code>Đã bao nhiêu ngày kể từ lần cuối bệnh nhân 1561 nhận được một lượng lactate ringer trong lần thăm khám tại phòng chăm sóc đặc biệt hiện tại?</code> | <code>CREATE TABLE inputevents cv(row id number,subject id number,hadm id number,icustay id number,charttime time,itemid number,amount number);</code> | <code>CREATE TABLE d icd diagnoses(row id number,icd9 code text,short title text,tiêu đề dài text)</code> |
213
+ | <code>Có nhiều giáo sư dạy ECON 631 trong kỳ Xuân Hè 2003 không?</code> | <code>CREATE TABLE giảng viên(instructor id int,name varchar,tên uniq varchar);</code> | <code>CREATE TABLE học kỳ(semester id int,semester varchar,year int)</code> |
214
+ | <code>bệnh nhân 011-55642 có được chẩn đoán mắc bệnh gì trong lần khám tại bệnh viện hiện tại không?</code> | <code>CREATE TABLE bệnh nhân(uniquepid text,bệnh nhânhealthsystemstayid number,bệnh nhân đơn vị ở lạiid number,giới tính text,tuổi text,dân tộc text,bệnh viện number,khu bệnh number,chiều cao nhập viện number,cân nặng nhập viện number,cân nặng xuất viện number,thời gian nhập viện time,nguồn nhập viện text,thời gian nhập viện đơn vị time,thời gian nhập viện time,thời gian xuất viện time,trạng thái xuất viện text)</code> | <code>CREATE TABLE inputoutput(intakeoutputid number,Patientunitstayid number,cellpath text,celllabel text,cellvaluenumeric number,intakeoutputtime time)</code> |
215
+ * Loss: [<code>TripletLoss</code>](https://sbert.net/docs/package_reference/sentence_transformer/losses.html#tripletloss) with these parameters:
216
+ ```json
217
+ {
218
+ "distance_metric": "TripletDistanceMetric.EUCLIDEAN",
219
+ "triplet_margin": 5
220
+ }
221
+ ```
222
+
223
+ ### Training Hyperparameters
224
+ #### Non-Default Hyperparameters
225
+
226
+ - `eval_strategy`: steps
227
+ - `gradient_accumulation_steps`: 4
228
+ - `learning_rate`: 1.08e-08
229
+ - `weight_decay`: 0.1
230
+ - `num_train_epochs`: 1
231
+ - `warmup_steps`: 500
232
+ - `fp16`: True
233
+ - `remove_unused_columns`: False
234
+
235
+ #### All Hyperparameters
236
+ <details><summary>Click to expand</summary>
237
+
238
+ - `overwrite_output_dir`: False
239
+ - `do_predict`: False
240
+ - `eval_strategy`: steps
241
+ - `prediction_loss_only`: True
242
+ - `per_device_train_batch_size`: 8
243
+ - `per_device_eval_batch_size`: 8
244
+ - `per_gpu_train_batch_size`: None
245
+ - `per_gpu_eval_batch_size`: None
246
+ - `gradient_accumulation_steps`: 4
247
+ - `eval_accumulation_steps`: None
248
+ - `torch_empty_cache_steps`: None
249
+ - `learning_rate`: 1.08e-08
250
+ - `weight_decay`: 0.1
251
+ - `adam_beta1`: 0.9
252
+ - `adam_beta2`: 0.999
253
+ - `adam_epsilon`: 1e-08
254
+ - `max_grad_norm`: 1.0
255
+ - `num_train_epochs`: 1
256
+ - `max_steps`: -1
257
+ - `lr_scheduler_type`: linear
258
+ - `lr_scheduler_kwargs`: {}
259
+ - `warmup_ratio`: 0.0
260
+ - `warmup_steps`: 500
261
+ - `log_level`: passive
262
+ - `log_level_replica`: warning
263
+ - `log_on_each_node`: True
264
+ - `logging_nan_inf_filter`: True
265
+ - `save_safetensors`: True
266
+ - `save_on_each_node`: False
267
+ - `save_only_model`: False
268
+ - `restore_callback_states_from_checkpoint`: False
269
+ - `no_cuda`: False
270
+ - `use_cpu`: False
271
+ - `use_mps_device`: False
272
+ - `seed`: 42
273
+ - `data_seed`: None
274
+ - `jit_mode_eval`: False
275
+ - `use_ipex`: False
276
+ - `bf16`: False
277
+ - `fp16`: True
278
+ - `fp16_opt_level`: O1
279
+ - `half_precision_backend`: auto
280
+ - `bf16_full_eval`: False
281
+ - `fp16_full_eval`: False
282
+ - `tf32`: None
283
+ - `local_rank`: 0
284
+ - `ddp_backend`: None
285
+ - `tpu_num_cores`: None
286
+ - `tpu_metrics_debug`: False
287
+ - `debug`: []
288
+ - `dataloader_drop_last`: False
289
+ - `dataloader_num_workers`: 0
290
+ - `dataloader_prefetch_factor`: None
291
+ - `past_index`: -1
292
+ - `disable_tqdm`: False
293
+ - `remove_unused_columns`: False
294
+ - `label_names`: None
295
+ - `load_best_model_at_end`: False
296
+ - `ignore_data_skip`: False
297
+ - `fsdp`: []
298
+ - `fsdp_min_num_params`: 0
299
+ - `fsdp_config`: {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}
300
+ - `fsdp_transformer_layer_cls_to_wrap`: None
301
+ - `accelerator_config`: {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None}
302
+ - `deepspeed`: None
303
+ - `label_smoothing_factor`: 0.0
304
+ - `optim`: adamw_torch
305
+ - `optim_args`: None
306
+ - `adafactor`: False
307
+ - `group_by_length`: False
308
+ - `length_column_name`: length
309
+ - `ddp_find_unused_parameters`: None
310
+ - `ddp_bucket_cap_mb`: None
311
+ - `ddp_broadcast_buffers`: False
312
+ - `dataloader_pin_memory`: True
313
+ - `dataloader_persistent_workers`: False
314
+ - `skip_memory_metrics`: True
315
+ - `use_legacy_prediction_loop`: False
316
+ - `push_to_hub`: False
317
+ - `resume_from_checkpoint`: None
318
+ - `hub_model_id`: None
319
+ - `hub_strategy`: every_save
320
+ - `hub_private_repo`: False
321
+ - `hub_always_push`: False
322
+ - `gradient_checkpointing`: False
323
+ - `gradient_checkpointing_kwargs`: None
324
+ - `include_inputs_for_metrics`: False
325
+ - `eval_do_concat_batches`: True
326
+ - `fp16_backend`: auto
327
+ - `push_to_hub_model_id`: None
328
+ - `push_to_hub_organization`: None
329
+ - `mp_parameters`:
330
+ - `auto_find_batch_size`: False
331
+ - `full_determinism`: False
332
+ - `torchdynamo`: None
333
+ - `ray_scope`: last
334
+ - `ddp_timeout`: 1800
335
+ - `torch_compile`: False
336
+ - `torch_compile_backend`: None
337
+ - `torch_compile_mode`: None
338
+ - `dispatch_batches`: None
339
+ - `split_batches`: None
340
+ - `include_tokens_per_second`: False
341
+ - `include_num_input_tokens_seen`: False
342
+ - `neftune_noise_alpha`: None
343
+ - `optim_target_modules`: None
344
+ - `batch_eval_metrics`: False
345
+ - `eval_on_start`: False
346
+ - `eval_use_gather_object`: False
347
+ - `batch_sampler`: batch_sampler
348
+ - `multi_dataset_batch_sampler`: proportional
349
+
350
+ </details>
351
+
352
+ ### Training Logs
353
+ | Epoch  | Step  | Training Loss | Validation Loss |
354
+ |:------:|:-----:|:-------------:|:------:|
355
+ | 0.0244 | 1000 | 0.0334 | 0.0444 |
356
+ | 0.0487 | 2000 | 0.0284 | 0.0444 |
357
+ | 0.0731 | 3000 | 0.0304 | 0.0444 |
358
+ | 0.0975 | 4000 | 0.0244 | 0.0444 |
359
+ | 0.1219 | 5000 | 0.0276 | 0.0444 |
360
+ | 0.1462 | 6000 | 0.0286 | 0.0444 |
361
+ | 0.1706 | 7000 | 0.0276 | 0.0444 |
362
+ | 0.1950 | 8000 | 0.0252 | 0.0444 |
363
+ | 0.2194 | 9000 | 0.0267 | 0.0444 |
364
+ | 0.2437 | 10000 | 0.0258 | 0.0444 |
365
+ | 0.2681 | 11000 | 0.0216 | 0.0444 |
366
+ | 0.2925 | 12000 | 0.0224 | 0.0444 |
367
+ | 0.3169 | 13000 | 0.0209 | 0.0443 |
368
+ | 0.3412 | 14000 | 0.0211 | 0.0443 |
369
+ | 0.3656 | 15000 | 0.0221 | 0.0443 |
370
+ | 0.3900 | 16000 | 0.0183 | 0.0443 |
371
+ | 0.4144 | 17000 | 0.0207 | 0.0443 |
372
+ | 0.4387 | 18000 | 0.0175 | 0.0443 |
373
+ | 0.4631 | 19000 | 0.0192 | 0.0443 |
374
+ | 0.4875 | 20000 | 0.016 | 0.0443 |
375
+ | 0.5119 | 21000 | 0.0208 | 0.0443 |
376
+ | 0.5362 | 22000 | 0.0165 | 0.0443 |
377
+ | 0.5606 | 23000 | 0.016 | 0.0443 |
378
+ | 0.5850 | 24000 | 0.0141 | 0.0443 |
379
+ | 0.6094 | 25000 | 0.0185 | 0.0443 |
380
+ | 0.6337 | 26000 | 0.0143 | 0.0443 |
381
+ | 0.6581 | 27000 | 0.0175 | 0.0443 |
382
+ | 0.6825 | 28000 | 0.0155 | 0.0443 |
383
+ | 0.7069 | 29000 | 0.0172 | 0.0443 |
384
+ | 0.7312 | 30000 | 0.0174 | 0.0443 |
385
+ | 0.7556 | 31000 | 0.0185 | 0.0443 |
386
+ | 0.7800 | 32000 | 0.0166 | 0.0443 |
387
+ | 0.8044 | 33000 | 0.0171 | 0.0443 |
388
+ | 0.8287 | 34000 | 0.018 | 0.0443 |
389
+ | 0.8531 | 35000 | 0.0194 | 0.0443 |
390
+ | 0.8775 | 36000 | 0.0228 | 0.0443 |
391
+ | 0.9019 | 37000 | 0.0239 | 0.0443 |
392
+ | 0.9262 | 38000 | 0.0262 | 0.0443 |
393
+ | 0.9506 | 39000 | 0.0313 | 0.0443 |
394
+ | 0.9750 | 40000 | 0.0314 | 0.0443 |
395
+ | 0.9994 | 41000 | 0.0461 | 0.0443 |
396
+
397
+
398
+ ### Framework Versions
399
+ - Python: 3.9.19
400
+ - Sentence Transformers: 3.0.1
401
+ - Transformers: 4.44.2
402
+ - PyTorch: 2.4.0+cu121
403
+ - Accelerate: 0.34.2
404
+ - Datasets: 2.21.0
405
+ - Tokenizers: 0.19.1
406
+
407
+ ## Citation
408
+
409
+ ### BibTeX
410
+
411
+ #### Sentence Transformers
412
+ ```bibtex
413
+ @inproceedings{reimers-2019-sentence-bert,
414
+ title = "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks",
415
+ author = "Reimers, Nils and Gurevych, Iryna",
416
+ booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing",
417
+ month = "11",
418
+ year = "2019",
419
+ publisher = "Association for Computational Linguistics",
420
+ url = "https://arxiv.org/abs/1908.10084",
421
+ }
422
+ ```
423
+
424
+ #### TripletLoss
425
+ ```bibtex
426
+ @misc{hermans2017defense,
427
+ title={In Defense of the Triplet Loss for Person Re-Identification},
428
+ author={Alexander Hermans and Lucas Beyer and Bastian Leibe},
429
+ year={2017},
430
+ eprint={1703.07737},
431
+ archivePrefix={arXiv},
432
+ primaryClass={cs.CV}
433
+ }
434
+ ```
435
+
436
+ <!--
437
+ ## Glossary
438
+
439
+ *Clearly define terms in order to be accessible across audiences.*
440
+ -->
441
+
442
+ <!--
443
+ ## Model Card Authors
444
+
445
+ *Lists the people who create the model card, providing recognition and accountability for the detailed work that goes into its construction.*
446
+ -->
447
+
448
+ <!--
449
+ ## Model Card Contact
450
+
451
+ *Provides a way for people who have updates to the Model Card, suggestions, or questions, to contact the Model Card authors.*
452
+ -->
added_tokens.json ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "ALTER TABLE": 250021,
3
+ "ASC": 250025,
4
+ "AVG": 250028,
5
+ "BEGIN": 250035,
6
+ "BETWEEN": 250014,
7
+ "CASE": 250032,
8
+ "COMMIT": 250037,
9
+ "CONSTRAINT": 250044,
10
+ "COUNT": 250027,
11
+ "CREATE TABLE": 250020,
12
+ "DELETE FROM": 250019,
13
+ "DESC": 250026,
14
+ "DISTINCT": 250023,
15
+ "DROP TABLE": 250022,
16
+ "EXCEPT": 250030,
17
+ "FOREIGN KEY": 250041,
18
+ "FROM": 250003,
19
+ "GROUP BY": 250005,
20
+ "HAVING": 250024,
21
+ "INDEX": 250043,
22
+ "INNER JOIN": 250008,
23
+ "INSERT INTO": 250016,
24
+ "INTERSECT": 250031,
25
+ "IS NULL": 250015,
26
+ "JOIN": 250007,
27
+ "LEFT JOIN": 250009,
28
+ "LIKE": 250013,
29
+ "NOT": 250012,
30
+ "ORDER BY": 250006,
31
+ "OUTER JOIN": 250011,
32
+ "PRIMARY KEY": 250040,
33
+ "REFERENCES": 250042,
34
+ "RIGHT JOIN": 250010,
35
+ "ROLLBACK": 250036,
36
+ "SAVEPOINT": 250038,
37
+ "SELECT": 250002,
38
+ "THEN": 250034,
39
+ "TRANSACTION": 250039,
40
+ "UNION": 250029,
41
+ "UPDATE": 250018,
42
+ "VALUES": 250017,
43
+ "WHEN": 250033,
44
+ "WHERE": 250004
45
+ }
config.json ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "models/retrieve_models/checkpoint/M3-retriever-Vi-Text2SQL_phase3_continue-2024-09-26_10-24-27/checkpoint-41026/",
3
+ "architectures": [
4
+ "XLMRobertaModel"
5
+ ],
6
+ "attention_probs_dropout_prob": 0.1,
7
+ "bos_token_id": 0,
8
+ "classifier_dropout": null,
9
+ "eos_token_id": 2,
10
+ "hidden_act": "gelu",
11
+ "hidden_dropout_prob": 0.1,
12
+ "hidden_size": 1024,
13
+ "initializer_range": 0.02,
14
+ "intermediate_size": 4096,
15
+ "layer_norm_eps": 1e-05,
16
+ "max_position_embeddings": 8194,
17
+ "model_type": "xlm-roberta",
18
+ "num_attention_heads": 16,
19
+ "num_hidden_layers": 24,
20
+ "output_past": true,
21
+ "pad_token_id": 1,
22
+ "position_embedding_type": "absolute",
23
+ "torch_dtype": "float32",
24
+ "transformers_version": "4.44.2",
25
+ "type_vocab_size": 1,
26
+ "use_cache": true,
27
+ "vocab_size": 250045
28
+ }
config_sentence_transformers.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "__version__": {
3
+ "sentence_transformers": "3.0.1",
4
+ "transformers": "4.44.2",
5
+ "pytorch": "2.4.0+cu121"
6
+ },
7
+ "prompts": {},
8
+ "default_prompt_name": null,
9
+ "similarity_fn_name": null
10
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:be3db859d849d5463257b65ca3c9281bc6b899d21a36b82a60e9bd00cd222166
3
+ size 2271240584
modules.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "idx": 0,
4
+ "name": "0",
5
+ "path": "",
6
+ "type": "sentence_transformers.models.Transformer"
7
+ },
8
+ {
9
+ "idx": 1,
10
+ "name": "1",
11
+ "path": "1_Pooling",
12
+ "type": "sentence_transformers.models.Pooling"
13
+ }
14
+ ]
sentence_bert_config.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "max_seq_length": 8192,
3
+ "do_lower_case": false
4
+ }
sentencepiece.bpe.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cfc8146abe2a0488e9e2a0c56de7952f7c11ab059eca145a0a727afce0db2865
3
+ size 5069051
special_tokens_map.json ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "cls_token": {
10
+ "content": "<s>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "eos_token": {
17
+ "content": "</s>",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "mask_token": {
24
+ "content": "<mask>",
25
+ "lstrip": true,
26
+ "normalized": false,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ },
30
+ "pad_token": {
31
+ "content": "</s>",
32
+ "lstrip": false,
33
+ "normalized": false,
34
+ "rstrip": false,
35
+ "single_word": false
36
+ },
37
+ "sep_token": {
38
+ "content": "</s>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false
43
+ },
44
+ "unk_token": {
45
+ "content": "<unk>",
46
+ "lstrip": false,
47
+ "normalized": false,
48
+ "rstrip": false,
49
+ "single_word": false
50
+ }
51
+ }
tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e17d0483713b5aa6528e795bebbf0560fabaf365ad253d8fadee8953daf154c9
3
+ size 17093474
tokenizer_config.json ADDED
@@ -0,0 +1,518 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "<s>",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "1": {
12
+ "content": "<pad>",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "2": {
20
+ "content": "</s>",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "3": {
28
+ "content": "<unk>",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "8852": {
36
+ "content": "IN",
37
+ "lstrip": false,
38
+ "normalized": true,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": false
42
+ },
43
+ "9713": {
44
+ "content": "AS",
45
+ "lstrip": false,
46
+ "normalized": true,
47
+ "rstrip": false,
48
+ "single_word": false,
49
+ "special": false
50
+ },
51
+ "15513": {
52
+ "content": "ON",
53
+ "lstrip": false,
54
+ "normalized": true,
55
+ "rstrip": false,
56
+ "single_word": false,
57
+ "special": false
58
+ },
59
+ "24638": {
60
+ "content": "OR",
61
+ "lstrip": false,
62
+ "normalized": true,
63
+ "rstrip": false,
64
+ "single_word": false,
65
+ "special": false
66
+ },
67
+ "39015": {
68
+ "content": "END",
69
+ "lstrip": false,
70
+ "normalized": true,
71
+ "rstrip": false,
72
+ "single_word": false,
73
+ "special": false
74
+ },
75
+ "45029": {
76
+ "content": "AND",
77
+ "lstrip": false,
78
+ "normalized": true,
79
+ "rstrip": false,
80
+ "single_word": false,
81
+ "special": false
82
+ },
83
+ "54080": {
84
+ "content": "SET",
85
+ "lstrip": false,
86
+ "normalized": true,
87
+ "rstrip": false,
88
+ "single_word": false,
89
+ "special": false
90
+ },
91
+ "66437": {
92
+ "content": "ALL",
93
+ "lstrip": false,
94
+ "normalized": true,
95
+ "rstrip": false,
96
+ "single_word": false,
97
+ "special": false
98
+ },
99
+ "70675": {
100
+ "content": "TOP",
101
+ "lstrip": false,
102
+ "normalized": true,
103
+ "rstrip": false,
104
+ "single_word": false,
105
+ "special": false
106
+ },
107
+ "83948": {
108
+ "content": "MAX",
109
+ "lstrip": false,
110
+ "normalized": true,
111
+ "rstrip": false,
112
+ "single_word": false,
113
+ "special": false
114
+ },
115
+ "84139": {
116
+ "content": "MIN",
117
+ "lstrip": false,
118
+ "normalized": true,
119
+ "rstrip": false,
120
+ "single_word": false,
121
+ "special": false
122
+ },
123
+ "93333": {
124
+ "content": "ANY",
125
+ "lstrip": false,
126
+ "normalized": true,
127
+ "rstrip": false,
128
+ "single_word": false,
129
+ "special": false
130
+ },
131
+ "96922": {
132
+ "content": "ELSE",
133
+ "lstrip": false,
134
+ "normalized": true,
135
+ "rstrip": false,
136
+ "single_word": false,
137
+ "special": false
138
+ },
139
+ "127944": {
140
+ "content": "SUM",
141
+ "lstrip": false,
142
+ "normalized": true,
143
+ "rstrip": false,
144
+ "single_word": false,
145
+ "special": false
146
+ },
147
+ "250001": {
148
+ "content": "<mask>",
149
+ "lstrip": true,
150
+ "normalized": false,
151
+ "rstrip": false,
152
+ "single_word": false,
153
+ "special": true
154
+ },
155
+ "250002": {
156
+ "content": "SELECT",
157
+ "lstrip": false,
158
+ "normalized": true,
159
+ "rstrip": false,
160
+ "single_word": false,
161
+ "special": false
162
+ },
163
+ "250003": {
164
+ "content": "FROM",
165
+ "lstrip": false,
166
+ "normalized": true,
167
+ "rstrip": false,
168
+ "single_word": false,
169
+ "special": false
170
+ },
171
+ "250004": {
172
+ "content": "WHERE",
173
+ "lstrip": false,
174
+ "normalized": true,
175
+ "rstrip": false,
176
+ "single_word": false,
177
+ "special": false
178
+ },
179
+ "250005": {
180
+ "content": "GROUP BY",
181
+ "lstrip": false,
182
+ "normalized": true,
183
+ "rstrip": false,
184
+ "single_word": false,
185
+ "special": false
186
+ },
187
+ "250006": {
188
+ "content": "ORDER BY",
189
+ "lstrip": false,
190
+ "normalized": true,
191
+ "rstrip": false,
192
+ "single_word": false,
193
+ "special": false
194
+ },
195
+ "250007": {
196
+ "content": "JOIN",
197
+ "lstrip": false,
198
+ "normalized": true,
199
+ "rstrip": false,
200
+ "single_word": false,
201
+ "special": false
202
+ },
203
+ "250008": {
204
+ "content": "INNER JOIN",
205
+ "lstrip": false,
206
+ "normalized": true,
207
+ "rstrip": false,
208
+ "single_word": false,
209
+ "special": false
210
+ },
211
+ "250009": {
212
+ "content": "LEFT JOIN",
213
+ "lstrip": false,
214
+ "normalized": true,
215
+ "rstrip": false,
216
+ "single_word": false,
217
+ "special": false
218
+ },
219
+ "250010": {
220
+ "content": "RIGHT JOIN",
221
+ "lstrip": false,
222
+ "normalized": true,
223
+ "rstrip": false,
224
+ "single_word": false,
225
+ "special": false
226
+ },
227
+ "250011": {
228
+ "content": "OUTER JOIN",
229
+ "lstrip": false,
230
+ "normalized": true,
231
+ "rstrip": false,
232
+ "single_word": false,
233
+ "special": false
234
+ },
235
+ "250012": {
236
+ "content": "NOT",
237
+ "lstrip": false,
238
+ "normalized": true,
239
+ "rstrip": false,
240
+ "single_word": false,
241
+ "special": false
242
+ },
243
+ "250013": {
244
+ "content": "LIKE",
245
+ "lstrip": false,
246
+ "normalized": true,
247
+ "rstrip": false,
248
+ "single_word": false,
249
+ "special": false
250
+ },
251
+ "250014": {
252
+ "content": "BETWEEN",
253
+ "lstrip": false,
254
+ "normalized": true,
255
+ "rstrip": false,
256
+ "single_word": false,
257
+ "special": false
258
+ },
259
+ "250015": {
260
+ "content": "IS NULL",
261
+ "lstrip": false,
262
+ "normalized": true,
263
+ "rstrip": false,
264
+ "single_word": false,
265
+ "special": false
266
+ },
267
+ "250016": {
268
+ "content": "INSERT INTO",
269
+ "lstrip": false,
270
+ "normalized": true,
271
+ "rstrip": false,
272
+ "single_word": false,
273
+ "special": false
274
+ },
275
+ "250017": {
276
+ "content": "VALUES",
277
+ "lstrip": false,
278
+ "normalized": true,
279
+ "rstrip": false,
280
+ "single_word": false,
281
+ "special": false
282
+ },
283
+ "250018": {
284
+ "content": "UPDATE",
285
+ "lstrip": false,
286
+ "normalized": true,
287
+ "rstrip": false,
288
+ "single_word": false,
289
+ "special": false
290
+ },
291
+ "250019": {
292
+ "content": "DELETE FROM",
293
+ "lstrip": false,
294
+ "normalized": true,
295
+ "rstrip": false,
296
+ "single_word": false,
297
+ "special": false
298
+ },
299
+ "250020": {
300
+ "content": "CREATE TABLE",
301
+ "lstrip": false,
302
+ "normalized": true,
303
+ "rstrip": false,
304
+ "single_word": false,
305
+ "special": false
306
+ },
307
+ "250021": {
308
+ "content": "ALTER TABLE",
309
+ "lstrip": false,
310
+ "normalized": true,
311
+ "rstrip": false,
312
+ "single_word": false,
313
+ "special": false
314
+ },
315
+ "250022": {
316
+ "content": "DROP TABLE",
317
+ "lstrip": false,
318
+ "normalized": true,
319
+ "rstrip": false,
320
+ "single_word": false,
321
+ "special": false
322
+ },
323
+ "250023": {
324
+ "content": "DISTINCT",
325
+ "lstrip": false,
326
+ "normalized": true,
327
+ "rstrip": false,
328
+ "single_word": false,
329
+ "special": false
330
+ },
331
+ "250024": {
332
+ "content": "HAVING",
333
+ "lstrip": false,
334
+ "normalized": true,
335
+ "rstrip": false,
336
+ "single_word": false,
337
+ "special": false
338
+ },
339
+ "250025": {
340
+ "content": "ASC",
341
+ "lstrip": false,
342
+ "normalized": true,
343
+ "rstrip": false,
344
+ "single_word": false,
345
+ "special": false
346
+ },
347
+ "250026": {
348
+ "content": "DESC",
349
+ "lstrip": false,
350
+ "normalized": true,
351
+ "rstrip": false,
352
+ "single_word": false,
353
+ "special": false
354
+ },
355
+ "250027": {
356
+ "content": "COUNT",
357
+ "lstrip": false,
358
+ "normalized": true,
359
+ "rstrip": false,
360
+ "single_word": false,
361
+ "special": false
362
+ },
363
+ "250028": {
364
+ "content": "AVG",
365
+ "lstrip": false,
366
+ "normalized": true,
367
+ "rstrip": false,
368
+ "single_word": false,
369
+ "special": false
370
+ },
371
+ "250029": {
372
+ "content": "UNION",
373
+ "lstrip": false,
374
+ "normalized": true,
375
+ "rstrip": false,
376
+ "single_word": false,
377
+ "special": false
378
+ },
379
+ "250030": {
380
+ "content": "EXCEPT",
381
+ "lstrip": false,
382
+ "normalized": true,
383
+ "rstrip": false,
384
+ "single_word": false,
385
+ "special": false
386
+ },
387
+ "250031": {
388
+ "content": "INTERSECT",
389
+ "lstrip": false,
390
+ "normalized": true,
391
+ "rstrip": false,
392
+ "single_word": false,
393
+ "special": false
394
+ },
395
+ "250032": {
396
+ "content": "CASE",
397
+ "lstrip": false,
398
+ "normalized": true,
399
+ "rstrip": false,
400
+ "single_word": false,
401
+ "special": false
402
+ },
403
+ "250033": {
404
+ "content": "WHEN",
405
+ "lstrip": false,
406
+ "normalized": true,
407
+ "rstrip": false,
408
+ "single_word": false,
409
+ "special": false
410
+ },
411
+ "250034": {
412
+ "content": "THEN",
413
+ "lstrip": false,
414
+ "normalized": true,
415
+ "rstrip": false,
416
+ "single_word": false,
417
+ "special": false
418
+ },
419
+ "250035": {
420
+ "content": "BEGIN",
421
+ "lstrip": false,
422
+ "normalized": true,
423
+ "rstrip": false,
424
+ "single_word": false,
425
+ "special": false
426
+ },
427
+ "250036": {
428
+ "content": "ROLLBACK",
429
+ "lstrip": false,
430
+ "normalized": true,
431
+ "rstrip": false,
432
+ "single_word": false,
433
+ "special": false
434
+ },
435
+ "250037": {
436
+ "content": "COMMIT",
437
+ "lstrip": false,
438
+ "normalized": true,
439
+ "rstrip": false,
440
+ "single_word": false,
441
+ "special": false
442
+ },
443
+ "250038": {
444
+ "content": "SAVEPOINT",
445
+ "lstrip": false,
446
+ "normalized": true,
447
+ "rstrip": false,
448
+ "single_word": false,
449
+ "special": false
450
+ },
451
+ "250039": {
452
+ "content": "TRANSACTION",
453
+ "lstrip": false,
454
+ "normalized": true,
455
+ "rstrip": false,
456
+ "single_word": false,
457
+ "special": false
458
+ },
459
+ "250040": {
460
+ "content": "PRIMARY KEY",
461
+ "lstrip": false,
462
+ "normalized": true,
463
+ "rstrip": false,
464
+ "single_word": false,
465
+ "special": false
466
+ },
467
+ "250041": {
468
+ "content": "FOREIGN KEY",
469
+ "lstrip": false,
470
+ "normalized": true,
471
+ "rstrip": false,
472
+ "single_word": false,
473
+ "special": false
474
+ },
475
+ "250042": {
476
+ "content": "REFERENCES",
477
+ "lstrip": false,
478
+ "normalized": true,
479
+ "rstrip": false,
480
+ "single_word": false,
481
+ "special": false
482
+ },
483
+ "250043": {
484
+ "content": "INDEX",
485
+ "lstrip": false,
486
+ "normalized": true,
487
+ "rstrip": false,
488
+ "single_word": false,
489
+ "special": false
490
+ },
491
+ "250044": {
492
+ "content": "CONSTRAINT",
493
+ "lstrip": false,
494
+ "normalized": true,
495
+ "rstrip": false,
496
+ "single_word": false,
497
+ "special": false
498
+ }
499
+ },
500
+ "bos_token": "<s>",
501
+ "clean_up_tokenization_spaces": true,
502
+ "cls_token": "<s>",
503
+ "eos_token": "</s>",
504
+ "mask_token": "<mask>",
505
+ "max_length": 1024,
506
+ "model_max_length": 8192,
507
+ "pad_to_multiple_of": null,
508
+ "pad_token": "</s>",
509
+ "pad_token_type_id": 0,
510
+ "padding_side": "right",
511
+ "sep_token": "</s>",
512
+ "sp_model_kwargs": {},
513
+ "stride": 0,
514
+ "tokenizer_class": "XLMRobertaTokenizer",
515
+ "truncation_side": "right",
516
+ "truncation_strategy": "longest_first",
517
+ "unk_token": "<unk>"
518
+ }