Upload 11 files
- README.md +55 -0
- added_tokens.json +8 -0
- config.json +39 -0
- generation_config.json +6 -0
- merges.txt +0 -0
- optimizer_and_scheduler.pth +3 -0
- pytorch_model.bin +3 -0
- special_tokens_map.json +7 -0
- tokenizer.json +0 -0
- tokenizer_config.json +9 -0
- vocab.json +0 -0
README.md
ADDED
@@ -0,0 +1,55 @@
# GPT-2 Model Trained on Serbian Corpus

![serbia-state.png](https://cdn-uploads.huggingface.co/production/uploads/64fc6ba4e0dc35986bc3b6ee/NVA8gBkmZTKKo-ekQILI6.png)

By sharing this model, we aim to foster further research and applications in Serbian language processing.

### Introduction:

This GPT-2 model has been fine-tuned on an extensive Serbian corpus of 43 million tokens. It is designed to generate high-quality Serbian text, capturing the nuances and intricacies of the language.

### Dataset Details:

Size: 43 million tokens.

Nature: The dataset encompasses a diverse range of topics, representing various aspects of the Serbian language and culture.

### Model Usage:

This model can be used for various NLP tasks such as text generation, summarization, translation, and more. Thanks to its training on a large corpus, it should produce accurate and contextually relevant output, especially for Serbian-language tasks.

Loading the Model:

from cryptography.fernet import Fernet
import torch

key = input("Enter the decryption key: ").encode()
cipher_suite = Fernet(key)

model_path = 'your/path/to/model/pytorch_model.bin'

try:
    with open(model_path, 'rb') as file:
        encrypted_data = file.read()

    # Decrypt the checkpoint and write it back to disk
    decrypted_data = cipher_suite.decrypt(encrypted_data)

    with open(model_path, 'wb') as file:
        file.write(decrypted_data)

    # Load the decrypted model
    loaded_model = torch.load(model_path)

except Exception:
    print("You can decrypt the model by contacting the author of this model, who will provide the key. Email: info@edukom.rs")

# Now you can use 'loaded_model' for further operations...

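Once the checkpoint has been decrypted, generation works through the standard `transformers` API. The sketch below is illustrative rather than part of the repository: it assumes the decrypted `pytorch_model.bin` has been placed in a local directory together with `config.json` and the tokenizer files uploaded here, and the directory name `serbian-gpt2` and the prompt are placeholders.

```python
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer

model_dir = "serbian-gpt2"  # placeholder: local folder holding the decrypted files

tokenizer = GPT2Tokenizer.from_pretrained(model_dir)
model = GPT2LMHeadModel.from_pretrained(model_dir)
model.eval()

prompt = "Beograd je"  # placeholder Serbian prompt
inputs = tokenizer(prompt, return_tensors="pt")

with torch.no_grad():
    output_ids = model.generate(
        **inputs,
        do_sample=True,   # sampling, as suggested by task_specific_params in config.json
        max_length=50,    # same 50-token budget as config.json
        top_k=50,
        top_p=0.95,
        pad_token_id=tokenizer.eos_token_id,
    )

print(tokenizer.decode(output_ids[0], skip_special_tokens=True))
```
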
### Licensing:

The model is protected by encryption and its use requires a decryption key. Please check the licensing terms if you intend to use the model for commercial purposes. For any questions, or if you need decryption keys, feel free to contact us at info@edukom.rs

![Screenshot.png](https://cdn-uploads.huggingface.co/production/uploads/64fc6ba4e0dc35986bc3b6ee/UoIvwAez4ZoiEsHyx-vn6.png)
added_tokens.json
ADDED
@@ -0,0 +1,8 @@
{
  "<|endoftext|>": 50262,
  "[CLS]": 50259,
  "[MASK]": 50261,
  "[PAD]": 50257,
  "[SEP]": 50260,
  "[UNK]": 50258
}
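These six entries sit on top of the base GPT-2 vocabulary (IDs 50257–50262), which is why `config.json` below declares `vocab_size` 50263. A small, hypothetical check that prints the IDs a tokenizer built from this repository's files assigns to them, for comparison with the listing above; the local path `serbian-gpt2` is a placeholder.

```python
from transformers import GPT2Tokenizer

# "serbian-gpt2" is a placeholder for a local copy of this repository's tokenizer files.
tokenizer = GPT2Tokenizer.from_pretrained("serbian-gpt2")

for token in ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]", "<|endoftext|>"]:
    print(token, tokenizer.convert_tokens_to_ids(token))

print("total vocabulary size:", len(tokenizer))  # expected to line up with vocab_size = 50263
```
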
config.json
ADDED
@@ -0,0 +1,39 @@
{
  "_name_or_path": "/var/www/html/ainabavka/proces_treninga/modeli/pretrenirani_model/checkpoint_dir",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "torch_dtype": "float32",
  "transformers_version": "4.29.0.dev0",
  "use_cache": true,
  "vocab_size": 50263
}
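The configuration describes a GPT-2 small-sized network: 12 layers, 12 attention heads, 768-dimensional embeddings and a 1024-token context window, with the vocabulary enlarged to 50263 to cover the added special tokens. A minimal sketch of inspecting it with `transformers`; the path `serbian-gpt2` is a placeholder, and instantiating from the config alone yields randomly initialised weights rather than the trained model.

```python
from transformers import GPT2Config, GPT2LMHeadModel

config = GPT2Config.from_pretrained("serbian-gpt2")  # placeholder local path

print(config.n_layer, config.n_head, config.n_embd, config.n_positions)  # 12 12 768 1024
print(config.vocab_size)  # 50263: base GPT-2 vocabulary plus the six added tokens

# Building the model from the config gives random weights; load the decrypted
# checkpoint directory with from_pretrained() to get the fine-tuned parameters.
model = GPT2LMHeadModel(config)
print(f"{sum(p.numel() for p in model.parameters()):,} parameters")
```
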
generation_config.json
ADDED
@@ -0,0 +1,6 @@
{
  "_from_model_config": true,
  "bos_token_id": 50256,
  "eos_token_id": 50256,
  "transformers_version": "4.29.0.dev0"
}
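This file only pins the BOS/EOS token IDs; everything else falls back to library defaults or per-call arguments. A brief, hypothetical example of loading it and overriding settings at generation time (placeholder path again):

```python
from transformers import GenerationConfig

gen_config = GenerationConfig.from_pretrained("serbian-gpt2")  # placeholder local path
print(gen_config.bos_token_id, gen_config.eos_token_id)  # 50256 50256

# Per-call overrides take precedence, e.g.:
# model.generate(**inputs, generation_config=gen_config, do_sample=True, max_length=50)
```
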
merges.txt
ADDED
The diff for this file is too large to render.
See raw diff
optimizer_and_scheduler.pth
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:d461c6e5e5fccd382a11c51588c7851f1419d36483828ff65c7a64780ae855ee
size 1506115409
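This roughly 1.5 GB file holds the optimizer and learning-rate scheduler state from fine-tuning, which is only needed for resuming training, not for inference. Its internal layout is not documented in this repository, so the sketch below is an assumption throughout: the dictionary keys `"optimizer"` and `"scheduler"`, the AdamW optimizer, the linear warmup schedule, and all hyperparameter values are illustrative guesses.

```python
import torch
from transformers import GPT2LMHeadModel, get_linear_schedule_with_warmup

# Placeholder path to the decrypted model directory.
model = GPT2LMHeadModel.from_pretrained("serbian-gpt2")

# Assumed optimizer/scheduler pairing and hyperparameters -- not documented here.
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=0, num_training_steps=10_000
)

state = torch.load("optimizer_and_scheduler.pth", map_location="cpu")

# Assumed keys; inspect state.keys() to see the actual layout of the checkpoint.
if "optimizer" in state:
    optimizer.load_state_dict(state["optimizer"])
if "scheduler" in state:
    scheduler.load_state_dict(state["scheduler"])
```
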
pytorch_model.bin
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:ef323c746bf8baad1b802c8f054619bd76df851e2026d65a2d83fa10396f8ae1
size 680555340
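The weights are stored via Git LFS; the pointer above records the SHA-256 and byte size of the uploaded object, which, per the README, is the encrypted file as distributed. A short sketch for verifying a downloaded copy against that pointer before decrypting it in place:

```python
import hashlib

EXPECTED_SHA256 = "ef323c746bf8baad1b802c8f054619bd76df851e2026d65a2d83fa10396f8ae1"
EXPECTED_SIZE = 680555340  # bytes, from the LFS pointer

sha = hashlib.sha256()
size = 0
with open("pytorch_model.bin", "rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):  # read in 1 MiB chunks
        sha.update(chunk)
        size += len(chunk)

print("size matches:  ", size == EXPECTED_SIZE)
print("sha256 matches:", sha.hexdigest() == EXPECTED_SHA256)
```
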
special_tokens_map.json
ADDED
@@ -0,0 +1,7 @@
{
  "bos_token": "<|endoftext|>",
  "eos_token": "<|endoftext|>",
  "mask_token": "[MASK]",
  "pad_token": "[PAD]",
  "unk_token": "<|endoftext|>"
}
tokenizer.json
ADDED
The diff for this file is too large to render.
See raw diff
tokenizer_config.json
ADDED
@@ -0,0 +1,9 @@
{
  "add_prefix_space": false,
  "bos_token": "<|endoftext|>",
  "clean_up_tokenization_spaces": true,
  "eos_token": "<|endoftext|>",
  "model_max_length": 1000000000000000019884624838656,
  "tokenizer_class": "GPT2Tokenizer",
  "unk_token": "<|endoftext|>"
}
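The tokenizer is a standard byte-level BPE `GPT2Tokenizer` built from `vocab.json` and `merges.txt`, with `<|endoftext|>` serving as BOS/EOS/UNK and `[PAD]`/`[MASK]` supplied by `special_tokens_map.json`. A minimal loading sketch; the local path is again a placeholder.

```python
from transformers import GPT2Tokenizer

# Placeholder: a local copy of vocab.json, merges.txt, tokenizer_config.json,
# special_tokens_map.json and added_tokens.json from this repository.
tokenizer = GPT2Tokenizer.from_pretrained("serbian-gpt2")

print(tokenizer.bos_token, tokenizer.eos_token, tokenizer.unk_token)  # all <|endoftext|>
print(tokenizer.pad_token, tokenizer.mask_token)                      # [PAD] [MASK]

ids = tokenizer.encode("Dobar dan, kako ste?")  # "Good day, how are you?"
print(ids)
print(tokenizer.decode(ids))
```
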
vocab.json
ADDED
The diff for this file is too large to render.
See raw diff