{
  "model_type": "encoder_decoder",
  "encoder_type": "csumlm_encoder",
  "decoder_type": "csumlm_decoder",
  "model_name": "CognoSphere/CSUMLM",
  "model_description": "CognoSphere Unified Multimodal Language Model (CSUMLM) is an advanced AI model capable of processing and generating text, images, and audio data. It combines transfer learning, deep learning, self-supervised learning, meta-learning, deep meta-learning, reinforcement learning, and cross-domain analogy extraction to achieve state-of-the-art performance in multimodal tasks.",
  "encoder": {
    "type": "transformer",
    "num_layers": 12,
    "hidden_size": 768,
    "num_attention_heads": 12,
    "intermediate_size": 3072
  },
  "decoder": {
    "type": "transformer",
    "num_layers": 12,
    "hidden_size": 768,
    "num_attention_heads": 12,
    "intermediate_size": 3072
  },
  "multimodal_fusion": {
    "type": "transformer",
    "num_layers": 6,
    "hidden_size": 1024,
    "num_attention_heads": 16,
    "intermediate_size": 4096
  },
  "training_data": {
    "text": [
      "path/to/text/data/file1.txt",
      "path/to/text/data/file2.txt",
      "..."
    ],
    "images": [
      "path/to/image/data/image1.jpg",
      "path/to/image/data/image2.png",
      "..."
    ],
    "audio": [
      "path/to/audio/data/audio1.wav",
      "path/to/audio/data/audio2.mp3",
      "..."
    ]
  },
  "tokenizer": {
    "type": "byte-level-bpe",
    "vocab_size": 50000,
    "merge_file": "path/to/bpe/merge_file.txt"
  },
  "optimizer": {
    "type": "adamw",
    "learning_rate": 5e-5,
    "weight_decay": 0.01
  },
  "loss_function": "cross_entropy",
  "evaluation_metrics": [
    "bleu",
    "meteor",
    "rouge",
    "cider"
  ]
}