metadata
license: mit
language:
- en
library_name: transformers
tags:
- vision
- speech
- image-text-text
- audio-text-text
- Multi-Modal
# Creation Process
Vmodel = VisionEncoderDecoderModel.from_encoder_decoder_pretrained( "google/vit-base-patch16-224-in21k", "LeroyDyer/Mixtral_AI_Tiny" ) _Encoder_ImageProcessor = Vmodel.encoder _Decoder_ImageTokenizer = Vmodel.decoder _VisionEncoderDecoderModel = Vmodel
Add Pad tokems
LM_MODEL.VisionEncoderDecoder = _VisionEncoderDecoderModel
Add Sub Components
LM_MODEL.Encoder_ImageProcessor = _Encoder_ImageProcessor LM_MODEL.Decoder_ImageTokenizer = _Decoder_ImageTokenizer LM_MODEL
# ADD AUDIO
```python
print('Add Audio...')
# Add Head
# Load the feature extractor and tokenizer for the Whisper checkpoint, then
# pair that same checkpoint with itself as encoder and decoder of a speech
# Seq2Seq model.
_whisper_checkpoint = "openai/whisper-small"
_AudioFeatureExtractor = AutoFeatureExtractor.from_pretrained(_whisper_checkpoint)
_AudioTokenizer = AutoTokenizer.from_pretrained(_whisper_checkpoint)
_SpeechEncoderDecoder = SpeechEncoderDecoderModel.from_encoder_decoder_pretrained(
    _whisper_checkpoint, _whisper_checkpoint
)
# Add Pad tokens
# NOTE(review): the decoder start id is taken from the tokenizer's CLS token.
# Whisper tokenizers do not appear to define a CLS token, in which case this
# assigns None - confirm against the tokenizer's special-token map.
_speech_config = _SpeechEncoderDecoder.config
_speech_config.decoder_start_token_id = _AudioTokenizer.cls_token_id
_speech_config.pad_token_id = _AudioTokenizer.pad_token_id
LM_MODEL.SpeechEncoderDecoder = _SpeechEncoderDecoder
# Add Sub Components
LM_MODEL.Decoder_AudioTokenizer = _AudioTokenizer
LM_MODEL.Encoder_AudioFeatureExtractor = _AudioFeatureExtractor
# Bare expression: presumably shows the model as notebook cell output.
LM_MODEL
# SAVE
print('Final stages:...')
print('Add tokenizer...')
# Resize the embedding matrix to the tokenizer's vocabulary size, then attach
# the tokenizer to the model object.
_vocab_size = len(tokenizer)
LM_MODEL.resize_token_embeddings(_vocab_size)
LM_MODEL.tokenizer = tokenizer
print('Save model...')
# Model and tokenizer are written to the same output directory.
_save_dir = "Mixtral_AI_MiniModalTron"
# Cast to float16 before the model is written out.
LM_MODEL.to(torch.float16)
LM_MODEL.save_pretrained(_save_dir)
print('Save tokenizer...')
tokenizer.save_pretrained(_save_dir)