---
license: mit
language:
- en
library_name: transformers
tags:
- vision
- speech
- image-text-to-text
- audio-text-to-text
- Multi-Modal
---

# Creation Process

# ADD VISION

```python
print('Add Vision...')
# Combine a pre-trained image encoder and the pre-trained text decoder to form a Seq2Seq model.
# LM_MODEL is assumed to be the base language model (LeroyDyer/Mixtral_AI_Tiny), loaded beforehand.
from transformers import VisionEncoderDecoderModel

Vmodel = VisionEncoderDecoderModel.from_encoder_decoder_pretrained(
    "google/vit-base-patch16-224-in21k", "LeroyDyer/Mixtral_AI_Tiny"
)
_Encoder_ImageProcessor = Vmodel.encoder
_Decoder_ImageTokenizer = Vmodel.decoder  # note: Vmodel.decoder is the decoder module itself
_VisionEncoderDecoderModel = Vmodel

# Attach the vision encoder-decoder to the language model
LM_MODEL.VisionEncoderDecoder = _VisionEncoderDecoderModel

# Add sub-components
LM_MODEL.Encoder_ImageProcessor = _Encoder_ImageProcessor
LM_MODEL.Decoder_ImageTokenizer = _Decoder_ImageTokenizer
LM_MODEL
```

# ADD AUDIO

```python
print('Add Audio...')
# Add head:
# Combine a pre-trained encoder and a pre-trained decoder to form a Seq2Seq model
from transformers import AutoFeatureExtractor, AutoTokenizer, SpeechEncoderDecoderModel

_AudioFeatureExtractor = AutoFeatureExtractor.from_pretrained("openai/whisper-small")
_AudioTokenizer = AutoTokenizer.from_pretrained("openai/whisper-small")
_SpeechEncoderDecoder = SpeechEncoderDecoderModel.from_encoder_decoder_pretrained(
    "openai/whisper-small", "openai/whisper-small"
)

# Set the decoder start and pad tokens
_SpeechEncoderDecoder.config.decoder_start_token_id = _AudioTokenizer.cls_token_id
_SpeechEncoderDecoder.config.pad_token_id = _AudioTokenizer.pad_token_id

# Attach the speech encoder-decoder to the language model
LM_MODEL.SpeechEncoderDecoder = _SpeechEncoderDecoder

# Add sub-components
LM_MODEL.Decoder_AudioTokenizer = _AudioTokenizer
LM_MODEL.Encoder_AudioFeatureExtractor = _AudioFeatureExtractor
LM_MODEL
```

# SAVE

```python
print('Final stages:...')
import torch

print('Add tokenizer...')
# tokenizer is assumed to be the base model's tokenizer, loaded beforehand.
LM_MODEL.resize_token_embeddings(len(tokenizer))
LM_MODEL.tokenizer = tokenizer

print('Save model...')
LM_MODEL.to(torch.float16)
LM_MODEL.save_pretrained("Mixtral_AI_MiniModalTron")

print('Save tokenizer...')
tokenizer.save_pretrained("Mixtral_AI_MiniModalTron")
```
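
# USAGE (example)

As a usage illustration, here is a minimal sketch of captioning an image with the attached vision encoder-decoder. It assumes the objects from the creation steps are still in memory; `example.jpg` is a placeholder path, `ViTImageProcessor` is the standard processor for the ViT checkpoint used as the encoder, and meaningful captions would of course require fine-tuning first.

```python
# Minimal usage sketch (assumes LM_MODEL from the creation steps is in memory).
from PIL import Image
from transformers import ViTImageProcessor

image_processor = ViTImageProcessor.from_pretrained("google/vit-base-patch16-224-in21k")
image = Image.open("example.jpg")  # placeholder path
pixel_values = image_processor(images=image, return_tensors="pt").pixel_values

# Generate a caption with the attached vision encoder-decoder
generated_ids = LM_MODEL.VisionEncoderDecoder.generate(pixel_values, max_new_tokens=32)
caption = LM_MODEL.tokenizer.decode(generated_ids[0], skip_special_tokens=True)
print(caption)
```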
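
# RELOAD (example)

A hedged sketch of reloading the saved checkpoint. This is an assumption about intended usage: `from_pretrained` restores only the base language-model weights and config from `Mixtral_AI_MiniModalTron`, so the ad hoc attributes (`VisionEncoderDecoder`, `SpeechEncoderDecoder`, the processors and tokenizers) would need to be re-attached exactly as in the creation steps above.

```python
# Hedged reload sketch: only the base LM is restored; re-attach the vision and
# speech sub-models afterwards, mirroring the creation steps above.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

LM_MODEL = AutoModelForCausalLM.from_pretrained(
    "Mixtral_AI_MiniModalTron", torch_dtype=torch.float16
)
tokenizer = AutoTokenizer.from_pretrained("Mixtral_AI_MiniModalTron")
```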