diff --git "a/instruct.ipynb" "b/instruct.ipynb" --- "a/instruct.ipynb" +++ "b/instruct.ipynb" @@ -281,7 +281,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "id": "86163110-f084-429e-bc70-5b281a679d1c", "metadata": { "colab": { @@ -387,20 +387,2177 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "id": "ffbbd54b-e579-44ef-9652-cd8496b2fd4d", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "PeftModelForCausalLM(\n", + " (base_model): LoraModel(\n", + " (model): MllamaForConditionalGeneration(\n", + " (vision_model): MllamaVisionModel(\n", + " (patch_embedding): Conv2d(3, 1280, kernel_size=(14, 14), stride=(14, 14), padding=valid, bias=False)\n", + " (gated_positional_embedding): MllamaPrecomputedPositionEmbedding(\n", + " (tile_embedding): Embedding(9, 8197120)\n", + " )\n", + " (pre_tile_positional_embedding): MllamaPrecomputedAspectRatioEmbedding(\n", + " (embedding): Embedding(9, 5120)\n", + " )\n", + " (post_tile_positional_embedding): MllamaPrecomputedAspectRatioEmbedding(\n", + " (embedding): Embedding(9, 5120)\n", + " )\n", + " (layernorm_pre): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n", + " (layernorm_post): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n", + " (transformer): MllamaVisionEncoder(\n", + " (layers): ModuleList(\n", + " (0-31): 32 x MllamaVisionEncoderLayer(\n", + " (self_attn): MllamaVisionSdpaAttention(\n", + " (q_proj): lora.Linear4bit(\n", + " (base_layer): Linear4bit(in_features=1280, out_features=1280, bias=False)\n", + " (lora_dropout): ModuleDict(\n", + " (default): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (lora_A): ModuleDict(\n", + " (default): Linear(in_features=1280, out_features=8, bias=False)\n", + " )\n", + " (lora_B): ModuleDict(\n", + " (default): Linear(in_features=8, out_features=1280, bias=False)\n", + " )\n", + " (lora_embedding_A): ParameterDict()\n", + " (lora_embedding_B): ParameterDict()\n", + " (lora_magnitude_vector): ModuleDict()\n", + " )\n", + " (k_proj): lora.Linear4bit(\n", + " (base_layer): Linear4bit(in_features=1280, out_features=1280, bias=False)\n", + " (lora_dropout): ModuleDict(\n", + " (default): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (lora_A): ModuleDict(\n", + " (default): Linear(in_features=1280, out_features=8, bias=False)\n", + " )\n", + " (lora_B): ModuleDict(\n", + " (default): Linear(in_features=8, out_features=1280, bias=False)\n", + " )\n", + " (lora_embedding_A): ParameterDict()\n", + " (lora_embedding_B): ParameterDict()\n", + " (lora_magnitude_vector): ModuleDict()\n", + " )\n", + " (v_proj): lora.Linear4bit(\n", + " (base_layer): Linear4bit(in_features=1280, out_features=1280, bias=False)\n", + " (lora_dropout): ModuleDict(\n", + " (default): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (lora_A): ModuleDict(\n", + " (default): Linear(in_features=1280, out_features=8, bias=False)\n", + " )\n", + " (lora_B): ModuleDict(\n", + " (default): Linear(in_features=8, out_features=1280, bias=False)\n", + " )\n", + " (lora_embedding_A): ParameterDict()\n", + " (lora_embedding_B): ParameterDict()\n", + " (lora_magnitude_vector): ModuleDict()\n", + " )\n", + " (o_proj): lora.Linear4bit(\n", + " (base_layer): Linear4bit(in_features=1280, out_features=1280, bias=False)\n", + " (lora_dropout): ModuleDict(\n", + " (default): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (lora_A): ModuleDict(\n", + " (default): Linear(in_features=1280, out_features=8, bias=False)\n", + " )\n", + " 
(lora_B): ModuleDict(\n", + " (default): Linear(in_features=8, out_features=1280, bias=False)\n", + " )\n", + " (lora_embedding_A): ParameterDict()\n", + " (lora_embedding_B): ParameterDict()\n", + " (lora_magnitude_vector): ModuleDict()\n", + " )\n", + " )\n", + " (mlp): MllamaVisionMLP(\n", + " (activation_fn): GELUActivation()\n", + " (fc1): Linear4bit(in_features=1280, out_features=5120, bias=True)\n", + " (fc2): Linear4bit(in_features=5120, out_features=1280, bias=True)\n", + " )\n", + " (input_layernorm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n", + " (post_attention_layernorm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n", + " )\n", + " )\n", + " )\n", + " (global_transformer): MllamaVisionEncoder(\n", + " (layers): ModuleList(\n", + " (0-7): 8 x MllamaVisionEncoderLayer(\n", + " (self_attn): MllamaVisionSdpaAttention(\n", + " (q_proj): lora.Linear4bit(\n", + " (base_layer): Linear4bit(in_features=1280, out_features=1280, bias=False)\n", + " (lora_dropout): ModuleDict(\n", + " (default): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (lora_A): ModuleDict(\n", + " (default): Linear(in_features=1280, out_features=8, bias=False)\n", + " )\n", + " (lora_B): ModuleDict(\n", + " (default): Linear(in_features=8, out_features=1280, bias=False)\n", + " )\n", + " (lora_embedding_A): ParameterDict()\n", + " (lora_embedding_B): ParameterDict()\n", + " (lora_magnitude_vector): ModuleDict()\n", + " )\n", + " (k_proj): lora.Linear4bit(\n", + " (base_layer): Linear4bit(in_features=1280, out_features=1280, bias=False)\n", + " (lora_dropout): ModuleDict(\n", + " (default): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (lora_A): ModuleDict(\n", + " (default): Linear(in_features=1280, out_features=8, bias=False)\n", + " )\n", + " (lora_B): ModuleDict(\n", + " (default): Linear(in_features=8, out_features=1280, bias=False)\n", + " )\n", + " (lora_embedding_A): ParameterDict()\n", + " (lora_embedding_B): ParameterDict()\n", + " (lora_magnitude_vector): ModuleDict()\n", + " )\n", + " (v_proj): lora.Linear4bit(\n", + " (base_layer): Linear4bit(in_features=1280, out_features=1280, bias=False)\n", + " (lora_dropout): ModuleDict(\n", + " (default): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (lora_A): ModuleDict(\n", + " (default): Linear(in_features=1280, out_features=8, bias=False)\n", + " )\n", + " (lora_B): ModuleDict(\n", + " (default): Linear(in_features=8, out_features=1280, bias=False)\n", + " )\n", + " (lora_embedding_A): ParameterDict()\n", + " (lora_embedding_B): ParameterDict()\n", + " (lora_magnitude_vector): ModuleDict()\n", + " )\n", + " (o_proj): lora.Linear4bit(\n", + " (base_layer): Linear4bit(in_features=1280, out_features=1280, bias=False)\n", + " (lora_dropout): ModuleDict(\n", + " (default): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (lora_A): ModuleDict(\n", + " (default): Linear(in_features=1280, out_features=8, bias=False)\n", + " )\n", + " (lora_B): ModuleDict(\n", + " (default): Linear(in_features=8, out_features=1280, bias=False)\n", + " )\n", + " (lora_embedding_A): ParameterDict()\n", + " (lora_embedding_B): ParameterDict()\n", + " (lora_magnitude_vector): ModuleDict()\n", + " )\n", + " )\n", + " (mlp): MllamaVisionMLP(\n", + " (activation_fn): GELUActivation()\n", + " (fc1): Linear4bit(in_features=1280, out_features=5120, bias=True)\n", + " (fc2): Linear4bit(in_features=5120, out_features=1280, bias=True)\n", + " )\n", + " (input_layernorm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n", + " (post_attention_layernorm): 
LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n", + " )\n", + " )\n", + " )\n", + " )\n", + " (language_model): MllamaForCausalLM(\n", + " (model): MllamaTextModel(\n", + " (embed_tokens): Embedding(128264, 4096, padding_idx=128004)\n", + " (layers): ModuleList(\n", + " (0-2): 3 x MllamaSelfAttentionDecoderLayer(\n", + " (self_attn): MllamaTextSelfSdpaAttention(\n", + " (q_proj): lora.Linear4bit(\n", + " (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)\n", + " (lora_dropout): ModuleDict(\n", + " (default): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (lora_A): ModuleDict(\n", + " (default): Linear(in_features=4096, out_features=8, bias=False)\n", + " )\n", + " (lora_B): ModuleDict(\n", + " (default): Linear(in_features=8, out_features=4096, bias=False)\n", + " )\n", + " (lora_embedding_A): ParameterDict()\n", + " (lora_embedding_B): ParameterDict()\n", + " (lora_magnitude_vector): ModuleDict()\n", + " )\n", + " (k_proj): lora.Linear4bit(\n", + " (base_layer): Linear4bit(in_features=4096, out_features=1024, bias=False)\n", + " (lora_dropout): ModuleDict(\n", + " (default): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (lora_A): ModuleDict(\n", + " (default): Linear(in_features=4096, out_features=8, bias=False)\n", + " )\n", + " (lora_B): ModuleDict(\n", + " (default): Linear(in_features=8, out_features=1024, bias=False)\n", + " )\n", + " (lora_embedding_A): ParameterDict()\n", + " (lora_embedding_B): ParameterDict()\n", + " (lora_magnitude_vector): ModuleDict()\n", + " )\n", + " (v_proj): lora.Linear4bit(\n", + " (base_layer): Linear4bit(in_features=4096, out_features=1024, bias=False)\n", + " (lora_dropout): ModuleDict(\n", + " (default): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (lora_A): ModuleDict(\n", + " (default): Linear(in_features=4096, out_features=8, bias=False)\n", + " )\n", + " (lora_B): ModuleDict(\n", + " (default): Linear(in_features=8, out_features=1024, bias=False)\n", + " )\n", + " (lora_embedding_A): ParameterDict()\n", + " (lora_embedding_B): ParameterDict()\n", + " (lora_magnitude_vector): ModuleDict()\n", + " )\n", + " (o_proj): lora.Linear4bit(\n", + " (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)\n", + " (lora_dropout): ModuleDict(\n", + " (default): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (lora_A): ModuleDict(\n", + " (default): Linear(in_features=4096, out_features=8, bias=False)\n", + " )\n", + " (lora_B): ModuleDict(\n", + " (default): Linear(in_features=8, out_features=4096, bias=False)\n", + " )\n", + " (lora_embedding_A): ParameterDict()\n", + " (lora_embedding_B): ParameterDict()\n", + " (lora_magnitude_vector): ModuleDict()\n", + " )\n", + " )\n", + " (mlp): MllamaTextMLP(\n", + " (gate_proj): lora.Linear4bit(\n", + " (base_layer): Linear4bit(in_features=4096, out_features=14336, bias=False)\n", + " (lora_dropout): ModuleDict(\n", + " (default): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (lora_A): ModuleDict(\n", + " (default): Linear(in_features=4096, out_features=8, bias=False)\n", + " )\n", + " (lora_B): ModuleDict(\n", + " (default): Linear(in_features=8, out_features=14336, bias=False)\n", + " )\n", + " (lora_embedding_A): ParameterDict()\n", + " (lora_embedding_B): ParameterDict()\n", + " (lora_magnitude_vector): ModuleDict()\n", + " )\n", + " (up_proj): lora.Linear4bit(\n", + " (base_layer): Linear4bit(in_features=4096, out_features=14336, bias=False)\n", + " (lora_dropout): ModuleDict(\n", + " (default): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (lora_A): 
ModuleDict(\n", + " (default): Linear(in_features=4096, out_features=8, bias=False)\n", + " )\n", + " (lora_B): ModuleDict(\n", + " (default): Linear(in_features=8, out_features=14336, bias=False)\n", + " )\n", + " (lora_embedding_A): ParameterDict()\n", + " (lora_embedding_B): ParameterDict()\n", + " (lora_magnitude_vector): ModuleDict()\n", + " )\n", + " (down_proj): lora.Linear4bit(\n", + " (base_layer): Linear4bit(in_features=14336, out_features=4096, bias=False)\n", + " (lora_dropout): ModuleDict(\n", + " (default): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (lora_A): ModuleDict(\n", + " (default): Linear(in_features=14336, out_features=8, bias=False)\n", + " )\n", + " (lora_B): ModuleDict(\n", + " (default): Linear(in_features=8, out_features=4096, bias=False)\n", + " )\n", + " (lora_embedding_A): ParameterDict()\n", + " (lora_embedding_B): ParameterDict()\n", + " (lora_magnitude_vector): ModuleDict()\n", + " )\n", + " (act_fn): SiLU()\n", + " )\n", + " (input_layernorm): MllamaTextRMSNorm((4096,), eps=1e-05)\n", + " (post_attention_layernorm): MllamaTextRMSNorm((4096,), eps=1e-05)\n", + " )\n", + " (3): MllamaCrossAttentionDecoderLayer(\n", + " (cross_attn): MllamaTextCrossSdpaAttention(\n", + " (q_proj): lora.Linear4bit(\n", + " (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)\n", + " (lora_dropout): ModuleDict(\n", + " (default): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (lora_A): ModuleDict(\n", + " (default): Linear(in_features=4096, out_features=8, bias=False)\n", + " )\n", + " (lora_B): ModuleDict(\n", + " (default): Linear(in_features=8, out_features=4096, bias=False)\n", + " )\n", + " (lora_embedding_A): ParameterDict()\n", + " (lora_embedding_B): ParameterDict()\n", + " (lora_magnitude_vector): ModuleDict()\n", + " )\n", + " (k_proj): lora.Linear4bit(\n", + " (base_layer): Linear4bit(in_features=4096, out_features=1024, bias=False)\n", + " (lora_dropout): ModuleDict(\n", + " (default): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (lora_A): ModuleDict(\n", + " (default): Linear(in_features=4096, out_features=8, bias=False)\n", + " )\n", + " (lora_B): ModuleDict(\n", + " (default): Linear(in_features=8, out_features=1024, bias=False)\n", + " )\n", + " (lora_embedding_A): ParameterDict()\n", + " (lora_embedding_B): ParameterDict()\n", + " (lora_magnitude_vector): ModuleDict()\n", + " )\n", + " (v_proj): lora.Linear4bit(\n", + " (base_layer): Linear4bit(in_features=4096, out_features=1024, bias=False)\n", + " (lora_dropout): ModuleDict(\n", + " (default): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (lora_A): ModuleDict(\n", + " (default): Linear(in_features=4096, out_features=8, bias=False)\n", + " )\n", + " (lora_B): ModuleDict(\n", + " (default): Linear(in_features=8, out_features=1024, bias=False)\n", + " )\n", + " (lora_embedding_A): ParameterDict()\n", + " (lora_embedding_B): ParameterDict()\n", + " (lora_magnitude_vector): ModuleDict()\n", + " )\n", + " (o_proj): lora.Linear4bit(\n", + " (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)\n", + " (lora_dropout): ModuleDict(\n", + " (default): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (lora_A): ModuleDict(\n", + " (default): Linear(in_features=4096, out_features=8, bias=False)\n", + " )\n", + " (lora_B): ModuleDict(\n", + " (default): Linear(in_features=8, out_features=4096, bias=False)\n", + " )\n", + " (lora_embedding_A): ParameterDict()\n", + " (lora_embedding_B): ParameterDict()\n", + " (lora_magnitude_vector): ModuleDict()\n", + " )\n", + " 
(q_norm): MllamaTextRMSNorm((128,), eps=1e-05)\n", + " (k_norm): MllamaTextRMSNorm((128,), eps=1e-05)\n", + " )\n", + " (input_layernorm): MllamaTextRMSNorm((4096,), eps=1e-05)\n", + " (mlp): MllamaTextMLP(\n", + " (gate_proj): lora.Linear4bit(\n", + " (base_layer): Linear4bit(in_features=4096, out_features=14336, bias=False)\n", + " (lora_dropout): ModuleDict(\n", + " (default): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (lora_A): ModuleDict(\n", + " (default): Linear(in_features=4096, out_features=8, bias=False)\n", + " )\n", + " (lora_B): ModuleDict(\n", + " (default): Linear(in_features=8, out_features=14336, bias=False)\n", + " )\n", + " (lora_embedding_A): ParameterDict()\n", + " (lora_embedding_B): ParameterDict()\n", + " (lora_magnitude_vector): ModuleDict()\n", + " )\n", + " (up_proj): lora.Linear4bit(\n", + " (base_layer): Linear4bit(in_features=4096, out_features=14336, bias=False)\n", + " (lora_dropout): ModuleDict(\n", + " (default): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (lora_A): ModuleDict(\n", + " (default): Linear(in_features=4096, out_features=8, bias=False)\n", + " )\n", + " (lora_B): ModuleDict(\n", + " (default): Linear(in_features=8, out_features=14336, bias=False)\n", + " )\n", + " (lora_embedding_A): ParameterDict()\n", + " (lora_embedding_B): ParameterDict()\n", + " (lora_magnitude_vector): ModuleDict()\n", + " )\n", + " (down_proj): lora.Linear4bit(\n", + " (base_layer): Linear4bit(in_features=14336, out_features=4096, bias=False)\n", + " (lora_dropout): ModuleDict(\n", + " (default): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (lora_A): ModuleDict(\n", + " (default): Linear(in_features=14336, out_features=8, bias=False)\n", + " )\n", + " (lora_B): ModuleDict(\n", + " (default): Linear(in_features=8, out_features=4096, bias=False)\n", + " )\n", + " (lora_embedding_A): ParameterDict()\n", + " (lora_embedding_B): ParameterDict()\n", + " (lora_magnitude_vector): ModuleDict()\n", + " )\n", + " (act_fn): SiLU()\n", + " )\n", + " (post_attention_layernorm): MllamaTextRMSNorm((4096,), eps=1e-05)\n", + " )\n", + " (4-7): 4 x MllamaSelfAttentionDecoderLayer(\n", + " (self_attn): MllamaTextSelfSdpaAttention(\n", + " (q_proj): lora.Linear4bit(\n", + " (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)\n", + " (lora_dropout): ModuleDict(\n", + " (default): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (lora_A): ModuleDict(\n", + " (default): Linear(in_features=4096, out_features=8, bias=False)\n", + " )\n", + " (lora_B): ModuleDict(\n", + " (default): Linear(in_features=8, out_features=4096, bias=False)\n", + " )\n", + " (lora_embedding_A): ParameterDict()\n", + " (lora_embedding_B): ParameterDict()\n", + " (lora_magnitude_vector): ModuleDict()\n", + " )\n", + " (k_proj): lora.Linear4bit(\n", + " (base_layer): Linear4bit(in_features=4096, out_features=1024, bias=False)\n", + " (lora_dropout): ModuleDict(\n", + " (default): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (lora_A): ModuleDict(\n", + " (default): Linear(in_features=4096, out_features=8, bias=False)\n", + " )\n", + " (lora_B): ModuleDict(\n", + " (default): Linear(in_features=8, out_features=1024, bias=False)\n", + " )\n", + " (lora_embedding_A): ParameterDict()\n", + " (lora_embedding_B): ParameterDict()\n", + " (lora_magnitude_vector): ModuleDict()\n", + " )\n", + " (v_proj): lora.Linear4bit(\n", + " (base_layer): Linear4bit(in_features=4096, out_features=1024, bias=False)\n", + " (lora_dropout): ModuleDict(\n", + " (default): Dropout(p=0.1, inplace=False)\n", + " 
)\n", + " (lora_A): ModuleDict(\n", + " (default): Linear(in_features=4096, out_features=8, bias=False)\n", + " )\n", + " (lora_B): ModuleDict(\n", + " (default): Linear(in_features=8, out_features=1024, bias=False)\n", + " )\n", + " (lora_embedding_A): ParameterDict()\n", + " (lora_embedding_B): ParameterDict()\n", + " (lora_magnitude_vector): ModuleDict()\n", + " )\n", + " (o_proj): lora.Linear4bit(\n", + " (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)\n", + " (lora_dropout): ModuleDict(\n", + " (default): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (lora_A): ModuleDict(\n", + " (default): Linear(in_features=4096, out_features=8, bias=False)\n", + " )\n", + " (lora_B): ModuleDict(\n", + " (default): Linear(in_features=8, out_features=4096, bias=False)\n", + " )\n", + " (lora_embedding_A): ParameterDict()\n", + " (lora_embedding_B): ParameterDict()\n", + " (lora_magnitude_vector): ModuleDict()\n", + " )\n", + " )\n", + " (mlp): MllamaTextMLP(\n", + " (gate_proj): lora.Linear4bit(\n", + " (base_layer): Linear4bit(in_features=4096, out_features=14336, bias=False)\n", + " (lora_dropout): ModuleDict(\n", + " (default): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (lora_A): ModuleDict(\n", + " (default): Linear(in_features=4096, out_features=8, bias=False)\n", + " )\n", + " (lora_B): ModuleDict(\n", + " (default): Linear(in_features=8, out_features=14336, bias=False)\n", + " )\n", + " (lora_embedding_A): ParameterDict()\n", + " (lora_embedding_B): ParameterDict()\n", + " (lora_magnitude_vector): ModuleDict()\n", + " )\n", + " (up_proj): lora.Linear4bit(\n", + " (base_layer): Linear4bit(in_features=4096, out_features=14336, bias=False)\n", + " (lora_dropout): ModuleDict(\n", + " (default): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (lora_A): ModuleDict(\n", + " (default): Linear(in_features=4096, out_features=8, bias=False)\n", + " )\n", + " (lora_B): ModuleDict(\n", + " (default): Linear(in_features=8, out_features=14336, bias=False)\n", + " )\n", + " (lora_embedding_A): ParameterDict()\n", + " (lora_embedding_B): ParameterDict()\n", + " (lora_magnitude_vector): ModuleDict()\n", + " )\n", + " (down_proj): lora.Linear4bit(\n", + " (base_layer): Linear4bit(in_features=14336, out_features=4096, bias=False)\n", + " (lora_dropout): ModuleDict(\n", + " (default): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (lora_A): ModuleDict(\n", + " (default): Linear(in_features=14336, out_features=8, bias=False)\n", + " )\n", + " (lora_B): ModuleDict(\n", + " (default): Linear(in_features=8, out_features=4096, bias=False)\n", + " )\n", + " (lora_embedding_A): ParameterDict()\n", + " (lora_embedding_B): ParameterDict()\n", + " (lora_magnitude_vector): ModuleDict()\n", + " )\n", + " (act_fn): SiLU()\n", + " )\n", + " (input_layernorm): MllamaTextRMSNorm((4096,), eps=1e-05)\n", + " (post_attention_layernorm): MllamaTextRMSNorm((4096,), eps=1e-05)\n", + " )\n", + " (8): MllamaCrossAttentionDecoderLayer(\n", + " (cross_attn): MllamaTextCrossSdpaAttention(\n", + " (q_proj): lora.Linear4bit(\n", + " (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)\n", + " (lora_dropout): ModuleDict(\n", + " (default): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (lora_A): ModuleDict(\n", + " (default): Linear(in_features=4096, out_features=8, bias=False)\n", + " )\n", + " (lora_B): ModuleDict(\n", + " (default): Linear(in_features=8, out_features=4096, bias=False)\n", + " )\n", + " (lora_embedding_A): ParameterDict()\n", + " (lora_embedding_B): ParameterDict()\n", 
+ " (lora_magnitude_vector): ModuleDict()\n", + " )\n", + " (k_proj): lora.Linear4bit(\n", + " (base_layer): Linear4bit(in_features=4096, out_features=1024, bias=False)\n", + " (lora_dropout): ModuleDict(\n", + " (default): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (lora_A): ModuleDict(\n", + " (default): Linear(in_features=4096, out_features=8, bias=False)\n", + " )\n", + " (lora_B): ModuleDict(\n", + " (default): Linear(in_features=8, out_features=1024, bias=False)\n", + " )\n", + " (lora_embedding_A): ParameterDict()\n", + " (lora_embedding_B): ParameterDict()\n", + " (lora_magnitude_vector): ModuleDict()\n", + " )\n", + " (v_proj): lora.Linear4bit(\n", + " (base_layer): Linear4bit(in_features=4096, out_features=1024, bias=False)\n", + " (lora_dropout): ModuleDict(\n", + " (default): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (lora_A): ModuleDict(\n", + " (default): Linear(in_features=4096, out_features=8, bias=False)\n", + " )\n", + " (lora_B): ModuleDict(\n", + " (default): Linear(in_features=8, out_features=1024, bias=False)\n", + " )\n", + " (lora_embedding_A): ParameterDict()\n", + " (lora_embedding_B): ParameterDict()\n", + " (lora_magnitude_vector): ModuleDict()\n", + " )\n", + " (o_proj): lora.Linear4bit(\n", + " (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)\n", + " (lora_dropout): ModuleDict(\n", + " (default): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (lora_A): ModuleDict(\n", + " (default): Linear(in_features=4096, out_features=8, bias=False)\n", + " )\n", + " (lora_B): ModuleDict(\n", + " (default): Linear(in_features=8, out_features=4096, bias=False)\n", + " )\n", + " (lora_embedding_A): ParameterDict()\n", + " (lora_embedding_B): ParameterDict()\n", + " (lora_magnitude_vector): ModuleDict()\n", + " )\n", + " (q_norm): MllamaTextRMSNorm((128,), eps=1e-05)\n", + " (k_norm): MllamaTextRMSNorm((128,), eps=1e-05)\n", + " )\n", + " (input_layernorm): MllamaTextRMSNorm((4096,), eps=1e-05)\n", + " (mlp): MllamaTextMLP(\n", + " (gate_proj): lora.Linear4bit(\n", + " (base_layer): Linear4bit(in_features=4096, out_features=14336, bias=False)\n", + " (lora_dropout): ModuleDict(\n", + " (default): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (lora_A): ModuleDict(\n", + " (default): Linear(in_features=4096, out_features=8, bias=False)\n", + " )\n", + " (lora_B): ModuleDict(\n", + " (default): Linear(in_features=8, out_features=14336, bias=False)\n", + " )\n", + " (lora_embedding_A): ParameterDict()\n", + " (lora_embedding_B): ParameterDict()\n", + " (lora_magnitude_vector): ModuleDict()\n", + " )\n", + " (up_proj): lora.Linear4bit(\n", + " (base_layer): Linear4bit(in_features=4096, out_features=14336, bias=False)\n", + " (lora_dropout): ModuleDict(\n", + " (default): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (lora_A): ModuleDict(\n", + " (default): Linear(in_features=4096, out_features=8, bias=False)\n", + " )\n", + " (lora_B): ModuleDict(\n", + " (default): Linear(in_features=8, out_features=14336, bias=False)\n", + " )\n", + " (lora_embedding_A): ParameterDict()\n", + " (lora_embedding_B): ParameterDict()\n", + " (lora_magnitude_vector): ModuleDict()\n", + " )\n", + " (down_proj): lora.Linear4bit(\n", + " (base_layer): Linear4bit(in_features=14336, out_features=4096, bias=False)\n", + " (lora_dropout): ModuleDict(\n", + " (default): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (lora_A): ModuleDict(\n", + " (default): Linear(in_features=14336, out_features=8, bias=False)\n", + " )\n", + " (lora_B): ModuleDict(\n", + " (default): 
Linear(in_features=8, out_features=4096, bias=False)\n", + " )\n", + " (lora_embedding_A): ParameterDict()\n", + " (lora_embedding_B): ParameterDict()\n", + " (lora_magnitude_vector): ModuleDict()\n", + " )\n", + " (act_fn): SiLU()\n", + " )\n", + " (post_attention_layernorm): MllamaTextRMSNorm((4096,), eps=1e-05)\n", + " )\n", + " (9-12): 4 x MllamaSelfAttentionDecoderLayer(\n", + " (self_attn): MllamaTextSelfSdpaAttention(\n", + " (q_proj): lora.Linear4bit(\n", + " (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)\n", + " (lora_dropout): ModuleDict(\n", + " (default): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (lora_A): ModuleDict(\n", + " (default): Linear(in_features=4096, out_features=8, bias=False)\n", + " )\n", + " (lora_B): ModuleDict(\n", + " (default): Linear(in_features=8, out_features=4096, bias=False)\n", + " )\n", + " (lora_embedding_A): ParameterDict()\n", + " (lora_embedding_B): ParameterDict()\n", + " (lora_magnitude_vector): ModuleDict()\n", + " )\n", + " (k_proj): lora.Linear4bit(\n", + " (base_layer): Linear4bit(in_features=4096, out_features=1024, bias=False)\n", + " (lora_dropout): ModuleDict(\n", + " (default): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (lora_A): ModuleDict(\n", + " (default): Linear(in_features=4096, out_features=8, bias=False)\n", + " )\n", + " (lora_B): ModuleDict(\n", + " (default): Linear(in_features=8, out_features=1024, bias=False)\n", + " )\n", + " (lora_embedding_A): ParameterDict()\n", + " (lora_embedding_B): ParameterDict()\n", + " (lora_magnitude_vector): ModuleDict()\n", + " )\n", + " (v_proj): lora.Linear4bit(\n", + " (base_layer): Linear4bit(in_features=4096, out_features=1024, bias=False)\n", + " (lora_dropout): ModuleDict(\n", + " (default): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (lora_A): ModuleDict(\n", + " (default): Linear(in_features=4096, out_features=8, bias=False)\n", + " )\n", + " (lora_B): ModuleDict(\n", + " (default): Linear(in_features=8, out_features=1024, bias=False)\n", + " )\n", + " (lora_embedding_A): ParameterDict()\n", + " (lora_embedding_B): ParameterDict()\n", + " (lora_magnitude_vector): ModuleDict()\n", + " )\n", + " (o_proj): lora.Linear4bit(\n", + " (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)\n", + " (lora_dropout): ModuleDict(\n", + " (default): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (lora_A): ModuleDict(\n", + " (default): Linear(in_features=4096, out_features=8, bias=False)\n", + " )\n", + " (lora_B): ModuleDict(\n", + " (default): Linear(in_features=8, out_features=4096, bias=False)\n", + " )\n", + " (lora_embedding_A): ParameterDict()\n", + " (lora_embedding_B): ParameterDict()\n", + " (lora_magnitude_vector): ModuleDict()\n", + " )\n", + " )\n", + " (mlp): MllamaTextMLP(\n", + " (gate_proj): lora.Linear4bit(\n", + " (base_layer): Linear4bit(in_features=4096, out_features=14336, bias=False)\n", + " (lora_dropout): ModuleDict(\n", + " (default): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (lora_A): ModuleDict(\n", + " (default): Linear(in_features=4096, out_features=8, bias=False)\n", + " )\n", + " (lora_B): ModuleDict(\n", + " (default): Linear(in_features=8, out_features=14336, bias=False)\n", + " )\n", + " (lora_embedding_A): ParameterDict()\n", + " (lora_embedding_B): ParameterDict()\n", + " (lora_magnitude_vector): ModuleDict()\n", + " )\n", + " (up_proj): lora.Linear4bit(\n", + " (base_layer): Linear4bit(in_features=4096, out_features=14336, bias=False)\n", + " (lora_dropout): ModuleDict(\n", + " (default): 
Dropout(p=0.1, inplace=False)\n", + " )\n", + " (lora_A): ModuleDict(\n", + " (default): Linear(in_features=4096, out_features=8, bias=False)\n", + " )\n", + " (lora_B): ModuleDict(\n", + " (default): Linear(in_features=8, out_features=14336, bias=False)\n", + " )\n", + " (lora_embedding_A): ParameterDict()\n", + " (lora_embedding_B): ParameterDict()\n", + " (lora_magnitude_vector): ModuleDict()\n", + " )\n", + " (down_proj): lora.Linear4bit(\n", + " (base_layer): Linear4bit(in_features=14336, out_features=4096, bias=False)\n", + " (lora_dropout): ModuleDict(\n", + " (default): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (lora_A): ModuleDict(\n", + " (default): Linear(in_features=14336, out_features=8, bias=False)\n", + " )\n", + " (lora_B): ModuleDict(\n", + " (default): Linear(in_features=8, out_features=4096, bias=False)\n", + " )\n", + " (lora_embedding_A): ParameterDict()\n", + " (lora_embedding_B): ParameterDict()\n", + " (lora_magnitude_vector): ModuleDict()\n", + " )\n", + " (act_fn): SiLU()\n", + " )\n", + " (input_layernorm): MllamaTextRMSNorm((4096,), eps=1e-05)\n", + " (post_attention_layernorm): MllamaTextRMSNorm((4096,), eps=1e-05)\n", + " )\n", + " (13): MllamaCrossAttentionDecoderLayer(\n", + " (cross_attn): MllamaTextCrossSdpaAttention(\n", + " (q_proj): lora.Linear4bit(\n", + " (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)\n", + " (lora_dropout): ModuleDict(\n", + " (default): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (lora_A): ModuleDict(\n", + " (default): Linear(in_features=4096, out_features=8, bias=False)\n", + " )\n", + " (lora_B): ModuleDict(\n", + " (default): Linear(in_features=8, out_features=4096, bias=False)\n", + " )\n", + " (lora_embedding_A): ParameterDict()\n", + " (lora_embedding_B): ParameterDict()\n", + " (lora_magnitude_vector): ModuleDict()\n", + " )\n", + " (k_proj): lora.Linear4bit(\n", + " (base_layer): Linear4bit(in_features=4096, out_features=1024, bias=False)\n", + " (lora_dropout): ModuleDict(\n", + " (default): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (lora_A): ModuleDict(\n", + " (default): Linear(in_features=4096, out_features=8, bias=False)\n", + " )\n", + " (lora_B): ModuleDict(\n", + " (default): Linear(in_features=8, out_features=1024, bias=False)\n", + " )\n", + " (lora_embedding_A): ParameterDict()\n", + " (lora_embedding_B): ParameterDict()\n", + " (lora_magnitude_vector): ModuleDict()\n", + " )\n", + " (v_proj): lora.Linear4bit(\n", + " (base_layer): Linear4bit(in_features=4096, out_features=1024, bias=False)\n", + " (lora_dropout): ModuleDict(\n", + " (default): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (lora_A): ModuleDict(\n", + " (default): Linear(in_features=4096, out_features=8, bias=False)\n", + " )\n", + " (lora_B): ModuleDict(\n", + " (default): Linear(in_features=8, out_features=1024, bias=False)\n", + " )\n", + " (lora_embedding_A): ParameterDict()\n", + " (lora_embedding_B): ParameterDict()\n", + " (lora_magnitude_vector): ModuleDict()\n", + " )\n", + " (o_proj): lora.Linear4bit(\n", + " (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)\n", + " (lora_dropout): ModuleDict(\n", + " (default): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (lora_A): ModuleDict(\n", + " (default): Linear(in_features=4096, out_features=8, bias=False)\n", + " )\n", + " (lora_B): ModuleDict(\n", + " (default): Linear(in_features=8, out_features=4096, bias=False)\n", + " )\n", + " (lora_embedding_A): ParameterDict()\n", + " (lora_embedding_B): ParameterDict()\n", + " 
(lora_magnitude_vector): ModuleDict()\n", + " )\n", + " (q_norm): MllamaTextRMSNorm((128,), eps=1e-05)\n", + " (k_norm): MllamaTextRMSNorm((128,), eps=1e-05)\n", + " )\n", + " (input_layernorm): MllamaTextRMSNorm((4096,), eps=1e-05)\n", + " (mlp): MllamaTextMLP(\n", + " (gate_proj): lora.Linear4bit(\n", + " (base_layer): Linear4bit(in_features=4096, out_features=14336, bias=False)\n", + " (lora_dropout): ModuleDict(\n", + " (default): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (lora_A): ModuleDict(\n", + " (default): Linear(in_features=4096, out_features=8, bias=False)\n", + " )\n", + " (lora_B): ModuleDict(\n", + " (default): Linear(in_features=8, out_features=14336, bias=False)\n", + " )\n", + " (lora_embedding_A): ParameterDict()\n", + " (lora_embedding_B): ParameterDict()\n", + " (lora_magnitude_vector): ModuleDict()\n", + " )\n", + " (up_proj): lora.Linear4bit(\n", + " (base_layer): Linear4bit(in_features=4096, out_features=14336, bias=False)\n", + " (lora_dropout): ModuleDict(\n", + " (default): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (lora_A): ModuleDict(\n", + " (default): Linear(in_features=4096, out_features=8, bias=False)\n", + " )\n", + " (lora_B): ModuleDict(\n", + " (default): Linear(in_features=8, out_features=14336, bias=False)\n", + " )\n", + " (lora_embedding_A): ParameterDict()\n", + " (lora_embedding_B): ParameterDict()\n", + " (lora_magnitude_vector): ModuleDict()\n", + " )\n", + " (down_proj): lora.Linear4bit(\n", + " (base_layer): Linear4bit(in_features=14336, out_features=4096, bias=False)\n", + " (lora_dropout): ModuleDict(\n", + " (default): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (lora_A): ModuleDict(\n", + " (default): Linear(in_features=14336, out_features=8, bias=False)\n", + " )\n", + " (lora_B): ModuleDict(\n", + " (default): Linear(in_features=8, out_features=4096, bias=False)\n", + " )\n", + " (lora_embedding_A): ParameterDict()\n", + " (lora_embedding_B): ParameterDict()\n", + " (lora_magnitude_vector): ModuleDict()\n", + " )\n", + " (act_fn): SiLU()\n", + " )\n", + " (post_attention_layernorm): MllamaTextRMSNorm((4096,), eps=1e-05)\n", + " )\n", + " (14-17): 4 x MllamaSelfAttentionDecoderLayer(\n", + " (self_attn): MllamaTextSelfSdpaAttention(\n", + " (q_proj): lora.Linear4bit(\n", + " (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)\n", + " (lora_dropout): ModuleDict(\n", + " (default): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (lora_A): ModuleDict(\n", + " (default): Linear(in_features=4096, out_features=8, bias=False)\n", + " )\n", + " (lora_B): ModuleDict(\n", + " (default): Linear(in_features=8, out_features=4096, bias=False)\n", + " )\n", + " (lora_embedding_A): ParameterDict()\n", + " (lora_embedding_B): ParameterDict()\n", + " (lora_magnitude_vector): ModuleDict()\n", + " )\n", + " (k_proj): lora.Linear4bit(\n", + " (base_layer): Linear4bit(in_features=4096, out_features=1024, bias=False)\n", + " (lora_dropout): ModuleDict(\n", + " (default): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (lora_A): ModuleDict(\n", + " (default): Linear(in_features=4096, out_features=8, bias=False)\n", + " )\n", + " (lora_B): ModuleDict(\n", + " (default): Linear(in_features=8, out_features=1024, bias=False)\n", + " )\n", + " (lora_embedding_A): ParameterDict()\n", + " (lora_embedding_B): ParameterDict()\n", + " (lora_magnitude_vector): ModuleDict()\n", + " )\n", + " (v_proj): lora.Linear4bit(\n", + " (base_layer): Linear4bit(in_features=4096, out_features=1024, bias=False)\n", + " (lora_dropout): 
ModuleDict(\n", + " (default): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (lora_A): ModuleDict(\n", + " (default): Linear(in_features=4096, out_features=8, bias=False)\n", + " )\n", + " (lora_B): ModuleDict(\n", + " (default): Linear(in_features=8, out_features=1024, bias=False)\n", + " )\n", + " (lora_embedding_A): ParameterDict()\n", + " (lora_embedding_B): ParameterDict()\n", + " (lora_magnitude_vector): ModuleDict()\n", + " )\n", + " (o_proj): lora.Linear4bit(\n", + " (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)\n", + " (lora_dropout): ModuleDict(\n", + " (default): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (lora_A): ModuleDict(\n", + " (default): Linear(in_features=4096, out_features=8, bias=False)\n", + " )\n", + " (lora_B): ModuleDict(\n", + " (default): Linear(in_features=8, out_features=4096, bias=False)\n", + " )\n", + " (lora_embedding_A): ParameterDict()\n", + " (lora_embedding_B): ParameterDict()\n", + " (lora_magnitude_vector): ModuleDict()\n", + " )\n", + " )\n", + " (mlp): MllamaTextMLP(\n", + " (gate_proj): lora.Linear4bit(\n", + " (base_layer): Linear4bit(in_features=4096, out_features=14336, bias=False)\n", + " (lora_dropout): ModuleDict(\n", + " (default): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (lora_A): ModuleDict(\n", + " (default): Linear(in_features=4096, out_features=8, bias=False)\n", + " )\n", + " (lora_B): ModuleDict(\n", + " (default): Linear(in_features=8, out_features=14336, bias=False)\n", + " )\n", + " (lora_embedding_A): ParameterDict()\n", + " (lora_embedding_B): ParameterDict()\n", + " (lora_magnitude_vector): ModuleDict()\n", + " )\n", + " (up_proj): lora.Linear4bit(\n", + " (base_layer): Linear4bit(in_features=4096, out_features=14336, bias=False)\n", + " (lora_dropout): ModuleDict(\n", + " (default): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (lora_A): ModuleDict(\n", + " (default): Linear(in_features=4096, out_features=8, bias=False)\n", + " )\n", + " (lora_B): ModuleDict(\n", + " (default): Linear(in_features=8, out_features=14336, bias=False)\n", + " )\n", + " (lora_embedding_A): ParameterDict()\n", + " (lora_embedding_B): ParameterDict()\n", + " (lora_magnitude_vector): ModuleDict()\n", + " )\n", + " (down_proj): lora.Linear4bit(\n", + " (base_layer): Linear4bit(in_features=14336, out_features=4096, bias=False)\n", + " (lora_dropout): ModuleDict(\n", + " (default): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (lora_A): ModuleDict(\n", + " (default): Linear(in_features=14336, out_features=8, bias=False)\n", + " )\n", + " (lora_B): ModuleDict(\n", + " (default): Linear(in_features=8, out_features=4096, bias=False)\n", + " )\n", + " (lora_embedding_A): ParameterDict()\n", + " (lora_embedding_B): ParameterDict()\n", + " (lora_magnitude_vector): ModuleDict()\n", + " )\n", + " (act_fn): SiLU()\n", + " )\n", + " (input_layernorm): MllamaTextRMSNorm((4096,), eps=1e-05)\n", + " (post_attention_layernorm): MllamaTextRMSNorm((4096,), eps=1e-05)\n", + " )\n", + " (18): MllamaCrossAttentionDecoderLayer(\n", + " (cross_attn): MllamaTextCrossSdpaAttention(\n", + " (q_proj): lora.Linear4bit(\n", + " (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)\n", + " (lora_dropout): ModuleDict(\n", + " (default): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (lora_A): ModuleDict(\n", + " (default): Linear(in_features=4096, out_features=8, bias=False)\n", + " )\n", + " (lora_B): ModuleDict(\n", + " (default): Linear(in_features=8, out_features=4096, bias=False)\n", + " )\n", + " 
(lora_embedding_A): ParameterDict()\n", + " (lora_embedding_B): ParameterDict()\n", + " (lora_magnitude_vector): ModuleDict()\n", + " )\n", + " (k_proj): lora.Linear4bit(\n", + " (base_layer): Linear4bit(in_features=4096, out_features=1024, bias=False)\n", + " (lora_dropout): ModuleDict(\n", + " (default): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (lora_A): ModuleDict(\n", + " (default): Linear(in_features=4096, out_features=8, bias=False)\n", + " )\n", + " (lora_B): ModuleDict(\n", + " (default): Linear(in_features=8, out_features=1024, bias=False)\n", + " )\n", + " (lora_embedding_A): ParameterDict()\n", + " (lora_embedding_B): ParameterDict()\n", + " (lora_magnitude_vector): ModuleDict()\n", + " )\n", + " (v_proj): lora.Linear4bit(\n", + " (base_layer): Linear4bit(in_features=4096, out_features=1024, bias=False)\n", + " (lora_dropout): ModuleDict(\n", + " (default): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (lora_A): ModuleDict(\n", + " (default): Linear(in_features=4096, out_features=8, bias=False)\n", + " )\n", + " (lora_B): ModuleDict(\n", + " (default): Linear(in_features=8, out_features=1024, bias=False)\n", + " )\n", + " (lora_embedding_A): ParameterDict()\n", + " (lora_embedding_B): ParameterDict()\n", + " (lora_magnitude_vector): ModuleDict()\n", + " )\n", + " (o_proj): lora.Linear4bit(\n", + " (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)\n", + " (lora_dropout): ModuleDict(\n", + " (default): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (lora_A): ModuleDict(\n", + " (default): Linear(in_features=4096, out_features=8, bias=False)\n", + " )\n", + " (lora_B): ModuleDict(\n", + " (default): Linear(in_features=8, out_features=4096, bias=False)\n", + " )\n", + " (lora_embedding_A): ParameterDict()\n", + " (lora_embedding_B): ParameterDict()\n", + " (lora_magnitude_vector): ModuleDict()\n", + " )\n", + " (q_norm): MllamaTextRMSNorm((128,), eps=1e-05)\n", + " (k_norm): MllamaTextRMSNorm((128,), eps=1e-05)\n", + " )\n", + " (input_layernorm): MllamaTextRMSNorm((4096,), eps=1e-05)\n", + " (mlp): MllamaTextMLP(\n", + " (gate_proj): lora.Linear4bit(\n", + " (base_layer): Linear4bit(in_features=4096, out_features=14336, bias=False)\n", + " (lora_dropout): ModuleDict(\n", + " (default): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (lora_A): ModuleDict(\n", + " (default): Linear(in_features=4096, out_features=8, bias=False)\n", + " )\n", + " (lora_B): ModuleDict(\n", + " (default): Linear(in_features=8, out_features=14336, bias=False)\n", + " )\n", + " (lora_embedding_A): ParameterDict()\n", + " (lora_embedding_B): ParameterDict()\n", + " (lora_magnitude_vector): ModuleDict()\n", + " )\n", + " (up_proj): lora.Linear4bit(\n", + " (base_layer): Linear4bit(in_features=4096, out_features=14336, bias=False)\n", + " (lora_dropout): ModuleDict(\n", + " (default): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (lora_A): ModuleDict(\n", + " (default): Linear(in_features=4096, out_features=8, bias=False)\n", + " )\n", + " (lora_B): ModuleDict(\n", + " (default): Linear(in_features=8, out_features=14336, bias=False)\n", + " )\n", + " (lora_embedding_A): ParameterDict()\n", + " (lora_embedding_B): ParameterDict()\n", + " (lora_magnitude_vector): ModuleDict()\n", + " )\n", + " (down_proj): lora.Linear4bit(\n", + " (base_layer): Linear4bit(in_features=14336, out_features=4096, bias=False)\n", + " (lora_dropout): ModuleDict(\n", + " (default): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (lora_A): ModuleDict(\n", + " (default): Linear(in_features=14336, 
out_features=8, bias=False)\n", + " )\n", + " (lora_B): ModuleDict(\n", + " (default): Linear(in_features=8, out_features=4096, bias=False)\n", + " )\n", + " (lora_embedding_A): ParameterDict()\n", + " (lora_embedding_B): ParameterDict()\n", + " (lora_magnitude_vector): ModuleDict()\n", + " )\n", + " (act_fn): SiLU()\n", + " )\n", + " (post_attention_layernorm): MllamaTextRMSNorm((4096,), eps=1e-05)\n", + " )\n", + " (19-22): 4 x MllamaSelfAttentionDecoderLayer(\n", + " (self_attn): MllamaTextSelfSdpaAttention(\n", + " (q_proj): lora.Linear4bit(\n", + " (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)\n", + " (lora_dropout): ModuleDict(\n", + " (default): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (lora_A): ModuleDict(\n", + " (default): Linear(in_features=4096, out_features=8, bias=False)\n", + " )\n", + " (lora_B): ModuleDict(\n", + " (default): Linear(in_features=8, out_features=4096, bias=False)\n", + " )\n", + " (lora_embedding_A): ParameterDict()\n", + " (lora_embedding_B): ParameterDict()\n", + " (lora_magnitude_vector): ModuleDict()\n", + " )\n", + " (k_proj): lora.Linear4bit(\n", + " (base_layer): Linear4bit(in_features=4096, out_features=1024, bias=False)\n", + " (lora_dropout): ModuleDict(\n", + " (default): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (lora_A): ModuleDict(\n", + " (default): Linear(in_features=4096, out_features=8, bias=False)\n", + " )\n", + " (lora_B): ModuleDict(\n", + " (default): Linear(in_features=8, out_features=1024, bias=False)\n", + " )\n", + " (lora_embedding_A): ParameterDict()\n", + " (lora_embedding_B): ParameterDict()\n", + " (lora_magnitude_vector): ModuleDict()\n", + " )\n", + " (v_proj): lora.Linear4bit(\n", + " (base_layer): Linear4bit(in_features=4096, out_features=1024, bias=False)\n", + " (lora_dropout): ModuleDict(\n", + " (default): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (lora_A): ModuleDict(\n", + " (default): Linear(in_features=4096, out_features=8, bias=False)\n", + " )\n", + " (lora_B): ModuleDict(\n", + " (default): Linear(in_features=8, out_features=1024, bias=False)\n", + " )\n", + " (lora_embedding_A): ParameterDict()\n", + " (lora_embedding_B): ParameterDict()\n", + " (lora_magnitude_vector): ModuleDict()\n", + " )\n", + " (o_proj): lora.Linear4bit(\n", + " (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)\n", + " (lora_dropout): ModuleDict(\n", + " (default): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (lora_A): ModuleDict(\n", + " (default): Linear(in_features=4096, out_features=8, bias=False)\n", + " )\n", + " (lora_B): ModuleDict(\n", + " (default): Linear(in_features=8, out_features=4096, bias=False)\n", + " )\n", + " (lora_embedding_A): ParameterDict()\n", + " (lora_embedding_B): ParameterDict()\n", + " (lora_magnitude_vector): ModuleDict()\n", + " )\n", + " )\n", + " (mlp): MllamaTextMLP(\n", + " (gate_proj): lora.Linear4bit(\n", + " (base_layer): Linear4bit(in_features=4096, out_features=14336, bias=False)\n", + " (lora_dropout): ModuleDict(\n", + " (default): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (lora_A): ModuleDict(\n", + " (default): Linear(in_features=4096, out_features=8, bias=False)\n", + " )\n", + " (lora_B): ModuleDict(\n", + " (default): Linear(in_features=8, out_features=14336, bias=False)\n", + " )\n", + " (lora_embedding_A): ParameterDict()\n", + " (lora_embedding_B): ParameterDict()\n", + " (lora_magnitude_vector): ModuleDict()\n", + " )\n", + " (up_proj): lora.Linear4bit(\n", + " (base_layer): Linear4bit(in_features=4096, 
out_features=14336, bias=False)\n", + " (lora_dropout): ModuleDict(\n", + " (default): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (lora_A): ModuleDict(\n", + " (default): Linear(in_features=4096, out_features=8, bias=False)\n", + " )\n", + " (lora_B): ModuleDict(\n", + " (default): Linear(in_features=8, out_features=14336, bias=False)\n", + " )\n", + " (lora_embedding_A): ParameterDict()\n", + " (lora_embedding_B): ParameterDict()\n", + " (lora_magnitude_vector): ModuleDict()\n", + " )\n", + " (down_proj): lora.Linear4bit(\n", + " (base_layer): Linear4bit(in_features=14336, out_features=4096, bias=False)\n", + " (lora_dropout): ModuleDict(\n", + " (default): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (lora_A): ModuleDict(\n", + " (default): Linear(in_features=14336, out_features=8, bias=False)\n", + " )\n", + " (lora_B): ModuleDict(\n", + " (default): Linear(in_features=8, out_features=4096, bias=False)\n", + " )\n", + " (lora_embedding_A): ParameterDict()\n", + " (lora_embedding_B): ParameterDict()\n", + " (lora_magnitude_vector): ModuleDict()\n", + " )\n", + " (act_fn): SiLU()\n", + " )\n", + " (input_layernorm): MllamaTextRMSNorm((4096,), eps=1e-05)\n", + " (post_attention_layernorm): MllamaTextRMSNorm((4096,), eps=1e-05)\n", + " )\n", + " (23): MllamaCrossAttentionDecoderLayer(\n", + " (cross_attn): MllamaTextCrossSdpaAttention(\n", + " (q_proj): lora.Linear4bit(\n", + " (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)\n", + " (lora_dropout): ModuleDict(\n", + " (default): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (lora_A): ModuleDict(\n", + " (default): Linear(in_features=4096, out_features=8, bias=False)\n", + " )\n", + " (lora_B): ModuleDict(\n", + " (default): Linear(in_features=8, out_features=4096, bias=False)\n", + " )\n", + " (lora_embedding_A): ParameterDict()\n", + " (lora_embedding_B): ParameterDict()\n", + " (lora_magnitude_vector): ModuleDict()\n", + " )\n", + " (k_proj): lora.Linear4bit(\n", + " (base_layer): Linear4bit(in_features=4096, out_features=1024, bias=False)\n", + " (lora_dropout): ModuleDict(\n", + " (default): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (lora_A): ModuleDict(\n", + " (default): Linear(in_features=4096, out_features=8, bias=False)\n", + " )\n", + " (lora_B): ModuleDict(\n", + " (default): Linear(in_features=8, out_features=1024, bias=False)\n", + " )\n", + " (lora_embedding_A): ParameterDict()\n", + " (lora_embedding_B): ParameterDict()\n", + " (lora_magnitude_vector): ModuleDict()\n", + " )\n", + " (v_proj): lora.Linear4bit(\n", + " (base_layer): Linear4bit(in_features=4096, out_features=1024, bias=False)\n", + " (lora_dropout): ModuleDict(\n", + " (default): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (lora_A): ModuleDict(\n", + " (default): Linear(in_features=4096, out_features=8, bias=False)\n", + " )\n", + " (lora_B): ModuleDict(\n", + " (default): Linear(in_features=8, out_features=1024, bias=False)\n", + " )\n", + " (lora_embedding_A): ParameterDict()\n", + " (lora_embedding_B): ParameterDict()\n", + " (lora_magnitude_vector): ModuleDict()\n", + " )\n", + " (o_proj): lora.Linear4bit(\n", + " (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)\n", + " (lora_dropout): ModuleDict(\n", + " (default): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (lora_A): ModuleDict(\n", + " (default): Linear(in_features=4096, out_features=8, bias=False)\n", + " )\n", + " (lora_B): ModuleDict(\n", + " (default): Linear(in_features=8, out_features=4096, bias=False)\n", + " )\n", + " 
(lora_embedding_A): ParameterDict()\n", + " (lora_embedding_B): ParameterDict()\n", + " (lora_magnitude_vector): ModuleDict()\n", + " )\n", + " (q_norm): MllamaTextRMSNorm((128,), eps=1e-05)\n", + " (k_norm): MllamaTextRMSNorm((128,), eps=1e-05)\n", + " )\n", + " (input_layernorm): MllamaTextRMSNorm((4096,), eps=1e-05)\n", + " (mlp): MllamaTextMLP(\n", + " (gate_proj): lora.Linear4bit(\n", + " (base_layer): Linear4bit(in_features=4096, out_features=14336, bias=False)\n", + " (lora_dropout): ModuleDict(\n", + " (default): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (lora_A): ModuleDict(\n", + " (default): Linear(in_features=4096, out_features=8, bias=False)\n", + " )\n", + " (lora_B): ModuleDict(\n", + " (default): Linear(in_features=8, out_features=14336, bias=False)\n", + " )\n", + " (lora_embedding_A): ParameterDict()\n", + " (lora_embedding_B): ParameterDict()\n", + " (lora_magnitude_vector): ModuleDict()\n", + " )\n", + " (up_proj): lora.Linear4bit(\n", + " (base_layer): Linear4bit(in_features=4096, out_features=14336, bias=False)\n", + " (lora_dropout): ModuleDict(\n", + " (default): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (lora_A): ModuleDict(\n", + " (default): Linear(in_features=4096, out_features=8, bias=False)\n", + " )\n", + " (lora_B): ModuleDict(\n", + " (default): Linear(in_features=8, out_features=14336, bias=False)\n", + " )\n", + " (lora_embedding_A): ParameterDict()\n", + " (lora_embedding_B): ParameterDict()\n", + " (lora_magnitude_vector): ModuleDict()\n", + " )\n", + " (down_proj): lora.Linear4bit(\n", + " (base_layer): Linear4bit(in_features=14336, out_features=4096, bias=False)\n", + " (lora_dropout): ModuleDict(\n", + " (default): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (lora_A): ModuleDict(\n", + " (default): Linear(in_features=14336, out_features=8, bias=False)\n", + " )\n", + " (lora_B): ModuleDict(\n", + " (default): Linear(in_features=8, out_features=4096, bias=False)\n", + " )\n", + " (lora_embedding_A): ParameterDict()\n", + " (lora_embedding_B): ParameterDict()\n", + " (lora_magnitude_vector): ModuleDict()\n", + " )\n", + " (act_fn): SiLU()\n", + " )\n", + " (post_attention_layernorm): MllamaTextRMSNorm((4096,), eps=1e-05)\n", + " )\n", + " (24-27): 4 x MllamaSelfAttentionDecoderLayer(\n", + " (self_attn): MllamaTextSelfSdpaAttention(\n", + " (q_proj): lora.Linear4bit(\n", + " (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)\n", + " (lora_dropout): ModuleDict(\n", + " (default): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (lora_A): ModuleDict(\n", + " (default): Linear(in_features=4096, out_features=8, bias=False)\n", + " )\n", + " (lora_B): ModuleDict(\n", + " (default): Linear(in_features=8, out_features=4096, bias=False)\n", + " )\n", + " (lora_embedding_A): ParameterDict()\n", + " (lora_embedding_B): ParameterDict()\n", + " (lora_magnitude_vector): ModuleDict()\n", + " )\n", + " (k_proj): lora.Linear4bit(\n", + " (base_layer): Linear4bit(in_features=4096, out_features=1024, bias=False)\n", + " (lora_dropout): ModuleDict(\n", + " (default): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (lora_A): ModuleDict(\n", + " (default): Linear(in_features=4096, out_features=8, bias=False)\n", + " )\n", + " (lora_B): ModuleDict(\n", + " (default): Linear(in_features=8, out_features=1024, bias=False)\n", + " )\n", + " (lora_embedding_A): ParameterDict()\n", + " (lora_embedding_B): ParameterDict()\n", + " (lora_magnitude_vector): ModuleDict()\n", + " )\n", + " (v_proj): lora.Linear4bit(\n", + " (base_layer): 
Linear4bit(in_features=4096, out_features=1024, bias=False)\n", + " (lora_dropout): ModuleDict(\n", + " (default): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (lora_A): ModuleDict(\n", + " (default): Linear(in_features=4096, out_features=8, bias=False)\n", + " )\n", + " (lora_B): ModuleDict(\n", + " (default): Linear(in_features=8, out_features=1024, bias=False)\n", + " )\n", + " (lora_embedding_A): ParameterDict()\n", + " (lora_embedding_B): ParameterDict()\n", + " (lora_magnitude_vector): ModuleDict()\n", + " )\n", + " (o_proj): lora.Linear4bit(\n", + " (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)\n", + " (lora_dropout): ModuleDict(\n", + " (default): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (lora_A): ModuleDict(\n", + " (default): Linear(in_features=4096, out_features=8, bias=False)\n", + " )\n", + " (lora_B): ModuleDict(\n", + " (default): Linear(in_features=8, out_features=4096, bias=False)\n", + " )\n", + " (lora_embedding_A): ParameterDict()\n", + " (lora_embedding_B): ParameterDict()\n", + " (lora_magnitude_vector): ModuleDict()\n", + " )\n", + " )\n", + " (mlp): MllamaTextMLP(\n", + " (gate_proj): lora.Linear4bit(\n", + " (base_layer): Linear4bit(in_features=4096, out_features=14336, bias=False)\n", + " (lora_dropout): ModuleDict(\n", + " (default): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (lora_A): ModuleDict(\n", + " (default): Linear(in_features=4096, out_features=8, bias=False)\n", + " )\n", + " (lora_B): ModuleDict(\n", + " (default): Linear(in_features=8, out_features=14336, bias=False)\n", + " )\n", + " (lora_embedding_A): ParameterDict()\n", + " (lora_embedding_B): ParameterDict()\n", + " (lora_magnitude_vector): ModuleDict()\n", + " )\n", + " (up_proj): lora.Linear4bit(\n", + " (base_layer): Linear4bit(in_features=4096, out_features=14336, bias=False)\n", + " (lora_dropout): ModuleDict(\n", + " (default): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (lora_A): ModuleDict(\n", + " (default): Linear(in_features=4096, out_features=8, bias=False)\n", + " )\n", + " (lora_B): ModuleDict(\n", + " (default): Linear(in_features=8, out_features=14336, bias=False)\n", + " )\n", + " (lora_embedding_A): ParameterDict()\n", + " (lora_embedding_B): ParameterDict()\n", + " (lora_magnitude_vector): ModuleDict()\n", + " )\n", + " (down_proj): lora.Linear4bit(\n", + " (base_layer): Linear4bit(in_features=14336, out_features=4096, bias=False)\n", + " (lora_dropout): ModuleDict(\n", + " (default): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (lora_A): ModuleDict(\n", + " (default): Linear(in_features=14336, out_features=8, bias=False)\n", + " )\n", + " (lora_B): ModuleDict(\n", + " (default): Linear(in_features=8, out_features=4096, bias=False)\n", + " )\n", + " (lora_embedding_A): ParameterDict()\n", + " (lora_embedding_B): ParameterDict()\n", + " (lora_magnitude_vector): ModuleDict()\n", + " )\n", + " (act_fn): SiLU()\n", + " )\n", + " (input_layernorm): MllamaTextRMSNorm((4096,), eps=1e-05)\n", + " (post_attention_layernorm): MllamaTextRMSNorm((4096,), eps=1e-05)\n", + " )\n", + " (28): MllamaCrossAttentionDecoderLayer(\n", + " (cross_attn): MllamaTextCrossSdpaAttention(\n", + " (q_proj): lora.Linear4bit(\n", + " (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)\n", + " (lora_dropout): ModuleDict(\n", + " (default): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (lora_A): ModuleDict(\n", + " (default): Linear(in_features=4096, out_features=8, bias=False)\n", + " )\n", + " (lora_B): ModuleDict(\n", + " (default): 
Linear(in_features=8, out_features=4096, bias=False)\n", + " )\n", + " (lora_embedding_A): ParameterDict()\n", + " (lora_embedding_B): ParameterDict()\n", + " (lora_magnitude_vector): ModuleDict()\n", + " )\n", + " (k_proj): lora.Linear4bit(\n", + " (base_layer): Linear4bit(in_features=4096, out_features=1024, bias=False)\n", + " (lora_dropout): ModuleDict(\n", + " (default): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (lora_A): ModuleDict(\n", + " (default): Linear(in_features=4096, out_features=8, bias=False)\n", + " )\n", + " (lora_B): ModuleDict(\n", + " (default): Linear(in_features=8, out_features=1024, bias=False)\n", + " )\n", + " (lora_embedding_A): ParameterDict()\n", + " (lora_embedding_B): ParameterDict()\n", + " (lora_magnitude_vector): ModuleDict()\n", + " )\n", + " (v_proj): lora.Linear4bit(\n", + " (base_layer): Linear4bit(in_features=4096, out_features=1024, bias=False)\n", + " (lora_dropout): ModuleDict(\n", + " (default): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (lora_A): ModuleDict(\n", + " (default): Linear(in_features=4096, out_features=8, bias=False)\n", + " )\n", + " (lora_B): ModuleDict(\n", + " (default): Linear(in_features=8, out_features=1024, bias=False)\n", + " )\n", + " (lora_embedding_A): ParameterDict()\n", + " (lora_embedding_B): ParameterDict()\n", + " (lora_magnitude_vector): ModuleDict()\n", + " )\n", + " (o_proj): lora.Linear4bit(\n", + " (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)\n", + " (lora_dropout): ModuleDict(\n", + " (default): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (lora_A): ModuleDict(\n", + " (default): Linear(in_features=4096, out_features=8, bias=False)\n", + " )\n", + " (lora_B): ModuleDict(\n", + " (default): Linear(in_features=8, out_features=4096, bias=False)\n", + " )\n", + " (lora_embedding_A): ParameterDict()\n", + " (lora_embedding_B): ParameterDict()\n", + " (lora_magnitude_vector): ModuleDict()\n", + " )\n", + " (q_norm): MllamaTextRMSNorm((128,), eps=1e-05)\n", + " (k_norm): MllamaTextRMSNorm((128,), eps=1e-05)\n", + " )\n", + " (input_layernorm): MllamaTextRMSNorm((4096,), eps=1e-05)\n", + " (mlp): MllamaTextMLP(\n", + " (gate_proj): lora.Linear4bit(\n", + " (base_layer): Linear4bit(in_features=4096, out_features=14336, bias=False)\n", + " (lora_dropout): ModuleDict(\n", + " (default): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (lora_A): ModuleDict(\n", + " (default): Linear(in_features=4096, out_features=8, bias=False)\n", + " )\n", + " (lora_B): ModuleDict(\n", + " (default): Linear(in_features=8, out_features=14336, bias=False)\n", + " )\n", + " (lora_embedding_A): ParameterDict()\n", + " (lora_embedding_B): ParameterDict()\n", + " (lora_magnitude_vector): ModuleDict()\n", + " )\n", + " (up_proj): lora.Linear4bit(\n", + " (base_layer): Linear4bit(in_features=4096, out_features=14336, bias=False)\n", + " (lora_dropout): ModuleDict(\n", + " (default): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (lora_A): ModuleDict(\n", + " (default): Linear(in_features=4096, out_features=8, bias=False)\n", + " )\n", + " (lora_B): ModuleDict(\n", + " (default): Linear(in_features=8, out_features=14336, bias=False)\n", + " )\n", + " (lora_embedding_A): ParameterDict()\n", + " (lora_embedding_B): ParameterDict()\n", + " (lora_magnitude_vector): ModuleDict()\n", + " )\n", + " (down_proj): lora.Linear4bit(\n", + " (base_layer): Linear4bit(in_features=14336, out_features=4096, bias=False)\n", + " (lora_dropout): ModuleDict(\n", + " (default): Dropout(p=0.1, inplace=False)\n", + " )\n", + " 
(lora_A): ModuleDict(\n", + " (default): Linear(in_features=14336, out_features=8, bias=False)\n", + " )\n", + " (lora_B): ModuleDict(\n", + " (default): Linear(in_features=8, out_features=4096, bias=False)\n", + " )\n", + " (lora_embedding_A): ParameterDict()\n", + " (lora_embedding_B): ParameterDict()\n", + " (lora_magnitude_vector): ModuleDict()\n", + " )\n", + " (act_fn): SiLU()\n", + " )\n", + " (post_attention_layernorm): MllamaTextRMSNorm((4096,), eps=1e-05)\n", + " )\n", + " (29-32): 4 x MllamaSelfAttentionDecoderLayer(\n", + " (self_attn): MllamaTextSelfSdpaAttention(\n", + " (q_proj): lora.Linear4bit(\n", + " (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)\n", + " (lora_dropout): ModuleDict(\n", + " (default): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (lora_A): ModuleDict(\n", + " (default): Linear(in_features=4096, out_features=8, bias=False)\n", + " )\n", + " (lora_B): ModuleDict(\n", + " (default): Linear(in_features=8, out_features=4096, bias=False)\n", + " )\n", + " (lora_embedding_A): ParameterDict()\n", + " (lora_embedding_B): ParameterDict()\n", + " (lora_magnitude_vector): ModuleDict()\n", + " )\n", + " (k_proj): lora.Linear4bit(\n", + " (base_layer): Linear4bit(in_features=4096, out_features=1024, bias=False)\n", + " (lora_dropout): ModuleDict(\n", + " (default): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (lora_A): ModuleDict(\n", + " (default): Linear(in_features=4096, out_features=8, bias=False)\n", + " )\n", + " (lora_B): ModuleDict(\n", + " (default): Linear(in_features=8, out_features=1024, bias=False)\n", + " )\n", + " (lora_embedding_A): ParameterDict()\n", + " (lora_embedding_B): ParameterDict()\n", + " (lora_magnitude_vector): ModuleDict()\n", + " )\n", + " (v_proj): lora.Linear4bit(\n", + " (base_layer): Linear4bit(in_features=4096, out_features=1024, bias=False)\n", + " (lora_dropout): ModuleDict(\n", + " (default): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (lora_A): ModuleDict(\n", + " (default): Linear(in_features=4096, out_features=8, bias=False)\n", + " )\n", + " (lora_B): ModuleDict(\n", + " (default): Linear(in_features=8, out_features=1024, bias=False)\n", + " )\n", + " (lora_embedding_A): ParameterDict()\n", + " (lora_embedding_B): ParameterDict()\n", + " (lora_magnitude_vector): ModuleDict()\n", + " )\n", + " (o_proj): lora.Linear4bit(\n", + " (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)\n", + " (lora_dropout): ModuleDict(\n", + " (default): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (lora_A): ModuleDict(\n", + " (default): Linear(in_features=4096, out_features=8, bias=False)\n", + " )\n", + " (lora_B): ModuleDict(\n", + " (default): Linear(in_features=8, out_features=4096, bias=False)\n", + " )\n", + " (lora_embedding_A): ParameterDict()\n", + " (lora_embedding_B): ParameterDict()\n", + " (lora_magnitude_vector): ModuleDict()\n", + " )\n", + " )\n", + " (mlp): MllamaTextMLP(\n", + " (gate_proj): lora.Linear4bit(\n", + " (base_layer): Linear4bit(in_features=4096, out_features=14336, bias=False)\n", + " (lora_dropout): ModuleDict(\n", + " (default): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (lora_A): ModuleDict(\n", + " (default): Linear(in_features=4096, out_features=8, bias=False)\n", + " )\n", + " (lora_B): ModuleDict(\n", + " (default): Linear(in_features=8, out_features=14336, bias=False)\n", + " )\n", + " (lora_embedding_A): ParameterDict()\n", + " (lora_embedding_B): ParameterDict()\n", + " (lora_magnitude_vector): ModuleDict()\n", + " )\n", + " (up_proj): 
lora.Linear4bit(\n", + " (base_layer): Linear4bit(in_features=4096, out_features=14336, bias=False)\n", + " (lora_dropout): ModuleDict(\n", + " (default): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (lora_A): ModuleDict(\n", + " (default): Linear(in_features=4096, out_features=8, bias=False)\n", + " )\n", + " (lora_B): ModuleDict(\n", + " (default): Linear(in_features=8, out_features=14336, bias=False)\n", + " )\n", + " (lora_embedding_A): ParameterDict()\n", + " (lora_embedding_B): ParameterDict()\n", + " (lora_magnitude_vector): ModuleDict()\n", + " )\n", + " (down_proj): lora.Linear4bit(\n", + " (base_layer): Linear4bit(in_features=14336, out_features=4096, bias=False)\n", + " (lora_dropout): ModuleDict(\n", + " (default): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (lora_A): ModuleDict(\n", + " (default): Linear(in_features=14336, out_features=8, bias=False)\n", + " )\n", + " (lora_B): ModuleDict(\n", + " (default): Linear(in_features=8, out_features=4096, bias=False)\n", + " )\n", + " (lora_embedding_A): ParameterDict()\n", + " (lora_embedding_B): ParameterDict()\n", + " (lora_magnitude_vector): ModuleDict()\n", + " )\n", + " (act_fn): SiLU()\n", + " )\n", + " (input_layernorm): MllamaTextRMSNorm((4096,), eps=1e-05)\n", + " (post_attention_layernorm): MllamaTextRMSNorm((4096,), eps=1e-05)\n", + " )\n", + " (33): MllamaCrossAttentionDecoderLayer(\n", + " (cross_attn): MllamaTextCrossSdpaAttention(\n", + " (q_proj): lora.Linear4bit(\n", + " (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)\n", + " (lora_dropout): ModuleDict(\n", + " (default): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (lora_A): ModuleDict(\n", + " (default): Linear(in_features=4096, out_features=8, bias=False)\n", + " )\n", + " (lora_B): ModuleDict(\n", + " (default): Linear(in_features=8, out_features=4096, bias=False)\n", + " )\n", + " (lora_embedding_A): ParameterDict()\n", + " (lora_embedding_B): ParameterDict()\n", + " (lora_magnitude_vector): ModuleDict()\n", + " )\n", + " (k_proj): lora.Linear4bit(\n", + " (base_layer): Linear4bit(in_features=4096, out_features=1024, bias=False)\n", + " (lora_dropout): ModuleDict(\n", + " (default): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (lora_A): ModuleDict(\n", + " (default): Linear(in_features=4096, out_features=8, bias=False)\n", + " )\n", + " (lora_B): ModuleDict(\n", + " (default): Linear(in_features=8, out_features=1024, bias=False)\n", + " )\n", + " (lora_embedding_A): ParameterDict()\n", + " (lora_embedding_B): ParameterDict()\n", + " (lora_magnitude_vector): ModuleDict()\n", + " )\n", + " (v_proj): lora.Linear4bit(\n", + " (base_layer): Linear4bit(in_features=4096, out_features=1024, bias=False)\n", + " (lora_dropout): ModuleDict(\n", + " (default): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (lora_A): ModuleDict(\n", + " (default): Linear(in_features=4096, out_features=8, bias=False)\n", + " )\n", + " (lora_B): ModuleDict(\n", + " (default): Linear(in_features=8, out_features=1024, bias=False)\n", + " )\n", + " (lora_embedding_A): ParameterDict()\n", + " (lora_embedding_B): ParameterDict()\n", + " (lora_magnitude_vector): ModuleDict()\n", + " )\n", + " (o_proj): lora.Linear4bit(\n", + " (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)\n", + " (lora_dropout): ModuleDict(\n", + " (default): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (lora_A): ModuleDict(\n", + " (default): Linear(in_features=4096, out_features=8, bias=False)\n", + " )\n", + " (lora_B): ModuleDict(\n", + " (default): 
Linear(in_features=8, out_features=4096, bias=False)\n", + " )\n", + " (lora_embedding_A): ParameterDict()\n", + " (lora_embedding_B): ParameterDict()\n", + " (lora_magnitude_vector): ModuleDict()\n", + " )\n", + " (q_norm): MllamaTextRMSNorm((128,), eps=1e-05)\n", + " (k_norm): MllamaTextRMSNorm((128,), eps=1e-05)\n", + " )\n", + " (input_layernorm): MllamaTextRMSNorm((4096,), eps=1e-05)\n", + " (mlp): MllamaTextMLP(\n", + " (gate_proj): lora.Linear4bit(\n", + " (base_layer): Linear4bit(in_features=4096, out_features=14336, bias=False)\n", + " (lora_dropout): ModuleDict(\n", + " (default): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (lora_A): ModuleDict(\n", + " (default): Linear(in_features=4096, out_features=8, bias=False)\n", + " )\n", + " (lora_B): ModuleDict(\n", + " (default): Linear(in_features=8, out_features=14336, bias=False)\n", + " )\n", + " (lora_embedding_A): ParameterDict()\n", + " (lora_embedding_B): ParameterDict()\n", + " (lora_magnitude_vector): ModuleDict()\n", + " )\n", + " (up_proj): lora.Linear4bit(\n", + " (base_layer): Linear4bit(in_features=4096, out_features=14336, bias=False)\n", + " (lora_dropout): ModuleDict(\n", + " (default): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (lora_A): ModuleDict(\n", + " (default): Linear(in_features=4096, out_features=8, bias=False)\n", + " )\n", + " (lora_B): ModuleDict(\n", + " (default): Linear(in_features=8, out_features=14336, bias=False)\n", + " )\n", + " (lora_embedding_A): ParameterDict()\n", + " (lora_embedding_B): ParameterDict()\n", + " (lora_magnitude_vector): ModuleDict()\n", + " )\n", + " (down_proj): lora.Linear4bit(\n", + " (base_layer): Linear4bit(in_features=14336, out_features=4096, bias=False)\n", + " (lora_dropout): ModuleDict(\n", + " (default): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (lora_A): ModuleDict(\n", + " (default): Linear(in_features=14336, out_features=8, bias=False)\n", + " )\n", + " (lora_B): ModuleDict(\n", + " (default): Linear(in_features=8, out_features=4096, bias=False)\n", + " )\n", + " (lora_embedding_A): ParameterDict()\n", + " (lora_embedding_B): ParameterDict()\n", + " (lora_magnitude_vector): ModuleDict()\n", + " )\n", + " (act_fn): SiLU()\n", + " )\n", + " (post_attention_layernorm): MllamaTextRMSNorm((4096,), eps=1e-05)\n", + " )\n", + " (34-37): 4 x MllamaSelfAttentionDecoderLayer(\n", + " (self_attn): MllamaTextSelfSdpaAttention(\n", + " (q_proj): lora.Linear4bit(\n", + " (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)\n", + " (lora_dropout): ModuleDict(\n", + " (default): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (lora_A): ModuleDict(\n", + " (default): Linear(in_features=4096, out_features=8, bias=False)\n", + " )\n", + " (lora_B): ModuleDict(\n", + " (default): Linear(in_features=8, out_features=4096, bias=False)\n", + " )\n", + " (lora_embedding_A): ParameterDict()\n", + " (lora_embedding_B): ParameterDict()\n", + " (lora_magnitude_vector): ModuleDict()\n", + " )\n", + " (k_proj): lora.Linear4bit(\n", + " (base_layer): Linear4bit(in_features=4096, out_features=1024, bias=False)\n", + " (lora_dropout): ModuleDict(\n", + " (default): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (lora_A): ModuleDict(\n", + " (default): Linear(in_features=4096, out_features=8, bias=False)\n", + " )\n", + " (lora_B): ModuleDict(\n", + " (default): Linear(in_features=8, out_features=1024, bias=False)\n", + " )\n", + " (lora_embedding_A): ParameterDict()\n", + " (lora_embedding_B): ParameterDict()\n", + " (lora_magnitude_vector): ModuleDict()\n", + 
" )\n", + " (v_proj): lora.Linear4bit(\n", + " (base_layer): Linear4bit(in_features=4096, out_features=1024, bias=False)\n", + " (lora_dropout): ModuleDict(\n", + " (default): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (lora_A): ModuleDict(\n", + " (default): Linear(in_features=4096, out_features=8, bias=False)\n", + " )\n", + " (lora_B): ModuleDict(\n", + " (default): Linear(in_features=8, out_features=1024, bias=False)\n", + " )\n", + " (lora_embedding_A): ParameterDict()\n", + " (lora_embedding_B): ParameterDict()\n", + " (lora_magnitude_vector): ModuleDict()\n", + " )\n", + " (o_proj): lora.Linear4bit(\n", + " (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)\n", + " (lora_dropout): ModuleDict(\n", + " (default): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (lora_A): ModuleDict(\n", + " (default): Linear(in_features=4096, out_features=8, bias=False)\n", + " )\n", + " (lora_B): ModuleDict(\n", + " (default): Linear(in_features=8, out_features=4096, bias=False)\n", + " )\n", + " (lora_embedding_A): ParameterDict()\n", + " (lora_embedding_B): ParameterDict()\n", + " (lora_magnitude_vector): ModuleDict()\n", + " )\n", + " )\n", + " (mlp): MllamaTextMLP(\n", + " (gate_proj): lora.Linear4bit(\n", + " (base_layer): Linear4bit(in_features=4096, out_features=14336, bias=False)\n", + " (lora_dropout): ModuleDict(\n", + " (default): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (lora_A): ModuleDict(\n", + " (default): Linear(in_features=4096, out_features=8, bias=False)\n", + " )\n", + " (lora_B): ModuleDict(\n", + " (default): Linear(in_features=8, out_features=14336, bias=False)\n", + " )\n", + " (lora_embedding_A): ParameterDict()\n", + " (lora_embedding_B): ParameterDict()\n", + " (lora_magnitude_vector): ModuleDict()\n", + " )\n", + " (up_proj): lora.Linear4bit(\n", + " (base_layer): Linear4bit(in_features=4096, out_features=14336, bias=False)\n", + " (lora_dropout): ModuleDict(\n", + " (default): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (lora_A): ModuleDict(\n", + " (default): Linear(in_features=4096, out_features=8, bias=False)\n", + " )\n", + " (lora_B): ModuleDict(\n", + " (default): Linear(in_features=8, out_features=14336, bias=False)\n", + " )\n", + " (lora_embedding_A): ParameterDict()\n", + " (lora_embedding_B): ParameterDict()\n", + " (lora_magnitude_vector): ModuleDict()\n", + " )\n", + " (down_proj): lora.Linear4bit(\n", + " (base_layer): Linear4bit(in_features=14336, out_features=4096, bias=False)\n", + " (lora_dropout): ModuleDict(\n", + " (default): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (lora_A): ModuleDict(\n", + " (default): Linear(in_features=14336, out_features=8, bias=False)\n", + " )\n", + " (lora_B): ModuleDict(\n", + " (default): Linear(in_features=8, out_features=4096, bias=False)\n", + " )\n", + " (lora_embedding_A): ParameterDict()\n", + " (lora_embedding_B): ParameterDict()\n", + " (lora_magnitude_vector): ModuleDict()\n", + " )\n", + " (act_fn): SiLU()\n", + " )\n", + " (input_layernorm): MllamaTextRMSNorm((4096,), eps=1e-05)\n", + " (post_attention_layernorm): MllamaTextRMSNorm((4096,), eps=1e-05)\n", + " )\n", + " (38): MllamaCrossAttentionDecoderLayer(\n", + " (cross_attn): MllamaTextCrossSdpaAttention(\n", + " (q_proj): lora.Linear4bit(\n", + " (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)\n", + " (lora_dropout): ModuleDict(\n", + " (default): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (lora_A): ModuleDict(\n", + " (default): Linear(in_features=4096, out_features=8, 
bias=False)\n", + " )\n", + " (lora_B): ModuleDict(\n", + " (default): Linear(in_features=8, out_features=4096, bias=False)\n", + " )\n", + " (lora_embedding_A): ParameterDict()\n", + " (lora_embedding_B): ParameterDict()\n", + " (lora_magnitude_vector): ModuleDict()\n", + " )\n", + " (k_proj): lora.Linear4bit(\n", + " (base_layer): Linear4bit(in_features=4096, out_features=1024, bias=False)\n", + " (lora_dropout): ModuleDict(\n", + " (default): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (lora_A): ModuleDict(\n", + " (default): Linear(in_features=4096, out_features=8, bias=False)\n", + " )\n", + " (lora_B): ModuleDict(\n", + " (default): Linear(in_features=8, out_features=1024, bias=False)\n", + " )\n", + " (lora_embedding_A): ParameterDict()\n", + " (lora_embedding_B): ParameterDict()\n", + " (lora_magnitude_vector): ModuleDict()\n", + " )\n", + " (v_proj): lora.Linear4bit(\n", + " (base_layer): Linear4bit(in_features=4096, out_features=1024, bias=False)\n", + " (lora_dropout): ModuleDict(\n", + " (default): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (lora_A): ModuleDict(\n", + " (default): Linear(in_features=4096, out_features=8, bias=False)\n", + " )\n", + " (lora_B): ModuleDict(\n", + " (default): Linear(in_features=8, out_features=1024, bias=False)\n", + " )\n", + " (lora_embedding_A): ParameterDict()\n", + " (lora_embedding_B): ParameterDict()\n", + " (lora_magnitude_vector): ModuleDict()\n", + " )\n", + " (o_proj): lora.Linear4bit(\n", + " (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)\n", + " (lora_dropout): ModuleDict(\n", + " (default): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (lora_A): ModuleDict(\n", + " (default): Linear(in_features=4096, out_features=8, bias=False)\n", + " )\n", + " (lora_B): ModuleDict(\n", + " (default): Linear(in_features=8, out_features=4096, bias=False)\n", + " )\n", + " (lora_embedding_A): ParameterDict()\n", + " (lora_embedding_B): ParameterDict()\n", + " (lora_magnitude_vector): ModuleDict()\n", + " )\n", + " (q_norm): MllamaTextRMSNorm((128,), eps=1e-05)\n", + " (k_norm): MllamaTextRMSNorm((128,), eps=1e-05)\n", + " )\n", + " (input_layernorm): MllamaTextRMSNorm((4096,), eps=1e-05)\n", + " (mlp): MllamaTextMLP(\n", + " (gate_proj): lora.Linear4bit(\n", + " (base_layer): Linear4bit(in_features=4096, out_features=14336, bias=False)\n", + " (lora_dropout): ModuleDict(\n", + " (default): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (lora_A): ModuleDict(\n", + " (default): Linear(in_features=4096, out_features=8, bias=False)\n", + " )\n", + " (lora_B): ModuleDict(\n", + " (default): Linear(in_features=8, out_features=14336, bias=False)\n", + " )\n", + " (lora_embedding_A): ParameterDict()\n", + " (lora_embedding_B): ParameterDict()\n", + " (lora_magnitude_vector): ModuleDict()\n", + " )\n", + " (up_proj): lora.Linear4bit(\n", + " (base_layer): Linear4bit(in_features=4096, out_features=14336, bias=False)\n", + " (lora_dropout): ModuleDict(\n", + " (default): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (lora_A): ModuleDict(\n", + " (default): Linear(in_features=4096, out_features=8, bias=False)\n", + " )\n", + " (lora_B): ModuleDict(\n", + " (default): Linear(in_features=8, out_features=14336, bias=False)\n", + " )\n", + " (lora_embedding_A): ParameterDict()\n", + " (lora_embedding_B): ParameterDict()\n", + " (lora_magnitude_vector): ModuleDict()\n", + " )\n", + " (down_proj): lora.Linear4bit(\n", + " (base_layer): Linear4bit(in_features=14336, out_features=4096, bias=False)\n", + " (lora_dropout): 
ModuleDict(\n", + " (default): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (lora_A): ModuleDict(\n", + " (default): Linear(in_features=14336, out_features=8, bias=False)\n", + " )\n", + " (lora_B): ModuleDict(\n", + " (default): Linear(in_features=8, out_features=4096, bias=False)\n", + " )\n", + " (lora_embedding_A): ParameterDict()\n", + " (lora_embedding_B): ParameterDict()\n", + " (lora_magnitude_vector): ModuleDict()\n", + " )\n", + " (act_fn): SiLU()\n", + " )\n", + " (post_attention_layernorm): MllamaTextRMSNorm((4096,), eps=1e-05)\n", + " )\n", + " (39): MllamaSelfAttentionDecoderLayer(\n", + " (self_attn): MllamaTextSelfSdpaAttention(\n", + " (q_proj): lora.Linear4bit(\n", + " (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)\n", + " (lora_dropout): ModuleDict(\n", + " (default): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (lora_A): ModuleDict(\n", + " (default): Linear(in_features=4096, out_features=8, bias=False)\n", + " )\n", + " (lora_B): ModuleDict(\n", + " (default): Linear(in_features=8, out_features=4096, bias=False)\n", + " )\n", + " (lora_embedding_A): ParameterDict()\n", + " (lora_embedding_B): ParameterDict()\n", + " (lora_magnitude_vector): ModuleDict()\n", + " )\n", + " (k_proj): lora.Linear4bit(\n", + " (base_layer): Linear4bit(in_features=4096, out_features=1024, bias=False)\n", + " (lora_dropout): ModuleDict(\n", + " (default): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (lora_A): ModuleDict(\n", + " (default): Linear(in_features=4096, out_features=8, bias=False)\n", + " )\n", + " (lora_B): ModuleDict(\n", + " (default): Linear(in_features=8, out_features=1024, bias=False)\n", + " )\n", + " (lora_embedding_A): ParameterDict()\n", + " (lora_embedding_B): ParameterDict()\n", + " (lora_magnitude_vector): ModuleDict()\n", + " )\n", + " (v_proj): lora.Linear4bit(\n", + " (base_layer): Linear4bit(in_features=4096, out_features=1024, bias=False)\n", + " (lora_dropout): ModuleDict(\n", + " (default): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (lora_A): ModuleDict(\n", + " (default): Linear(in_features=4096, out_features=8, bias=False)\n", + " )\n", + " (lora_B): ModuleDict(\n", + " (default): Linear(in_features=8, out_features=1024, bias=False)\n", + " )\n", + " (lora_embedding_A): ParameterDict()\n", + " (lora_embedding_B): ParameterDict()\n", + " (lora_magnitude_vector): ModuleDict()\n", + " )\n", + " (o_proj): lora.Linear4bit(\n", + " (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)\n", + " (lora_dropout): ModuleDict(\n", + " (default): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (lora_A): ModuleDict(\n", + " (default): Linear(in_features=4096, out_features=8, bias=False)\n", + " )\n", + " (lora_B): ModuleDict(\n", + " (default): Linear(in_features=8, out_features=4096, bias=False)\n", + " )\n", + " (lora_embedding_A): ParameterDict()\n", + " (lora_embedding_B): ParameterDict()\n", + " (lora_magnitude_vector): ModuleDict()\n", + " )\n", + " )\n", + " (mlp): MllamaTextMLP(\n", + " (gate_proj): lora.Linear4bit(\n", + " (base_layer): Linear4bit(in_features=4096, out_features=14336, bias=False)\n", + " (lora_dropout): ModuleDict(\n", + " (default): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (lora_A): ModuleDict(\n", + " (default): Linear(in_features=4096, out_features=8, bias=False)\n", + " )\n", + " (lora_B): ModuleDict(\n", + " (default): Linear(in_features=8, out_features=14336, bias=False)\n", + " )\n", + " (lora_embedding_A): ParameterDict()\n", + " (lora_embedding_B): ParameterDict()\n", + " 
(lora_magnitude_vector): ModuleDict()\n",
+       "                  )\n",
+       "                  (up_proj): lora.Linear4bit(\n",
+       "                    (base_layer): Linear4bit(in_features=4096, out_features=14336, bias=False)\n",
+       "                    (lora_dropout): ModuleDict(\n",
+       "                      (default): Dropout(p=0.1, inplace=False)\n",
+       "                    )\n",
+       "                    (lora_A): ModuleDict(\n",
+       "                      (default): Linear(in_features=4096, out_features=8, bias=False)\n",
+       "                    )\n",
+       "                    (lora_B): ModuleDict(\n",
+       "                      (default): Linear(in_features=8, out_features=14336, bias=False)\n",
+       "                    )\n",
+       "                    (lora_embedding_A): ParameterDict()\n",
+       "                    (lora_embedding_B): ParameterDict()\n",
+       "                    (lora_magnitude_vector): ModuleDict()\n",
+       "                  )\n",
+       "                  (down_proj): lora.Linear4bit(\n",
+       "                    (base_layer): Linear4bit(in_features=14336, out_features=4096, bias=False)\n",
+       "                    (lora_dropout): ModuleDict(\n",
+       "                      (default): Dropout(p=0.1, inplace=False)\n",
+       "                    )\n",
+       "                    (lora_A): ModuleDict(\n",
+       "                      (default): Linear(in_features=14336, out_features=8, bias=False)\n",
+       "                    )\n",
+       "                    (lora_B): ModuleDict(\n",
+       "                      (default): Linear(in_features=8, out_features=4096, bias=False)\n",
+       "                    )\n",
+       "                    (lora_embedding_A): ParameterDict()\n",
+       "                    (lora_embedding_B): ParameterDict()\n",
+       "                    (lora_magnitude_vector): ModuleDict()\n",
+       "                  )\n",
+       "                  (act_fn): SiLU()\n",
+       "                )\n",
+       "                (input_layernorm): MllamaTextRMSNorm((4096,), eps=1e-05)\n",
+       "                (post_attention_layernorm): MllamaTextRMSNorm((4096,), eps=1e-05)\n",
+       "              )\n",
+       "            )\n",
+       "            (norm): MllamaTextRMSNorm((4096,), eps=1e-05)\n",
+       "            (rotary_emb): MllamaRotaryEmbedding()\n",
+       "          )\n",
+       "          (lm_head): Linear(in_features=4096, out_features=128256, bias=False)\n",
+       "        )\n",
+       "        (multi_modal_projector): Linear4bit(in_features=7680, out_features=4096, bias=True)\n",
+       "      )\n",
+       "    )\n",
+       ")"
+      ]
+     },
+     "execution_count": 7,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
    "source": [
     "model"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 8,
    "id": "f2c2fd34-e1e4-427d-86d0-73bf74ff0005",
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "{'question': 'What is the main factor in a patient with metastatic bone disease?',\n",
+       " 'context': 'Perioperative considerations in patients with metastatic bone disease.'}"
+      ]
+     },
+     "execution_count": 8,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
    "source": [
     "eval_dataset\n",
     "eval_dataset[2310]"
@@ -418,7 +2575,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 9,
    "id": "29d96aea-445d-482d-b7dc-861635a5389c",
    "metadata": {
     "executionInfo": {
@@ -433,14 +2590,29 @@
     },
     "id": "X6TWyPHaAMtH"
    },
-   "outputs": [],
+   "outputs": [
+    {
+     "ename": "ValueError",
+     "evalue": "'<image>' is not in list",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[0;31mValueError\u001b[0m                                Traceback (most recent call last)",
+      "Cell \u001b[0;32mIn[9], line 46\u001b[0m\n\u001b[1;32m     40\u001b[0m \u001b[38;5;66;03m#labels = batch[\"input_ids\"].clone()\u001b[39;00m\n\u001b[1;32m     41\u001b[0m \u001b[38;5;66;03m#labels[labels == processor.tokenizer.pad_token_id] = self.image_token_id\u001b[39;00m\n\u001b[1;32m     42\u001b[0m \u001b[38;5;66;03m#batch[\"labels\"] = labels\u001b[39;00m\n\u001b[1;32m     44\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m batch\n\u001b[0;32m---> 46\u001b[0m data_collator \u001b[38;5;241m=\u001b[39m MyDataCollator(processor)\n",
+      "Cell \u001b[0;32mIn[9], line 5\u001b[0m, in \u001b[0;36mMyDataCollator.__init__\u001b[0;34m(self, 
processor)\u001b[0m\n\u001b[1;32m      2\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m__init__\u001b[39m(\u001b[38;5;28mself\u001b[39m, processor):\n\u001b[1;32m      3\u001b[0m     \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mprocessor \u001b[38;5;241m=\u001b[39m processor\n\u001b[1;32m      4\u001b[0m     \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mimage_token_id \u001b[38;5;241m=\u001b[39m processor\u001b[38;5;241m.\u001b[39mtokenizer\u001b[38;5;241m.\u001b[39madditional_special_tokens_ids[\n\u001b[0;32m----> 5\u001b[0m         processor\u001b[38;5;241m.\u001b[39mtokenizer\u001b[38;5;241m.\u001b[39madditional_special_tokens\u001b[38;5;241m.\u001b[39mindex(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m<image>\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m      6\u001b[0m     ]\n",
+      "\u001b[0;31mValueError\u001b[0m: '<image>' is not in list"
+     ]
+    }
+   ],
    "source": [
     "class MyDataCollator:\n",
     "    def __init__(self, processor):\n",
     "        self.processor = processor\n",
-    "        self.image_token_id = processor.tokenizer.additional_special_tokens_ids[\n",
-    "            processor.tokenizer.additional_special_tokens.index(\"<image>\")\n",
-    "        ]\n",
+    "        # NOTE: \"<image>\" is not among this tokenizer's additional_special_tokens\n",
+    "        # (hence the ValueError above), so the lookup is disabled for now.\n",
+    "        #self.image_token_id = processor.tokenizer.additional_special_tokens_ids[\n",
+    "        #    processor.tokenizer.additional_special_tokens.index(\"<image>\")\n",
+    "        #]\n",
     "\n",
     "    def __call__(self, samples):\n",
     "        texts = []\n",