Saving and Loading the fine-tuned model
I fine-tuned the model on my data with the SentenceTransformers library, but evidently just calling model.save() does not work: it saves without errors, but when I reload the model in the next session I get the warning "Some weights of BertModel were not initialized from the model checkpoint at ... and are newly initialized".
Can you please help me figure out how to save and reload the model (ideally with the SentenceTransformers library)?
OK, I think I found a workaround:
!git clone https://huggingface.co/jinaai/jina-bert-implementation
!mv jina-bert-implementation jina_bert_implementation
!touch jina_bert_implementation/__init__.py
from jina_bert_implementation.modeling_bert import JinaBertModel
checkpoint = "my_checkpoint"
model = JinaBertModel.from_pretrained(checkpoint)
model.to(device)
from transformers import AutoTokenizer
import torch
#Mean Pooling - Take attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    """Return the attention-mask-weighted mean of the token embeddings.

    Args:
        model_output: Indexable model output whose first element holds the
            token embeddings (presumably ``(batch, seq_len, hidden)`` -- the
            code sums over dimension 1; confirm against the model).
        attention_mask: Mask with one fewer dimension than the token
            embeddings; positions with value 0 do not contribute.

    Returns:
        The token embeddings summed over dimension 1 and divided by the
        per-feature mask sum for each row.
    """
    token_embeddings = model_output[0]  # first element contains all token embeddings
    # Broadcast the mask over the trailing (feature) dimension.
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    summed = torch.sum(token_embeddings * input_mask_expanded, 1)
    # Clamp so a fully masked row divides by 1e-9 instead of 0.
    counts = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
    return summed / counts
# Load the tokenizer of the base model from the HuggingFace Hub
tokenizer = AutoTokenizer.from_pretrained('jinaai/jina-embeddings-v2-base-en')

# Tokenize sentences (`sentences` must already hold the input strings)
encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')
# Move every tokenizer output tensor to the same device as the model
encoded_input = {
    key: val.to(device) for key, val in encoded_input.items()
}

# Compute token embeddings without tracking gradients
with torch.no_grad():
    model_output = model(**encoded_input)

# Perform pooling. In this case, mean pooling.
sentences_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
Still would appreciate the help, because I want to load it as SentenceTransformer for ease of use.
Hi @Maiia, can you manually edit the SentenceTransformer class and add trust_remote_code=True where sbert calls AutoModel.from_pretrained(...)?
I think the SBert main branch supports it, but the latest PyPI release does not.
I was not able to find where to change it, but I adapted the function and created a class similar to SentenceTransformer (at least it does the encoding efficiently).
Maybe someone else finds it useful:
from tqdm.notebook import trange
import numpy as np
import torch
from transformers import AutoModel, AutoTokenizer
def mean_pooling(model_output, attention_mask):
    """Return the attention-mask-weighted mean of the token embeddings.

    Args:
        model_output: Indexable model output whose first element holds the
            token embeddings (presumably ``(batch, seq_len, hidden)`` -- the
            code sums over dimension 1; confirm against the model).
        attention_mask: Mask with one fewer dimension than the token
            embeddings; positions with value 0 do not contribute.

    Returns:
        The token embeddings summed over dimension 1 and divided by the
        per-feature mask sum for each row.
    """
    token_embeddings = model_output[0]  # first element contains all token embeddings
    # Broadcast the mask over the trailing (feature) dimension.
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    summed = torch.sum(token_embeddings * input_mask_expanded, 1)
    # Clamp so a fully masked row divides by 1e-9 instead of 0.
    counts = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
    return summed / counts
# Module-level tokenizer for the base Jina model. JinaSentEmbedder below loads
# its own tokenizer in __init__ and does not read this global.
tokenizer = AutoTokenizer.from_pretrained('jinaai/jina-embeddings-v2-base-en')
class JinaSentEmbedder:
    """Small SentenceTransformer-like encoder around a Jina BERT checkpoint.

    The wrapped model is held by composition in ``self.model``. The class does
    not subclass ``AutoModel``: the original never called the base initializer
    and used none of its behaviour.

    Attributes set in ``__init__``:
        device: ``"cuda"`` when CUDA is available, otherwise ``"cpu"``.
        model: Model loaded with ``AutoModel.from_pretrained`` and moved to
            ``device``.
        tokenize: The tokenizer object (name kept for compatibility).
    """

    def __init__(self, path, tokenizer_name="jinaai/jina-embeddings-v2-base-en"):
        """Load the model from ``path`` and the tokenizer from ``tokenizer_name``.

        Args:
            path: Checkpoint directory or hub id passed to
                ``AutoModel.from_pretrained`` with ``trust_remote_code=True``.
            tokenizer_name: Tokenizer to load; defaults to the base Jina model
                that was previously hard-coded.
        """
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model = AutoModel.from_pretrained(
            path,
            trust_remote_code=True,
        )
        self.model = self.model.to(self.device)
        self.tokenize = AutoTokenizer.from_pretrained(tokenizer_name)

    def _text_length(self, text):
        """Return a length used only to sort inputs before batching.

        Dicts yield the length of their first value, objects without
        ``__len__`` yield 1, empty inputs and lists of ints yield ``len(text)``,
        and anything else yields the summed length of its elements.
        """
        if isinstance(text, dict):  # {key: value} case
            return len(next(iter(text.values())))
        if not hasattr(text, '__len__'):  # object has no len() method
            return 1
        if len(text) == 0 or isinstance(text[0], int):  # empty string or list of ints
            return len(text)
        return sum(len(t) for t in text)

    def encode(self, sentences,
               batch_size = 32,
               show_progress_bar = None,
               output_value: str = 'sentence_embedding',
               convert_to_numpy: bool = True,
               convert_to_tensor: bool = False,
               device: str = None,
               normalize_embeddings: bool = False):
        """Compute mean-pooled sentence embeddings.

        Args:
            sentences: One sentence or a sequence of sentences.
            batch_size: Number of sentences tokenized and encoded per step.
            show_progress_bar: Show the batch progress bar when truthy.
            output_value: Any value other than ``'sentence_embedding'`` only
                disables the tensor/numpy conversion; sentence embeddings are
                still what is returned.
            convert_to_numpy: Return a numpy array (ignored when
                ``convert_to_tensor`` is set).
            convert_to_tensor: Return one stacked tensor.
            device: Device to run on; defaults to ``self.device``. The model is
                moved to this device.
            normalize_embeddings: L2-normalize each embedding.

        Returns:
            A stacked tensor, a numpy array, or a list of CPU tensors depending
            on the conversion flags, in the input order. For a single
            non-sequence input, only its embedding is returned.
        """
        self.model.eval()
        if convert_to_tensor:
            convert_to_numpy = False
        if output_value != 'sentence_embedding':
            convert_to_tensor = False
            convert_to_numpy = False

        input_was_string = False
        # Cast an individual sentence to a list with length 1
        if isinstance(sentences, str) or not hasattr(sentences, '__len__'):
            sentences = [sentences]
            input_was_string = True

        # Honour the `device` argument (previously accepted but ignored).
        target_device = self.device if device is None else device
        self.model.to(target_device)

        all_embeddings = []
        # Sort longest-first so each batch pads to similar lengths.
        length_sorted_idx = np.argsort([-self._text_length(sen) for sen in sentences])
        sentences_sorted = [sentences[idx] for idx in length_sorted_idx]

        for start_index in trange(0, len(sentences), batch_size, desc="Batches", disable=not show_progress_bar):
            sentences_batch = sentences_sorted[start_index:start_index + batch_size]
            encoded_input = self.tokenize(sentences_batch, padding=True, truncation=True, return_tensors='pt')
            encoded_input = {key: val.to(target_device) for key, val in encoded_input.items()}
            with torch.no_grad():
                model_output = self.model(**encoded_input)
                sentences_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
                if normalize_embeddings:
                    # Previously accepted but never applied.
                    sentences_embeddings = torch.nn.functional.normalize(sentences_embeddings, p=2, dim=1)
            all_embeddings.extend(sentences_embeddings)

        # Undo the length sort so outputs line up with the inputs.
        all_embeddings = [all_embeddings[idx].cpu() for idx in np.argsort(length_sorted_idx)]

        if convert_to_tensor:
            # torch.stack rejects an empty list; return an empty tensor instead.
            all_embeddings = torch.stack(all_embeddings) if all_embeddings else torch.Tensor()
        elif convert_to_numpy:
            all_embeddings = np.asarray([emb.numpy() for emb in all_embeddings])

        if input_was_string:
            all_embeddings = all_embeddings[0]
        return all_embeddings
Hi @Maiia, could you please share the code you utilized for fine-tuning this model?
Thank you in advance!
@metalwhale
Hello, it's just normal SentenceTransformers fine-tuning, I have marked up pairs of phrases with labels (so phrase1, phrase2, label) where "label" can be either "pos" or "neg"
# Names used below come from earlier cells of the poster's notebook: `pl` is
# polars, plus torch-free sentence-transformers pieces (InputExample, losses,
# EmbeddingSimilarityEvaluator), a DataLoader, and train_test_split
# (presumably scikit-learn's -- confirm). `model`, `dedup_negatives` and
# `hard_positives` must already exist.

# One row per (title 1, title 2) pair: negatives first, then positives.
title_df = pl.DataFrame({
    "title 1": [el[0] for el in dedup_negatives] + [el[0] for el in hard_positives],
    'title 2': [el[1] for el in dedup_negatives] + [el[1] for el in hard_positives],
    'label': ['neg'] * len(dedup_negatives) + ['pos'] * len(hard_positives)
})

# A single full shuffle already yields a random row order.
title_df = title_df.sample(fraction=1, shuffle=True)

# Hold out 10% for validation, keeping the pos/neg ratio in both splits.
train_df, val_df = train_test_split(title_df, random_state=42,
                                    test_size=0.1,
                                    stratify=title_df['label'].to_list())

# CosineSimilarityLoss targets: 1.0 for "pos" pairs, 0.0 for "neg" pairs.
train_examples = [
    InputExample(texts=[row['title 1'], row['title 2']],
                 label=1.0 if row['label'] == 'pos' else 0.0)
    for row in train_df.iter_rows(named=True)
]
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=16)
train_loss = losses.CosineSimilarityLoss(model)

# Validation pairs with the same 1.0 / 0.0 gold scores.
sentences1 = val_df['title 1'].to_list()
sentences2 = val_df['title 2'].to_list()
scores = [1.0 if el == 'pos' else 0.0 for el in val_df['label'].to_list()]
evaluator = EmbeddingSimilarityEvaluator(sentences1, sentences2, scores)

# Warm up over the first 10% of steps and evaluate every 10% of an epoch.
model.fit(train_objectives=[(train_dataloader, train_loss)], epochs=1,
          warmup_steps=len(train_dataloader)//10,
          evaluator=evaluator, evaluation_steps=len(train_dataloader)//10)
@metalwhale I hope it helps, here is the documentation: https://www.sbert.net/docs/training/overview.html
@Maiia thank you so much for your kind help. I really appreciate it!
Hi @Maiia , I followed your step to add a JinaSentEmbedder class and load the jina model with the following code:
# Local directory holding the Jina checkpoint to load.
jina_path = './jina-embeddings-v2-base-code'
model = JinaSentEmbedder(jina_path)
...
# NOTE: JinaSentEmbedder as posted in this thread defines only __init__,
# _text_length and encode; the poster reports that `fit` is not available on it.
model.fit(...)
It seems `fit` is not a part of JinaSentEmbedder. I read the source code of sentence-transformers and found that `fit` is implemented in SentenceTransformer.
Does this mean I should copy the SentenceTransformer class and patch in the functions implemented in JinaSentEmbedder? Or is there another way to load JinaSentEmbedder as a SentenceTransformer?
This might be a dumb question as I am new to transformers etc. Thank you in advance!
@shijy16 a lot has happened since this issue was created. You can now load this model and finetune with it with normal Sentence Transformers. A complete training script may look like this:
import logging
from datasets import load_dataset, Dataset
from sentence_transformers import (
SentenceTransformer,
SentenceTransformerTrainer,
SentenceTransformerTrainingArguments,
SentenceTransformerModelCardData,
)
from sentence_transformers.losses import MultipleNegativesRankingLoss
from sentence_transformers.training_args import BatchSamplers
from sentence_transformers.evaluation import InformationRetrievalEvaluator
# Timestamped INFO-level logging for the whole run.
logging.basicConfig(
    format="%(asctime)s - %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
    level=logging.INFO,
)

# Name reused for the output directory, the run name and the hub repository.
model_name = "jina-v2-base-natural-questions"

# 1. Load a model to finetune with 2. (Optional) model card data
card_data = SentenceTransformerModelCardData(
    language="en",
    license="apache-2.0",
    model_name="jina-embeddings-v2-base-en trained on Natural Questions pairs",
)
model = SentenceTransformer(
    "jinaai/jina-embeddings-v2-base-en",
    trust_remote_code=True,
    model_card_data=card_data,
)

# 3. Load a dataset to finetune on; the added "id" column is the row index
dataset = load_dataset("sentence-transformers/natural-questions", split="train")
dataset = dataset.add_column("id", range(len(dataset)))
# First 90,000 rows train, the remainder evaluates.
train_dataset: Dataset = dataset.select(range(90_000))
eval_dataset: Dataset = dataset.select(range(90_000, len(dataset)))

# 4. Define a loss function
loss = MultipleNegativesRankingLoss(model)

# 5. (Optional) Specify training arguments
args = SentenceTransformerTrainingArguments(
    # Required parameter:
    output_dir=f"models/{model_name}",
    # Optional training parameters:
    num_train_epochs=1,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    learning_rate=2e-5,
    warmup_ratio=0.1,
    fp16=False,  # FP16 mixed precision is disabled here
    bf16=True,  # BF16 mixed precision; set to False if your GPU lacks BF16 support
    batch_sampler=BatchSamplers.NO_DUPLICATES,  # MultipleNegativesRankingLoss benefits from no duplicate samples in a batch
    # Optional tracking/debugging parameters:
    eval_strategy="steps",
    eval_steps=200,
    save_strategy="steps",
    save_steps=200,
    save_total_limit=2,
    logging_steps=200,
    logging_first_step=True,
    run_name=model_name,  # Will be used in W&B if `wandb` is installed
)

# 6. (Optional) Create an evaluator & evaluate the base model
# Queries come from the evaluation rows only; the corpus holds the answers of
# the first 10,000 rows plus the answers of every evaluation row.
eval_ids = eval_dataset["id"]
queries = {qid: query for qid, query in zip(eval_ids, eval_dataset["query"])}
corpus = {cid: dataset[cid]["answer"] for cid in range(10_000)}
corpus.update({cid: dataset[cid]["answer"] for cid in eval_ids})
# Each query's only relevant document is the answer stored under the same id.
relevant_docs = {qid: {qid} for qid in eval_ids}
dev_evaluator = InformationRetrievalEvaluator(
    corpus=corpus,
    queries=queries,
    relevant_docs=relevant_docs,
    show_progress_bar=True,
    name="natural-questions-dev",
    batch_size=8,
)
dev_evaluator(model)

# 7. Create a trainer & train (the helper "id" column is dropped first)
trainer = SentenceTransformerTrainer(
    model=model,
    args=args,
    train_dataset=train_dataset.remove_columns("id"),
    eval_dataset=eval_dataset.remove_columns("id"),
    loss=loss,
    evaluator=dev_evaluator,
)
trainer.train()

# (Optional) Evaluate the trained model on the evaluator after training
dev_evaluator(model)

# 8. Save the trained model
model.save_pretrained(f"models/{model_name}/final")

# 9. (Optional) Push it to the Hugging Face Hub
model.push_to_hub(f"{model_name}")
- Tom Aarsen
@tomaarsen This is exactly the answer I am looking for. I really appreciate it. Thank you!
thanks @tomaarsen for the quick reply!
@shijy16 a lot has happened since this issue was created. You can now load this model and finetune with it with normal Sentence Transformers. A complete training script may look like this:
import logging

from datasets import load_dataset, Dataset
from sentence_transformers import (
    SentenceTransformer,
    SentenceTransformerTrainer,
    SentenceTransformerTrainingArguments,
    SentenceTransformerModelCardData,
)
from sentence_transformers.losses import MultipleNegativesRankingLoss
from sentence_transformers.training_args import BatchSamplers
from sentence_transformers.evaluation import InformationRetrievalEvaluator

logging.basicConfig(
    format="%(asctime)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S", level=logging.INFO
)

# 1. Load a model to finetune with 2. (Optional) model card data
model = SentenceTransformer(
    "jinaai/jina-embeddings-v2-base-en",
    trust_remote_code=True,
    model_card_data=SentenceTransformerModelCardData(
        language="en",
        license="apache-2.0",
        model_name="jina-embeddings-v2-base-en trained on Natural Questions pairs",
    ),
)
model_name = "jina-v2-base-natural-questions"

# 3. Load a dataset to finetune on
dataset = load_dataset("sentence-transformers/natural-questions", split="train")
dataset = dataset.add_column("id", range(len(dataset)))
train_dataset: Dataset = dataset.select(range(90_000))
eval_dataset: Dataset = dataset.select(range(90_000, len(dataset)))

# 4. Define a loss function
loss = MultipleNegativesRankingLoss(model)

# 5. (Optional) Specify training arguments
args = SentenceTransformerTrainingArguments(
    # Required parameter:
    output_dir=f"models/{model_name}",
    # Optional training parameters:
    num_train_epochs=1,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    learning_rate=2e-5,
    warmup_ratio=0.1,
    fp16=False,  # Set to False if you get an error that your GPU can't run on FP16
    bf16=True,  # Set to True if you have a GPU that supports BF16
    batch_sampler=BatchSamplers.NO_DUPLICATES,  # MultipleNegativesRankingLoss benefits from no duplicate samples in a batch
    # Optional tracking/debugging parameters:
    eval_strategy="steps",
    eval_steps=200,
    save_strategy="steps",
    save_steps=200,
    save_total_limit=2,
    logging_steps=200,
    logging_first_step=True,
    run_name=model_name,  # Will be used in W&B if `wandb` is installed
)

# 6. (Optional) Create an evaluator & evaluate the base model
# The full corpus, but only the evaluation queries
queries = dict(zip(eval_dataset["id"], eval_dataset["query"]))
corpus = {cid: dataset[cid]["answer"] for cid in range(10_000)} | {cid: dataset[cid]["answer"] for cid in eval_dataset["id"]}
relevant_docs = {qid: {qid} for qid in eval_dataset["id"]}
dev_evaluator = InformationRetrievalEvaluator(
    corpus=corpus,
    queries=queries,
    relevant_docs=relevant_docs,
    show_progress_bar=True,
    name="natural-questions-dev",
    batch_size=8,
)
dev_evaluator(model)

# 7. Create a trainer & train
trainer = SentenceTransformerTrainer(
    model=model,
    args=args,
    train_dataset=train_dataset.remove_columns("id"),
    eval_dataset=eval_dataset.remove_columns("id"),
    loss=loss,
    evaluator=dev_evaluator,
)
trainer.train()

# (Optional) Evaluate the trained model on the evaluator after training
dev_evaluator(model)

# 8. Save the trained model
model.save_pretrained(f"models/{model_name}/final")

# 9. (Optional) Push it to the Hugging Face Hub
model.push_to_hub(f"{model_name}")
- Tom Aarsen
Can I apply the same approach to the new version, Jina Embeddings v3?