Sentence Similarity
sentence-transformers
PyTorch
English
bert
feature-extraction
mteb
custom_code
Eval Results (legacy)
text-embeddings-inference
Instructions to use Hum-Works/lodestone-base-4096-v1 with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- sentence-transformers
How to use Hum-Works/lodestone-base-4096-v1 with sentence-transformers:
from sentence_transformers import SentenceTransformer model = SentenceTransformer("Hum-Works/lodestone-base-4096-v1", trust_remote_code=True) sentences = [ "That is a happy person", "That is a happy dog", "That is a very happy person", "Today is a sunny day" ] embeddings = model.encode(sentences) similarities = model.similarity(embeddings, embeddings) print(similarities.shape) # [4, 4] - Notebooks
- Google Colab
- Kaggle
| # This training script is a duplicate of the Training.ipynb notebook but can be invoked from the terminal | |
| import os | |
| print(os.getcwd()) | |
| os.environ["PATH"]="/usr/local/cuda-11.7/bin:"+os.getenv("PATH") | |
| os.system('pip uninstall -y torch') | |
| os.system('pip uninstall -y einops') | |
| os.system('pip uninstall -y transformers') | |
| os.system('pip uninstall -y sentence_transformers') | |
| os.system('pip uninstall -y datasets') | |
| os.system('pip uninstall -y sagemaker') | |
| os.system('pip uninstall -y smart_open') | |
| os.system('pip uninstall -y pynvml') | |
| os.system('pip install -r lodestone-reqs.txt') | |
| os.system('pip install -e ./sentence-transformers') | |
| os.system('pip uninstall -y triton') | |
| os.system('pip install --no-deps triton==2.0.0.dev20221202') | |
| ##### | |
| from pynvml import * | |
| import math | |
| from sentence_transformers import models, losses | |
| from sentence_transformers import LoggingHandler, SentenceTransformer, util, InputExample | |
| import logging | |
| import os | |
| import json | |
| import torch | |
| import boto3 | |
| from smart_open import open | |
| import random | |
| import time | |
| import gc | |
| os.environ["PATH"]="/usr/local/cuda-11.7/bin:"+os.getenv("PATH") | |
| os.environ["TOKENIZERS_PARALLELISM"] = "false" | |
| ##### | |
| def print_gpu_utilization(): | |
| "This helper function outputs the current GPU memory usage." | |
| nvmlInit() | |
| handle = nvmlDeviceGetHandleByIndex(0) | |
| info = nvmlDeviceGetMemoryInfo(handle) | |
| return f"GPU memory occupied: {info.used/1024**3} GB." | |
| ##### | |
| class MultiDatasetDataLoader: | |
| """ | |
| This custom dataloader class consumes a list of datasets and a batch size and produces batches randomly sampled | |
| from the datasets provided where each batch consists of records from a single dataset and datasets are chosen | |
| for batches in proportion to their total number of records. | |
| """ | |
| def __init__(self, datasets, batch_size_pairs, batch_size_triplets=None, dataset_size_temp=-1, allow_swap=True): | |
| self.allow_swap = allow_swap | |
| self.batch_size_pairs = batch_size_pairs | |
| self.batch_size_triplets = batch_size_pairs if batch_size_triplets is None else batch_size_triplets | |
| # Compute dataset weights | |
| self.dataset_lengths = list(map(len, datasets)) | |
| self.dataset_lengths_sum = sum(self.dataset_lengths) | |
| weights = [] | |
| # if dataset_size_temp > 0: # Scale probability with dataset size | |
| # for dataset in datasets: | |
| # prob = len(dataset) / self.dataset_lengths_sum | |
| # weights.append(max(1, int(math.pow(prob, 1 / dataset_size_temp) * 1000))) | |
| # else: # Equal weighting of all datasets | |
| # weights = [100] * len(datasets) | |
| for dataset in datasets: | |
| weights.append(len(dataset)) | |
| # logging.info("Dataset lengths and weights: {}".format(list(zip(self.dataset_lengths, weights)))) | |
| self.dataset_idx = [] | |
| self.dataset_idx_pointer = 0 | |
| for idx, weight in enumerate(weights): | |
| self.dataset_idx.extend([idx] * weight) | |
| random.shuffle(self.dataset_idx) | |
| self.datasets = [] | |
| for dataset in datasets: | |
| random.shuffle(dataset) | |
| self.datasets.append({ | |
| 'elements': dataset, | |
| 'pointer': 0, | |
| }) | |
| def __iter__(self): | |
| for _ in range(int(self.__len__())): | |
| # Select dataset | |
| if self.dataset_idx_pointer >= len(self.dataset_idx): | |
| self.dataset_idx_pointer = 0 | |
| random.shuffle(self.dataset_idx) | |
| dataset_idx = self.dataset_idx[self.dataset_idx_pointer] | |
| self.dataset_idx_pointer += 1 | |
| # Select batch from this dataset | |
| dataset = self.datasets[dataset_idx] | |
| batch_size = self.batch_size_pairs if len(dataset['elements'][0].texts) == 2 else self.batch_size_triplets | |
| batch = [] | |
| texts_in_batch = set() | |
| guid_in_batch = set() | |
| while len(batch) < batch_size: | |
| example = dataset['elements'][dataset['pointer']] | |
| valid_example = True | |
| # First check if one of the texts in already in the batch | |
| for text in example.texts: | |
| text_norm = text.strip().lower() | |
| if text_norm in texts_in_batch: | |
| valid_example = False | |
| texts_in_batch.add(text_norm) | |
| # If the example has a label, check if label is in batch | |
| if example.guid is not None: | |
| valid_example = valid_example and example.guid not in guid_in_batch | |
| guid_in_batch.add(example.guid) | |
| if valid_example: | |
| if self.allow_swap and random.random() > 0.5: | |
| example.texts[0], example.texts[1] = example.texts[1], example.texts[0] | |
| batch.append(example) | |
| dataset['pointer'] += 1 | |
| if dataset['pointer'] >= len(dataset['elements']): | |
| dataset['pointer'] = 0 | |
| random.shuffle(dataset['elements']) | |
| yield self.collate_fn(batch) if self.collate_fn is not None else batch | |
| def __len__(self): | |
| return int(self.dataset_lengths_sum / self.batch_size_pairs) | |
| ##### | |
| # These four classes of custom generators parse the raw data from the files in S3 and format it into InputExamples which can be properly interpreted by a SentenceTransformer model. | |
| class RedditTitleBodyDataset: | |
| def __init__(self, source_uri, max_seq_length): | |
| self.source_uri = source_uri | |
| self.s3_client = boto3.client("s3") | |
| self.max_seq_length = max_seq_length | |
| def __iter__(self): | |
| while True: | |
| for json_line in open(self.source_uri, transport_params={"client": self.s3_client}): | |
| data_line = json.loads(json_line.strip()) | |
| if "title" in data_line and "body" in data_line: | |
| data = {'guid': None, 'texts': [" ".join(data_line['title'].split(" ")[:self.max_seq_length]), " ".join(data_line['body'].split(" ")[:self.max_seq_length])]} | |
| record = InputExample(guid=data.get('guid', None), texts=data['texts']) | |
| yield record | |
| class RedditYearDataset: | |
| def __init__(self, source_uri, max_seq_length): | |
| self.source_uri = source_uri | |
| self.s3_client = boto3.client("s3") | |
| self.max_seq_length = max_seq_length | |
| def __iter__(self): | |
| while True: | |
| for json_line in open(self.source_uri, transport_params={"client": self.s3_client}): | |
| data_line = json.loads(json_line.strip()) | |
| if "response" in data_line and "context" in data_line: | |
| data = {'guid': None, 'texts': [" ".join(data_line['response'].split(" ")[:self.max_seq_length]), " ".join(data_line['context'].split(" ")[:self.max_seq_length])]} | |
| record = InputExample(guid=data.get('guid', None), texts=data['texts']) | |
| yield record | |
| class HuggingFaceQueryPosDataset: | |
| def __init__(self, source_uri, max_seq_length): | |
| self.source_uri = source_uri | |
| self.s3_client = boto3.client("s3") | |
| self.max_seq_length = max_seq_length | |
| def __iter__(self): | |
| while True: | |
| for json_line in open(self.source_uri, transport_params={"client": self.s3_client}): | |
| data_line = json.loads(json_line.strip()) | |
| if "query" in data_line and "pos" in data_line: | |
| for i in range(len(data_line['pos'])): | |
| data = {'guid': None, 'texts': [" ".join(data_line['query'].split(" ")[:self.max_seq_length]), " ".join(data_line['pos'][i].split(" ")[:self.max_seq_length])]} | |
| record = InputExample(guid=data.get('guid', None), texts=data['texts']) | |
| yield record | |
| class Dataset: | |
| def __init__(self, source_uri, max_seq_length): | |
| self.source_uri = source_uri | |
| self.s3_client = boto3.client("s3") | |
| self.max_seq_length = max_seq_length | |
| def __iter__(self): | |
| while True: | |
| for json_line in open(self.source_uri, transport_params={"client": self.s3_client}): | |
| data_line = json.loads(json_line.strip()) | |
| if not isinstance(data_line, dict): | |
| data = {'guid': None, 'texts': data_line} | |
| for text_idx in range(len(data['texts'])): | |
| data['texts'][text_idx] = " ".join(data['texts'][text_idx].split(" ")[:self.max_seq_length]) | |
| record = InputExample(guid=data.get('guid', None), texts=data['texts']) | |
| else: | |
| for text_idx in range(len(data_line['texts'])): | |
| data_line['texts'][text_idx] = " ".join(data_line['texts'][text_idx].split(" ")[:self.max_seq_length]) | |
| record = InputExample(guid=data_line.get('guid', None), texts=data_line['texts']) | |
| yield record | |
| ##### | |
| def build_generators(data_records, max_seq_length=512, testing=False): | |
| """ | |
| This function consumes the data_records dictionary and creates a new dictionary of data generators where each entry is | |
| of the form {filename: data generator object}. | |
| """ | |
| if testing: | |
| # filepaths = [file for file in list(data_records.keys()) if file.startswith('S2ORC') or file.startswith('reddit_')] | |
| filepaths = [file for file in list(data_records.keys())][:3] | |
| else: | |
| filepaths = list(data_records.keys()) | |
| generators = {} | |
| for filepath in filepaths: | |
| filepath = filepath.strip() | |
| source_uri = 's3://lodestone-rnd/data/'+filepath | |
| if filepath in ['S2ORC_citations_abstracts.json.gz', 'amazon-qa.json.gz'] or 'reddit' in filepath: | |
| if "title" in filepath: | |
| generators[f'{filepath.split(".")[0]}'] = iter(RedditTitleBodyDataset(source_uri, max_seq_length)) | |
| elif "reddit" in filepath: | |
| generators[f'{filepath.split(".")[0]}'] = iter(RedditYearDataset(source_uri, max_seq_length)) | |
| else: | |
| generators[f'{filepath.split(".")[0]}'] = iter(HuggingFaceQueryPosDataset(source_uri, max_seq_length)) | |
| else: | |
| generators[f'{filepath.split(".")[0]}'] = iter(Dataset(source_uri, max_seq_length)) | |
| return generators | |
| ##### | |
| def produce_data(data_records, num_chunks, generators, batch_size, failed_on=None, first_iter=False, testing=False, temp=-1): | |
| """ | |
| This function consumes the data_records dictionary, the number of chunks to break the datasets into, the dictionary of | |
| data generators, and a batch size and returns a MultiDatasetDataloader which can be fed into the .fit method of a | |
| SentenceTransformer model. | |
| """ | |
| if testing: | |
| # filepaths = [file for file in list(data_records.keys()) if file.startswith('S2ORC') or file.startswith('reddit_')] | |
| filepaths = [file for file in list(data_records.keys())][:3] | |
| else: | |
| filepaths = list(data_records.keys()) | |
| datasets = [] | |
| for file_idx, filepath in enumerate(filepaths): | |
| filepath = filepath.strip() | |
| dataset = [] | |
| if failed_on is not None and failed_on != 1 and first_iter: | |
| for k in range((failed_on-1)*max(1, data_records[filepath]//num_chunks)): | |
| next(generators[f'{filepath.split(".")[0]}']) | |
| for m in range(max(1, data_records[filepath]//num_chunks)): | |
| dataset.append(next(generators[f'{filepath.split(".")[0]}'])) | |
| else: | |
| for n in range(max(1, data_records[filepath]//num_chunks)): | |
| dataset.append(next(generators[f'{filepath.split(".")[0]}'])) | |
| datasets.append(dataset) | |
| logging.info("{}. {}: {}".format(file_idx+1, filepath, len(dataset))) | |
| dataset_lengths_sum = sum(list(map(len, datasets))) | |
| batch_size_pairs = batch_size_triplets = batch_size | |
| # Special data loader to load from multiple datasets | |
| train_dataloader = MultiDatasetDataLoader(datasets=datasets, | |
| batch_size_pairs=batch_size_pairs, | |
| batch_size_triplets=batch_size_triplets, | |
| dataset_size_temp=temp) | |
| return train_dataloader, dataset_lengths_sum | |
| ##### | |
| def construct_model(model_name, max_seq_length=512): | |
| """ | |
| This function constructs a SentenceTransformer model from a HuggingFace transformer model name | |
| or from a local path to a transformer model repository. | |
| """ | |
| word_embedding_model = models.Transformer(model_name_or_path=model_name, | |
| max_seq_length=max_seq_length, | |
| tokenizer_name_or_path='bert-base-uncased', | |
| trust_remote_code=True, | |
| model_args={'torch_dtype': torch.bfloat16}) | |
| pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension()) | |
| norm = models.Normalize() | |
| model = SentenceTransformer(modules=[word_embedding_model, pooling_model, norm], device='cuda') | |
| model[0].tokenizer.model_max_length = max_seq_length | |
| return model | |
| ##### | |
| # Just some code to print debug information to stdout | |
| logging.basicConfig(format='%(asctime)s - %(message)s', | |
| datefmt='%Y-%m-%d %H:%M:%S', | |
| level=logging.INFO, | |
| handlers=[LoggingHandler()]) | |
| # /print debug information to stdout | |
| ##### | |
| # Set Hyperparameters | |
| model_name = 'mosaic-bert-base-seqlen-2048' | |
| # model_name = 'hum-lodestone-v1' | |
| batch_size = 16 | |
| batch_size_pairs = batch_size_triplets = batch_size | |
| max_seq_length = 2048 | |
| use_amp = False | |
| num_cycles = 2 | |
| num_chunks = 50 | |
| num_epochs = 2 | |
| steps_per_epoch = 10000 | |
| # Total training steps = num_cycles * num_chunks * num_epochs * steps_per_epoch = 2 * 50 * 2 * 10,000 = 2,000,000 steps | |
| warmup_steps = 500 | |
| testing = False | |
| temp = -1 | |
| ##### | |
| output_path = 'hum-lodestone-v1' | |
| logging.info("Output: "+output_path) | |
| # Instantiate SentenceTransformer Model | |
| model = construct_model(model_name=model_name, max_seq_length=max_seq_length) | |
| # Load File Names and Record Volumes | |
| with open('data_records.json') as fIn: | |
| data_records = json.load(fIn) | |
| total_pairs = sum(data_records.values()) | |
| logging.info("Total Training Pairs: {}".format(total_pairs)) | |
| # Initialize Data Generators | |
| generators = build_generators(data_records=data_records, | |
| max_seq_length=max_seq_length, | |
| testing=testing) | |
| logging.info("Data Generators Initialized") | |
| # Define Training Loss Function | |
| train_loss = losses.MultipleNegativesRankingLoss(model, | |
| scale=20, | |
| similarity_fct=util.dot_score) | |
| logging.info(print_gpu_utilization()) | |
| ##### | |
| # Configure Training Cycles | |
| failed_on = None # chunk that the process failed on | |
| random.seed(42) | |
| steps = 0 | |
| first_iter = True | |
| for cycle_num in range(num_cycles): | |
| logging.info("Starting Cycle {}".format(cycle_num+1)) | |
| for chunk_num in range(num_chunks): | |
| if failed_on is not None and (chunk_num+1) < failed_on and (cycle_num+1) == 1: | |
| pass | |
| else: | |
| logging.info("Chunk {}/{}".format(chunk_num+1, num_chunks)) | |
| logging.info("Loading {} Datasets".format(len([file for file in list(data_records.keys()) if file.startswith('S2ORC') or file.startswith('reddit_')]) if testing else len(data_records))) | |
| # t_dataload0 = time.time() | |
| # Create the training dataloader for the given chunk of data | |
| train_dataloader, dataset_lengths_sum = produce_data(data_records, | |
| num_chunks, | |
| generators, | |
| batch_size, | |
| failed_on=failed_on, | |
| first_iter=first_iter, | |
| testing=testing, | |
| temp=temp) | |
| first_iter = False | |
| # t_dataload1 = time.time() | |
| # print(t_dataload1-t_dataload0) | |
| logging.info(print_gpu_utilization()) | |
| # steps_per_epoch = dataset_lengths_sum // batch_size_pairs | |
| for epoch_num in range(num_epochs): | |
| logging.info("Performing Cycle {}, Chunk {}, Epoch {}".format(cycle_num+1, chunk_num+1, epoch_num+1)) | |
| try: | |
| # t_fit0 = time.time() | |
| # Train the model | |
| model.fit(train_objectives=[(train_dataloader, train_loss)], | |
| evaluator=None, | |
| epochs=1, | |
| warmup_steps=warmup_steps, | |
| steps_per_epoch=steps_per_epoch, | |
| use_amp=use_amp, | |
| output_path=output_path) | |
| # t_fit1 = time.time() | |
| # print(t_fit1-t_fit0) | |
| steps += steps_per_epoch | |
| logging.info(print_gpu_utilization()) | |
| logging.info("Succeeded on Cycle {}, Chunk {}, Epoch {}".format(cycle_num+1, chunk_num+1, epoch_num+1)) | |
| logging.info("{} Steps Completed in Total".format(steps)) | |
| with open('train_logs.txt', 'a') as log: | |
| log.write("Succeeded on Cycle {}, Chunk {}, Epoch {}: {} Steps Completed in Total\n".format(cycle_num+1, chunk_num+1, epoch_num+1, steps)) | |
| except: | |
| logging.info("Failed on Cycle {}, Chunk {}, Epoch {}".format(cycle_num+1, chunk_num+1, epoch_num+1)) | |
| with open('train_logs.txt', 'a') as log: | |
| log.write("Failed on Cycle {}, Chunk {}, Epoch {}: {} Steps Completed in Total\n".format(cycle_num+1, chunk_num+1, epoch_num+1, steps)) | |
| finally: | |
| warmup_steps = 0 | |
| # Clear GPU/CUDA memory cache between data chunks | |
| train_dataloader = None | |
| model = None | |
| train_loss = None | |
| gc.collect() | |
| torch.cuda.empty_cache() | |
| # Reload the model and reinitialize the loss function | |
| model = construct_model(model_name='hum-lodestone-v1', max_seq_length=max_seq_length) | |
| train_loss = losses.MultipleNegativesRankingLoss(model, | |
| scale=20, | |
| similarity_fct=util.dot_score) | |
| logging.info(print_gpu_utilization()) |