from typing import List

import torch
import torch.nn as nn
from huggingface_hub import hf_hub_download
from transformers import AutoModel, AutoTokenizer


class PersonEmbeddings(nn.Module):
    """Base encoder plus a small MLP head that projects the mean-pooled
    hidden state up to a 1536-dimensional embedding."""

    def __init__(self, model_id: str):
        super().__init__()
        self.base_model = AutoModel.from_pretrained(model_id)
        hidden_size = self.base_model.config.hidden_size  # 768 for ModernBERT-base
        self.projection = nn.Sequential(
            nn.Linear(hidden_size, 1024),
            nn.ReLU(),
            nn.Linear(1024, 1536),
        )

    def forward(self, input_ids, attention_mask):
        outputs = self.base_model(input_ids=input_ids, attention_mask=attention_mask)
        last_hidden = outputs.last_hidden_state   # (batch, seq_len, hidden)
        mean_pooled = last_hidden.mean(dim=1)     # average over the sequence dimension
        embeddings = self.projection(mean_pooled)
        return embeddings

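
# Note: the mean pooling above averages over *every* position, padding included.
# For the single-text pipeline below that is harmless (there is no padding), but a
# mask-weighted mean is the usual choice for batched inputs. The helper below is a
# sketch of that alternative only; it is not what the published checkpoint was
# trained with, so swapping it into PersonEmbeddings.forward would change outputs.
def masked_mean_pool(last_hidden: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
    mask = attention_mask.unsqueeze(-1).type_as(last_hidden)  # (batch, seq_len, 1)
    summed = (last_hidden * mask).sum(dim=1)                  # sum of real-token states
    counts = mask.sum(dim=1).clamp(min=1e-9)                  # number of real tokens per row
    return summed / counts

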
class CustomEmbeddingPipeline:
    """
    Loads tokenizer + PersonEmbeddings from the *same* HF repo so that
    the vocabulary is consistent with the model weights.
    """

    def __init__(self, repo_id: str = "charlieoneill/my_modernbert_person_embeddings"):
        # Tokenizer and encoder come from the same repo, so vocab and weights stay in sync.
        self.tokenizer = AutoTokenizer.from_pretrained(repo_id)
        self.model = PersonEmbeddings(repo_id)

        # Fetch the fine-tuned PersonEmbeddings checkpoint (encoder + projection head)
        # from the same repo, rather than assuming pytorch_model.bin sits in the
        # current working directory.
        ckpt_path = hf_hub_download(repo_id=repo_id, filename="pytorch_model.bin")
        state_dict = torch.load(ckpt_path, map_location="cpu")
        self.model.load_state_dict(state_dict)
        self.model.eval()  # inference only: disable dropout etc.

    def __call__(self, text: str) -> List[float]:
        inputs = self.tokenizer(
            [text],
            padding=True,
            truncation=True,
            return_tensors="pt",
        )
        with torch.no_grad():
            emb = self.model(
                inputs["input_ids"],
                inputs["attention_mask"],
            )
        # Batch of one: return the single embedding as a plain Python list.
        return emb[0].tolist()


def pipeline(*args, **kwargs):
    # Module-level factory so callers can construct the pipeline through a standard
    # `pipeline()` entry point; any extra arguments are accepted and ignored.
    return CustomEmbeddingPipeline()
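

# Minimal usage sketch (illustration only, not part of the original module): build the
# pipeline and embed one string. Assumes the default repo_id above is reachable so the
# tokenizer, config, and pytorch_model.bin can be downloaded.
if __name__ == "__main__":
    embedder = pipeline()
    vector = embedder("Jane Doe, software engineer based in Sydney")
    print(len(vector))  # 1536, the output width of the projection head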