from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn as nn
from typing import List


class PersonEmbeddings(nn.Module):
    """Encoder plus a projection head: mean-pooled hidden states -> 1536-d embeddings."""

    def __init__(self, model_id: str):
        super().__init__()
        self.base_model = AutoModel.from_pretrained(model_id)
        # Project the encoder's 768-d hidden size up to 1536-d output embeddings.
        self.projection = nn.Sequential(
            nn.Linear(768, 1024),
            nn.ReLU(),
            nn.Linear(1024, 1536),
        )

    def forward(self, input_ids, attention_mask):
        outputs = self.base_model(input_ids=input_ids, attention_mask=attention_mask)
        last_hidden = outputs.last_hidden_state  # (B, seq_len, 768)
        # Unmasked mean over the sequence dimension. Padding tokens are included,
        # which is harmless here because the pipeline encodes one text at a time.
        mean_pooled = last_hidden.mean(dim=1)  # (B, 768)
        embeddings = self.projection(mean_pooled)  # (B, 1536)
        return embeddings
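

# A minimal shape check (a sketch, not part of the pipeline; assumes any
# 768-hidden-size encoder -- the model id below is illustrative):
#
#   model = PersonEmbeddings("bert-base-uncased")
#   ids = torch.randint(0, model.base_model.config.vocab_size, (2, 16))
#   mask = torch.ones_like(ids)
#   assert model(ids, mask).shape == (2, 1536)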


class CustomEmbeddingPipeline:
    """
    Loads the tokenizer + PersonEmbeddings from the *same* HF repo so that
    the vocabulary is consistent with the model weights.
    """

    def __init__(self, repo_id="charlieoneill/my_modernbert_person_embeddings"):
        # 1. Load the tokenizer from your own HF repo, which contains
        #    tokenizer.json, special_tokens_map.json, etc.
        self.tokenizer = AutoTokenizer.from_pretrained(repo_id)

        # 2. Build PersonEmbeddings from the same repo_id so the AutoModel
        #    inside it matches the tokenizer.
        self.model = PersonEmbeddings(repo_id)

        # 3. Load the fine-tuned state dict from a local file (pytorch_model.bin).
        #    It's typically named this in an HF repo -- make sure your repo has it!
        ckpt_path = "pytorch_model.bin"
        state_dict = torch.load(ckpt_path, map_location="cpu")
        self.model.load_state_dict(state_dict)
        self.model.eval()

    def __call__(self, text: str) -> List[float]:
        # Tokenize input
        inputs = self.tokenizer(
            [text],
            padding=True,
            truncation=True,
            return_tensors="pt",
        )
        with torch.no_grad():
            emb = self.model(
                inputs["input_ids"],
                inputs["attention_mask"],
            )  # shape: (1, 1536)
        # Return as a Python list
        return emb[0].tolist()
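
# Example usage (a sketch; assumes pytorch_model.bin is present in the working
# directory and the repo id above resolves):
#
#   pipe = CustomEmbeddingPipeline()
#   a = torch.tensor(pipe("Alice is a software engineer."))
#   b = torch.tensor(pipe("Bob teaches high-school physics."))
#   sim = torch.nn.functional.cosine_similarity(a, b, dim=0)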


def pipeline(*args, **kwargs):
    # Factory entry point; extra args are accepted and ignored so callers can
    # use a uniform pipeline(...) interface.
    return CustomEmbeddingPipeline()
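

if __name__ == "__main__":
    # Smoke test (assumes the checkpoint file and repo are reachable; adjust
    # both for your own setup).
    pipe = pipeline()
    vec = pipe("Hello, world!")
    print(len(vec))  # expected: 1536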