charlieoneill
/

my_modernbert_person_embeddings

Feature Extraction

PyTorch

Model card | Files and versions | Community

charlieoneill committed 18 days ago

Commit

efcd725

verified ·

1 Parent(s): b3ae512

Update pipeline.py

Browse files

Files changed (1) hide show

pipeline.py +32 -15

pipeline.py CHANGED Viewed

@@ -1,7 +1,7 @@
 from transformers import AutoTokenizer, AutoModel
 import torch
-from typing import List
 import torch.nn as nn
 class PersonEmbeddings(nn.Module):
     def __init__(self, model_id: str):
@@ -14,32 +14,49 @@ class PersonEmbeddings(nn.Module):
         )
     def forward(self, input_ids, attention_mask):
-        outputs = self.base_model(
-            input_ids=input_ids,
-            attention_mask=attention_mask
-        )
         last_hidden = outputs.last_hidden_state  # (B, seq_len, 768)
         mean_pooled = last_hidden.mean(dim=1)    # (B, 768)
         embeddings = self.projection(mean_pooled)  # (B, 1536)
         return embeddings
 class CustomEmbeddingPipeline:
-    def __init__(self, model_id="answerdotai/ModernBERT-base"):
-        # Load your base tokenizer
-        self.tokenizer = AutoTokenizer.from_pretrained("charlieoneill/my_modernbert_person_embeddings")
-        # Load your PersonEmbeddings
-        self.model = PersonEmbeddings(model_id)
         ckpt_path = "pytorch_model.bin"
-        state_dict = torch.load(ckpt_path)
         self.model.load_state_dict(state_dict)
         self.model.eval()
     def __call__(self, text: str) -> List[float]:
-        # Tokenize
-        inputs = self.tokenizer([text], padding=True, truncation=True, return_tensors="pt")
         with torch.no_grad():
-            emb = self.model(inputs["input_ids"], inputs["attention_mask"])
-        # Return the embedding of shape (1, 1536) as a Python list
         return emb[0].tolist()
 def pipeline(*args, **kwargs):

 from transformers import AutoTokenizer, AutoModel
 import torch
 import torch.nn as nn
+from typing import List
 class PersonEmbeddings(nn.Module):
     def __init__(self, model_id: str):
         )
     def forward(self, input_ids, attention_mask):
+        outputs = self.base_model(input_ids=input_ids, attention_mask=attention_mask)
         last_hidden = outputs.last_hidden_state  # (B, seq_len, 768)
         mean_pooled = last_hidden.mean(dim=1)    # (B, 768)
         embeddings = self.projection(mean_pooled)  # (B, 1536)
         return embeddings
 class CustomEmbeddingPipeline:
+    """
+    Loads tokenizer + PersonEmbeddings from the *same* HF repo so that
+    the vocabulary is consistent with the model weights.
+    """
+    def __init__(self, repo_id="charlieoneill/my_modernbert_person_embeddings"):
+        # 1. Load tokenizer from your own HF repo,
+        #    which contains tokenizer.json, special_tokens_map.json, etc.
+        self.tokenizer = AutoTokenizer.from_pretrained(repo_id)
+        # 2. Create your PersonEmbeddings using the same repo_id
+        #    so AutoModel inside PersonEmbeddings will match
+        self.model = PersonEmbeddings(repo_id)
+        # 3. Load your fine-tuned state dict from local file (pytorch_model.bin).
+        #    (It's typically named this in your HF repo. Make sure your repo has it!)
         ckpt_path = "pytorch_model.bin"
+        state_dict = torch.load(ckpt_path, map_location="cpu")
         self.model.load_state_dict(state_dict)
         self.model.eval()
     def __call__(self, text: str) -> List[float]:
+        # Tokenize input
+        inputs = self.tokenizer(
+            [text],
+            padding=True,
+            truncation=True,
+            return_tensors="pt"
+        )
         with torch.no_grad():
+            emb = self.model(
+                inputs["input_ids"],
+                inputs["attention_mask"]
+            )  # shape: (1, 1536)
+        # Return as a Python list
         return emb[0].tolist()
 def pipeline(*args, **kwargs):