charlieoneill committed on
Commit efcd725 · verified · 1 Parent(s): b3ae512

Update pipeline.py

Files changed (1)
  1. pipeline.py +32 -15
pipeline.py CHANGED
@@ -1,7 +1,7 @@
 from transformers import AutoTokenizer, AutoModel
 import torch
-from typing import List
 import torch.nn as nn
+from typing import List

 class PersonEmbeddings(nn.Module):
     def __init__(self, model_id: str):
@@ -14,32 +14,49 @@ class PersonEmbeddings(nn.Module):
         )

     def forward(self, input_ids, attention_mask):
-        outputs = self.base_model(
-            input_ids=input_ids,
-            attention_mask=attention_mask
-        )
+        outputs = self.base_model(input_ids=input_ids, attention_mask=attention_mask)
         last_hidden = outputs.last_hidden_state    # (B, seq_len, 768)
         mean_pooled = last_hidden.mean(dim=1)      # (B, 768)
         embeddings = self.projection(mean_pooled)  # (B, 1536)
         return embeddings

 class CustomEmbeddingPipeline:
-    def __init__(self, model_id="answerdotai/ModernBERT-base"):
-        # Load your base tokenizer
-        self.tokenizer = AutoTokenizer.from_pretrained("charlieoneill/my_modernbert_person_embeddings")
-        # Load your PersonEmbeddings
-        self.model = PersonEmbeddings(model_id)
+    """
+    Loads tokenizer + PersonEmbeddings from the *same* HF repo so that
+    the vocabulary is consistent with the model weights.
+    """
+    def __init__(self, repo_id="charlieoneill/my_modernbert_person_embeddings"):
+        # 1. Load tokenizer from your own HF repo,
+        #    which contains tokenizer.json, special_tokens_map.json, etc.
+        self.tokenizer = AutoTokenizer.from_pretrained(repo_id)
+
+        # 2. Create your PersonEmbeddings using the same repo_id
+        #    so AutoModel inside PersonEmbeddings will match
+        self.model = PersonEmbeddings(repo_id)
+
+        # 3. Load your fine-tuned state dict from local file (pytorch_model.bin).
+        #    (It's typically named this in your HF repo. Make sure your repo has it!)
         ckpt_path = "pytorch_model.bin"
-        state_dict = torch.load(ckpt_path)
+        state_dict = torch.load(ckpt_path, map_location="cpu")
         self.model.load_state_dict(state_dict)
         self.model.eval()

     def __call__(self, text: str) -> List[float]:
-        # Tokenize
-        inputs = self.tokenizer([text], padding=True, truncation=True, return_tensors="pt")
+        # Tokenize input
+        inputs = self.tokenizer(
+            [text],
+            padding=True,
+            truncation=True,
+            return_tensors="pt"
+        )
+
         with torch.no_grad():
-            emb = self.model(inputs["input_ids"], inputs["attention_mask"])
-        # Return the embedding of shape (1, 1536) as a Python list
+            emb = self.model(
+                inputs["input_ids"],
+                inputs["attention_mask"]
+            )  # shape: (1, 1536)
+
+        # Return as a Python list
        return emb[0].tolist()

 def pipeline(*args, **kwargs):
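
For reference, a minimal usage sketch of the updated class, assuming pipeline.py is importable from the working directory and that pytorch_model.bin has already been downloaded locally (the hard-coded ckpt_path expects it next to the script); the input string is illustrative only:

from pipeline import CustomEmbeddingPipeline

# Builds the tokenizer + PersonEmbeddings from the HF repo and loads
# the fine-tuned weights from ./pytorch_model.bin (must exist locally).
embedder = CustomEmbeddingPipeline()

# Embed a single piece of text; returns a plain Python list of 1536 floats.
vector = embedder("Example person description")  # hypothetical input text
print(len(vector))  # 1536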