karthikeyan-r committed on
Commit
d997e06
1 Parent(s): 4834106

Create embeddingsProcessor.py

Files changed (1)
  1. embeddingsProcessor.py +34 -0
embeddingsProcessor.py ADDED
@@ -0,0 +1,34 @@
+ from typing import List
+ from transformers import AutoTokenizer, AutoModel
+ import torch
+ import os
+ import numpy as np
+
+ class EmbeddingsProcessor:
+     """
+     Class for processing text to obtain embeddings using a transformer model.
+     """
+     def __init__(self, model_name: str):
+         """
+         Initialize the EmbeddingsProcessor with a pre-trained model.
+
+         Args:
+             model_name (str): The name of the pre-trained model to use for generating embeddings.
+         """
+         self.tokenizer = AutoTokenizer.from_pretrained(model_name)
+         self.model = AutoModel.from_pretrained(model_name).to('cpu')  # Run the model on CPU
+
+     def get_embeddings(self, texts: List[str]) -> np.ndarray:
+         """
+         Generate embeddings for a list of texts.
+
+         Args:
+             texts (List[str]): A list of text strings for which to generate embeddings.
+
+         Returns:
+             np.ndarray: A NumPy array of embeddings for the provided texts,
+             mean-pooled over the token dimension of the last hidden state.
+         """
+         encoded_input = self.tokenizer(texts, padding=True, truncation=True, return_tensors="pt")
+         encoded_input = {k: v.to('cpu') for k, v in encoded_input.items()}  # Ensure all tensors are on CPU
+         model_output = self.model(**encoded_input)
+         return model_output.last_hidden_state.mean(dim=1).detach().numpy()
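A minimal usage sketch (not part of the commit): the model name "sentence-transformers/all-MiniLM-L6-v2" and the example sentences below are only illustrative assumptions; any Hugging Face checkpoint loadable with AutoModel/AutoTokenizer should work the same way.

# Usage sketch -- the model name is an assumed example, not specified by the commit.
from embeddingsProcessor import EmbeddingsProcessor

processor = EmbeddingsProcessor("sentence-transformers/all-MiniLM-L6-v2")
embeddings = processor.get_embeddings(["first example sentence", "second example sentence"])
print(embeddings.shape)  # (2, hidden_size), e.g. (2, 384) for the assumed model

Because get_embeddings mean-pools the last hidden state, each input text maps to a single fixed-size vector regardless of its token length.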