Vectorize

Sleeping

App Files Community

0xalfroz committed on Sep 17, 2024

Commit

0e41473

verified ·

1 Parent(s): 885a800

Update app.py

Browse files

Files changed (1) hide show

app.py +38 -13

app.py CHANGED Viewed

@@ -1,6 +1,6 @@
 import gradio as gr
 from transformers import AutoModel, AutoTokenizer
-import numpy as np
 # Load a small CPU model for text to vector processing
 model_name = "sentence-transformers/all-mpnet-base-v2"
@@ -8,25 +8,50 @@ model = AutoModel.from_pretrained(model_name)
 tokenizer = AutoTokenizer.from_pretrained(model_name)
 def text_to_vector(texts):
-    # Tokenize the input array of sentences
-    inputs = tokenizer(texts, return_tensors="pt", padding=True, truncation=True)
-    outputs = model(**inputs)
-    vectors = outputs.pooler_output.detach().numpy()
-    # Convert each vector to a string representation and create an object
-    result = [
-        {"sentence": sentence, "vector": ", ".join(map(str, vector))}
-        for sentence, vector in zip(texts, vectors)
-    ]
-    return result
 demo = gr.Interface(
     fn=text_to_vector,
     inputs=gr.Textbox(label="Enter JSON array", placeholder="Enter an array of sentences as a JSON string"),
     outputs=gr.JSON(label="Sentence and Vector Pairs"),
-    title="Batch Text to Vector 769 dim",
     description="This demo converts an array of sentences to vectors and returns objects with sentence and vector."
 )
 demo.launch()

 import gradio as gr
 from transformers import AutoModel, AutoTokenizer
+import torch
 # Load a small CPU model for text to vector processing
 model_name = "sentence-transformers/all-mpnet-base-v2"
 tokenizer = AutoTokenizer.from_pretrained(model_name)
 def text_to_vector(texts):
+    results = []
+    # Process each sentence individually to catch errors
+    for sentence in texts:
+        try:
+            # Tokenize the sentence
+            inputs = tokenizer(sentence, return_tensors="pt", padding=True, truncation=True)
+            # Check if tokenization results in valid tokens
+            if inputs['input_ids'].shape[1] == 0:
+                raise ValueError(f"Tokenization failed for sentence: '{sentence}'")
+            # Pass through the model
+            with torch.no_grad():
+                outputs = model(**inputs)
+            # Get the vector from pooler_output or handle errors
+            if outputs.pooler_output is None:
+                raise ValueError(f"No vector generated for sentence: '{sentence}'")
+            # Convert the vector to a list of floats
+            vector = outputs.pooler_output.squeeze().numpy().tolist()
+            # Append result as sentence and vector pair
+            results.append({
+                "sentence": sentence,
+                "vector": vector
+            })
+        except Exception as e:
+            # Handle any errors for individual sentences
+            results.append({
+                "sentence": sentence,
+                "vector": f"Error: {str(e)}"
+            })
+    return results
 demo = gr.Interface(
     fn=text_to_vector,
     inputs=gr.Textbox(label="Enter JSON array", placeholder="Enter an array of sentences as a JSON string"),
     outputs=gr.JSON(label="Sentence and Vector Pairs"),
+    title="Batch Text to Vector",
     description="This demo converts an array of sentences to vectors and returns objects with sentence and vector."
 )
 demo.launch()