Spaces:

keras-io
/

bert-semantic-similarity

Runtime error

App Files Files Community

vumichien committed on Jul 7, 2022

Commit

d9610ab

•

1 Parent(s): 21c3d29

Create new file

Browse files

Files changed (1) hide show

app.py +93 -0

app.py ADDED Viewed

	@@ -0,0 +1,93 @@

+import gradio as gr
+import numpy as np
+import tensorflow as tf
+import transformers
+from huggingface_hub import from_pretrained_keras
+class BertSemanticDataGenerator(tf.keras.utils.Sequence):
+    """Generates batches of data."""
+    def __init__(
+        self,
+        sentence_pairs,
+        labels,
+        batch_size=batch_size,
+        shuffle=True,
+        include_targets=True,
+    ):
+        self.sentence_pairs = sentence_pairs
+        self.labels = labels
+        self.shuffle = shuffle
+        self.batch_size = batch_size
+        self.include_targets = include_targets
+        # Load our BERT Tokenizer to encode the text.
+        # We will use base-base-uncased pretrained model.
+        self.tokenizer = transformers.BertTokenizer.from_pretrained(
+            "bert-base-uncased", do_lower_case=True
+        )
+        self.indexes = np.arange(len(self.sentence_pairs))
+        self.on_epoch_end()
+    def __len__(self):
+        # Denotes the number of batches per epoch.
+        return len(self.sentence_pairs) // self.batch_size
+    def __getitem__(self, idx):
+        # Retrieves the batch of index.
+        indexes = self.indexes[idx * self.batch_size : (idx + 1) * self.batch_size]
+        sentence_pairs = self.sentence_pairs[indexes]
+        # With BERT tokenizer's batch_encode_plus batch of both the sentences are
+        # encoded together and separated by [SEP] token.
+        encoded = self.tokenizer.batch_encode_plus(
+            sentence_pairs.tolist(),
+            add_special_tokens=True,
+            max_length=max_length,
+            return_attention_mask=True,
+            return_token_type_ids=True,
+            pad_to_max_length=True,
+            return_tensors="tf",
+        )
+        # Convert batch of encoded features to numpy array.
+        input_ids = np.array(encoded["input_ids"], dtype="int32")
+        attention_masks = np.array(encoded["attention_mask"], dtype="int32")
+        token_type_ids = np.array(encoded["token_type_ids"], dtype="int32")
+        # Set to true if data generator is used for training/validation.
+        if self.include_targets:
+            labels = np.array(self.labels[indexes], dtype="int32")
+            return [input_ids, attention_masks, token_type_ids], labels
+        else:
+            return [input_ids, attention_masks, token_type_ids]
+model = from_pretrained_keras("keras-io/bert-semantic-similarity")
+def predict(sentence1, sentence2):
+    sentence_pairs = np.array([[str(sentence1), str(sentence2)]])
+    test_data = BertSemanticDataGenerator(
+        sentence_pairs, labels=None, batch_size=1, shuffle=False, include_targets=False,
+    )
+    proba = model.predict(test_data[0])[0]
+    idx = np.argmax(proba)
+    proba = f"{proba[idx]*100:.2f}%"
+    pred = labels[idx]
+    return f'These two sentence is {pred} with {proba} of probability'
+inputs = [
+         gr.Audio(source = "upload", label='Upload audio file', type="filepath"),
+]
+examples = [["Two women are observing something together.", "Two women are standing with their eyes closed."],
+            ["A smiling costumed woman is holding an umbrella", "A happy woman in a fairy costume holds an umbrella"],
+            ["A soccer game with multiple males playing", "Some men are playing a sport"],
+]
+gr.Interface(
+    fn=predict,
+    title="Semantic Similarity with BERT",
+    description = "Natural Language Inference by fine-tuning BERT model on SNLI Corpus.)",
+    inputs=["text", "text"],
+    examples=examples,
+    outputs=gr.Textbox(label='Prediction'),
+    cache_examples=False,
+    article = "Author: <a href=\"https://huggingface.co/vumichien\">Vu Minh Chien</a>. Based on the keras example from <a href=\"https://keras.io/examples/nlp/semantic_similarity_with_bert/\">Mohamad Merchant</a>",
+).launch(debug=True, enable_queue=True)