Spaces:

AhmedSSabir
/

Demo-for-Gender-Score-AR

Sleeping

App Files Files Community

AhmedSSabir committed on Apr 19

Commit

59c43da

•

1 Parent(s): 0c726aa

Update app.py

Browse files

Files changed (1) hide show

app.py +85 -43

app.py CHANGED Viewed

@@ -7,6 +7,17 @@ import re
 import os
 import gradio as gr
 import requests
 from sentence_transformers import SentenceTransformer, util
 #url = "https://github.com/simonepri/lm-scorer/tree/master/lm_scorer/models"
@@ -47,16 +58,16 @@ import re
-def Sort_Tuple(tup):
-	# (Sorts in descending order)
-	tup.sort(key = lambda x: x[1])
-	return tup[::-1]
-def softmax(x):
-	exps = np.exp(x)
-	return np.divide(exps, np.sum(exps))
 def get_sim(x):
@@ -68,7 +79,7 @@ def get_sim(x):
 # Load pre-trained model
 #model = GPT2LMHeadModel.from_pretrained('distilgpt2', output_hidden_states = True, output_attentions = True)
-model = GPT2LMHeadModel.from_pretrained('gpt2', output_hidden_states = True, output_attentions = True)
 #model  =  gr.Interface.load('huggingface/distilgpt2', output_hidden_states = True, output_attentions = True)
 #model.eval()
@@ -90,46 +101,77 @@ tokenizer = GPT2TokenizerFast.from_pretrained(model_name)
-def cloze_prob(text):
-	whole_text_encoding = tokenizer.encode(text)
-	# Parse out the stem of the whole sentence (i.e., the part leading up to but not including the critical word)
-	text_list = text.split()
-	stem = ' '.join(text_list[:-1])
-	stem_encoding = tokenizer.encode(stem)
-	# cw_encoding is just the difference between whole_text_encoding and stem_encoding
-	# note: this might not correspond exactly to the word itself
-	cw_encoding = whole_text_encoding[len(stem_encoding):]
-	# Run the entire sentence through the model. Then go "back in time" to look at what the model predicted for each token, starting at the stem.
-	# Put the whole text encoding into a tensor, and get the model's comprehensive output
-	tokens_tensor = torch.tensor([whole_text_encoding])
-	with torch.no_grad():
-		outputs = model(tokens_tensor)
-		predictions = outputs[0]
-	logprobs = []
-	# start at the stem and get downstream probabilities incrementally from the model(see above)
-	start = -1-len(cw_encoding)
-	for j in range(start,-1,1):
-			raw_output = []
-			for i in predictions[-1][j]:
-					raw_output.append(i.item())
-			logprobs.append(np.log(softmax(raw_output)))
-	# if the critical word is three tokens long, the raw_probabilities should look something like this:
-	# [ [0.412, 0.001, ... ] ,[0.213, 0.004, ...], [0.002,0.001, 0.93 ...]]
-	# Then for the i'th token we want to find its associated probability
-	# this is just: raw_probabilities[i][token_index]
-	conditional_probs = []
-	for cw,prob in zip(cw_encoding,logprobs):
-			conditional_probs.append(prob[cw])
-	# now that you have all the relevant probabilities, return their product.
-	# This is the probability of the critical word given the context before it.
-	return np.exp(np.sum(conditional_probs))

 import os
 import gradio as gr
 import requests
+from doctest import OutputChecker
+import sys
+import torch
+import re
+import os
+import gradio as gr
+import requests
+import torch
+from transformers import GPT2Tokenizer, GPT2LMHeadModel
+from torch.nn.functional import softmax
+import numpy as np
 from sentence_transformers import SentenceTransformer, util
 #url = "https://github.com/simonepri/lm-scorer/tree/master/lm_scorer/models"
+# def Sort_Tuple(tup):
+# 	# (Sorts in descending order)
+# 	tup.sort(key = lambda x: x[1])
+# 	return tup[::-1]
+# def softmax(x):
+# 	exps = np.exp(x)
+# 	return np.divide(exps, np.sum(exps))
 def get_sim(x):
 # Load pre-trained model
 #model = GPT2LMHeadModel.from_pretrained('distilgpt2', output_hidden_states = True, output_attentions = True)
+#model = GPT2LMHeadModel.from_pretrained('gpt2', output_hidden_states = True, output_attentions = True)
 #model  =  gr.Interface.load('huggingface/distilgpt2', output_hidden_states = True, output_attentions = True)
 #model.eval()
+# tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
+# model = GPT2LMHeadModel.from_pretrained('gpt2')
+def sentence_prob_mean(text):
+    # Tokenize the input text and add special tokens
+    input_ids = tokenizer.encode(text, return_tensors='pt')
+    # Obtain model outputs
+    with torch.no_grad():
+        outputs = model(input_ids, labels=input_ids)
+        logits = outputs.logits  # logits are the model outputs before applying softmax
+    # Shift logits and labels so that tokens are aligned:
+    shift_logits = logits[..., :-1, :].contiguous()
+    shift_labels = input_ids[..., 1:].contiguous()
+    # Calculate the softmax probabilities
+    probs = softmax(shift_logits, dim=-1)
+    # Gather the probabilities of the actual token IDs
+    gathered_probs = torch.gather(probs, 2, shift_labels.unsqueeze(-1)).squeeze(-1)
+    # Compute the mean probability across the tokens
+    mean_prob = torch.mean(gathered_probs).item()
+    return mean_prob
+# def cloze_prob(text):
+# 	whole_text_encoding = tokenizer.encode(text)
+# 	# Parse out the stem of the whole sentence (i.e., the part leading up to but not including the critical word)
+# 	text_list = text.split()
+# 	stem = ' '.join(text_list[:-1])
+# 	stem_encoding = tokenizer.encode(stem)
+# 	# cw_encoding is just the difference between whole_text_encoding and stem_encoding
+# 	# note: this might not correspond exactly to the word itself
+# 	cw_encoding = whole_text_encoding[len(stem_encoding):]
+# 	# Run the entire sentence through the model. Then go "back in time" to look at what the model predicted for each token, starting at the stem.
+# 	# Put the whole text encoding into a tensor, and get the model's comprehensive output
+# 	tokens_tensor = torch.tensor([whole_text_encoding])
+# 	with torch.no_grad():
+# 		outputs = model(tokens_tensor)
+# 		predictions = outputs[0]
+# 	logprobs = []
+# 	# start at the stem and get downstream probabilities incrementally from the model(see above)
+# 	start = -1-len(cw_encoding)
+# 	for j in range(start,-1,1):
+# 			raw_output = []
+# 			for i in predictions[-1][j]:
+# 					raw_output.append(i.item())
+# 			logprobs.append(np.log(softmax(raw_output)))
+# 	# if the critical word is three tokens long, the raw_probabilities should look something like this:
+# 	# [ [0.412, 0.001, ... ] ,[0.213, 0.004, ...], [0.002,0.001, 0.93 ...]]
+# 	# Then for the i'th token we want to find its associated probability
+# 	# this is just: raw_probabilities[i][token_index]
+# 	conditional_probs = []
+# 	for cw,prob in zip(cw_encoding,logprobs):
+# 			conditional_probs.append(prob[cw])
+# 	# now that you have all the relevant probabilities, return their product.
+# 	# This is the probability of the critical word given the context before it.
+# 	return np.exp(np.sum(conditional_probs))