vives committed
Commit 2349e64 · 1 Parent(s): d56e301

Update app.py

Files changed (1)
  1. app.py +105 -1
app.py CHANGED
@@ -2,6 +2,11 @@ from transformers import AutoModelForMaskedLM
  from transformers import AutoTokenizer
  import spacy
  import pytextrank
+ from nlp_entities import *
+ import torch
+ import streamlit as st
+ from sklearn.metrics.pairwise import cosine_similarity
+ from collections import defaultdict

  model_checkpoint = "vives/distilbert-base-uncased-finetuned-cvent-2019_2022"
  model = AutoModelForMaskedLM.from_pretrained(model_checkpoint, output_hidden_states=True)
@@ -13,4 +18,103 @@ POS = ["NOUN", "PROPN", "VERB"]

  nlp = spacy.load("en_core_web_sm")
  nlp.add_pipe("textrank", last=True, config={"pos_kept": POS, "token_lookback": 3})
- all_stopwords = nlp.Defaults.stop_words
+ all_stopwords = nlp.Defaults.stop_words
+
+ #streamlit stuff
+ tags = st.text_input("Input tags separated by commas")
+ text = st.text_input("Input text to classify")
+ #Methods for tag processing
+ def pool_embeddings(out, tok):
+     embeddings = out["hidden_states"][-1]
+     attention_mask = tok['attention_mask']
+     mask = attention_mask.unsqueeze(-1).expand(embeddings.size()).float()
+     masked_embeddings = embeddings * mask
+     summed = torch.sum(masked_embeddings, 1)
+     summed_mask = torch.clamp(mask.sum(1), min=1e-9)
+     mean_pooled = summed / summed_mask
+     return mean_pooled
+ import pandas as pd
+
+ def get_transcript(file):
+     data = pd.io.json.read_json(file)
+     transcript = data['results'].values[1][0]['transcript']
+     transcript = transcript.lower()
+     return transcript
+ #
+ """preprocess tags"""
+ if tags:
+     tags = [x.lower().strip() for x in tags.split(",")]
+     tags_tokens = concat_tokens(tags)
+     tags_tokens.pop("KPS")
+     with torch.no_grad():
+         outputs_tags = model(**tags_tokens)
+     pools_tags = pool_embeddings(outputs_tags, tags_tokens).detach().numpy()
+     token_dict = {}
+     for tag, embedding in zip(tags, pools_tags):
+         token_dict[tag] = embedding
+
+ """Code related with processing text, extracting KPs, and doing distance to tag"""
+ def concat_tokens(sentences):
+     tokens = {'input_ids': [], 'attention_mask': [], 'KPS': {}}
+     for sentence, values in sentences.items():
+         weight = values['weight']
+         # encode each sentence and append to dictionary
+         new_tokens = tokenizer.encode_plus(sentence, max_length=64,
+                                            truncation=True, padding='max_length',
+                                            return_tensors='pt')
+         tokens['input_ids'].append(new_tokens['input_ids'][0])
+         tokens['attention_mask'].append(new_tokens['attention_mask'][0])
+         tokens['KPS'][sentence] = weight
+     # reformat list of tensors into single tensor
+     tokens['input_ids'] = torch.stack(tokens['input_ids'])
+     tokens['attention_mask'] = torch.stack(tokens['attention_mask'])
+     return tokens
+
+ def calculate_weighted_embed_dist(out, tokens, weight, text, kp_dict, idx, exclude_text=False, exclude_words=False):
+     sim_dict = {}
+     pools = pool_embeddings_count(out, tokens, idx).detach().numpy()
+     for key in kp_dict.keys():
+         if exclude_text and text in key:
+             continue
+         if exclude_words and True in [x in key for x in text.split(" ")]:
+             continue
+
+         sim_dict[key] = cosine_similarity(
+             pools,
+             [kp_dict[key]]
+         )[0][0] * weight
+     return sim_dict
+ def pool_embeddings_count(out, tok, idx):
+     embeddings = out["hidden_states"][-1][idx:idx+1,:,:]
+     attention_mask = tok['attention_mask'][idx]
+     mask = attention_mask.unsqueeze(-1).expand(embeddings.size()).float()
+     masked_embeddings = embeddings * mask
+     summed = torch.sum(masked_embeddings, 1)
+     summed_mask = torch.clamp(mask.sum(1), min=1e-9)
+     mean_pooled = summed / summed_mask
+     return mean_pooled
+ import pandas as pd
+ def extract_tokens(text, top_kp=30):
+     kps = return_ners_and_kp([text], ret_ne=True)['KP']
+     #only process the top_kp tokens
+     kps = sorted(kps.items(), key=lambda x: x[1]['weight'], reverse=True)[:top_kp]
+     kps = {x: y for x, y in kps}
+     return concat_tokens(kps)
+
+ """Process text and classify it"""
+ if text and tags:
+     text = text.lower()
+     t1_tokens = extract_tokens(text)
+     t1_kps = t1_tokens.pop("KPS")
+     with torch.no_grad():
+         outputs = model(**t1_tokens)
+     tag_distance = None
+     for i, kp in enumerate(t1_kps):
+         if tag_distance is None:
+             tag_distance = calculate_weighted_embed_dist(outputs, t1_tokens, t1_kps[kp], kp, token_dict, i, exclude_text=False, exclude_words=False)
+         else:
+             curr = calculate_weighted_embed_dist(outputs, t1_tokens, t1_kps[kp], kp, token_dict, i, exclude_text=False, exclude_words=False)
+             tag_distance = {x: tag_distance[x] + curr[x] for x in tag_distance.keys()}
+     tag_distance = sorted(tag_distance.items(), key=lambda x: x[1], reverse=True)
+     tag_distance = {x: y for x, y in tag_distance}
+     st.json(tag_distance)
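The added code boils down to two steps: attention-masked mean pooling of the last hidden state (pool_embeddings / pool_embeddings_count), then a TextRank-weighted sum of cosine similarities between each keyphrase embedding and each tag embedding (calculate_weighted_embed_dist, accumulated in the if text and tags block). The sketch below illustrates that pooling-plus-scoring idea in isolation; it is not part of the commit, and the random tensors, toy weights, and helper names (masked_mean_pool, score_against_tags) stand in for the real model outputs and app variables.

import torch
from sklearn.metrics.pairwise import cosine_similarity

def masked_mean_pool(hidden_states, attention_mask):
    # hidden_states: (batch, seq_len, dim); attention_mask: (batch, seq_len)
    mask = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()
    summed = torch.sum(hidden_states * mask, dim=1)
    counts = torch.clamp(mask.sum(dim=1), min=1e-9)
    return summed / counts  # (batch, dim), one vector per input sequence

def score_against_tags(kp_vecs, kp_weights, tag_vecs):
    # Weighted sum of cosine similarities between keyphrases and tags,
    # mirroring how the app accumulates calculate_weighted_embed_dist results.
    scores = {tag: 0.0 for tag in tag_vecs}
    for vec, weight in zip(kp_vecs, kp_weights):
        for tag, tag_vec in tag_vecs.items():
            scores[tag] += cosine_similarity([vec], [tag_vec])[0][0] * weight
    return dict(sorted(scores.items(), key=lambda x: x[1], reverse=True))

# Toy stand-ins for model(**tokens)["hidden_states"][-1] and TextRank weights.
torch.manual_seed(0)
hidden = torch.randn(3, 8, 16)             # 3 keyphrases, seq_len 8, dim 16
mask = torch.ones(3, 8, dtype=torch.long)  # pretend there is no padding
kp_vecs = masked_mean_pool(hidden, mask).numpy()
kp_weights = [0.9, 0.5, 0.2]               # hypothetical TextRank weights
tag_vecs = {"marketing": torch.randn(16).numpy(), "finance": torch.randn(16).numpy()}
print(score_against_tags(kp_vecs, kp_weights, tag_vecs))

In the Space itself this logic runs behind the two st.text_input boxes and the result is rendered with st.json; run locally, the script would typically be launched with streamlit run app.py.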