hsuvaskakoty committed
Commit cb58c8d
Parent(s): d2324b7
Upload 2 files
- app.py +27 -13
- model_predict.py +150 -57
app.py
CHANGED
@@ -2,25 +2,39 @@ import data_prep
import model_predict
import gradio as gr

+# model_dict = {
+#     "BERT-Base": "research-dump/bert-base-uncased_deletion_multiclass_complete_Final",
+#     "BERT-Large": "research-dump/bert-large-uncased_deletion_multiclass_complete_final",
+#     "RoBERTa-Base": "research-dump/roberta-base_deletion_multiclass_complete_final",
+#     "RoBERTa-Large": "research-dump/roberta-large_deletion_multiclass_complete_final"
+# }
+
model_dict = {
-    "BERT-Base": "research-dump/bert-base-uncased_deletion_multiclass_complete_Final",
-    "BERT-Large": "research-dump/bert-large-uncased_deletion_multiclass_complete_final",
-    "RoBERTa-Base": "research-dump/roberta-base_deletion_multiclass_complete_final",
-    "RoBERTa-Large": "research-dump/roberta-large_deletion_multiclass_complete_final"
+    "Outcome Prediction": "outcome",
+    "Stance Detection": "stance",
+    "Policy Prediction": "policy",
+    "Sentiment Analysis": "sentiment",
+    "Offensive Language Detection": "offensive"
}

-def process_url(url, model_key):
-    model_name = model_dict[model_key]
+def process_url(url, model_name):
    processed_text = data_prep.process_data(url)
    final_scores = model_predict.predict_text(processed_text, model_name)
-    highest_prob_label = max(final_scores, key=final_scores.get)
-    highest_prob = final_scores[highest_prob_label]
-    progress_bars = {label: score for label, score in final_scores.items()}

- …
- …
+    if model_name == 'outcome':
+        highest_prob_item = max(final_scores, key=lambda x: x['score'])
+        highest_prob_label = highest_prob_item['outcome']
+        highest_prob = highest_prob_item['score']
+        progress_bars = {item['outcome']: item['score'] for item in final_scores}
+    else:
+        highest_prob_item = max(final_scores, key=lambda x: x['score'])
+        highest_prob_label = highest_prob_item[list(highest_prob_item.keys())[1]]
+        highest_prob = highest_prob_item['score']
+        progress_bars = {item[list(item.keys())[1]]: item['score'] for item in final_scores}
+
+    return processed_text, highest_prob_label, highest_prob, progress_bars

-title = 'Wikipedia Deletion Discussion
+title = 'Wikipedia Deletion Discussion Analysis Suite'
desc = """ This demo is about classifying deletion discussions from Wikipedia about Wikipedia articles. Wikipedia community engages in discussions related to an article’s quality, and map potential issues to existing templates, or Wikipedia policies, which cover diverse areas, from low notability of sources to content implausibility or vandalism.

To this end, we design a multiclass classifier to predict the outcome of a deletion discussion, without the need for human intervention. The classifier is trained on a dataset of deletion discussions from Wikipedia, and it predicts one of the following labels: delete, keep, merge, no consensus, speedy keep, speedy delete, redirect, or withdrawn. Each of these labels corresponds to a specific outcome of the deletion discussion as described below.

@@ -40,7 +54,7 @@ The input to the classifier is a URL of a Wikipedia deletion discussion page, an

url_input = gr.Textbox(label="URL")
-model_name_input = gr.Dropdown(label="
+model_name_input = gr.Dropdown(label="Choose the Task", choices=list(model_dict.keys()), value=list(model_dict.keys())[0])
outputs = [
    gr.Textbox(label="Processed Text"),
    gr.Textbox(label="Label with Highest Probability"),
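Note on the change above: process_url now dispatches on a task name ("outcome", "stance", "policy", "sentiment", "offensive") instead of a checkpoint name. The gr.Interface call that ties url_input, model_name_input, and outputs together is outside the visible hunks; the lines below are only a minimal sketch of one plausible wiring, not part of the commit, and everything in it that is not shown in the diff is an assumption.

# Hypothetical wiring; the actual Interface/launch code is not shown in this diff.
demo = gr.Interface(
    fn=process_url,                      # returns (processed_text, label, score, progress_bars)
    inputs=[url_input, model_name_input],
    outputs=outputs,                     # the full outputs list is cut off in the hunk above
    title=title,
    description=desc,
)

if __name__ == "__main__":
    demo.launch()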
model_predict.py
CHANGED
@@ -1,19 +1,21 @@
#using pipeline to predict the input text
+import pandas as pd
from transformers import pipeline, AutoTokenizer
-import …
-
-label_mapping = {
-    'delete': [0, 'LABEL_0'],
-    'keep': [1, 'LABEL_1'],
-    'merge': [2, 'LABEL_2'],
-    'no consensus': [3, 'LABEL_3'],
-    'speedy keep': [4, 'LABEL_4'],
-    'speedy delete': [5, 'LABEL_5'],
-    'redirect': [6, 'LABEL_6'],
-    'withdrawn': [7, 'LABEL_7']
-}
+import pysbd

-def predict_text(text, model_name):
+#-----------------Outcome Prediction-----------------
+def outcome(text):
+    label_mapping = {
+        'delete': [0, 'LABEL_0'],
+        'keep': [1, 'LABEL_1'],
+        'merge': [2, 'LABEL_2'],
+        'no consensus': [3, 'LABEL_3'],
+        'speedy keep': [4, 'LABEL_4'],
+        'speedy delete': [5, 'LABEL_5'],
+        'redirect': [6, 'LABEL_6'],
+        'withdrawn': [7, 'LABEL_7']
+    }
+    model_name = "research-dump/roberta-large_deletion_multiclass_complete_final"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = pipeline("text-classification", model=model_name, return_all_scores=True)

@@ -22,62 +24,153 @@ def predict_text(text, model_name):
    truncated_text = tokenizer.decode(tokens['input_ids'], skip_special_tokens=True)

    results = model(truncated_text)
-    final_scores = {key: 0.0 for key in label_mapping}

+    res_list = []
    for result in results[0]:
        for key, value in label_mapping.items():
            if result['label'] == value[1]:
-                …
+                res_list.append({'sentence': truncated_text, 'outcome': key, 'score': result['score']})
                break

-    return …
- …
+    return res_list

-# import torch
- …
-# 'merge': [2, 'LABEL_2'],
-# 'no consensus': [3, 'LABEL_3'],
-# 'speedy keep': [4, 'LABEL_4'],
-# 'speedy delete': [5, 'LABEL_5'],
-# 'redirect': [6, 'LABEL_6'],
-# 'withdrawn': [7, 'LABEL_7']
-# }
- …
-# model = AutoModelForSequenceClassification.from_pretrained(model_name, output_attentions=True)
- …
-# outputs = model(**inputs)
-# predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
- …
-# highlighted_text.append(f"<b>{token}</b>") #
-# else:
-# highlighted_text.append(token)
- …

+#-----------------Stance Prediction-----------------
+
+def extract_response(text, model_name, label_mapping):
+    tokenizer = AutoTokenizer.from_pretrained(model_name)
+    pipe = pipeline("text-classification", model=model_name, tokenizer=tokenizer, top_k=None)
+
+    tokens = tokenizer(text, truncation=True, max_length=512)
+    truncated_text = tokenizer.decode(tokens['input_ids'], skip_special_tokens=True)
+
+    results = pipe(truncated_text)
+
+    final_scores = {key: 0.0 for key in label_mapping}
+    for result in results[0]:
+        for key, value in label_mapping.items():
+            if result['label'] == f'LABEL_{value}':
+                final_scores[key] = result['score']
+                break
+
+    return final_scores
+
+
+def get_stance(text):
+    label_mapping = {
+        'delete': 0,
+        'keep': 1,
+        'merge': 2,
+        'comment': 3
+    }
+    seg = pysbd.Segmenter(language="en", clean=False)
+    text_list = seg.segment(text)
+    model = 'research-dump/bert-large-uncased_wikistance_v1'
+    res_list = []
+    for t in text_list:
+        res = extract_response(t, model,label_mapping) #, access_token)
+        highest_key = max(res, key=res.get)
+        highest_score = res[highest_key]
+        result = {'sentence':t,'stance': highest_key, 'score': highest_score}
+        res_list.append(result)
+
+    return res_list
+
+
+#-----------------Policy Prediction-----------------
+def get_policy(text):
+    label_mapping = {'Wikipedia:Notability': 0,
+        'Wikipedia:What Wikipedia is not': 1,
+        'Wikipedia:Neutral point of view': 2,
+        'Wikipedia:Verifiability': 3,
+        'Wikipedia:Wikipedia is not a dictionary': 4,
+        'Wikipedia:Wikipedia is not for things made up one day': 5,
+        'Wikipedia:Criteria for speedy deletion': 6,
+        'Wikipedia:Deletion policy': 7,
+        'Wikipedia:No original research': 8,
+        'Wikipedia:Biographies of living persons': 9,
+        'Wikipedia:Arguments to avoid in deletion discussions': 10,
+        'Wikipedia:Conflict of interest': 11,
+        'Wikipedia:Articles for deletion': 12
+    }

+
+    seg = pysbd.Segmenter(language="en", clean=False)
+    text_list = seg.segment(text)
+    model = 'research-dump/bert-large-uncased_wikistance_policy_v1'
+    res_list = []

+    for t in text_list:
+        res = extract_response(t, model,label_mapping)
+        highest_key = max(res, key=res.get)
+        highest_score = res[highest_key]
+        result = {'sentence': t, 'policy': highest_key, 'score': highest_score}
+        res_list.append(result)

+    return res_list
+
+
+
+#-----------------Sentiment Analysis-----------------
+
+def extract_highest_score_label(res):
+    flat_res = [item for sublist in res for item in sublist]
+    highest_score_item = max(flat_res, key=lambda x: x['score'])
+    highest_score_label = highest_score_item['label']
+    highest_score_value = highest_score_item['score']
+    return highest_score_label, highest_score_value
+
+
+def get_sentiment(text):
+    #sentiment analysis
+    model_name = "cardiffnlp/twitter-roberta-base-sentiment-latest"
+    tokenizer = AutoTokenizer.from_pretrained(model_name)
+    model = pipeline("text-classification", model=model_name, top_k= None)
+
+    #sentence tokenize the text using pysbd
+    seg = pysbd.Segmenter(language="en", clean=False)
+    text_list = seg.segment(text)
+
+    res = []
+    for t in text_list:
+        results = model(t)
+        highest_label, highest_score = extract_highest_score_label(results)
+        result = {'sentence': t,'sentiment': highest_label, 'score': highest_score}
+        res.append(result)
+    return res
+
+
+#-----------------Toxicity Prediction-----------------
+
+def get_offensive_label(text):
+    #offensive language detection model
+    model_name = "cardiffnlp/twitter-roberta-base-offensive"
+    tokenizer = AutoTokenizer.from_pretrained(model_name)
+    model = pipeline("text-classification", model=model_name, top_k= None)
+
+    #sentence tokenize the text using pysbd
+    seg = pysbd.Segmenter(language="en", clean=False)
+    text_list = seg.segment(text)
+
+    res = []
+    for t in text_list:
+        results = model(t)
+        highest_label, highest_score = extract_highest_score_label(results)
+        result = {'sentence': t,'offensive_label': highest_label, 'score': highest_score}
+        res.append(result)
+    return res
+
+
+#create the anchor function
+def predict_text(text, model_name):
+    if model_name == 'outcome':
+        return outcome(text)
+    elif model_name == 'stance':
+        return get_stance(text)
+    elif model_name == 'policy':
+        return get_policy(text)
+    elif model_name == 'sentiment':
+        return get_sentiment(text)
+    elif model_name == 'offensive':
+        return get_offensive_label(text)
+    else:
+        return "Invalid Task name"
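For reference, a small usage sketch of the anchor function added above; the sample sentence is made up, and per the code in this commit each task returns a list of per-sentence dicts with a 'sentence' key, a task-specific label key, and a 'score'.

# Hypothetical smoke test for the dispatcher defined in model_predict.py.
import model_predict

sample = "Delete. The article cites no independent sources and fails the notability guideline."

for task in ["outcome", "stance", "policy", "sentiment", "offensive"]:
    results = model_predict.predict_text(sample, task)
    print(task, results[0])   # e.g. {'sentence': ..., 'stance': 'delete', 'score': 0.97}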