hsuvaskakoty committed
Commit
3c77d98
1 Parent(s): 3ba1861

initial commit

Files changed (4)
  1. app.py +41 -0
  2. data_prep.py +131 -0
  3. model_predict.py +28 -0
  4. requirements.txt +7 -0
app.py ADDED
@@ -0,0 +1,41 @@
+ import data_prep
+ import model_predict
+ import gradio as gr
+
+ # Dictionary mapping display names to the corresponding Hugging Face model paths
+ model_dict = {
+     "bert-base": "research-dump/bert-base-uncased_deletion_multiclass_complete_Final",
+     "bert-large": "research-dump/bert-large-uncased_deletion_multiclass_complete_final",
+     "roberta-base": "research-dump/roberta-base_deletion_multiclass_complete_final"
+ }
+
+ def process_url(url, model_key):
+     # Get the actual model path from model_dict
+     model_name = model_dict[model_key]
+
+     # Scrape and clean the discussion text from the URL
+     processed_text = data_prep.process_data(url)
+
+     # Predict the labels and their probabilities
+     final_scores = model_predict.predict_text(processed_text, model_name)
+
+     # Find the label with the highest probability
+     highest_prob_label = max(final_scores, key=final_scores.get)
+     highest_prob = final_scores[highest_prob_label]
+
+     # Pass through all label scores for the JSON output
+     progress_bars = {label: score for label, score in final_scores.items()}
+
+     return highest_prob_label, highest_prob, progress_bars
+
+ # Define the interface for the Gradio app
+ url_input = gr.Textbox(label="URL")
+ model_name_input = gr.Dropdown(label="Model Name", choices=list(model_dict.keys()), value=list(model_dict.keys())[0])
+ outputs = [
+     gr.Textbox(label="Label with Highest Probability"),
+     gr.Textbox(label="Probability"),
+     gr.JSON(label="All Labels and Probabilities")
+ ]
+
+ demo = gr.Interface(fn=process_url, inputs=[url_input, model_name_input], outputs=outputs)
+ demo.launch()
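
app.py returns the top label, its probability, and the full score dictionary as three separate outputs. As a hedged alternative sketch (not part of this commit), the same score dictionary could be shown with Gradio's gr.Label component, which renders a {label: confidence} mapping as bars; classify is a hypothetical helper, and the model path is copied from model_dict above.

import gradio as gr
import data_prep
import model_predict

def classify(url):
    # Reuse the commit's scraping and prediction code end to end
    text = data_prep.process_data(url)
    return model_predict.predict_text(
        text, "research-dump/roberta-base_deletion_multiclass_complete_final"
    )

label_demo = gr.Interface(
    fn=classify,
    inputs=gr.Textbox(label="URL"),
    outputs=gr.Label(label="Outcome probabilities"),
)
# label_demo.launch()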
data_prep.py ADDED
@@ -0,0 +1,131 @@
+ import requests
+ import pandas as pd
+ from bs4 import BeautifulSoup
+ from datetime import datetime
+ import pysbd
+
+
+ def extract_div_contents_from_url(url):
+     response = requests.get(url)
+     if response.status_code != 200:
+         return pd.DataFrame(columns=['title', 'text_url', 'deletion_discussion', 'label', 'confirmation', 'discussion', 'verdict'])
+
+     soup = BeautifulSoup(response.content, 'html.parser')
+     divs = soup.find_all('div', class_='boilerplate afd vfd xfd-closed archived mw-archivedtalk')
+
+     # Extract the title fragment from the URL
+     url_fragment = url.split('#')[-1].replace('_', ' ')
+
+     data = []
+     for div in divs:
+         try:
+             title = None
+             text_url = None
+
+             # Extract title and text_url
+             title_tag = div.find('a')
+             if title_tag:
+                 title_span = div.find('span', {'data-mw-comment-start': True})
+                 if title_span:
+                     title_anchor = title_span.find_next_sibling('a')
+                     if title_anchor:
+                         title = title_anchor.text
+                         text_url = 'https://en.wikipedia.org' + title_anchor['href']
+                 else:
+                     title = title_tag.text
+                     text_url = 'https://en.wikipedia.org' + title_tag['href']
+
+             # Fall back to the section heading if the first anchor was not the article link
+             if title == 'talk page' or title is None:
+                 heading_tag = div.find('div', class_='mw-heading mw-heading3')
+                 if heading_tag:
+                     title_tag = heading_tag.find('a')
+                     if title_tag:
+                         title = title_tag.text
+                         text_url = 'https://en.wikipedia.org' + title_tag['href']
+
+             if title != url_fragment:
+                 continue  # Skip if the title does not match the URL fragment
+
+             deletion_discussion = div.prettify()
+
+             # Extract label
+             label = ''
+             verdict_tag = div.find('p')
+             if verdict_tag:
+                 label_b_tag = verdict_tag.find('b')
+                 if label_b_tag:
+                     label = label_b_tag.text.strip()
+
+             # Extract confirmation
+             confirmation = ''
+             discussion_tag = div.find('dd').find('i')
+             if discussion_tag:
+                 confirmation_b_tag = discussion_tag.find('b')
+                 if confirmation_b_tag:
+                     confirmation = confirmation_b_tag.text.strip()
+
+             # Split deletion_discussion into discussion and verdict
+             parts = deletion_discussion.split('<div class="mw-heading mw-heading3">')
+             discussion = parts[0] if len(parts) > 0 else ''
+             verdict = '<div class="mw-heading mw-heading3">' + parts[1] if len(parts) > 1 else ''
+
+             data.append([title, text_url, deletion_discussion, label, confirmation, verdict, discussion])
+         except Exception as e:
+             print(f"Error processing div: {e}")
+             continue
+
+     df = pd.DataFrame(data, columns=['title', 'text_url', 'deletion_discussion', 'label', 'confirmation', 'verdict', 'discussion'])
+     df = df[['title', 'discussion', 'verdict', 'label']]
+     return df
+
+
+ def extract_post_links_text(discussion_html):
+     # Keep only the part of the HTML after the first plainlinks span
+     split_point = '<span class="plainlinks">'
+     if split_point in discussion_html:
+         parts = discussion_html.split(split_point)
+         if len(parts) > 1:
+             return parts[1]
+     return discussion_html
+
+
+ def process_discussion(df):
+     df['discussion_cleaned'] = df['verdict'].apply(extract_post_links_text)
+     return df
+
+
+ def html_to_plaintext(html_content):
+     soup = BeautifulSoup(html_content, 'html.parser')
+     # Insert newlines around block-level tags so paragraph breaks survive get_text()
+     for tag in soup.find_all(['p', 'li', 'dd', 'dl']):
+         tag.insert_before('\n')
+         tag.insert_after('\n')
+     for br in soup.find_all('br'):
+         br.replace_with('\n')
+
+     text = soup.get_text(separator=' ', strip=True)
+     text = '\n'.join([line.strip() for line in text.splitlines() if line.strip() != ''])
+     return text
+
+
+ def process_html_to_plaintext(df):
+     df['discussion_cleaned'] = df['discussion_cleaned'].apply(html_to_plaintext)
+     df = df[['title', 'discussion_cleaned', 'label']]
+     return df
+
+
+ def split_text_into_sentences(text):
+     # Segment with pysbd, drop the first sentence, and rejoin the rest
+     seg = pysbd.Segmenter(language="en", clean=False)
+     sentences = seg.segment(text)
+     return ' '.join(sentences[1:])
+
+
+ def process_split_text_into_sentences(df):
+     df['discussion_cleaned'] = df['discussion_cleaned'].apply(split_text_into_sentences)
+     return df
+
+
+ def process_data(url):
+     df = extract_div_contents_from_url(url)
+     df = process_discussion(df)
+     df = process_html_to_plaintext(df)
+     df = process_split_text_into_sentences(df)
+     # Return "title : cleaned discussion" if the scrape produced any rows
+     if not df.empty:
+         return df.at[0, 'title'] + ' : ' + df.at[0, 'discussion_cleaned']
+     else:
+         return 'Empty DataFrame'
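
The last step in process_data drops the first pysbd sentence from the cleaned text before it reaches the classifier. A small hedged illustration of that step (the sample string below is invented for demonstration, not taken from a real discussion):

import pysbd

sample = (
    "The result was delete. "
    "Non-notable subject with no independent coverage. "
    "Delete per nom. Keep, sources exist in local archives."
)
seg = pysbd.Segmenter(language="en", clean=False)
print(seg.segment(sample))                 # list of sentences; the first is dropped
print(' '.join(seg.segment(sample)[1:]))   # what process_data would keep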
model_predict.py ADDED
@@ -0,0 +1,28 @@
+ # Use a text-classification pipeline to predict labels for the input text
+ from transformers import pipeline
+ import torch
+
+ # Map each human-readable outcome to its class index and pipeline label id
+ label_mapping = {
+     'delete': [0, 'LABEL_0'],
+     'keep': [1, 'LABEL_1'],
+     'merge': [2, 'LABEL_2'],
+     'no consensus': [3, 'LABEL_3'],
+     'speedy keep': [4, 'LABEL_4'],
+     'speedy delete': [5, 'LABEL_5'],
+     'redirect': [6, 'LABEL_6'],
+     'withdrawn': [7, 'LABEL_7']
+ }
+
+ def predict_text(text, model_name):
+     model = pipeline("text-classification", model=model_name, return_all_scores=True)
+     results = model(text)
+     final_scores = {key: 0.0 for key in label_mapping}
+
+     # Translate LABEL_0 ... LABEL_7 back to their human-readable names
+     for result in results[0]:
+         for key, value in label_mapping.items():
+             if result['label'] == value[1]:
+                 final_scores[key] = result['score']
+                 break
+
+     return final_scores
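
predict_text builds a new pipeline on every call, so each request reloads the checkpoint from disk or the Hub. A hedged sketch of one way to avoid that, not part of this commit: get_pipeline and predict_text_cached are hypothetical helpers, and label_mapping refers to the dict defined above.

from functools import lru_cache
from transformers import pipeline

@lru_cache(maxsize=3)
def get_pipeline(model_name):
    # One cached pipeline per model name, loaded on first use
    return pipeline("text-classification", model=model_name, return_all_scores=True)

def predict_text_cached(text, model_name):
    results = get_pipeline(model_name)(text)
    final_scores = {key: 0.0 for key in label_mapping}
    for result in results[0]:
        for key, value in label_mapping.items():
            if result['label'] == value[1]:
                final_scores[key] = result['score']
                break
    return final_scores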
requirements.txt ADDED
@@ -0,0 +1,7 @@
+ requests
+ beautifulsoup4
+ pandas
+ numpy
+ pysbd
+ transformers
+ torch