Spaces:
Sleeping
Sleeping
miesnerjacob
committed on
Commit
•
a00f9ba
1
Parent(s):
59fcc9f
added class and method docstrings
Browse files- emotion_detection.py +12 -10
- keyword_extraction.py +52 -49
- named_entity_recognition.py +10 -7
- part_of_speech_tagging.py +5 -5
- sentiment_analysis.py +16 -14
emotion_detection.py
CHANGED
@@ -5,7 +5,8 @@ import pandas as pd
|
|
5 |
|
6 |
|
7 |
class EmotionDetection:
|
8 |
-
"""
|
|
|
9 |
|
10 |
Attributes:
|
11 |
tokenizer: An instance of Hugging Face Tokenizer
|
@@ -21,13 +22,13 @@ class EmotionDetection:
|
|
21 |
|
22 |
def justify(self, text):
|
23 |
"""
|
24 |
-
|
25 |
|
26 |
Parameters:
|
27 |
-
|
28 |
|
29 |
Returns:
|
30 |
-
|
31 |
"""
|
32 |
|
33 |
word_attributions = self.explainer(text)
|
@@ -37,13 +38,13 @@ class EmotionDetection:
|
|
37 |
|
38 |
def classify(self, text):
|
39 |
"""
|
40 |
-
|
41 |
|
42 |
Parameters:
|
43 |
-
|
44 |
|
45 |
Returns:
|
46 |
-
|
47 |
"""
|
48 |
|
49 |
tokens = self.tokenizer.encode_plus(text, add_special_tokens=False, return_tensors='pt')
|
@@ -57,13 +58,14 @@ class EmotionDetection:
|
|
57 |
|
58 |
def run(self, text):
|
59 |
"""
|
60 |
-
|
61 |
|
62 |
Parameters:
|
63 |
-
|
64 |
|
65 |
Returns:
|
66 |
-
|
|
|
67 |
"""
|
68 |
|
69 |
preds = self.classify(text)
|
|
|
5 |
|
6 |
|
7 |
class EmotionDetection:
|
8 |
+
"""
|
9 |
+
Emotion Detection on text data.
|
10 |
|
11 |
Attributes:
|
12 |
tokenizer: An instance of Hugging Face Tokenizer
|
|
|
22 |
|
23 |
def justify(self, text):
|
24 |
"""
|
25 |
+
Get html annotation for displaying emotion justification over text.
|
26 |
|
27 |
Parameters:
|
28 |
+
text (str): The user input string to generate emotion justification for
|
29 |
|
30 |
Returns:
|
31 |
+
html (html): html object for plotting emotion prediction justification
|
32 |
"""
|
33 |
|
34 |
word_attributions = self.explainer(text)
|
|
|
38 |
|
39 |
def classify(self, text):
|
40 |
"""
|
41 |
+
Recognize Emotion in text.
|
42 |
|
43 |
Parameters:
|
44 |
+
text (str): The user input string to perform emotion classification on
|
45 |
|
46 |
Returns:
|
47 |
+
predictions (str): The predicted probabilities for emotion classes
|
48 |
"""
|
49 |
|
50 |
tokens = self.tokenizer.encode_plus(text, add_special_tokens=False, return_tensors='pt')
|
|
|
58 |
|
59 |
def run(self, text):
|
60 |
"""
|
61 |
+
Classify and Justify Emotion in text.
|
62 |
|
63 |
Parameters:
|
64 |
+
text (str): The user input string to perform emotion classification on
|
65 |
|
66 |
Returns:
|
67 |
+
predictions (str): The predicted probabilities for emotion classes
|
68 |
+
html (html): html object for plotting emotion prediction justification
|
69 |
"""
|
70 |
|
71 |
preds = self.classify(text)
|
keyword_extraction.py
CHANGED
@@ -1,12 +1,11 @@
|
|
1 |
-
import spacy
|
2 |
-
import pytextrank
|
3 |
import re
|
4 |
from operator import itemgetter
|
5 |
import en_core_web_sm
|
6 |
|
7 |
|
8 |
class KeywordExtractor:
|
9 |
-
"""
|
|
|
10 |
|
11 |
Attributes:
|
12 |
nlp: An instance English pipeline optimized for CPU for spacy
|
@@ -18,13 +17,13 @@ class KeywordExtractor:
|
|
18 |
|
19 |
def get_keywords(self, text, max_keywords):
|
20 |
"""
|
21 |
-
|
22 |
|
23 |
Parameters:
|
24 |
-
|
25 |
|
26 |
Returns:
|
27 |
-
|
28 |
"""
|
29 |
|
30 |
doc = self.nlp(text)
|
@@ -33,41 +32,43 @@ class KeywordExtractor:
|
|
33 |
|
34 |
return kws
|
35 |
|
36 |
-
def
|
37 |
"""
|
38 |
-
|
39 |
|
40 |
Parameters:
|
41 |
-
|
|
|
42 |
|
43 |
Returns:
|
44 |
-
|
45 |
"""
|
46 |
|
47 |
-
|
48 |
-
for s in
|
49 |
-
|
50 |
-
|
51 |
|
52 |
-
return
|
53 |
|
54 |
-
def
|
55 |
"""
|
56 |
-
|
57 |
|
58 |
Parameters:
|
59 |
-
|
60 |
|
61 |
Returns:
|
62 |
-
|
63 |
"""
|
64 |
|
65 |
# Sort the array on the basis of start values of intervals.
|
66 |
-
|
|
|
67 |
stack = []
|
68 |
# insert first interval into stack
|
69 |
-
stack.append(
|
70 |
-
for i in
|
71 |
# Check for overlapping interval,
|
72 |
# if interval overlap
|
73 |
if (stack[-1][0] <= i[0] <= stack[-1][-1]) or (stack[-1][-1] == i[0]-1):
|
@@ -76,69 +77,71 @@ class KeywordExtractor:
|
|
76 |
stack.append(i)
|
77 |
return stack
|
78 |
|
79 |
-
def merge_until_finished(self,
|
80 |
"""
|
81 |
-
|
82 |
|
83 |
Parameters:
|
84 |
-
|
85 |
|
86 |
Returns:
|
87 |
-
|
88 |
"""
|
89 |
|
90 |
-
|
91 |
while True:
|
92 |
-
merged = self.
|
93 |
-
if
|
94 |
-
|
95 |
-
return
|
96 |
else:
|
97 |
-
|
98 |
|
99 |
-
def get_annotation(self, text,
|
100 |
"""
|
101 |
-
|
102 |
|
103 |
Parameters:
|
104 |
-
|
105 |
|
106 |
Returns:
|
107 |
-
|
108 |
"""
|
109 |
|
110 |
arr = list(text)
|
111 |
-
for idx in sorted(
|
112 |
arr.insert(idx[0], "<kw>")
|
113 |
arr.insert(idx[1]+1, "XXXxxxXXXxxxXXX <kw>")
|
114 |
-
|
115 |
-
split =
|
116 |
-
|
117 |
|
118 |
kws_check = []
|
119 |
-
for i in
|
120 |
if type(i) is tuple:
|
121 |
kws_check.append(i[0])
|
122 |
|
123 |
-
return
|
124 |
|
125 |
def generate(self, text, max_keywords):
|
126 |
"""
|
127 |
-
|
128 |
|
129 |
Parameters:
|
130 |
-
|
|
|
131 |
|
132 |
Returns:
|
133 |
-
|
|
|
134 |
"""
|
135 |
|
136 |
kws = self.get_keywords(text, max_keywords)
|
137 |
|
138 |
-
|
139 |
-
if
|
140 |
-
|
141 |
-
annotation = self.get_annotation(text,
|
142 |
else:
|
143 |
annotation = None
|
144 |
|
|
|
|
|
|
|
1 |
import re
|
2 |
from operator import itemgetter
|
3 |
import en_core_web_sm
|
4 |
|
5 |
|
6 |
class KeywordExtractor:
|
7 |
+
"""
|
8 |
+
Keyword Extraction on text data
|
9 |
|
10 |
Attributes:
|
11 |
nlp: An instance English pipeline optimized for CPU for spacy
|
|
|
17 |
|
18 |
def get_keywords(self, text, max_keywords):
|
19 |
"""
|
20 |
+
Extract keywords from text.
|
21 |
|
22 |
Parameters:
|
23 |
+
text (str): The user input string to extract keywords from
|
24 |
|
25 |
Returns:
|
26 |
+
kws (list): list of extracted keywords
|
27 |
"""
|
28 |
|
29 |
doc = self.nlp(text)
|
|
|
32 |
|
33 |
return kws
|
34 |
|
35 |
+
def get_keyword_indices(self, kws, text):
|
36 |
"""
|
37 |
+
Find the start and end indices of every keyword occurrence in text.
|
38 |
|
39 |
Parameters:
|
40 |
+
kws (list): list of extracted keywords
|
41 |
+
text (str): The user input string to extract keywords from
|
42 |
|
43 |
Returns:
|
44 |
+
keyword_indices (list): list of indices for keyword boundaries in text
|
45 |
"""
|
46 |
|
47 |
+
keyword_indices = []
|
48 |
+
for s in kws:
|
49 |
+
indices = [[m.start(), m.end()] for m in re.finditer(re.escape(s), text)]
|
50 |
+
keyword_indices.extend(indices)
|
51 |
|
52 |
+
return keyword_indices
|
53 |
|
54 |
+
def merge_overlapping_indices(self, keyword_indices):
|
55 |
"""
|
56 |
+
Merge overlapping keyword indices.
|
57 |
|
58 |
Parameters:
|
59 |
+
keyword_indices (list): list of indices for keyword boundaries in text
|
60 |
|
61 |
Returns:
|
62 |
+
keyword_indices (list): list of indices for keyword boundaries in text, with overlapping ranges combined
|
63 |
"""
|
64 |
|
65 |
# Sort the array on the basis of start values of intervals.
|
66 |
+
keyword_indices.sort()
|
67 |
+
|
68 |
stack = []
|
69 |
# insert first interval into stack
|
70 |
+
stack.append(keyword_indices[0])
|
71 |
+
for i in keyword_indices[1:]:
|
72 |
# Check for overlapping interval,
|
73 |
# if interval overlap
|
74 |
if (stack[-1][0] <= i[0] <= stack[-1][-1]) or (stack[-1][-1] == i[0]-1):
|
|
|
77 |
stack.append(i)
|
78 |
return stack
|
79 |
|
80 |
+
def merge_until_finished(self, keyword_indices):
|
81 |
"""
|
82 |
+
Loop until no overlapping keyword indices left.
|
83 |
|
84 |
Parameters:
|
85 |
+
keyword_indices (list): list of indices for keyword boundaries in text
|
86 |
|
87 |
Returns:
|
88 |
+
keyword_indices (list): list of indices for keyword boundaries in text, with overlapping ranges combined
|
89 |
"""
|
90 |
|
91 |
+
len_indices = 0
|
92 |
while True:
|
93 |
+
merged = self.merge_overlapping_indices(keyword_indices)
|
94 |
+
if len_indices == len(merged):
|
95 |
+
out_indices = sorted(merged, key=itemgetter(0))
|
96 |
+
return out_indices
|
97 |
else:
|
98 |
+
len_indices = len(merged)
|
99 |
|
100 |
+
def get_annotation(self, text, keyword_indices):
|
101 |
"""
|
102 |
+
Create text annotation for extracted keywords.
|
103 |
|
104 |
Parameters:
|
105 |
+
keyword_indices (list): list of indices for keyword boundaries in text
|
106 |
|
107 |
Returns:
|
108 |
+
annotation (list): list of tuples for generating html
|
109 |
"""
|
110 |
|
111 |
arr = list(text)
|
112 |
+
for idx in sorted(keyword_indices, reverse=True):
|
113 |
arr.insert(idx[0], "<kw>")
|
114 |
arr.insert(idx[1]+1, "XXXxxxXXXxxxXXX <kw>")
|
115 |
+
joined_annotation = ''.join(arr)
|
116 |
+
split = joined_annotation.split('<kw>')
|
117 |
+
annotation = [(x.replace('XXXxxxXXXxxxXXX ', ''), "KEY", "#26aaef") if "XXXxxxXXXxxxXXX" in x else x for x in split]
|
118 |
|
119 |
kws_check = []
|
120 |
+
for i in annotation:
|
121 |
if type(i) is tuple:
|
122 |
kws_check.append(i[0])
|
123 |
|
124 |
+
return annotation
|
125 |
|
126 |
def generate(self, text, max_keywords):
|
127 |
"""
|
128 |
+
Create text annotation for extracted keywords.
|
129 |
|
130 |
Parameters:
|
131 |
+
text (str): The user input string to extract keywords from
|
132 |
+
max_keywords (int): Limit on number of keywords to generate
|
133 |
|
134 |
Returns:
|
135 |
+
annotation (list): list of tuples for generating html
|
136 |
+
kws (list): list of extracted keywords
|
137 |
"""
|
138 |
|
139 |
kws = self.get_keywords(text, max_keywords)
|
140 |
|
141 |
+
indices = list(self.get_keyword_indices(kws, text))
|
142 |
+
if indices:
|
143 |
+
indices_merged = self.merge_until_finished(indices)
|
144 |
+
annotation = self.get_annotation(text, indices_merged, kws)
|
145 |
else:
|
146 |
annotation = None
|
147 |
|
named_entity_recognition.py
CHANGED
@@ -3,7 +3,8 @@ from transformers import pipeline
|
|
3 |
|
4 |
|
5 |
class NamedEntityRecognition:
|
6 |
-
"""
|
|
|
7 |
|
8 |
Attributes:
|
9 |
tokenizer: An instance of Hugging Face Tokenizer
|
@@ -18,13 +19,14 @@ class NamedEntityRecognition:
|
|
18 |
|
19 |
def get_annotation(self, preds, text):
|
20 |
"""
|
21 |
-
|
22 |
|
23 |
Parameters:
|
24 |
-
|
|
|
25 |
|
26 |
Returns:
|
27 |
-
|
28 |
"""
|
29 |
|
30 |
splits = [0]
|
@@ -48,13 +50,14 @@ class NamedEntityRecognition:
|
|
48 |
|
49 |
def classify(self, text):
|
50 |
"""
|
51 |
-
|
52 |
|
53 |
Parameters:
|
54 |
-
|
55 |
|
56 |
Returns:
|
57 |
-
|
|
|
58 |
"""
|
59 |
|
60 |
preds = self.nlp(text)
|
|
|
3 |
|
4 |
|
5 |
class NamedEntityRecognition:
|
6 |
+
"""
|
7 |
+
Named Entity Recognition on text data.
|
8 |
|
9 |
Attributes:
|
10 |
tokenizer: An instance of Hugging Face Tokenizer
|
|
|
19 |
|
20 |
def get_annotation(self, preds, text):
|
21 |
"""
|
22 |
+
Get html annotation for displaying entities over text.
|
23 |
|
24 |
Parameters:
|
25 |
+
preds (dict): List of entities and their associated metadata
|
26 |
+
text (str): The user input string to generate entity tags for
|
27 |
|
28 |
Returns:
|
29 |
+
final_annotation (list): List of tuples to pass to text annotation html creator
|
30 |
"""
|
31 |
|
32 |
splits = [0]
|
|
|
50 |
|
51 |
def classify(self, text):
|
52 |
"""
|
53 |
+
Recognize Named Entities in text.
|
54 |
|
55 |
Parameters:
|
56 |
+
text (str): The user input string to generate entity tags for
|
57 |
|
58 |
Returns:
|
59 |
+
predictions: The recognized entities and their associated metadata (exact type not shown here; TODO confirm)
|
60 |
+
ner_annotation (list): List of tuples to pass to text annotation html creator (presumably the result of get_annotation; TODO confirm)
|
61 |
"""
|
62 |
|
63 |
preds = self.nlp(text)
|
part_of_speech_tagging.py
CHANGED
@@ -12,15 +12,15 @@ class POSTagging:
|
|
12 |
|
13 |
def classify(self, text):
|
14 |
"""
|
15 |
-
|
16 |
|
17 |
Parameters:
|
18 |
-
|
19 |
|
20 |
Returns:
|
21 |
-
|
22 |
"""
|
23 |
|
24 |
text = word_tokenize(text)
|
25 |
-
|
26 |
-
return
|
|
|
12 |
|
13 |
def classify(self, text):
|
14 |
"""
|
15 |
+
Generate Part of Speech tags.
|
16 |
|
17 |
Parameters:
|
18 |
+
text (str): The user input string to generate tags for
|
19 |
|
20 |
Returns:
|
21 |
+
predictions (list): list of tuples containing words and their respective tags
|
22 |
"""
|
23 |
|
24 |
text = word_tokenize(text)
|
25 |
+
predictions = nltk.pos_tag(text)
|
26 |
+
return predictions
|
sentiment_analysis.py
CHANGED
@@ -5,7 +5,8 @@ import pandas as pd
|
|
5 |
|
6 |
|
7 |
class SentimentAnalysis:
|
8 |
-
"""
|
|
|
9 |
|
10 |
Attributes:
|
11 |
tokenizer: An instance of Hugging Face Tokenizer
|
@@ -32,13 +33,13 @@ class SentimentAnalysis:
|
|
32 |
|
33 |
def justify(self, text):
|
34 |
"""
|
35 |
-
|
36 |
|
37 |
Parameters:
|
38 |
-
|
39 |
|
40 |
Returns:
|
41 |
-
|
42 |
"""
|
43 |
|
44 |
word_attributions = self.explainer(text)
|
@@ -48,35 +49,36 @@ class SentimentAnalysis:
|
|
48 |
|
49 |
def classify(self, text):
|
50 |
"""
|
51 |
-
|
52 |
|
53 |
Parameters:
|
54 |
-
|
55 |
|
56 |
Returns:
|
57 |
-
|
58 |
"""
|
59 |
|
60 |
tokens = self.tokenizer.encode_plus(text, add_special_tokens=False, return_tensors='pt')
|
61 |
outputs = self.model(**tokens)
|
62 |
probs = torch.nn.functional.softmax(outputs[0], dim=-1)
|
63 |
probs = probs.mean(dim=0).detach().numpy()
|
64 |
-
|
65 |
|
66 |
-
return
|
67 |
|
68 |
def run(self, text):
|
69 |
"""
|
70 |
-
|
71 |
|
72 |
Parameters:
|
73 |
-
|
74 |
|
75 |
Returns:
|
76 |
-
|
|
|
77 |
"""
|
78 |
|
79 |
-
|
80 |
html = self.justify(text)
|
81 |
|
82 |
-
return
|
|
|
5 |
|
6 |
|
7 |
class SentimentAnalysis:
|
8 |
+
"""
|
9 |
+
Sentiment Analysis on text data.
|
10 |
|
11 |
Attributes:
|
12 |
tokenizer: An instance of Hugging Face Tokenizer
|
|
|
33 |
|
34 |
def justify(self, text):
|
35 |
"""
|
36 |
+
Get html annotation for displaying sentiment justification over text.
|
37 |
|
38 |
Parameters:
|
39 |
+
text (str): The user input string to generate sentiment justification for
|
40 |
|
41 |
Returns:
|
42 |
+
html (html): html object for plotting sentiment prediction justification
|
43 |
"""
|
44 |
|
45 |
word_attributions = self.explainer(text)
|
|
|
49 |
|
50 |
def classify(self, text):
|
51 |
"""
|
52 |
+
Recognize Sentiment in text.
|
53 |
|
54 |
Parameters:
|
55 |
+
text (str): The user input string to perform sentiment classification on
|
56 |
|
57 |
Returns:
|
58 |
+
predictions (pd.Series): The predicted probabilities for sentiment classes
|
59 |
"""
|
60 |
|
61 |
tokens = self.tokenizer.encode_plus(text, add_special_tokens=False, return_tensors='pt')
|
62 |
outputs = self.model(**tokens)
|
63 |
probs = torch.nn.functional.softmax(outputs[0], dim=-1)
|
64 |
probs = probs.mean(dim=0).detach().numpy()
|
65 |
+
predictions = pd.Series(probs, index=["Negative", "Neutral", "Positive"], name='Predicted Probability')
|
66 |
|
67 |
+
return predictions
|
68 |
|
69 |
def run(self, text):
|
70 |
"""
|
71 |
+
Classify and Justify Sentiment in text.
|
72 |
|
73 |
Parameters:
|
74 |
+
text (str): The user input string to perform sentiment classification on
|
75 |
|
76 |
Returns:
|
77 |
+
predictions (pd.Series): The predicted probabilities for sentiment classes
|
78 |
+
html (html): html object for plotting sentiment prediction justification
|
79 |
"""
|
80 |
|
81 |
+
predictions = self.classify(text)
|
82 |
html = self.justify(text)
|
83 |
|
84 |
+
return predictions, html
|