Upload 2 files
Browse files- TweetNormalizer.py +59 -0
- requirements.txt +11 -0
TweetNormalizer.py
ADDED
@@ -0,0 +1,59 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from emoji import demojize
|
2 |
+
from nltk.tokenize import TweetTokenizer
|
3 |
+
|
4 |
+
|
5 |
+
# Module-level TweetTokenizer instance, created once at import time and
# used by normalizeTweet to split raw tweet text into tokens.
tokenizer = TweetTokenizer()
|
6 |
+
|
7 |
+
|
8 |
+
def normalizeToken(token):
    """Return the normalized form of a single tweet token.

    Rules, applied in order:
      * a typographic apostrophe ("’") becomes "'" and a one-character
        ellipsis ("…") becomes "...";
      * a token starting with "@" becomes the placeholder "@USER";
      * a token starting with "http" or "www" (case-insensitive) becomes
        the placeholder "HTTPURL";
      * any other single-character token is passed through ``demojize``
        (from the ``emoji`` package) so an emoji is replaced by its text
        alias;
      * everything else is returned unchanged.

    Args:
        token: One token string, e.g. as produced by the tweet tokenizer.

    Returns:
        The normalized token string.
    """
    # These two checks must come before the single-character branch: both
    # strings have length 1, so when they were tested after ``len(token) == 1``
    # they could never be reached and the characters were handed to
    # ``demojize`` instead of being normalized.
    if token == "’":
        return "'"
    if token == "…":
        return "..."

    if token.startswith("@"):
        return "@USER"
    if token.lower().startswith(("http", "www")):
        return "HTTPURL"
    if len(token) == 1:
        return demojize(token)
    return token
|
23 |
+
|
24 |
+
|
25 |
+
def normalizeTweet(tweet):
    """Normalize a raw tweet string into a space-separated token string.

    Steps:
      1. Replace the typographic apostrophe and one-character ellipsis
         with their ASCII forms.
      2. Tokenize with the module-level ``tokenizer`` and normalize each
         token with ``normalizeToken``.
      3. Apply an ordered list of literal substring rewrites that adjust
         spacing around contractions ("n't", "'m", "'re", ...) and rejoin
         "a.m." / "p.m." abbreviations.
      4. Collapse all runs of whitespace to single spaces and strip the ends.

    Args:
        tweet: The raw tweet text.

    Returns:
        The normalized tweet as a single string.
    """
    ascii_tweet = tweet.replace("’", "'").replace("…", "...")

    normalized_tokens = []
    for raw_token in tokenizer.tokenize(ascii_tweet):
        normalized_tokens.append(normalizeToken(raw_token))
    text = " ".join(normalized_tokens)

    # Order matters: later rules operate on the output of earlier ones
    # (e.g. "ca n't" only appears after the "n't " rule has run). Most
    # patterns include a trailing space, so they only match when another
    # token follows.
    rewrite_rules = (
        # Negations.
        ("cannot ", "can not "),
        ("n't ", " n't "),
        ("n 't ", " n't "),
        ("ca n't", "can't"),
        ("ai n't", "ain't"),
        # Clitic suffixes.
        ("'m ", " 'm "),
        ("'re ", " 're "),
        ("'s ", " 's "),
        ("'ll ", " 'll "),
        ("'d ", " 'd "),
        ("'ve ", " 've "),
        # Time abbreviations split apart by the tokenizer.
        (" p . m .", " p.m."),
        (" p . m ", " p.m "),
        (" a . m .", " a.m."),
        (" a . m ", " a.m "),
    )
    for pattern, replacement in rewrite_rules:
        text = text.replace(pattern, replacement)

    # split() with no arguments drops leading/trailing whitespace and
    # collapses the double spaces introduced by the rewrites above.
    return " ".join(text.split())
|
52 |
+
|
53 |
+
|
54 |
+
if __name__ == "__main__":
    # Demo when run as a script: normalize one sample tweet containing a
    # URL, a one-character ellipsis and a user mention, then print the result.
    sample_tweet = "SC has first two presumptive cases of coronavirus, DHEC confirms https://postandcourier.com/health/covid19/sc-has-first-two-presumptive-cases-of-coronavirus-dhec-confirms/article_bddfe4ae-5fd3-11ea-9ce4-5f495366cee6.html?utm_medium=social&utm_source=twitter&utm_campaign=user-share… via @postandcourier"
    print(normalizeTweet(sample_tweet))
|
requirements.txt
ADDED
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
joblib
|
2 |
+
transformers
|
3 |
+
matplotlib
|
4 |
+
pandas
|
5 |
+
emoji
|
6 |
+
nltk
|
7 |
+
seaborn
|
8 |
+
numpy
|
9 |
+
torch
|
10 |
+
tensorflow
|
11 |
+
tf-keras
|