File size: 1,786 Bytes
31d5dab |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 |
from emoji import demojize
from nltk.tokenize import TweetTokenizer
tokenizer = TweetTokenizer()
def normalizeToken(token):
lowercased_token = token.lower()
if token.startswith("@"):
return "@USER"
elif lowercased_token.startswith("http") or lowercased_token.startswith("www"):
return "HTTPURL"
elif len(token) == 1:
return demojize(token)
else:
if token == "’":
return "'"
elif token == "…":
return "..."
else:
return token
def normalizeTweet(tweet):
tokens = tokenizer.tokenize(tweet.replace("’", "'").replace("…", "..."))
normTweet = " ".join([normalizeToken(token) for token in tokens])
normTweet = (
normTweet.replace("cannot ", "can not ")
.replace("n't ", " n't ")
.replace("n 't ", " n't ")
.replace("ca n't", "can't")
.replace("ai n't", "ain't")
)
normTweet = (
normTweet.replace("'m ", " 'm ")
.replace("'re ", " 're ")
.replace("'s ", " 's ")
.replace("'ll ", " 'll ")
.replace("'d ", " 'd ")
.replace("'ve ", " 've ")
)
normTweet = (
normTweet.replace(" p . m .", " p.m.")
.replace(" p . m ", " p.m ")
.replace(" a . m .", " a.m.")
.replace(" a . m ", " a.m ")
)
return " ".join(normTweet.split())
if __name__ == "__main__":
print(
normalizeTweet(
"SC has first two presumptive cases of coronavirus, DHEC confirms https://postandcourier.com/health/covid19/sc-has-first-two-presumptive-cases-of-coronavirus-dhec-confirms/article_bddfe4ae-5fd3-11ea-9ce4-5f495366cee6.html?utm_medium=social&utm_source=twitter&utm_campaign=user-share… via @postandcourier"
)
)
|