from emoji import demojize
from nltk.tokenize import TweetTokenizer

tokenizer = TweetTokenizer()


def normalizeToken(token):
    """Normalize one token: mentions become @USER, URLs become HTTPURL, emoji are demojized."""
    lowercased_token = token.lower()
    if token.startswith("@"):
        return "@USER"
    elif lowercased_token.startswith("http") or lowercased_token.startswith("www"):
        return "HTTPURL"
    elif len(token) == 1:
        # Single-character tokens are passed through demojize so emoji become text codes.
        return demojize(token)
    else:
        if token == "’":
            return "'"
        elif token == "…":
            return "..."
        else:
            return token


def normalizeTweet(tweet):
    """Tokenize a raw tweet with NLTK's TweetTokenizer and normalize each token."""
    tokens = tokenizer.tokenize(tweet.replace("’", "'").replace("…", "..."))
    normTweet = " ".join([normalizeToken(token) for token in tokens])

    # Separate contraction clitics (n't) into their own tokens,
    # while keeping "can't" and "ain't" intact as whole words.
    normTweet = (
        normTweet.replace("cannot ", "can not ")
        .replace("n't ", " n't ")
        .replace("n 't ", " n't ")
        .replace("ca n't", "can't")
        .replace("ai n't", "ain't")
    )
    # Separate the remaining contraction clitics ('m, 're, 's, 'll, 'd, 've).
    normTweet = (
        normTweet.replace("'m ", " 'm ")
        .replace("'re ", " 're ")
        .replace("'s ", " 's ")
        .replace("'ll ", " 'll ")
        .replace("'d ", " 'd ")
        .replace("'ve ", " 've ")
    )
    # Rejoin "a.m." / "p.m." abbreviations that the tokenizer split apart.
    normTweet = (
        normTweet.replace(" p . m .", " p.m.")
        .replace(" p . m ", " p.m ")
        .replace(" a . m .", " a.m.")
        .replace(" a . m ", " a.m ")
    )

    # Collapse any repeated whitespace introduced by the replacements above.
    return " ".join(normTweet.split())


if __name__ == "__main__":
    print(
        normalizeTweet(
            "SC has first two presumptive cases of coronavirus, DHEC confirms https://postandcourier.com/health/covid19/sc-has-first-two-presumptive-cases-of-coronavirus-dhec-confirms/article_bddfe4ae-5fd3-11ea-9ce4-5f495366cee6.html?utm_medium=social&utm_source=twitter&utm_campaign=user-share… via @postandcourier"
        )
    )