Tymec committed
Commit: e1645d7
Parent: d29d6fe

Add slang map

Files changed (6):
  1. .gitattributes +3 -1
  2. .gitignore +2 -2
  3. README.md +3 -0
  4. app/constants.py +3 -0
  5. app/data.py +100 -3
  6. data/slang.json +229 -0
.gitattributes CHANGED
@@ -5,6 +5,7 @@
 # Hide from GitHub's language detection
 *.yaml linguist-documentation
 *.toml linguist-documentation
+*.json linguist-documentation

 # Remove assets from github statistics
 *.yaml linguist-vendored
@@ -12,10 +13,11 @@

 # Set the language for these files to ensure GitHub doesn't show the comments as errors
 .vscode/*.json linguist-language=JSON5
+data/* binary

 # Do not try and merge these files
 poetry.lock -diff
-*.ipynb -diff
+*.pkl -diff

 # LFS
 models/** filter=lfs diff=lfs merge=lfs -text
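
The new rules mark JSON as documentation for linguist, treat everything under data/ as binary, and stop suppressing diffs for notebooks in favour of pickle files. As a quick sanity check, a small Python sketch could query the applied attributes with git check-attr (the models/model.pkl and config.toml paths are hypothetical examples; git must be on PATH and the script run from the repository root):

import subprocess

# `git check-attr -a` lists every attribute that applies to each path, so the
# new linguist-documentation, binary and -diff rules should show up here.
paths = ["data/slang.json", "models/model.pkl", "config.toml"]
result = subprocess.run(
    ["git", "check-attr", "-a", *paths],
    capture_output=True,
    text=True,
    check=True,
)
print(result.stdout)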
.gitignore CHANGED
@@ -194,6 +194,6 @@ pyrightconfig.json
 # End of https://www.toptal.com/developers/gitignore/api/visualstudiocode,python

 # Custom
-data/
-cache/
+data/*
+!data/slang.json
 flagged/
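
Switching from data/ to data/* is what lets the !data/slang.json exception take effect: git cannot re-include a file whose parent directory is itself ignored, so only the directory's contents are excluded and the slang map stays tracked. A minimal verification sketch in Python (data/imdb50k.csv is just an example path taken from the README table; run from the repository root):

import subprocess

# `git check-ignore -v` prints the matching rule for an ignored path and
# exits non-zero with no output for paths that are not ignored.
for path in ["data/imdb50k.csv", "data/slang.json"]:
    result = subprocess.run(
        ["git", "check-ignore", "-v", path],
        capture_output=True,
        text=True,
    )
    print(f"{path}: {result.stdout.strip() or 'not ignored'}")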
README.md CHANGED
@@ -138,6 +138,9 @@ python -m app evaluate --help
 | imdb50k | `data/imdb50k.csv` | | [IMDB Movie Reviews](https://www.kaggle.com/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews) |
 | test | `data/test.csv` | required for `evaluate` | [Multiclass Sentiment Analysis](https://huggingface.co/datasets/Sp1786/multiclass-sentiment-analysis-dataset) |

+#### Used for text preprocessing
+- [Slang Map](https://www.kaggle.com/code/nmaguette/up-to-date-list-of-slangs-for-text-preprocessing)
+

 ### Vectorizers
 | Option | Description | When to Use |
app/constants.py CHANGED
@@ -19,6 +19,9 @@ IMDB50K_URL = "https://www.kaggle.com/datasets/lakshmi25npathi/imdb-dataset-of-5
 TEST_DATASET_PATH = DATA_DIR / "test.csv"
 TEST_DATASET_URL = "https://huggingface.co/datasets/Sp1786/multiclass-sentiment-analysis-dataset"

+SLANGMAP_PATH = DATA_DIR / "slang.json"
+SLANGMAP_URL = "https://www.kaggle.com/code/nmaguette/up-to-date-list-of-slangs-for-text-preprocessing"
+
 CACHE_DIR.mkdir(exist_ok=True, parents=True)
 DATA_DIR.mkdir(exist_ok=True, parents=True)
 MODEL_DIR.mkdir(exist_ok=True, parents=True)
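
Only the two new constants appear in the hunk; for context, here is a minimal sketch of how they presumably sit in the pathlib-based module (BASE_DIR and the exact DATA_DIR definition are assumptions, only the SLANGMAP_* lines come from the commit):

from pathlib import Path

# Assumed layout: app/constants.py with a sibling data/ directory at the project root.
BASE_DIR = Path(__file__).resolve().parent.parent
DATA_DIR = BASE_DIR / "data"

SLANGMAP_PATH = DATA_DIR / "slang.json"
SLANGMAP_URL = "https://www.kaggle.com/code/nmaguette/up-to-date-list-of-slangs-for-text-preprocessing"

# Mirrors the mkdir call shown in the surrounding context lines.
DATA_DIR.mkdir(exist_ok=True, parents=True)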
app/data.py CHANGED
@@ -1,8 +1,12 @@
 from __future__ import annotations

 import bz2
+import json
+import re
+from functools import lru_cache
 from typing import TYPE_CHECKING, Literal, Sequence

+import emoji
 import pandas as pd
 import spacy
 from tqdm import tqdm
@@ -14,11 +18,15 @@ from app.constants import (
     IMDB50K_URL,
     SENTIMENT140_PATH,
     SENTIMENT140_URL,
+    SLANGMAP_PATH,
+    SLANGMAP_URL,
     TEST_DATASET_PATH,
     TEST_DATASET_URL,
 )

 if TYPE_CHECKING:
+    from re import Pattern
+
     from spacy.tokens import Doc

 __all__ = ["load_data", "tokenize"]
@@ -35,6 +43,81 @@ except OSError:
     nlp = spacy.load("en_core_web_sm")


+@lru_cache(maxsize=1)
+def slang() -> tuple[Pattern, dict[str, str]]:
+    """Compile a re pattern for slang terms.
+
+    Returns:
+        Slang pattern and mapping
+
+    Raises:
+        FileNotFoundError: If the file is not found
+    """
+    if not SLANGMAP_PATH.exists():
+        # msg = f"Missing slang mapping file: {SLANG_PATH}"
+        msg = (
+            f"Slang mapping file not found at: '{SLANGMAP_PATH}'\n"
+            "Please download the file from:\n"
+            f"{SLANGMAP_URL}"
+        )  # fmt: off
+        raise FileNotFoundError(msg)
+
+    with SLANGMAP_PATH.open() as f:
+        mapping = json.load(f)
+
+    return re.compile(r"\b(" + "|".join(map(re.escape, mapping.keys())) + r")\b"), mapping
+
+
+def _clean(text: str) -> str:
+    """Perform basic text cleaning.
+
+    Args:
+        text: Text to clean
+
+    Returns:
+        Cleaned text
+    """
+    # Make text lowercase
+    text = text.lower()
+
+    # Remove HTML tags
+    text = re.sub(r"<[^>]*>", "", text)
+
+    # Map slang terms
+    slang_pattern, slang_mapping = slang()
+    text = slang_pattern.sub(lambda x: slang_mapping[x.group()], text)
+
+    # Remove acronyms and abbreviations
+    # text = re.sub(r"(?:[a-z]\.){2,}", "", text)
+    text = re.sub(r"(?:[a-z]\.?)(?:[a-z]\.)", "", text)
+
+    # Remove honorifics
+    text = re.sub(r"\b(?:mr|mrs|ms|dr|prof|sr|jr)\.?\b", "", text)
+
+    # Remove year abbreviations
+    text = re.sub(r"\b(?:\d{3}0|\d0)s?\b", "", text)
+
+    # Remove hashtags
+    text = re.sub(r"#[^\s]+", "", text)
+
+    # Replace mentions with a generic tag
+    text = re.sub(r"@[^\s]+", "user", text)
+
+    # Replace X/Y with X or Y
+    text = re.sub(r"\b([a-z]+)[//]([a-z]+)\b", r"\1 or \2", text)
+
+    # Convert emojis to text
+    text = emoji.demojize(text, delimiters=("emoji_", ""))
+
+    # Remove special characters
+    text = re.sub(r"[^a-z0-9\s]", "", text)
+
+    # EXTRA: imdb50k specific cleaning
+    text = re.sub(r"mst3k", "", text)  # Very common acronym for Mystery Science Theater 3000
+
+    return text.strip()
+
+
 def _lemmatize(doc: Doc, threshold: int = 2) -> Sequence[str]:
     """Lemmatize the provided text using spaCy.

@@ -46,12 +129,15 @@ def _lemmatize(doc: Doc, threshold: int = 2) -> Sequence[str]:
         Sequence of lemmatized tokens
     """
     return [
-        token.lemma_.lower().strip()
+        tok
         for token in doc
         if not token.is_stop  # Ignore stop words
         and not token.is_punct  # Ignore punctuation
+        and not token.like_email  # Ignore email addresses
+        and not token.like_url  # Ignore URLs
+        and not token.like_num  # Ignore numbers
         and not token.is_alpha  # Ignore non-alphabetic tokens
-        and not (len(token.lemma_) < threshold)  # Ignore short tokens
+        and not (len(tok := token.lemma_.lower().strip()) < threshold)  # Ignore short tokens
     ]


@@ -74,14 +160,25 @@ def tokenize(
     Returns:
         Tokenized text data
     """
+    text_data = [
+        _clean(text)
+        for text in tqdm(
+            text_data,
+            desc="Cleaning",
+            unit="doc",
+            disable=not show_progress,
+        )
+    ]
+
     return pd.Series(
         [
             _lemmatize(doc, character_threshold)
             for doc in tqdm(
                 nlp.pipe(text_data, batch_size=batch_size, n_process=n_jobs, disable=["parser", "ner", "tok2vec"]),
                 total=len(text_data),
-                disable=not show_progress,
+                desc="Lemmatization",
                 unit="doc",
+                disable=not show_progress,
             )
         ],
     )
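
As a self-contained illustration of the word-boundary substitution that slang() and _clean() build on, the sketch below inlines a three-entry mapping instead of reading data/slang.json; the sample text and entries are made up for demonstration:

import re

# Tiny stand-in for the mapping loaded from data/slang.json.
mapping = {"btw": "by the way", "idk": "i do not know", "w8": "wait"}

# Same construction as slang(): escape each key and join them into a single
# alternation anchored on word boundaries, so only whole tokens are rewritten.
pattern = re.compile(r"\b(" + "|".join(map(re.escape, mapping.keys())) + r")\b")

text = "btw idk if i can w8 that long"
print(pattern.sub(lambda m: mapping[m.group()], text.lower()))
# -> by the way i do not know if i can wait that long

Worth noting: keys that begin or end with a non-word character (for example "w/" or "$") only match when a word character sits on that side, because \b requires a word/non-word transition.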
data/slang.json ADDED
@@ -0,0 +1,229 @@
+{
+"$": " dollar ",
+"€": " euro ",
+"4ao": "for adults only",
+"a.m": "before midday",
+"a3": "anytime anywhere anyplace",
+"aamof": "as a matter of fact",
+"acct": "account",
+"adih": "another day in hell",
+"afaic": "as far as i am concerned",
+"afaict": "as far as i can tell",
+"afaik": "as far as i know",
+"afair": "as far as i remember",
+"afk": "away from keyboard",
+"app": "application",
+"approx": "approximately",
+"apps": "applications",
+"asap": "as soon as possible",
+"asl": "age, sex, location",
+"atk": "at the keyboard",
+"ave.": "avenue",
+"aymm": "are you my mother",
+"ayor": "at your own risk",
+"b&b": "bed and breakfast",
+"b+b": "bed and breakfast",
+"b.c": "before christ",
+"b2b": "business to business",
+"b2c": "business to customer",
+"b4": "before",
+"b4n": "bye for now",
+"b@u": "back at you",
+"bae": "before anyone else",
+"bak": "back at keyboard",
+"bbbg": "bye bye be good",
+"bbc": "british broadcasting corporation",
+"bbias": "be back in a second",
+"bbl": "be back later",
+"bbs": "be back soon",
+"be4": "before",
+"bfn": "bye for now",
+"blvd": "boulevard",
+"bout": "about",
+"brb": "be right back",
+"bros": "brothers",
+"brt": "be right there",
+"bsaaw": "big smile and a wink",
+"btw": "by the way",
+"bwl": "bursting with laughter",
+"c/o": "care of",
+"cet": "central european time",
+"cf": "compare",
+"cia": "central intelligence agency",
+"csl": "can not stop laughing",
+"cu": "see you",
+"cul8r": "see you later",
+"cv": "curriculum vitae",
+"cwot": "complete waste of time",
+"cya": "see you",
+"cyt": "see you tomorrow",
+"dae": "does anyone else",
+"dbmib": "do not bother me i am busy",
+"diy": "do it yourself",
+"dm": "direct message",
+"dwh": "during work hours",
+"e123": "easy as one two three",
+"eet": "eastern european time",
+"eg": "example",
+"embm": "early morning business meeting",
+"encl": "enclosed",
+"encl.": "enclosed",
+"etc": "and so on",
+"faq": "frequently asked questions",
+"fawc": "for anyone who cares",
+"fb": "facebook",
+"fc": "fingers crossed",
+"fig": "figure",
+"fimh": "forever in my heart",
+"ft.": "feet",
+"ft": "featuring",
+"ftl": "for the loss",
+"ftw": "for the win",
+"fwiw": "for what it is worth",
+"fyi": "for your information",
+"g9": "genius",
+"gahoy": "get a hold of yourself",
+"gal": "get a life",
+"gcse": "general certificate of secondary education",
+"gfn": "gone for now",
+"gg": "good game",
+"gl": "good luck",
+"glhf": "good luck have fun",
+"gmt": "greenwich mean time",
+"gmta": "great minds think alike",
+"gn": "good night",
+"g.o.a.t": "greatest of all time",
+"goat": "greatest of all time",
+"goi": "get over it",
+"gps": "global positioning system",
+"gr8": "great",
+"gratz": "congratulations",
+"gyal": "girl",
+"h&c": "hot and cold",
+"hp": "horsepower",
+"hr": "hour",
+"hrh": "his royal highness",
+"ht": "height",
+"ibrb": "i will be right back",
+"ic": "i see",
+"icq": "i seek you",
+"icymi": "in case you missed it",
+"idc": "i do not care",
+"idgadf": "i do not give a damn fuck",
+"idgaf": "i do not give a fuck",
+"idk": "i do not know",
+"ie": "that is",
+"i.e": "that is",
+"ifyp": "i feel your pain",
+"IG": "instagram",
+"iirc": "if i remember correctly",
+"ilu": "i love you",
+"ily": "i love you",
+"imho": "in my humble opinion",
+"imo": "in my opinion",
+"imu": "i miss you",
+"iow": "in other words",
+"irl": "in real life",
+"j4f": "just for fun",
+"jic": "just in case",
+"jk": "just kidding",
+"jsyk": "just so you know",
+"l8r": "later",
+"lb": "pound",
+"lbs": "pounds",
+"ldr": "long distance relationship",
+"lmao": "laugh my ass off",
+"lmfao": "laugh my fucking ass off",
+"lol": "laughing out loud",
+"ltd": "limited",
+"ltns": "long time no see",
+"m8": "mate",
+"mf": "motherfucker",
+"mfs": "motherfuckers",
+"mfw": "my face when",
+"mofo": "motherfucker",
+"mph": "miles per hour",
+"mr": "mister",
+"mrw": "my reaction when",
+"ms": "miss",
+"mte": "my thoughts exactly",
+"nagi": "not a good idea",
+"nbc": "national broadcasting company",
+"nbd": "not big deal",
+"nfs": "not for sale",
+"ngl": "not going to lie",
+"nhs": "national health service",
+"nrn": "no reply necessary",
+"nsfl": "not safe for life",
+"nsfw": "not safe for work",
+"nth": "nice to have",
+"nvr": "never",
+"nyc": "new york city",
+"oc": "original content",
+"og": "original",
+"ohp": "overhead projector",
+"oic": "oh i see",
+"omdb": "over my dead body",
+"omg": "oh my god",
+"omw": "on my way",
+"p.a": "per annum",
+"p.m": "after midday",
+"pm": "prime minister",
+"poc": "people of color",
+"pov": "point of view",
+"pp": "pages",
+"ppl": "people",
+"prw": "parents are watching",
+"ps": "postscript",
+"pt": "point",
+"ptb": "please text back",
+"pto": "please turn over",
+"qpsa": "what happens",
+"ratchet": "rude",
+"rbtl": "read between the lines",
+"rlrt": "real life retweet",
+"rofl": "rolling on the floor laughing",
+"roflol": "rolling on the floor laughing out loud",
+"rotflmao": "rolling on the floor laughing my ass off",
+"rt": "retweet",
+"ruok": "are you ok",
+"sfw": "safe for work",
+"sk8": "skate",
+"smh": "shake my head",
+"sq": "square",
+"srsly": "seriously",
+"ssdd": "same stuff different day",
+"tbh": "to be honest",
+"tbs": "tablespooful",
+"tbsp": "tablespooful",
+"tfw": "that feeling when",
+"thks": "thank you",
+"tho": "though",
+"thx": "thank you",
+"tia": "thanks in advance",
+"til": "today i learned",
+"tl;dr": "too long i did not read",
+"tldr": "too long i did not read",
+"tmb": "tweet me back",
+"tntl": "trying not to laugh",
+"ttyl": "talk to you later",
+"u": "you",
+"u2": "you too",
+"u4e": "yours for ever",
+"utc": "coordinated universal time",
+"w/": "with",
+"w/o": "without",
+"w8": "wait",
+"wassup": "what is up",
+"wb": "welcome back",
+"wtf": "what the fuck",
+"wtg": "way to go",
+"wtpa": "where the party at",
+"wuf": "where are you from",
+"wuzup": "what is up",
+"wywh": "wish you were here",
+"yd": "yard",
+"ygtr": "you got that right",
+"ynk": "you never know",
+"zzz": "sleeping bored and tired"
+}
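
Since _clean() lower-cases text before the slang pass and json.load silently collapses duplicate keys, a small, hypothetical validation sketch like the one below (run from the repository root) can flag entries that would never fire, such as the upper-case "IG" key:

import json
from collections import Counter

# Load the mapping while keeping duplicate keys visible instead of letting
# json.load merge them into a single entry.
with open("data/slang.json", encoding="utf-8") as f:
    pairs = json.load(f, object_pairs_hook=lambda kv: kv)

keys = [key for key, _ in pairs]
duplicates = [key for key, count in Counter(keys).items() if count > 1]
not_lowercase = [key for key in keys if key != key.lower()]

print(f"{len(keys)} entries")
print("duplicate keys:", duplicates or "none")
print("keys that cannot match lower-cased text:", not_lowercase or "none")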