Tymec committed
Commit: e1645d7
Parent: d29d6fe

Add slang map

Files changed (6):
  1. .gitattributes +3 -1
  2. .gitignore +2 -2
  3. README.md +3 -0
  4. app/constants.py +3 -0
  5. app/data.py +100 -3
  6. data/slang.json +229 -0
.gitattributes CHANGED
@@ -5,6 +5,7 @@
 # Hide from GitHub's language detection
 *.yaml linguist-documentation
 *.toml linguist-documentation
+*.json linguist-documentation

 # Remove assets from github statistics
 *.yaml linguist-vendored
@@ -12,10 +13,11 @@

 # Set the language for these files to ensure GitHub doesn't show the comments as errors
 .vscode/*.json linguist-language=JSON5
+data/* binary

 # Do not try and merge these files
 poetry.lock -diff
-*.ipynb -diff
+*.pkl -diff

 # LFS
 models/** filter=lfs diff=lfs merge=lfs -text
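
The new rules mark JSON as documentation for linguist, treat everything under data/ as binary, and stop suppressing diffs for notebooks in favour of pickle files. As a quick sanity check, a small Python sketch could query the applied attributes with git check-attr (the models/model.pkl and config.toml paths are hypothetical examples; git must be on PATH and the script run from the repository root):

import subprocess

# `git check-attr -a` lists every attribute that applies to each path, so the
# new linguist-documentation, binary and -diff rules should show up here.
paths = ["data/slang.json", "models/model.pkl", "config.toml"]
result = subprocess.run(
    ["git", "check-attr", "-a", *paths],
    capture_output=True,
    text=True,
    check=True,
)
print(result.stdout)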
.gitignore CHANGED
@@ -194,6 +194,6 @@ pyrightconfig.json
 # End of https://www.toptal.com/developers/gitignore/api/visualstudiocode,python

 # Custom
-data/
-cache/
+data/*
+!data/slang.json
 flagged/
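
Switching from data/ to data/* is what lets the !data/slang.json exception take effect: git cannot re-include a file whose parent directory is itself ignored, so only the directory's contents are excluded and the slang map stays tracked. A minimal verification sketch in Python (data/imdb50k.csv is just an example path taken from the README table; run from the repository root):

import subprocess

# `git check-ignore -v` prints the matching rule for an ignored path and
# exits non-zero with no output for paths that are not ignored.
for path in ["data/imdb50k.csv", "data/slang.json"]:
    result = subprocess.run(
        ["git", "check-ignore", "-v", path],
        capture_output=True,
        text=True,
    )
    print(f"{path}: {result.stdout.strip() or 'not ignored'}")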
README.md CHANGED
@@ -138,6 +138,9 @@ python -m app evaluate --help
 | imdb50k | `data/imdb50k.csv` | | [IMDB Movie Reviews](https://www.kaggle.com/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews) |
 | test | `data/test.csv` | required for `evaluate` | [Multiclass Sentiment Analysis](https://huggingface.co/datasets/Sp1786/multiclass-sentiment-analysis-dataset) |

+#### Used for text preprocessing
+- [Slang Map](https://www.kaggle.com/code/nmaguette/up-to-date-list-of-slangs-for-text-preprocessing)
+

 ### Vectorizers
 | Option | Description | When to Use |
app/constants.py CHANGED
@@ -19,6 +19,9 @@ IMDB50K_URL = "https://www.kaggle.com/datasets/lakshmi25npathi/imdb-dataset-of-5
 TEST_DATASET_PATH = DATA_DIR / "test.csv"
 TEST_DATASET_URL = "https://huggingface.co/datasets/Sp1786/multiclass-sentiment-analysis-dataset"

+SLANGMAP_PATH = DATA_DIR / "slang.json"
+SLANGMAP_URL = "https://www.kaggle.com/code/nmaguette/up-to-date-list-of-slangs-for-text-preprocessing"
+
 CACHE_DIR.mkdir(exist_ok=True, parents=True)
 DATA_DIR.mkdir(exist_ok=True, parents=True)
 MODEL_DIR.mkdir(exist_ok=True, parents=True)
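
Only the two new constants appear in the hunk; for context, here is a minimal sketch of how they presumably sit in the pathlib-based module (BASE_DIR and the exact DATA_DIR definition are assumptions, only the SLANGMAP_* lines come from the commit):

from pathlib import Path

# Assumed layout: app/constants.py with a sibling data/ directory at the project root.
BASE_DIR = Path(__file__).resolve().parent.parent
DATA_DIR = BASE_DIR / "data"

SLANGMAP_PATH = DATA_DIR / "slang.json"
SLANGMAP_URL = "https://www.kaggle.com/code/nmaguette/up-to-date-list-of-slangs-for-text-preprocessing"

# Mirrors the mkdir call shown in the surrounding context lines.
DATA_DIR.mkdir(exist_ok=True, parents=True)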
app/data.py CHANGED
@@ -1,8 +1,12 @@
 from __future__ import annotations

 import bz2
+import json
+import re
+from functools import lru_cache
 from typing import TYPE_CHECKING, Literal, Sequence

+import emoji
 import pandas as pd
 import spacy
 from tqdm import tqdm
@@ -14,11 +18,15 @@ from app.constants import (
     IMDB50K_URL,
     SENTIMENT140_PATH,
     SENTIMENT140_URL,
+    SLANGMAP_PATH,
+    SLANGMAP_URL,
     TEST_DATASET_PATH,
     TEST_DATASET_URL,
 )

 if TYPE_CHECKING:
+    from re import Pattern
+
     from spacy.tokens import Doc

 __all__ = ["load_data", "tokenize"]
@@ -35,6 +43,81 @@ except OSError:
     nlp = spacy.load("en_core_web_sm")


+@lru_cache(maxsize=1)
+def slang() -> tuple[Pattern, dict[str, str]]:
+    """Compile a re pattern for slang terms.
+
+    Returns:
+        Slang pattern and mapping
+
+    Raises:
+        FileNotFoundError: If the file is not found
+    """
+    if not SLANGMAP_PATH.exists():
+        # msg = f"Missing slang mapping file: {SLANG_PATH}"
+        msg = (
+            f"Slang mapping file not found at: '{SLANGMAP_PATH}'\n"
+            "Please download the file from:\n"
+            f"{SLANGMAP_URL}"
+        )  # fmt: off
+        raise FileNotFoundError(msg)
+
+    with SLANGMAP_PATH.open() as f:
+        mapping = json.load(f)
+
+    return re.compile(r"\b(" + "|".join(map(re.escape, mapping.keys())) + r")\b"), mapping
+
+
+def _clean(text: str) -> str:
+    """Perform basic text cleaning.
+
+    Args:
+        text: Text to clean
+
+    Returns:
+        Cleaned text
+    """
+    # Make text lowercase
+    text = text.lower()
+
+    # Remove HTML tags
+    text = re.sub(r"<[^>]*>", "", text)
+
+    # Map slang terms
+    slang_pattern, slang_mapping = slang()
+    text = slang_pattern.sub(lambda x: slang_mapping[x.group()], text)
+
+    # Remove acronyms and abbreviations
+    # text = re.sub(r"(?:[a-z]\.){2,}", "", text)
+    text = re.sub(r"(?:[a-z]\.?)(?:[a-z]\.)", "", text)
+
+    # Remove honorifics
+    text = re.sub(r"\b(?:mr|mrs|ms|dr|prof|sr|jr)\.?\b", "", text)
+
+    # Remove year abbreviations
+    text = re.sub(r"\b(?:\d{3}0|\d0)s?\b", "", text)
+
+    # Remove hashtags
+    text = re.sub(r"#[^\s]+", "", text)
+
+    # Replace mentions with a generic tag
+    text = re.sub(r"@[^\s]+", "user", text)
+
+    # Replace X/Y with X or Y
+    text = re.sub(r"\b([a-z]+)[//]([a-z]+)\b", r"\1 or \2", text)
+
+    # Convert emojis to text
+    text = emoji.demojize(text, delimiters=("emoji_", ""))
+
+    # Remove special characters
+    text = re.sub(r"[^a-z0-9\s]", "", text)
+
+    # EXTRA: imdb50k specific cleaning
+    text = re.sub(r"mst3k", "", text)  # Very common acronym for Mystery Science Theater 3000
+
+    return text.strip()
+
+
 def _lemmatize(doc: Doc, threshold: int = 2) -> Sequence[str]:
     """Lemmatize the provided text using spaCy.

@@ -46,12 +129,15 @@ def _lemmatize(doc: Doc, threshold: int = 2) -> Sequence[str]:
         Sequence of lemmatized tokens
     """
     return [
-        token.lemma_.lower().strip()
+        tok
         for token in doc
         if not token.is_stop  # Ignore stop words
         and not token.is_punct  # Ignore punctuation
+        and not token.like_email  # Ignore email addresses
+        and not token.like_url  # Ignore URLs
+        and not token.like_num  # Ignore numbers
         and not token.is_alpha  # Ignore non-alphabetic tokens
-        and not (len(token.lemma_) < threshold)  # Ignore short tokens
+        and not (len(tok := token.lemma_.lower().strip()) < threshold)  # Ignore short tokens
     ]


@@ -74,14 +160,25 @@ def tokenize(
     Returns:
         Tokenized text data
     """
+    text_data = [
+        _clean(text)
+        for text in tqdm(
+            text_data,
+            desc="Cleaning",
+            unit="doc",
+            disable=not show_progress,
+        )
+    ]
+
     return pd.Series(
         [
             _lemmatize(doc, character_threshold)
             for doc in tqdm(
                 nlp.pipe(text_data, batch_size=batch_size, n_process=n_jobs, disable=["parser", "ner", "tok2vec"]),
                 total=len(text_data),
-                disable=not show_progress,
+                desc="Lemmatization",
                 unit="doc",
+                disable=not show_progress,
             )
         ],
     )
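
As a self-contained illustration of the word-boundary substitution that slang() and _clean() build on, the sketch below inlines a three-entry mapping instead of reading data/slang.json; the sample text and entries are made up for demonstration:

import re

# Tiny stand-in for the mapping loaded from data/slang.json.
mapping = {"btw": "by the way", "idk": "i do not know", "w8": "wait"}

# Same construction as slang(): escape each key and join them into a single
# alternation anchored on word boundaries, so only whole tokens are rewritten.
pattern = re.compile(r"\b(" + "|".join(map(re.escape, mapping.keys())) + r")\b")

text = "btw idk if i can w8 that long"
print(pattern.sub(lambda m: mapping[m.group()], text.lower()))
# -> by the way i do not know if i can wait that long

Worth noting: keys that begin or end with a non-word character (for example "w/" or "$") only match when a word character sits on that side, because \b requires a word/non-word transition.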
data/slang.json ADDED
@@ -0,0 +1,229 @@
+{
+"$": " dollar ",
+"€": " euro ",
+"4ao": "for adults only",
+"a.m": "before midday",
+"a3": "anytime anywhere anyplace",
+"aamof": "as a matter of fact",
+"acct": "account",
+"adih": "another day in hell",
+"afaic": "as far as i am concerned",
+"afaict": "as far as i can tell",
+"afaik": "as far as i know",
+"afair": "as far as i remember",
+"afk": "away from keyboard",
+"app": "application",
+"approx": "approximately",
+"apps": "applications",
+"asap": "as soon as possible",
+"asl": "age, sex, location",
+"atk": "at the keyboard",
+"ave.": "avenue",
+"aymm": "are you my mother",
+"ayor": "at your own risk",
+"b&b": "bed and breakfast",
+"b+b": "bed and breakfast",
+"b.c": "before christ",
+"b2b": "business to business",
+"b2c": "business to customer",
+"b4": "before",
+"b4n": "bye for now",
+"b@u": "back at you",
+"bae": "before anyone else",
+"bak": "back at keyboard",
+"bbbg": "bye bye be good",
+"bbc": "british broadcasting corporation",
+"bbias": "be back in a second",
+"bbl": "be back later",
+"bbs": "be back soon",
+"be4": "before",
+"bfn": "bye for now",
+"blvd": "boulevard",
+"bout": "about",
+"brb": "be right back",
+"bros": "brothers",
+"brt": "be right there",
+"bsaaw": "big smile and a wink",
+"btw": "by the way",
+"bwl": "bursting with laughter",
+"c/o": "care of",
+"cet": "central european time",
+"cf": "compare",
+"cia": "central intelligence agency",
+"csl": "can not stop laughing",
+"cu": "see you",
+"cul8r": "see you later",
+"cv": "curriculum vitae",
+"cwot": "complete waste of time",
+"cya": "see you",
+"cyt": "see you tomorrow",
+"dae": "does anyone else",
+"dbmib": "do not bother me i am busy",
+"diy": "do it yourself",
+"dm": "direct message",
+"dwh": "during work hours",
+"e123": "easy as one two three",
+"eet": "eastern european time",
+"eg": "example",
+"embm": "early morning business meeting",
+"encl": "enclosed",
+"encl.": "enclosed",
+"etc": "and so on",
+"faq": "frequently asked questions",
+"fawc": "for anyone who cares",
+"fb": "facebook",
+"fc": "fingers crossed",
+"fig": "figure",
+"fimh": "forever in my heart",
+"ft.": "feet",
+"ft": "featuring",
+"ftl": "for the loss",
+"ftw": "for the win",
+"fwiw": "for what it is worth",
+"fyi": "for your information",
+"g9": "genius",
+"gahoy": "get a hold of yourself",
+"gal": "get a life",
+"gcse": "general certificate of secondary education",
+"gfn": "gone for now",
+"gg": "good game",
+"gl": "good luck",
+"glhf": "good luck have fun",
+"gmt": "greenwich mean time",
+"gmta": "great minds think alike",
+"gn": "good night",
+"g.o.a.t": "greatest of all time",
+"goat": "greatest of all time",
+"goi": "get over it",
+"gps": "global positioning system",
+"gr8": "great",
+"gratz": "congratulations",
+"gyal": "girl",
+"h&c": "hot and cold",
+"hp": "horsepower",
+"hr": "hour",
+"hrh": "his royal highness",
+"ht": "height",
+"ibrb": "i will be right back",
+"ic": "i see",
+"icq": "i seek you",
+"icymi": "in case you missed it",
+"idc": "i do not care",
+"idgadf": "i do not give a damn fuck",
+"idgaf": "i do not give a fuck",
+"idk": "i do not know",
+"ie": "that is",
+"i.e": "that is",
+"ifyp": "i feel your pain",
+"IG": "instagram",
+"iirc": "if i remember correctly",
+"ilu": "i love you",
+"ily": "i love you",
+"imho": "in my humble opinion",
+"imo": "in my opinion",
+"imu": "i miss you",
+"iow": "in other words",
+"irl": "in real life",
+"j4f": "just for fun",
+"jic": "just in case",
+"jk": "just kidding",
+"jsyk": "just so you know",
+"l8r": "later",
+"lb": "pound",
+"lbs": "pounds",
+"ldr": "long distance relationship",
+"lmao": "laugh my ass off",
+"lmfao": "laugh my fucking ass off",
+"lol": "laughing out loud",
+"ltd": "limited",
+"ltns": "long time no see",
+"m8": "mate",
+"mf": "motherfucker",
+"mfs": "motherfuckers",
+"mfw": "my face when",
+"mofo": "motherfucker",
+"mph": "miles per hour",
+"mr": "mister",
+"mrw": "my reaction when",
+"ms": "miss",
+"mte": "my thoughts exactly",
+"nagi": "not a good idea",
+"nbc": "national broadcasting company",
+"nbd": "not big deal",
+"nfs": "not for sale",
+"ngl": "not going to lie",
+"nhs": "national health service",
+"nrn": "no reply necessary",
+"nsfl": "not safe for life",
+"nsfw": "not safe for work",
+"nth": "nice to have",
+"nvr": "never",
+"nyc": "new york city",
+"oc": "original content",
+"og": "original",
+"ohp": "overhead projector",
+"oic": "oh i see",
+"omdb": "over my dead body",
+"omg": "oh my god",
+"omw": "on my way",
+"p.a": "per annum",
+"p.m": "after midday",
+"pm": "prime minister",
+"poc": "people of color",
+"pov": "point of view",
+"pp": "pages",
+"ppl": "people",
+"prw": "parents are watching",
+"ps": "postscript",
+"pt": "point",
+"ptb": "please text back",
+"pto": "please turn over",
+"qpsa": "what happens",
+"ratchet": "rude",
+"rbtl": "read between the lines",
+"rlrt": "real life retweet",
+"rofl": "rolling on the floor laughing",
+"roflol": "rolling on the floor laughing out loud",
+"rotflmao": "rolling on the floor laughing my ass off",
+"rt": "retweet",
+"ruok": "are you ok",
+"sfw": "safe for work",
+"sk8": "skate",
+"smh": "shake my head",
+"sq": "square",
+"srsly": "seriously",
+"ssdd": "same stuff different day",
+"tbh": "to be honest",
+"tbs": "tablespooful",
+"tbsp": "tablespooful",
+"tfw": "that feeling when",
+"thks": "thank you",
+"tho": "though",
+"thx": "thank you",
+"tia": "thanks in advance",
+"til": "today i learned",
+"tl;dr": "too long i did not read",
+"tldr": "too long i did not read",
+"tmb": "tweet me back",
+"tntl": "trying not to laugh",
+"ttyl": "talk to you later",
+"u": "you",
+"u2": "you too",
+"u4e": "yours for ever",
+"utc": "coordinated universal time",
+"w/": "with",
+"w/o": "without",
+"w8": "wait",
+"wassup": "what is up",
+"wb": "welcome back",
+"wtf": "what the fuck",
+"wtg": "way to go",
+"wtpa": "where the party at",
+"wuf": "where are you from",
+"wuzup": "what is up",
+"wywh": "wish you were here",
+"yd": "yard",
+"ygtr": "you got that right",
+"ynk": "you never know",
+"zzz": "sleeping bored and tired"
+}
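
Since _clean() lower-cases text before the slang pass and json.load silently collapses duplicate keys, a small, hypothetical validation sketch like the one below (run from the repository root) can flag entries that would never fire, such as the upper-case "IG" key:

import json
from collections import Counter

# Load the mapping while keeping duplicate keys visible instead of letting
# json.load merge them into a single entry.
with open("data/slang.json", encoding="utf-8") as f:
    pairs = json.load(f, object_pairs_hook=lambda kv: kv)

keys = [key for key, _ in pairs]
duplicates = [key for key, count in Counter(keys).items() if count > 1]
not_lowercase = [key for key in keys if key != key.lower()]

print(f"{len(keys)} entries")
print("duplicate keys:", duplicates or "none")
print("keys that cannot match lower-cased text:", not_lowercase or "none")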