vives committed on
Commit d56e301 · 1 parent: cc3cff0

Create nlp_entities.py

Files changed (1)
  1. nlp_entities.py +142 -0
nlp_entities.py ADDED
@@ -0,0 +1,142 @@
+ #@title NLP Entities code
+ import re
+
+ import spacy
+ import pytextrank  # noqa: F401 -- registers the "textrank" spaCy component
+
+ # Module-level setup assumed by the functions below, which reference `nlp`,
+ # `all_stopwords` and `FILT_GROUPS` without defining them. The use of
+ # `doc._.phrases` implies a pytextrank pipe; the model name and the filtered
+ # entity label set are assumptions, not part of the original commit.
+ nlp = spacy.load("en_core_web_sm")
+ nlp.add_pipe("textrank")
+ all_stopwords = nlp.Defaults.stop_words
+ FILT_GROUPS = {"CARDINAL", "DATE", "ORDINAL", "PERCENT", "QUANTITY", "TIME"}
+
+
+ def er_data_cleaning(raw: str) -> str:
+     """
+     Strip HTML tags and special characters from text.
+
+     :param raw: raw input string (may be None)
+     :return: str: stripped string
+     """
+     if raw is None:
+         raw = ""
+
+     # Remove HTML tags
+     html_removed = re.sub(r"<[^<]+?>", " ", raw)
+     # Remove forward slashes
+     raw_line_removed = str(html_removed).replace("/", " ")
+     # Remove special entities like &quot;, &amp; etc.
+     special_entities_removed = re.sub(r"&[\w]+;", "", raw_line_removed)
+     # Remove unicode characters like \u200c, \u200E etc.
+     unicode_chars_removed = special_entities_removed.encode("ascii", "ignore").decode("utf-8")
+     unicode_chars_removed = re.sub(r"\\u[\d]{3}[\w]", " ", unicode_chars_removed)
+
+     return unicode_chars_removed.strip()
+
+
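+ # Quick illustration of er_data_cleaning (not in the original commit): tags
+ # and HTML entities are stripped, but internal whitespace is not collapsed.
+ #   er_data_cleaning("<p>Fish &amp; Chips</p>")  ->  "Fish  Chips"
+ #   er_data_cleaning(None)                       ->  ""
+
+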
+ def get_clean_text_blobs(text_blobs):
+     """
+     Clean up text blobs.
+
+     :param text_blobs: list
+     :return: cleaned_text_blobs: list
+     """
+     cleaned_text_blobs = []
+     for text_blob in text_blobs:
+         cleaned_text_blobs.append(er_data_cleaning(raw=text_blob))
+     return cleaned_text_blobs
+
+
+ def get_phrases_pagerank(text_blobs, limit=1, token_len_min=2, token_len_max=3):
+     """
+     Return key phrases based on PageRank (TextRank over the spaCy doc).
+
+     :param text_blobs: list of text
+     # TODO: limit param is redundant because we are returning all the key phrases. Probably get rid of it
+     :param limit: fraction (0..1) of the ranked key phrases to keep
+     :param token_len_min: minimum number of non-stopword tokens in a key phrase
+     :param token_len_max: maximum number of non-stopword tokens in a key phrase
+     :return: dict mapping key phrase -> {"weight", "kp_length", "count"}
+     """
+     assert 0 <= limit <= 1
+     text = ". ".join(text_blobs)
+     doc = nlp(text)
+     # doc._.textrank.pos_kept = POS
+     # doc._.textrank.token_lookback = token_lookback
+
+     total_len = len(doc._.phrases)
+     return_phrases = int(total_len * limit)
+
+     # examine the top-ranked phrases in the document
+     out_phrases = dict()
+     for p in doc._.phrases[:return_phrases]:
+         # measure phrase length after dropping stopwords
+         tokenized_kp = p.text.split()
+         filtered_tokens = [word for word in tokenized_kp if word not in all_stopwords]
+         kp_length = len(filtered_tokens)
+         if p.rank > 0 and token_len_min <= kp_length <= token_len_max:
+             joined_kp = " ".join(filtered_tokens)
+             if joined_kp in out_phrases:
+                 # same phrase seen again: accumulate its rank
+                 out_phrases[joined_kp]["weight"] += p.rank
+                 out_phrases[joined_kp]["kp_length"] = kp_length
+             else:
+                 # count is a dummy value
+                 out_phrases[joined_kp] = {"weight": p.rank, "kp_length": kp_length, "count": 1}
+     return out_phrases
+
+
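+ # Illustrative output shape for get_phrases_pagerank (values are made up,
+ # not from a real run):
+ #   get_phrases_pagerank(["machine learning systems ..."], limit=1)
+ #   -> {"machine learning": {"weight": 0.1234, "kp_length": 2, "count": 1}, ...}
+
+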
+ def dict_normalization(interest_dictionary, target=1.0):
+     """
+     Normalize the dictionary weights so they sum to target.
+
+     :param interest_dictionary: dict of key phrases and scores
+     :param target: normalization score
+     :return: normalized interest dictionary
+     """
+     curr_score = 0
+     # skip normalization if pagerank returned no output
+     if len(interest_dictionary) > 0:
+         for kp_info in interest_dictionary.values():
+             curr_score += kp_info["weight"]
+         factor = target / curr_score
+         for kp in interest_dictionary:
+             interest_dictionary[kp]["weight"] = round(interest_dictionary[kp]["weight"] * factor, 4)
+     return interest_dictionary
+
+
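+ # Worked example: two phrases with weight 2.0 each sum to 4.0, so with the
+ # default target=1.0 the factor is 0.25 and each weight becomes 0.5:
+ #   dict_normalization({"a": {"weight": 2.0}, "b": {"weight": 2.0}})
+ #   -> {"a": {"weight": 0.5}, "b": {"weight": 0.5}}
+
+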
+ def get_ners(text_blobs):
+     """
+     Get named entities.
+
+     :param text_blobs: list of text blobs
+     :return: dict mapping named-entity text to its occurrence count
+     """
+     k_ners = dict()
+     for text_blob in text_blobs:
+         doc = nlp(text_blob)
+
+         for ent in doc.ents:
+             if ent.label_ not in FILT_GROUPS:
+                 # increment count associated with named entity
+                 if ent.text in k_ners:
+                     k_ners[ent.text] += 1
+                 else:
+                     k_ners[ent.text] = 1
+     return k_ners
+
+
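+ # Illustrative output for get_ners (the exact entities depend on the spaCy
+ # model; labels in FILT_GROUPS are skipped):
+ #   get_ners(["Apple hired Tim Cook.", "Apple grew."])
+ #   -> {"Apple": 2, "Tim Cook": 1}
+
+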
+ def return_ners_and_kp(text_blobs, ret_ne=False):
+     """
+     Return named entities and key phrases corresponding to the text blobs.
+
+     :param text_blobs: list of text blobs
+     :param ret_ne: boolean, also return named entities when True
+     :return: dict: {"NE": {tag1: count, tag2: count},
+                     "KP": {tag3: {"weight": float, "kp_length": int, "count": int},
+                            tag4: {"weight": float, "kp_length": int, "count": int}}}
+     """
+     return_tags = dict()
+     cleaned_text_blobs = get_clean_text_blobs(text_blobs=text_blobs)
+     kps = get_phrases_pagerank(text_blobs=cleaned_text_blobs)
+     kps = dict_normalization(kps)
+     return_tags["KP"] = kps
+     if ret_ne:
+         ners = get_ners(text_blobs=cleaned_text_blobs)
+         return_tags["NE"] = ners
+     return return_tags
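+
+
+ # Minimal usage sketch (not in the original commit). Assumes the spaCy model
+ # and pytextrank set up at the top of this module are installed:
+ #   pip install spacy pytextrank && python -m spacy download en_core_web_sm
+ if __name__ == "__main__":
+     blobs = [
+         "<p>Graph algorithms &amp; ranking: PageRank scores nodes by link structure.</p>",
+         "Key phrase extraction with TextRank builds a graph over candidate tokens.",
+     ]
+     tags = return_ners_and_kp(blobs, ret_ne=True)
+     print(tags["KP"])  # normalized key-phrase weights
+     print(tags["NE"])  # named-entity counts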