Spaces:

strongpear
/

Vietnamese-aspect-detection

Paused

App Files Files Community

strongpear committed on Aug 14, 2023

Commit

0beb932

1 Parent(s): 49fe48c

create data_preprocessing.py

Browse files

Files changed (1) hide show

data_preprocessing.py +177 -0

data_preprocessing.py ADDED Viewed

	@@ -0,0 +1,177 @@

+# -*- coding: utf-8 -*-
+"""
+Created on Fri Jul 28 08:29:31 2023
+@author: ASUS
+"""
+import pandas as pd
+import os
+import glob
+import re
+import unicodedata2
+from underthesea import word_tokenize
# Directory (relative to the working directory) that holds the raw input CSVs.
path = 'raw_data/'
# glob returns matches in arbitrary, OS-dependent order; sorting makes the
# order in which the CSVs are concatenated reproducible between runs.
files = sorted(glob.glob(os.path.join(path, "*.csv")))
def read_csv_file(file):
    """Load every CSV listed in the module-level ``files`` into one DataFrame.

    For each CSV, rows whose ``comments`` value has fewer than 10
    space-separated tokens are discarded. The remaining rows of all files are
    concatenated, the helper columns ``index`` and ``Unnamed: 0`` are removed
    when present, and exact duplicate rows are dropped.

    Args:
        file: Unused. The function has always iterated the module-level
            ``files`` list rather than this argument; the parameter is kept
            so existing call sites keep working.

    Returns:
        pandas.DataFrame: the filtered, de-duplicated rows. An empty
        DataFrame is returned when ``files`` is empty.
    """
    frames = []
    for csv_path in files:
        df = pd.read_csv(csv_path)
        # Missing comments are read as NaN (a float); treat them as empty text
        # so they are filtered out instead of raising AttributeError.
        token_counts = (
            df['comments'].fillna('').astype(str).str.split(' ').str.len()
        )
        frames.append(df[token_counts >= 10])

    if not frames:
        return pd.DataFrame()

    raw_df = pd.concat(frames, ignore_index=True)
    # errors='ignore': not every CSV necessarily carries these helper columns.
    raw_df = raw_df.drop(columns=['index', 'Unnamed: 0'], errors='ignore')
    return raw_df.drop_duplicates()
def remove_xem_them(text):
    """Delete the "Xem thêm" ("See more") marker from ``text``.

    Only the two spellings "Xem thêm" and "xem thêm" are removed; any other
    casing is left untouched. Surrounding whitespace is not collapsed.
    """
    for marker in ("Xem thêm", "xem thêm"):
        text = text.replace(marker, "")
    return text
# Characters blanked out by remove_emojis, compiled once at import time.
# NOTE: the ranges overlap heavily. Their union is U+200D, U+231A, U+23CF,
# U+23E9 plus everything from U+24C2 up to U+10FFFF, so CJK and most other
# non-Latin scripts are stripped as well, not only emoji. Vietnamese letters
# (all below U+24C2) are unaffected.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # regional indicators (flags)
    "\U00002500-\U00002BEF"  # box drawing .. misc symbols and arrows
    "\U00002702-\U000027B0"  # dingbats
    "\U000024C2-\U0001F251"  # broad catch-all starting at enclosed letters
    "\U0001f926-\U0001f937"  # people gestures
    "\U00010000-\U0010ffff"  # every supplementary-plane character
    "\u2640-\u2642"          # gender signs
    "\u2600-\u2B55"          # misc symbols
    "\u200d"                 # zero-width joiner
    "\u23cf"                 # eject symbol
    "\u23e9"                 # fast-forward symbol
    "\u231a"                 # watch
    "\ufe0f"                 # variation selector-16
    "\u3030"                 # wavy dash
    "]+",
    re.UNICODE,
)


def remove_emojis(text):
    """Replace every run of emoji/symbol characters in ``text`` with one space.

    A run of consecutive matching characters collapses into a single space;
    neighbouring spaces already present in ``text`` are kept.
    """
    return _EMOJI_PATTERN.sub(' ', text)
# One or more '#', the word characters that follow, and any trailing whitespace.
_HASHTAG_PATTERN = re.compile(r'#+\w*\s*')


def remove_hastag(text):
    """Remove hashtags (and the whitespace right after them) from ``text``.

    A hashtag is one or more ``#`` followed by zero or more word characters.
    Substituting directly on ``text`` also removes a hashtag that sits at the
    very end of the string, which a search on a padded copy followed by a
    literal replace on the unpadded text would miss.
    """
    return _HASHTAG_PATTERN.sub('', text)
# Stop-word file, resolved relative to the current working directory.
_STOPWORDS_PATH = 'vietnamese-stopwords.txt'
# Filled on first use so the file is read once instead of on every call.
_stopwords_cache = None


def _load_stopwords():
    """Return the stop words as a set, reading the file on the first call only."""
    global _stopwords_cache
    if _stopwords_cache is None:
        # `with` guarantees the handle is closed even if reading fails.
        with open(_STOPWORDS_PATH, encoding='utf8') as handle:
            _stopwords_cache = {line.rstrip('\n') for line in handle}
    return _stopwords_cache


def remove_stopwords(text):
    """Drop whitespace-separated tokens of ``text`` that are stop words.

    Stop words are the lines of ``vietnamese-stopwords.txt``. Tokens are
    compared exactly (case-sensitive), and the surviving tokens are re-joined
    with single spaces.

    Raises:
        OSError: if the stop-word file cannot be opened.
    """
    stopwords = _load_stopwords()
    return ' '.join(token for token in text.split() if token not in stopwords)
# split word with punctuation
def format_punctuation(text):
    """Blank out punctuation characters that are glued to the front of a word.

    The text is scanned for a run of punctuation immediately followed by word
    characters. For each hit, the FIRST character of that punctuation run is
    collected, and every occurrence of that character anywhere in ``text`` is
    then replaced by a space (not only the occurrence in front of the word).
    """
    glued = re.compile(r'(([\!\"\#\$\%\&\,\.\-\_\+\:\;\?\^\•])+)(\w+)')
    leading_marks = {hit.group()[0] for hit in glued.finditer(text + " ")}
    if not leading_marks:
        return text
    # One pass replacing each collected character with a space.
    return text.translate({ord(mark): ' ' for mark in leading_marks})
# remove punctuation
def remove_punctuation(text):
    """Return ``text`` with every character listed in ``punc`` deleted.

    Characters are removed, not replaced by spaces, so words separated only
    by punctuation are fused together. The underscore is in the list too.
    """
    punc = "'!\"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~‘’“”•…‼‼‼⁃₫√≧≦–"
    deletion_table = str.maketrans('', '', punc)
    return text.translate(deletion_table)
# "<digits>k" not followed by a word character, e.g. "50k" or "50 k".
_PRICE_THOUSAND = re.compile(r'([0-9]+)\s*k(?!\w)')
# "<digits>tr" / "<digits>m" followed by a space and optional digits, or
# sitting at the very end of the text, e.g. "2tr ", "1tr 5", "3m".
_PRICE_MILLION = re.compile(r'([0-9]+)\s*(?:tr|m)(?:( )([0-9]*)|\Z)')


def _expand_thousand(match):
    """Rewrite one "<n>k" match as "<n> nghìn_đồng"."""
    return match.group(1) + " nghìn_đồng"


def _expand_million(match):
    """Rewrite one "<n>tr [<d>]" match as a spelled-out million amount."""
    millions, gap, fraction = match.group(1), match.group(2), match.group(3)
    if not fraction:
        # Re-emit the space the pattern consumed so the next word stays apart.
        return millions + " triệu_đồng" + (gap or "")
    # The digits after "tr" are a fraction of a million expressed in
    # thousands: "5" -> 500, "50" -> 500, "05" -> 50, "250" -> 250.
    thousands = int(fraction.ljust(3, '0'))
    return millions + "_triệu " + str(thousands) + " nghìn_đồng"


def format_price(text):
    """Spell out abbreviated prices in ``text``.

    Rewrites, in this order:
      * ``<n>k``            -> ``<n> nghìn_đồng``
      * ``<n>tr`` / ``<n>m`` -> ``<n> triệu_đồng``
      * ``<n>tr <d>``       -> ``<n>_triệu <d padded to thousands> nghìn_đồng``
      * ``nghìn đồng`` / ``triệu đồng`` -> ``nghìn_đồng`` / ``triệu_đồng``

    Substitution is done in place on ``text``, so an abbreviation at the very
    end of the string is converted too, and the unit is always emitted as the
    single token ``triệu_đồng`` / ``nghìn_đồng`` that ``format_price_v2`` and
    the final normalisation step expect.

    NOTE(review): ``<n>m `` is always read as millions, so a length such as
    "5m " is converted as well; the same applies to digits that merely follow
    "tr ". This matches the patterns the function has always used.
    """
    text = _PRICE_THOUSAND.sub(_expand_thousand, text)
    text = _PRICE_MILLION.sub(_expand_million, text)
    text = text.replace("nghìn đồng", "nghìn_đồng")
    text = text.replace("triệu đồng", "triệu_đồng")
    return text
def format_price_v2(text):
    """Glue a number to the currency unit that follows it with an underscore.

    Every ``<digits><whitespace><unit>`` occurrence, where unit is
    ``triệu_đồng``, ``nghìn_đồng`` or ``nghìn``, has its whitespace replaced
    by a single ``_`` (e.g. "50 nghìn_đồng" -> "50_nghìn_đồng"). A number
    already touching its unit is left as is.
    """
    amount = re.compile(r'([0-9]+)(\s*)(triệu_đồng|nghìn_đồng|nghìn)')

    def _join_with_underscore(hit):
        return "_".join(hit.group().split())

    return amount.sub(_join_with_underscore, text)
# Replacement table for clean_text, applied top to bottom on lower-cased text.
# Keys padded with spaces only match when surrounded by literal spaces.
_CLEAN_TEXT_REPLACEMENTS = {
    "cty": "công ty", "\"": "", "'": "", "\n": " ",
    " k ": " không ", " h ": " giờ ", " ko ": " không ",
    " cf ": " cà phê ", " cofe ": " cà phê ", " coffee ": " cà phê ",
    " cofee ": " cà phê ", " cafe ": " cà phê ", " cafee ": " cà phê ",
    " j ": " gì ", ".000": " nghìn", "vnd": " đồng", "vnđ": " đồng",
    " r ": " rồi ", " đc ": " được ", " dc ": " được ",
    " pv ": " phục vụ ", " pvu ": " phục vụ ", " pvụ ": " phục vụ ",
    " nv ": " nhân viên ", " nvien ": " nhân viên ", " nviên ": " nhân viên ",
    " b ": " bạn ", " m ": " mình ", " ng ": " người ", " cx ": " cũng ",
    "oder": "order", "ita": "ít", "vaie": "vải", "chie": "chỉ",
    "cb": "chuẩn bị", "nc": "nước", "khoog": "không", "bânh": "bánh",
    "lug": "lung", "nhiêm": "nhiên", "nguời": "người",
    "ntn": "như thế này", "nuớc": "nước", "lẫu": "lẩu", "dẻ": "rẻ",
    "siu": "siêu", "ni": "này",
}

# Short abbreviations / dialect words that must only be rewritten when they
# stand alone. Replacing them as raw substrings corrupts ordinary words
# ("niềm" -> "nàyềm", "dẻo" -> "rẻo", "modern" -> "mordern").
_WHOLE_WORD_KEYS = (
    "cty", "oder", "ita", "vaie", "chie", "cb", "nc", "lug", "ntn", "dẻ",
    "siu", "ni",
)
_WHOLE_WORD_PATTERNS = {
    key: re.compile(r'(?<!\w)' + re.escape(key) + r'(?!\w)')
    for key in _WHOLE_WORD_KEYS
}


def clean_text(text):
    """Lower-case ``text`` and expand abbreviations, typos and price suffixes.

    The replacement table is applied in order. Space-padded keys and symbol /
    typo keys are replaced wherever they occur; the short keys listed in
    ``_WHOLE_WORD_KEYS`` are replaced only when they are not adjacent to
    another word character. Quotes are removed and newlines become spaces.
    """
    text = text.lower()
    for key, value in _CLEAN_TEXT_REPLACEMENTS.items():
        whole_word = _WHOLE_WORD_PATTERNS.get(key)
        if whole_word is None:
            text = text.replace(key, value)
        else:
            # The replacement values contain no backslashes, so they are safe
            # to use directly as regex replacement templates.
            text = whole_word.sub(value, text)
    # No separate newline-stripping step is needed: the "\n" entry in the
    # table has already turned every newline into a space.
    return text
def normalize_format(text):
    """Return ``text`` normalised to Unicode form NFC using ``unicodedata2``."""
    composed = unicodedata2.normalize('NFC', text)
    return composed
def word_segment(text):
    """Segment ``text`` into words with underthesea's ``word_tokenize``.

    ``word_tokenize`` is called with ``format='text'``, so the result is a
    single string rather than a token list.

    Returns:
        The segmented text, or the sentinel string "Lỗi" (Vietnamese for
        "Error") when tokenisation raises. Callers must check for that
        sentinel; it is indistinguishable from a real one-word result.
    """
    try:
        return word_tokenize(text, format='text')
    except Exception:
        # Catch Exception rather than everything: a bare except would also
        # swallow KeyboardInterrupt / SystemExit and make long batch runs
        # impossible to stop.
        return "Lỗi"