Spaces:

strongpear
/

Vietnamese-aspect-detection

Paused

File size: 5,576 Bytes

0beb932

# -*- coding: utf-8 -*-
"""
Created on Fri Jul 28 08:29:31 2023

@author: ASUS
"""
import pandas as pd
import os
import glob
import re

import unicodedata2
from underthesea import word_tokenize

# Directory (relative to the current working directory) that is searched for
# raw CSV files.
path = 'raw_data/'
# Paths of every "*.csv" file directly inside `path`; the glob runs once, when
# this module is imported.
files = glob.glob(os.path.join(path, "*.csv"))

def read_csv_file(file):
    """Load all CSVs listed in the module-level ``files`` into one DataFrame.

    For each CSV, only rows whose ``comments`` value is a string with at
    least 10 space-separated tokens are kept.  The surviving rows of all
    files are concatenated, the ``Unnamed: 0`` column (written by
    ``DataFrame.to_csv`` when the index is saved) is removed if present, and
    exact duplicate rows are dropped.

    Args:
        file: Ignored.  NOTE(review): the original implementation shadowed
            this parameter with its loop variable and always read the
            module-level ``files`` list; that behaviour is kept so existing
            callers see the same data.  Confirm whether the argument was
            meant to select the input.

    Returns:
        pandas.DataFrame: the merged, de-duplicated rows.  An empty
        DataFrame is returned when ``files`` is empty.

    Raises:
        KeyError: if a CSV has no ``comments`` column.
    """
    def _long_enough(comment):
        # Missing comments are read as NaN (a float); treat every non-string
        # value as "too short" instead of crashing on ``.split``.
        return isinstance(comment, str) and len(comment.split(" ")) >= 10

    frames = []
    for csv_path in files:
        df = pd.read_csv(csv_path)
        # ``astype(bool)`` keeps the mask boolean even for a CSV with no rows.
        keep = df['comments'].map(_long_enough).astype(bool)
        frames.append(df[keep])

    if not frames:
        return pd.DataFrame()

    raw_df = pd.concat(frames, ignore_index=True)
    # Not every CSV carries a saved index column, so do not fail without it.
    raw_df = raw_df.drop(columns=['Unnamed: 0'], errors='ignore')
    return raw_df.drop_duplicates()

def remove_xem_them(text):
    """Strip the "Xem thêm" ("see more") marker from *text*.

    Only the two spellings "Xem thêm" and "xem thêm" are removed; any
    surrounding whitespace is left as it is.
    """
    return text.replace("Xem thêm", "").replace("xem thêm", "")

def remove_emojis(text):
    """Replace each run of emoji / pictographic characters with one space.

    The character class below is intentionally broad: besides the emoji
    blocks it also covers every code point from U+24C2 to U+1F251 and all
    supplementary planes, so CJK and other non-Latin scripts in that span
    are removed as well.
    """
    char_ranges = (
        "\U0001F600-\U0001F64F",  # emoticons
        "\U0001F300-\U0001F5FF",  # symbols & pictographs
        "\U0001F680-\U0001F6FF",  # transport & map symbols
        "\U0001F1E0-\U0001F1FF",  # regional indicators (flags)
        "\U00002500-\U00002BEF",  # box drawing .. misc symbols and arrows
        "\U00002702-\U000027B0",  # dingbats
        "\U00002702-\U000027B0",  # (repeated in the original pattern)
        "\U000024C2-\U0001F251",  # wide span, U+24C2 up to U+1F251
        "\U0001f926-\U0001f937",  # gesture emoji
        "\U00010000-\U0010ffff",  # all supplementary planes
        "\u2640-\u2642",          # gender signs
        "\u2600-\u2B55",          # misc symbols up to U+2B55
        "\u200d",                 # zero-width joiner
        "\u23cf",
        "\u23e9",
        "\u231a",
        "\ufe0f",                 # variation selector-16
        "\u3030",
    )
    matcher = re.compile("[" + "".join(char_ranges) + "]+", re.UNICODE)
    return matcher.sub(' ', text)

def remove_hastag(text):
    """Remove hashtags from *text*.

    A hashtag is one or more ``#`` characters, the word characters that
    follow them (possibly none) and any whitespace directly after.

    The previous implementation searched ``text + " "`` but replaced the
    matched strings inside ``text``; a hashtag at the very end of the text
    therefore matched together with the artificial trailing space and was
    never removed.  Substituting in place fixes that.

    Args:
        text: input string.

    Returns:
        str: *text* with every hashtag (and its trailing whitespace) removed.
    """
    return re.sub(r'#+\w*\s*', '', text)

def remove_stopwords(text):
    """Drop every whitespace-separated token of *text* that is a stopword.

    Stopwords are read from ``vietnamese-stopwords.txt`` (one entry per
    line, UTF-8), resolved relative to the current working directory.  The
    file is read on every call.

    Args:
        text: input string.

    Returns:
        str: the remaining tokens joined by single spaces.

    Raises:
        OSError: if the stopword file cannot be opened.
    """
    # ``with`` guarantees the handle is closed; a set gives O(1) membership.
    with open('vietnamese-stopwords.txt', encoding='utf8') as f:
        stopwords = {line.rstrip('\n') for line in f}

    return ' '.join(token for token in text.split() if token not in stopwords)

def format_punctuation(text):
    """Separate words from punctuation glued to their front.

    Wherever a run of the listed punctuation characters is immediately
    followed by a word, the FIRST character of that run is replaced by a
    space -- and the replacement is applied to every occurrence of that
    character in the whole text, not only at the matched position.
    """
    glued = re.compile(r'(([\!\"\#\$\%\&\,\.\-\_\+\:\;\?\^\•])+)(\w+)')
    # Leading character of each punctuation+word match.
    lead_chars = {found.group()[0] for found in glued.finditer(text + " ")}
    # Blank out each of those characters everywhere in the text.
    return text.translate({ord(ch): ' ' for ch in lead_chars})

def remove_punctuation(text):
    """Delete every punctuation character listed in ``punc`` from *text*.

    Characters are removed outright (no space is inserted), so for example
    the underscore in a joined token disappears and the parts are fused.
    """
    punc = "'!\"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~‘’“”•…‼‼‼⁃₫√≧≦–"
    deletion_table = str.maketrans('', '', punc)
    return text.translate(deletion_table)

def _thousands_repl(match):
    """Rewrite one ``<digits>k`` match as ``<digits> nghìn_đồng``."""
    return match.group().replace('k', '') + " nghìn_đồng"


def _millions_repl(match):
    """Rewrite one ``<digits>tr [<digits>]`` / ``<digits>m [<digits>]`` match."""
    head = match.group(1) + match.group(2) + " "
    fraction = match.group(4)
    if not fraction:
        return head + "triệu _đồng "

    # The digits after the unit are the thousands part written in short
    # form: "2tr 5" means 2 million 500 thousand, "2tr 50" likewise.
    value = int(fraction)
    if value < 10:
        value *= 100
    if value < 100:
        value *= 10
    # Values of 100 or more are emitted as written, with the separating
    # space the original implementation kept in front of them.
    shown = str(value) if int(fraction) < 100 else " " + fraction
    return head + "_triệu " + shown + " nghìn_đồng"


def format_price(text):
    """Expand shorthand prices into Vietnamese currency tokens.

    * ``50k``            -> ``50 nghìn_đồng``
    * ``2tr `` / ``2m `` -> ``2 triệu _đồng ``
    * ``2tr 5``          -> ``2 _triệu 500 nghìn_đồng``
    * ``nghìn đồng`` / ``triệu đồng`` -> joined with an underscore.

    Fixes over the previous implementation, which collected the matched
    strings and then ran ``str.replace`` for each of them:

    * a matched string was replaced everywhere in the text, so ``5k`` also
      rewrote the unrelated ``5kg``; replacements now happen only at the
      matched positions;
    * the search ran on ``text + " "`` while the replacement ran on
      ``text``, so ``2tr`` at the very end of the text was never rewritten.

    NOTE(review): the exact spelling of the emitted tokens (including the
    space before ``_đồng`` / ``_triệu``) is kept unchanged because later
    processing may depend on it -- confirm before normalising.

    Args:
        text: input string.

    Returns:
        str: *text* with the price shorthands expanded.
    """
    # A sentinel space lets "k" / "tr" / "m" at the end of the text match.
    padded = text + " "
    padded = re.sub(r'([0-9]+)(\s*)(k)(?=\W)', _thousands_repl, padded)
    padded = re.sub(r'([0-9]+)(\s*)(tr |m )(([0-9]*))', _millions_repl, padded)
    # The last character is always the sentinel, or the trailing space of
    # the replacement that consumed it; drop it again.
    text = padded[:-1]

    text = text.replace("nghìn đồng", "nghìn_đồng")
    text = text.replace("triệu đồng", "triệu_đồng")

    return text

def format_price_v2(text):
    """Glue a number to the currency unit that follows it.

    Every ``<digits> <unit>`` occurrence, where the unit is ``triệu_đồng``,
    ``nghìn_đồng`` or ``nghìn``, has its whitespace replaced by a single
    underscore (``50 nghìn_đồng`` -> ``50_nghìn_đồng``).  Occurrences
    without whitespace are left unchanged.
    """
    amount_re = re.compile(r'([0-9]+)(\s*)(triệu_đồng|nghìn_đồng|nghìn)')
    spotted = [hit.group() for hit in amount_re.finditer(text + " ")]

    for raw in spotted:
        text = text.replace(raw, "_".join(raw.split()))

    return text

def clean_text(text):
    text = text.lower()
    rp_dict = {"cty":"công ty", "\"":"", "'":"", "\n":" ", " k ":" không ", " h ":" giờ ", " ko ":" không ", " cf ":" cà phê ", " cofe ":" cà phê ", " coffee ":" cà phê ", " cofee ":" cà phê ", " cafe ":" cà phê ", " cafee ":" cà phê ",
               " j ":" gì ", ".000":" nghìn", "vnd":" đồng", "vnđ":" đồng", " r ":" rồi ", " đc ":" được ", " dc ":" được ", " pv ":" phục vụ ", " pvu ":" phục vụ ", " pvụ ":" phục vụ ",
               " nv ":" nhân viên ", " nvien ":" nhân viên ", " nviên ": " nhân viên ", " b ":" bạn ", " m ":" mình ", " ng ":" người ", " cx ":" cũng ", "oder":"order", "ita":"ít",
               "vaie":"vải", "chie":"chỉ", "cb":"chuẩn bị", "nc":"nước", "khoog":"không", "bânh":"bánh", "lug":"lung", "nhiêm":"nhiên", "nguời":"người", "ntn":"như thế này", "nuớc":"nước",
               "lẫu":"lẩu", "dẻ":"rẻ", "siu":"siêu", "ni":"này"}

    for key, value in rp_dict.items():
        text = text.replace(key, value)

    text = re.sub('\n', '' , text)

    return text

def normalize_format(text):
    """Return *text* in Unicode NFC form, as produced by ``unicodedata2.normalize``."""
    composed = unicodedata2.normalize('NFC', text)
    return composed

def word_segment(text):
    """Segment *text* with ``word_tokenize(text, format='text')``.

    Args:
        text: input string.

    Returns:
        str: the tokenizer's output, or the sentinel string ``"Lỗi"``
        ("Error") when the tokenizer raises an ordinary exception.
    """
    try:
        return word_tokenize(text, format='text')
    except Exception:
        # Best-effort: keep the sentinel return value callers already get,
        # but no longer swallow KeyboardInterrupt / SystemExit as the bare
        # ``except:`` did.
        return "Lỗi"