{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"%load_ext autoreload\n",
"%autoreload 2\n",
"\n",
"import sys\n",
"sys.path.append(\"..\")\n",
"from src.preprocessing import PreprocessingPipeline\n",
"import pandas as pd\n",
"import vaex"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"----\n",
"### Test vaex"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"df = pd.read_csv(\"../data/test_en.csv\")"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"# | label | text |
\n",
"\n",
"\n",
"0 | 0 | "I think it's time John Rambo move on with his l... |
\n",
"1 | 1 | "I've just watch 2 films of Pang brothers, The E... |
\n",
"2 | 1 | 'Jewel Thief is *THE* crime thriller of Bollywoo... |
\n",
"3 | 0 | 'This so called remake is terrible. I went to se... |
\n",
"4 | 1 | 'When Northfork debuted at the Cannes Film Festi... |
\n",
"... | ... | ... |
\n",
"4,995 | 0 | 'The title tells it all -- Ed Gein, the butcher ... |
\n",
"4,996 | 0 | "This film makes about as much sense as an 'Ozzi... |
\n",
"4,997 | 0 | '"Sex and the City" has some great things going ... |
\n",
"4,998 | 0 | 'Please...if anybody gets the chance to read thi... |
\n",
"4,999 | 0 | '...a film comes along that manages to be absolu... |
\n",
"\n",
"
"
],
"text/plain": [
"# label text\n",
"0 0 \"I think it's time John Rambo move on with his l...\n",
"1 1 \"I've just watch 2 films of Pang brothers, The E...\n",
"2 1 'Jewel Thief is *THE* crime thriller of Bollywoo...\n",
"3 0 'This so called remake is terrible. I went to se...\n",
"4 1 'When Northfork debuted at the Cannes Film Festi...\n",
"... ... ...\n",
"4,995 0 'The title tells it all -- Ed Gein, the butcher ...\n",
"4,996 0 \"This film makes about as much sense as an 'Ozzi...\n",
"4,997 0 '\"Sex and the City\" has some great things going ...\n",
"4,998 0 'Please...if anybody gets the chance to read thi...\n",
"4,999 0 '...a film comes along that manages to be absolu..."
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"vaex.from_pandas(df)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"df_small = df.iloc[:1000]"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"# | label | text |
\n",
"\n",
"\n",
"0 | 0 | "I think it's time John Rambo move on with his l... |
\n",
"1 | 1 | "I've just watch 2 films of Pang brothers, The E... |
\n",
"2 | 1 | 'Jewel Thief is *THE* crime thriller of Bollywoo... |
\n",
"3 | 0 | 'This so called remake is terrible. I went to se... |
\n",
"4 | 1 | 'When Northfork debuted at the Cannes Film Festi... |
\n",
"... | ... | ... |
\n",
"995 | 1 | "It's a funny business, reviewing movies. These ... |
\n",
"996 | 1 | 'Right from the start you see that "Anchors Awei... |
\n",
"997 | 0 | 'I saw this movie in NEW York city. I was waitin... |
\n",
"998 | 0 | 'Firstly, this is NOT an adaptation of a Stephen... |
\n",
"999 | 1 | "Barbra Streisand's debut television special is ... |
\n",
"\n",
"
"
],
"text/plain": [
"# label text\n",
"0 0 \"I think it's time John Rambo move on with his l...\n",
"1 1 \"I've just watch 2 films of Pang brothers, The E...\n",
"2 1 'Jewel Thief is *THE* crime thriller of Bollywoo...\n",
"3 0 'This so called remake is terrible. I went to se...\n",
"4 1 'When Northfork debuted at the Cannes Film Festi...\n",
"... ... ...\n",
"995 1 \"It's a funny business, reviewing movies. These ...\n",
"996 1 'Right from the start you see that \"Anchors Awei...\n",
"997 0 'I saw this movie in NEW York city. I was waitin...\n",
"998 0 'Firstly, this is NOT an adaptation of a Stephen...\n",
"999 1 \"Barbra Streisand's debut television special is ..."
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"vaex.from_pandas(df_small)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"----"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# df = pd.read_csv(\"../data/test_en.csv\")\n",
"df = pd.read_excel(\"../data/test_chinese.xlsx\")"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"pre_steps = [\n",
" \"normalize_unicode\",\n",
" \"normalize_acronyms\",\n",
" \"normalize_bullet_points\",\n",
" \"normalize_hyphenated_words\",\n",
" \"normalize_quotation_marks\",\n",
" \"normalize_whitespaces\",\n",
" \"normalize_repeating_words\",\n",
" \"normalize_repeating_chars\",\n",
" \"normalize_useless_spaces\",\n",
" # \"replace_currency_symbols\",\n",
" # \"replace_emails\",\n",
" # \"replace_emojis\",\n",
" # \"replace_hashtags\",\n",
" # \"replace_numbers\",\n",
" # \"replace_phone_numbers\",\n",
" # \"replace_urls\",\n",
" # \"replace_user_handles\",\n",
" # \"remove_accents\",\n",
" # \"remove_brackets\",\n",
" # \"remove_html_tags\",\n",
" # \"remove_non_words\",\n",
" # \"remove_punctuation\",\n",
" # \"lowercase\",\n",
" \"strip\",\n",
"]\n"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"post_steps = [\n",
" \"lowercase\",\n",
" # \"replace_currency_symbols\",\n",
" # \"replace_urls\",\n",
" # \"replace_emails\",\n",
" # \"replace_user_handles\",\n",
" # \"replace_hashtags\",\n",
" # \"replace_emojis\",\n",
" # \"replace_phone_numbers\",\n",
" # \"replace_numbers\",\n",
" # \"remove_html_tags\",\n",
" # \"remove_accents\",\n",
" # \"remove_brackets\",\n",
" \"remove_non_words\",\n",
" # \"remove_numbers\",\n",
" # \"remove_punctuation\",\n",
" \"normalize_repeating_words\",\n",
" \"normalize_repeating_chars\",\n",
" \"normalize_useless_spaces\",\n",
" \"strip\",\n",
"]"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"pipe = PreprocessingPipeline(\n",
" language=\"Chinese\",\n",
" lemmatization_step=\"Spacy lemmatizer (keep stopwords)\", # \"Disable lemmatizer\",\n",
" pre_steps=pre_steps,\n",
" post_steps=post_steps,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'全金属 指纹识别 垃圾 买手机 不行 指纹识别 不好 太慢 好多 失败 电池 哥哥 一部 华为 mate7 手机 旅游 丢掉 我哥 算是 二手 二手手机 用个 两天 毛 手机 只能 大半天 玩 手机游戏 最多 看个 新闻 微信 不行 急 手机 买手机 谈谈 通话 想 问 一句 手机 通话 保证 畅通 手机 意义 一部 MP4 区别 第一次 通话 五分钟 声音 说 女朋友 手机 朋友 父母 打电话 情况 毛呢 所有人 手机 利用 全金属 吸引 眼球 做工 体验 不好 电池 耐用 通话 易 无声 加油 拿出 诚意'"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.text[0]"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'全金属 指纹识别 垃圾 买手机 不行 指纹识别 不好 太慢 好多 失败 电池 哥哥 一部 华为 mate7 手机 旅游 丢掉 我哥 算是 二手 二手手机 用个 两天 毛 手机 只能 大半天 玩 手机游戏 最多 看个 新闻 微信 不行 急 手机 买手机 谈谈 通话 想 问 一句 手机 通话 保证 畅通 手机 意义 一部 MP4 区别 第一次 通话 五分钟 声音 说 女朋友 手机 朋友 父母 打电话 情况 毛呢 所有人 手机 利用 全金属 吸引 眼球 做工 体验 不好 电池 耐用 通话 易 无声 加油 拿出 诚意'"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pipe.pre(df.text[0])"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'全金属 指纹识别 垃圾 买手机 不行 指纹识别 不好 太慢 好多 失败 电池 哥哥 一部 华为 mate7 手机 旅游 丢掉 我哥 算是 二手 二手手机 用个 两天 毛 手机 只能 大半天 玩 手机游戏 最多 看个 新闻 微信 不行 急 手机 买手机 谈谈 通话 想 问 一句 手机 通话 保证 畅通 手机 意义 一部 MP4 区别 第一次 通话 五分钟 声音 说 女朋友 手机 朋友 父母 打电话 情况 毛呢 所有人 手机 利用 全金属 吸引 眼球 做工 体验 不好 电池 耐用 通话 易 无声 加油 拿出 诚意'"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pipe.lemma(pipe.nlp(pipe.pre(df.text[0])))"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'全金属 指纹识别 垃圾 买手机 不行 指纹识别 不好 太慢 好多 失败 电池 哥哥 一部 华为 mate 手机 旅游 丢掉 我哥 算是 二手 二手手机 用个 两天 毛 手机 只能 大半天 玩 手机游戏 最多 看个 新闻 微信 不行 急 手机 买手机 谈谈 通话 想 问 一句 手机 通话 保证 畅通 手机 意义 一部 mp 区别 第一次 通话 五分钟 声音 说 女朋友 手机 朋友 父母 打电话 情况 毛呢 所有人 手机 利用 全金属 吸引 眼球 做工 体验 不好 电池 耐用 通话 易 无声 加油 拿出 诚意'"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pipe.post(pipe.lemma(pipe.nlp(pipe.pre(df.text[0]))))"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Compose(, , , , , , )"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pipe.post"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"odf = pipe.vaex_process(df, \"text\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"odf"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"steps = list(PreprocessingPipeline.pipeline_components().keys())\n",
"default_pre_steps_idx = [steps.index(i) for i in pre_steps]\n",
"default_post_steps_idx = [steps.index(i) for i in post_steps]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"default_pre_steps_idx"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"default_post_steps_idx"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"sorted(list(PreprocessingPipeline.pipeline_components().keys()))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"list(PreprocessingPipeline.lemmatization_component().keys())"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import re"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"_re_non_words = re.compile(\"[^A-Za-z]+\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"_re_non_words.sub(\" \", \"Mimmo23\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"interpreter": {
"hash": "aa7efd0b3ada76bb0689aa8ed0b61d7de788847e3d11d2d142fc5800c765982f"
},
"kernelspec": {
"display_name": "Python 3.7.11 64-bit ('wordify': conda)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.11"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}