{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "#!pip install bertopic\n",
    "\n",
    "# Script that builds a BERTopic model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/user/miniconda3/envs/ft/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
      " from .autonotebook import tqdm as notebook_tqdm\n"
     ]
    }
   ],
   "source": [
    "from bertopic import BERTopic"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "from datasets import load_dataset\n",
    "\n",
    "# Every corpus is opened with streaming=True; the sampling loop in the next\n",
    "# cell only reads records from the front of each stream.\n",
    "streaming = True\n",
    "dataset_list = [\n",
    "    load_dataset('mc4', 'ja', split='train', streaming=streaming),\n",
    "    load_dataset('oscar', 'unshuffled_deduplicated_ja', split='train', streaming=streaming),\n",
    "    load_dataset('cc100', lang='ja', split='train', streaming=streaming),\n",
    "    load_dataset(\"augmxnt/shisa-pretrain-en-ja-v1\", split=\"train\", streaming=streaming),\n",
    "    load_dataset(\"hpprc/wikipedia-20240101\", split=\"train\", streaming=streaming),\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "10000it [00:20, 482.63it/s]\n",
      "10000it [00:19, 524.44it/s]\n",
      "10000it [00:12, 778.96it/s]\n",
      "10000it [00:25, 386.40it/s]\n",
      "10000it [00:58, 171.79it/s]\n"
     ]
    }
   ],
   "source": [
    "from itertools import islice\n",
    "\n",
    "from tqdm import tqdm\n",
    "\n",
    "# Number of documents taken from the front of each dataset.\n",
    "DOCS_PER_DATASET = 10000\n",
    "\n",
    "# Prepare the training data for the topic model: one flat list of raw texts.\n",
    "docs = []\n",
    "for dataset in dataset_list:\n",
    "    # islice stops after exactly DOCS_PER_DATASET records. The earlier\n",
    "    # `cnt > 10000` check ran after the append, so it let record 10001 through.\n",
    "    for record in tqdm(islice(dataset, DOCS_PER_DATASET), total=DOCS_PER_DATASET):\n",
    "        docs.append(record[\"text\"])\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2024-03-12 08:37:19,823 - BERTopic - Embedding - Transforming documents 
to embeddings.\n", "Batches: 100%|██████████| 1563/1563 [00:50<00:00, 30.79it/s] \n", "2024-03-12 08:38:20,622 - BERTopic - Embedding - Completed ✓\n", "2024-03-12 08:38:20,622 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm\n", "2024-03-12 08:38:59,566 - BERTopic - Dimensionality - Completed ✓\n", "2024-03-12 08:38:59,567 - BERTopic - Cluster - Start clustering the reduced embeddings\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. 
Disabling parallelism to avoid deadlocks...\n",
      "To disable this warning, you can either:\n",
      "\t- Avoid using `tokenizers` before the fork if possible\n",
      "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n",
      "2024-03-12 08:46:25,241 - BERTopic - Cluster - Completed ✓\n",
      "2024-03-12 08:46:25,242 - BERTopic - Representation - Extracting topics from clusters using representation models.\n",
      "2024-03-12 08:47:25,876 - BERTopic - Representation - Completed ✓\n",
      "2024-03-12 08:47:25,952 - BERTopic - Topic reduction - Reducing number of topics\n",
      "2024-03-12 08:48:28,300 - BERTopic - Topic reduction - Reduced number of topics from 435 to 342\n"
     ]
    }
   ],
   "source": [
    "# Where the fitted model is written by the save cell below.\n",
    "model_path = \"data/topic_model.bin\"\n",
    "\n",
    "# nr_topics has to be an int to request a fixed number of topics. The string\n",
    "# \"20\" was not read as a number: the recorded run only reduced 435 topics to 342.\n",
    "topic_model = BERTopic(language=\"japanese\", calculate_probabilities=True, verbose=True, nr_topics=20)\n",
    "\n",
    "# Fit on the sampled documents; returns one topic id per document and the\n",
    "# topic probabilities requested via calculate_probabilities=True.\n",
    "topics, probs = topic_model.fit_transform(docs)\n",
    "\n",
    "# To reuse a previously saved model instead of refitting:\n",
    "#topic_model=BERTopic.load(model_path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2024-03-12 08:48:42,599 - BERTopic - WARNING: When you use `pickle` to save/load a BERTopic model,please make sure that the environments in which you saveand load the model are **exactly** the same. The version of BERTopic,its dependencies, and python need to remain the same.\n",
      "/home/user/miniconda3/envs/ft/lib/python3.11/site-packages/scipy/sparse/_index.py:143: SparseEfficiencyWarning: Changing the sparsity structure of a csr_matrix is expensive. lil_matrix is more efficient.\n",
      " self._set_arrayXarray(i, j, x)\n"
     ]
    }
   ],
   "source": [
    "import os\n",
    "\n",
    "# Create the target directory first so saving also works on a fresh checkout.\n",
    "os.makedirs(os.path.dirname(model_path) or \".\", exist_ok=True)\n",
    "\n",
    "# NOTE: the recorded warning says this save goes through pickle, so the model\n",
    "# must be loaded in the same environment (BERTopic, dependencies, Python).\n",
    "topic_model.save(model_path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "
\n", " | Topic | \n", "Count | \n", "Name | \n", "Representation | \n", "Representative_Docs | \n", "
---|---|---|---|---|---|
0 | \n", "-1 | \n", "22559 | \n", "-1_the_and_to_of | \n", "[the, and, to, of, 送料無料, in, 12, 11, 10, また] | \n", "[Створення сайту - Сторінка 419 - Форум\\nЧетве... | \n", "
1 | \n", "0 | \n", "1585 | \n", "0_送料無料_サマータイヤ_代引不可_中古 | \n", "[送料無料, サマータイヤ, 代引不可, 中古, ブラック, diy, レディース, 工具,... | \n", "[上品なスタイル 【5/1(土)クーポン&ワンダフルデー 4本1台分!!】 215/45R1... | \n", "
2 | \n", "1 | \n", "1209 | \n", "1_としあき_無念_name_投稿日 | \n", "[としあき, 無念, name, 投稿日, id, 16, 名前, 柳宗理, no, 11] | \n", "[ハニーセレクト日曜昼の部テンプレセット髪型全然使ってなかったけど - ふたろぐばこ−二次元... | \n", "
3 | \n", "2 | \n", "801 | \n", "2_ワンピース_5cm_レディース_着丈 | \n", "[ワンピース, 5cm, レディース, 着丈, 肩幅, 素材, 格安通販, シューズ, 袖丈... | \n", "[非売品 入学式 セレモニー 秋冬 秋 他と被らない 冬 小さいサイズ スカート セット 卒... | \n", "
4 | \n", "3 | \n", "799 | \n", "3_ベンジャミン_フランクリン_passion_thee | \n", "[ベンジャミン, フランクリン, passion, thee, nベンジャミン, 全業種, ... | \n", "[it's ok with me 意味\\t9\\n英語で「It's okay.(イッツオーケー... | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
337 | \n", "336 | \n", "11 | \n", "336_abuse_you_counselling_emotional | \n", "[abuse, you, counselling, emotional, addiction... | \n", "[スピリチュアルカウンセリングは、魂の向上を目的とした、至高神からのヒーリングで魂を整えて頂... | \n", "
338 | \n", "337 | \n", "10 | \n", "337_京都の道_snorkeling_その1_中の池 | \n", "[京都の道, snorkeling, その1, 中の池, k7, silfra, 今だけ特別... | \n", "[オアフ島(ホノルル) 福岡発 ◎今だけ無料で海の見える部屋へアップグレード!◎シェラトン・... | \n", "
339 | \n", "338 | \n", "10 | \n", "338_実印_いつ使う_件のレビュー例えば_いつ使うは | \n", "[実印, いつ使う, 件のレビュー例えば, いつ使うは, しっかりした会社, 印鑑, 実印の... | \n", "[冊子の「契約内容のお知らせ」ページをめくると、登録情報の変更シートがあります。\\n, 今回... | \n", "
340 | \n", "339 | \n", "10 | \n", "339_galaxy_s7_samsung_edge | \n", "[galaxy, s7, samsung, edge, i9195i, s8, 3i9200... | \n", "[ S8 PlusとS9 Plus - bajatyoutube.com\\n2019/0... | \n", "
341 | \n", "340 | \n", "10 | \n", "340_゚д゚_対価_労働_産業別組合 | \n", "[゚д゚, 対価, 労働, 産業別組合, 工会, 約款, union, 契約書, 規約, 労... | \n", "[ただし、中小企業の事業主等、労働者以外でも業務の実態や災害の発生状況からみて、労働者に準じ... | \n", "
342 rows × 5 columns
\n", "