{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Choose a speaker from the Belarusian Mozilla Voice corpus\n",
    "\n",
    "Pipeline: load `validated.tsv`, keep clips with up-votes and no down-votes, rank speakers by clip count, listen to random clips of a candidate, estimate speech rate and total duration, then export the last `limit_hours` of the chosen speaker as wav files plus a `|`-separated transcript file."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# This file was created by jhlfrfufyfn to choose a speaker from the Belarusian Mozilla Voice corpus\n",
    "#\n",
    "#\n",
    "import multiprocessing\n",
    "import os\n",
    "from multiprocessing import Pool\n",
    "\n",
    "import IPython.display as ipd\n",
    "import librosa\n",
    "import matplotlib.pyplot as plt\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import seaborn as sns\n",
    "from pydub import AudioSegment\n",
    "from tqdm import tqdm"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# unpack tar gz file cv-corpus-12.0-2022-12-07-be.tar.gz\n",
    "# import tarfile\n",
    "# tar = tarfile.open(\"cv-corpus-12.0-2022-12-07-be.tar.gz\", \"r:gz\")\n",
    "# tar.extractall()\n",
    "# tar.close()\n",
    "\n",
    "corpuspath = '/a/cv-corpus'\n",
    "outputpath = '/storage/filtered_dataset'\n",
    "\n",
    "# derived location and tunables used by the cells below\n",
    "clipspath = corpuspath + '/be/clips/'\n",
    "SAMPLE_SIZE = 1000   # clips sampled per speaker when estimating statistics\n",
    "SEED = 42            # fixed seed so the sampled statistics are reproducible\n",
    "TARGET_RATE = 22050  # frame rate (Hz) of the exported wav files"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Load and filter the validated clips"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# open validated.tsv\n",
    "df_raw = pd.read_csv(corpuspath+'/be/validated.tsv', sep='\\t', low_memory=False)\n",
    "df_raw"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# drop the columns that are not needed for choosing a speaker\n",
    "# (a new name is used so that re-running this cell does not fail)\n",
    "df_votes = df_raw.drop(['age', 'accents', 'gender', 'variant', 'locale', 'segment'], axis=1)\n",
    "df_votes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# count number of records with down_votes > 0\n",
    "df_votes[df_votes['down_votes'] > 0].count()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# count number of records with up_votes == 0\n",
    "df_votes[df_votes['up_votes'] == 0].count()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# drop all rows with down_votes > 0 or up_votes == 0\n",
    "df_clean = df_votes[(df_votes['down_votes'] == 0) & (df_votes['up_votes'] > 0)]\n",
    "df_clean"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# drop columns down_votes and up_votes; `df` is the frame used by all helpers below\n",
    "df = df_clean.drop(['down_votes', 'up_votes'], axis=1)\n",
    "df"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Rank speakers by number of clips"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# number of clips per speaker, sorted by count (largest first)\n",
    "df_sorted = df.groupby('client_id').count().sort_values(by='path', ascending=False)\n",
    "df_sorted"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# get top 10 speakers\n",
    "top_10_speakers = df_sorted.head(10)\n",
    "top_10_speakers"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Listen to random clips of a candidate speaker"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_speaker_audio_list(speaker_id, n=10, random_state=None):\n",
    "    \"\"\"Return up to `n` randomly sampled clip file names of one speaker.\n",
    "\n",
    "    speaker_id   -- `client_id` value used to select rows of the global `df`\n",
    "    n            -- number of clips wanted; clamped to the number available\n",
    "    random_state -- optional seed forwarded to `Series.sample`\n",
    "    \"\"\"\n",
    "    paths = df.loc[df['client_id'] == speaker_id, 'path']\n",
    "    # sample() raises ValueError when asked for more rows than exist\n",
    "    return paths.sample(min(n, len(paths)), random_state=random_state).tolist()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# CHOOSE : which speaker will we use\n",
    "speaker_index = 0\n",
    "speaker_audio_list = get_speaker_audio_list(top_10_speakers.index[speaker_index])\n",
    "print(speaker_audio_list)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# open audio files from speaker_audio_list and play them\n",
    "# audio files lie in cv-corpus-12.0-2022-12-07/be/clips\n",
    "for audio in speaker_audio_list:\n",
    "    ipd.display(ipd.Audio(clipspath + audio))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Listening notes, one line per `speaker_index`:\n",
    "\n",
    "- 0 is pretty good\n",
    "- 1 is bad\n",
    "- 2 is partly 0, other are different\n",
    "- 3 is bad\n",
    "- 4 is pretty fast and clear, but not good\n",
    "- 5 is echoing, sometimes mic cracks\n",
    "- 6 is really slow and clear, but accent?\n",
    "- 7 has a lot of intonation, but is pretty clear\n",
    "- 8 is clear and slow, sometimes little mic crack\n",
    "- 9 has background noise, whispering\n",
    "\n",
    "Options: 0, 6, 8"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Speech rate and duration of the chosen speaker"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_clip_duration(path):\n",
    "    \"\"\"Return the duration reported by librosa for clip `path` inside the clips folder.\"\"\"\n",
    "    return librosa.get_duration(path=clipspath + path)\n",
    "\n",
    "\n",
    "def get_speech_rate(speaker_id, n_samples=SAMPLE_SIZE, random_state=SEED):\n",
    "    \"\"\"Estimate the mean speech rate of one speaker in words per minute.\n",
    "\n",
    "    speaker_id   -- `client_id` value used to select rows of the global `df`\n",
    "    n_samples    -- number of clips to measure; clamped to the number available\n",
    "    random_state -- seed forwarded to `DataFrame.sample`\n",
    "\n",
    "    Words are counted by splitting the sentence on whitespace.\n",
    "    \"\"\"\n",
    "    df_speaker = df[df['client_id'] == speaker_id]\n",
    "    # sample() raises ValueError when asked for more rows than exist\n",
    "    df_sample = df_speaker.sample(min(n_samples, len(df_speaker)), random_state=random_state)\n",
    "    # get duration of each audio file\n",
    "    duration = df_sample['path'].apply(get_clip_duration)\n",
    "    # get number of words in each sentence\n",
    "    words = df_sample['sentence'].str.split().str.len()\n",
    "    # calculate speech rate per clip and return the mean\n",
    "    speech_rate = words / duration * 60\n",
    "    return speech_rate.mean()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# calculate speech rate for the chosen speaker\n",
    "print(f'Speech rate for speaker {speaker_index}: ', get_speech_rate(top_10_speakers.index[speaker_index]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_average_duration(df_speaker, n_samples=SAMPLE_SIZE, random_state=SEED):\n",
    "    \"\"\"Estimate the mean clip duration of the clips listed in `df_speaker`.\n",
    "\n",
    "    df_speaker   -- frame with a `path` column of clip file names\n",
    "    n_samples    -- number of clips to measure; clamped to the number available\n",
    "    random_state -- seed forwarded to `DataFrame.sample`\n",
    "    \"\"\"\n",
    "    # sample() raises ValueError when asked for more rows than exist\n",
    "    df_sample = df_speaker.sample(min(n_samples, len(df_speaker)), random_state=random_state)\n",
    "    # get duration of each audio file\n",
    "    return df_sample['path'].apply(get_clip_duration).mean()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# all clips of the chosen speaker\n",
    "df_speaker_all = df[df['client_id'] == top_10_speakers.index[speaker_index]]\n",
    "\n",
    "avg_duration = get_average_duration(df_speaker_all)\n",
    "avg_total_duration = avg_duration * len(df_speaker_all.index)\n",
    "print(f'Average duration for speaker {speaker_index}: ', avg_duration, \", average total duration(hours): \",(avg_total_duration/60.0/60.0))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Export the chosen speaker"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# get only x latest hours\n",
    "limit_hours = 30\n",
    "# number of clips that fit into limit_hours, based on the estimated mean duration\n",
    "limit_files = round(limit_hours*60*60 / avg_duration)\n",
    "\n",
    "# tail() keeps the last rows in file order\n",
    "# NOTE(review): presumably validated.tsv is ordered oldest to newest - confirm\n",
    "df_speaker = df_speaker_all.drop(['client_id'], axis=1).tail(limit_files)\n",
    "\n",
    "df_speaker"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# # move all files of that speaker to another folder\n",
    "# # use multiprocessing to speed up\n",
    "# # add progress bar\n",
    "# from tqdm import tqdm\n",
    "# import multiprocessing\n",
    "# from multiprocessing import Pool\n",
    "# import shutil\n",
    "\n",
    "# def move_file(file):\n",
    "#     shutil.move(corpuspath+'/be/clips/' + file, corpuspath+'/be/speaker_0/' + file)\n",
    "\n",
    "# # get list of files to move\n",
    "# files = df_speaker['path'].values.tolist()\n",
    "\n",
    "# # move files\n",
    "# with Pool(multiprocessing.cpu_count()) as p:\n",
    "#     r = list(tqdm(p.imap(move_file, files), total=len(files)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def to_wav_name(path):\n",
    "    \"\"\"Return `path` with its file extension replaced by `.wav`.\"\"\"\n",
    "    return os.path.splitext(path)[0] + '.wav'\n",
    "\n",
    "\n",
    "def convert_mp3_to_wav(file):\n",
    "    \"\"\"Convert one mp3 clip from the clips folder to a wav file in `outputpath`.\n",
    "\n",
    "    file -- clip file name relative to the clips folder\n",
    "\n",
    "    The clip is resampled to TARGET_RATE and written under the name produced\n",
    "    by `to_wav_name`, which is also the name stored in df_speaker.csv.\n",
    "    \"\"\"\n",
    "    sound = AudioSegment.from_mp3(clipspath + file)\n",
    "    sound = sound.set_frame_rate(TARGET_RATE)\n",
    "    # export() returns the output file handle; close it explicitly\n",
    "    sound.export(outputpath + '/' + to_wav_name(file), format='wav').close()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# cleanup output and save text lines to csv\n",
    "os.makedirs(outputpath, exist_ok=True)\n",
    "with os.scandir(outputpath) as entries:\n",
    "    # only plain files are removed; os.remove() cannot delete sub-directories\n",
    "    stale_files = [entry.path for entry in entries if entry.is_file()]\n",
    "for stale_file in stale_files:\n",
    "    os.remove(stale_file)\n",
    "\n",
    "# assign() builds a new frame, so df_speaker is not modified and the cell can be re-run\n",
    "df_export = df_speaker.assign(path2=df_speaker['path'].map(to_wav_name))\n",
    "df_export[['path2','sentence']].to_csv(outputpath+'/df_speaker.csv', sep='|', header=False, index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# convert all mp3 clips of the chosen speaker to wav with multiprocessing and tqdm\n",
    "# get list of files to convert\n",
    "files = df_speaker['path'].values.tolist()\n",
    "\n",
    "# convert files\n",
    "with Pool(multiprocessing.cpu_count()) as p:\n",
    "    r = list(tqdm(p.imap(convert_mp3_to_wav, files), total=len(files)))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}