amanmibra committed
Commit 47bf442
1 Parent(s): d6c7ff9

Add transformations to dataset (to spec)

__pycache__/dataset.cpython-39.pyc CHANGED
Binary files a/__pycache__/dataset.cpython-39.pyc and b/__pycache__/dataset.cpython-39.pyc differ
 
dataset.py CHANGED
@@ -1,16 +1,22 @@
 import os
 
+import torch
 from torch.utils.data import Dataset
 import pandas as pd
 import torchaudio
 
 class VoiceDataset(Dataset):
 
-    def __init__(self, data_directory):
+    def __init__(self, data_directory, transformation, target_sample_rate):
+        # file processing
         self._data_path = os.path.join(data_directory)
         self._labels = os.listdir(self._data_path)
 
-        self.audio_files, self.audio_labels = self._join_audio_files()
+        self.audio_files_labels = self._join_audio_files()
+
+        # audio processing
+        self.transformation = transformation
+        self.target_sample_rate = target_sample_rate
 
     def __len__(self):
         total_audio_files = 0
@@ -20,16 +26,39 @@ class VoiceDataset(Dataset):
         return total_audio_files
 
     def __getitem__(self, index):
-        return self.audio_files[index], self.audio_labels[index]
+        file, label = self.audio_files_labels[index]
+        filepath = os.path.join(self._data_path, label, file)
+
+        wav, sr = torchaudio.load(filepath, normalize=True)
+        wav = self._resample(wav, sr)
+        wav = self._mix_down(wav)
+        wav = self.transformation(wav)
+
+        return wav, label
+
 
     def _join_audio_files(self):
-        audio_files = []
-        audio_labels = []
+        """Join all the audio file names and labels into one single-dimensional array"""
+        audio_files_labels = []
 
         for label in self._labels:
             label_path = os.path.join(self._data_path, label)
             for f in os.listdir(label_path):
-                audio_files.append(f)
-                audio_labels.append(label)
+                audio_files_labels.append((f, label))
+
+        return audio_files_labels
+
+    def _resample(self, wav, current_sample_rate):
+        """Resample audio to the target sample rate, if necessary"""
+        if current_sample_rate != self.target_sample_rate:
+            resampler = torchaudio.transforms.Resample(current_sample_rate, self.target_sample_rate)
+            wav = resampler(wav)
+
+        return wav
+
+    def _mix_down(self, wav):
+        """Mix down audio to a single channel, if necessary"""
+        if wav.shape[0] > 1:
+            wav = torch.mean(wav, dim=0, keepdim=True)
 
-        return audio_files, audio_labels
+        return wav
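
For reference, the per-item pipeline added above reduces to three torch/torchaudio calls. Below is a minimal standalone sketch of the same steps on a single file; the path example.wav is a placeholder, and the 16 kHz / 64-mel settings are taken from the notebook cell further down, not from dataset.py itself.

import torch
import torchaudio

target_sample_rate = 16000  # matches the notebook cell below

# the transformation object that gets passed into VoiceDataset
mel_spectrogram = torchaudio.transforms.MelSpectrogram(
    sample_rate=target_sample_rate,
    n_fft=1024,
    hop_length=512,
    n_mels=64,
)

wav, sr = torchaudio.load("example.wav", normalize=True)  # placeholder path; shape (channels, samples)

# _resample: convert to the target sample rate only if needed
if sr != target_sample_rate:
    wav = torchaudio.transforms.Resample(sr, target_sample_rate)(wav)

# _mix_down: average multi-channel audio down to mono
if wav.shape[0] > 1:
    wav = torch.mean(wav, dim=0, keepdim=True)

# transformation: waveform -> mel spectrogram of shape (1, 64, time_frames)
mel = mel_spectrogram(wav)
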
notebooks/playground.ipynb CHANGED
@@ -3,7 +3,7 @@
   {
    "cell_type": "code",
    "execution_count": 8,
-   "id": "8b292047",
+   "id": "17f47516",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -14,7 +14,7 @@
   {
    "cell_type": "code",
    "execution_count": 10,
-   "id": "88db7a26",
+   "id": "3959c95c",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -25,7 +25,7 @@
   {
    "cell_type": "code",
    "execution_count": 18,
-   "id": "d4ac5e60",
+   "id": "53328491",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -35,7 +35,7 @@
   {
    "cell_type": "code",
    "execution_count": 14,
-   "id": "903c1d7d",
+   "id": "24923a03",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -45,7 +45,7 @@
   {
    "cell_type": "code",
    "execution_count": 15,
-   "id": "7dec6dd0",
+   "id": "08f1c4c3",
    "metadata": {},
    "outputs": [
     {
@@ -65,39 +65,67 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 22,
-   "id": "1eea9cf8",
+   "execution_count": 46,
+   "id": "9554ab2c",
    "metadata": {},
    "outputs": [],
    "source": [
-    "dataset = VoiceDataset('../data')"
+    "mel_spectrogram = torchaudio.transforms.MelSpectrogram(\n",
+    "    sample_rate=16000,\n",
+    "    n_fft=1024,\n",
+    "    hop_length=512,\n",
+    "    n_mels=64\n",
+    "  )\n",
+    "dataset = VoiceDataset('../data', mel_spectrogram, 16000)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 20,
-   "id": "cee3b661",
+   "execution_count": 47,
+   "id": "f1413af4",
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "'../data'"
+       "5718"
       ]
      },
-     "execution_count": 20,
+     "execution_count": 47,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
-    "dataset[1]"
+    "len(dataset)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 48,
+   "id": "e81b46ee",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "(tensor([[ 0.0220,  0.0041, -0.0153,  ...,  0.0006, -0.0056, -0.0064]]),\n",
+       " 'aman')"
+      ]
+     },
+     "execution_count": 48,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "dataset[0]"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "d1a4615a",
+   "id": "48574640",
    "metadata": {},
    "outputs": [],
    "source": []