Add cut/pad files
- __pycache__/dataset.cpython-39.pyc +0 -0
- dataset.py +27 -3
- notebooks/playground.ipynb +43 -16
__pycache__/dataset.cpython-39.pyc CHANGED
Binary files a/__pycache__/dataset.cpython-39.pyc and b/__pycache__/dataset.cpython-39.pyc differ
dataset.py CHANGED
@@ -7,7 +7,7 @@ import torchaudio
 
 class VoiceDataset(Dataset):
 
-    def __init__(self, data_directory, transformation, target_sample_rate):
+    def __init__(self, data_directory, transformation, target_sample_rate, time_limit_in_secs=5):
         # file processing
         self._data_path = os.path.join(data_directory)
         self._labels = os.listdir(self._data_path)
@@ -17,6 +17,7 @@ class VoiceDataset(Dataset):
         # audio processing
         self.transformation = transformation
         self.target_sample_rate = target_sample_rate
+        self.num_samples = time_limit_in_secs * self.target_sample_rate
 
     def __len__(self):
         total_audio_files = 0
@@ -26,14 +27,20 @@ class VoiceDataset(Dataset):
         return total_audio_files
 
     def __getitem__(self, index):
+        # get file
         file, label = self.audio_files_labels[index]
         filepath = os.path.join(self._data_path, label, file)
 
+        # load wav
         wav, sr = torchaudio.load(filepath, normalize=True)
+
+        # modify wav file, if necessary
         wav = self._resample(wav, sr)
         wav = self._mix_down(wav)
+        wav = self._cut_or_pad(wav)
+
+        # apply transformation
         wav = self.transformation(wav)
-
         return wav, label
 
 
@@ -61,4 +68,21 @@ class VoiceDataset(Dataset):
         if wav.shape[0] > 1:
             wav = torch.mean(wav, dim=0, keepdim=True)
 
-        return wav
+        return wav
+
+    def _cut_or_pad(self, wav):
+        """Modify audio if number of samples != target number of samples of the dataset.
+
+        If there are too many samples, cut the audio.
+        If there are not enough samples, pad the audio with zeros.
+        """
+
+        length_signal = wav.shape[1]
+        if length_signal > self.num_samples:
+            wav = wav[:, :self.num_samples]
+        elif length_signal < self.num_samples:
+            num_of_missing_samples = self.num_samples - length_signal
+            pad = (0, num_of_missing_samples)
+            wav = torch.nn.functional.pad(wav, pad)
+
+        return wav
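The heart of the change is _cut_or_pad: every clip is forced to exactly time_limit_in_secs * target_sample_rate samples, so each item yields a fixed-size tensor regardless of the original recording length. A minimal standalone sketch of the same logic (not part of the commit; the 5 s / 16000 Hz figures mirror the defaults above):

import torch
import torch.nn.functional as F

NUM_SAMPLES = 5 * 16000  # time_limit_in_secs * target_sample_rate = 80000

def cut_or_pad(wav, num_samples=NUM_SAMPLES):
    # wav has shape (channels, samples), as produced by torchaudio.load
    length_signal = wav.shape[1]
    if length_signal > num_samples:
        wav = wav[:, :num_samples]        # cut: keep only the first num_samples
    elif length_signal < num_samples:
        missing = num_samples - length_signal
        wav = F.pad(wav, (0, missing))    # zero-pad on the right only
    return wav

assert cut_or_pad(torch.rand(1, 12_000)).shape == (1, NUM_SAMPLES)   # short clip padded
assert cut_or_pad(torch.rand(1, 100_000)).shape == (1, NUM_SAMPLES)  # long clip cut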
notebooks/playground.ipynb CHANGED
@@ -3,7 +3,7 @@
   {
    "cell_type": "code",
    "execution_count": 8,
-   "id": "…",
+   "id": "26db4cdb",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -14,7 +14,7 @@
   {
    "cell_type": "code",
    "execution_count": 10,
-   "id": "…",
+   "id": "c8244b70",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -25,7 +25,7 @@
   {
    "cell_type": "code",
    "execution_count": 18,
-   "id": "…",
+   "id": "f3fd2d28",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -35,7 +35,7 @@
   {
    "cell_type": "code",
    "execution_count": 14,
-   "id": "…",
+   "id": "da9fe647",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -45,7 +45,7 @@
   {
    "cell_type": "code",
    "execution_count": 15,
-   "id": "…",
+   "id": "70905d2d",
    "metadata": {},
    "outputs": [
     {
@@ -65,8 +65,8 @@
   },
   {
    "cell_type": "code",
-   "execution_count": …,
-   "id": "…",
+   "execution_count": 64,
+   "id": "523d28f9",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -76,13 +76,13 @@
    "    hop_length=512,\n",
    "    n_mels=64\n",
    "    )\n",
-   "dataset = VoiceDataset('../data', mel_spectrogram, 16000)"
+   "dataset = VoiceDataset('../data', mel_spectrogram, 16000,)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": …,
-   "id": "…",
+   "execution_count": 65,
+   "id": "0044724d",
    "metadata": {},
    "outputs": [
     {
@@ -91,7 +91,7 @@
       "5718"
      ]
     },
-    "execution_count": …,
+    "execution_count": 65,
     "metadata": {},
     "output_type": "execute_result"
    }
@@ -102,18 +102,24 @@
   },
   {
    "cell_type": "code",
-   "execution_count": …,
-   "id": "…",
+   "execution_count": 66,
+   "id": "df7a9e58",
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "(tensor([[…",
+       "(tensor([[[0.2647, 0.0247, 0.0324,  ..., 0.0000, 0.0000, 0.0000],\n",
+       "          [0.0812, 0.0178, 0.0890,  ..., 0.0000, 0.0000, 0.0000],\n",
+       "          [0.0052, 0.0212, 0.1341,  ..., 0.0000, 0.0000, 0.0000],\n",
+       "          ...,\n",
+       "          [0.5154, 0.3950, 0.4497,  ..., 0.0000, 0.0000, 0.0000],\n",
+       "          [0.1919, 0.4804, 0.5144,  ..., 0.0000, 0.0000, 0.0000],\n",
+       "          [0.1208, 0.4357, 0.4016,  ..., 0.0000, 0.0000, 0.0000]]]),\n",
        " 'aman')"
       ]
     },
-    "execution_count": …,
+    "execution_count": 66,
     "metadata": {},
     "output_type": "execute_result"
    }
@@ -122,10 +128,31 @@
    "dataset[0]"
   ]
  },
+ {
+  "cell_type": "code",
+  "execution_count": 67,
+  "id": "df064dbc",
+  "metadata": {},
+  "outputs": [
+   {
+    "data": {
+     "text/plain": [
+      "torch.Size([1, 64, 313])"
+     ]
+    },
+    "execution_count": 67,
+    "metadata": {},
+    "output_type": "execute_result"
+   }
+  ],
+  "source": [
+   "dataset[0][0].shape"
+  ]
+ },
  {
   "cell_type": "code",
   "execution_count": null,
-  "id": "…",
+  "id": "ed4899bf",
   "metadata": {},
   "outputs": [],
  "source": []
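Taken together, the notebook cells amount to this usage pattern (a sketch, not the notebook itself; n_fft=1024 is an assumption, since that argument is cut off in the diff, and only hop_length, n_mels, and the 16000 Hz rate are shown):

import torchaudio
from dataset import VoiceDataset

mel_spectrogram = torchaudio.transforms.MelSpectrogram(
    sample_rate=16000,
    n_fft=1024,       # assumption: this argument is elided in the diff
    hop_length=512,
    n_mels=64,
)

dataset = VoiceDataset('../data', mel_spectrogram, 16000)

print(len(dataset))      # 5718 in the notebook run
wav, label = dataset[0]  # mel spectrogram tensor plus the folder name as label
print(wav.shape, label)  # e.g. torch.Size([1, 64, 313]) 'aman' in the notebook run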