## 1. Setup & Installation

In [1]:
%%writefile requirements.txt
torchaudio
pyannote.audio

Overwriting requirements.txt


In [2]:
!pip install -r requirements.txt --upgrade

Collecting torchaudio
  Downloading torchaudio-0.12.1-cp39-cp39-manylinux1_x86_64.whl (3.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.7/3.7 MB[0m [31m95.6 MB/s[0m eta [36m0:00:00[0m:00:01[0m
[?25hCollecting pyannote.audio
  Downloading pyannote.audio-2.0.1-py2.py3-none-any.whl (385 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m385.9/385.9 kB[0m [31m47.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting torch==1.12.1
  Using cached torch-1.12.1-cp39-cp39-manylinux1_x86_64.whl (776.4 MB)
Collecting pytorch-lightning<1.7,>=1.5.4
  Downloading pytorch_lightning-1.6.5-py3-none-any.whl (585 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m585.9/585.9 kB[0m [31m56.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting hmmlearn<0.3,>=0.2.7
  Downloading hmmlearn-0.2.8-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.whl (217 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m217.2/217.2 kB[0m [31m22.4 MB/s[0m e

In [3]:
# Load the pretrained speaker-diarization pipeline by its model name.
# The "Downloading" progress bars in this cell's output appear when this call runs.
from pyannote.audio import Pipeline
pipeline = Pipeline.from_pretrained("pyannote/speaker-diarization")



Downloading:   0%|          | 0.00/598 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/17.7M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/318 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.92k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/83.3M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.92k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/5.53M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/129k [00:00<?, ?B/s]

start=0.5s stop=1.4s speaker_SPEAKER_01
start=1.9s stop=2.8s speaker_SPEAKER_01
start=3.0s stop=3.5s speaker_SPEAKER_02
start=3.6s stop=4.3s speaker_SPEAKER_01
start=4.6s stop=6.8s speaker_SPEAKER_02
start=7.1s stop=7.6s speaker_SPEAKER_00
start=7.6s stop=9.5s speaker_SPEAKER_02
start=9.8s stop=10.6s speaker_SPEAKER_02
start=9.9s stop=10.4s speaker_SPEAKER_00
start=12.4s stop=15.6s speaker_SPEAKER_03
start=15.8s stop=16.1s speaker_SPEAKER_00
start=16.1s stop=16.2s speaker_SPEAKER_01
start=17.2s stop=17.4s speaker_SPEAKER_00
start=17.7s stop=20.4s speaker_SPEAKER_01
start=20.6s stop=20.7s speaker_SPEAKER_01
start=20.7s stop=20.8s speaker_SPEAKER_00
start=20.8s stop=20.9s speaker_SPEAKER_01
start=21.1s stop=22.1s speaker_SPEAKER_01
start=22.5s stop=22.7s speaker_SPEAKER_02
start=23.2s stop=23.5s speaker_SPEAKER_02
start=23.5s stop=24.0s speaker_SPEAKER_01
start=24.3s stop=25.5s speaker_SPEAKER_02
start=25.8s stop=27.3s speaker_SPEAKER_01
start=27.3s stop=27.5s speaker_SPEAKER_02
start=29

In [7]:
from transformers.pipelines.audio_utils import ffmpeg_read
import torch




# NOTE(review): `request` is not defined in any cell visible before this one —
# presumably it comes from an earlier cell; confirm this runs on a fresh kernel.
# Read the request payload through ffmpeg_read, passing 16000 as the sampling rate.
audio_nparray = ffmpeg_read(request["inputs"], 16000)
# Wrap the array in a tensor and add a leading dimension
# (presumably the channel axis expected by the pipeline — TODO confirm).
audio_tensor= torch.from_numpy(audio_nparray).unsqueeze(0)
# In-memory audio description: the waveform tensor together with its sample rate.
f = {"waveform": audio_tensor, "sample_rate": 16000}

## 2. Create Custom Handler for Inference Endpoints


In [8]:
%%writefile handler.py
from typing import  Dict
from pyannote.audio import Pipeline
from transformers.pipelines.audio_utils import ffmpeg_read
import torch 

SAMPLE_RATE = 16000



class EndpointHandler():
    """Custom endpoint handler that runs speaker diarization on an audio payload."""

    def __init__(self, path=""):
        """
        Load the pretrained diarization pipeline.

        Args:
            path (:obj:`str`):
                accepted as part of the handler signature but not used here;
                the pipeline is always loaded by its model name.
        """
        # load the model
        self.pipeline = Pipeline.from_pretrained("pyannote/speaker-diarization")

    def __call__(self, data: Dict[str, object]) -> Dict[str, list]:
        """
        Run speaker diarization on one request.

        Args:
            data (:obj:`dict`):
                request payload. ``data["inputs"]`` holds the audio file as raw
                bytes; the optional ``data["parameters"]`` is a dict of keyword
                arguments forwarded to the pipeline
                (e.g. ``min_speakers=2, max_speakers=5``).
                The dict is only read, never modified.
        Return:
            A :obj:`dict` with a single key ``"diarization"`` mapping to a list
            of ``{"label", "start", "stop"}`` dicts, one per speaker turn.
            All three values are strings; ``start``/``stop`` are the segment
            boundaries as reported by the pipeline.
        """
        # process input
        # Read with .get() instead of .pop() so the caller's request dict is
        # left intact and can be reused after the call.
        inputs = data.get("inputs", data)
        parameters = data.get("parameters")  # min_speakers=2, max_speakers=5

        # prepare pyannote input: decode the bytes at SAMPLE_RATE and add a
        # leading dimension (presumably the channel axis — TODO confirm)
        audio_nparray = ffmpeg_read(inputs, SAMPLE_RATE)
        audio_tensor = torch.from_numpy(audio_nparray).unsqueeze(0)
        pyannote_input = {"waveform": audio_tensor, "sample_rate": SAMPLE_RATE}

        # apply pretrained pipeline, forwarding any request parameters as kwargs
        pipeline_kwargs = {} if parameters is None else parameters
        diarization = self.pipeline(pyannote_input, **pipeline_kwargs)

        # postprocess the prediction into JSON-friendly strings
        processed_diarization = [
            {"label": str(label), "start": str(segment.start), "stop": str(segment.end)}
            for segment, _, label in diarization.itertracks(yield_label=True)
        ]

        return {"diarization": processed_diarization}

Overwriting handler.py


Test the custom handler locally

In [1]:
from handler import EndpointHandler

# init handler — `path` is accepted by EndpointHandler.__init__ but not used
# there; the pipeline is loaded by its model name.
my_handler = EndpointHandler(path=".")

In [2]:
import base64
from PIL import Image
from io import BytesIO
import json

# NOTE(review): base64, PIL.Image, BytesIO and json imported above do not appear
# to be used in the visible cells — confirm before removing.
# file reader: load the sample audio as raw bytes under the "inputs" key,
# which is the key the handler reads the audio from
with open("sample.wav", "rb") as f:
  request = {"inputs": f.read()}

# test the handler: run the full decode -> diarize -> postprocess path once
pred = my_handler(request)

In [3]:
pred

{'diarization': [{'label': 'SPEAKER_01',
   'start': '0.4978125',
   'stop': '1.3921875'},
  {'label': 'SPEAKER_01', 'start': '1.8984375', 'stop': '2.7590624999999998'},
  {'label': 'SPEAKER_02', 'start': '2.9953125', 'stop': '3.5015625000000004'},
  {'label': 'SPEAKER_01',
   'start': '3.5690625000000002',
   'stop': '4.311562500000001'},
  {'label': 'SPEAKER_02', 'start': '4.6153125', 'stop': '6.7753125'},
  {'label': 'SPEAKER_00', 'start': '7.1128125', 'stop': '7.551562500000001'},
  {'label': 'SPEAKER_02',
   'start': '7.551562500000001',
   'stop': '9.475312500000001'},
  {'label': 'SPEAKER_02',
   'start': '9.812812500000003',
   'stop': '10.555312500000003'},
  {'label': 'SPEAKER_00',
   'start': '9.863437500000003',
   'stop': '10.420312500000001'},
  {'label': 'SPEAKER_03', 'start': '12.411562500000002', 'stop': '15.5503125'},
  {'label': 'SPEAKER_00', 'start': '15.786562500000002', 'stop': '16.1409375'},
  {'label': 'SPEAKER_01', 'start': '16.1409375', 'stop': '16.1578125'},
