"""Helpers for splitting audio at silences and merging segments back together."""

import librosa
import numpy as np


def process_audio(audio, sr=16000, silence_thresh=-60, min_silence_len=250):
    """
    Splits an audio signal into contiguous segments at detected silences.

    Non-silent regions are located with ``librosa.effects.split``. The
    returned intervals are then adjusted so that the first one starts at
    sample 0, every later one starts where the previous one ended, and the
    last one ends at ``len(audio)``. The segments therefore cover the whole
    signal without gaps.

    Parameters:
    - audio (np.ndarray): The audio signal to split (sliced along its first axis).
    - sr (int): The sample rate of the input audio (default is 16000).
    - silence_thresh (int): Silence threshold in dB (default -60); it is
      negated and passed to librosa as ``top_db``.
    - min_silence_len (int): Minimum silence duration in ms (default 250);
      it sets the analysis frame length, and the hop is half a frame.

    Returns:
    - list of np.ndarray: A list of audio segments.
    - np.ndarray: The ``[start, end)`` sample intervals, one row per segment.
    """
    # Clamp to 1 so a very short duration or low sample rate cannot yield a
    # zero-length frame or hop.
    frame_length = max(int(min_silence_len / 1000 * sr), 1)
    hop_length = max(frame_length // 2, 1)
    intervals = librosa.effects.split(
        audio,
        top_db=-silence_thresh,
        frame_length=frame_length,
        hop_length=hop_length,
    )

    if len(intervals) == 0:
        # No non-silent interval was reported; indexing row 0 would raise.
        # Fall back to a single segment spanning the whole signal.
        intervals = np.array([[0, len(audio)]], dtype=intervals.dtype)
    else:
        # Make the intervals contiguous so no samples are dropped.
        intervals[0, 0] = 0
        intervals[1:, 0] = intervals[:-1, 1]
        intervals[-1, 1] = len(audio)

    audio_segments = [audio[start:end] for start, end in intervals]

    return audio_segments, intervals


def merge_audio(audio_segments, intervals, sr_orig, sr_new):
    """
    Merges audio segments back into a single audio signal, filling gaps with silence.

    A block of zeros is inserted before the first segment (for a positive
    first start index) and between consecutive segments (for the distance
    between one interval's end and the next interval's start). Gap lengths
    are measured in original samples and multiplied by ``sr_new / sr_orig``
    when ``sr_new > sr_orig``; otherwise they are used unscaled.

    Parameters:
    - audio_segments (list of np.ndarray): The audio segments, one per interval.
    - intervals (np.ndarray): The intervals used for splitting the original audio.
    - sr_orig (int): The sample rate of the original audio
    - sr_new (int): The sample rate of the model

    Returns:
    - np.ndarray: The merged audio signal with silent gaps restored. An empty
      float32 array is returned when ``audio_segments`` is empty.
    """
    if len(audio_segments) == 0:
        # Nothing to merge; avoid indexing into an empty sequence.
        return np.zeros(0, dtype=np.float32)

    sr_ratio = sr_new / sr_orig if sr_new > sr_orig else 1.0
    dtype = audio_segments[0].dtype

    def _silence(n_orig_samples):
        # Non-positive gaps (e.g. touching or overlapping intervals) produce
        # no padding, matching how the leading pad is guarded.
        return np.zeros(max(int(n_orig_samples * sr_ratio), 0), dtype=dtype)

    # Collect all pieces first and concatenate once, instead of re-copying
    # the growing signal on every iteration.
    parts = [_silence(intervals[0][0]), audio_segments[0]]
    for i in range(1, len(intervals)):
        parts.append(_silence(intervals[i][0] - intervals[i - 1][1]))
        parts.append(audio_segments[i])

    return np.concatenate(parts)