File size: 5,406 Bytes
3bafaf7
98aa923
3bafaf7
98aa923
 
3bafaf7
98aa923
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
import gradio as gr
import subprocess

# Fetch the FreeVC sources once at start-up; the processing code further down
# writes FreeVC/convert.txt inside this checkout.
import os

command = "git clone https://github.com/OlaWod/FreeVC.git"
# git refuses to clone into an existing non-empty directory, so repeating the
# clone (or re-running it after a restart) can only fail noisily. Skip it when
# the checkout is already present.
if not os.path.isdir("FreeVC"):
    subprocess.run(command, shell=True)

import gradio as gr
import numpy as np
import os
from scipy.io.wavfile import write
import tempfile
import zipfile
import shutil
from pydub import AudioSegment
from pydub.silence import split_on_silence
from IPython.display import Audio
import nltk  # we'll use this to split into sentences
import subprocess

from bark import SAMPLE_RATE, generate_audio, preload_models
from IPython.display import Audio, display
import numpy as np

from bark.generation import (
    generate_text_semantic,
    preload_models,
)
from bark.api import semantic_to_waveform
from bark import generate_audio, SAMPLE_RATE

# Load the Bark models once, unconditionally, at import time so they are
# ready before the Gradio interface starts serving requests.
# NOTE(review): presumably this downloads and caches the model weights on
# first run — confirm against the Bark documentation.
preload_models()

def process_audio_files_with_logging(script, speaker, cloneFile):
    """Synthesize each script line with Bark, trim silence, and zip the clips.

    Args:
        script: Multi-line text. Every non-blank line is synthesized into its
            own ``audio_NN.wav`` clip.
        speaker: Bark history prompt (e.g. ``"v2/en_speaker_7"``) passed to
            both the semantic and the waveform generation stages.
        cloneFile: Upload(s) from the "clone voice" input. Only the first
            entry is used, as the target-voice column of
            ``FreeVC/convert.txt``. May be ``None`` or empty, in which case
            no conversion entries are written.

    Returns:
        A ``(zip_path, log_text)`` tuple: the path of the archive holding the
        processed clips and the accumulated progress log.
    """
    log_messages = "Starting audio processing...\n"
    sentences = [line.strip() for line in (script or "").split('\n') if line.strip()]
    GEN_TEMP = 0.4  # Example temperature, adjust as necessary

    # Resolve the target voice once, up front, so a missing upload is
    # reported instead of crashing after the (slow) synthesis has finished.
    # NOTE(review): depending on the Gradio version the entries are plain
    # path strings or temp-file wrappers exposing `.name` — both are handled.
    clone_target = None
    if cloneFile:
        first_upload = cloneFile[0] if isinstance(cloneFile, (list, tuple)) else cloneFile
        clone_target = getattr(first_upload, "name", first_upload)
    else:
        log_messages += "No clone voice provided; FreeVC/convert.txt will be left empty.\n"

    temp_dir = tempfile.mkdtemp()
    try:
        for idx, sentence in enumerate(sentences):
            log_messages += f"Processing sentence {idx + 1}: {sentence}\n"
            semantic_tokens = generate_text_semantic(
                sentence,
                history_prompt=speaker,
                temp=GEN_TEMP,
                min_eos_p=0.05,
            )

            audio_array = semantic_to_waveform(semantic_tokens, history_prompt=speaker)
            filename = os.path.join(temp_dir, f"audio_{idx:02d}.wav")
            write(filename, SAMPLE_RATE, audio_array)
            log_messages += f"Generated audio for sentence {idx + 1}.\n"

        log_messages += "All sentences processed. Starting silence reduction...\n"

        # The FreeVC checkout may be missing (e.g. the clone failed); make
        # sure the directory exists so writing convert.txt cannot fail.
        os.makedirs("FreeVC", exist_ok=True)
        with open("FreeVC/convert.txt", "w") as f:
            # Sorted so clips are processed and listed in sentence order.
            for file in sorted(os.listdir(temp_dir)):
                file_path = os.path.join(temp_dir, file)
                audio = AudioSegment.from_file(file_path, format="wav")

                # Detect non-silent chunks and process
                processed_audio = process_audio_for_silence(audio, log_messages)
                # Strings are immutable, so the helper cannot append to our
                # log; record its step here.
                log_messages += "Audio processed for silence.\n"

                # Overwrite the original file with processed audio
                processed_audio.export(file_path, format="wav")

                if clone_target is None:
                    continue
                file_name_without_extension, _ = os.path.splitext(file)
                line = f"{file_name_without_extension}|{file_path}|{clone_target}\n"
                f.write(line)
                log_messages += line + "\n"

        # NOTE: FreeVC conversion is currently disabled. The entries in
        # convert.txt point into temp_dir, which is removed below, so the
        # conversion would have to run here, before the cleanup:
        #command = "python FreeVC/convert.py --hpfile FreeVC/configs/freevc.json --ptfile FreeVC/checkpoints/freevc.pth --txtpath FreeVC/convert.txt --outdir FreeVC/outputs/freevc"
        #subprocess.run(command, shell=True)
        log_messages += "Silence reduction complete. Zipping files...\n"

        # Zip the processed files
        zip_filename = zip_processed_files(temp_dir, log_messages)
        log_messages += "Files zipped successfully.\n"
    finally:
        # Always clean up the temporary directory, even when generation or
        # processing fails part-way; never let cleanup mask the real error.
        shutil.rmtree(temp_dir, ignore_errors=True)

    log_messages += "Processing complete. Files ready for download.\n"
    return zip_filename, log_messages

def process_audio_for_silence(audio, log_messages=None, *, silence_thresh=-32,
                              min_silence_len=1000, keep_silence=300):
    """Return ``audio`` with long silent stretches shortened.

    The clip is split wherever it stays below ``silence_thresh`` for at least
    ``min_silence_len`` milliseconds, and the remaining chunks are joined
    back together.

    Args:
        audio: The ``AudioSegment`` to process.
        log_messages: Unused. Accepted for call compatibility only; a ``str``
            is immutable, so text appended here could never reach the caller.
        silence_thresh: Level in dBFS below which audio counts as silence.
        min_silence_len: Minimum length of a silent stretch, in ms, for the
            clip to be split there.
        keep_silence: Milliseconds of silence kept as padding around each
            non-silent chunk, so joined chunks do not sound abruptly cut.

    Returns:
        The re-joined ``AudioSegment``. If no non-silent chunk is detected,
        the input is returned unchanged rather than a zero-length segment.
    """
    # Detect non-silent chunks
    non_silent_chunks = split_on_silence(
        audio,
        min_silence_len=min_silence_len,
        silence_thresh=silence_thresh,
        keep_silence=keep_silence
    )

    # An entirely "silent" clip (e.g. quiet speech below the threshold) would
    # otherwise collapse into an empty segment that exports as a degenerate
    # file; keep the original audio instead.
    if not non_silent_chunks:
        return audio

    # Combine the non-silent chunks back into a single audio segment
    processed_audio = AudioSegment.empty()
    for chunk in non_silent_chunks:
        processed_audio += chunk

    return processed_audio

def zip_processed_files(temp_dir, log_messages=None):
    """Bundle every file under ``temp_dir`` into a fresh zip archive.

    Args:
        temp_dir: Directory whose files (including those in sub-directories)
            are added to the archive, named by their path relative to
            ``temp_dir``.
        log_messages: Unused. Accepted for call compatibility only; a ``str``
            is immutable, so text appended here could never reach the caller.

    Returns:
        Path of the created ``processed_audio_files.zip``. The archive lives
        in its own newly created temporary directory, so concurrent calls
        never overwrite each other's output; the caller owns its cleanup.
    """
    # A private directory per call keeps the download name stable while
    # making the full path unique.
    output_dir = tempfile.mkdtemp(prefix="processed_audio_")
    zip_filename = os.path.join(output_dir, "processed_audio_files.zip")
    with zipfile.ZipFile(zip_filename, 'w') as zipf:
        for root, dirs, files in os.walk(temp_dir):
            dirs.sort()  # in-place sort makes os.walk descend deterministically
            for file in sorted(files):
                file_path = os.path.join(root, file)
                # Relative names avoid duplicate entries when files in
                # different sub-directories share a basename.
                zipf.write(file_path, os.path.relpath(file_path, temp_dir))

    return zip_filename

# Build the Gradio UI: a script box, a speaker dropdown and a clone-voice
# upload go in; a downloadable archive and the processing log come out.

# (label shown to the user, Bark history prompt) pairs for the dropdown.
_speaker_choices = [
    ("French", "v2/fr_speaker_7"),
    ("English", "v2/en_speaker_7"),
    ("Japanese", "v2/ja_speaker_2"),
    ("German", "v2/de_speaker_6"),
    ("Hindi", "v2/hi_speaker_2"),
    ("Italian", "v2/it_speaker_6"),
    ("Korean", "v2/ko_speaker_0"),
    ("Polish", "v2/pl_speaker_2"),
    ("Portuguese", "v2/pt_speaker_5"),
    ("Russian", "v2/ru_speaker_4"),
    ("Spanish", "v2/es_speaker_0"),
    ("Turkish", "v2/tr_speaker_1"),
]

# Components are created in the same order as they appear in the interface.
_script_box = gr.Textbox(label="Script", lines=10)
_speaker_dropdown = gr.Dropdown(label="Speaker", choices=_speaker_choices)
_clone_upload = gr.Files(label="clone voice")
_download_file = gr.File(label="Download Processed Files")
_log_box = gr.Textbox(label="Log Messages", lines=20)

interface = gr.Interface(
    fn=process_audio_files_with_logging,
    inputs=[_script_box, _speaker_dropdown, _clone_upload],
    outputs=[_download_file, _log_box],
    title="Audio Processing and Generation",
    description="Enter a script and select a speaker to generate and process audio files. Process logs will be displayed below."
)

# Start serving the app.
interface.launch()