File size: 4,032 Bytes
96ea36d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
import os
import sys
sys.path.append('../AudioJourney')
import logging
import yaml
import numpy as np
import torch
import torchaudio
from torchaudio.transforms import SpeedPerturbation
import nltk
from APIs import WRITE_AUDIO, LOUDNESS_NORM
from flask import Flask, request, jsonify
from transformers import BarkModel, AutoProcessor


with open('config.yaml', 'r') as file:
    config = yaml.safe_load(file)

# Configure the logging format and level
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)

# Create a FileHandler for the log file
os.makedirs('services_logs', exist_ok=True)
log_filename = 'services_logs/Text-to-Speech.log'
file_handler = logging.FileHandler(log_filename, mode='w')
file_handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))

# Add the FileHandler to the root logger
logging.getLogger('').addHandler(file_handler)

# Initialize the model here
SPEED = float(config['Text-to-Speech']['speed'])
speed_perturb = SpeedPerturbation(32000, [SPEED])

logging.info('Loading Bark model ...')
# TODO: fp16?
model = BarkModel.from_pretrained("suno/bark")
device = "cuda:0" if torch.cuda.is_available() else "cpu"
model = model.to(device)
model = model.to_bettertransformer()    # Flash attention
SAMPLE_RATE = model.generation_config.sample_rate
SEMANTIC_TEMPERATURE = 0.9
COARSE_TEMPERATURE = 0.5
FINE_TEMPERATURE = 0.5

processor = AutoProcessor.from_pretrained("suno/bark")

app = Flask(__name__)

@app.route('/generate_speech', methods=['POST'])
def generate_speech():
    # Receive the text from the POST request
    data = request.json
    text = data['text']
    speaker_id = data['speaker_id']
    speaker_npz = data['speaker_npz']
    volume = float(data.get('volume', -35))
    output_wav = data.get('output_wav', 'out.wav')
    
    logging.info(f'TTS (Bark): Speaker: {speaker_id}, Volume: {volume} dB, Prompt: {text}')

    try:   
        # Generate audio using the global pipe object
        text = text.replace('\n', ' ').strip()
        sentences = nltk.sent_tokenize(text)
        silence = torch.zeros(int(0.1 * SAMPLE_RATE), device=device).unsqueeze(0)  # 0.1 second of silence

        pieces = []
        for sentence in sentences:
            inputs = processor(sentence, voice_preset=speaker_npz).to(device)
            # NOTE: you must run the line below, otherwise you will see the runtime error
            # RuntimeError: view size is not compatible with input tensor's size and stride (at least one dimension spans across two contiguous subspaces). Use .reshape(...) instead.
            inputs['history_prompt']['coarse_prompt'] = inputs['history_prompt']['coarse_prompt'].transpose(0, 1).contiguous().transpose(0, 1)

            with torch.inference_mode():
                # TODO: min_eos_p?
                output = model.generate(
                    **inputs,
                    do_sample = True,
                    semantic_temperature = SEMANTIC_TEMPERATURE,
                    coarse_temperature = COARSE_TEMPERATURE,
                    fine_temperature = FINE_TEMPERATURE
                )

            pieces += [output, silence]

        result_audio = torch.cat(pieces, dim=1)
        wav_tensor = result_audio.to(dtype=torch.float32).cpu()
        wav = torchaudio.functional.resample(wav_tensor, orig_freq=SAMPLE_RATE, new_freq=32000)
        wav = speed_perturb(wav.float())[0].squeeze(0)
        wav = wav.numpy()
        wav = LOUDNESS_NORM(wav, volumn=volume)
        WRITE_AUDIO(wav, name=output_wav)

        # Return success message and the filename of the generated audio
        return jsonify({'message': f'Text-to-Speech generated successfully | {speaker_id}: {text}', 'file': output_wav})

    except Exception as e:
        raise e
        # Return error message if something goes wrong
        return jsonify({'API error': str(e)}), 500


if __name__ == '__main__':
    service_port = config['Text-to-Speech']['service-port']
    app.run(debug=False, port=service_port)