File size: 5,618 Bytes
5817c5e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
# -*- coding: utf-8 -*-
"""sttToTts.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/15QqRKFSwfhRdnaj5-R1z6xFfeEOOta38
"""

#text-to-speech and speech to text
# Colab shell magics: install Coqui TTS and Hugging Face transformers.
!pip install TTS
!pip install transformers

#text to speech
# Load the multilingual YourTTS voice-cloning model on the GPU.
# NOTE(review): `cs_api_model` is passed as a dotted-path string — presumably a
# Coqui Studio API hook; confirm against the TTS.api.TTS signature, as it looks
# like it may be an unused/incorrect argument.
from TTS.api import TTS
tts = TTS("tts_models/multilingual/multi-dataset/your_tts", cs_api_model  = "TTS.cs_api.CS_API", gpu=True)

#voice recording
# NOTE: the first three imports below are redundant with the `from ... import`
# forms that follow; both sets are kept as-is since only the latter are used.
import IPython.display
import google.colab.output
import base64
# all imports for voice recording
from IPython.display import Javascript
from google.colab import output
from base64 import b64decode

#to record sound, found on https://gist.github.com/korakot/c21c3476c024ad6d56d5f48b0bca92be
# RECORD is JavaScript evaluated in the browser: it captures `time` ms of
# microphone audio via MediaRecorder and resolves with a base64 data URL.
RECORD = """
const sleep  = time => new Promise(resolve => setTimeout(resolve, time))
const b2text = blob => new Promise(resolve => {
  const reader = new FileReader()
  reader.onloadend = e => resolve(e.srcElement.result)
  reader.readAsDataURL(blob)
})
var record = time => new Promise(async resolve => {
  stream = await navigator.mediaDevices.getUserMedia({ audio: true })
  recorder = new MediaRecorder(stream)
  chunks = []
  recorder.ondataavailable = e => chunks.push(e.data)
  recorder.start()
  await sleep(time)
  recorder.onstop = async ()=>{
    blob = new Blob(chunks)
    text = await b2text(blob)
    resolve(text)
  }
  recorder.stop()
})
"""

def record(name, sec):
  """Record `sec` seconds of browser audio and save it as `<name>.webm`.

  Injects the RECORD JavaScript into the Colab output frame, waits for the
  base64 data URL it resolves with, decodes the payload, and writes it to
  disk.  Returns the path of the written file.
  """
  display(Javascript(RECORD))
  data_url = output.eval_js('record(%d)' % (sec * 1000))
  # A data URL looks like "data:<mime>;base64,<payload>" — keep the payload.
  payload = data_url.split(',')[1]
  filename = f'{name}.webm'
  with open(filename, 'wb') as fh:
    fh.write(b64decode(payload))
  return filename

#to record the text which is going to be transcribed
record('audio', sec = 10)

#works -- speech-to-text with an audio I provide the path to reach
from transformers import WhisperProcessor, WhisperForConditionalGeneration
import librosa

# load model and processor
processor = WhisperProcessor.from_pretrained("openai/whisper-small")
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small")
# Let generate() choose the language/task tokens instead of forcing them.
model.config.forced_decoder_ids = None

# load audio from a specific path
audio_path = "audio.webm"
audio_array, sampling_rate = librosa.load(audio_path, sr=16000)  # "sr=16000" ensures that the sampling rate is as required


# process the audio array
# FIX: sampling_rate must be passed as a keyword argument.  Passed
# positionally it binds to the feature extractor's second parameter
# (`truncation`), not `sampling_rate`, silently skipping the rate check.
input_features = processor(audio_array, sampling_rate=sampling_rate, return_tensors="pt").input_features


predicted_ids = model.generate(input_features)

# batch_decode returns a list with one transcription string per input.
transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)
print(transcription)

#to record the speaker's voice used for tts
record('speaker', sec = 10 )

#library to convert digits to words (ex : 1 --> one)
import locale
# Colab workaround: some pip invocations fail when the preferred encoding is
# not UTF-8, so force it before installing.
locale.getpreferredencoding = lambda: "UTF-8"
!pip install inflect

import re
import inflect
#because numbers under digit format are ignored otherwise
def convert_numbers_to_words(s):
    """Return *s* with every maximal run of digits spelled out in words.

    Uses a single regex substitution so each digit run is converted exactly
    once, in place.  The previous findall-then-str.replace loop was buggy:
    `replace` rewrites *all* occurrences, so with "1 and 12" the first pass
    over "1" turned "12" into "one2" before "12" could be converted.
    """
    p = inflect.engine()
    # The callback receives each match and converts just that digit run.
    return re.sub(r'\d+', lambda m: p.number_to_words(m.group(0)), s)

#model test 1 for text to speech
#works - text to speech with voice cloner (by providing the path to the audio where the voice is)
from google.colab import drive
from IPython.display import Audio


# FIX: `transcription` is the list returned by batch_decode; str() on the
# whole list would make the model speak the brackets and quotes, so join the
# transcribed string(s) instead.
tts.tts_to_file(text=convert_numbers_to_words(" ".join(transcription)),
                file_path="output.wav",
                speaker_wav='speaker.webm',
                language="en",
                emotion ='angry',
                speed = 2)
audio_path = "output.wav"
Audio(audio_path)

#model test 2 for text to speech
from IPython.display import Audio
# TTS with on the fly voice conversion
api = TTS("tts_models/deu/fairseq/vits")
api.tts_with_vc_to_file(
    text="Wie sage ich auf Italienisch, dass ich dich liebe?",
    speaker_wav="speaker.webm",
    # FIX: was "ouptut.wav" (typo) — the Audio() call below reads
    # "output.wav", so it was playing a stale file from an earlier cell.
    file_path="output.wav"
)
audio_path = "output.wav"
Audio(audio_path)

#model test 3 for text to speech
# Rebinds the module-level `tts` to the XTTS v1 voice-cloning model.
from TTS.api import TTS
tts = TTS("tts_models/multilingual/multi-dataset/xtts_v1", gpu=True)

from IPython.display import Audio


# generate speech by cloning a voice using custom settings
# NOTE(review): "Slide 1.m4a" is a local file path — confirm it exists in the
# Colab runtime before running this cell, unlike the other cells which use
# the recorded "speaker.webm".
tts.tts_to_file(text="But for me to rap like a computer it must be in my genes I got a laptop in my back pocket My pen'll go off when I half-cock it Got a fat knot from that rap profit Made a livin' and a killin' off it Ever since Bill Clinton was still in office with Monica Lewinsky feelin' on his nutsack I'm an MC still as honest",
                file_path="output.wav",
                speaker_wav="Slide 1.m4a",
                language="en",
                emotion = "neutral",
                decoder_iterations=35)

audio_path = "output.wav"
Audio(audio_path)

# Init TTS with the target studio speaker
tts = TTS(model_name="coqui_studio/en/Torcull Diarmuid/coqui_studio", progress_bar=False)
# FIX: OUTPUT_PATH was never defined anywhere in the file (NameError at run
# time); write to output.wav like every other cell so Audio() can play it.
OUTPUT_PATH = "output.wav"
# Run TTS
tts.tts_to_file(text="This is a test.", file_path=OUTPUT_PATH)
# Run TTS with emotion and speed control
tts.tts_to_file(text="This is a test.", file_path=OUTPUT_PATH, emotion="Happy", speed=1.5)

#model test 4 for text to speech
from IPython.display import Audio

from TTS.api import TTS
# Alternative English model kept for reference (commented out on purpose):
#api = TTS(model_name="tts_models/eng/fairseq/vits").to("cuda")
#api.tts_to_file("This is a test.", file_path="output.wav")

# TTS with on the fly voice conversion
# Same German VITS model as test 2, this time with the text passed
# positionally and the output written to the path Audio() actually reads.
api = TTS("tts_models/deu/fairseq/vits")
api.tts_with_vc_to_file(
    "I am a basic human",
    speaker_wav="speaker.webm",
    file_path="output.wav"
)

audio_path = "output.wav"
Audio(audio_path)