# -*- coding: utf-8 -*-
"""sttToTts.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/15QqRKFSwfhRdnaj5-R1z6xFfeEOOta38
"""
# Text-to-speech and speech-to-text dependencies
!pip install TTS
!pip install transformers
# Text to speech
from TTS.api import TTS

# your_tts is a multilingual, multi-speaker model with voice cloning support
tts = TTS("tts_models/multilingual/multi-dataset/your_tts", gpu=True)
# Voice recording imports
from IPython.display import Javascript
from google.colab import output
from base64 import b64decode
# Browser-side recorder, adapted from https://gist.github.com/korakot/c21c3476c024ad6d56d5f48b0bca92be
RECORD = """
const sleep = time => new Promise(resolve => setTimeout(resolve, time))
const b2text = blob => new Promise(resolve => {
  const reader = new FileReader()
  reader.onloadend = e => resolve(e.srcElement.result)
  reader.readAsDataURL(blob)
})
var record = time => new Promise(async resolve => {
  stream = await navigator.mediaDevices.getUserMedia({ audio: true })
  recorder = new MediaRecorder(stream)
  chunks = []
  recorder.ondataavailable = e => chunks.push(e.data)
  recorder.start()
  await sleep(time)
  recorder.onstop = async () => {
    blob = new Blob(chunks)
    text = await b2text(blob)
    resolve(text)
  }
  recorder.stop()
})
"""
def record(name, sec):
    """Record `sec` seconds of microphone audio and save it to `<name>.webm`."""
    display(Javascript(RECORD))
    s = output.eval_js('record(%d)' % (sec * 1000))
    b = b64decode(s.split(',')[1])
    filename = f'{name}.webm'
    with open(filename, 'wb') as f:
        f.write(b)
    return filename
# Record the clip that will be transcribed
record('audio', sec=10)
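# Optional: play the recording back to confirm the microphone worked. IPython's
# Audio widget embeds the webm blob, which most browsers can render.
from IPython.display import Audio
Audio('audio.webm')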
# Works: speech-to-text on an audio file, given its path
from transformers import WhisperProcessor, WhisperForConditionalGeneration
import librosa

# Load model and processor
processor = WhisperProcessor.from_pretrained("openai/whisper-small")
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small")
model.config.forced_decoder_ids = None

# Load audio from a specific path; sr=16000 resamples to the rate Whisper expects
audio_path = "audio.webm"
audio_array, sampling_rate = librosa.load(audio_path, sr=16000)

# Process the audio array and decode the transcription
input_features = processor(audio_array, sampling_rate=sampling_rate, return_tensors="pt").input_features
predicted_ids = model.generate(input_features)
transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)
print(transcription)
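# Optional sketch: Whisper auto-detects the language above. When the language is
# known in advance, decoding can be pinned with the processor's
# get_decoder_prompt_ids helper; the language/task values here are illustrative.
forced_ids = processor.get_decoder_prompt_ids(language="english", task="transcribe")
predicted_ids = model.generate(input_features, forced_decoder_ids=forced_ids)
print(processor.batch_decode(predicted_ids, skip_special_tokens=True))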
# Record the speaker voice used as the cloning reference for TTS
record('speaker', sec=10)
# inflect converts digits to words (e.g. 1 -> "one")
import locale
locale.getpreferredencoding = lambda: "UTF-8"  # Colab workaround for a locale error during pip installs
!pip install inflect

import re
import inflect
# Numbers written as digits would otherwise be ignored by the TTS model
def convert_numbers_to_words(s):
    p = inflect.engine()
    # Replace each run of digits with its spelled-out form; a re.sub callback
    # avoids corrupting longer numbers that contain a shorter match
    return re.sub(r'\d+', lambda m: p.number_to_words(m.group()), s)
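# Quick check of the helper (the output shown is what inflect typically produces):
print(convert_numbers_to_words("I bought 2 cats in 2019"))
# -> I bought two cats in two thousand and nineteen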
# Model test 1 for text to speech
# Works: text to speech with voice cloning, given the path to the reference audio
from IPython.display import Audio

tts.tts_to_file(text=convert_numbers_to_words(transcription[0]),
                file_path="output.wav",
                speaker_wav='speaker.webm',
                language="en",
                emotion='angry',
                speed=2)
audio_path = "output.wav"
Audio(audio_path)
# Model test 2 for text to speech
from IPython.display import Audio

# TTS with on-the-fly voice conversion; the German text asks
# "How do I say 'I love you' in Italian?"
api = TTS("tts_models/deu/fairseq/vits")
api.tts_with_vc_to_file(
    text="Wie sage ich auf Italienisch, dass ich dich liebe?",
    speaker_wav="speaker.webm",
    file_path="output.wav"
)
audio_path = "output.wav"
Audio(audio_path)
# Model test 3 for text to speech
from TTS.api import TTS
from IPython.display import Audio

tts = TTS("tts_models/multilingual/multi-dataset/xtts_v1", gpu=True)

# Generate speech by cloning a voice using custom settings
tts.tts_to_file(text="But for me to rap like a computer it must be in my genes I got a laptop in my back pocket My pen'll go off when I half-cock it Got a fat knot from that rap profit Made a livin' and a killin' off it Ever since Bill Clinton was still in office with Monica Lewinsky feelin' on his nutsack I'm an MC still as honest",
                file_path="output.wav",
                speaker_wav="Slide 1.m4a",
                language="en",
                emotion="neutral",
                decoder_iterations=35)
audio_path = "output.wav"
Audio(audio_path)
# Init TTS with the target Coqui Studio speaker (requires a Coqui Studio API token)
tts = TTS(model_name="coqui_studio/en/Torcull Diarmuid/coqui_studio", progress_bar=False)

# Run TTS
OUTPUT_PATH = "output.wav"
tts.tts_to_file(text="This is a test.", file_path=OUTPUT_PATH)

# Run TTS with emotion and speed control
tts.tts_to_file(text="This is a test.", file_path=OUTPUT_PATH, emotion="Happy", speed=1.5)
# Model test 4 for text to speech
from IPython.display import Audio
from TTS.api import TTS

# api = TTS(model_name="tts_models/eng/fairseq/vits").to("cuda")
# api.tts_to_file("This is a test.", file_path="output.wav")

# TTS with on-the-fly voice conversion
api = TTS("tts_models/deu/fairseq/vits")
api.tts_with_vc_to_file(
    "I am a basic human",
    speaker_wav="speaker.webm",
    file_path="output.wav"
)
audio_path = "output.wav"
Audio(audio_path)
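# Related sketch, not part of the original tests: tts_with_vc_to_file chains TTS
# with the FreeVC voice-conversion model, and the converter can also be run on
# its own. The model id below is the published Coqui checkpoint name (assumed
# available in this TTS version); the file names reuse the clips recorded above.
vc = TTS("voice_conversion_models/multilingual/vctk/freevc24")
vc.voice_conversion_to_file(source_wav="audio.webm",
                            target_wav="speaker.webm",
                            file_path="vc_output.wav")
Audio("vc_output.wav")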