# -*- coding: utf-8 -*-
"""sttToTts.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/15QqRKFSwfhRdnaj5-R1z6xFfeEOOta38
"""
# text-to-speech and speech-to-text
!pip install TTS
!pip install transformers
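# Note: the Coqui TTS API changed across releases; if the calls below fail,
# pinning an older 0.x release is a reasonable fallback (the exact version
# number here is an assumption, not from the original notebook).
# !pip install TTS==0.22.0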
# text-to-speech model (YourTTS, which supports cloning from a reference wav)
from TTS.api import TTS
tts = TTS("tts_models/multilingual/multi-dataset/your_tts", cs_api_model="TTS.cs_api.CS_API", gpu=True)
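# Optional sanity check: print the model names bundled with this TTS release so
# the identifiers used below can be verified; list_models() is part of the
# Coqui TTS API, though its return type varies slightly between versions.
print(TTS().list_models())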
# voice recording imports (display is needed by the record() helper below)
from IPython.display import Javascript, display
from google.colab import output
from base64 import b64decode
# to record sound, found on https://gist.github.com/korakot/c21c3476c024ad6d56d5f48b0bca92be
RECORD = """
const sleep = time => new Promise(resolve => setTimeout(resolve, time))
const b2text = blob => new Promise(resolve => {
  const reader = new FileReader()
  reader.onloadend = e => resolve(e.srcElement.result)
  reader.readAsDataURL(blob)
})
var record = time => new Promise(async resolve => {
  // ask the browser for microphone access and buffer incoming audio chunks
  stream = await navigator.mediaDevices.getUserMedia({ audio: true })
  recorder = new MediaRecorder(stream)
  chunks = []
  recorder.ondataavailable = e => chunks.push(e.data)
  recorder.start()
  await sleep(time)
  // when recording stops, bundle the chunks and resolve with a base64 data URL
  recorder.onstop = async () => {
    blob = new Blob(chunks)
    text = await b2text(blob)
    resolve(text)
  }
  recorder.stop()
})
"""
def record(name, sec):
    """Record `sec` seconds of microphone audio and save it as <name>.webm."""
    display(Javascript(RECORD))
    s = output.eval_js('record(%d)' % (sec * 1000))  # the JS promise resolves to a data URL
    b = b64decode(s.split(',')[1])  # strip the "data:...;base64," prefix before decoding
    with open(f'{name}.webm', 'wb') as f:
        f.write(b)
    return f'{name}.webm'
# record the audio clip that will be transcribed
record('audio', sec=10)
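# Optional playback check that the clip was captured; this simply embeds an
# audio player, assuming the notebook front end can decode webm (Chrome can).
from IPython.display import Audio
Audio('audio.webm')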
# speech-to-text (verified to work): transcribe an audio file given its path
from transformers import WhisperProcessor, WhisperForConditionalGeneration
import librosa
# load model and processor
processor = WhisperProcessor.from_pretrained("openai/whisper-small")
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small")
model.config.forced_decoder_ids = None
# load audio from a specific path
audio_path = "audio.webm"
audio_array, sampling_rate = librosa.load(audio_path, sr=16000)  # Whisper requires 16 kHz input
# process the audio array; sampling_rate must be passed as a keyword, otherwise
# it is swallowed by a different positional parameter of the processor
input_features = processor(audio_array, sampling_rate=sampling_rate, return_tensors="pt").input_features
predicted_ids = model.generate(input_features)
transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]  # batch_decode returns a list
print(transcription)
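# Whisper auto-detects the language by default. If the language is known in
# advance, the processor can build forced decoder ids to pin down language and
# task; this is standard transformers API, shown here as an optional variant.
forced_ids = processor.get_decoder_prompt_ids(language="english", task="transcribe")
predicted_ids = model.generate(input_features, forced_decoder_ids=forced_ids)
print(processor.batch_decode(predicted_ids, skip_special_tokens=True)[0])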
# record the speaker's voice to be used as the cloning reference for TTS
record('speaker', sec=10)
# library to convert digits to words (e.g. 1 -> one)
import locale
locale.getpreferredencoding = lambda: "UTF-8"  # Colab workaround for a locale error that breaks pip installs
!pip install inflect
import re
import inflect
# numbers in digit form are otherwise skipped by the TTS model
def convert_numbers_to_words(s):
    """Replace every digit sequence in `s` with its spelled-out form."""
    p = inflect.engine()
    # substitute each digit run in place; re.sub avoids the pitfall of
    # str.replace, which can also rewrite digits inside longer numbers
    # (e.g. turning the "1" of "10" into "one0")
    return re.sub(r'\d+', lambda m: p.number_to_words(m.group()), s)
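# quick sanity check of the conversion on a made-up sentence
print(convert_numbers_to_words("call me in 5 or 10 minutes"))  # -> call me in five or ten minutes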
# model test 1 for text to speech
# works: text to speech with voice cloning, given the path to a reference recording
from IPython.display import Audio
# note: emotion and speed are Coqui Studio options and may be ignored by YourTTS
tts.tts_to_file(text=convert_numbers_to_words(transcription),
                file_path="output.wav",
                speaker_wav='speaker.webm',
                language="en",
                emotion='angry',
                speed=2)
audio_path = "output.wav"
Audio(audio_path)
# model test 2 for text to speech
from IPython.display import Audio
# TTS with on-the-fly voice conversion
api = TTS("tts_models/deu/fairseq/vits")
api.tts_with_vc_to_file(
    text="Wie sage ich auf Italienisch, dass ich dich liebe?",  # "How do I say in Italian that I love you?"
    speaker_wav="speaker.webm",
    file_path="output.wav"  # fixed typo "ouptut.wav", which made the playback below load a stale file
)
audio_path = "output.wav"
Audio(audio_path)
# model test 3 for text to speech
from TTS.api import TTS
tts = TTS("tts_models/multilingual/multi-dataset/xtts_v1", gpu=True)
from IPython.display import Audio
# generate speech by cloning a voice using custom settings
tts.tts_to_file(text="But for me to rap like a computer it must be in my genes I got a laptop in my back pocket My pen'll go off when I half-cock it Got a fat knot from that rap profit Made a livin' and a killin' off it Ever since Bill Clinton was still in office with Monica Lewinsky feelin' on his nutsack I'm an MC still as honest",
                file_path="output.wav",
                speaker_wav="Slide 1.m4a",
                language="en",
                emotion="neutral",
                decoder_iterations=35)
audio_path = "output.wav"
Audio(audio_path)
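# Coqui Studio voices are served through Coqui's hosted API and require an
# account token, which the library reads from the COQUI_STUDIO_TOKEN
# environment variable; the placeholder below is hypothetical and must be
# replaced with a real token before the next cell can run.
import os
os.environ.setdefault("COQUI_STUDIO_TOKEN", "<your-coqui-studio-token>")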
# init TTS with the target studio speaker
OUTPUT_PATH = "output.wav"  # OUTPUT_PATH was undefined in the original cell
tts = TTS(model_name="coqui_studio/en/Torcull Diarmuid/coqui_studio", progress_bar=False)
# run TTS
tts.tts_to_file(text="This is a test.", file_path=OUTPUT_PATH)
# run TTS with emotion and speed control
tts.tts_to_file(text="This is a test.", file_path=OUTPUT_PATH, emotion="Happy", speed=1.5)
# model test 4 for text to speech
from IPython.display import Audio
from TTS.api import TTS
#api = TTS(model_name="tts_models/eng/fairseq/vits").to("cuda")
#api.tts_to_file("This is a test.", file_path="output.wav")
# TTS with on-the-fly voice conversion
api = TTS("tts_models/deu/fairseq/vits")
api.tts_with_vc_to_file(
    "I am a basic human",
    speaker_wav="speaker.webm",
    file_path="output.wav"
)
audio_path = "output.wav"
Audio(audio_path)
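# Related variant: tts_with_vc_to_file chains synthesis and FreeVC voice
# conversion internally, but the conversion step can also run on its own over
# an existing recording. The FreeVC model name below comes from the Coqui model
# zoo, and the file names simply reuse outputs produced above.
vc = TTS("voice_conversion_models/multilingual/vctk/freevc24")
vc.voice_conversion_to_file(source_wav="output.wav",
                            target_wav="speaker.webm",
                            file_path="converted.wav")
Audio("converted.wav")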