|
|
|
"""sttToTts.ipynb |
|
|
|
Automatically generated by Colaboratory. |
|
|
|
Original file is located at |
|
https://colab.research.google.com/drive/15QqRKFSwfhRdnaj5-R1z6xFfeEOOta38 |
|
""" |
|
|
|
|
|
!pip install TTS |
|
!pip install transformers |
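# Coqui TTS supplies the synthesis and voice-cloning models; transformers supplies the Whisper speech-to-text checkpoint.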
|
|
|
|
|
from TTS.api import TTS |
|
tts = TTS("tts_models/multilingual/multi-dataset/your_tts", gpu=True)
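# YourTTS is a multilingual, multi-speaker model that clones a voice zero-shot from a short reference clip passed as speaker_wav.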
|
|
|
|
|
from IPython.display import display, Javascript

from google.colab import output

from base64 import b64decode
|
|
|
|
|
|
|
RECORD = """ |
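// Capture microphone audio with the MediaRecorder API for `time` ms and resolve with a base64 data URL.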
|
const sleep = time => new Promise(resolve => setTimeout(resolve, time)) |
|
const b2text = blob => new Promise(resolve => { |
|
const reader = new FileReader() |
|
reader.onloadend = e => resolve(e.srcElement.result) |
|
reader.readAsDataURL(blob) |
|
}) |
|
var record = time => new Promise(async resolve => { |
|
stream = await navigator.mediaDevices.getUserMedia({ audio: true }) |
|
recorder = new MediaRecorder(stream) |
|
chunks = [] |
|
recorder.ondataavailable = e => chunks.push(e.data) |
|
recorder.start() |
|
await sleep(time) |
|
recorder.onstop = async ()=>{ |
|
blob = new Blob(chunks) |
|
text = await b2text(blob) |
|
resolve(text) |
|
} |
|
recorder.stop() |
|
}) |
|
""" |
|
|
|
def record(name, sec): |
|
display(Javascript(RECORD)) |
|
s = output.eval_js('record(%d)' % (sec*1000)) |
|
b = b64decode(s.split(',')[1]) |
|
with open(f'{name}.webm','wb') as f: |
|
f.write(b) |
|
return (f'{name}.webm') |
|
|
|
|
|
record('audio', sec = 10) |
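# record() runs the JS recorder for 10 seconds, decodes the returned data URL, and saves the clip as audio.webm; it is transcribed with Whisper below.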
|
|
|
|
|
from transformers import WhisperProcessor, WhisperForConditionalGeneration |
|
import librosa |
|
|
|
|
|
processor = WhisperProcessor.from_pretrained("openai/whisper-small") |
|
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small") |
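# whisper-small balances speed and accuracy; larger checkpoints ("openai/whisper-medium", "openai/whisper-large-v2") drop in here if needed.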
|
model.config.forced_decoder_ids = None  # do not force language/task tokens; let generate() infer them from the audio
|
|
|
|
|
audio_path = "audio.webm" |
|
audio_array, sampling_rate = librosa.load(audio_path, sr=16000) |
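# librosa decodes the webm recording and resamples it to 16 kHz, the rate Whisper's feature extractor expects.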
|
|
|
|
|
|
|
input_features = processor(audio_array, sampling_rate=sampling_rate, return_tensors="pt").input_features
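# The processor pads/truncates the waveform to 30 seconds and computes the log-Mel spectrogram features Whisper's encoder consumes.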
|
|
|
|
|
predicted_ids = model.generate(input_features) |
|
|
|
transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True) |
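# batch_decode returns one string per input sequence, so transcription is a single-element list here.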
|
print(transcription) |
|
|
|
|
|
record('speaker', sec = 10)
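# This second clip serves as the reference voice for cloning.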
|
|
|
|
|
import locale |
|
locale.getpreferredencoding = lambda: "UTF-8" |
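# Work around a Colab locale quirk ("A UTF-8 locale is required") that can otherwise break pip installs in this runtime.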
|
!pip install inflect |
|
|
|
import re |
|
import inflect |
|
|
|
def convert_numbers_to_words(s):

    """Spell out digit sequences ("42" -> "forty-two") so the TTS model reads them naturally."""

    p = inflect.engine()

    # Substitute each run of digits in a single pass; repeated str.replace calls
    # can corrupt longer numbers that contain an already-replaced digit string.
    return re.sub(r'\d+', lambda m: p.number_to_words(m.group()), s)
|
|
|
|
|
|
|
from IPython.display import Audio
|
|
|
|
|
|
|
# Re-speak the transcription in the cloned voice. emotion and speed are primarily Coqui Studio
# controls and may be ignored by local models such as YourTTS.
tts.tts_to_file(text=convert_numbers_to_words(transcription[0]),

                file_path="output.wav",

                speaker_wav='speaker.webm',

                language="en",

                emotion='angry',

                speed=2)
|
audio_path = "output.wav" |
|
Audio(audio_path) |
|
|
|
|
|
from IPython.display import Audio |
|
|
|
api = TTS("tts_models/deu/fairseq/vits") |
|
api.tts_with_vc_to_file( |
|
text="Wie sage ich auf Italienisch, dass ich dich liebe?", |
|
speaker_wav="speaker.webm", |
|
    file_path="output.wav"
|
) |
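# tts_with_vc_to_file synthesizes with the loaded German VITS model and then runs voice conversion so the
# output matches the speaker.webm voice; the prompt means "How do I say in Italian that I love you?"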
|
audio_path = "output.wav" |
|
Audio(audio_path) |
|
|
|
|
|
from TTS.api import TTS |
|
tts = TTS("tts_models/multilingual/multi-dataset/xtts_v1", gpu=True) |
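# XTTS v1 is Coqui's multilingual voice-cloning model; like YourTTS it conditions on a short speaker_wav reference clip.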
|
|
|
from IPython.display import Audio |
|
|
|
|
|
|
|
tts.tts_to_file(text="But for me to rap like a computer it must be in my genes I got a laptop in my back pocket My pen'll go off when I half-cock it Got a fat knot from that rap profit Made a livin' and a killin' off it Ever since Bill Clinton was still in office with Monica Lewinsky feelin' on his nutsack I'm an MC still as honest", |
|
file_path="output.wav", |
|
speaker_wav="Slide 1.m4a", |
|
language="en", |
|
emotion = "neutral", |
|
decoder_iterations=35) |
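# "Slide 1.m4a" is assumed to be a reference clip uploaded to the Colab session; any short sample of the target voice works here.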
|
|
|
audio_path = "output.wav" |
|
Audio(audio_path) |
|
|
|
|
|
tts = TTS(model_name="coqui_studio/en/Torcull Diarmuid/coqui_studio", progress_bar=False) |
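# Coqui Studio voices are served through Coqui's hosted API and expect an API token (typically via the COQUI_STUDIO_TOKEN environment variable).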
|
|
|
OUTPUT_PATH = "output.wav"  # any writable path works for the Studio renders

tts.tts_to_file(text="This is a test.", file_path=OUTPUT_PATH)
|
|
|
tts.tts_to_file(text="This is a test.", file_path=OUTPUT_PATH, emotion="Happy", speed=1.5) |
|
|
|
|
|
from IPython.display import Audio |
|
|
|
from TTS.api import TTS |
|
|
|
|
|
|
|
|
|
api = TTS("tts_models/deu/fairseq/vits") |
|
api.tts_with_vc_to_file( |
|
"I am a basic human", |
|
speaker_wav="speaker.webm", |
|
file_path="output.wav" |
|
) |
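# Same voice-conversion flow as above, this time feeding an English prompt through the German VITS model.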
|
|
|
audio_path = "output.wav" |
|
Audio(audio_path) |