# -*- coding: utf-8 -*-
"""sttToTts.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/15QqRKFSwfhRdnaj5-R1z6xFfeEOOta38
"""
# Text-to-speech and speech-to-text dependencies
!pip install TTS
!pip install transformers
# Text to speech
from TTS.api import TTS

# your_tts is a multilingual, multi-speaker model with voice cloning support
tts = TTS("tts_models/multilingual/multi-dataset/your_tts", gpu=True)
# Voice recording imports
from IPython.display import Javascript
from google.colab import output
from base64 import b64decode
# Browser-side recorder, adapted from https://gist.github.com/korakot/c21c3476c024ad6d56d5f48b0bca92be
RECORD = """
const sleep = time => new Promise(resolve => setTimeout(resolve, time))
const b2text = blob => new Promise(resolve => {
  const reader = new FileReader()
  reader.onloadend = e => resolve(e.srcElement.result)
  reader.readAsDataURL(blob)
})
var record = time => new Promise(async resolve => {
  stream = await navigator.mediaDevices.getUserMedia({ audio: true })
  recorder = new MediaRecorder(stream)
  chunks = []
  recorder.ondataavailable = e => chunks.push(e.data)
  recorder.start()
  await sleep(time)
  recorder.onstop = async () => {
    blob = new Blob(chunks)
    text = await b2text(blob)
    resolve(text)
  }
  recorder.stop()
})
"""
def record(name, sec):
    """Record `sec` seconds of microphone audio and save it to `<name>.webm`."""
    display(Javascript(RECORD))
    s = output.eval_js('record(%d)' % (sec * 1000))
    b = b64decode(s.split(',')[1])
    filename = f'{name}.webm'
    with open(filename, 'wb') as f:
        f.write(b)
    return filename
# Record the clip that will be transcribed
record('audio', sec=10)
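# Optional: play the recording back to confirm the microphone worked. IPython's
# Audio widget embeds the webm blob, which most browsers can render.
from IPython.display import Audio
Audio('audio.webm')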
# Works: speech-to-text on an audio file, given its path
from transformers import WhisperProcessor, WhisperForConditionalGeneration
import librosa

# Load model and processor
processor = WhisperProcessor.from_pretrained("openai/whisper-small")
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small")
model.config.forced_decoder_ids = None

# Load audio from a specific path; sr=16000 resamples to the rate Whisper expects
audio_path = "audio.webm"
audio_array, sampling_rate = librosa.load(audio_path, sr=16000)

# Process the audio array and decode the transcription
input_features = processor(audio_array, sampling_rate=sampling_rate, return_tensors="pt").input_features
predicted_ids = model.generate(input_features)
transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)
print(transcription)
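# Optional sketch: Whisper auto-detects the language above. When the language is
# known in advance, decoding can be pinned with the processor's
# get_decoder_prompt_ids helper; the language/task values here are illustrative.
forced_ids = processor.get_decoder_prompt_ids(language="english", task="transcribe")
predicted_ids = model.generate(input_features, forced_decoder_ids=forced_ids)
print(processor.batch_decode(predicted_ids, skip_special_tokens=True))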
# Record the speaker voice used as the cloning reference for TTS
record('speaker', sec=10)
# inflect converts digits to words (e.g. 1 -> "one")
import locale
locale.getpreferredencoding = lambda: "UTF-8"  # Colab workaround for a locale error during pip installs
!pip install inflect

import re
import inflect
# Numbers written as digits would otherwise be ignored by the TTS model
def convert_numbers_to_words(s):
    p = inflect.engine()
    # Replace each run of digits with its spelled-out form; a re.sub callback
    # avoids corrupting longer numbers that contain a shorter match
    return re.sub(r'\d+', lambda m: p.number_to_words(m.group()), s)
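# Quick check of the helper (the output shown is what inflect typically produces):
print(convert_numbers_to_words("I bought 2 cats in 2019"))
# -> I bought two cats in two thousand and nineteen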
# Model test 1 for text to speech
# Works: text to speech with voice cloning, given the path to the reference audio
from IPython.display import Audio

tts.tts_to_file(text=convert_numbers_to_words(transcription[0]),
                file_path="output.wav",
                speaker_wav='speaker.webm',
                language="en",
                emotion='angry',
                speed=2)
audio_path = "output.wav"
Audio(audio_path)
# Model test 2 for text to speech
from IPython.display import Audio

# TTS with on-the-fly voice conversion; the German text asks
# "How do I say 'I love you' in Italian?"
api = TTS("tts_models/deu/fairseq/vits")
api.tts_with_vc_to_file(
    text="Wie sage ich auf Italienisch, dass ich dich liebe?",
    speaker_wav="speaker.webm",
    file_path="output.wav"
)
audio_path = "output.wav"
Audio(audio_path)
# Model test 3 for text to speech
from TTS.api import TTS
from IPython.display import Audio

tts = TTS("tts_models/multilingual/multi-dataset/xtts_v1", gpu=True)

# Generate speech by cloning a voice using custom settings
tts.tts_to_file(text="But for me to rap like a computer it must be in my genes I got a laptop in my back pocket My pen'll go off when I half-cock it Got a fat knot from that rap profit Made a livin' and a killin' off it Ever since Bill Clinton was still in office with Monica Lewinsky feelin' on his nutsack I'm an MC still as honest",
                file_path="output.wav",
                speaker_wav="Slide 1.m4a",
                language="en",
                emotion="neutral",
                decoder_iterations=35)
audio_path = "output.wav"
Audio(audio_path)
# Init TTS with the target Coqui Studio speaker (requires a Coqui Studio API token)
tts = TTS(model_name="coqui_studio/en/Torcull Diarmuid/coqui_studio", progress_bar=False)

# Run TTS
OUTPUT_PATH = "output.wav"
tts.tts_to_file(text="This is a test.", file_path=OUTPUT_PATH)

# Run TTS with emotion and speed control
tts.tts_to_file(text="This is a test.", file_path=OUTPUT_PATH, emotion="Happy", speed=1.5)
# Model test 4 for text to speech
from IPython.display import Audio
from TTS.api import TTS

# api = TTS(model_name="tts_models/eng/fairseq/vits").to("cuda")
# api.tts_to_file("This is a test.", file_path="output.wav")

# TTS with on-the-fly voice conversion
api = TTS("tts_models/deu/fairseq/vits")
api.tts_with_vc_to_file(
    "I am a basic human",
    speaker_wav="speaker.webm",
    file_path="output.wav"
)
audio_path = "output.wav"
Audio(audio_path)
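# Related sketch, not part of the original tests: tts_with_vc_to_file chains TTS
# with the FreeVC voice-conversion model, and the converter can also be run on
# its own. The model id below is the published Coqui checkpoint name (assumed
# available in this TTS version); the file names reuse the clips recorded above.
vc = TTS("voice_conversion_models/multilingual/vctk/freevc24")
vc.voice_conversion_to_file(source_wav="audio.webm",
                            target_wav="speaker.webm",
                            file_path="vc_output.wav")
Audio("vc_output.wav")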