# -*- coding: utf-8 -*-
"""sttToTts.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/15QqRKFSwfhRdnaj5-R1z6xFfeEOOta38
"""
# text-to-speech and speech-to-text
!pip install TTS
!pip install transformers
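# Note: the Coqui TTS API changed across releases; if the calls below fail,
# pinning an older 0.x release is a reasonable fallback (the exact version
# number here is an assumption, not from the original notebook).
# !pip install TTS==0.22.0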
# text-to-speech model (YourTTS, which supports cloning from a reference wav)
from TTS.api import TTS
tts = TTS("tts_models/multilingual/multi-dataset/your_tts", cs_api_model="TTS.cs_api.CS_API", gpu=True)
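# Optional sanity check: print the model names bundled with this TTS release so
# the identifiers used below can be verified; list_models() is part of the
# Coqui TTS API, though its return type varies slightly between versions.
print(TTS().list_models())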
# voice recording imports (display is needed by the record() helper below)
from IPython.display import Javascript, display
from google.colab import output
from base64 import b64decode
# to record sound, found on https://gist.github.com/korakot/c21c3476c024ad6d56d5f48b0bca92be
RECORD = """
const sleep = time => new Promise(resolve => setTimeout(resolve, time))
const b2text = blob => new Promise(resolve => {
  const reader = new FileReader()
  reader.onloadend = e => resolve(e.srcElement.result)
  reader.readAsDataURL(blob)
})
var record = time => new Promise(async resolve => {
  // ask the browser for microphone access and buffer incoming audio chunks
  stream = await navigator.mediaDevices.getUserMedia({ audio: true })
  recorder = new MediaRecorder(stream)
  chunks = []
  recorder.ondataavailable = e => chunks.push(e.data)
  recorder.start()
  await sleep(time)
  // when recording stops, bundle the chunks and resolve with a base64 data URL
  recorder.onstop = async () => {
    blob = new Blob(chunks)
    text = await b2text(blob)
    resolve(text)
  }
  recorder.stop()
})
"""
def record(name, sec):
    """Record `sec` seconds of microphone audio and save it as <name>.webm."""
    display(Javascript(RECORD))
    s = output.eval_js('record(%d)' % (sec * 1000))  # the JS promise resolves to a data URL
    b = b64decode(s.split(',')[1])  # strip the "data:...;base64," prefix before decoding
    with open(f'{name}.webm', 'wb') as f:
        f.write(b)
    return f'{name}.webm'
# record the audio clip that will be transcribed
record('audio', sec=10)
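# Optional playback check that the clip was captured; this simply embeds an
# audio player, assuming the notebook front end can decode webm (Chrome can).
from IPython.display import Audio
Audio('audio.webm')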
# speech-to-text (verified to work): transcribe an audio file given its path
from transformers import WhisperProcessor, WhisperForConditionalGeneration
import librosa
# load model and processor
processor = WhisperProcessor.from_pretrained("openai/whisper-small")
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small")
model.config.forced_decoder_ids = None
# load audio from a specific path
audio_path = "audio.webm"
audio_array, sampling_rate = librosa.load(audio_path, sr=16000)  # Whisper requires 16 kHz input
# process the audio array; sampling_rate must be passed as a keyword, otherwise
# it is swallowed by a different positional parameter of the processor
input_features = processor(audio_array, sampling_rate=sampling_rate, return_tensors="pt").input_features
predicted_ids = model.generate(input_features)
transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]  # batch_decode returns a list
print(transcription)
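# Whisper auto-detects the language by default. If the language is known in
# advance, the processor can build forced decoder ids to pin down language and
# task; this is standard transformers API, shown here as an optional variant.
forced_ids = processor.get_decoder_prompt_ids(language="english", task="transcribe")
predicted_ids = model.generate(input_features, forced_decoder_ids=forced_ids)
print(processor.batch_decode(predicted_ids, skip_special_tokens=True)[0])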
# record the speaker's voice to be used as the cloning reference for TTS
record('speaker', sec=10)
# library to convert digits to words (e.g. 1 -> one)
import locale
locale.getpreferredencoding = lambda: "UTF-8"  # Colab workaround for a locale error that breaks pip installs
!pip install inflect
import re
import inflect
# numbers in digit form are otherwise skipped by the TTS model
def convert_numbers_to_words(s):
    """Replace every digit sequence in `s` with its spelled-out form."""
    p = inflect.engine()
    # substitute each digit run in place; re.sub avoids the pitfall of
    # str.replace, which can also rewrite digits inside longer numbers
    # (e.g. turning the "1" of "10" into "one0")
    return re.sub(r'\d+', lambda m: p.number_to_words(m.group()), s)
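# quick sanity check of the conversion on a made-up sentence
print(convert_numbers_to_words("call me in 5 or 10 minutes"))  # -> call me in five or ten minutes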
# model test 1 for text to speech
# works: text to speech with voice cloning, given the path to a reference recording
from IPython.display import Audio
# note: emotion and speed are Coqui Studio options and may be ignored by YourTTS
tts.tts_to_file(text=convert_numbers_to_words(transcription),
                file_path="output.wav",
                speaker_wav='speaker.webm',
                language="en",
                emotion='angry',
                speed=2)
audio_path = "output.wav"
Audio(audio_path)
# model test 2 for text to speech
from IPython.display import Audio
# TTS with on-the-fly voice conversion
api = TTS("tts_models/deu/fairseq/vits")
api.tts_with_vc_to_file(
    text="Wie sage ich auf Italienisch, dass ich dich liebe?",  # "How do I say in Italian that I love you?"
    speaker_wav="speaker.webm",
    file_path="output.wav"  # fixed typo "ouptut.wav", which made the playback below load a stale file
)
audio_path = "output.wav"
Audio(audio_path)
# model test 3 for text to speech
from TTS.api import TTS
tts = TTS("tts_models/multilingual/multi-dataset/xtts_v1", gpu=True)
from IPython.display import Audio
# generate speech by cloning a voice using custom settings
tts.tts_to_file(text="But for me to rap like a computer it must be in my genes I got a laptop in my back pocket My pen'll go off when I half-cock it Got a fat knot from that rap profit Made a livin' and a killin' off it Ever since Bill Clinton was still in office with Monica Lewinsky feelin' on his nutsack I'm an MC still as honest",
                file_path="output.wav",
                speaker_wav="Slide 1.m4a",
                language="en",
                emotion="neutral",
                decoder_iterations=35)
audio_path = "output.wav"
Audio(audio_path)
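# Coqui Studio voices are served through Coqui's hosted API and require an
# account token, which the library reads from the COQUI_STUDIO_TOKEN
# environment variable; the placeholder below is hypothetical and must be
# replaced with a real token before the next cell can run.
import os
os.environ.setdefault("COQUI_STUDIO_TOKEN", "<your-coqui-studio-token>")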
# init TTS with the target studio speaker
OUTPUT_PATH = "output.wav"  # OUTPUT_PATH was undefined in the original cell
tts = TTS(model_name="coqui_studio/en/Torcull Diarmuid/coqui_studio", progress_bar=False)
# run TTS
tts.tts_to_file(text="This is a test.", file_path=OUTPUT_PATH)
# run TTS with emotion and speed control
tts.tts_to_file(text="This is a test.", file_path=OUTPUT_PATH, emotion="Happy", speed=1.5)
# model test 4 for text to speech
from IPython.display import Audio
from TTS.api import TTS
#api = TTS(model_name="tts_models/eng/fairseq/vits").to("cuda")
#api.tts_to_file("This is a test.", file_path="output.wav")
# TTS with on-the-fly voice conversion
api = TTS("tts_models/deu/fairseq/vits")
api.tts_with_vc_to_file(
    "I am a basic human",
    speaker_wav="speaker.webm",
    file_path="output.wav"
)
audio_path = "output.wav"
Audio(audio_path)
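# Related variant: tts_with_vc_to_file chains synthesis and FreeVC voice
# conversion internally, but the conversion step can also run on its own over
# an existing recording. The FreeVC model name below comes from the Coqui model
# zoo, and the file names simply reuse outputs produced above.
vc = TTS("voice_conversion_models/multilingual/vctk/freevc24")
vc.voice_conversion_to_file(source_wav="output.wav",
                            target_wav="speaker.webm",
                            file_path="converted.wav")
Audio("converted.wav")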