levanti_en_ar / tts.py
Guy Mor-Lan
add files
e35836c
raw
history blame
3.46 kB
#%%
import azure.cognitiveservices.speech as speechsdk
import re
import os
import hashlib
import random
from dotenv import load_dotenv
# Pull the Azure Speech credentials from the local .env file into os.environ.
load_dotenv(".env")
# Report whether the key is configured without echoing the secret itself:
# anything printed here ends up in console output and logs.
print('SPEECH_KEY is set' if os.environ.get('SPEECH_KEY') else 'SPEECH_KEY is not set')
print(os.environ.get('SPEECH_REGION'))
# Shared synthesizer configuration, reused by every get_audio() call.
speech_config = speechsdk.SpeechConfig(subscription=os.environ.get('SPEECH_KEY'),
                                       region=os.environ.get('SPEECH_REGION'))
def do_cleanup(dir='wavs', num_files=100):
    """Trim a directory so that at most ``num_files`` regular files remain.

    The oldest files (by modification time) are deleted first, so the most
    recently written files always survive. Subdirectories are neither
    counted nor removed.

    Args:
        dir: Directory to trim. (The name shadows the builtin but is kept
            because it is part of the public signature.)
        num_files: Maximum number of regular files to keep.

    Raises:
        FileNotFoundError: If ``dir`` does not exist.
    """
    with os.scandir(dir) as entries:
        files = [entry for entry in entries if entry.is_file()]

    excess = len(files) - num_files
    if excess <= 0:
        return

    # os.scandir/os.listdir yield entries in arbitrary order, so sort
    # explicitly: oldest first, with the name as a deterministic tie-breaker.
    files.sort(key=lambda entry: (entry.stat().st_mtime, entry.name))
    for entry in files[:excess]:
        os.remove(entry.path)
def add_sukun(text):
    """Append a sukun to every word that ends in a bare Arabic letter.

    The text is split on whitespace. For each word, any run of trailing
    punctuation is set aside, and if the character before it is one of the
    listed Arabic letters (or a shadda) a sukun is inserted right after that
    character. Words that already end in a sukun or any other mark, and
    non-Arabic words, are left untouched.

    Args:
        text: Input string.

    Returns:
        The processed words joined by single spaces (runs of whitespace in
        the input are therefore collapsed, and an all-whitespace or empty
        input yields an empty string).
    """
    arabic_letters = 'اأإآةبتثجحخدذرزسشصضطظعغفقكلمنهوي'
    shadda = '\u0651'
    sukun = '\u0652'
    punctuation = '.,;!?،؛؟'
    # A word-final shadda takes a sukun too, so treat it like a letter.
    sukun_targets = frozenset(arabic_letters + shadda)

    def process_word(word):
        # Peel off *all* trailing punctuation ("...", "?!") so the sukun
        # lands on the last real character of the word.
        core = word.rstrip(punctuation)
        if core and core[-1] in sukun_targets:
            return core + sukun + word[len(core):]
        return word

    # Each whitespace-delimited token is a word with its punctuation attached.
    words = re.findall(r'\S+', text)
    return ' '.join(process_word(word) for word in words)
def get_ssml(text, voice='de-DE-SeraphinaMultilingualNeural'):
    """Wrap plain text in an SSML document for the given voice.

    The text is placed inside a ``<lang xml:lang="ar-SA">`` element nested in
    a ``<voice>`` element. Both ``text`` and ``voice`` are XML-escaped, so
    characters such as ``&`` or ``<`` in the input cannot produce malformed
    SSML. As a consequence ``text`` is treated as plain text, not as markup.

    Args:
        text: Plain text to be spoken.
        voice: Value for the ``name`` attribute of the ``<voice>`` element.

    Returns:
        The SSML document as a single-line string.
    """
    # Function-scope import keeps this block self-contained.
    from xml.sax.saxutils import escape

    safe_text = escape(text)
    # The voice sits inside a double-quoted attribute, so quotes need escaping too.
    safe_voice = escape(voice, {'"': '&quot;'})
    return f'<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" xml:lang="ar-SA"><voice name="{safe_voice}"><lang xml:lang="ar-SA">{safe_text}</lang></voice></speak>'
def get_audio(input_text, voice='de-DE-FlorianMultilingualNeural', use_ssml=True):
    """Synthesize speech for ``input_text`` and return the path of the wav file.

    The text is first passed through ``add_sukun``. Results are cached on
    disk under ``wavs/``; the cache key covers the processed text, the voice
    and the synthesis mode, so a request for a different voice never returns
    audio cached for another one.

    Args:
        input_text: Text to synthesize.
        voice: Voice name used in the SSML document. It is not applied when
            ``use_ssml`` is false, because that path calls
            ``speak_text_async`` on the shared config without setting a voice.
        use_ssml: When true, synthesize from an SSML document built by
            ``get_ssml``; otherwise synthesize the text directly.

    Returns:
        ``"wavs/<md5>.wav"``. The path is returned even when synthesis did
        not complete; in that case any file left at the path is removed
        (best effort) so the failure is not cached and a later call retries.
    """
    input_text = add_sukun(input_text)
    # Key on everything that influences the audio, not just the text.
    cache_key = f"{voice}|{use_ssml}|{input_text}"
    digest = hashlib.md5(cache_key.encode()).hexdigest()
    wav_path = f"wavs/{digest}.wav"
    if os.path.exists(wav_path):
        return wav_path

    # The output directory may not exist yet on a fresh checkout.
    os.makedirs("wavs", exist_ok=True)
    audio_config = speechsdk.audio.AudioOutputConfig(filename=wav_path)
    speech_config.set_speech_synthesis_output_format(
        speechsdk.SpeechSynthesisOutputFormat.Riff24Khz16BitMonoPcm
    )
    speech_synthesizer = speechsdk.SpeechSynthesizer(speech_config=speech_config,
                                                     audio_config=audio_config)
    if use_ssml:
        ssml = get_ssml(input_text, voice=voice)
        result = speech_synthesizer.speak_ssml_async(ssml).get()
    else:
        result = speech_synthesizer.speak_text_async(input_text).get()

    succeeded = result.reason == speechsdk.ResultReason.SynthesizingAudioCompleted
    if succeeded:
        print("Speech synthesized for text [{}]".format(input_text))
    elif result.reason == speechsdk.ResultReason.Canceled:
        cancellation_details = result.cancellation_details
        print("Speech synthesis canceled: {}".format(cancellation_details.reason))
        if cancellation_details.reason == speechsdk.CancellationReason.Error:
            print("Error details: {}".format(cancellation_details.error_details))

    if not succeeded and os.path.exists(wav_path):
        # A file left behind by a failed run would satisfy the cache check
        # above forever. Removal is best effort: the file may still be held
        # open on some platforms, in which case behaviour is as before.
        try:
            os.remove(wav_path)
        except OSError:
            pass

    # Roughly once every 50 calls, trim the wavs folder.
    if random.randint(1, 50) == 1:
        do_cleanup()
    return wav_path