Spaces:
Running
Running
import base64
import hashlib
import json
import os
import re
import time
import uuid

import gradio as gr
import langid
import requests

import hash_code_for_cached_output
# Remote synthesis service configuration, read from the environment.
# `predict` POSTs the job to API_URL, polls RESULT_URL for the outcome, and
# sends TOKEN as a Bearer token. Each is None when its variable is unset.
API_URL = os.environ.get("API_URL")
TOKEN = os.environ.get("TOKEN")
RESULT_URL = os.environ.get("RESULT_URL")

# Language codes (as returned by `langid.classify`) that the demo accepts.
supported_languages = ['zh', 'en', 'ja', 'ko', 'es', 'fr']

# Styles accepted for each detected language.
# Every value is a list so that `style in supported_styles[lang]` is always an
# exact-match membership test; with a bare string it would silently become a
# substring test (and raise TypeError for a None style).
supported_styles = {
    'zh': ["zh_default"],
    'en': [
        "en_default",
        "en_us",
        "en_br",
        "en_au",
        "en_in",
    ],
    "es": ["es_default"],
    "fr": ["fr_default"],
    "ja": ["jp_default"],
    "ko": ["kr_default"],
}

# Directory that receives synthesised audio files; created at import time.
output_dir = 'outputs'
os.makedirs(output_dir, exist_ok=True)
def audio_to_base64(audio_file):
    """Return the contents of the file at path `audio_file` as a base64 string.

    The file is read in binary mode and the base64 bytes are decoded as UTF-8.
    """
    with open(audio_file, "rb") as handle:
        raw_bytes = handle.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode("utf-8")
def count_chars_words(sentence):
    """Return the length of `sentence` as Chinese characters plus other words.

    Each character in the range U+4E00..U+9FA5 counts as one unit, and each
    maximal run of the remaining word characters (letters, digits, underscore,
    including scripts outside that range such as kana or hangul) counts as one
    unit. Punctuation and whitespace only act as separators.

    The second alternative explicitly excludes the Chinese range: `\\w` alone
    also matches those characters, which would let Chinese text that directly
    follows a Latin word (e.g. "intelligence领域") be swallowed into a single
    "word" and undercounted.
    """
    units = re.findall(r'[\u4e00-\u9fa5]|[^\W\u4e00-\u9fa5]+', sentence)
    return len(units)
# Bounds on the prompt length as measured by `count_chars_words`.
_MIN_PROMPT_LENGTH = 2
_MAX_PROMPT_LENGTH = 50

# Per-request timeout for calls to the synthesis service, in seconds.
_REQUEST_TIMEOUT_S = 60
# Give up polling for a result after this many seconds.
# NOTE(review): chosen generously; confirm against real job durations.
_MAX_POLL_SECONDS = 600
# Pause between two polls of the result endpoint, in seconds.
_POLL_INTERVAL_S = 1

# Pre-generated audio for the example table / default inputs, keyed by the code
# returned from `hash_code_for_cached_output.get_unique_code`. This exists only
# to make the demo respond quickly; the hashes were taken from the gradio console.
_CACHED_OUTPUTS = {
    "af39e1f1ff_60565a5c20_en_us": "cached_outputs/0.wav",
    "af39e1f1ff_420ab8211d_en_us": "cached_outputs/1.wav",
    "ced034cc22_0f96bf44f5_es_default": "cached_outputs/2.wav",
    "d3172b178d_3fef5adc6f_zh_default": "cached_outputs/3.wav",
    "cda6998e1a_9897b60a4e_jp_default": "cached_outputs/4.wav",
}

_SERVICE_ERROR = "[HTTP ERROR] Failed to reach the synthesis service, please try again later. \n"
_SERVICE_TIMEOUT = "[HTTP ERROR] The synthesis task did not finish in time, please try again later. \n"


def _error_result(text_hint, hint_line, warning):
    """Show `warning` as a toast and return the no-audio result with `hint_line` appended."""
    gr.Warning(warning)
    return (text_hint + hint_line, None, None)


def predict(prompt, style, audio_file_pth, speed, agree):
    """Synthesise `prompt` in the voice of the reference audio via the remote service.

    Args:
        prompt: Text to speak.
        style: Style identifier selected in the UI (e.g. "en_us").
        audio_file_pth: Path of the reference speaker audio file.
        speed: Speed value forwarded unchanged to the service.
        agree: Whether the user accepted the terms.

    Returns:
        A tuple `(info_text, synthesised_audio_path, reference_audio_path)`.
        On any validation or service error the two paths are None and
        `info_text` describes the problem.
    """
    text_hint = ''

    # The terms must be accepted before anything else happens.
    if not agree:
        return _error_result(
            text_hint,
            '[ERROR] Please accept the Terms & Condition!\n',
            "Please accept the Terms & Condition!",
        )

    # Without a reference clip there is nothing to clone; reading it further
    # down would otherwise fail inside open().
    if not audio_file_pth:
        return _error_result(
            text_hint,
            '[ERROR] Please provide a reference audio!\n',
            "Please provide a reference audio!",
        )

    # Serve the cached audio when the request matches an example / default input.
    unique_code = hash_code_for_cached_output.get_unique_code(audio_file_pth, prompt, style)
    print("audio_file_pth is", audio_file_pth)
    print("unique_code is", unique_code)
    cached_path = _CACHED_OUTPUTS.get(unique_code)
    if cached_path is not None:
        return (
            'We get the cached output for you, since you are trying to generate an example cloning.',
            cached_path,
            audio_file_pth,
        )

    # Detect the language of the input text.
    language_predicted = langid.classify(prompt)[0].strip()
    print(f"Detected language:{language_predicted}")
    if language_predicted not in supported_languages:
        unsupported = (
            f"The detected language {language_predicted} for your input text "
            f"is not in our Supported Languages: {supported_languages}"
        )
        return _error_result(text_hint, f"[ERROR] {unsupported}\n", unsupported)

    # A style that does not belong to the detected language only produces a
    # warning; the request still goes through.
    language_styles = supported_styles[language_predicted]
    # Accept a bare string entry too, so membership is always an exact match
    # rather than a substring test.
    allowed_styles = [language_styles] if isinstance(language_styles, str) else language_styles
    if style not in allowed_styles:
        style_warning = (
            f"[Warning] The style {style} is not supported for detected language {language_predicted}. "
            f"For language {language_predicted}, we support styles: {language_styles}. "
            "Using the wrong style may result in unexpected behavior."
        )
        text_hint += style_warning + "\n"
        gr.Warning(style_warning)

    prompt_length = count_chars_words(prompt)
    speaker_wav = audio_file_pth

    if prompt_length < _MIN_PROMPT_LENGTH:
        return _error_result(
            text_hint,
            "[ERROR] Please give a longer prompt text \n",
            "Please give a longer prompt text",
        )

    if prompt_length > _MAX_PROMPT_LENGTH:
        too_long = (
            "Text length limited to 50 words for this demo, please try shorter text. "
            "You can clone our open-source repo or try it on our website "
            "https://app.myshell.ai/robot-workshop/widget/174760057433406749"
        )
        return _error_result(text_hint, f"[ERROR] {too_long} \n", too_long)

    # One file per request: the queue serves several requests concurrently, and
    # a shared file name would let them overwrite each other's audio.
    save_path = f'{output_dir}/output_{uuid.uuid4().hex}.wav'
    speaker_audio_base64 = audio_to_base64(speaker_wav)

    if style == 'en_us':  # the US accent is served by the updated 'en_newest' voice
        style = 'en_newest'

    data = {
        "text": prompt,
        "reference_speaker": speaker_audio_base64,
        "language": style,
        "speed": speed,
    }
    headers = {
        "Authorization": f"Bearer {TOKEN}"
    }

    # Submit the synthesis task. Exception details are only printed server-side
    # because they can contain the service URL.
    start = time.time()
    try:
        response = requests.post(API_URL, json=data, headers=headers, timeout=_REQUEST_TIMEOUT_S)
        task_id = response.json()['task_id']
    except (requests.RequestException, ValueError, KeyError, TypeError) as err:
        print(f"Failed to submit task: {err!r}")
        return _error_result(text_hint, _SERVICE_ERROR, _SERVICE_ERROR)
    print(f'Get response successfully within {time.time() - start}')

    # Poll until the task leaves the CREATED/RUNNING states or the deadline passes.
    deadline = time.monotonic() + _MAX_POLL_SECONDS
    while True:
        try:
            response = requests.post(
                RESULT_URL,
                json={'task_id': task_id},
                headers=headers,
                timeout=_REQUEST_TIMEOUT_S,
            )
            json_data = response.json()
            status = json_data['status']
        except (requests.RequestException, ValueError, KeyError, TypeError) as err:
            print(f"Failed to poll task {task_id}: {err!r}")
            return _error_result(text_hint, _SERVICE_ERROR, _SERVICE_ERROR)

        if status in ("CREATED", "RUNNING"):
            if time.monotonic() >= deadline:
                return _error_result(text_hint, _SERVICE_TIMEOUT, _SERVICE_TIMEOUT)
            time.sleep(_POLL_INTERVAL_S)
            continue

        if status == 'FAILED':
            failure = f"[HTTP ERROR] {json_data.get('error', 'unknown error')} \n"
            return _error_result(text_hint, failure, failure)

        # Any other status is treated as success: the audio arrives base64-encoded.
        try:
            decoded_bytes = base64.b64decode(json_data['result']['base64'].encode('utf-8'))
        except (KeyError, TypeError, AttributeError, ValueError) as err:
            print(f"Malformed result for task {task_id}: {err!r}")
            return _error_result(text_hint, _SERVICE_ERROR, _SERVICE_ERROR)

        with open(save_path, 'wb') as f:
            f.write(decoded_bytes)
        text_hint += '''Get response successfully \n'''
        return (
            text_hint,
            save_path,
            speaker_wav,
        )
# ---------------------------------------------------------------------------
# Static text shown on the demo page.
# ---------------------------------------------------------------------------

title = "MyShell OpenVoice V2"

# Introductory paragraph about OpenVoice V1 (rendered as Markdown).
description = """
In December 2023, we released [OpenVoice V1](https://huggingface.co/spaces/myshell-ai/OpenVoice), an instant voice cloning approach that replicates a speaker's voice and generates speech in multiple languages using only a short audio clip. OpenVoice V1 enables granular control over voice styles, replicates the tone color of the reference speaker and achieves zero-shot cross-lingual voice cloning.
"""

# Summary of what OpenVoice V2 adds (rendered as Markdown).
description_v2 = """
In April 2024, we released **OpenVoice V2**, which includes all features in V1 and has:
- **Better Audio Quality**. OpenVoice V2 adopts a different training strategy that delivers better audio quality.
- **Native Multi-lingual Support**. English, Spanish, French, Chinese, Japanese and Korean are natively supported in OpenVoice V2.
- **Free Commercial Use**. Starting from April 2024, both V2 and V1 are released under MIT License. Free for commercial use.
"""

# Three-column link table (repo / project page / community).
markdown_table = """
<div align="center" style="margin-bottom: 10px;">
| | | |
| :-----------: | :-----------: | :-----------: |
| **OpenSource Repo** | **Project Page** | **Join the Community** |
| <div style='text-align: center;'><a style="display:inline-block,align:center" href='https://github.com/myshell-ai/OpenVoice'><img src='https://img.shields.io/github/stars/myshell-ai/OpenVoice?style=social' /></a></div> | [OpenVoice](https://research.myshell.ai/open-voice) | [![Discord](https://img.shields.io/discord/1122227993805336617?color=%239B59B6&label=%20Discord%20)](https://discord.gg/myshell) |
</div>
"""

# Link table variant used by the V2 page header.
markdown_table_v2 = """
<div align="center" style="margin-bottom: 2px;">
| | | | |
| :-----------: | :-----------: | :-----------: | :-----------: |
| **Github Repo** | <div style='text-align: center;'><a style="display:inline-block,align:center" href='https://github.com/myshell-ai/OpenVoice'><img src='https://img.shields.io/github/stars/myshell-ai/OpenVoice?style=social' /></a></div> | **Project Page** | [OpenVoice](https://research.myshell.ai/open-voice) |
| | |
| :-----------: | :-----------: |
**Join the Community** | [![Discord](https://img.shields.io/discord/1122227993805336617?color=%239B59B6&label=%20Discord%20)](https://discord.gg/myshell) |
</div>
"""

# Pointers to the Q&A page and the self-hosting notebook.
content = """
<div>
<strong>If the generated voice does not sound like the reference voice, please refer to <a href='https://github.com/myshell-ai/OpenVoice/blob/main/docs/QA.md'>this QnA</a>.</strong> <strong>If you want to deploy the model by yourself and perform inference, please refer to <a href='https://github.com/myshell-ai/OpenVoice/blob/main/demo_part3.ipynb'>this jupyter notebook</a>.</strong>
</div>
"""

# `content` framed by a thin bordered box, passed to gr.HTML.
_BOX_OPEN = "<div style='border: 1px solid #000; padding: 10px;'>"
_BOX_CLOSE = "</div>"
wrapped_markdown_content = _BOX_OPEN + content + _BOX_CLOSE
# Rows for the examples table. Column order matches the `inputs` given to
# gr.Examples: prompt text, style, reference audio path, terms accepted.
examples = [
    ["Did you ever hear a folk tale about a giant turtle?", 'en_us', "examples/speaker0.mp3", True],
    [
        "El resplandor del sol acaricia las olas, pintando el cielo con una paleta deslumbrante.",
        'es_default',
        "examples/speaker1.mp3",
        True,
    ],
    [
        "我最近在学习machine learning,希望能够在未来的artificial intelligence领域有所建树。",
        'zh_default',
        "examples/speaker2.mp3",
        True,
    ],
    ["彼は毎朝ジョギングをして体を健康に保っています。", 'jp_default', "examples/speaker3.mp3", True],
]
# Page layout: header (logo, links, description, video), V2 notes, then the
# input form on the left and the results on the right.
with gr.Blocks(analytics_enabled=False) as demo:

    with gr.Row():
        with gr.Column():
            with gr.Row():
                gr.Markdown(
                    """
                    ## <img src="https://huggingface.co/spaces/myshell-ai/OpenVoice/raw/main/logo.jpg" height="40"/>
                    """
                )
            with gr.Row():
                gr.Markdown(markdown_table_v2)
            with gr.Row():
                gr.Markdown(description)
        with gr.Column():
            gr.Video('./openvoicev2.mp4', autoplay=True)

    with gr.Row():
        gr.Markdown(description_v2)

    with gr.Row():
        gr.HTML(wrapped_markdown_content)

    with gr.Row():
        with gr.Column():
            input_text_gr = gr.Textbox(
                label="Text Prompt",
                # Matches the limit enforced in `predict` (50 words / Chinese characters).
                info="One or two sentences at a time is better. Up to 50 words (or Chinese characters).",
                value="The bustling city square bustled with street performers, tourists, and local vendors.",
            )
            style_gr = gr.Dropdown(
                label="Style",
                info="Select a style of output audio for the synthesised speech. (Chinese only support 'default' now)",
                choices=["en_default", "en_us", "en_br", "en_au", "en_in", "es_default", "fr_default", "jp_default", "zh_default", "kr_default",],
                max_choices=1,
                value="en_us",
            )
            ref_gr = gr.Audio(
                label="Reference Audio",
                info="Click on the ✎ button to upload your own target speaker audio",
                type="filepath",
                value="examples/speaker0.mp3",
            )
            # `predict` takes a `speed` argument between the reference audio and
            # the terms checkbox; without a component for it the click handler
            # would pass only four values to a five-parameter function.
            # NOTE(review): the 0.5-2.0 range is an assumption; confirm the
            # values the synthesis service accepts.
            speed_gr = gr.Slider(
                label="Speed",
                info="Speed of the synthesised speech (1.0 is normal).",
                minimum=0.5,
                maximum=2.0,
                step=0.1,
                value=1.0,
            )
            tos_gr = gr.Checkbox(
                label="Agree",
                value=False,
                info="I agree to the terms of the MIT license-: https://github.com/myshell-ai/OpenVoice/blob/main/LICENSE",
            )

            tts_button = gr.Button("Send", elem_id="send-btn", visible=True)

        with gr.Column():
            out_text_gr = gr.Text(label="Info")
            audio_gr = gr.Audio(label="Synthesised Audio", autoplay=True)
            ref_audio_gr = gr.Audio(label="Reference Audio Used")

            # Clicking an example only fills the four listed inputs; with
            # cache_examples=False the function is not run for the examples.
            gr.Examples(
                examples,
                label="Examples",
                inputs=[input_text_gr, style_gr, ref_gr, tos_gr],
                outputs=[out_text_gr, audio_gr, ref_audio_gr],
                fn=predict,
                cache_examples=False,
            )
            # Input order must follow predict(prompt, style, audio_file_pth, speed, agree).
            tts_button.click(
                predict,
                [input_text_gr, style_gr, ref_gr, speed_gr, tos_gr],
                outputs=[out_text_gr, audio_gr, ref_audio_gr],
            )

# Serve up to six requests concurrently, then start the app.
demo.queue(concurrency_count=6)
demo.launch(debug=True, show_api=True)