# OpenVoiceV2
#
# Sleeping
#
# File size: 12,620 Bytes

import os
import gradio as gr
import requests
import langid
import base64
import json
import time
import re
import hashlib
import hash_code_for_cached_output


# Backend endpoints and credentials come from the environment; each value is
# None when its variable is not set.
API_URL = os.environ.get("API_URL")        # endpoint a synthesis task is posted to
TOKEN = os.environ.get("TOKEN")            # bearer token sent with backend requests
RESULT_URL = os.environ.get("RESULT_URL")  # endpoint polled for a task's result

# Language codes accepted for the input text (compared against the code
# returned by langid).
supported_languages = ['zh', 'en', 'ja', 'ko', 'es', 'fr']

# Styles accepted for each language code. Every value is a list so that
# `style in supported_styles[lang]` is always an exact-membership test; with a
# bare string value the same expression silently becomes a substring test
# (e.g. "zh" in "zh_default" is True).
supported_styles = {
    'zh': ["zh_default"],
    'en': [
        "en_default",
        "en_us",
        "en_br",
        "en_au",
        "en_in",
    ],
    "es": ["es_default"],
    "fr": ["fr_default"],
    "ja": ["jp_default"],
    "ko": ["kr_default"],
}

# Directory that receives synthesised audio; created at import time.
output_dir = 'outputs'
os.makedirs(output_dir, exist_ok=True)

def audio_to_base64(audio_file):
    """Return the contents of the file at ``audio_file`` as base64 text.

    The file is opened in binary mode and read in one go; the base64 bytes
    are then decoded to a ``str`` using UTF-8.
    """
    with open(audio_file, "rb") as stream:
        raw_bytes = stream.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode("utf-8")

def count_chars_words(sentence):
    """Return the length of ``sentence`` in "Han characters plus words".

    Every character in the range U+4E00..U+9FA5 counts as one unit. Every
    other maximal run of word characters (``\\w`` minus that range, so Latin
    words, digits, underscores, kana, hangul, ...) counts as one unit.
    Punctuation and whitespace are ignored.

    A word run deliberately stops at the first Han character. ``\\w`` on its
    own also matches Han characters, so a token such as "intelligence领域"
    would otherwise be swallowed as a single word and its Han characters
    would go uncounted.
    """
    han_chars = 0
    words = 0
    for token in re.findall(r'[\u4e00-\u9fa5]+|[^\W\u4e00-\u9fa5]+', sentence):
        # A token is homogeneous, so its first character tells which
        # alternative of the pattern produced it.
        if '\u4e00' <= token[0] <= '\u9fa5':
            han_chars += len(token)
        else:
            words += 1
    return han_chars + words

# Per-request timeout (seconds) for every HTTP call made to the backend.
_BACKEND_TIMEOUT_S = 60
# Longest time (seconds) a task may stay in CREATED/RUNNING before the demo
# gives up on it instead of polling forever.
_MAX_POLL_S = 300


def _run_remote_task(payload, headers):
    """Submit ``payload`` to API_URL and poll RESULT_URL until the task ends.

    Returns:
        The decoded JSON body of the first poll response whose ``status`` is
        neither "CREATED" nor "RUNNING".

    Raises:
        requests.RequestException: connection failure or request timeout.
        ValueError: a response body that is not valid JSON.
        KeyError: a response without the expected ``task_id``/``status`` key.
        TimeoutError: the task is still pending after ``_MAX_POLL_S`` seconds.
    """
    started = time.time()
    response = requests.post(
        API_URL, json=payload, headers=headers, timeout=_BACKEND_TIMEOUT_S
    )
    task_id = response.json()['task_id']
    print(f'Get response successfully within {time.time() - started}')

    deadline = time.monotonic() + _MAX_POLL_S
    while True:
        response = requests.post(
            RESULT_URL,
            json={'task_id': task_id},
            headers=headers,
            timeout=_BACKEND_TIMEOUT_S,
        )
        json_data = response.json()
        if json_data['status'] not in ("CREATED", "RUNNING"):
            return json_data
        if time.monotonic() >= deadline:
            raise TimeoutError(f"task {task_id} still pending after {_MAX_POLL_S}s")
        time.sleep(1)


def predict(prompt, style, audio_file_pth, speed, agree):
    """Synthesise ``prompt`` in the voice of the reference audio.

    Steps: check the terms checkbox and the reference audio, return a cached
    clip when the request matches a known example, detect the prompt language
    with langid, validate style and prompt length, then send the job to the
    remote backend, wait for it and write the returned audio to ``output_dir``.

    Args:
        prompt: Text to synthesise.
        style: Style identifier; sent to the backend as ``language``
            ("en_us" is sent as "en_newest").
        audio_file_pth: Path of the reference speaker audio file.
        speed: Value forwarded unchanged to the backend as ``speed``.
        agree: Truthy when the user accepted the terms.

    Returns:
        A 3-tuple ``(info_text, synthesised_audio_path, reference_audio_path)``.
        When validation or the backend fails, the last two items are ``None``
        and ``info_text`` carries the error message.
    """
    # Function-scope import: only this function needs it, so the module's
    # import header is left untouched.
    import uuid

    text_hint = ''

    # Any falsy value (False, None) counts as "terms not accepted".
    if not agree:
        text_hint += '[ERROR] Please accept the Terms & Condition!\n'
        gr.Warning("Please accept the Terms & Condition!")
        return (text_hint, None, None)

    # Without a reference clip there is nothing to clone; bail out before the
    # path is hashed or opened.
    if not audio_file_pth:
        text_hint += '[ERROR] Please provide a reference audio!\n'
        gr.Warning("Please provide a reference audio!")
        return (text_hint, None, None)

    # Before inference, detect whether the request is one of the example-table
    # entries or the default values; if so a pre-rendered clip is returned.
    # This is only for demo efficiency. The keys were generated with
    # `hash_code_for_cached_output.py` (hash values read from the gradio console).
    cached_outputs = {
        "af39e1f1ff_60565a5c20_en_us" : "cached_outputs/0.wav",
        "af39e1f1ff_420ab8211d_en_us" : "cached_outputs/1.wav",
        "ced034cc22_0f96bf44f5_es_default" : "cached_outputs/2.wav",
        "d3172b178d_3fef5adc6f_zh_default" : "cached_outputs/3.wav",
        "cda6998e1a_9897b60a4e_jp_default" : "cached_outputs/4.wav"
    }
    unique_code = hash_code_for_cached_output.get_unique_code(audio_file_pth, prompt, style)
    print("audio_file_pth is", audio_file_pth)
    print("unique_code is", unique_code)
    if unique_code in cached_outputs:
        return (
            'We get the cached output for you, since you are trying to generate an example cloning.',
            cached_outputs[unique_code],
            audio_file_pth,
        )

    # Detect the language of the input text.
    language_predicted = langid.classify(prompt)[0].strip()
    print(f"Detected language:{language_predicted}")

    if language_predicted not in supported_languages:
        text_hint += f"[ERROR] The detected language {language_predicted} for your input text is not in our Supported Languages: {supported_languages}\n"
        gr.Warning(
            f"The detected language {language_predicted} for your input text is not in our Supported Languages: {supported_languages}"
        )
        return (text_hint, None, None)

    # Check the style. A table entry may be a bare string; wrap it so the
    # test below is exact membership rather than a substring match.
    allowed_styles = supported_styles[language_predicted]
    if isinstance(allowed_styles, str):
        allowed_styles = [allowed_styles]
    if style not in allowed_styles:
        # Only a warning: the request is still sent with the chosen style.
        style_warning = f"[Warning] The style {style} is not supported for detected language {language_predicted}. For language {language_predicted}, we support styles: {supported_styles[language_predicted]}. Using the wrong style may result in unexpected behavior."
        text_hint += style_warning + "\n"
        gr.Warning(style_warning)

    prompt_length = count_chars_words(prompt)
    speaker_wav = audio_file_pth

    if prompt_length < 2:
        text_hint += "[ERROR] Please give a longer prompt text \n"
        gr.Warning("Please give a longer prompt text")
        return (text_hint, None, None)
    if prompt_length > 50:
        text_hint += "[ERROR] Text length limited to 50 words for this demo, please try shorter text. You can clone our open-source repo or try it on our website https://app.myshell.ai/robot-workshop/widget/174760057433406749 \n"
        gr.Warning(
            "Text length limited to 50 words for this demo, please try shorter text. You can clone our open-source repo or try it on our website https://app.myshell.ai/robot-workshop/widget/174760057433406749"
        )
        return (text_hint, None, None)

    speaker_audio_base64 = audio_to_base64(speaker_wav)
    if style == 'en_us':  # the US accent is served under a newer backend name
        style = 'en_newest'
    data = {
        "text": prompt,
        "reference_speaker": speaker_audio_base64,
        "language": style,
        "speed": speed
    }
    headers = {
        "Authorization": f"Bearer {TOKEN}"
    }

    try:
        json_data = _run_remote_task(data, headers)
        task_failed = json_data['status'] == 'FAILED'
        if task_failed:
            failure_detail = json_data['error']
        else:
            decoded_bytes = base64.b64decode(json_data['result']['base64'].encode('utf-8'))
    except (requests.RequestException, ValueError, KeyError, TypeError, TimeoutError) as err:
        # Details go to the console only: the exception text can contain the
        # backend URL, which should not be shown to end users.
        print(f"Backend request failed: {err!r}")
        backend_error = "[HTTP ERROR] The synthesis service is unavailable or returned an unexpected response, please try again later. \n"
        text_hint += backend_error
        gr.Warning(backend_error)
        return (text_hint, None, None)

    if task_failed:
        text_hint += f"[HTTP ERROR] {failure_detail} \n"
        gr.Warning(
            f"[HTTP ERROR] {failure_detail} \n"
        )
        return (text_hint, None, None)

    # One file per request: several requests can be processed concurrently,
    # and a shared file name would let one user's audio overwrite another's.
    save_path = f'{output_dir}/output_{uuid.uuid4().hex}.wav'
    with open(save_path, 'wb') as f:
        f.write(decoded_bytes)

    text_hint += 'Get response successfully \n'
    return (
        text_hint,
        save_path,
        speaker_wav,
    )


# Display title of the demo.
# NOTE(review): not referenced by the UI code visible in this file.
title = "MyShell OpenVoice V2"

# Markdown blurb about OpenVoice V1; rendered with gr.Markdown in the header.
description = """
In December 2023, we released [OpenVoice V1](https://huggingface.co/spaces/myshell-ai/OpenVoice), an instant voice cloning approach that replicates a speaker's voice and generates speech in multiple languages using only a short audio clip. OpenVoice V1 enables granular control over voice styles, replicates the tone color of the reference speaker and achieves zero-shot cross-lingual voice cloning.
"""

# Markdown blurb listing what V2 adds; rendered in its own row below the header.
description_v2 = """
In April 2024, we released **OpenVoice V2**, which includes all features in V1 and has: 
 - **Better Audio Quality**. OpenVoice V2 adopts a different training strategy that delivers better audio quality. 
 - **Native Multi-lingual Support**. English, Spanish, French, Chinese, Japanese and Korean are natively supported in OpenVoice V2. 
 - **Free Commercial Use**. Starting from April 2024, both V2 and V1 are released under MIT License. Free for commercial use.
"""

# Three-column links table (repo / project page / community).
# NOTE(review): not referenced by the UI code visible in this file; the UI
# renders `markdown_table_v2` instead.
markdown_table = """
<div align="center" style="margin-bottom: 10px;">

|               |               |               |
| :-----------: | :-----------: | :-----------: | 
| **OpenSource Repo** | **Project Page** | **Join the Community** |        
| <div style='text-align: center;'><a style="display:inline-block,align:center" href='https://github.com/myshell-ai/OpenVoice'><img src='https://img.shields.io/github/stars/myshell-ai/OpenVoice?style=social' /></a></div> | [OpenVoice](https://research.myshell.ai/open-voice) | [![Discord](https://img.shields.io/discord/1122227993805336617?color=%239B59B6&label=%20Discord%20)](https://discord.gg/myshell) |

</div>
"""

# Links table rendered with gr.Markdown in the header column.
markdown_table_v2 = """
<div align="center" style="margin-bottom: 2px;">

|               |               |               |              |
| :-----------: | :-----------: | :-----------: | :-----------: | 
| **Github Repo** | <div style='text-align: center;'><a style="display:inline-block,align:center" href='https://github.com/myshell-ai/OpenVoice'><img src='https://img.shields.io/github/stars/myshell-ai/OpenVoice?style=social' /></a></div> |  **Project Page** |  [OpenVoice](https://research.myshell.ai/open-voice) |     

| | |
| :-----------: | :-----------: |
**Join the Community** |   [![Discord](https://img.shields.io/discord/1122227993805336617?color=%239B59B6&label=%20Discord%20)](https://discord.gg/myshell) |

</div>
"""
# HTML notice pointing to the QnA document and the self-hosting notebook.
content = """
<div>
  <strong>If the generated voice does not sound like the reference voice, please refer to <a href='https://github.com/myshell-ai/OpenVoice/blob/main/docs/QA.md'>this QnA</a>.</strong> <strong>If you want to deploy the model by yourself and perform inference, please refer to <a href='https://github.com/myshell-ai/OpenVoice/blob/main/demo_part3.ipynb'>this jupyter notebook</a>.</strong>
</div>
"""
# The notice wrapped in a bordered box; this is what the UI passes to gr.HTML.
wrapped_markdown_content = f"<div style='border: 1px solid #000; padding: 10px;'>{content}</div>"


# Rows for the gr.Examples table. Each row supplies, in order, the values for
# the text prompt, the style dropdown, the reference audio path and the
# "Agree" checkbox.
examples = [
    [
        "Did you ever hear a folk tale about a giant turtle?",
        'en_us',
        "examples/speaker0.mp3",
        True,
    ],[
        "El resplandor del sol acaricia las olas, pintando el cielo con una paleta deslumbrante.",
        'es_default',
        "examples/speaker1.mp3",
        True,
    ],[
        "我最近在学习machine learning，希望能够在未来的artificial intelligence领域有所建树。",
        'zh_default',
        "examples/speaker2.mp3",
        True,
    ],[
        "彼は毎朝ジョギングをして体を健康に保っています。",
        'jp_default',
        "examples/speaker3.mp3",
        True,
    ],
]

# Speed sent to the backend for every request made from this UI, which has no
# speed control of its own.
# NOTE(review): 1.0 is assumed to be the backend's "normal speed" — confirm.
DEFAULT_SPEED = 1.0


def predict_with_default_speed(prompt, style, audio_file_pth, agree):
    """Adapt the UI's four inputs to ``predict``, which also expects ``speed``.

    The UI passes (text, style, reference audio, agree). Wiring those straight
    into the five-parameter ``predict`` would put the checkbox value into
    ``speed`` and leave ``agree`` unfilled, so every click would raise a
    TypeError.
    """
    return predict(prompt, style, audio_file_pth, DEFAULT_SPEED, agree)


with gr.Blocks(analytics_enabled=False) as demo:

    # Header: logo, links table and V1 blurb on the left, intro video on the right.
    with gr.Row():
        with gr.Column():
            with gr.Row():
                gr.Markdown(
                    """
                    ## <img src="https://huggingface.co/spaces/myshell-ai/OpenVoice/raw/main/logo.jpg" height="40"/>
                    """
                )
            with gr.Row():
                gr.Markdown(markdown_table_v2)
            with gr.Row():
                gr.Markdown(description)
        with gr.Column():
            gr.Video('./openvoicev2.mp4', autoplay=True)

    with gr.Row():
        gr.Markdown(description_v2)

    with gr.Row():
        gr.HTML(wrapped_markdown_content)

    with gr.Row():
        # Left column: user inputs.
        with gr.Column():
            input_text_gr = gr.Textbox(
                label="Text Prompt",
                # The hint matches the 50-word limit that predict() enforces.
                info="One or two sentences at a time is better. Up to 50 words.",
                value="The bustling city square bustled with street performers, tourists, and local vendors.",
            )
            style_gr = gr.Dropdown(
                label="Style",
                info="Select a style of output audio for the synthesised speech. (Chinese only support 'default' now)",
                choices=["en_default", "en_us", "en_br", "en_au", "en_in", "es_default", "fr_default", "jp_default", "zh_default", "kr_default",],
                max_choices=1,
                value="en_us",
            )
            ref_gr = gr.Audio(
                label="Reference Audio",
                info="Click on the ✎ button to upload your own target speaker audio",
                type="filepath",
                value="examples/speaker0.mp3",
            )
            tos_gr = gr.Checkbox(
                label="Agree",
                value=False,
                info="I agree to the terms of the MIT license-: https://github.com/myshell-ai/OpenVoice/blob/main/LICENSE",
            )

            tts_button = gr.Button("Send", elem_id="send-btn", visible=True)

        # Right column: status text, generated audio, and the reference that was used.
        with gr.Column():
            out_text_gr = gr.Text(label="Info")
            audio_gr = gr.Audio(label="Synthesised Audio", autoplay=True)
            ref_audio_gr = gr.Audio(label="Reference Audio Used")

            gr.Examples(examples,
                        label="Examples",
                        inputs=[input_text_gr, style_gr, ref_gr, tos_gr],
                        outputs=[out_text_gr, audio_gr, ref_audio_gr],
                        fn=predict_with_default_speed,
                        cache_examples=False,)
            tts_button.click(
                predict_with_default_speed,
                [input_text_gr, style_gr, ref_gr, tos_gr],
                outputs=[out_text_gr, audio_gr, ref_audio_gr],
            )

# NOTE(review): `concurrency_count` is a Gradio 3.x queue argument — confirm
# the pinned Gradio version before upgrading.
demo.queue(concurrency_count=6)
demo.launch(debug=True, show_api=True)