'''
+----------------------+        +-------------------------+        +-------------------------------+        +-------------------------+
| Step 1: Set Up       |        |  Step 2: Set Up Gradio  |        |  Step 3: Speech-to-Text       |        |  Step 4: Text-to-Speech |
| Environment          |        |  Interface              |        | & Language Model Processing   |        |  Output                 |
+----------------------+        +-------------------------+        +-------------------------------+        +-------------------------+
|                      |        |                         |        |                               |        |                         |
| - Import Python      |        | - Define interface      |        | - Transcribe audio            |        | - XTTS model generates  |
|   libraries          |        |   components            |        |   to text using               |        |   spoken response from  |
| - Initialize models: |--------> - Configure audio and   |------->|   Faster Whisper ASR          |------->|   LLM's text response   |
|   Whisper, Mistral,  |        |   text interaction      |        | - Transcribed text            |        |                         |
|   XTTS               |        | - Launch interface      |        |   is added to                 |        |                         |
|                      |        |                         |        |   chatbot's history           |        |                         |
|                      |        |                         |        | - Mistral LLM                 |        |                         |
|                      |        |                         |        |   processes chatbot           |        |                         |
|                      |        |                         |        |   history to generate         |        |                         |
|                      |        |                         |        |   response                    |        |                         |
+----------------------+        +-------------------------+        +-------------------------------+        +-------------------------+
'''

###### Set Up Environment ######

import os
# Set the CUDA compiler path for CMake, download the unidic dictionary (used by the TTS text processing),
# and build llama-cpp-python with CUDA (cuBLAS) support.
# llama-cpp-python is a Python binding for the llama.cpp library, which enables LLM inference in pure C/C++.
os.environ["CUDACXX"] = "/usr/local/cuda/bin/nvcc"
os.system('python -m unidic download')
os.system('CMAKE_ARGS="-DLLAMA_CUBLAS=on" pip install llama-cpp-python==0.2.11 --verbose')


# Third-party library imports
from faster_whisper import WhisperModel
import gradio as gr
from huggingface_hub import hf_hub_download
from llama_cpp import Llama
from TTS.api import TTS
from TTS.utils.manage import ModelManager
from TTS.tts.configs.xtts_config import XttsConfig
from TTS.tts.models.xtts import Xtts
from TTS.utils.generic_utils import get_user_data_dir

# Local imports
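# Helpers from utils.py, as used below: get_sentence streams (sentence, updated_history)
# pairs from the LLM, wave_header_chunk returns a WAV header chunk for streaming playback,
# and generate_speech_for_sentence synthesizes one sentence with XTTS (optionally as raw bytes).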
from utils import get_sentence, wave_header_chunk, generate_speech_for_sentence

# Load Whisper ASR model
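# large-v3 runs here on CPU with float32; on a CUDA GPU, faster-whisper is typically
# configured with device="cuda" and compute_type="float16".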
print("Loading Whisper ASR")
whisper_model = WhisperModel("large-v3", device="cpu", compute_type="float32")

# Load Mistral LLM
print("Loading Mistral LLM")
hf_hub_download(repo_id="TheBloke/Mistral-7B-Instruct-v0.1-GGUF", local_dir=".", filename="mistral-7b-instruct-v0.1.Q5_K_M.gguf")
mistral_model_path="./mistral-7b-instruct-v0.1.Q5_K_M.gguf"
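# n_gpu_layers controls how many transformer layers llama.cpp offloads to the GPU,
# n_ctx sets the context window in tokens, and n_batch the prompt-processing batch size.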
mistral_llm = Llama(model_path=mistral_model_path, n_gpu_layers=35, max_new_tokens=256, context_window=4096, n_ctx=4096, n_batch=128, verbose=False)


# Load XTTS Model
print("Loading XTTS model")
# Note: xtts_v1 raises a KeyError with recent TTS releases, so the v2 model is used here
# (v1 may still work with an older release).
os.environ["COQUI_TOS_AGREED"] = "1"  # accept the Coqui model license non-interactively

device = "cpu"
model_name = "tts_models/multilingual/multi-dataset/xtts_v2"
print("⏳Downloading model")
ModelManager().download_model(model_name)
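# ModelManager stores the checkpoint in the TTS user data dir; the folder name mirrors
# the model name with "/" replaced by "--"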
model_path = os.path.join(
    get_user_data_dir("tts"), model_name.replace("/", "--")
)

# Build the model from its config and load the downloaded checkpoint for CPU inference
config = XttsConfig()
config.load_json(os.path.join(model_path, "config.json"))
xtts_model = Xtts.init_from_config(config)
xtts_model.load_checkpoint(config, checkpoint_dir=model_path, eval=True)
xtts_model.to(device)
print("Loaded XTTS model")

###### Set up Gradio Interface ######

with gr.Blocks(title="Voice chat with LLM") as demo:
    DESCRIPTION = """# Voice chat with LLM"""
    gr.Markdown(DESCRIPTION)

    # Define chatbot component
    chatbot = gr.Chatbot(
        value=[(None, "Hi friend, I'm Amy, an AI coach. How can I help you today?")],  # Initial greeting from the chatbot
        elem_id="chatbot",
        avatar_images=("examples/hf-logo.png", "examples/ai-chat-logo.png"),
        bubble_full_width=False,
    )

    # Define chatbot voice component
    VOICES = ["female", "male"]
    with gr.Row():
        chatbot_voice = gr.Dropdown(
            label="Voice of the Chatbot",
            info="How should Chatbot talk like",
            choices=VOICES,
            max_choices=1,
            value=VOICES[0],
        )

    # Define text and audio record input components
    with gr.Row():
        txt_box = gr.Textbox(
            scale=3,
            show_label=False,
            placeholder="Enter text and press enter, or speak to your microphone",
            container=False,
            interactive=True,
        )
        audio_record = gr.Audio(sources=["microphone"], type="filepath", scale=4)

    # Define generated audio playback component 
    with gr.Row():
        sentence = gr.Textbox(visible=False)
        audio_playback = gr.Audio(
            value=None,
            label="Generated audio response",
            streaming=True,
            autoplay=True,
            interactive=False,
            show_label=True,
        )

    # Will be triggered on text submit (will send to generate_speech)
    def add_text(chatbot_history, text):
        chatbot_history = [] if chatbot_history is None else chatbot_history
        chatbot_history = chatbot_history + [(text, None)]
        return chatbot_history, gr.update(value="", interactive=False)
    
    # Will be triggered on voice submit (will transcribe and send to generate_speech)
    def add_audio(chatbot_history, audio):
        chatbot_history = [] if chatbot_history is None else chatbot_history
        # Transcribe the recording with faster-whisper and strip surrounding whitespace;
        # joining the segments avoids an IndexError on empty or multi-segment transcriptions
        segments, _ = whisper_model.transcribe(audio)
        text = " ".join(segment.text for segment in segments).strip()
        print("Transcribed text:", text)
        chatbot_history = chatbot_history + [(text, None)]
        return chatbot_history, gr.update(value="", interactive=False)

    def generate_speech(chatbot_history, chatbot_voice, initial_greeting=False):
        # Start by yielding an initial empty audio to set up autoplay
        yield ("", chatbot_history, wave_header_chunk())
        #yield ("", chatbot_history)

        # Helper function to handle the speech generation and yielding process
        def handle_speech_generation(sentence, chatbot_history, chatbot_voice):
            if sentence != "":
                print("Processing sentence")
                # Generate speech for this sentence with the selected chatbot voice (default XTTS settings)
                generated_speech = generate_speech_for_sentence(chatbot_history, chatbot_voice, sentence, xtts_model, None, return_as_byte=True)
                if generated_speech is not None:
                    yield (sentence, chatbot_history, generated_speech)

        if initial_greeting:
            # Process only the initial greeting if specified
            for _, sentence in chatbot_history:
                yield from handle_speech_generation(sentence, chatbot_history, chatbot_voice)
        else:
            # Continuously get and process sentences from a generator function
            for sentence, chatbot_history in get_sentence(chatbot_history, mistral_llm):
                print("Inserting sentence to queue")
                yield from handle_speech_generation(sentence, chatbot_history, chatbot_voice)

    # Wire the components together: a text submit (or a finished recording below) first adds
    # the user turn to the chatbot history, then streams the LLM answer sentence by sentence
    # through XTTS, and finally re-enables the input widgets.
    txt_msg = txt_box.submit(fn=add_text, inputs=[chatbot, txt_box], outputs=[chatbot, txt_box], queue=False
                             ).then(fn=generate_speech, inputs=[chatbot, chatbot_voice], outputs=[sentence, chatbot, audio_playback])

    txt_msg.then(fn=lambda: gr.update(interactive=True), inputs=None, outputs=[txt_box], queue=False)

    audio_msg = audio_record.stop_recording(fn=add_audio, inputs=[chatbot, audio_record], outputs=[chatbot, txt_box], queue=False
                                            ).then(fn=generate_speech,  inputs=[chatbot,chatbot_voice], outputs=[sentence, chatbot, audio_playback])

    audio_msg.then(fn=lambda: (gr.update(interactive=True),gr.update(interactive=True,value=None)), inputs=None, outputs=[txt_box, audio_record], queue=False)

    FOOTNOTE = """
            This Space demonstrates how to speak to an llm chatbot, based solely on open accessible models.
            It relies on the following models :
            - Speech to Text Model: [Faster-Whisper-large-v3](https://huggingface.co/Systran/faster-whisper-large-v3) an ASR model, to transcribe recorded audio to text.
            - Large Language Model: [Mistral-7b-instruct-v0.1-quantized](https://huggingface.co/TheBloke/Mistral-7B-Instruct-v0.1-GGUF) a LLM to generate the chatbot responses. 
            - Text to Speech Model: [XTTS-v2](https://huggingface.co/spaces/coqui/xtts) a TTS model, to generate the voice of the chatbot.

            Note:
            - Responses generated by chat model should not be assumed correct or taken serious, as this is a demonstration example only
            - iOS (Iphone/Ipad) devices may not experience voice due to autoplay being disabled on these devices by Vendor"""
    gr.Markdown(FOOTNOTE)
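    # Speak the initial greeting on page load; gr.State(value=True) marks this call as the greeting turn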
    demo.load(fn=generate_speech, inputs=[chatbot,chatbot_voice, gr.State(value=True)], outputs=[sentence, chatbot, audio_playback])
demo.queue().launch(debug=True,share=True)