John Langley committed on
Commit
6fa82d9
1 Parent(s): 29a15b9

add initial files

app.py ADDED
@@ -0,0 +1,181 @@
+ '''
+ +----------------------+      +-------------------------+      +-------------------------------+      +-------------------------+
+ | Step 1: Set Up       |      | Step 2: Set Up Gradio   |      | Step 3: Speech-to-Text        |      | Step 4: Text-to-Speech  |
+ | Environment          |      | Interface               |      | & Language Model Processing   |      | Output                  |
+ +----------------------+      +-------------------------+      +-------------------------------+      +-------------------------+
+ |                      |      |                         |      |                               |      |                         |
+ | - Import Python      |      | - Define interface      |      | - Transcribe audio            |      | - XTTS model generates  |
+ |   libraries          |      |   components            |      |   to text using               |      |   spoken response from  |
+ | - Initialize models: |----->| - Configure audio and   |----->|   Faster Whisper ASR          |----->|   LLM's text response   |
+ |   Whisper, Mistral,  |      |   text interaction      |      | - Transcribed text            |      |                         |
+ |   XTTS               |      | - Launch interface      |      |   is added to                 |      |                         |
+ |                      |      |                         |      |   chatbot's history           |      |                         |
+ |                      |      |                         |      | - Mistral LLM                 |      |                         |
+ |                      |      |                         |      |   processes chatbot           |      |                         |
+ |                      |      |                         |      |   history to generate         |      |                         |
+ |                      |      |                         |      |   response                    |      |                         |
+ +----------------------+      +-------------------------+      +-------------------------------+      +-------------------------+
+ '''
+
+ ###### Set Up Environment ######
+
+ import os
+ # Set CUDA environment variable and install llama-cpp-python
+ # llama-cpp-python is a python binding for llama.cpp library which enables LLM inference in pure C/C++
+ os.environ["CUDACXX"] = "/usr/local/cuda/bin/nvcc"
+ os.system('python -m unidic download')
+ os.system('CMAKE_ARGS="-DLLAMA_CUBLAS=on" pip install llama-cpp-python==0.2.11 --verbose')
+
+
+ # Third-party library imports
+ from faster_whisper import WhisperModel
+ import gradio as gr
+ from huggingface_hub import hf_hub_download
+ from llama_cpp import Llama
+ from TTS.tts.configs.xtts_config import XttsConfig
+ from TTS.tts.models.xtts import Xtts
+ from TTS.utils.generic_utils import get_user_data_dir
+ from TTS.utils.manage import ModelManager
+
+ # Local imports
+ from utils import get_sentence, generate_speech_for_sentence, wave_header_chunk
+
+ # Load Whisper ASR model
+ print("Loading Whisper ASR")
+ whisper_model = WhisperModel("large-v3", device="cuda", compute_type="float16")
+
+ # Load Mistral LLM
+ print("Loading Mistral LLM")
+ hf_hub_download(repo_id="TheBloke/Mistral-7B-Instruct-v0.1-GGUF", local_dir=".", filename="mistral-7b-instruct-v0.1.Q5_K_M.gguf")
+ mistral_model_path = "./mistral-7b-instruct-v0.1.Q5_K_M.gguf"
+ mistral_llm = Llama(model_path=mistral_model_path, n_gpu_layers=35, max_new_tokens=256, context_window=4096, n_ctx=4096, n_batch=128, verbose=False)
+
+
+ # Load XTTS Model
+ print("Loading XTTS model")
+ os.environ["COQUI_TOS_AGREED"] = "1"
+ tts_model_name = "tts_models/multilingual/multi-dataset/xtts_v2"
+ ModelManager().download_model(tts_model_name)
+ tts_model_path = os.path.join(get_user_data_dir("tts"), tts_model_name.replace("/", "--"))
+ config = XttsConfig()
+ config.load_json(os.path.join(tts_model_path, "config.json"))
+ xtts_model = Xtts.init_from_config(config)
+ xtts_model.load_checkpoint(
+     config,
+     checkpoint_path=os.path.join(tts_model_path, "model.pth"),
+     vocab_path=os.path.join(tts_model_path, "vocab.json"),
+     eval=True,
+     use_deepspeed=True,
+ )
+ xtts_model.cuda()
+
+ ###### Set up Gradio Interface ######
+
+ with gr.Blocks(title="Voice chat with LLM") as demo:
+     DESCRIPTION = """# Voice chat with LLM"""
+     gr.Markdown(DESCRIPTION)
+
+     # Define chatbot component
+     chatbot = gr.Chatbot(
+         value=[(None, "Hi friend, I'm Amy, an AI coach. How can I help you today?")],  # Initial greeting from the chatbot
+         elem_id="chatbot",
+         avatar_images=("examples/hf-logo.png", "examples/ai-chat-logo.png"),
+         bubble_full_width=False,
+     )
+
+     # Define chatbot voice component
+     VOICES = ["female", "male"]
+     with gr.Row():
+         chatbot_voice = gr.Dropdown(
+             label="Voice of the Chatbot",
+             info="How the chatbot should talk",
+             choices=VOICES,
+             max_choices=1,
+             value=VOICES[0],
+         )
+
+     # Define text and audio record input components
+     with gr.Row():
+         txt_box = gr.Textbox(
+             scale=3,
+             show_label=False,
+             placeholder="Enter text and press enter, or speak to your microphone",
+             container=False,
+             interactive=True,
+         )
+         audio_record = gr.Audio(source="microphone", type="filepath", scale=4)
+
+     # Define generated audio playback component
+     with gr.Row():
+         sentence = gr.Textbox(visible=False)
+         audio_playback = gr.Audio(
+             value=None,
+             label="Generated audio response",
+             streaming=True,
+             autoplay=True,
+             interactive=False,
+             show_label=True,
+         )
+
+     # Will be triggered on text submit (will send to generate_speech)
+     def add_text(chatbot_history, text):
+         chatbot_history = [] if chatbot_history is None else chatbot_history
+         chatbot_history = chatbot_history + [(text, None)]
+         return chatbot_history, gr.update(value="", interactive=False)
+
+     # Will be triggered on voice submit (will transcribe and send to generate_speech)
+     def add_audio(chatbot_history, audio):
+         chatbot_history = [] if chatbot_history is None else chatbot_history
+         # Get the transcription from Whisper and strip leading and trailing whitespace
+         response, _ = whisper_model.transcribe(audio)
+         text = list(response)[0].text.strip()
+         print("Transcribed text:", text)
+         chatbot_history = chatbot_history + [(text, None)]
+         return chatbot_history, gr.update(value="", interactive=False)
+
+     def generate_speech(chatbot_history, chatbot_voice, initial_greeting=False):
+         # Start by yielding an initial empty audio to set up autoplay
+         yield ("", chatbot_history, wave_header_chunk())
+
+         # Helper function to handle the speech generation and yielding process
+         def handle_speech_generation(sentence, chatbot_history, chatbot_voice):
+             if sentence != "":
+                 print("Processing sentence")
+                 generated_speech = generate_speech_for_sentence(chatbot_history, chatbot_voice, sentence, xtts_model, xtts_supported_languages=config.languages, return_as_byte=True)
+                 if generated_speech is not None:
+                     _, audio_dict = generated_speech
+                     yield (sentence, chatbot_history, audio_dict["value"])
+
+         if initial_greeting:
+             # Process only the initial greeting if specified
+             for _, sentence in chatbot_history:
+                 yield from handle_speech_generation(sentence, chatbot_history, chatbot_voice)
+         else:
+             # Continuously get and process sentences from a generator function
+             for sentence, chatbot_history in get_sentence(chatbot_history, mistral_llm):
+                 print("Inserting sentence to queue")
+                 yield from handle_speech_generation(sentence, chatbot_history, chatbot_voice)
+
+     txt_msg = txt_box.submit(fn=add_text, inputs=[chatbot, txt_box], outputs=[chatbot, txt_box], queue=False
+                              ).then(fn=generate_speech, inputs=[chatbot, chatbot_voice], outputs=[sentence, chatbot, audio_playback])
+
+     txt_msg.then(fn=lambda: gr.update(interactive=True), inputs=None, outputs=[txt_box], queue=False)
+
+     audio_msg = audio_record.stop_recording(fn=add_audio, inputs=[chatbot, audio_record], outputs=[chatbot, txt_box], queue=False
+                                             ).then(fn=generate_speech, inputs=[chatbot, chatbot_voice], outputs=[sentence, chatbot, audio_playback])
+
+     audio_msg.then(fn=lambda: (gr.update(interactive=True), gr.update(interactive=True, value=None)), inputs=None, outputs=[txt_box, audio_record], queue=False)
+
+     FOOTNOTE = """
+ This Space demonstrates how to speak to an LLM chatbot, built solely on openly accessible models.
+ It relies on the following models:
+ - Speech to Text Model: [Faster-Whisper-large-v3](https://huggingface.co/Systran/faster-whisper-large-v3), an ASR model, to transcribe recorded audio to text.
+ - Large Language Model: [Mistral-7b-instruct-v0.1-quantized](https://huggingface.co/TheBloke/Mistral-7B-Instruct-v0.1-GGUF), an LLM, to generate the chatbot responses.
+ - Text to Speech Model: [XTTS-v2](https://huggingface.co/spaces/coqui/xtts), a TTS model, to generate the voice of the chatbot.
+
+ Note:
+ - Responses generated by the chat model should not be assumed correct or taken seriously, as this is a demonstration example only.
+ - iOS (iPhone/iPad) devices may not play the voice, because the vendor disables audio autoplay on those devices."""
+     gr.Markdown(FOOTNOTE)
+     demo.load(fn=generate_speech, inputs=[chatbot, chatbot_voice, gr.State(value=True)], outputs=[sentence, chatbot, audio_playback])
+ demo.queue().launch(debug=True, share=True)
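
For readers who want to sanity-check the pipeline outside of Gradio, here is a minimal smoke-test sketch. It assumes the globals created above in app.py (whisper_model, mistral_llm, xtts_model, config) are already loaded in the same interpreter session and that examples/female.wav exists; the helper functions come from utils.py. It is an illustration, not part of the committed app.

# Hypothetical end-to-end check mirroring Steps 3 and 4 of the diagram above.
from utils import get_sentence, generate_speech_for_sentence

# Step 3: transcribe a sample recording with Faster Whisper.
segments, _ = whisper_model.transcribe("examples/female.wav")
user_text = " ".join(segment.text for segment in segments).strip()

# Step 3 (continued): let Mistral answer, one sentence at a time.
history = [[user_text, None]]
for sentence, history in get_sentence(history, mistral_llm):
    # Step 4: synthesize each sentence with XTTS and inspect the raw audio bytes.
    result = generate_speech_for_sentence(
        history, "female", sentence, xtts_model,
        xtts_supported_languages=config.languages, return_as_byte=True,
    )
    if result is not None:
        _, audio_update = result
        print(sentence, "->", len(audio_update["value"]), "bytes of audio")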
examples/ai-chat-logo.png ADDED
examples/app_ui.png ADDED
examples/female.wav ADDED
Binary file (454 kB)
 
examples/hf-logo.png ADDED
examples/male.wav ADDED
Binary file (381 kB)
 
requirements.txt ADDED
@@ -0,0 +1,19 @@
+ # Preinstall requirements from TTS
+ TTS @ git+https://github.com/coqui-ai/TTS@v0.20.6
+ pydantic==1.10.13
+ python-multipart==0.0.6
+ typing-extensions>=4.8.0
+ cutlet
+ mecab-python3==1.0.6
+ unidic-lite==1.0.8
+ unidic==1.1.0
+ langid
+ deepspeed
+ pydub
+ librosa
+ ffmpeg-python
+ gradio_client
+ emoji
+ asyncio
+ noisereduce==3.0.0
+ faster-whisper==1.0.1
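
Note that llama-cpp-python does not appear in this list: app.py installs it at startup with CUDA-specific CMAKE_ARGS, and also downloads the unidic dictionary data. A rough local-setup sketch that mirrors those runtime steps (assuming an NVIDIA GPU and nvcc at /usr/local/cuda/bin/nvcc, as in app.py):

# Hypothetical local setup, mirroring the runtime install performed by app.py.
import os

os.environ["CUDACXX"] = "/usr/local/cuda/bin/nvcc"
os.system("pip install -r requirements.txt")
os.system('CMAKE_ARGS="-DLLAMA_CUBLAS=on" pip install llama-cpp-python==0.2.11 --verbose')
os.system("python -m unidic download")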
utils.py ADDED
@@ -0,0 +1,410 @@
+ from __future__ import annotations
+
+ import io
+ import os
+ import re
+ import subprocess
+ import textwrap
+ import time
+ import uuid
+ import wave
+
+ import emoji
+ import gradio as gr
+ import langid
+ import nltk
+ import numpy as np
+ import noisereduce as nr
+ from huggingface_hub import HfApi
+
+ # Download the 'punkt' tokenizer for the NLTK library
+ nltk.download("punkt")
+
+ # The Hub API is used to restart the Space on an unrecoverable error
+ HF_TOKEN = os.environ.get("HF_TOKEN")
+ REPO_ID = os.environ.get("REPO_ID")
+ api = HfApi(token=HF_TOKEN)
+
+ latent_map = {}
+
+ def get_latents(chatbot_voice, xtts_model, voice_cleanup=False):
+     global latent_map
+     if chatbot_voice not in latent_map:
+         speaker_wav = f"examples/{chatbot_voice}.wav"
+         if voice_cleanup:
+             try:
+                 cleanup_filter = "lowpass=8000,highpass=75,areverse,silenceremove=start_periods=1:start_silence=0:start_threshold=0.02,areverse,silenceremove=start_periods=1:start_silence=0:start_threshold=0.02"
+                 resample_filter = "-ac 1 -ar 22050"
+                 out_filename = speaker_wav + str(uuid.uuid4()) + ".wav"  # extension lets ffmpeg pick the output format
+                 # We rely on a newer ffmpeg, as it has the afftdn denoise filter
+                 shell_command = f"ffmpeg -y -i {speaker_wav} -af {cleanup_filter} {resample_filter} {out_filename}".split(" ")
+                 command_result = subprocess.run([item for item in shell_command], capture_output=False, text=True, check=True)
+                 speaker_wav = out_filename
+                 print("Filtered microphone input")
+             except subprocess.CalledProcessError:
+                 # There was an error - the command exited with a non-zero code
+                 print("Error: filtering failed, using the original microphone input")
+         else:
+             speaker_wav = speaker_wav
+         # Get the conditioning latents from the model
+         # Returns a tuple (gpt_cond_latent, speaker_embedding)
+         latent_map[chatbot_voice] = xtts_model.get_conditioning_latents(audio_path=speaker_wav)
+     return latent_map[chatbot_voice]
+
+
+ def detect_language(prompt, xtts_supported_languages=None):
+     if xtts_supported_languages is None:
+         xtts_supported_languages = ["en","es","fr","de","it","pt","pl","tr","ru","nl","cs","ar","zh-cn","ja"]
+
+     # Fast language autodetection
+     if len(prompt) > 15:
+         language_predicted = langid.classify(prompt)[0].strip()  # strip needed, as there is a space at the end!
+         if language_predicted == "zh":
+             # we use zh-cn on xtts
+             language_predicted = "zh-cn"
+
+         if language_predicted not in xtts_supported_languages:
+             print(f"Detected a language not supported by xtts: {language_predicted}, switching to English for now")
+             gr.Warning(f"Language detected '{language_predicted}' can not be spoken properly 'yet'")
+             language = "en"
+         else:
+             language = language_predicted
+         print(f"Language: predicted sentence language: {language_predicted}, using language for xtts: {language}")
+     else:
+         # It is hard to detect the language of a short sentence quickly, so default to English
+         language = "en"
+         print("Language: prompt is short or language autodetect is disabled, using English for xtts")
+
+     return language
+
+ def get_voice_streaming(prompt, language, chatbot_voice, xtts_model, suffix="0"):
+     gpt_cond_latent, speaker_embedding = get_latents(chatbot_voice, xtts_model)
+     try:
+         t0 = time.time()
+         chunks = xtts_model.inference_stream(
+             prompt,
+             language,
+             gpt_cond_latent,
+             speaker_embedding,
+             repetition_penalty=7.0,
+             temperature=0.85,
+         )
+
+         first_chunk = True
+         for i, chunk in enumerate(chunks):
+             if first_chunk:
+                 first_chunk_time = time.time() - t0
+                 metrics_text = f"Latency to first audio chunk: {round(first_chunk_time*1000)} milliseconds\n"
+                 first_chunk = False
+             #print(f"Received chunk {i} of audio length {chunk.shape[-1]}")
+
+             # In case the output is required to be multiple voice files
+             # out_file = f'{char}_{i}.wav'
+             # write(out_file, 24000, chunk.detach().cpu().numpy().squeeze())
+             # audio = AudioSegment.from_file(out_file)
+             # audio.export(out_file, format='wav')
+             # return out_file
+             # Directly return the chunk as bytes for streaming
+             chunk = chunk.detach().cpu().numpy().squeeze()
+             chunk = (chunk * 32767).astype(np.int16)
+             yield chunk.tobytes()
+
+     except RuntimeError as e:
+         if "device-side assert" in str(e):
+             # Cannot do anything about a CUDA device-side assert, need to restart
+             print(
+                 f"Exit due to: Unrecoverable exception caused by prompt: {prompt}",
+                 flush=True,
+             )
+             gr.Warning("Unhandled Exception encountered, please retry in a minute")
+             print("CUDA device-side assert encountered at runtime, need to restart")
+
+             # HF Space specific: this error is unrecoverable, we need to restart the Space
+             api.restart_space(repo_id=REPO_ID)
+         else:
+             print("RuntimeError: non device-side assert error:", str(e))
+             # Does not require a warning: happens on an empty chunk and at the end
+             ###gr.Warning("Unhandled Exception encounter, please retry in a minute")
+             return None
+         return None
+     except:
+         return None
+
+ def wave_header_chunk(frame_input=b"", channels=1, sample_width=2, sample_rate=24000):
+     # This creates a wave header and then appends the frame input
+     # It should be the first chunk of a streaming wav file
+     # Later frames should not have it (else you will hear artifacts at each chunk start)
+     wav_buf = io.BytesIO()
+     with wave.open(wav_buf, "wb") as vfout:
+         vfout.setnchannels(channels)
+         vfout.setsampwidth(sample_width)
+         vfout.setframerate(sample_rate)
+         vfout.writeframes(frame_input)
+
+     wav_buf.seek(0)
+     return wav_buf.read()
+
+ def format_prompt(message, history):
+     system_message = f"""
+ You are an empathetic, insightful, and supportive coach who helps people deal with challenges and celebrate achievements.
+ You help people feel better by asking questions to reflect on and evoke feelings of positivity, gratitude, joy, and love.
+ You show radical candor and tough love.
+ Respond in a casual and friendly tone.
+ Sprinkle in filler words, contractions, idioms, and other casual speech that we use in conversation.
+ Emulate the user’s speaking style and be concise in your response.
+ """
+     prompt = (
+         "<s>[INST]" + system_message + "[/INST]"
+     )
+     for user_prompt, bot_response in history:
+         if user_prompt is not None:
+             prompt += f"[INST] {user_prompt} [/INST]"
+         prompt += f" {bot_response}</s> "
+
+     if message == "":
+         message = "Hello"
+     prompt += f"[INST] {message} [/INST]"
+     return prompt
+
+ def generate_llm_output(
+     prompt,
+     history,
+     llm,
+     temperature=0.8,
+     max_tokens=256,
+     top_p=0.95,
+     stop_words=["<s>","[/INST]", "</s>"]
+ ):
+     temperature = float(temperature)
+     if temperature < 1e-2:
+         temperature = 1e-2
+     top_p = float(top_p)
+
+     generate_kwargs = dict(
+         temperature=temperature,
+         max_tokens=max_tokens,
+         top_p=top_p,
+         stop=stop_words
+     )
+     formatted_prompt = format_prompt(prompt, history)
+     try:
+         print("LLM Input:", formatted_prompt)
+         # Local GGUF
+         stream = llm(
+             formatted_prompt,
+             **generate_kwargs,
+             stream=True,
+         )
+         output = ""
+         for response in stream:
+             character = response["choices"][0]["text"]
+
+             if character in stop_words:
+                 # End of context
+                 return
+
+             if emoji.is_emoji(character):
+                 # Stop on emojis: they carry no meaning here and mess up the chat on the following lines
+                 return
+
+             output += response["choices"][0]["text"]
+             yield output
+
+     except Exception as e:
+         print("Unhandled Exception: ", str(e))
+         gr.Warning("Unfortunately Mistral is unable to process")
+         output = "I do not know what happened, but I could not understand you."
+         return output
+
+ def get_sentence(history, llm):
+     history = [["", None]] if history is None else history
+     history[-1][1] = ""
+     sentence_list = []
+     sentence_hash_list = []
+
+     text_to_generate = ""
+     stored_sentence = None
+     stored_sentence_hash = None
+
+     for character in generate_llm_output(history[-1][0], history[:-1], llm):
+         history[-1][1] = character.replace("<|assistant|>","")
+         # The output arrives word by word
+         text_to_generate = nltk.sent_tokenize(history[-1][1].replace("\n", " ").replace("<|assistant|>"," ").replace("<|ass>","").replace("[/ASST]","").replace("[/ASSI]","").replace("[/ASS]","").replace("","").strip())
+         if len(text_to_generate) > 1:
+
+             dif = len(text_to_generate) - len(sentence_list)
+
+             if dif == 1 and len(sentence_list) != 0:
+                 continue
+
+             if dif == 2 and len(sentence_list) != 0 and stored_sentence is not None:
+                 continue
+
+             # All this complexity comes from trying to append the first short sentence to the next one for proper language auto-detection
+             if stored_sentence is not None and stored_sentence_hash is None and dif > 1:
+                 # Means we consumed the stored sentence and should look at the next sentence to generate
+                 sentence = text_to_generate[len(sentence_list)+1]
+             elif stored_sentence is not None and len(text_to_generate) > 2 and stored_sentence_hash is not None:
+                 print("Appending stored")
+                 sentence = stored_sentence + text_to_generate[len(sentence_list)+1]
+                 stored_sentence_hash = None
+             else:
+                 sentence = text_to_generate[len(sentence_list)]
+
+             # If the sentence is too short, just append it to the next one (if there is any)
+             # This is for proper language detection
+             if len(sentence) <= 15 and stored_sentence_hash is None and stored_sentence is None:
+                 if sentence[-1] in [".","!","?"]:
+                     if stored_sentence_hash != hash(sentence):
+                         stored_sentence = sentence
+                         stored_sentence_hash = hash(sentence)
+                         print("Storing:", stored_sentence)
+                         continue
+
+
+             sentence_hash = hash(sentence)
+             if stored_sentence_hash is not None and sentence_hash == stored_sentence_hash:
+                 continue
+
+             if sentence_hash not in sentence_hash_list:
+                 sentence_hash_list.append(sentence_hash)
+                 sentence_list.append(sentence)
+                 print("New Sentence: ", sentence)
+                 yield (sentence, history)
+
+     # Yield the final sentence
+     try:
+         last_sentence = nltk.sent_tokenize(history[-1][1].replace("\n", " ").replace("<|ass>","").replace("[/ASST]","").replace("[/ASSI]","").replace("[/ASS]","").replace("","").strip())[-1]
+         sentence_hash = hash(last_sentence)
+         if sentence_hash not in sentence_hash_list:
+             if stored_sentence is not None and stored_sentence_hash is not None:
+                 last_sentence = stored_sentence + last_sentence
+                 stored_sentence = stored_sentence_hash = None
+                 print("Last Sentence with stored:", last_sentence)
+
+             sentence_hash_list.append(sentence_hash)
+             sentence_list.append(last_sentence)
+             print("Last Sentence: ", last_sentence)
+
+             yield (last_sentence, history)
+     except:
+         print("ERROR on last sentence, history is:", history)
+
+ # Will generate a speech audio file per sentence
+ def generate_speech_for_sentence(history, chatbot_voice, sentence, xtts_model, xtts_supported_languages=None, filter_output=True, return_as_byte=False):
+     language = "autodetect"
+
+     wav_bytestream = b""
+
+     if len(sentence) == 0:
+         print("EMPTY SENTENCE")
+         return
+
+     # Sometimes the prompt's </s> token shows up in the output, remove it
+     # Some post-processing for speech only
+     sentence = sentence.replace("</s>", "")
+     # Remove code from speech
+     sentence = re.sub("```.*```", "", sentence, flags=re.DOTALL)
+     sentence = re.sub("`.*`", "", sentence, flags=re.DOTALL)
+
+     sentence = re.sub(r"\(.*\)", "", sentence, flags=re.DOTALL)
+
+     sentence = sentence.replace("```", "")
+     sentence = sentence.replace("...", " ")
+     sentence = sentence.replace("(", " ")
+     sentence = sentence.replace(")", " ")
+     sentence = sentence.replace("<|assistant|>","")
+
+     if len(sentence) == 0:
+         print("EMPTY SENTENCE after processing")
+         return
+
+     # A fast fix for the last character, may produce weird sounds if it is with text
+     #if (sentence[-1] in ["!", "?", ".", ","]) or (sentence[-2] in ["!", "?", ".", ","]):
+     #    # just add a space
+     #    sentence = sentence[:-1] + " " + sentence[-1]
+
+     # The regex does the job well
+     sentence = re.sub(r"([^\x00-\x7F]|\w)(\.|\。|\?|\!)", r"\1 \2\2", sentence)
+
+     print("Sentence for speech:", sentence)
+
+
+     try:
+         SENTENCE_SPLIT_LENGTH = 350
+         if len(sentence) < SENTENCE_SPLIT_LENGTH:
+             # No problem, continue on
+             sentence_list = [sentence]
+         else:
+             # Until now nltk likely split the sentences properly, but we need an additional
+             # check for longer sentences and split at the last possible position
+             # Do whatever is necessary: first break at hyphens, then spaces, and then even split very long words
+             sentence_list = textwrap.wrap(sentence, SENTENCE_SPLIT_LENGTH)
+             print("SPLIT LONG SENTENCE:", sentence_list)
+
+         for sentence in sentence_list:
+
+             if any(c.isalnum() for c in sentence):
+                 if language == "autodetect":
+                     # On the first call autodetect; subsequent sentence calls will use the same language
+                     language = detect_language(sentence, xtts_supported_languages)
+
+                 # There is at least 1 alphanumeric (utf-8) character
+                 audio_stream = get_voice_streaming(
+                     sentence, language, chatbot_voice, xtts_model
+                 )
+             else:
+                 # Likely got a ' or " or some other text without any alphanumeric character in it
+                 audio_stream = None
+
+             # XTTS is actually using a streaming response, but we are playing audio by sentence
+             # If you want direct XTTS voice streaming (send each chunk to voice) you may set the DIRECT_STREAM=1 environment variable
+             if audio_stream is not None:
+                 frame_length = 0
+                 for chunk in audio_stream:
+                     try:
+                         wav_bytestream += chunk
+                         frame_length += len(chunk)
+                     except:
+                         # Hack to continue playing: sometimes the last chunk is empty, will be fixed on the next TTS
+                         continue
+
+         # Filter output for better voice
+         if filter_output:
+             data_s16 = np.frombuffer(wav_bytestream, dtype=np.int16, count=len(wav_bytestream)//2, offset=0)
+             float_data = data_s16 * 0.5**15
+             reduced_noise = nr.reduce_noise(y=float_data, sr=24000, prop_decrease=0.8, n_fft=1024)
+             wav_bytestream = (reduced_noise * 32767).astype(np.int16)
+             wav_bytestream = wav_bytestream.tobytes()
+
+         if audio_stream is not None:
+             if not return_as_byte:
+                 audio_unique_filename = "/tmp/" + str(uuid.uuid4()) + ".wav"
+                 with wave.open(audio_unique_filename, "w") as f:
+                     f.setnchannels(1)
+                     # 2 bytes per sample.
+                     f.setsampwidth(2)
+                     f.setframerate(24000)
+                     f.writeframes(wav_bytestream)
+
+                 return (history, gr.Audio.update(value=audio_unique_filename, autoplay=True))
+             else:
+                 return (history, gr.Audio.update(value=wav_bytestream, autoplay=True))
+     except RuntimeError as e:
+         if "device-side assert" in str(e):
+             # Cannot do anything about a CUDA device-side assert, need to restart
+             print(
+                 f"Exit due to: Unrecoverable exception caused by prompt: {sentence}",
+                 flush=True,
+             )
+             gr.Warning("Unhandled Exception encountered, please retry in a minute")
+             print("CUDA device-side assert encountered at runtime, need to restart")
+
+             # HF Space specific: this error is unrecoverable, we need to restart the Space
+             api.restart_space(repo_id=REPO_ID)
+         else:
+             print("RuntimeError: non device-side assert error:", str(e))
+             raise e
+
+     print("All speech ended")
+     return
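
As a closing illustration, here is a small hedged example of the prompt string that format_prompt assembles for Mistral's instruct format, with a made-up two-turn history. Importing utils also runs its module-level setup (the NLTK punkt download and HfApi construction), so this is meant for an interactive session rather than production use.

# Hypothetical usage of utils.format_prompt; history items are [user, assistant] pairs.
from utils import format_prompt

history = [["I had a rough day at work.", "That sounds tough. What happened?"]]
prompt = format_prompt("My project got cancelled.", history)
print(prompt)
# Roughly:
# <s>[INST]<system message>[/INST][INST] I had a rough day at work. [/INST] That sounds tough. What happened?</s> [INST] My project got cancelled. [/INST]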