Spaces:
Runtime error
Runtime error
John Langley
committed on
Commit
•
6fa82d9
1
Parent(s):
29a15b9
add initial files
Browse files- app.py +181 -0
- examples/ai-chat-logo.png +0 -0
- examples/app_ui.png +0 -0
- examples/female.wav +0 -0
- examples/hf-logo.png +0 -0
- examples/male.wav +0 -0
- requirements.txt +19 -0
- utils.py +410 -0
app.py
ADDED
@@ -0,0 +1,181 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
'''
|
2 |
+
+----------------------+ +-------------------------+ +-------------------------------+ +-------------------------+
|
3 |
+
| Step 1: Set Up | | Step 2: Set Up Gradio | | Step 3: Speech-to-Text | | Step 4: Text-to-Speech |
|
4 |
+
| Environment | | Interface | | & Language Model Processing | | Output |
|
5 |
+
+----------------------+ +-------------------------+ +-------------------------------+ +-------------------------+
|
6 |
+
| | | | | | | |
|
7 |
+
| - Import Python | | - Define interface | | - Transcribe audio | | - XTTS model generates |
|
8 |
+
| libraries | | components | | to text using | | spoken response from |
|
9 |
+
| - Initialize models: |--------> - Configure audio and |------->| Faster Whisper ASR |------->| LLM's text response |
|
10 |
+
| Whisper, Mistral, | | text interaction | | - Transcribed text | | |
|
11 |
+
| XTTS | | - Launch interface | | is added to | | |
|
12 |
+
| | | | | chatbot's history | | |
|
13 |
+
| | | | | - Mistral LLM | | |
|
14 |
+
| | | | | processes chatbot | | |
|
15 |
+
| | | | | history to generate | | |
|
16 |
+
| | | | | response | | |
|
17 |
+
+----------------------+ +-------------------------+ +-------------------------------+ +-------------------------+
|
18 |
+
'''
|
19 |
+
|
20 |
+
###### Set Up Environment ######
|
21 |
+
|
22 |
+
import os
|
23 |
+
# Set CUDA environment variable and install llama-cpp-python
|
24 |
+
# llama-cpp-python is a python binding for llama.cpp library which enables LLM inference in pure C/C++
|
25 |
+
os.environ["CUDACXX"] = "/usr/local/cuda/bin/nvcc"
|
26 |
+
os.system('python -m unidic download')
|
27 |
+
os.system('CMAKE_ARGS="-DLLAMA_CUBLAS=on" pip install llama-cpp-python==0.2.11 --verbose')
|
28 |
+
|
29 |
+
|
30 |
+
# Third-party library imports
|
31 |
+
from faster_whisper import WhisperModel
|
32 |
+
import gradio as gr
|
33 |
+
from huggingface_hub import hf_hub_download
|
34 |
+
from llama_cpp import Llama
|
35 |
+
from TTS.tts.configs.xtts_config import XttsConfig
|
36 |
+
from TTS.tts.models.xtts import Xtts
|
37 |
+
from TTS.utils.generic_utils import get_user_data_dir
|
38 |
+
from TTS.utils.manage import ModelManager
|
39 |
+
|
40 |
+
# Local imports
|
41 |
+
from utils import get_sentence, generate_speech_for_sentence, wave_header_chunk
|
42 |
+
|
43 |
+
# Load Whisper ASR model
|
44 |
+
print("Loading Whisper ASR")
|
45 |
+
whisper_model = WhisperModel("large-v3", device="cuda", compute_type="float16")
|
46 |
+
|
47 |
+
# Load Mistral LLM
|
48 |
+
print("Loading Mistral LLM")
|
49 |
+
hf_hub_download(repo_id="TheBloke/Mistral-7B-Instruct-v0.1-GGUF", local_dir=".", filename="mistral-7b-instruct-v0.1.Q5_K_M.gguf")
|
50 |
+
mistral_model_path="./mistral-7b-instruct-v0.1.Q5_K_M.gguf"
|
51 |
+
mistral_llm = Llama(model_path=mistral_model_path,n_gpu_layers=35,max_new_tokens=256, context_window=4096, n_ctx=4096,n_batch=128,verbose=False)
|
52 |
+
|
53 |
+
|
54 |
+
# Load XTTS Model
|
55 |
+
print("Loading XTTS model")
|
56 |
+
os.environ["COQUI_TOS_AGREED"] = "1"
|
57 |
+
tts_model_name = "tts_models/multilingual/multi-dataset/xtts_v2"
|
58 |
+
ModelManager().download_model(tts_model_name)
|
59 |
+
tts_model_path = os.path.join(get_user_data_dir("tts"), tts_model_name.replace("/", "--"))
|
60 |
+
config = XttsConfig()
|
61 |
+
config.load_json(os.path.join(tts_model_path, "config.json"))
|
62 |
+
xtts_model = Xtts.init_from_config(config)
|
63 |
+
xtts_model.load_checkpoint(
|
64 |
+
config,
|
65 |
+
checkpoint_path=os.path.join(tts_model_path, "model.pth"),
|
66 |
+
vocab_path=os.path.join(tts_model_path, "vocab.json"),
|
67 |
+
eval=True,
|
68 |
+
use_deepspeed=True,
|
69 |
+
)
|
70 |
+
xtts_model.cuda()
|
71 |
+
|
72 |
+
###### Set up Gradio Interface ######
|
73 |
+
|
74 |
+
with gr.Blocks(title="Voice chat with LLM") as demo:
|
75 |
+
DESCRIPTION = """# Voice chat with LLM"""
|
76 |
+
gr.Markdown(DESCRIPTION)
|
77 |
+
|
78 |
+
# Define chatbot component
|
79 |
+
chatbot = gr.Chatbot(
|
80 |
+
value=[(None, "Hi friend, I'm Amy, an AI coach. How can I help you today?")], # Initial greeting from the chatbot
|
81 |
+
elem_id="chatbot",
|
82 |
+
avatar_images=("examples/hf-logo.png", "examples/ai-chat-logo.png"),
|
83 |
+
bubble_full_width=False,
|
84 |
+
)
|
85 |
+
|
86 |
+
# Define chatbot voice component
|
87 |
+
VOICES = ["female", "male"]
|
88 |
+
with gr.Row():
|
89 |
+
chatbot_voice = gr.Dropdown(
|
90 |
+
label="Voice of the Chatbot",
|
91 |
+
info="How should Chatbot talk like",
|
92 |
+
choices=VOICES,
|
93 |
+
max_choices=1,
|
94 |
+
value=VOICES[0],
|
95 |
+
)
|
96 |
+
|
97 |
+
# Define text and audio record input components
|
98 |
+
with gr.Row():
|
99 |
+
txt_box = gr.Textbox(
|
100 |
+
scale=3,
|
101 |
+
show_label=False,
|
102 |
+
placeholder="Enter text and press enter, or speak to your microphone",
|
103 |
+
container=False,
|
104 |
+
interactive=True,
|
105 |
+
)
|
106 |
+
audio_record = gr.Audio(source="microphone", type="filepath", scale=4)
|
107 |
+
|
108 |
+
# Define generated audio playback component
|
109 |
+
with gr.Row():
|
110 |
+
sentence = gr.Textbox(visible=False)
|
111 |
+
audio_playback = gr.Audio(
|
112 |
+
value=None,
|
113 |
+
label="Generated audio response",
|
114 |
+
streaming=True,
|
115 |
+
autoplay=True,
|
116 |
+
interactive=False,
|
117 |
+
show_label=True,
|
118 |
+
)
|
119 |
+
|
120 |
+
# Will be triggered on text submit (will send to generate_speech)
|
121 |
+
def add_text(chatbot_history, text):
    """Append a typed user message to the chat history.

    Args:
        chatbot_history: Existing list of (user, bot) pairs, or None.
        text: The message the user submitted in the text box.

    Returns:
        A tuple of the new history list (the input list is not mutated) and a
        Gradio update that clears the text box and makes it non-interactive.
    """
    previous_turns = chatbot_history if chatbot_history is not None else []
    # The bot half of the new turn is left as None at this point.
    updated_history = previous_turns + [(text, None)]
    cleared_locked_box = gr.update(value="", interactive=False)
    return updated_history, cleared_locked_box
|
125 |
+
|
126 |
+
# Will be triggered on voice submit (will transcribe and send to generate_speech)
def add_audio(chatbot_history, audio):
    """Transcribe a recording and append the text to the chat history.

    Args:
        chatbot_history: Existing list of (user, bot) pairs, or None.
        audio: Value of the audio recorder component, passed straight to
            ``whisper_model.transcribe``.

    Returns:
        A tuple of the new history list (the input list is not mutated) and a
        Gradio update that clears the text box and makes it non-interactive.
    """
    chatbot_history = [] if chatbot_history is None else chatbot_history
    segments, _ = whisper_model.transcribe(audio)
    # Join every transcribed segment: taking only the first one drops the rest
    # of a longer utterance and raises IndexError when nothing was recognised.
    text = " ".join(segment.text.strip() for segment in segments).strip()
    print("Transcribed text:", text)
    chatbot_history = chatbot_history + [(text, None)]
    return chatbot_history, gr.update(value="", interactive=False)
|
135 |
+
|
136 |
+
def generate_speech(chatbot_history, chatbot_voice, initial_greeting=False):
    """Stream (sentence, history, audio) triples for the chatbot's reply.

    Args:
        chatbot_history: List of (user, bot) pairs shown in the chatbot.
        chatbot_voice: Voice name forwarded to the speech generator.
        initial_greeting: When true, speak the bot entries already present in
            the history instead of asking the LLM for a new reply.

    Yields:
        Tuples of (sentence text, chat history, audio bytes). The first tuple
        carries an empty sentence and a bare WAV header.
    """
    # Start by yielding an initial empty audio to set up autoplay
    yield ("", chatbot_history, wave_header_chunk())

    def speak(text, history_snapshot, voice):
        # Synthesize one sentence; yields nothing for empty text or when the
        # speech generator returns None.
        if text == "":
            return
        print("Processing sentence")
        result = generate_speech_for_sentence(
            history_snapshot,
            voice,
            text,
            xtts_model,
            xtts_supported_languages=config.languages,
            return_as_byte=True,
        )
        if result is None:
            return
        _unused_history, audio_update = result
        yield (text, history_snapshot, audio_update["value"])

    if not initial_greeting:
        # Continuously get and process sentences from a generator function
        for sentence, chatbot_history in get_sentence(chatbot_history, mistral_llm):
            print("Inserting sentence to queue")
            yield from speak(sentence, chatbot_history, chatbot_voice)
        return

    # Process only the initial greeting if specified
    for _, sentence in chatbot_history:
        yield from speak(sentence, chatbot_history, chatbot_voice)
|
158 |
+
|
159 |
+
txt_msg = txt_box.submit(fn=add_text, inputs=[chatbot, txt_box], outputs=[chatbot, txt_box], queue=False
|
160 |
+
).then(fn=generate_speech, inputs=[chatbot,chatbot_voice], outputs=[sentence, chatbot, audio_playback])
|
161 |
+
|
162 |
+
txt_msg.then(fn=lambda: gr.update(interactive=True), inputs=None, outputs=[txt_box], queue=False)
|
163 |
+
|
164 |
+
audio_msg = audio_record.stop_recording(fn=add_audio, inputs=[chatbot, audio_record], outputs=[chatbot, txt_box], queue=False
|
165 |
+
).then(fn=generate_speech, inputs=[chatbot,chatbot_voice], outputs=[sentence, chatbot, audio_playback])
|
166 |
+
|
167 |
+
audio_msg.then(fn=lambda: (gr.update(interactive=True),gr.update(interactive=True,value=None)), inputs=None, outputs=[txt_box, audio_record], queue=False)
|
168 |
+
|
169 |
+
FOOTNOTE = """
|
170 |
+
This Space demonstrates how to speak to an llm chatbot, based solely on open accessible models.
|
171 |
+
It relies on the following models :
|
172 |
+
- Speech to Text Model: [Faster-Whisper-large-v3](https://huggingface.co/Systran/faster-whisper-large-v3) an ASR model, to transcribe recorded audio to text.
|
173 |
+
- Large Language Model: [Mistral-7b-instruct-v0.1-quantized](https://huggingface.co/TheBloke/Mistral-7B-Instruct-v0.1-GGUF) a LLM to generate the chatbot responses.
|
174 |
+
- Text to Speech Model: [XTTS-v2](https://huggingface.co/spaces/coqui/xtts) a TTS model, to generate the voice of the chatbot.
|
175 |
+
|
176 |
+
Note:
|
177 |
+
- Responses generated by chat model should not be assumed correct or taken serious, as this is a demonstration example only
|
178 |
+
- iOS (Iphone/Ipad) devices may not experience voice due to autoplay being disabled on these devices by Vendor"""
|
179 |
+
gr.Markdown(FOOTNOTE)
|
180 |
+
demo.load(fn=generate_speech, inputs=[chatbot,chatbot_voice, gr.State(value=True)], outputs=[sentence, chatbot, audio_playback])
|
181 |
+
demo.queue().launch(debug=True,share=True)
|
examples/ai-chat-logo.png
ADDED
examples/app_ui.png
ADDED
examples/female.wav
ADDED
Binary file (454 kB). View file
|
|
examples/hf-logo.png
ADDED
examples/male.wav
ADDED
Binary file (381 kB). View file
|
|
requirements.txt
ADDED
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Preinstall requirements from TTS
|
2 |
+
TTS @ git+https://github.com/coqui-ai/TTS@v0.20.6
|
3 |
+
pydantic==1.10.13
|
4 |
+
python-multipart==0.0.6
|
5 |
+
typing-extensions>=4.8.0
|
6 |
+
cutlet
|
7 |
+
mecab-python3==1.0.6
|
8 |
+
unidic-lite==1.0.8
|
9 |
+
unidic==1.1.0
|
10 |
+
langid
|
11 |
+
deepspeed
|
12 |
+
pydub
|
13 |
+
librosa
|
14 |
+
ffmpeg-python
|
15 |
+
gradio_client
|
16 |
+
emoji
|
17 |
+
asyncio
|
18 |
+
noisereduce==3.0.0
|
19 |
+
faster-whisper==1.0.1
|
utils.py
ADDED
@@ -0,0 +1,410 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from __future__ import annotations
|
2 |
+
|
3 |
+
import io
|
4 |
+
import os
|
5 |
+
import re
|
6 |
+
import subprocess
|
7 |
+
import textwrap
|
8 |
+
import time
|
9 |
+
import uuid
|
10 |
+
import wave
|
11 |
+
|
12 |
+
import emoji
|
13 |
+
import gradio as gr
|
14 |
+
import langid
|
15 |
+
import nltk
|
16 |
+
import numpy as np
|
17 |
+
import noisereduce as nr
|
18 |
+
from huggingface_hub import HfApi
|
19 |
+
|
20 |
+
# Download the 'punkt' tokenizer for the NLTK library
|
21 |
+
nltk.download("punkt")
|
22 |
+
|
23 |
+
# will use api to restart space on a unrecoverable error
|
24 |
+
HF_TOKEN = os.environ.get("HF_TOKEN")
|
25 |
+
REPO_ID = os.environ.get("REPO_ID")
|
26 |
+
api = HfApi(token=HF_TOKEN)
|
27 |
+
|
28 |
+
# Cache of conditioning latents, keyed by voice name.
latent_map = {}


def get_latents(chatbot_voice, xtts_model, voice_cleanup=False):
    """Return the conditioning latents for ``chatbot_voice``, computing them once.

    The reference recording is ``examples/<chatbot_voice>.wav``. On the first
    request for a voice the latents are obtained from
    ``xtts_model.get_conditioning_latents`` and stored in ``latent_map``; later
    requests return the stored value (``voice_cleanup`` is then ignored).

    Args:
        chatbot_voice: Voice name; also the stem of the reference wav file.
        xtts_model: Object exposing ``get_conditioning_latents(audio_path=...)``.
        voice_cleanup: When true, first run the reference recording through an
            ffmpeg filter chain (band-limit, trim silence, mono, 22050 Hz). If
            ffmpeg fails or is not installed, the unfiltered file is used.

    Returns:
        The value returned by ``get_conditioning_latents``; callers in this
        file unpack it as ``(gpt_cond_latent, speaker_embedding)``.
    """
    if chatbot_voice in latent_map:
        return latent_map[chatbot_voice]

    speaker_wav = f"examples/{chatbot_voice}.wav"
    if voice_cleanup:
        cleanup_filter = "lowpass=8000,highpass=75,areverse,silenceremove=start_periods=1:start_silence=0:start_threshold=0.02,areverse,silenceremove=start_periods=1:start_silence=0:start_threshold=0.02"
        # The ".wav" suffix lets ffmpeg infer the output format.
        out_filename = speaker_wav + str(uuid.uuid4()) + ".wav"
        # Build argv explicitly; splitting a formatted string on spaces breaks
        # as soon as a path contains a space.
        command = [
            "ffmpeg", "-y",
            "-i", speaker_wav,
            "-af", cleanup_filter,
            "-ac", "1",
            "-ar", "22050",
            out_filename,
        ]
        try:
            subprocess.run(command, capture_output=False, text=True, check=True)
        except (subprocess.CalledProcessError, FileNotFoundError):
            # Non-zero exit code or ffmpeg missing: keep the original recording.
            print("Error: failed filtering, use original microphone input")
        else:
            speaker_wav = out_filename
            print("Filtered microphone input")

    latent_map[chatbot_voice] = xtts_model.get_conditioning_latents(audio_path=speaker_wav)
    return latent_map[chatbot_voice]
|
53 |
+
|
54 |
+
|
55 |
+
def detect_language(prompt, xtts_supported_languages=None):
    """Pick the language code to use for speaking ``prompt``.

    Args:
        prompt: Text that is about to be synthesized.
        xtts_supported_languages: Language codes the TTS model accepts; a
            built-in list is used when None.

    Returns:
        The code predicted by ``langid`` when the prompt is longer than 15
        characters and the code is supported ("zh" is mapped to "zh-cn");
        otherwise "en".
    """
    supported = xtts_supported_languages
    if supported is None:
        supported = ["en", "es", "fr", "de", "it", "pt", "pl", "tr", "ru", "nl", "cs", "ar", "zh-cn", "ja"]

    # Hard to detect language fast in short sentence, use english default
    if not len(prompt) > 15:
        print("Language: Prompt is short or autodetect language disabled using english for xtts")
        return "en"

    # Fast language autodetection; strip because the result may carry a space
    predicted = langid.classify(prompt)[0].strip()
    if predicted == "zh":
        # xtts names this language zh-cn
        predicted = "zh-cn"

    if predicted in supported:
        language = predicted
    else:
        print(f"Detected a language not supported by xtts :{predicted}, switching to english for now")
        gr.Warning(f"Language detected '{predicted}' can not be spoken properly 'yet' ")
        language = "en"

    print(f"Language: Predicted sentence language:{predicted} , using language for xtts:{language}")
    return language
|
79 |
+
|
80 |
+
def get_voice_streaming(prompt, language, chatbot_voice, xtts_model, suffix="0"):
    """Stream synthesized speech for ``prompt`` as raw 16-bit PCM byte chunks.

    Args:
        prompt: Text to synthesize.
        language: Language code passed to ``xtts_model.inference_stream``.
        chatbot_voice: Voice name used to look up the conditioning latents.
        xtts_model: Model exposing ``inference_stream``.
        suffix: Unused; kept so existing callers keep working.

    Yields:
        ``bytes`` holding int16 samples for each chunk the model produces.

    The generator ends early (without raising) when inference fails. On a CUDA
    "device-side assert" error it also asks the Hub to restart the Space.
    """
    gpt_cond_latent, speaker_embedding = get_latents(chatbot_voice, xtts_model)
    try:
        chunks = xtts_model.inference_stream(
            prompt,
            language,
            gpt_cond_latent,
            speaker_embedding,
            repetition_penalty=7.0,
            temperature=0.85,
        )
        for chunk in chunks:
            samples = chunk.detach().cpu().numpy().squeeze()
            # Clip before scaling so samples outside [-1, 1] saturate instead
            # of wrapping around in int16.
            samples = np.clip(samples, -1.0, 1.0)
            yield (samples * 32767).astype(np.int16).tobytes()

    except RuntimeError as e:
        if "device-side assert" in str(e):
            # Cannot do anything on a CUDA device-side error, need to restart
            print(
                f"Exit due to: Unrecoverable exception caused by prompt:{prompt}",
                flush=True,
            )
            gr.Warning("Unhandled Exception encounter, please retry in a minute")
            print("Cuda device-assert Runtime encountered need restart")

            # HF Space specific: HfApi.restart_space takes the target as `repo_id`
            api.restart_space(repo_id=REPO_ID)
        else:
            # No user-facing warning here: the original author noted this
            # happens on an empty chunk at the end of a stream.
            print("RuntimeError: non device-side assert error:", str(e))
    except Exception as e:
        # Keep the best-effort behavior (end the stream) but leave a trace.
        print("Unhandled exception while streaming speech:", str(e))
|
132 |
+
|
133 |
+
def wave_header_chunk(frame_input=b"", channels=1, sample_width=2, sample_rate=24000):
    """Return a WAV header followed by ``frame_input``.

    Intended as the first chunk of a streamed wav file; later chunks should be
    sent without a header, otherwise an artifact is heard at each chunk start.

    Args:
        frame_input: Raw PCM frames to place after the header.
        channels: Number of audio channels.
        sample_width: Bytes per sample.
        sample_rate: Frames per second.

    Returns:
        The complete in-memory WAV file as bytes.
    """
    buffer = io.BytesIO()
    with wave.open(buffer, "wb") as writer:
        writer.setparams((channels, sample_width, sample_rate, 0, "NONE", "not compressed"))
        writer.writeframes(frame_input)
    return buffer.getvalue()
|
146 |
+
|
147 |
+
def format_prompt(message, history):
    """Build the instruction-formatted prompt string sent to the LLM.

    Args:
        message: The newest user message; an empty string is replaced by "Hello".
        history: Iterable of (user_prompt, bot_response) pairs for earlier
            turns. A pair whose user_prompt is None contributes only the bot
            response.

    Returns:
        The system instructions, every earlier turn and the new message, each
        wrapped in [INST] ... [/INST] markers, as one string.
    """
    system_message = """
You are an empathetic, insightful, and supportive coach who helps people deal with challenges and celebrate achievements.
You help people feel better by asking questions to reflect on and evoke feelings of positivity, gratitude, joy, and love.
You show radical candor and tough love.
Respond in a casual and friendly tone.
Sprinkle in filler words, contractions, idioms, and other casual speech that we use in conversation.
Emulate the user’s speaking style and be concise in your response.
"""
    pieces = ["<s>[INST]", system_message, "[/INST]"]

    for user_prompt, bot_response in history:
        if user_prompt is not None:
            pieces.append(f"[INST] {user_prompt} [/INST]")
        pieces.append(f" {bot_response}</s> ")

    final_message = "Hello" if message == "" else message
    pieces.append(f"[INST] {final_message} [/INST]")
    return "".join(pieces)
|
168 |
+
|
169 |
+
def generate_llm_output(
    prompt,
    history,
    llm,
    temperature=0.8,
    max_tokens=256,
    top_p=0.95,
    stop_words=None,
):
    """Stream the LLM reply, yielding the accumulated text after every piece.

    Args:
        prompt: The newest user message.
        history: Earlier (user, bot) pairs, passed to ``format_prompt``.
        llm: Callable invoked as ``llm(prompt, temperature=..., max_tokens=...,
            top_p=..., stop=..., stream=True)``; it must return an iterable of
            responses with the text at ``response["choices"][0]["text"]``.
        temperature: Sampling temperature; values below 0.01 are raised to 0.01.
        max_tokens: Maximum number of tokens to generate.
        top_p: Nucleus sampling parameter.
        stop_words: Strings that end generation; defaults to
            ``["<s>", "[/INST]", "</s>"]``.

    Yields:
        The reply text accumulated so far. Generation stops when a streamed
        piece equals a stop word or is an emoji. If anything raises, a fixed
        apology sentence is yielded instead.
    """
    if stop_words is None:
        stop_words = ["<s>", "[/INST]", "</s>"]

    temperature = max(float(temperature), 1e-2)
    top_p = float(top_p)

    formatted_prompt = format_prompt(prompt, history)
    try:
        print("LLM Input:", formatted_prompt)
        # Local GGUF
        stream = llm(
            formatted_prompt,
            temperature=temperature,
            max_tokens=max_tokens,
            top_p=top_p,
            stop=stop_words,
            stream=True,
        )
        output = ""
        for response in stream:
            piece = response["choices"][0]["text"]

            if piece in stop_words:
                # end of context
                return

            if emoji.is_emoji(piece):
                # An emoji breaks the following chat lines, so stop here
                return

            output += piece
            yield output

    except Exception as e:
        print("Unhandled Exception: ", str(e))
        gr.Warning("Unfortunately Mistral is unable to process")
        # This is a generator: a `return value` here would be discarded by the
        # consuming for-loop, so the fallback has to be yielded.
        yield "I do not know what happened but I could not understand you ."
|
218 |
+
|
219 |
+
def _split_llm_sentences(text):
    """Strip known LLM marker strings from ``text`` and split it into sentences."""
    cleaned = (
        text.replace("\n", " ")
        .replace("<|assistant|>", " ")
        .replace("<|ass>", "")
        .replace("[/ASST]", "")
        .replace("[/ASSI]", "")
        .replace("[/ASS]", "")
        # NOTE(review): the search string is empty in this copy of the source,
        # which makes this call a no-op; it may originally have held a
        # character lost in extraction - confirm against the repository.
        .replace("", "")
        .strip()
    )
    return nltk.sent_tokenize(cleaned)


def get_sentence(history, llm):
    """Yield the LLM reply one sentence at a time while it is being generated.

    The last entry of ``history`` is the pending turn: its first element is the
    user message and its second element is overwritten in place with the reply
    as it grows.

    Args:
        history: List of [user, bot] pairs, or None.
        llm: Model object forwarded to ``generate_llm_output``.

    Yields:
        ``(sentence, history)`` each time a sentence that has not been emitted
        yet is complete, and once more for the final sentence. A first
        sentence of at most 15 characters is held back and prepended to a
        later one so that language auto-detection gets more text.
    """
    history = [["", None]] if history is None else history
    history[-1][1] = ""
    sentence_list = []
    sentence_hash_list = []

    text_to_generate = ""
    stored_sentence = None
    stored_sentence_hash = None

    for character in generate_llm_output(history[-1][0], history[:-1], llm):
        history[-1][1] = character.replace("<|assistant|>", "")
        # It is coming word by word
        text_to_generate = _split_llm_sentences(history[-1][1])
        if len(text_to_generate) > 1:

            dif = len(text_to_generate) - len(sentence_list)

            if dif == 1 and len(sentence_list) != 0:
                continue

            if dif == 2 and len(sentence_list) != 0 and stored_sentence is not None:
                continue

            # All this complexity due to trying append first short sentence to next one for proper language auto-detect
            if stored_sentence is not None and stored_sentence_hash is None and dif > 1:
                # means we consumed stored sentence and should look at next sentence to generate
                sentence = text_to_generate[len(sentence_list) + 1]
            elif stored_sentence is not None and len(text_to_generate) > 2 and stored_sentence_hash is not None:
                print("Appending stored")
                sentence = stored_sentence + text_to_generate[len(sentence_list) + 1]
                stored_sentence_hash = None
            else:
                sentence = text_to_generate[len(sentence_list)]

            # too short sentence just append to next one if there is any
            # this is for proper language detection
            if len(sentence) <= 15 and stored_sentence_hash is None and stored_sentence is None:
                if sentence[-1] in [".", "!", "?"]:
                    if stored_sentence_hash != hash(sentence):
                        stored_sentence = sentence
                        stored_sentence_hash = hash(sentence)
                        print("Storing:", stored_sentence)
                        continue

            sentence_hash = hash(sentence)
            if stored_sentence_hash is not None and sentence_hash == stored_sentence_hash:
                continue

            if sentence_hash not in sentence_hash_list:
                sentence_hash_list.append(sentence_hash)
                sentence_list.append(sentence)
                print("New Sentence: ", sentence)
                yield (sentence, history)

    # return that final sentence token
    try:
        last_sentence = _split_llm_sentences(history[-1][1])[-1]
        sentence_hash = hash(last_sentence)
        if sentence_hash not in sentence_hash_list:
            if stored_sentence is not None and stored_sentence_hash is not None:
                last_sentence = stored_sentence + last_sentence
                stored_sentence = stored_sentence_hash = None
                print("Last Sentence with stored:", last_sentence)

            sentence_hash_list.append(sentence_hash)
            sentence_list.append(last_sentence)
            print("Last Sentence: ", last_sentence)

            yield (last_sentence, history)
    except Exception:
        # Typically IndexError when the reply is empty. `Exception` (not a
        # bare except) lets GeneratorExit/KeyboardInterrupt pass through.
        print("ERROR on last sentence history is :", history)
|
292 |
+
|
293 |
+
# will generate speech audio file per sentence
|
294 |
+
def generate_speech_for_sentence(history, chatbot_voice, sentence, xtts_model, xtts_supported_languages=None, filter_output=True, return_as_byte=False):
    """Synthesize speech for one sentence of the chatbot reply.

    Args:
        history: Chat history; returned unchanged alongside the audio.
        chatbot_voice: Voice name forwarded to ``get_voice_streaming``.
        sentence: Text to speak. Code spans, parenthesised text and a few
            marker strings are removed before synthesis.
        xtts_model: TTS model forwarded to ``get_voice_streaming``.
        xtts_supported_languages: Forwarded to ``detect_language``.
        filter_output: When true, run noise reduction over the audio.
        return_as_byte: When true, the update carries raw 16-bit PCM bytes;
            otherwise it carries the path of a wav file written under /tmp.

    Returns:
        ``(history, gr.Audio.update(...))`` when audio was produced, otherwise
        None (empty sentence, nothing speakable, or synthesis produced no data).

    Raises:
        RuntimeError: Re-raised unless it is a CUDA "device-side assert", in
            which case a restart of the Space is requested instead.
    """
    language = "autodetect"

    if len(sentence) == 0:
        print("EMPTY SENTENCE")
        return

    # Sometimes prompt </s> coming on output remove it
    # Some post process for speech only
    sentence = sentence.replace("</s>", "")
    # remove code from speech
    sentence = re.sub(r"```.*```", "", sentence, flags=re.DOTALL)
    sentence = re.sub(r"`.*`", "", sentence, flags=re.DOTALL)

    sentence = re.sub(r"\(.*\)", "", sentence, flags=re.DOTALL)

    sentence = sentence.replace("```", "")
    sentence = sentence.replace("...", " ")
    sentence = sentence.replace("(", " ")
    sentence = sentence.replace(")", " ")
    sentence = sentence.replace("<|assistant|>", "")

    if len(sentence) == 0:
        print("EMPTY SENTENCE after processing")
        return

    # Put a space before sentence punctuation that directly follows a word or
    # non-ASCII character, and double the punctuation mark.
    sentence = re.sub(r"([^\x00-\x7F]|\w)([.。?!])", r"\1 \2\2", sentence)

    print("Sentence for speech:", sentence)

    try:
        SENTENCE_SPLIT_LENGTH = 350
        if len(sentence) < SENTENCE_SPLIT_LENGTH:
            sentence_list = [sentence]
        else:
            # nltk has usually split sentences already; very long ones are
            # additionally wrapped at the last possible position.
            sentence_list = textwrap.wrap(sentence, SENTENCE_SPLIT_LENGTH)
            print("SPLITTED LONG SENTENCE:", sentence_list)

        audio_parts = []
        for piece in sentence_list:
            if not any(c.isalnum() for c in piece):
                # likely got a ' or " or some other text without alphanumeric in it
                continue

            if language == "autodetect":
                # detect on the first piece; later pieces reuse the language
                language = detect_language(piece, xtts_supported_languages)

            for chunk in get_voice_streaming(piece, language, chatbot_voice, xtts_model):
                # Skip empty chunks (the stream sometimes ends with one).
                if chunk:
                    audio_parts.append(chunk)

        wav_bytestream = b"".join(audio_parts)

        if wav_bytestream:
            # Filter output for better voice
            if filter_output:
                data_s16 = np.frombuffer(wav_bytestream, dtype=np.int16, count=len(wav_bytestream) // 2, offset=0)
                float_data = data_s16 * 0.5**15
                reduced_noise = nr.reduce_noise(y=float_data, sr=24000, prop_decrease=0.8, n_fft=1024)
                wav_bytestream = (reduced_noise * 32767).astype(np.int16).tobytes()

            if return_as_byte:
                return (history, gr.Audio.update(value=wav_bytestream, autoplay=True))

            audio_unique_filename = "/tmp/" + str(uuid.uuid4()) + ".wav"
            with wave.open(audio_unique_filename, "w") as f:
                f.setnchannels(1)
                # 2 bytes per sample.
                f.setsampwidth(2)
                f.setframerate(24000)
                f.writeframes(wav_bytestream)

            return (history, gr.Audio.update(value=audio_unique_filename, autoplay=True))

    except RuntimeError as e:
        if "device-side assert" in str(e):
            # Cannot do anything on a CUDA device-side error, need to restart
            print(
                f"Exit due to: Unrecoverable exception caused by prompt:{sentence}",
                flush=True,
            )
            gr.Warning("Unhandled Exception encounter, please retry in a minute")
            print("Cuda device-assert Runtime encountered need restart")

            # HF Space specific: HfApi.restart_space takes the target as `repo_id`
            api.restart_space(repo_id=REPO_ID)
        else:
            print("RuntimeError: non device-side assert error:", str(e))
            raise

    print("All speech ended")
    return
|