import asyncio
import os

import gradio as gr
import edge_tts
from huggingface_hub import hf_hub_download
from faster_whisper import WhisperModel

# Build a CUDA-enabled llama-cpp-python before importing Llama so the GPU build
# is the one that actually gets loaded.
os.environ["CUDACXX"] = "/usr/local/cuda/bin/nvcc"
os.system('python -m unidic download')
os.system('CMAKE_ARGS="-DLLAMA_CUBLAS=on" pip install llama-cpp-python==0.2.11 --verbose')

from llama_cpp import Llama
from utilsasync import get_sentence, tts_interface

# The device to load the model onto.
#
# Available device types:
# "cuda" - NVIDIA GPU
# "cpu"  - Plain CPU
# "mps"  - Apple silicon
device = "cuda"

# Load the Mistral LLM (quantized GGUF) and offload most layers to the GPU
print("Loading Mistral LLM")
hf_hub_download(
    repo_id="TheBloke/Mistral-7B-Instruct-v0.1-GGUF",
    local_dir=".",
    filename="mistral-7b-instruct-v0.1.Q5_K_M.gguf",
)
mistral_model_path = "./mistral-7b-instruct-v0.1.Q5_K_M.gguf"
mistral_llm = Llama(
    model_path=mistral_model_path,
    n_gpu_layers=35,
    max_new_tokens=256,
    context_window=4096,
    n_ctx=4096,
    n_batch=128,
    verbose=False,
)

# Load the Whisper ASR model
print("Loading Whisper ASR")
whisper_model = WhisperModel("large-v3", device="cpu", compute_type="float32")


# Get all available voices from edge_tts
async def get_voices():
    voices = await edge_tts.list_voices()
    return {f"{v['ShortName']} - {v['Locale']} ({v['Gender']})": v['ShortName'] for v in voices}


# Triggered on text submit: appends the user message to the chat history and clears the textbox
def add_text(chatbot_history, text):
    chatbot_history = [] if chatbot_history is None else chatbot_history
    chatbot_history = chatbot_history + [(text, None)]
    return chatbot_history, gr.update(value="", interactive=True)


# Triggered on voice submit: transcribes the recording and appends it to the chat history
def add_audio(chatbot_history, audio):
    chatbot_history = [] if chatbot_history is None else chatbot_history
    # Transcribe with Whisper; only the first segment is used, stripped of surrounding whitespace
    segments, _ = whisper_model.transcribe(audio)
    text = list(segments)[0].text.strip()
    print("Transcribed text:", text)
    chatbot_history = chatbot_history + [(text, None)]
    return chatbot_history, gr.update(value="", interactive=True)


# Streams a response from the LLM sentence by sentence and turns each sentence into audio with the TTS model
def respond(chat_history, voice):
    if not voice:
        return None, gr.Warning("Please select a voice.")
    for sentence, chatbot_history in get_sentence(chat_history, mistral_llm):
        print("Inserting sentence to queue")
        print(sentence)
        audiopb = tts_interface(sentence, voice)
        yield chatbot_history, sentence, audiopb


# Gradio interface
async def create_demo():
    voices = await get_voices()

    # Interface code
    with gr.Blocks(title="Chat with LLM - POC") as demo:
        DESCRIPTION = """# Chat with LLM - POC"""
        gr.Markdown(DESCRIPTION)
        with gr.Row():
            with gr.Column(scale=1, min_width=300):
                voice = gr.Dropdown(choices=[""] + list(voices.keys()), label="Select Voice", value="")
                user_msg = gr.Textbox(placeholder="Enter text here or speak into your microphone")
                audio_record = gr.Audio(sources=["microphone"], type="filepath", scale=4)
                ai_response = gr.Label(show_label=True, label="LLM sentence currently being processed")
                submit_button = gr.Button("Submit")
                speech_button = gr.Button("Test Speech", visible=False)
                audio_playback = gr.Audio(
                    value=None,
                    label="Generated audio response",
                    streaming=True,
                    autoplay=True,
                    interactive=False,
                    show_label=True,
                    visible=False,
                )
            with gr.Column(scale=1, min_width=300):
                # Chatbot component with an initial greeting
                chatbot = gr.Chatbot(
                    value=[(None, "Hi, I'm an AI training assistant. Let's get going, how should we start?")],
                    elem_id="Conversation",
                    bubble_full_width=False,
                )

        speech_button.click(fn=tts_interface, inputs=[user_msg, voice], outputs=[audio_playback])

        audio_record.stop_recording(
            fn=add_audio, inputs=[chatbot, audio_record], outputs=[chatbot, user_msg], queue=False
        ).then(
            fn=respond, inputs=[chatbot, voice], outputs=[chatbot, ai_response, audio_playback]
        )

        submit_button.click(
            fn=add_text, inputs=[chatbot, user_msg], outputs=[chatbot, user_msg], queue=False
        ).then(
            fn=respond, inputs=[chatbot, voice], outputs=[chatbot, ai_response, audio_playback]
        )

    return demo


# Run the application
demo = asyncio.run(create_demo())
demo.launch()