Files changed (1)
  1. app__.py +111 -0
app__.py ADDED
@@ -0,0 +1,111 @@
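+ # AI Voice Assistant pipeline: Whisper (speech-to-text) -> Llama 3 via the HF
+ # Inference API (response generation) -> SpeechT5 (text-to-speech), served with Gradio.
+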
+ import os
+ import time
+
+ import gradio as gr
+ import soundfile as sf
+ import torch
+ import whisper
+ from datasets import load_dataset
+ from llama_index.core import Settings
+ from llama_index.llms.text_generation_inference import TextGenerationInference
+ from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
+
+ # Load the Whisper ASR model once at startup ("base" trades accuracy for speed)
+ model = whisper.load_model("base")
+ HF_API_TOKEN = os.getenv("HF_TOKEN")
+
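+ # Speech-to-text stage. Whisper pads or trims input to its fixed 30-second
+ # context, so longer recordings are truncated; chunking the audio first would
+ # be needed to transcribe more than 30 seconds.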
+ def translate_audio(audio):
+     # Load the recording and pad/trim it to Whisper's 30-second window
+     audio = whisper.load_audio(audio)
+     audio = whisper.pad_or_trim(audio)
+
+     # Make a log-Mel spectrogram and move it to the same device as the model
+     mel = whisper.log_mel_spectrogram(audio).to(model.device)
+
+     # Decode the audio; temperature=0 selects greedy, deterministic decoding
+     options = whisper.DecodingOptions(language="en", task="transcribe", temperature=0)
+     result = whisper.decode(model, mel, options)
+     return result.text
+
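+ # Text-to-speech stage. Loading SpeechT5 and the speaker-embedding dataset inside
+ # the function keeps it self-contained, but re-instantiates them on every call;
+ # hoisting these loads to module level would be the usual optimization.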
+ def audio_response(text, output_path="speech.wav"):
+     # Load the processor, TTS model, and vocoder
+     processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
+     tts_model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
+     vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
+
+     # Tokenize the input text
+     inputs = processor(text=text, return_tensors="pt")
+
+     # Load an x-vector with the speaker's voice characteristics (index 7306 is
+     # the example speaker used in the SpeechT5 documentation)
+     embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
+     speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
+
+     # Generate speech
+     with torch.no_grad():
+         speech = tts_model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)
+
+     # Save the waveform; SpeechT5's HiFi-GAN vocoder outputs 16 kHz audio
+     sf.write(output_path, speech.numpy(), samplerate=16000)
+
+     return output_path
+
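+ # Prompt helpers. The <|system|>/<|user|>/<|assistant|> markers with </s> below
+ # follow the Zephyr-style chat format; Llama 3 Instruct defines its own header
+ # tokens, so a model-specific template may work better with the endpoint
+ # configured further down.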
+ def messages_to_prompt(messages):
+     # Default system message for the chatbot
+     default_system_prompt = "You are an AI chatbot designed to assist with user queries in a friendly and conversational manner."
+
+     # Seed the prompt with the default system turn
+     prompt = f"<|system|>\n{default_system_prompt}</s>\n"
+
+     for message in messages:
+         if message.role == "system":
+             prompt += f"<|system|>\n{message.content}</s>\n"
+         elif message.role == "user":
+             prompt += f"<|user|>\n{message.content}</s>\n"
+         elif message.role == "assistant":
+             prompt += f"<|assistant|>\n{message.content}</s>\n"
+
+     # End with the assistant tag so the model generates the next reply
+     prompt = prompt + "<|assistant|>\n"
+
+     return prompt
+
+ def completion_to_prompt(completion):
+     # Wrap a bare completion in the same chat template, with an empty system turn
+     return f"<|system|>\n</s>\n<|user|>\n{completion}</s>\n<|assistant|>\n"
+
+ Settings.llm = TextGenerationInference(
+     model_url="https://api-inference.huggingface.co/models/meta-llama/Meta-Llama-3-8B-Instruct",
+     token=HF_API_TOKEN,
+     messages_to_prompt=messages_to_prompt,
+     completion_to_prompt=completion_to_prompt,
+ )
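+
+ # NOTE: Meta-Llama-3-8B-Instruct is gated on the Hugging Face Hub, so the token
+ # must belong to an account that has accepted the model license, and the
+ # serverless Inference API has to be serving the model for this URL to respond.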
+
+ def text_response(t):
+     time.sleep(1)  # Small delay between requests; adjust or remove as needed
+     response = Settings.llm.complete(t)
+     return response.text
+
+ def transcribe_(a):
+     t1 = translate_audio(a)   # speech -> transcript
+     t2 = text_response(t1)    # transcript -> LLM reply
+     t3 = audio_response(t2)   # LLM reply -> synthesized speech
+     return (t1, t2, t3)
+
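+ # Gradio wiring: one microphone recording fans out to the three outputs below;
+ # share=True additionally exposes a temporary public link.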
+ output_1 = gr.Textbox(label="Speech to Text")
+ output_2 = gr.Textbox(label="LLM Output")
+ output_3 = gr.Audio(label="LLM output to audio")
+
+ gr.Interface(
+     title="AI Voice Assistant",
+     fn=transcribe_,
+     inputs=[
+         gr.Audio(sources=["microphone"], type="filepath"),
+     ],
+     outputs=[output_1, output_2, output_3],
+ ).launch(share=True)
+