"""Gradio voice bot: transcribe spoken audio with Whisper and speak it back."""

from typing import Any, Optional, Tuple

import gradio as gr
from transformers import pipeline, TFAutoModelForSeq2SeqLM, AutoTokenizer
import torch

# transformers pipelines take a CUDA device *index* (0 = first GPU) or -1 for
# CPU. Hard-coding 0 fails on hosts without a GPU, so pick at startup.
TTS_DEVICE = 0 if torch.cuda.is_available() else -1

# Initialize Hugging Face pipelines (models are loaded once, at import time).
speech_to_text = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-large",
)
# NOTE(review): "facebook/tacotron2" may not be loadable by the transformers
# "text-to-speech" pipeline -- confirm the checkpoint, or switch to a model
# documented as supported by that pipeline.
text_to_speech = pipeline(
    "text-to-speech",
    model="facebook/tacotron2",
    device=TTS_DEVICE,
)


def process_audio(input_audio: Optional[str]) -> Tuple[Optional[Tuple[int, Any]], str]:
    """Transcribe an audio file and synthesize the transcript back to speech.

    Args:
        input_audio: Path of the recorded/uploaded audio file, or ``None``
            when the audio component has been cleared.

    Returns:
        A ``(audio, text)`` pair. ``audio`` is a ``(sampling_rate, samples)``
        tuple as expected by ``gr.Audio(type="numpy")``, or ``None`` when
        there is nothing to play. ``text`` is the recognized transcript
        (empty string when no input was given).
    """
    # The change event also fires when the user clears the widget; there is
    # nothing to transcribe in that case.
    if input_audio is None:
        return None, ""

    # Convert the audio to text using the Whisper model (speech-to-text).
    recognized_text = speech_to_text(input_audio)["text"]
    print(f"Recognized text: {recognized_text}")

    # Skip synthesis when nothing was recognized (e.g. silence).
    if not recognized_text.strip():
        return None, recognized_text

    # The text-to-speech pipeline returns a dict with "audio" and
    # "sampling_rate"; Gradio's numpy audio output needs (rate, samples).
    synthesized = text_to_speech(recognized_text)
    # Drop a leading batch axis of size 1 so samples are 1-D for Gradio.
    samples = synthesized["audio"].squeeze()
    audio_response = (synthesized["sampling_rate"], samples)

    return audio_response, recognized_text


def create_gradio_interface():
    """Build the Gradio Blocks UI and wire the audio input to `process_audio`.

    Returns:
        The constructed ``gr.Blocks`` app (not yet launched).
    """
    with gr.Blocks() as demo:
        gr.Markdown("## AI Voice Bot for Food Ordering")

        # Audio input: user speaks into the microphone or uploads a file;
        # the callback receives a file path.
        audio_input = gr.Audio(
            type="filepath",
            label="Speak to the bot (Upload or Record Audio)",
        )

        # Display the bot's response after recognition.
        output_audio = gr.Audio(label="Bot Response", type="numpy")
        output_text = gr.Textbox(label="Bot Response (Text)")

        # Run the pipeline whenever the audio input changes (no button).
        audio_input.change(
            fn=process_audio,
            inputs=audio_input,
            outputs=[output_audio, output_text],
        )

    return demo


# Create and launch the Gradio app.
if __name__ == "__main__":
    app = create_gradio_interface()
    app.launch(share=True)