DSatishchandra committed on
Commit
f0ffae0
1 Parent(s): df2a3af

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +19 -59
app.py CHANGED
@@ -1,61 +1,20 @@
1
  import gradio as gr
2
- import speech_recognition as sr
3
- from gtts import gTTS
4
- import os
5
- import pygame # Use pygame for playing audio
6
- from transformers import pipeline
7
-
8
- # Initialize pygame for audio playback
9
- pygame.mixer.init()
10
-
11
- # Initialize recognizer for speech recognition
12
- recognizer = sr.Recognizer()
13
-
14
- # Initialize Hugging Face NLP pipeline for intent recognition using a specific model
15
- nlp = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
16
-
17
- # Define the food menu
18
- menu = {
19
- 'Pizza': ['Cheese', 'Pepperoni', 'Vegetarian'],
20
- 'Beverages': ['Coke', 'Pepsi', 'Water']
21
- }
22
-
23
- # Function to process the order
24
- def process_order(order):
25
- if 'pizza' in order.lower():
26
- return "What type of pizza would you like? Cheese, Pepperoni, or Vegetarian?"
27
- elif 'coke' in order.lower():
28
- return "One Coke added to your order."
29
- else:
30
- return "Sorry, we didn't catch that. Please try again."
31
-
32
- # Function to handle speech recognition from audio files or microphone
33
- def recognize_speech(audio):
34
- try:
35
- # If audio is from file, use SpeechRecognition to convert speech to text
36
- if isinstance(audio, str): # Audio file input (filepath)
37
- with sr.AudioFile(audio) as source:
38
- audio_data = recognizer.record(source)
39
- text = recognizer.recognize_google(audio_data)
40
- else: # Audio from microphone input
41
- text = recognizer.recognize_google(audio)
42
-
43
- print(f"Recognized text: {text}") # Print the recognized text for debugging
44
-
45
- response = process_order(text)
46
-
47
- # Using gTTS to respond back with speech
48
- tts = gTTS(text=response, lang='en')
49
- tts.save("response.mp3")
50
-
51
- # Play the MP3 response using pygame
52
- pygame.mixer.music.load("response.mp3")
53
- pygame.mixer.music.play()
54
-
55
- return response
56
- except Exception as e:
57
- print(f"Error: {e}") # Print the error for debugging
58
- return "Sorry, I could not understand."
59
 
60
  # Gradio Interface for the app
61
  def create_gradio_interface():
@@ -66,10 +25,11 @@ def create_gradio_interface():
66
  audio_input = gr.Audio(type="filepath", label="Speak to the bot (Upload or Record Audio)")
67
 
68
  # Display the bot's response after recognition
69
- output_text = gr.Textbox(label="Bot Response")
 
70
 
71
  # Define the button to process the audio input
72
- audio_input.change(fn=recognize_speech, inputs=audio_input, outputs=output_text)
73
 
74
  return demo
75
 
 
1
  import gradio as gr
2
+ from transformers import pipeline, TFAutoModelForSeq2SeqLM, AutoTokenizer
3
+ import torch
4
+
5
# Initialize Hugging Face pipelines once at module import.  Both checkpoints
# are large, so the first startup is slow while they download.
speech_to_text = pipeline("automatic-speech-recognition", model="openai/whisper-large")

# NOTE(review): in transformers, device=0 selects the FIRST GPU; CPU is
# device=-1 (the comment previously had this backwards).  Also confirm that
# "facebook/tacotron2" exists as a text-to-speech checkpoint on the Hub —
# Tacotron2 is not shipped by transformers, so this line will likely fail to
# load; "microsoft/speecht5_tts" or "suno/bark-small" are known-working
# alternatives for the "text-to-speech" task.
text_to_speech = pipeline("text-to-speech", model="facebook/tacotron2", device=0)
8
+
9
# Function to process speech to text and text to speech
def process_audio(input_audio):
    """Transcribe an audio clip, then synthesize the transcript back to speech.

    Parameters
    ----------
    input_audio : str or None
        Filepath supplied by the Gradio ``Audio`` input (``type="filepath"``);
        ``None`` when the component is cleared.

    Returns
    -------
    tuple
        ``((sampling_rate, waveform), recognized_text)`` — the first element
        is the ``(rate, numpy array)`` pair that ``gr.Audio(type="numpy")``
        expects, the second the transcribed text.
    """
    # Gradio fires the change event with None when the input is cleared;
    # return empty outputs instead of crashing the Whisper pipeline.
    if input_audio is None:
        return None, ""

    # Speech-to-text via the Whisper pipeline (returns {"text": ...}).
    recognized_text = speech_to_text(input_audio)["text"]
    print(f"Recognized text: {recognized_text}")

    # Text-to-speech: the transformers TTS pipeline returns a dict of
    # {"audio": ndarray, "sampling_rate": int}.  gr.Audio(type="numpy")
    # requires a (sampling_rate, data) tuple, so unpack it here — returning
    # the raw dict (as the previous version did) is not renderable by Gradio.
    audio_response = text_to_speech(recognized_text)
    return (audio_response["sampling_rate"], audio_response["audio"]), recognized_text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
18
 
19
  # Gradio Interface for the app
20
  def create_gradio_interface():
 
25
  audio_input = gr.Audio(type="filepath", label="Speak to the bot (Upload or Record Audio)")
26
 
27
  # Display the bot's response after recognition
28
+ output_audio = gr.Audio(label="Bot Response", type="numpy")
29
+ output_text = gr.Textbox(label="Bot Response (Text)")
30
 
31
  # Define the button to process the audio input
32
+ audio_input.change(fn=process_audio, inputs=audio_input, outputs=[output_audio, output_text])
33
 
34
  return demo
35