Dpngtm committed on
Commit 0a3c034
1 Parent(s): 87f6c9c

Update app.py

Files changed (1)
  1. app.py +15 -40
app.py CHANGED
@@ -16,70 +16,46 @@ processor = Wav2Vec2Processor.from_pretrained(model_name, num_labels=len(emotion_labels))
 # Define device
 device = "cuda" if torch.cuda.is_available() else "cpu"
 model.to(device)
-model.eval() # Set model to evaluation mode
+model.eval()
 
 def recognize_emotion(audio):
-    """
-    Predicts the emotion and confidence scores from an audio file.
-    Max duration: 60 seconds
-    """
     try:
         if audio is None:
             return {emotion: 0.0 for emotion in emotion_labels}
 
-        # Handle audio input
         audio_path = audio if isinstance(audio, str) else audio.name
-
-        # Load and resample audio
         speech_array, sampling_rate = torchaudio.load(audio_path)
 
-        # Check audio duration
         duration = speech_array.shape[1] / sampling_rate
-        if duration > 60: # 60 seconds (1 minute) limit
+        if duration > 60:
             return {
                 "Error": "Audio too long (max 1 minute)",
                 **{emotion: 0.0 for emotion in emotion_labels}
             }
 
-        # Resample if needed
         if sampling_rate != 16000:
             resampler = torchaudio.transforms.Resample(orig_freq=sampling_rate, new_freq=16000)
             speech_array = resampler(speech_array)
 
-        # Convert to mono if stereo
         if speech_array.shape[0] > 1:
             speech_array = torch.mean(speech_array, dim=0, keepdim=True)
 
-        # Normalize audio
         speech_array = speech_array / torch.max(torch.abs(speech_array))
-
-        # Convert to numpy and squeeze
         speech_array = speech_array.squeeze().numpy()
 
-        # Process input
-        inputs = processor(
-            speech_array,
-            sampling_rate=16000,
-            return_tensors='pt',
-            padding=True
-        )
+        inputs = processor(speech_array, sampling_rate=16000, return_tensors='pt', padding=True)
         input_values = inputs.input_values.to(device)
 
-        # Get predictions
         with torch.no_grad():
             outputs = model(input_values)
             logits = outputs.logits
-
-        # Get probabilities using softmax
         probs = F.softmax(logits, dim=-1)[0].cpu().numpy()
 
-        # Get confidence scores for all emotions
         confidence_scores = {
-            emotion: round(float(prob) * 100, 2) # Convert to percentage with 2 decimal places
+            emotion: round(float(prob) * 100, 2)
            for emotion, prob in zip(emotion_labels, probs)
        }
 
-        # Sort confidence scores by value
         sorted_scores = dict(sorted(
             confidence_scores.items(),
             key=lambda x: x[1],
@@ -94,14 +70,13 @@ def recognize_emotion(audio):
             **{emotion: 0.0 for emotion in emotion_labels}
         }
 
-# Create Gradio interface
 interface = gr.Interface(
     fn=recognize_emotion,
     inputs=gr.Audio(
-        sources=["microphone", "upload"],
+        sources=["microphone", "upload"],
         type="filepath",
         label="Upload audio or record from microphone",
-        max_length=60 # Set max length to 60 seconds in Gradio interface
+        max_length=60
     ),
     outputs=gr.Label(
         num_top_classes=len(emotion_labels),
@@ -130,13 +105,13 @@ interface = gr.Interface(
     - Maximum audio length: 1 minute
     - Best results with clear speech and minimal background noise
     - Confidence scores are shown as percentages
-    """,
-
+    """
+)
 
-# Launch the app
-interface.launch(
-    share=True,
-    debug=True,
-    server_name="0.0.0.0",
-    server_port=7860
-)
+if __name__ == "__main__":
+    interface.launch(
+        share=True,
+        debug=True,
+        server_name="0.0.0.0",
+        server_port=7860
+    )
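
For a quick local check of the refactored recognize_emotion before launching the Gradio interface, one option is to generate a short WAV file and pass its path straight to the function. The sketch below is not part of this commit: it assumes app.py exposes recognize_emotion at module level as shown in the diff, that torch and torchaudio are installed, and the helper make_test_wav and the file name test_tone.wav are made up for illustration.

# Hypothetical smoke test, not part of this commit. Assumes app.py can be
# imported and that recognize_emotion returns percentage scores as in the diff.
import math

import torch
import torchaudio

from app import recognize_emotion  # assumed module and function name


def make_test_wav(path="test_tone.wav", seconds=2.0, sr=16000, freq=220.0):
    # Write a short mono sine tone so the pipeline gets a valid input under 60 s.
    t = torch.arange(int(seconds * sr)) / sr
    wave = 0.5 * torch.sin(2 * math.pi * freq * t).unsqueeze(0)  # shape [1, frames]
    torchaudio.save(path, wave, sr)
    return path


if __name__ == "__main__":
    scores = recognize_emotion(make_test_wav())
    for emotion, pct in scores.items():  # sorted_scores in app.py suggests descending order
        print(f"{emotion}: {pct}%")

Because the commit moves interface.launch() under an if __name__ == "__main__": guard, importing app for a check like this builds the interface but no longer starts the server.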