sagar007 committed on
Commit
24a5380
1 Parent(s): 8067820

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +13 -25
app.py CHANGED
@@ -1,39 +1,27 @@
1
  import gradio as gr
2
  import torch
3
- from moshi import MoshiServer
4
 
5
- # Initialize MoshiServer
6
- server = MoshiServer()
7
- server.load_model("kyutai/moshika-pytorch-bf16")
8
 
9
- def process_audio(audio, sample_rate):
10
- # Convert audio to the correct format
11
  audio_tensor = torch.tensor(audio).float()
12
  if audio_tensor.dim() == 2:
13
  audio_tensor = audio_tensor.mean(dim=1)
14
 
15
- # Process audio
16
- result = server.process_audio(audio_tensor, sample_rate)
17
- return result.text
18
-
19
- def generate_speech(text):
20
- # Generate speech from text
21
- result = server.generate_speech(text)
22
- return (server.config.sample_rate, result.audio.cpu().numpy())
23
 
24
  # Create Gradio interface
25
  iface = gr.Interface(
26
- fn=[process_audio, generate_speech],
27
- inputs=[
28
- gr.Audio(source="microphone", type="numpy", label="Speak"),
29
- gr.Textbox(label="Enter text for speech synthesis")
30
- ],
31
- outputs=[
32
- gr.Textbox(label="Transcription"),
33
- gr.Audio(label="Synthesized Speech")
34
- ],
35
- title="Moshi Speech-Text Interaction",
36
- description="Interact with Moshi for speech-to-text and text-to-speech tasks."
37
  )
38
 
39
  # Launch the app
 
1
  import gradio as gr
2
  import torch
3
+ from moshi import load_model, transcribe
4
 
5
+ # Load the model
6
+ model = load_model("kyutai/moshika-pytorch-bf16")
 
7
 
8
+ def process_audio(audio):
9
+ # Convert audio to tensor
10
  audio_tensor = torch.tensor(audio).float()
11
  if audio_tensor.dim() == 2:
12
  audio_tensor = audio_tensor.mean(dim=1)
13
 
14
+ # Transcribe audio
15
+ transcription = transcribe(model, audio_tensor)
16
+ return transcription
 
 
 
 
 
17
 
18
  # Create Gradio interface
19
  iface = gr.Interface(
20
+ fn=process_audio,
21
+ inputs=gr.Audio(source="upload", type="numpy"),
22
+ outputs=gr.Textbox(label="Transcription"),
23
+ title="Moshi Speech-to-Text",
24
+ description="Upload an audio file to transcribe using the Moshi model."
 
 
 
 
 
 
25
  )
26
 
27
  # Launch the app