sagar007 committed
Commit 8067820
1 Parent(s): 9f43cd0

Update app.py

Files changed (1)
app.py +17 -25
app.py CHANGED
@@ -1,37 +1,29 @@
 import gradio as gr
 import torch
-from moshi.model import Moshi
-from moshi.tokenizer import Tokenizer
-from moshi.configuration import MoshiConfig
+from moshi import MoshiServer

-# Initialize Moshi
-config = MoshiConfig()
-tokenizer = Tokenizer(config)
-moshi = Moshi(config)
-moshi.load_pretrained("kyutai/moshika-pytorch-bf16")
+# Initialize MoshiServer
+server = MoshiServer()
+server.load_model("kyutai/moshika-pytorch-bf16")

-# Move to GPU if available
-device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-moshi.to(device)
+def process_audio(audio, sample_rate):
+    # Convert audio to the correct format
+    audio_tensor = torch.tensor(audio).float()
+    if audio_tensor.dim() == 2:
+        audio_tensor = audio_tensor.mean(dim=1)
+
+    # Process audio
+    result = server.process_audio(audio_tensor, sample_rate)
+    return result.text

-def speech_to_text(audio):
-    # Convert audio to tensor and process
-    audio_tensor = torch.tensor(audio).to(device).float()
-    with torch.no_grad():
-        tokens = moshi.encode_audio(audio_tensor)
-        text = tokenizer.decode(tokens)
-    return text
-
-def text_to_speech(text):
+def generate_speech(text):
     # Generate speech from text
-    with torch.no_grad():
-        tokens = tokenizer.encode(text)
-        speech = moshi.decode_audio(tokens)
-    return (config.sample_rate, speech.cpu().numpy())
+    result = server.generate_speech(text)
+    return (server.config.sample_rate, result.audio.cpu().numpy())

 # Create Gradio interface
 iface = gr.Interface(
-    fn=[speech_to_text, text_to_speech],
+    fn=[process_audio, generate_speech],
     inputs=[
         gr.Audio(source="microphone", type="numpy", label="Speak"),
         gr.Textbox(label="Enter text for speech synthesis")
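A note on wiring the two functions: in recent Gradio releases, gr.Interface expects a single callable for fn, so exposing both process_audio and generate_speech usually means one interface per function (for example via gr.TabbedInterface) or a gr.Blocks layout. Below is a minimal sketch under that assumption; it reuses process_audio and generate_speech exactly as defined in this commit's app.py (the MoshiServer API is taken from the diff, not verified against the published moshi package) and the Gradio 3.x-style gr.Audio(source=...) argument, with a hypothetical transcribe helper added to unpack the microphone input.

import gradio as gr

# process_audio and generate_speech are the functions defined in app.py above.

def transcribe(audio):
    # gr.Audio(type="numpy") delivers a (sample_rate, samples) tuple,
    # while process_audio(audio, sample_rate) expects the two separately.
    sample_rate, samples = audio
    return process_audio(samples, sample_rate)

stt = gr.Interface(
    fn=transcribe,
    inputs=gr.Audio(source="microphone", type="numpy", label="Speak"),
    outputs=gr.Textbox(label="Transcription"),
)

tts = gr.Interface(
    fn=generate_speech,
    inputs=gr.Textbox(label="Enter text for speech synthesis"),
    outputs=gr.Audio(label="Generated speech"),
)

# Two tabs, one per direction of the speech pipeline
demo = gr.TabbedInterface([stt, tts], ["Speech to text", "Text to speech"])
demo.launch()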