Spaces:

sagar007
/

FRIDAY

Sleeping

sagar007 committed on Sep 23

Commit

8067820

•

1 Parent(s): 9f43cd0

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -1,37 +1,29 @@
 import gradio as gr
 import torch
-from moshi.model import Moshi
-from moshi.tokenizer import Tokenizer
-from moshi.configuration import MoshiConfig
-# Initialize Moshi
-config = MoshiConfig()
-tokenizer = Tokenizer(config)
-moshi = Moshi(config)
-moshi.load_pretrained("kyutai/moshika-pytorch-bf16")
-# Move to GPU if available
-device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-moshi.to(device)
-def speech_to_text(audio):
-    # Convert audio to tensor and process
-    audio_tensor = torch.tensor(audio).to(device).float()
-    with torch.no_grad():
-        tokens = moshi.encode_audio(audio_tensor)
-        text = tokenizer.decode(tokens)
-    return text
-def text_to_speech(text):
     # Generate speech from text
-    with torch.no_grad():
-        tokens = tokenizer.encode(text)
-        speech = moshi.decode_audio(tokens)
-    return (config.sample_rate, speech.cpu().numpy())
 # Create Gradio interface
 iface = gr.Interface(
-    fn=[speech_to_text, text_to_speech],
     inputs=[
         gr.Audio(source="microphone", type="numpy", label="Speak"),
         gr.Textbox(label="Enter text for speech synthesis")

 import gradio as gr
 import torch
+from moshi import MoshiServer
+# Initialize MoshiServer
+server = MoshiServer()
+server.load_model("kyutai/moshika-pytorch-bf16")
+def process_audio(audio, sample_rate):
+    # Convert audio to the correct format
+    audio_tensor = torch.tensor(audio).float()
+    if audio_tensor.dim() == 2:
+        audio_tensor = audio_tensor.mean(dim=1)
+    # Process audio
+    result = server.process_audio(audio_tensor, sample_rate)
+    return result.text
+def generate_speech(text):
     # Generate speech from text
+    result = server.generate_speech(text)
+    return (server.config.sample_rate, result.audio.cpu().numpy())
 # Create Gradio interface
 iface = gr.Interface(
+    fn=[process_audio, generate_speech],
     inputs=[
         gr.Audio(source="microphone", type="numpy", label="Speak"),
         gr.Textbox(label="Enter text for speech synthesis")