MMAPI-2

Running

App Files Files Community

DeFactOfficial committed on Oct 26

Commit

ae8262b

•

1 Parent(s): 5a69576

Cleanup TTS API and add utterance route for simple one-party TTS

Browse files

Files changed (1) hide show

api.js +84 -52

api.js CHANGED Viewed

@@ -76,7 +76,9 @@ async function runOpenAITTS(text, audioFilename, voiceId, ttsModel='tts-1') {
   await fsp.writeFile(audioFilename, buffer);
 }
-async function generateAudio(speakerName, content) {
   const voiceLookupTable = {
     DEFAULT: 'alloy',
     ALICE: 'shimmer',
@@ -86,14 +88,64 @@ async function generateAudio(speakerName, content) {
     MALE_GUEST: 'onyx',
     FEMALE_GUEST: 'alloy',
   };
-  const actualVoiceId = voiceLookupTable[speakerName] || voiceLookupTable['DEFAULT'];
   const fileName = path.join(MEDIA_FOLDER, `${uuidv4()}.mp3`);
-  await runOpenAITTS(content, fileName, actualVoiceId, 'tts-1-hd');
   return fileName;
 }
 function concatenateAudioFiles(audioFiles, outputFilePath) {
   return new Promise((resolve, reject) => {
     if (audioFiles.length === 1) {
@@ -134,64 +186,44 @@ function concatenateAudioFiles(audioFiles, outputFilePath) {
   });
 }
-// Existing GET endpoint (unchanged)
-app.get('/list-models', (req, res) => {
-  // Placeholder for listing models, replace with actual implementation if needed
-  res.json(['Model 1', 'Model 2', 'Model 3']);
-});
-// Existing GET endpoint (unchanged)
-app.get('/generate/speech', async (req, res) => {
-  try {
-    const apiKey = req.query.api_key || 'their_api_key';
-    if (apiKey !== 'their_api_key') {
-      // Replace "their_api_key" with your actual method of managing API keys
-      res.status(401).send('Unauthorized');
-      return;
-    }
-    const script = req.query.payload;
-    if (!script) {
-      res.status(400).send('Bad Request: Missing payload');
-      return;
-    }
-    const hash = crypto.createHash('sha1');
-    hash.update(script);
-    const scriptHash = hash.digest('hex');
-    if (audioCache[scriptHash]) {
-      const filePath = audioCache[scriptHash];
-      res.sendFile(path.resolve(filePath), { headers: { 'Content-Type': 'audio/mpeg' } });
-      return;
-    }
-    const parsedSegments = parseScript(script);
-    const audioSegments = [];
-    for (const segment of parsedSegments) {
-      const audioPath = await generateAudio(segment.speaker_name, segment.content);
-      audioSegments.push(audioPath);
-    }
-    if (audioSegments.length === 0) {
-      res.status(400).send('No audio generated');
-      return;
-    }
-    // Concatenate audio files into one using FFmpeg
-    const combinedAudioPath = path.join(MEDIA_FOLDER, `combined_${uuidv4()}.mp3`);
-    await concatenateAudioFiles(audioSegments, combinedAudioPath);
-    audioCache[scriptHash] = combinedAudioPath;
-    res.sendFile(path.resolve(combinedAudioPath), { headers: { 'Content-Type': 'audio/mpeg' } });
-  } catch (error) {
-    console.error('Error generating speech:', error);
-    res.status(500).send('Internal Server Error');
-  }
-});
-// New POST endpoint with SSE support
 app.post('/api/generate/speech/stream', async (req, res) => {
   try {
     const apiKey = req.query.api_key || 'their_api_key';

   await fsp.writeFile(audioFilename, buffer);
 }
+//this supports all openai voices with tts-1 and tts-1-hd models
+//voice name can be in openai format or one of the aliases in voiceLookupTable below
+async function generateAudio(speakerName, content, ttsModel="tts-1") {
   const voiceLookupTable = {
     DEFAULT: 'alloy',
     ALICE: 'shimmer',
     MALE_GUEST: 'onyx',
     FEMALE_GUEST: 'alloy',
   };
+  const openaiVoices = ['alloy', 'shimmer', 'echo', 'nova', 'fable', 'onyx']
+  const actualVoiceId = openaiVoices.indexOf(speakerName) > -1 ? speakerName : (voiceLookupTable[speakerName] || voiceLookupTable['DEFAULT']);
   const fileName = path.join(MEDIA_FOLDER, `${uuidv4()}.mp3`);
+  await runOpenAITTS(content, fileName, actualVoiceId, ttsModel);
   return fileName;
 }
+async function generateSpeechFromScript(script="ALICE: Hello, world\n\nBOB: Hello, hamster", res) {
+  try {
+    /* TODO
+    if (apiKey !== 'DEFAULT_API_KEY') {
+      // Replace "DEFAULT_API_KEY" with your actual method of managing API keys
+      res.status(401).send('Unauthorized');
+      return;
+    } */
+    if (!script) {
+      res.status(400).send('Bad Request: Missing payload');
+      return;
+    }
+    const hash = crypto.createHash('sha1');
+    hash.update(script);
+    const scriptHash = hash.digest('hex');
+    if (audioCache[scriptHash]) {
+      const filePath = audioCache[scriptHash];
+      res.sendFile(path.resolve(filePath), { headers: { 'Content-Type': 'audio/mpeg' } });
+      return;
+    }
+    const parsedSegments = parseScript(script);
+    const audioSegments = [];
+    for (const segment of parsedSegments) {
+      const audioPath = await generateAudio(segment.speaker_name, segment.content);
+      audioSegments.push(audioPath);
+    }
+    if (audioSegments.length === 0) {
+      res.status(400).send('No audio generated');
+      return;
+    }
+    // Concatenate audio files into one using FFmpeg
+    const combinedAudioPath = path.join(MEDIA_FOLDER, `combined_${uuidv4()}.mp3`);
+    await concatenateAudioFiles(audioSegments, combinedAudioPath);
+    audioCache[scriptHash] = combinedAudioPath;
+    res.sendFile(path.resolve(combinedAudioPath), { headers: { 'Content-Type': 'audio/mpeg' } });
+  } catch (error) {
+    console.error('Error generating speech:', error);
+    res.status(500).send('Internal Server Error');
+  }
+}
 function concatenateAudioFiles(audioFiles, outputFilePath) {
   return new Promise((resolve, reject) => {
     if (audioFiles.length === 1) {
   });
 }
+// Payload should be film script style: speaker names in all caps and a blank line between them
+// ALICE: Hi Bob, how are you?
+//
+// BOB: Shitty. One of my coworkers put my hamster in the microwave thinking it was his lunch
+// This is for multi-party TTS... For ordinary TTS call api/generate/utterance
+app.get('api/generate/speech', async (req, res) => {
+  const {payload} = req.query
+  await generateSpeechFromScript(payload)
+})
+app.post('api/generate/speech', async (req, res) =>{
+    const {payload} = req.body
+    await generateSpeechFromScript(payload)
+})
+// This is normal TTS: specify voice, text, model. Voices are from openai, use those names or the aliases in lookup table
+app.get('api/generate/utterance', async (req, res) => {
+  const {voice, text, model} = req.query
+  const outputFilename= await generateAudio(voice, text, model || "tts-1")
+  // We want the browser to cache this response, because there's no reason to TTS the same text-voice-model combination more than once
+  res.sendFile(path.resolve(outputFilename), { headers: { 'Content-Type': 'audio/mpeg', 'Cache-Control', 'Max-Age=8640000' } });
+})
+app.post('api/generate/utterance', async (req, res) =>{
+  const {voice, text, model} = req.body
+  const outputFilename= await generateAudio(voice, text, model || "tts-1")
+  // We want the browser to cache this response, because there's no reason to TTS the same text-voice-model combination more than once
+  res.sendFile(path.resolve(outputFilename), { headers: { 'Content-Type': 'audio/mpeg', 'Cache-Control', 'Max-Age=8640000' } });
+})
+// This returns a stream of SSE (application/event-stream) similar to a streaming response from an LLM
+// See example in public/client for how to consume the stream
 app.post('/api/generate/speech/stream', async (req, res) => {
   try {
     const apiKey = req.query.api_key || 'their_api_key';