DeFactOfficial committed on
Commit
ae8262b
1 Parent(s): 5a69576

Cleanup TTS API and add utterance route for simple one-party TTS

Browse files
Files changed (1) hide show
  1. api.js +84 -52
api.js CHANGED
@@ -76,7 +76,9 @@ async function runOpenAITTS(text, audioFilename, voiceId, ttsModel='tts-1') {
76
  await fsp.writeFile(audioFilename, buffer);
77
  }
78
 
79
- async function generateAudio(speakerName, content) {
 
 
80
  const voiceLookupTable = {
81
  DEFAULT: 'alloy',
82
  ALICE: 'shimmer',
@@ -86,14 +88,64 @@ async function generateAudio(speakerName, content) {
86
  MALE_GUEST: 'onyx',
87
  FEMALE_GUEST: 'alloy',
88
  };
89
-
90
- const actualVoiceId = voiceLookupTable[speakerName] || voiceLookupTable['DEFAULT'];
 
91
  const fileName = path.join(MEDIA_FOLDER, `${uuidv4()}.mp3`);
92
 
93
- await runOpenAITTS(content, fileName, actualVoiceId, 'tts-1-hd');
94
  return fileName;
95
  }
96
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
97
  function concatenateAudioFiles(audioFiles, outputFilePath) {
98
  return new Promise((resolve, reject) => {
99
  if (audioFiles.length === 1) {
@@ -134,64 +186,44 @@ function concatenateAudioFiles(audioFiles, outputFilePath) {
134
  });
135
  }
136
 
137
- // Existing GET endpoint (unchanged)
138
- app.get('/list-models', (req, res) => {
139
- // Placeholder for listing models, replace with actual implementation if needed
140
- res.json(['Model 1', 'Model 2', 'Model 3']);
141
- });
 
 
 
 
142
 
143
- // Existing GET endpoint (unchanged)
144
- app.get('/generate/speech', async (req, res) => {
145
- try {
146
- const apiKey = req.query.api_key || 'their_api_key';
147
- if (apiKey !== 'their_api_key') {
148
- // Replace "their_api_key" with your actual method of managing API keys
149
- res.status(401).send('Unauthorized');
150
- return;
151
- }
152
 
153
- const script = req.query.payload;
154
- if (!script) {
155
- res.status(400).send('Bad Request: Missing payload');
156
- return;
157
- }
158
 
159
- const hash = crypto.createHash('sha1');
160
- hash.update(script);
161
- const scriptHash = hash.digest('hex');
162
 
163
- if (audioCache[scriptHash]) {
164
- const filePath = audioCache[scriptHash];
165
- res.sendFile(path.resolve(filePath), { headers: { 'Content-Type': 'audio/mpeg' } });
166
- return;
167
- }
168
 
169
- const parsedSegments = parseScript(script);
170
- const audioSegments = [];
 
171
 
172
- for (const segment of parsedSegments) {
173
- const audioPath = await generateAudio(segment.speaker_name, segment.content);
174
- audioSegments.push(audioPath);
175
- }
176
 
177
- if (audioSegments.length === 0) {
178
- res.status(400).send('No audio generated');
179
- return;
180
- }
181
 
182
- // Concatenate audio files into one using FFmpeg
183
- const combinedAudioPath = path.join(MEDIA_FOLDER, `combined_${uuidv4()}.mp3`);
184
- await concatenateAudioFiles(audioSegments, combinedAudioPath);
185
 
186
- audioCache[scriptHash] = combinedAudioPath;
187
- res.sendFile(path.resolve(combinedAudioPath), { headers: { 'Content-Type': 'audio/mpeg' } });
188
- } catch (error) {
189
- console.error('Error generating speech:', error);
190
- res.status(500).send('Internal Server Error');
191
- }
192
- });
193
 
194
- // New POST endpoint with SSE support
 
195
  app.post('/api/generate/speech/stream', async (req, res) => {
196
  try {
197
  const apiKey = req.query.api_key || 'their_api_key';
 
76
  await fsp.writeFile(audioFilename, buffer);
77
  }
78
 
79
+ //this supports all openai voices with tts-1 and tts-1-hd models
80
+ //voice name can be in openai format or one of the aliases in voiceLookupTable below
81
+ async function generateAudio(speakerName, content, ttsModel="tts-1") {
82
  const voiceLookupTable = {
83
  DEFAULT: 'alloy',
84
  ALICE: 'shimmer',
 
88
  MALE_GUEST: 'onyx',
89
  FEMALE_GUEST: 'alloy',
90
  };
91
+ const openaiVoices = ['alloy', 'shimmer', 'echo', 'nova', 'fable', 'onyx']
92
+
93
+ const actualVoiceId = openaiVoices.indexOf(speakerName) > -1 ? speakerName : (voiceLookupTable[speakerName] || voiceLookupTable['DEFAULT']);
94
  const fileName = path.join(MEDIA_FOLDER, `${uuidv4()}.mp3`);
95
 
96
+ await runOpenAITTS(content, fileName, actualVoiceId, ttsModel);
97
  return fileName;
98
  }
99
 
100
/**
 * Renders a multi-speaker script to one MP3 file and sends it on `res`.
 *
 * The script is split into segments by parseScript(); each segment is voiced
 * with generateAudio(segment.speaker_name, segment.content) and the resulting
 * files are joined by concatenateAudioFiles(). The combined file path is
 * remembered in audioCache under the SHA-1 of the script text, so a repeated
 * script is served from the cached file without re-running TTS.
 *
 * Every outcome is written to `res` by this function:
 *   - 400 when the script is empty, not a string, or produces no audio
 *   - 500 when anything throws while generating
 *   - otherwise the MP3 itself with Content-Type audio/mpeg
 *
 * @param {string} [script] Script text. When the argument is `undefined` the
 *   built-in two-line demo script is used instead.
 * @param {object} res Response object; must provide status(), send() and
 *   sendFile() (presumably an Express response — confirm against callers).
 * @returns {Promise<void>}
 * @throws {TypeError} If `res` is not supplied. This is a programming error in
 *   the caller and is raised before any TTS work is started.
 */
async function generateSpeechFromScript(script="ALICE: Hello, world\n\nBOB: Hello, hamster", res) {
  // Without a response object nothing below can report its result, so fail
  // fast instead of generating audio that can never be delivered.
  if (!res) {
    throw new TypeError('generateSpeechFromScript requires a response object');
  }

  try {
    // TODO: API-key validation is not implemented yet. When it is, reply
    // 401 'Unauthorized' here before doing any work.

    if (!script) {
      res.status(400).send('Bad Request: Missing payload');
      return;
    }

    // Repeated query parameters or JSON bodies can deliver an array/object.
    // hash.update() would throw on those and surface as a 500; it is really
    // a client error.
    if (typeof script !== 'string') {
      res.status(400).send('Bad Request: payload must be a string');
      return;
    }

    const scriptHash = crypto.createHash('sha1').update(script).digest('hex');

    const cachedPath = audioCache[scriptHash];
    if (cachedPath) {
      res.sendFile(path.resolve(cachedPath), { headers: { 'Content-Type': 'audio/mpeg' } });
      return;
    }

    // Segments are voiced one at a time, in script order, so the list of
    // files handed to the concatenation step is already correctly ordered.
    const audioSegments = [];
    for (const segment of parseScript(script)) {
      audioSegments.push(await generateAudio(segment.speaker_name, segment.content));
    }

    if (audioSegments.length === 0) {
      res.status(400).send('No audio generated');
      return;
    }

    // Concatenate audio files into one using FFmpeg
    const combinedAudioPath = path.join(MEDIA_FOLDER, `combined_${uuidv4()}.mp3`);
    await concatenateAudioFiles(audioSegments, combinedAudioPath);

    audioCache[scriptHash] = combinedAudioPath;
    res.sendFile(path.resolve(combinedAudioPath), { headers: { 'Content-Type': 'audio/mpeg' } });
  } catch (error) {
    console.error('Error generating speech:', error);
    // If part of a response has already gone out, a second status/body write
    // would itself throw; only report the 500 when that is still possible.
    if (!res.headersSent) {
      res.status(500).send('Internal Server Error');
    }
  }
}
148
+
149
  function concatenateAudioFiles(audioFiles, outputFilePath) {
150
  return new Promise((resolve, reject) => {
151
  if (audioFiles.length === 1) {
 
186
  });
187
  }
188
 
189
+ // Payload should be film script style: speakernames in all caps and a blank line between them
190
+ // ALICE: Hi bob,how are you?
191
+ //
192
+ // BOB: Shitty. One of my coworkers put my hamster in the microwave thinking it was his lunch
193
+ // This is for multi-party TTS... For ordinary TTS call api/generate/utterance
194
// GET variant of multi-party TTS: the script is read from the `payload`
// query parameter. The path needs its leading slash to be matched, and `res`
// is handed through because generateSpeechFromScript writes the response
// (audio file or error status) itself.
app.get('/api/generate/speech', async (req, res) => {
  const { payload } = req.query;
  await generateSpeechFromScript(payload, res);
});
198
 
199
// POST variant of multi-party TTS: the script is read from `payload` in the
// request body. The path needs its leading slash to be matched, and `res` is
// handed through because generateSpeechFromScript writes the response itself.
app.post('/api/generate/speech', async (req, res) => {
  // req.body is undefined when no body parser has populated it; fall back to
  // an empty object so destructuring cannot throw inside this async handler.
  const { payload } = req.body || {};
  await generateSpeechFromScript(payload, res);
});
 
 
 
 
 
203
 
 
 
 
 
 
204
 
 
 
 
205
 
206
+ // This is normal TTS: specify voice, text, model. Voices are from openai, use those names or the aliases in lookup table
207
// GET variant of single-voice TTS. Query parameters:
//   voice - passed to generateAudio as the speaker/voice name
//   text  - the text to speak (required)
//   model - TTS model name, defaults to "tts-1"
app.get('/api/generate/utterance', async (req, res) => {
  try {
    const { voice, text, model } = req.query;
    if (!text) {
      res.status(400).send('Bad Request: Missing text');
      return;
    }

    const outputFilename = await generateAudio(voice, text, model || 'tts-1');

    // We want the browser to cache this response, because there's no reason to
    // TTS the same text-voice-model combination more than once.
    // 8640000 seconds is 100 days.
    res.sendFile(path.resolve(outputFilename), {
      headers: {
        'Content-Type': 'audio/mpeg',
        'Cache-Control': 'max-age=8640000',
      },
    });
  } catch (error) {
    // A rejected promise from an async handler would otherwise leave the
    // request without any response.
    console.error('Error generating utterance:', error);
    if (!res.headersSent) {
      res.status(500).send('Internal Server Error');
    }
  }
});
214
 
215
// POST variant of single-voice TTS. Body fields:
//   voice - passed to generateAudio as the speaker/voice name
//   text  - the text to speak (required)
//   model - TTS model name, defaults to "tts-1"
app.post('/api/generate/utterance', async (req, res) => {
  try {
    // req.body is undefined when no body parser has populated it.
    const { voice, text, model } = req.body || {};
    if (!text) {
      res.status(400).send('Bad Request: Missing text');
      return;
    }

    const outputFilename = await generateAudio(voice, text, model || 'tts-1');

    // Same caching hint as the GET variant (8640000 seconds is 100 days).
    // NOTE(review): clients generally do not reuse cached POST responses, so
    // this header mostly matters for the GET route — confirm it is wanted here.
    res.sendFile(path.resolve(outputFilename), {
      headers: {
        'Content-Type': 'audio/mpeg',
        'Cache-Control': 'max-age=8640000',
      },
    });
  } catch (error) {
    // A rejected promise from an async handler would otherwise leave the
    // request without any response.
    console.error('Error generating utterance:', error);
    if (!res.headersSent) {
      res.status(500).send('Internal Server Error');
    }
  }
});
 
222
 
 
 
 
223
 
 
 
 
 
 
 
 
224
 
225
+ // This returns a stream of SSE (text/event-stream) similar to a streaming response from an LLM
226
+ // See example in public/client for how to consume the stream
227
  app.post('/api/generate/speech/stream', async (req, res) => {
228
  try {
229
  const apiKey = req.query.api_key || 'their_api_key';