File size: 8,076 Bytes
eee16bc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
72d3175
eee16bc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ae8262b
 
 
eee16bc
 
 
 
 
 
 
 
 
ae8262b
 
 
eee16bc
 
ae8262b
eee16bc
 
 
ae8262b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eee16bc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ae8262b
 
 
 
 
 
 
477624d
ae8262b
eee16bc
ae8262b
 
477624d
ae8262b
eee16bc
 
f84190a
 
 
996c544
ae8262b
996c544
2544d8e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a170f48
eee16bc
 
 
 
a170f48
eee16bc
 
 
 
 
 
a6f672d
0bb8d5a
eee16bc
e189791
eee16bc
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
// Load .env first so that every module required below can already see the
// configured environment variables while it is being loaded.
// NOTE(review): whether ./image.js reads process.env at load time is not
// visible from this file; loading dotenv first is safe either way.
require('dotenv').config();

const express = require('express');
const fs = require('fs');
const fsp = fs.promises;
const path = require('path');
const crypto = require('crypto');
const { spawn } = require('child_process');
const fetch = require('node-fetch');
const { v4: uuidv4 } = require('uuid');
const cors = require('cors');
const { generateImage } = require('./image.js');

const app = express();
app.use(express.json()); // To parse JSON payloads
app.use(cors()); // Enable CORS for all routes

// Folder (relative to the working directory) that receives generated audio files.
const MEDIA_FOLDER = 'public/media';
// Key sent as the Bearer token on OpenAI text-to-speech requests; read once at startup.
const OPENAI_API_KEY = process.env.OPENAI_API_KEY;

// Make sure `dir` exists as a directory, creating missing parent folders too.
//
// dir: path of the directory to create.
// Resolves with no value once the directory exists.
// Rejects with the underlying fs error when the directory cannot be created,
// including the case where `dir` already exists but is not a directory.
async function ensureDir(dir) {
  try {
    await fsp.mkdir(dir, { recursive: true });
  } catch (err) {
    if (err.code !== 'EEXIST') throw err;
    // With `recursive: true` an existing directory is not an error, so EEXIST
    // normally means something else occupies the path. Only tolerate it when
    // the path really is a directory.
    const stats = await fsp.stat(dir);
    if (!stats.isDirectory()) throw err;
  }
}

// Create the media folder at startup. The promise gets an explicit rejection
// handler so a failure is reported with context and the process stops, instead
// of surfacing as an anonymous unhandled rejection.
ensureDir(MEDIA_FOLDER).catch((err) => {
  console.error(`Unable to create media folder "${MEDIA_FOLDER}":`, err);
  process.exit(1);
});

// In-memory cache of finished audio, keyed by the SHA-1 hex digest of the script text.
const audioCache = {}; // { [scriptHash]: audioFilePath }

// Split a film-script style text into speaker segments.
//
// Segments are separated by one or more blank lines; inside a segment the text
// before the first ": " is the speaker name and everything after it is the
// spoken content (later ": " occurrences stay in the content).
//
// script: string such as "ALICE: Hi\n\nBOB: Hello".
// Returns an array of { speaker_name, content } objects in script order.
// A segment without ": " yields { speaker_name: <segment>, content: '' }.
// A script containing only whitespace yields an empty array.
function parseScript(script) {
  // Normalise Windows (\r\n) and old Mac (\r) line endings so that blank-line
  // detection works regardless of where the script was written.
  const normalized = script.replace(/\r\n?/g, '\n').trim();
  const parsedSegments = [];

  // A separator is a newline, optional whitespace (including further blank
  // lines), and another newline.
  for (const rawSegment of normalized.split(/\n\s*\n/)) {
    const segment = rawSegment.trim();
    if (segment === '') {
      continue; // nothing to speak
    }

    const separatorIndex = segment.indexOf(': ');
    if (separatorIndex === -1) {
      parsedSegments.push({ speaker_name: segment, content: '' });
      continue;
    }

    parsedSegments.push({
      speaker_name: segment.slice(0, separatorIndex).trim(),
      content: segment.slice(separatorIndex + 2),
    });
  }

  return parsedSegments;
}

// Request speech audio for `text` from the OpenAI speech endpoint and write
// the returned bytes to `audioFilename`.
//
// text: the text to be spoken.
// audioFilename: path of the file that receives the response body.
// voiceId: value sent as `voice` in the request.
// ttsModel: value sent as `model` in the request (defaults to 'tts-1').
// Throws when OPENAI_API_KEY is empty or when the HTTP response is not ok;
// in the latter case the response text is included in the error message.
async function runOpenAITTS(text, audioFilename, voiceId, ttsModel='tts-1') {
  if (!OPENAI_API_KEY) {
    throw new Error('OPENAI_API_KEY is not set.');
  }

  const requestBody = JSON.stringify({ model: ttsModel, voice: voiceId, input: text });
  const requestOptions = {
    method: 'POST',
    headers: {
      Authorization: `Bearer ${OPENAI_API_KEY}`,
      'Content-Type': 'application/json',
    },
    body: requestBody,
  };

  const response = await fetch('https://api.openai.com/v1/audio/speech', requestOptions);

  if (response.ok) {
    // The successful response body is the audio itself; persist it as-is.
    const audioBytes = Buffer.from(await response.arrayBuffer());
    await fsp.writeFile(audioFilename, audioBytes);
    return;
  }

  throw new Error(`OpenAI TTS request failed: ${await response.text()}`);
}

// Generate one speech clip for a single speaker and return the path of the
// mp3 file that was written into MEDIA_FOLDER.
//
// This supports all OpenAI voices listed below with the tts-1 and tts-1-hd models.
//
// speakerName: an OpenAI voice name ('alloy', 'nova', ...) or one of the
//   aliases in voiceLookupTable ('ALICE', 'BOB', ...). Matching ignores case
//   and surrounding whitespace; anything unrecognised uses the DEFAULT voice.
// content: the text to speak.
// ttsModel: model name forwarded to runOpenAITTS (defaults to "tts-1").
// Rejects with whatever runOpenAITTS rejects with.
async function generateAudio(speakerName, content, ttsModel="tts-1") {
  const voiceLookupTable = {
    DEFAULT: 'alloy',
    ALICE: 'shimmer',
    BOB: 'echo',
    JENNIFER: 'nova',
    PROFESSOR: 'fable',
    MALE_GUEST: 'onyx',
    FEMALE_GUEST: 'alloy',
  };
  const openaiVoices = ['alloy', 'shimmer', 'echo', 'nova', 'fable', 'onyx'];

  const requestedName = typeof speakerName === 'string' ? speakerName.trim() : '';
  const voiceCandidate = requestedName.toLowerCase();
  const aliasCandidate = requestedName.toUpperCase();

  let actualVoiceId = voiceLookupTable.DEFAULT;
  if (openaiVoices.includes(voiceCandidate)) {
    actualVoiceId = voiceCandidate;
  } else if (Object.prototype.hasOwnProperty.call(voiceLookupTable, aliasCandidate)) {
    // Own-property check: a speaker called e.g. "constructor" must not pick up
    // an inherited Object.prototype member instead of a voice name.
    actualVoiceId = voiceLookupTable[aliasCandidate];
  }

  const fileName = path.join(MEDIA_FOLDER, `${uuidv4()}.mp3`);

  await runOpenAITTS(content, fileName, actualVoiceId, ttsModel);
  return fileName;
}

// Turn a multi-speaker script into a single mp3 and send it on `res`.
//
// script: film-script style text (see parseScript); a demo script is used when
//   the argument is undefined.
// res: Express response object; this function always answers on it:
//   400 when the payload is missing, blank or not a string, or when no audio
//       segment was produced,
//   500 when generation or concatenation fails,
//   otherwise the combined audio file with Content-Type audio/mpeg.
//
// Results are cached in audioCache by the SHA-1 of the script text, so a
// repeated script is served from the existing file without calling the TTS API.
async function generateSpeechFromScript(script="ALICE: Hello, world\n\nBOB: Hello, hamster", res) {
  // Per-segment files written for this request. They are only inputs for the
  // combined file and are removed in `finally`, on success and on failure.
  const audioSegments = [];

  try {
    // TODO: authenticate callers (API key check) before generating audio.

    if (!script) {
      res.status(400).send('Bad Request: Missing payload');
      return;
    }

    // e.g. ?payload=a&payload=b arrives as an array; reject it up front instead
    // of letting the hash step throw and answer 500.
    if (typeof script !== 'string') {
      res.status(400).send('Bad Request: payload must be a string');
      return;
    }

    if (script.trim() === '') {
      res.status(400).send('Bad Request: Missing payload');
      return;
    }

    const scriptHash = crypto.createHash('sha1').update(script).digest('hex');

    if (audioCache[scriptHash]) {
      const filePath = audioCache[scriptHash];
      res.sendFile(path.resolve(filePath), { headers: { 'Content-Type': 'audio/mpeg' } });
      return;
    }

    const parsedSegments = parseScript(script);

    // Segments are generated one at a time, in script order.
    for (const segment of parsedSegments) {
      const audioPath = await generateAudio(segment.speaker_name, segment.content);
      audioSegments.push(audioPath);
    }

    if (audioSegments.length === 0) {
      res.status(400).send('No audio generated');
      return;
    }

    // Concatenate audio files into one using FFmpeg
    const combinedAudioPath = path.join(MEDIA_FOLDER, `combined_${uuidv4()}.mp3`);
    await concatenateAudioFiles(audioSegments, combinedAudioPath);

    audioCache[scriptHash] = combinedAudioPath;
    res.sendFile(path.resolve(combinedAudioPath), { headers: { 'Content-Type': 'audio/mpeg' } });
  } catch (error) {
    console.error('Error generating speech:', error);
    res.status(500).send('Internal Server Error');
  } finally {
    // Best-effort cleanup: a file that cannot be removed is logged, not fatal.
    await Promise.all(
      audioSegments.map((segmentPath) =>
        fsp.unlink(segmentPath).catch((cleanupError) => {
          console.error(`Could not remove temporary audio file ${segmentPath}:`, cleanupError);
        })
      )
    );
  }
}

// Join several audio files into `outputFilePath`.
//
// audioFiles: non-empty array of input file paths, in playback order.
// outputFilePath: path of the file to create.
// Returns a promise that resolves with no value once the output is written.
// Rejects when the list is empty, when the single-file copy fails, when
// FFmpeg cannot be started, or when FFmpeg exits with a non-zero code.
function concatenateAudioFiles(audioFiles, outputFilePath) {
  if (!Array.isArray(audioFiles) || audioFiles.length === 0) {
    return Promise.reject(new Error('No audio files to concatenate'));
  }

  if (audioFiles.length === 1) {
    // If only one audio file, copy it directly (without blocking the event loop).
    return fs.promises.copyFile(audioFiles[0], outputFilePath);
  }

  return new Promise((resolve, reject) => {
    const listContent = audioFiles.join('|');

    // Run FFmpeg with the concat protocol
    // ffmpeg -i "concat:file1.mp3|file2.mp3" -acodec copy output.mp3
    const ffmpeg = spawn('ffmpeg', [
      '-i',
      `concat:${listContent}`,
      '-acodec',
      'copy',
      outputFilePath,
    ]);

    ffmpeg.stdout.on('data', (data) => {
      console.log(`stdout: ${data}`);
    });

    ffmpeg.stderr.on('data', (data) => {
      console.error(`stderr: ${data}`);
    });

    // Emitted when the process could not be spawned (e.g. ffmpeg is not
    // installed). Without a listener this event is thrown as an uncaught
    // exception and the promise would never settle.
    ffmpeg.on('error', (err) => {
      reject(new Error(`Failed to start FFmpeg: ${err.message}`));
    });

    ffmpeg.on('close', (code) => {
      if (code === 0) {
        resolve();
      } else {
        reject(new Error(`FFmpeg failed with exit code ${code}`));
      }
    });
  });
}

// Multi-party TTS endpoints (GET and POST /api/generate/speech).
//
// `payload` should be film-script style: speaker names in all caps, and a
// blank line between speakers, for example:
//
//   ALICE: Hi Bob, how are you?
//
//   BOB: Not great. A coworker mistook my lunch for his.
//
// GET reads `payload` from the query string, POST from the JSON body.
// This is for multi-party TTS... For ordinary TTS call api/generate/utterance
//
// The route paths must start with "/": Express matches them against the
// request path, which always begins with a slash, so 'api/...' never matches.
app.get('/api/generate/speech', async (req, res) => {
  const { payload } = req.query;
  await generateSpeechFromScript(payload, res);
});

app.post('/api/generate/speech', async (req, res) => {
  // req.body can be missing when the request carries no parsed body.
  const { payload } = req.body || {};
  await generateSpeechFromScript(payload, res);
});


// Fixed JSON reply on GET /api/hello.
app.get('/api/hello', (_request, response) => {
  const greeting = { hello: 'goodbye' };

  response.setHeader('Content-Type', 'application/json');
  response.json(greeting);
  response.end();
});
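// Normal (single-voice) TTS takes a voice, the text, and a model. Voices are the OpenAI voice names or the aliases in the voice lookup table.
// NOTE(review): no handler for that endpoint (api/generate/utterance) is defined in this part of the file.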

//Image generation parameters
//response_format: image | url
//prompt: the text prompt for generating the image. Length is not capped, however, most models only see the first 50 words (70 tokens)
//model: you can use any model that's on huggingface and has serverless inference enabled. Specify in vendor/modelId format, ex: stabilityai/stable-diffusion-3-medium-diffusers
//width: in pixels, optional, defaults to 1024
//height: in pixels, optional, defaults to 1024
//please note: individual models will have different limits for width and height... some older models cap out at 512 or 768. If your image gen requests are failing, try lowering width and height

//The GET version defaults to an image response format (returns the binary content of the image directly)
//This enables the creation of HTML, markdown, and other types of hypertext documents with "self-generating images" that call this API declaratively... no code needed

//For example:
//<h1>Welcome To Bob's Pet Store</h1>
//<img class="hero" src="https://defactofficial-mmapi-2.hf.space/api/generate/image?prompt=A%20large%20hamster&model=stabilityai/stable-diffusion-3-medium-diffusers" />
//
//In addition to saving you lots of time and money when used as a substitute for stock photography, it also means your text-only LLMs just got multimodal superpowers
//Check this:

// Run generateImage and make sure the request is always answered: a rejection
// inside an async Express handler is otherwise unhandled and the client hangs.
const respondWithImage = async (params, res, responseFormat) => {
  try {
    await generateImage(params, res, responseFormat);
  } catch (error) {
    console.error('Error generating image:', error);
    // generateImage may already have started the response before failing.
    if (!res.headersSent) {
      res.status(500).send('Internal Server Error');
    }
  }
};

// GET: parameters come from the query string; response_format defaults to 'image'.
app.get('/api/generate/image', async (req, res) => {
  const responseFormat = req.query.response_format || 'image';
  await respondWithImage(req.query, res, responseFormat);
});

// POST: parameters come from the JSON body; response_format defaults to 'url'.
// The query string keeps precedence for response_format, and the body value is
// honoured as well since that is where the other POST parameters are sent.
app.post('/api/generate/image', async (req, res) => {
  const body = req.body || {};
  const responseFormat = req.query.response_format || body.response_format || 'url';
  await respondWithImage(body, res, responseFormat);
});



// Serve everything under ./public (client webpages and generated media) from
// the site root, then start accepting connections.
const port = 7860;
const publicAssets = express.static("public");

app.use('/', publicAssets);

function announceListening() {
  console.log(`Listening on port ${port}`);
}

app.listen(port, announceListening);