KingNish committed on
Commit
37ed712
1 Parent(s): df40d37

Update script1.js

Files changed (1)
  1. script1.js +344 -296
script1.js CHANGED
@@ -1,108 +1,352 @@
  const startStopButton = document.getElementById('startStopButton');
  const voiceSelectionDropdown = document.getElementById('voiceSelect');
  const modelSelectionDropdown = document.getElementById('modelSelect');
- const noiseSuppressionCheckbox = document.getElementById('noiseSuppression');
  const responseTimeDisplay = document.getElementById('responseTime');
  const userActivityIndicator = document.getElementById('userIndicator');
  const aiActivityIndicator = document.getElementById('aiIndicator');
  const transcriptDiv = document.getElementById('transcript');

  let speechRecognizer;
  let activeQuery = null;
  let queryStartTime = 0;
- let completeTranscript = '';
  let isRequestInProgress = false;
  let isUserSpeaking = false;
- let isSpeechRecognitionActive = false;
- let userManuallyStoppedRecognizer = false;
  let requestAbortController = null;
- let partialTranscript = '';
- let lastUserSpeechTimestamp = null;
- let prefetchTextQuery = "";
  let firstResponseTextTimestamp = null;

- // Configuration
- const USER_SPEECH_INTERRUPT_DELAY = 500;
- const TEXT_TO_SPEECH_API_ENDPOINT = "https://api.streamelements.com/kappa/v2/speech";
- const CHUNK_SIZE = 300;
-
  // Audio Management
  let currentAudio = null;
  let audioPlaybackQueue = [];
- let prefetchQueue = [];

- // Enhanced Prefetching and Caching
- const prefetchCache = new Map();
  const pendingPrefetchRequests = new Map();
- const MAX_PREFETCH_REQUESTS = 10;
- const prefetchCacheExpiration = 60000; // 1 minute

- // Global Conversation History
  let conversationHistory = [];

  // Audio Caching
- const audioCache = new Map();
- const audioCacheExpiration = 3600000; // 1 hour

- // Normalize query text
  const normalizeQueryText = query => query.trim().toLowerCase().replace(/[^\w\s]/g, '');

  // Generate a cache key
- const generateCacheKey = (normalizedQuery, voice, history, modelName) =>
- `${normalizedQuery}-${voice}-${JSON.stringify(history)}-${modelName}`;

  // Prefetch and cache the first TTS audio chunk
  const prefetchFirstAudioChunk = (query, voice) => {
- const normalizedQuery = normalizeQueryText(query);
- const cacheKey = generateCacheKey(normalizedQuery, voice, conversationHistory, modelSelectionDropdown.value);

- if (pendingPrefetchRequests.has(cacheKey) || prefetchCache.has(cacheKey)) return;

- prefetchQueue.push({ query:query.trim(), voice, cacheKey });
- processPrefetchQueue();
  };

  // Process the prefetch queue
  const processPrefetchQueue = async () => {
- while (prefetchQueue.length > 0 && pendingPrefetchRequests.size < MAX_PREFETCH_REQUESTS) {
- const { query, voice, cacheKey } = prefetchQueue.shift();
- const abortController = new AbortController();
- pendingPrefetchRequests.set(cacheKey, abortController);

- const userSambanovaKey = document.getElementById('apiKey').value.trim() !== '' ? document.getElementById('apiKey').value : 'none';

- const url = '/stream_text';
- const requestBody = {
- query: query,
- history: JSON.stringify(conversationHistory),
- model: modelSelectionDropdown.value,
- api_key: userSambanovaKey
- };

- try {
- const response = await fetch(url, {
- method: 'POST',
- headers: {
- 'Accept': 'text/event-stream',
- 'Content-Type': 'application/json'
- },
- body: JSON.stringify(requestBody),
- signal: abortController.signal
- });

- if (!response.ok) throw new Error('Network response was not ok');

- const firstAudioUrl = await handleStreamingResponseForPrefetch(response.body, voice, abortController.signal);

- if (firstAudioUrl) prefetchCache.set(cacheKey, { url: firstAudioUrl, timestamp: Date.now() });

- } catch (error) {
- if (error.name !== 'AbortError') console.error("Error prefetching audio:", error);
- } finally {
- pendingPrefetchRequests.delete(cacheKey);
- processPrefetchQueue();
  }
  }
  };

  // Handle the streaming response for prefetching
@@ -126,8 +370,7 @@ const handleStreamingResponseForPrefetch = async (responseStream, voice, abortSi
  if (line.startsWith('data: ')) {
  const textContent = line.substring(6).trim();
  if (textContent) {
- const audioUrl = await generateTextToSpeechAudio(textContent, voice);
- return audioUrl;
  }
  }
  }
@@ -143,127 +386,6 @@ const handleStreamingResponseForPrefetch = async (responseStream, voice, abortSi
  return null;
  };

- // Play audio from the queue
- const playNextAudio = async () => {
- if (audioPlaybackQueue.length > 0) {
- const audioData = audioPlaybackQueue.shift();
- const audio = new Audio(audioData.url);
- updateActivityIndicators();
-
- const audioPromise = new Promise(resolve => {
- audio.onended = resolve;
- audio.onerror = resolve;
- });
- if (currentAudio) {
- currentAudio.pause();
- currentAudio.currentTime = 0;
- }
-
- currentAudio = audio;
- await audio.play();
- await audioPromise;
- playNextAudio();
- } else {
- updateActivityIndicators();
- }
- };
-
- // Generate Text-to-Speech audio with caching
- const generateTextToSpeechAudio = async (text, voice) => {
- const normalizedText = normalizeQueryText(text);
- const cacheKey = `${normalizedText}-${voice}`;
-
- if (audioCache.has(cacheKey)) {
- const cachedData = audioCache.get(cacheKey);
- if (Date.now() - cachedData.timestamp < audioCacheExpiration) {
- return cachedData.url;
- } else {
- audioCache.delete(cacheKey);
- }
- }
-
- try {
- const response = await fetch(`${TEXT_TO_SPEECH_API_ENDPOINT}?voice=${voice}&text=${encodeURIComponent(text)}`, { method: 'GET' });
- if (!response.ok) throw new Error('Network response was not ok');
- const audioBlob = await response.blob();
- const audioUrl = URL.createObjectURL(audioBlob);
-
- audioCache.set(cacheKey, { url: audioUrl, timestamp: Date.now() });
- return audioUrl;
- } catch (error) {
- console.error("Error generating TTS audio:", error);
- return null;
- }
- };
-
- // Send a query to the AI
- const sendQueryToAI = async (query) => {
- console.log("Sending query to AI:", query);
- isRequestInProgress = true;
- updateActivityIndicators();
- firstResponseTextTimestamp = null;
-
- const normalizedQuery = normalizeQueryText(query);
- const cacheKey = generateCacheKey(normalizedQuery, modelSelectionDropdown.value, conversationHistory, modelSelectionDropdown.value);
-
- queryStartTime = Date.now();
-
- if (prefetchCache.has(cacheKey)) {
- const cachedData = prefetchCache.get(cacheKey);
- if (Date.now() - cachedData.timestamp < prefetchCacheExpiration) {
- const prefetchedAudioUrl = cachedData.url;
- audioPlaybackQueue.push({ url: prefetchedAudioUrl, isPrefetched: true });
- playNextAudio();
- } else {
- prefetchCache.delete(cacheKey);
- }
- }
-
- requestAbortController = new AbortController();
-
- const userSambanovaKey = document.getElementById('apiKey').value.trim() !== '' ? document.getElementById('apiKey').value : 'none';
-
- const url = '/stream_text';
- const requestBody = {
- query: query,
- history: JSON.stringify(conversationHistory),
- model: modelSelectionDropdown.value,
- api_key: userSambanovaKey
- };
-
- try {
- const response = await fetch(url, {
- method: 'POST',
- headers: {
- 'Accept': 'text/event-stream',
- 'Content-Type': 'application/json'
- },
- body: JSON.stringify(requestBody),
- signal: requestAbortController.signal
- });
-
- if (!response.ok) {
- if (response.status === 429) {
- console.log("Rate limit hit, retrying in 1 second...");
- await new Promise(resolve => setTimeout(resolve, 1000));
- await sendQueryToAI(query);
- return;
- }
- throw new Error(`Network response was not ok: ${response.status}`);
- }
-
- console.log("Streaming audio response received");
- await handleStreamingResponse(response.body, voiceSelectionDropdown.value, requestAbortController.signal);
- } catch (error) {
- if (error.name !== 'AbortError') {
- console.error("Error sending query to AI:", error);
- }
- } finally {
- isRequestInProgress = false;
- updateActivityIndicators();
- }
- };
-
  // Handle the streaming audio response
  const handleStreamingResponse = async (responseStream, voice, abortSignal) => {
  const reader = responseStream.getReader();
@@ -273,7 +395,7 @@ const handleStreamingResponse = async (responseStream, voice, abortSignal) => {
  let fullResponseText = "";
  let fullResponseText2 = "";
  let textChunk = "";
- let sentText = "";

  try {
  while (true) {
@@ -283,7 +405,7 @@ const handleStreamingResponse = async (responseStream, voice, abortSignal) => {

  if (isUserSpeaking) {
  interruptAudioPlayback('user is speaking');
- break;
  }

  const chunk = decoder.decode(value, { stream: true });
@@ -300,7 +422,7 @@ const handleStreamingResponse = async (responseStream, voice, abortSignal) => {
  fullResponseText += textContent + " ";
  fullResponseText2 += textContent + " ";
  textChunk += textContent + " ";
- transcriptDiv.textContent = fullResponseText2;

  if (initialChunksSent < 2) {
  const audioUrl = await generateTextToSpeechAudio(textContent, voice);
@@ -308,7 +430,7 @@ const handleStreamingResponse = async (responseStream, voice, abortSignal) => {
  audioPlaybackQueue.push({ url: audioUrl, isPrefetched: false });
  if (!currentAudio) playNextAudio();
  }
- sentText += textContent + " ";
  initialChunksSent++;
  } else {
  let unsentTextChunk = textChunk.replace(sentText, '').trim();
@@ -319,12 +441,12 @@ const handleStreamingResponse = async (responseStream, voice, abortSignal) => {
  audioPlaybackQueue.push({ url: audioUrl, isPrefetched: false });
  if (!currentAudio) playNextAudio();
  }
- textChunk = "";
  }
  }

  if (fullResponseText !== '') {
- fullResponseText = '';
  }
  }
  }
@@ -356,30 +478,37 @@ const handleStreamingResponse = async (responseStream, voice, abortSignal) => {
  }
  };

- // Update activity indicators
- const updateActivityIndicators = (state = null) => {
- userActivityIndicator.textContent = isUserSpeaking ? "User: Speaking" : "User: Idle";
- userActivityIndicator.className = isUserSpeaking
- ? "indicator rounded-full px-4 py-2 text-white flex items-center transition-colors duration-300 bg-gradient-to-r from-blue-400 to-blue-600 hover:bg-gradient-to-r from-blue-500 to-blue-700"
- : "indicator rounded-full px-4 py-2 text-white flex items-center transition-colors duration-300 bg-gradient-to-r from-gray-300 to-gray-400 dark:from-gray-700 dark:to-gray-800 hover:bg-gradient-to-r from-gray-400 to-gray-500"; // Tailwind classes
-
- if (isRequestInProgress && !currentAudio) {
- aiActivityIndicator.textContent = "AI: Processing...";
- aiActivityIndicator.className = "indicator rounded-full px-4 py-2 text-white flex items-center transition-colors duration-300 bg-gradient-to-r from-purple-400 to-purple-600 hover:bg-gradient-to-r from-purple-500 to-purple-700"; // Tailwind class for thinking
- } else if (currentAudio && !isUserSpeaking) {
- aiActivityIndicator.textContent = state || "AI: Speaking";
- aiActivityIndicator.className = "indicator rounded-full px-4 py-2 text-white flex items-center transition-colors duration-300 bg-gradient-to-r from-green-400 to-green-600 hover:bg-gradient-to-r from-green-500 to-green-700"; // Tailwind class for speaking
- } else if (isUserSpeaking) {
- aiActivityIndicator.textContent = "AI: Listening";
- aiActivityIndicator.className = "indicator rounded-full px-4 py-2 text-white flex items-center transition-colors duration-300 bg-gradient-to-r from-yellow-400 to-yellow-600 hover:bg-gradient-to-r from-yellow-500 to-yellow-700"; // Tailwind class for listening
- } else {
- aiActivityIndicator.textContent = "AI: Idle";
- aiActivityIndicator.className = "indicator rounded-full px-4 py-2 text-white flex items-center transition-colors duration-300 bg-gradient-to-r from-gray-300 to-gray-400 dark:from-gray-700 dark:to-gray-800 hover:bg-gradient-to-r from-gray-400 to-gray-500"; // Tailwind classes
  }
  };


- // Initialize speech recognition
  if ('webkitSpeechRecognition' in window) {
  speechRecognizer = new webkitSpeechRecognition();
  Object.assign(speechRecognizer, {
@@ -391,10 +520,9 @@ if ('webkitSpeechRecognition' in window) {

  speechRecognizer.onstart = () => {
  console.log("Speech recognition started");
- completeTranscript = '';
  isUserSpeaking = true;
  lastUserSpeechTimestamp = Date.now();
- updateActivityIndicators();
  startStopButton.innerHTML = '<svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M9 9h6v6h-6z"></path><path d="M12 1a3 3 0 0 0-3 3v8a3 3 0 0 0 6 0V4a3 3 0 0 0-3-3z"></path><path d="M19 10v2a7 7 0 0 1-14 0v-2"></path><line x1="12" y1="19" x2="12" y2="23"></line><line x1="8" y1="23" x2="16" y2="23"></line></svg> Stop Listening';
  };

@@ -403,18 +531,16 @@ if ('webkitSpeechRecognition' in window) {
  for (let i = event.resultIndex; i < event.results.length; i++) {
  const transcript = event.results[i][0].transcript;
  if (event.results[i].isFinal) {
- completeTranscript += transcript;
  interruptAudioPlayback('final');
- processSpeechTranscript(completeTranscript);
- completeTranscript = '';
  isUserSpeaking = false;
- updateActivityIndicators();
  queryStartTime = Date.now();
  } else {
  interimTranscript += transcript;
  isUserSpeaking = true;
  lastUserSpeechTimestamp = Date.now();
- updateActivityIndicators();

  if (interimTranscript.length > prefetchTextQuery.length + 5) {
  cancelPrefetchRequests(prefetchTextQuery);
@@ -436,29 +562,19 @@ if ('webkitSpeechRecognition' in window) {

  speechRecognizer.onend = () => {
  isUserSpeaking = false;
- updateActivityIndicators();
-
- if (!isRequestInProgress && completeTranscript !== '') {
- processSpeechTranscript(completeTranscript);
- completeTranscript = '';
- }

- // Only restart if the user hasn't manually stopped the recognizer
- if (isSpeechRecognitionActive && !userManuallyStoppedRecognizer) {
- speechRecognizer.start();
- }
  };

  startStopButton.addEventListener('click', () => {
  if (isSpeechRecognitionActive) {
  speechRecognizer.stop();
  isSpeechRecognitionActive = false;
- userManuallyStoppedRecognizer = true; // Set to true when manually stopped
  startStopButton.innerHTML = '<svg id="microphoneIcon" xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M12 1a3 3 0 0 0-3 3v8a3 3 0 0 0 6 0V4a3 3 0 0 0-3-3z"></path><path d="M19 10v2a7 7 0 0 1-14 0v-2"></path><line x1="12" y1="19" x2="12" y2="23"></line><line x1="8" y1="23" x2="16" y2="23"></line></svg> Start Listening';
  } else {
  speechRecognizer.start();
  isSpeechRecognitionActive = true;
- userManuallyStoppedRecognizer = false; // Set to false when started
  startStopButton.innerHTML = '<svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M9 9h6v6h-6z"></path><path d="M12 1a3 3 0 0 0-3 3v8a3 3 0 0 0 6 0V4a3 3 0 0 0-3-3z"></path><path d="M19 10v2a7 7 0 0 1-14 0v-2"></path><line x1="12" y1="19" x2="12" y2="23"></line><line x1="8" y1="23" x2="16" y2="23"></line></svg> Stop Listening';
  }
  });
@@ -466,75 +582,7 @@ if ('webkitSpeechRecognition' in window) {
  alert('Your browser does not support the Web Speech API.');
  }

- // Add to conversation history
- const addToConversationHistory = (role, content) => {
- if (conversationHistory.length > 0 &&
- conversationHistory[conversationHistory.length - 1].role === 'assistant' &&
- conversationHistory[conversationHistory.length - 1].content === "") {
- conversationHistory.pop();
- }
-
- conversationHistory.push({ role, content });
-
- if (conversationHistory.length > 6) conversationHistory.splice(0, 2);
- };

- // Process the final speech transcript
- const processSpeechTranscript = (transcript) => {
- const trimmedTranscript = transcript.trimStart();
- if (trimmedTranscript !== '' && !isRequestInProgress) {
- activeQuery = trimmedTranscript;
- sendQueryToAI(activeQuery);
- addToConversationHistory('user', activeQuery);
- }
- };
-
- // Check if audio playback should be interrupted
- const shouldInterruptAudioPlayback = (interimTranscript) =>
- Date.now() - lastUserSpeechTimestamp > USER_SPEECH_INTERRUPT_DELAY || interimTranscript.length > 5;
-
- // Interrupt audio playback
- const interruptAudioPlayback = (reason = 'unknown') => {
- console.log(`Interrupting audio (reason: ${reason})...`);
- if (currentAudio) {
- currentAudio.pause();
- currentAudio.currentTime = 0;
- currentAudio = null;
- }
-
- audioPlaybackQueue.length = 0;
- isRequestInProgress = false;
-
- if (requestAbortController) {
- requestAbortController.abort();
- requestAbortController = null;
- }
-
- prefetchCache.clear();
- prefetchQueue.length = 0;
- updateActivityIndicators();
- };
-
- // Cancel pending prefetch requests
- const cancelPrefetchRequests = (query) => {
- const normalizedQuery = normalizeQueryText(query);
-
- for (const [cacheKey, abortController] of pendingPrefetchRequests) {
- if (cacheKey.startsWith(normalizedQuery)) {
- abortController.abort();
- pendingPrefetchRequests.delete(cacheKey);
- }
- }
- };
-
- // Update latency display
- const updateLatency = () => {
- if (firstResponseTextTimestamp) {
- const latency = firstResponseTextTimestamp - queryStartTime;
- responseTimeDisplay.textContent = `Latency: ${latency}ms`;
- } else {
- responseTimeDisplay.textContent = "Latency: 0ms";
- }
- };

- setInterval(updateLatency, 100);
+ // script1.js
+ import { client, handle_file } from 'https://cdn.jsdelivr.net/npm/@gradio/client/+esm';
+
+ const video = document.getElementById('webcam');
+ let gradioApp; // Declare gradioApp outside the function
+
+ const GRADIO_CLIENTS = [
+ "multimodalart/Florence-2-l4",
+ "gokaygokay/Florence-2",
+ "multimodalart/Florence-2-l4-2",
+ "gokaygokay/Florence-2",
+ ];
+
+ async function startWebcam() {
+ try {
+ const stream = await navigator.mediaDevices.getUserMedia({ video: true });
+ video.srcObject = stream;
+ } catch (error) {
+ console.error("Error accessing webcam:", error);
+ }
+ }
+
+ async function getCaption() {
+ if (!gradioApp) {
+ try {
+ const randomClient = GRADIO_CLIENTS[Math.floor(Math.random() * GRADIO_CLIENTS.length)];
+ gradioApp = await client(randomClient);
+ } catch (error) {
+ console.error("Error loading Gradio client:", error);
+ return "Error getting caption"; // Or some other default
+ }
+ }
+
+ try {
+ const canvas = document.createElement('canvas');
+ canvas.width = video.videoWidth;
+ canvas.height = video.videoHeight;
+ const context = canvas.getContext('2d');
+ context.drawImage(video, 0, 0, canvas.width, canvas.height);
+ const blob = await new Promise(resolve => canvas.toBlob(resolve, 'image/png'));
+ const handledFile = await handle_file(blob);
+
+ const result = await gradioApp.predict("/process_image", [handledFile, "More Detailed Caption"]);
+ return result.data[0];
+ } catch (error) {
+ console.error("Error getting caption:", error);
+ return "Error getting caption"; // Or handle the error differently
+ }
+ }
+
+
+ // Constants and Configuration
+ const USER_SPEECH_INTERRUPT_DELAY = 500;
+ const TEXT_TO_SPEECH_API_ENDPOINT = "https://api.streamelements.com/kappa/v2/speech";
+ const CHUNK_SIZE = 300;
+ const MAX_PREFETCH_REQUESTS = 10;
+ const PREFETCH_CACHE_EXPIRATION = 60000; // 1 minute
+ const AUDIO_CACHE_EXPIRATION = 3600000; // 1 hour
+
+ // DOM Elements
  const startStopButton = document.getElementById('startStopButton');
  const voiceSelectionDropdown = document.getElementById('voiceSelect');
  const modelSelectionDropdown = document.getElementById('modelSelect');
+ const noiseSuppressionCheckbox = document.getElementById('noiseSuppression');
  const responseTimeDisplay = document.getElementById('responseTime');
  const userActivityIndicator = document.getElementById('userIndicator');
  const aiActivityIndicator = document.getElementById('aiIndicator');
  const transcriptDiv = document.getElementById('transcript');

+ // Speech Recognition
  let speechRecognizer;
+ let isSpeechRecognitionActive = false;
+
+ // AI Interaction State
  let activeQuery = null;
  let queryStartTime = 0;
  let isRequestInProgress = false;
  let isUserSpeaking = false;
  let requestAbortController = null;
  let firstResponseTextTimestamp = null;

  // Audio Management
  let currentAudio = null;
  let audioPlaybackQueue = [];

+ // Prefetching and Caching
+ const prefetchCache = new Map();
  const pendingPrefetchRequests = new Map();
+ const prefetchQueue = [];
+ let prefetchTextQuery = "";

+ // Conversation History
  let conversationHistory = [];

  // Audio Caching
+ const audioCache = new Map();
+
+ // Utility Functions

+ // Normalize query text
  const normalizeQueryText = query => query.trim().toLowerCase().replace(/[^\w\s]/g, '');

  // Generate a cache key
+ const generateCacheKey = (normalizedQuery, voice, history, modelName) =>
+ `${normalizedQuery}-${voice}-${JSON.stringify(history)}-${modelName}`;
+
+ // Update activity indicators
+ const updateActivityIndicators = (state = null) => {
+ userActivityIndicator.textContent = isUserSpeaking ? "User: Speaking" : "User: Idle";
+
+ if (isRequestInProgress && !currentAudio) {
+ aiActivityIndicator.textContent = "AI: Processing...";
+ } else if (currentAudio && !isUserSpeaking) {
+ aiActivityIndicator.textContent = state || "AI: Speaking";
+ } else if (isUserSpeaking) {
+ aiActivityIndicator.textContent = "AI: Listening";
+ } else {
+ aiActivityIndicator.textContent = "AI: Idle";
+ }
+ };
+
+ // Update latency display
+ const updateLatency = () => {
+ if (firstResponseTextTimestamp) {
+ const latency = firstResponseTextTimestamp - queryStartTime;
+ responseTimeDisplay.textContent = `Latency: ${latency}ms`;
+ } else {
+ responseTimeDisplay.textContent = "Latency: 0ms";
+ }
+ };
+
+ // Add to conversation history
+ const addToConversationHistory = (role, content) => {
+ if (conversationHistory.length > 0 &&
+ conversationHistory[conversationHistory.length - 1].role === 'assistant' &&
+ conversationHistory[conversationHistory.length - 1].content === "") {
+ conversationHistory.pop();
+ }
+
+ conversationHistory.push({ role, content });
+
+ if (conversationHistory.length > 6) conversationHistory.splice(0, 2);
+ };
+
+ // Check if audio playback should be interrupted
+ const shouldInterruptAudioPlayback = (interimTranscript) =>
+ Date.now() - lastUserSpeechTimestamp > USER_SPEECH_INTERRUPT_DELAY || interimTranscript.length > 5;
+
+
+ // Audio Management Functions
+
+ // Play audio from the queue
+ const playNextAudio = async () => {
+ if (audioPlaybackQueue.length > 0) {
+ const audioData = audioPlaybackQueue.shift();
+ const audio = new Audio(audioData.url);
+ updateActivityIndicators();
+
+ const audioPromise = new Promise(resolve => {
+ audio.onended = resolve;
+ audio.onerror = resolve;
+ });
+ if (currentAudio) {
+ currentAudio.pause();
+ currentAudio.currentTime = 0;
+ }
+
+ currentAudio = audio;
+ await audio.play();
+ await audioPromise;
+ playNextAudio();
+ } else {
+ updateActivityIndicators();
+ }
+ };
+
+ // Interrupt audio playback
+ const interruptAudioPlayback = (reason = 'unknown') => {
+ console.log(`Interrupting audio (reason: ${reason})...`);
+ if (currentAudio) {
+ currentAudio.pause();
+ currentAudio.currentTime = 0;
+ currentAudio = null;
+ }
+
+ audioPlaybackQueue.length = 0;
+ isRequestInProgress = false;
+
+ if (requestAbortController) {
+ requestAbortController.abort();
+ requestAbortController = null;
+ }
+
+ prefetchCache.clear();
+ prefetchQueue.length = 0;
+ updateActivityIndicators();
+ };
+
+
+ // Prefetching and Caching Functions

  // Prefetch and cache the first TTS audio chunk
  const prefetchFirstAudioChunk = (query, voice) => {
+ const normalizedQuery = normalizeQueryText(query);
+ const cacheKey = generateCacheKey(normalizedQuery, voice, conversationHistory, modelSelectionDropdown.value);

+ if (pendingPrefetchRequests.has(cacheKey) || prefetchCache.has(cacheKey)) return;

+ prefetchQueue.push({ query: query.trim(), voice, cacheKey });
+ processPrefetchQueue();
  };

  // Process the prefetch queue
  const processPrefetchQueue = async () => {
+ while (prefetchQueue.length > 0 && pendingPrefetchRequests.size < MAX_PREFETCH_REQUESTS) {
+ const { query, voice, cacheKey } = prefetchQueue.shift();
+ const abortController = new AbortController();
+ pendingPrefetchRequests.set(cacheKey, abortController);

+ try {
+ const firstAudioUrl = await streamAndPrefetchAudio(query, voice, abortController.signal);

+ if (firstAudioUrl) prefetchCache.set(cacheKey, { url: firstAudioUrl, timestamp: Date.now() });

+ } catch (error) {
+ if (error.name !== 'AbortError') console.error("Error prefetching audio:", error);
+ } finally {
+ pendingPrefetchRequests.delete(cacheKey);
+ processPrefetchQueue();
+ }
+ }
+ };
+
+ // Cancel pending prefetch requests
+ const cancelPrefetchRequests = (query) => {
+ const normalizedQuery = normalizeQueryText(query);
+
+ for (const [cacheKey, abortController] of pendingPrefetchRequests) {
+ if (cacheKey.startsWith(normalizedQuery)) {
+ abortController.abort();
+ pendingPrefetchRequests.delete(cacheKey);
+ }
+ }
+ };

+ // AI Interaction Functions

+ // Modify sendQueryToAI to include the caption
+ async function sendQueryToAI(query) {
+ const caption = await getCaption();
+ const modifiedQuery = JSON.stringify({ USER: query, CAPTION: caption });

+ console.log("Sending query to AI:", modifiedQuery);
+ isRequestInProgress = true;
+ updateActivityIndicators();
+ firstResponseTextTimestamp = null;
+
+ const normalizedQuery = normalizeQueryText(query);
+ const cacheKey = generateCacheKey(normalizedQuery, modelSelectionDropdown.value, conversationHistory, modelSelectionDropdown.value);
+
+ queryStartTime = Date.now();
+
+ // Check prefetch cache
+ if (prefetchCache.has(cacheKey)) {
+ const cachedData = prefetchCache.get(cacheKey);
+ if (Date.now() - cachedData.timestamp < PREFETCH_CACHE_EXPIRATION) {
+ audioPlaybackQueue.push({ url: cachedData.url, isPrefetched: true });
+ playNextAudio();
+ } else {
+ prefetchCache.delete(cacheKey);
+ }
+ }
+
+ requestAbortController = new AbortController();
+
+ try {
+ await streamAndHandleAudioResponse(modifiedQuery, voiceSelectionDropdown.value, requestAbortController.signal);
+ } catch (error) {
+ if (error.name !== 'AbortError') {
+ console.error("Error sending query to AI:", error);
+ }
+ } finally {
+ isRequestInProgress = false;
+ updateActivityIndicators();
+ }
+ };
+
+ // Process the final speech transcript
+ const processSpeechTranscript = (transcript) => {
+ const trimmedTranscript = transcript.trimStart();
+ if (trimmedTranscript !== '' && !isRequestInProgress) {
+ activeQuery = trimmedTranscript;
+ sendQueryToAI(activeQuery);
+ addToConversationHistory('user', activeQuery);
+ }
+ };
+
+
+ // Network and Streaming Functions
+
+ // Stream AI response and handle audio
+ const streamAndHandleAudioResponse = async (query, voice, abortSignal) => {
+ const response = await fetchAIResponse(query, abortSignal);
+
+ if (!response.ok) {
+ if (response.status === 429) {
+ console.log("Rate limit hit, retrying in 1 second...");
+ await new Promise(resolve => setTimeout(resolve, 1000));
+ await sendQueryToAI(query);
+ return;
  }
+ throw new Error(`Network response was not ok: ${response.status}`);
  }
+
+ console.log("Streaming audio response received");
+ await handleStreamingResponse(response.body, voice, abortSignal);
+ };
+
+ // Stream AI response for prefetching
+ const streamAndPrefetchAudio = async (query, voice, abortSignal) => {
+ const response = await fetchAIResponse(query, abortSignal);
+
+ if (!response.ok) throw new Error('Network response was not ok');
+
+ return handleStreamingResponseForPrefetch(response.body, voice, abortSignal);
+ };
+
+ // Fetch AI response
+ const fetchAIResponse = async (query, abortSignal) => {
+ const userSambanovaKey = document.getElementById('apiKey').value.trim() !== '' ? document.getElementById('apiKey').value.trim() : 'none';
+
+ const url = '/stream_text';
+ const requestBody = {
+ query: query,
+ history: JSON.stringify(conversationHistory),
+ model: modelSelectionDropdown.value,
+ api_key: userSambanovaKey
+ };
+
+ return fetch(url, {
+ method: 'POST',
+ headers: {
+ 'Accept': 'text/event-stream',
+ 'Content-Type': 'application/json'
+ },
+ body: JSON.stringify(requestBody),
+ signal: abortSignal
+ });
  };

  // Handle the streaming response for prefetching

  if (line.startsWith('data: ')) {
  const textContent = line.substring(6).trim();
  if (textContent) {
+ return await generateTextToSpeechAudio(textContent, voice);
  }
  }
  }

  return null;
  };

  // Handle the streaming audio response
  const handleStreamingResponse = async (responseStream, voice, abortSignal) => {
  const reader = responseStream.getReader();

  let fullResponseText = "";
  let fullResponseText2 = "";
  let textChunk = "";
+ let sentText = "";

  try {
  while (true) {


  if (isUserSpeaking) {
  interruptAudioPlayback('user is speaking');
+ break;
  }

  const chunk = decoder.decode(value, { stream: true });

  fullResponseText += textContent + " ";
  fullResponseText2 += textContent + " ";
  textChunk += textContent + " ";
+ transcriptDiv.textContent = fullResponseText2;

  if (initialChunksSent < 2) {
  const audioUrl = await generateTextToSpeechAudio(textContent, voice);

  audioPlaybackQueue.push({ url: audioUrl, isPrefetched: false });
  if (!currentAudio) playNextAudio();
  }
+ sentText += textContent + " ";
  initialChunksSent++;
  } else {
  let unsentTextChunk = textChunk.replace(sentText, '').trim();

  audioPlaybackQueue.push({ url: audioUrl, isPrefetched: false });
  if (!currentAudio) playNextAudio();
  }
+ textChunk = "";
  }
  }

  if (fullResponseText !== '') {
+ fullResponseText = '';
  }
  }
  }

  }
  };

+ // Generate Text-to-Speech audio with caching
+ const generateTextToSpeechAudio = async (text, voice) => {
+ const normalizedText = normalizeQueryText(text);
+ const cacheKey = `${normalizedText}-${voice}`;
+
+ if (audioCache.has(cacheKey)) {
+ const cachedData = audioCache.get(cacheKey);
+ if (Date.now() - cachedData.timestamp < AUDIO_CACHE_EXPIRATION) {
+ return cachedData.url;
+ } else {
+ audioCache.delete(cacheKey);
+ }
+ }
+
+ try {
+ const response = await fetch(`${TEXT_TO_SPEECH_API_ENDPOINT}?voice=${voice}&text=${encodeURIComponent(text)}`, { method: 'GET' });
+ if (!response.ok) throw new Error('Network response was not ok');
+ const audioBlob = await response.blob();
+ const audioUrl = URL.createObjectURL(audioBlob);
+
+ audioCache.set(cacheKey, { url: audioUrl, timestamp: Date.now() });
+ return audioUrl;
+ } catch (error) {
+ console.error("Error generating TTS audio:", error);
+ return null;
  }
  };


+ // Speech Recognition Initialization
+
  if ('webkitSpeechRecognition' in window) {
  speechRecognizer = new webkitSpeechRecognition();
  Object.assign(speechRecognizer, {


  speechRecognizer.onstart = () => {
  console.log("Speech recognition started");
  isUserSpeaking = true;
  lastUserSpeechTimestamp = Date.now();
+ updateActivityIndicators();
  startStopButton.innerHTML = '<svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M9 9h6v6h-6z"></path><path d="M12 1a3 3 0 0 0-3 3v8a3 3 0 0 0 6 0V4a3 3 0 0 0-3-3z"></path><path d="M19 10v2a7 7 0 0 1-14 0v-2"></path><line x1="12" y1="19" x2="12" y2="23"></line><line x1="8" y1="23" x2="16" y2="23"></line></svg> Stop Listening';
  };

  for (let i = event.resultIndex; i < event.results.length; i++) {
  const transcript = event.results[i][0].transcript;
  if (event.results[i].isFinal) {
  interruptAudioPlayback('final');
+ processSpeechTranscript(transcript);
  isUserSpeaking = false;
+ updateActivityIndicators();
  queryStartTime = Date.now();
  } else {
  interimTranscript += transcript;
  isUserSpeaking = true;
  lastUserSpeechTimestamp = Date.now();
+ updateActivityIndicators();

  if (interimTranscript.length > prefetchTextQuery.length + 5) {
  cancelPrefetchRequests(prefetchTextQuery);


  speechRecognizer.onend = () => {
  isUserSpeaking = false;
+ updateActivityIndicators();

+ if (isSpeechRecognitionActive) speechRecognizer.start();
  };

  startStopButton.addEventListener('click', () => {
  if (isSpeechRecognitionActive) {
  speechRecognizer.stop();
  isSpeechRecognitionActive = false;
  startStopButton.innerHTML = '<svg id="microphoneIcon" xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M12 1a3 3 0 0 0-3 3v8a3 3 0 0 0 6 0V4a3 3 0 0 0-3-3z"></path><path d="M19 10v2a7 7 0 0 1-14 0v-2"></path><line x1="12" y1="19" x2="12" y2="23"></line><line x1="8" y1="23" x2="16" y2="23"></line></svg> Start Listening';
  } else {
  speechRecognizer.start();
  isSpeechRecognitionActive = true;
  startStopButton.innerHTML = '<svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M9 9h6v6h-6z"></path><path d="M12 1a3 3 0 0 0-3 3v8a3 3 0 0 0 6 0V4a3 3 0 0 0-3-3z"></path><path d="M19 10v2a7 7 0 0 1-14 0v-2"></path><line x1="12" y1="19" x2="12" y2="23"></line><line x1="8" y1="23" x2="16" y2="23"></line></svg> Stop Listening';
  }
  });

  alert('Your browser does not support the Web Speech API.');
  }

+ setInterval(updateLatency, 100);
+ window.onload = startWebcam;
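
For reviewers, here is a condensed, hedged sketch of the flow this commit wires in (webcam frame → Florence-2 caption via @gradio/client → caption bundled into the query). It only reuses identifiers and endpoints that appear in the diff above; the helper names captureFrame and buildCaptionedQuery are illustrative and are not part of the commit.

// Illustrative sketch only — not part of the commit; mirrors getCaption() and sendQueryToAI() above.
import { client, handle_file } from 'https://cdn.jsdelivr.net/npm/@gradio/client/+esm';

const video = document.getElementById('webcam');

// Grab the current webcam frame as a PNG blob (same canvas approach as getCaption).
async function captureFrame() {
  const canvas = document.createElement('canvas');
  canvas.width = video.videoWidth;
  canvas.height = video.videoHeight;
  canvas.getContext('2d').drawImage(video, 0, 0, canvas.width, canvas.height);
  return new Promise(resolve => canvas.toBlob(resolve, 'image/png'));
}

// Caption the frame with one of the Florence-2 Spaces listed in GRADIO_CLIENTS,
// then bundle the caption with the spoken query the same way sendQueryToAI does.
async function buildCaptionedQuery(userQuery) {
  const app = await client('gokaygokay/Florence-2');
  const frame = await handle_file(await captureFrame());
  const result = await app.predict('/process_image', [frame, 'More Detailed Caption']);
  return JSON.stringify({ USER: userQuery, CAPTION: result.data[0] });
}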