KingNish committed on
Commit 1fa2d43
1 Parent(s): 15991ac

Previous one is working but slow

Files changed (1)
  1. script1.js +361 -316
script1.js CHANGED
@@ -1,57 +1,6 @@
- // script1.js
- import { client, handle_file } from 'https://cdn.jsdelivr.net/npm/@gradio/client/+esm';
-
- const video = document.getElementById('webcam');
- let gradioApp; // Declare gradioApp outside the function
-
- const GRADIO_CLIENTS = [
- "multimodalart/Florence-2-l4",
- "gokaygokay/Florence-2",
- "multimodalart/Florence-2-l4-2",
- "gokaygokay/Florence-2",
- ];
-
- async function startWebcam() {
- try {
- const stream = await navigator.mediaDevices.getUserMedia({ video: true });
- video.srcObject = stream;
- } catch (error) {
- console.error("Error accessing webcam:", error);
- }
- }
-
- async function getCaption() {
- if (!gradioApp) {
- try {
- const randomClient = GRADIO_CLIENTS[Math.floor(Math.random() * GRADIO_CLIENTS.length)];
- gradioApp = await client(randomClient);
- } catch (error) {
- console.error("Error loading Gradio client:", error);
- return "Error getting caption"; // Or some other default
- }
- }
-
- try {
- const canvas = document.createElement('canvas');
- canvas.width = video.videoWidth;
- canvas.height = video.videoHeight;
- const context = canvas.getContext('2d');
- context.drawImage(video, 0, 0, canvas.width, canvas.height);
- const blob = await new Promise(resolve => canvas.toBlob(resolve, 'image/png'));
- const handledFile = await handle_file(blob);
-
- const result = await gradioApp.predict("/process_image", [handledFile, "More Detailed Caption"]);
- return result.data[0];
- } catch (error) {
- console.error("Error getting caption:", error);
- return "Error getting caption"; // Or handle the error differently
- }
- }
-
-
  // Constants and Configuration
  const USER_SPEECH_INTERRUPT_DELAY = 500;
- const TEXT_TO_SPEECH_API_ENDPOINT = "https://api.streamelements.com/kappa/v2/speech";
  const CHUNK_SIZE = 300;
  const MAX_PREFETCH_REQUESTS = 10;
  const PREFETCH_CACHE_EXPIRATION = 60000; // 1 minute
@@ -61,7 +10,7 @@ const AUDIO_CACHE_EXPIRATION = 3600000; // 1 hour
  const startStopButton = document.getElementById('startStopButton');
  const voiceSelectionDropdown = document.getElementById('voiceSelect');
  const modelSelectionDropdown = document.getElementById('modelSelect');
- const noiseSuppressionCheckbox = document.getElementById('noiseSuppression');
  const responseTimeDisplay = document.getElementById('responseTime');
  const userActivityIndicator = document.getElementById('userIndicator');
  const aiActivityIndicator = document.getElementById('aiIndicator');
@@ -95,6 +44,19 @@ let conversationHistory = [];
  // Audio Caching
  const audioCache = new Map();

  // Utility Functions

  // Normalize query text
@@ -246,54 +208,51 @@ const cancelPrefetchRequests = (query) => {

  // AI Interaction Functions

- // Modify sendQueryToAI to include the caption
- async function sendQueryToAI(query) {
- const caption = await getCaption();
- const modifiedQuery = JSON.stringify({ USER: query, CAPTION: caption });
-
- console.log("Sending query to AI:", modifiedQuery);
- isRequestInProgress = true;
- updateActivityIndicators();
- firstResponseTextTimestamp = null;
-
- const normalizedQuery = normalizeQueryText(query);
- const cacheKey = generateCacheKey(normalizedQuery, modelSelectionDropdown.value, conversationHistory, modelSelectionDropdown.value);
-
- queryStartTime = Date.now();

- // Check prefetch cache
- if (prefetchCache.has(cacheKey)) {
- const cachedData = prefetchCache.get(cacheKey);
- if (Date.now() - cachedData.timestamp < PREFETCH_CACHE_EXPIRATION) {
- audioPlaybackQueue.push({ url: cachedData.url, isPrefetched: true });
- playNextAudio();
- } else {
- prefetchCache.delete(cacheKey);
  }
- }

- requestAbortController = new AbortController();

- try {
- await streamAndHandleAudioResponse(modifiedQuery, voiceSelectionDropdown.value, requestAbortController.signal);
- } catch (error) {
- if (error.name !== 'AbortError') {
- console.error("Error sending query to AI:", error);
  }
- } finally {
- isRequestInProgress = false;
- updateActivityIndicators();
- }
  };

  // Process the final speech transcript
  const processSpeechTranscript = (transcript) => {
- const trimmedTranscript = transcript.trimStart();
- if (trimmedTranscript !== '' && !isRequestInProgress) {
- activeQuery = trimmedTranscript;
- sendQueryToAI(activeQuery);
- addToConversationHistory('user', activeQuery);
- }
  };

@@ -301,288 +260,374 @@ const processSpeechTranscript = (transcript) => {

  // Stream AI response and handle audio
  const streamAndHandleAudioResponse = async (query, voice, abortSignal) => {
- const response = await fetchAIResponse(query, abortSignal);
-
- if (!response.ok) {
- if (response.status === 429) {
- console.log("Rate limit hit, retrying in 1 second...");
- await new Promise(resolve => setTimeout(resolve, 1000));
- await sendQueryToAI(query);
- return;
  }
- throw new Error(`Network response was not ok: ${response.status}`);
- }

- console.log("Streaming audio response received");
- await handleStreamingResponse(response.body, voice, abortSignal);
  };

  // Stream AI response for prefetching
  const streamAndPrefetchAudio = async (query, voice, abortSignal) => {
- const response = await fetchAIResponse(query, abortSignal);

- if (!response.ok) throw new Error('Network response was not ok');

- return handleStreamingResponseForPrefetch(response.body, voice, abortSignal);
  };

  // Fetch AI response
  const fetchAIResponse = async (query, abortSignal) => {
- const userSambanovaKey = document.getElementById('apiKey').value.trim() !== '' ? document.getElementById('apiKey').value.trim() : 'none';
-
- const url = '/stream_text';
- const requestBody = {
- query: query,
- history: JSON.stringify(conversationHistory),
- model: modelSelectionDropdown.value,
- api_key: userSambanovaKey
- };
-
- return fetch(url, {
- method: 'POST',
- headers: {
- 'Accept': 'text/event-stream',
- 'Content-Type': 'application/json'
- },
- body: JSON.stringify(requestBody),
- signal: abortSignal
- });
  };

  // Handle the streaming response for prefetching
  const handleStreamingResponseForPrefetch = async (responseStream, voice, abortSignal) => {
- const reader = responseStream.getReader();
- const decoder = new TextDecoder("utf-8");
- let buffer = "";
-
- try {
- while (true) {
- const { done, value } = await reader.read();
- if (done) break;
- if (abortSignal.aborted) throw new DOMException('Request aborted', 'AbortError');
-
- const chunk = decoder.decode(value, { stream: true });
- buffer += chunk;
- const lines = buffer.split('\n');
-
- for (let i = 0; i < lines.length - 1; i++) {
- const line = lines[i];
- if (line.startsWith('data: ')) {
- const textContent = line.substring(6).trim();
- if (textContent) {
- return await generateTextToSpeechAudio(textContent, voice);
- }
- }
- }

- buffer = lines[lines.length - 1];
  }
- } catch (error) {
- console.error("Error in handleStreamingResponseForPrefetch:", error);
- } finally {
- reader.releaseLock();
- }

- return null;
  };

  // Handle the streaming audio response
  const handleStreamingResponse = async (responseStream, voice, abortSignal) => {
- const reader = responseStream.getReader();
- const decoder = new TextDecoder("utf-8");
- let buffer = "";
- let initialChunksSent = 0;
- let fullResponseText = "";
- let fullResponseText2 = "";
- let textChunk = "";
- let sentText = "";
-
- try {
- while (true) {
- const { done, value } = await reader.read();
- if (done) break;
- if (abortSignal.aborted) throw new DOMException('Request aborted', 'AbortError');
-
- if (isUserSpeaking) {
- interruptAudioPlayback('user is speaking');
- break;
- }
-
- const chunk = decoder.decode(value, { stream: true });
- buffer += chunk;
- const lines = buffer.split('\n');
-
- for (let i = 0; i < lines.length - 1; i++) {
- const line = lines[i];
- if (line.startsWith('data: ')) {
- const textContent = line.substring(6).trim();
- if (textContent) {
- if (!firstResponseTextTimestamp) firstResponseTextTimestamp = Date.now();
-
- fullResponseText += textContent + " ";
- fullResponseText2 += textContent + " ";
- textChunk += textContent + " ";
- transcriptDiv.textContent = fullResponseText2;
-
- if (initialChunksSent < 2) {
- const audioUrl = await generateTextToSpeechAudio(textContent, voice);
- if (audioUrl) {
- audioPlaybackQueue.push({ url: audioUrl, isPrefetched: false });
- if (!currentAudio) playNextAudio();
- }
- sentText += textContent + " ";
- initialChunksSent++;
- } else {
- let unsentTextChunk = textChunk.replace(sentText, '').trim();

- if (unsentTextChunk.length >= CHUNK_SIZE) {
- const audioUrl = await generateTextToSpeechAudio(unsentTextChunk, voice);
- if (audioUrl) {
- audioPlaybackQueue.push({ url: audioUrl, isPrefetched: false });
- if (!currentAudio) playNextAudio();
  }
- textChunk = "";
- }
  }

- if (fullResponseText !== '') {
- fullResponseText = '';
  }
- }
  }
- }

- buffer = lines[lines.length - 1];
- }
- } catch (error) {
- console.error("Error in handleStreamingResponse:", error);
- } finally {
- reader.releaseLock();
-
- let unsentTextChunk = textChunk.replace(sentText, '').trim();
- if (unsentTextChunk !== "") {
- const audioUrl = await generateTextToSpeechAudio(unsentTextChunk, voice);
- if (audioUrl) {
- audioPlaybackQueue.push({ url: audioUrl, isPrefetched: false });
- if (!currentAudio) playNextAudio();
- }
- }
-
- if (fullResponseText !== '') {
- fullResponseText = '';
- }
- if (fullResponseText2 !== '') {
- addToConversationHistory('assistant', fullResponseText2);
- fullResponseText2 = '';
  }
- }
  };

  // Generate Text-to-Speech audio with caching
  const generateTextToSpeechAudio = async (text, voice) => {
- const normalizedText = normalizeQueryText(text);
- const cacheKey = `${normalizedText}-${voice}`;
-
- if (audioCache.has(cacheKey)) {
- const cachedData = audioCache.get(cacheKey);
- if (Date.now() - cachedData.timestamp < AUDIO_CACHE_EXPIRATION) {
- return cachedData.url;
- } else {
- audioCache.delete(cacheKey);
  }
- }

- try {
- const response = await fetch(`${TEXT_TO_SPEECH_API_ENDPOINT}?voice=${voice}&text=${encodeURIComponent(text)}`, { method: 'GET' });
- if (!response.ok) throw new Error('Network response was not ok');
- const audioBlob = await response.blob();
- const audioUrl = URL.createObjectURL(audioBlob);

- audioCache.set(cacheKey, { url: audioUrl, timestamp: Date.now() });
- return audioUrl;
- } catch (error) {
- console.error("Error generating TTS audio:", error);
- return null;
- }
  };


  // Speech Recognition Initialization

  if ('webkitSpeechRecognition' in window) {
- speechRecognizer = new webkitSpeechRecognition();
- Object.assign(speechRecognizer, {
- continuous: true,
- interimResults: true,
- language: 'en-US',
- maxAlternatives: 3
- });
-
- speechRecognizer.onstart = () => {
- console.log("Speech recognition started");
- isUserSpeaking = true;
- lastUserSpeechTimestamp = Date.now();
- updateActivityIndicators();
- startStopButton.innerHTML = '<svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M9 9h6v6h-6z"></path><path d="M12 1a3 3 0 0 0-3 3v8a3 3 0 0 0 6 0V4a3 3 0 0 0-3-3z"></path><path d="M19 10v2a7 7 0 0 1-14 0v-2"></path><line x1="12" y1="19" x2="12" y2="23"></line><line x1="8" y1="23" x2="16" y2="23"></line></svg> Stop Listening';
- };
-
- speechRecognizer.onresult = (event) => {
- let interimTranscript = '';
- for (let i = event.resultIndex; i < event.results.length; i++) {
- const transcript = event.results[i][0].transcript;
- if (event.results[i].isFinal) {
- interruptAudioPlayback('final');
- processSpeechTranscript(transcript);
- isUserSpeaking = false;
- updateActivityIndicators();
- queryStartTime = Date.now();
- } else {
- interimTranscript += transcript;
  isUserSpeaking = true;
  lastUserSpeechTimestamp = Date.now();
  updateActivityIndicators();

- if (interimTranscript.length > prefetchTextQuery.length + 5) {
- cancelPrefetchRequests(prefetchTextQuery);
  }
- prefetchTextQuery = interimTranscript;
- prefetchFirstAudioChunk(interimTranscript, voiceSelectionDropdown.value);

- if (isRequestInProgress && shouldInterruptAudioPlayback(interimTranscript)) {
- interruptAudioPlayback('interim');
  }
- }
  }
- };

- speechRecognizer.onerror = (event) => {
- console.error('Speech recognition error:', event.error);
- if (isSpeechRecognitionActive) speechRecognizer.start();
- };

- speechRecognizer.onend = () => {
- isUserSpeaking = false;
- updateActivityIndicators();

- if (isSpeechRecognitionActive) speechRecognizer.start();
- };

- startStopButton.addEventListener('click', () => {
- if (isSpeechRecognitionActive) {
- speechRecognizer.stop();
- isSpeechRecognitionActive = false;
- startStopButton.innerHTML = '<svg id="microphoneIcon" xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M12 1a3 3 0 0 0-3 3v8a3 3 0 0 0 6 0V4a3 3 0 0 0-3-3z"></path><path d="M19 10v2a7 7 0 0 1-14 0v-2"></path><line x1="12" y1="19" x2="12" y2="23"></line><line x1="8" y1="23" x2="16" y2="23"></line></svg> Start Listening';
- } else {
- speechRecognizer.start();
- isSpeechRecognitionActive = true;
- startStopButton.innerHTML = '<svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M9 9h6v6h-6z"></path><path d="M12 1a3 3 0 0 0-3 3v8a3 3 0 0 0 6 0V4a3 3 0 0 0-3-3z"></path><path d="M19 10v2a7 7 0 0 1-14 0v-2"></path><line x1="12" y1="19" x2="12" y2="23"></line><line x1="8" y1="23" x2="16" y2="23"></line></svg> Stop Listening';
  }
- });
- } else {
- alert('Your browser does not support the Web Speech API.');
  }

- setInterval(updateLatency, 100);


- window.onload = startWebcam;
  // Constants and Configuration
  const USER_SPEECH_INTERRUPT_DELAY = 500;
+ const TEXT_TO_SPEECH_API_ENDPOINT = "https://api.streamelements.com/kappa/v2/speech"; // Replace with your TTS endpoint
  const CHUNK_SIZE = 300;
  const MAX_PREFETCH_REQUESTS = 10;
  const PREFETCH_CACHE_EXPIRATION = 60000; // 1 minute

  const startStopButton = document.getElementById('startStopButton');
  const voiceSelectionDropdown = document.getElementById('voiceSelect');
  const modelSelectionDropdown = document.getElementById('modelSelect');
+ const noiseSuppressionCheckbox = document.getElementById('noiseSuppression'); // Assuming you have this in your HTML
  const responseTimeDisplay = document.getElementById('responseTime');
  const userActivityIndicator = document.getElementById('userIndicator');
  const aiActivityIndicator = document.getElementById('aiIndicator');

  // Audio Caching
  const audioCache = new Map();

+ // Webcam and Gradio Integration
+ import { client, handle_file } from 'https://cdn.jsdelivr.net/npm/@gradio/client/+esm';
+ const video = document.getElementById('webcam');
+ const clients = [
+ "multimodalart/Florence-2-l4",
+ "gokaygokay/Florence-2",
+ "multimodalart/Florence-2-l4-2",
+ "gokaygokay/Florence-2",
+ ]; // Or your preferred Gradio models
+ let app;
+ let lastCaption = "";
+
+
  // Utility Functions

  // Normalize query text


  // AI Interaction Functions

+ // Send a query to the AI
+ const sendQueryToAI = async (query) => {
+ console.log("Sending query to AI:", query);
+ isRequestInProgress = true;
+ updateActivityIndicators();
+ firstResponseTextTimestamp = null;

+ const normalizedQuery = normalizeQueryText(query);
+ const cacheKey = generateCacheKey(normalizedQuery, modelSelectionDropdown.value, conversationHistory, modelSelectionDropdown.value);
+
+ queryStartTime = Date.now();
+
+ // Check prefetch cache
+ if (prefetchCache.has(cacheKey)) {
+ const cachedData = prefetchCache.get(cacheKey);
+ if (Date.now() - cachedData.timestamp < PREFETCH_CACHE_EXPIRATION) {
+ audioPlaybackQueue.push({ url: cachedData.url, isPrefetched: true });
+ playNextAudio();
+ } else {
+ prefetchCache.delete(cacheKey);
+ }
  }

+ requestAbortController = new AbortController();

+ try {
+ await streamAndHandleAudioResponse(query, voiceSelectionDropdown.value, requestAbortController.signal);
+ } catch (error) {
+ if (error.name !== 'AbortError') {
+ console.error("Error sending query to AI:", error);
+ }
+ } finally {
+ isRequestInProgress = false;
+ updateActivityIndicators();
  }
  };

  // Process the final speech transcript
  const processSpeechTranscript = (transcript) => {
+ const trimmedTranscript = transcript.trimStart();
+ if (trimmedTranscript !== '' && !isRequestInProgress) {
+ activeQuery = trimmedTranscript;
+ sendQueryToAI(activeQuery);
+ addToConversationHistory('user', activeQuery);
+ }
  };



  // Stream AI response and handle audio
  const streamAndHandleAudioResponse = async (query, voice, abortSignal) => {
+ const response = await fetchAIResponse(query, abortSignal);
+
+ if (!response.ok) {
+ if (response.status === 429) {
+ console.log("Rate limit hit, retrying in 1 second...");
+ await new Promise(resolve => setTimeout(resolve, 1000));
+ await sendQueryToAI(query);
+ return;
+ }
+ throw new Error(`Network response was not ok: ${response.status}`);
  }

+ console.log("Streaming audio response received");
+ await handleStreamingResponse(response.body, voice, abortSignal);
  };

  // Stream AI response for prefetching
  const streamAndPrefetchAudio = async (query, voice, abortSignal) => {
+ const response = await fetchAIResponse(query, abortSignal);

+ if (!response.ok) throw new Error('Network response was not ok');

+ return handleStreamingResponseForPrefetch(response.body, voice, abortSignal);
  };

  // Fetch AI response
  const fetchAIResponse = async (query, abortSignal) => {
+ const userSambanovaKey = document.getElementById('apiKey').value.trim() !== '' ? document.getElementById('apiKey').value.trim() : 'none';
+
+ const url = '/stream_text';
+ const requestBody = {
+ query: query,
+ history: JSON.stringify(conversationHistory),
+ model: modelSelectionDropdown.value,
+ api_key: userSambanovaKey
+ };
+
+ return fetch(url, {
+ method: 'POST',
+ headers: {
+ 'Accept': 'text/event-stream',
+ 'Content-Type': 'application/json'
+ },
+ body: JSON.stringify(requestBody),
+ signal: abortSignal
+ });
  };

  // Handle the streaming response for prefetching
  const handleStreamingResponseForPrefetch = async (responseStream, voice, abortSignal) => {
+ const reader = responseStream.getReader();
+ const decoder = new TextDecoder("utf-8");
+ let buffer = "";
+
+ try {
+ while (true) {
+ const { done, value } = await reader.read();
+ if (done) break;
+ if (abortSignal.aborted) throw new DOMException('Request aborted', 'AbortError');
+
+ const chunk = decoder.decode(value, { stream: true });
+ buffer += chunk;
+ const lines = buffer.split('\n');
+
+ for (let i = 0; i < lines.length - 1; i++) {
+ const line = lines[i];
+ if (line.startsWith('data: ')) {
+ const textContent = line.substring(6).trim();
+ if (textContent) {
+ return await generateTextToSpeechAudio(textContent, voice);
+ }
+ }
+ }

+ buffer = lines[lines.length - 1];
+ }
+ } catch (error) {
+ console.error("Error in handleStreamingResponseForPrefetch:", error);
+ } finally {
+ reader.releaseLock();
  }

+ return null;
  };

  // Handle the streaming audio response
  const handleStreamingResponse = async (responseStream, voice, abortSignal) => {
+ const reader = responseStream.getReader();
+ const decoder = new TextDecoder("utf-8");
+ let buffer = "";
+ let initialChunksSent = 0;
+ let fullResponseText = "";
+ let fullResponseText2 = "";
+ let textChunk = "";
+ let sentText = "";
+
+ try {
+ while (true) {
+ const { done, value } = await reader.read();
+ if (done) break;
+ if (abortSignal.aborted) throw new DOMException('Request aborted', 'AbortError');
+
+ if (isUserSpeaking) {
+ interruptAudioPlayback('user is speaking');
+ break;
+ }

+ const chunk = decoder.decode(value, { stream: true });
+ buffer += chunk;
+ const lines = buffer.split('\n');
+
+ for (let i = 0; i < lines.length - 1; i++) {
+ const line = lines[i];
+ if (line.startsWith('data: ')) {
+ const textContent = line.substring(6).trim();
+ if (textContent) {
+ if (!firstResponseTextTimestamp) firstResponseTextTimestamp = Date.now();
+
+ fullResponseText += textContent + " ";
+ fullResponseText2 += textContent + " ";
+ textChunk += textContent + " ";
+ transcriptDiv.textContent = fullResponseText2;
+
+ if (initialChunksSent < 2) {
+ const audioUrl = await generateTextToSpeechAudio(textContent, voice);
+ if (audioUrl) {
+ audioPlaybackQueue.push({ url: audioUrl, isPrefetched: false });
+ if (!currentAudio) playNextAudio();
+ }
+ sentText += textContent + " ";
+ initialChunksSent++;
+ } else {
+ let unsentTextChunk = textChunk.replace(sentText, '').trim();
+
+ if (unsentTextChunk.length >= CHUNK_SIZE) {
+ const audioUrl = await generateTextToSpeechAudio(unsentTextChunk, voice);
+ if (audioUrl) {
+ audioPlaybackQueue.push({ url: audioUrl, isPrefetched: false });
+ if (!currentAudio) playNextAudio();
+ }
+ textChunk = "";
+ }
+ }
+
+ if (fullResponseText !== '') {
+ fullResponseText = '';
+ }
+ }
  }
  }

+ buffer = lines[lines.length - 1];
+ }
+ } catch (error) {
+ console.error("Error in handleStreamingResponse:", error);
+ } finally {
+ reader.releaseLock();
+
+ let unsentTextChunk = textChunk.replace(sentText, '').trim();
+ if (unsentTextChunk !== "") {
+ const audioUrl = await generateTextToSpeechAudio(unsentTextChunk, voice);
+ if (audioUrl) {
+ audioPlaybackQueue.push({ url: audioUrl, isPrefetched: false });
+ if (!currentAudio) playNextAudio();
  }
  }

+ if (fullResponseText !== '') {
+ fullResponseText = '';
+ }
+ if (fullResponseText2 !== '') {
+ addToConversationHistory('assistant', fullResponseText2);
+ fullResponseText2 = '';
+ }
  }
  };

  // Generate Text-to-Speech audio with caching
  const generateTextToSpeechAudio = async (text, voice) => {
+ const normalizedText = normalizeQueryText(text);
+ const cacheKey = `${normalizedText}-${voice}`;
+
+ if (audioCache.has(cacheKey)) {
+ const cachedData = audioCache.get(cacheKey);
+ if (Date.now() - cachedData.timestamp < AUDIO_CACHE_EXPIRATION) {
+ return cachedData.url;
+ } else {
+ audioCache.delete(cacheKey);
+ }
  }

+ try {
+ const response = await fetch(`${TEXT_TO_SPEECH_API_ENDPOINT}?voice=${voice}&text=${encodeURIComponent(text)}`, { method: 'GET' });
+ if (!response.ok) throw new Error('Network response was not ok');
+ const audioBlob = await response.blob();
+ const audioUrl = URL.createObjectURL(audioBlob);

+ audioCache.set(cacheKey, { url: audioUrl, timestamp: Date.now() });
+ return audioUrl;
+ } catch (error) {
+ console.error("Error generating TTS audio:", error);
+ return null;
+ }
  };


  // Speech Recognition Initialization

  if ('webkitSpeechRecognition' in window) {
+ speechRecognizer = new webkitSpeechRecognition();
+ Object.assign(speechRecognizer, {
+ continuous: true,
+ interimResults: true,
+ language: 'en-US',
+ maxAlternatives: 3
+ });
+
+ speechRecognizer.onstart = () => {
+ console.log("Speech recognition started");
  isUserSpeaking = true;
  lastUserSpeechTimestamp = Date.now();
  updateActivityIndicators();
+ startStopButton.innerHTML = '<svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M9 9h6v6h-6z"></path><path d="M12 1a3 3 0 0 0-3 3v8a3 3 0 0 0 6 0V4a3 3 0 0 0-3-3z"></path><path d="M19 10v2a7 7 0 0 1-14 0v-2"></path><line x1="12" y1="19" x2="12" y2="23"></line><line x1="8" y1="23" x2="16" y2="23"></line></svg> Stop Listening';
+ };
+
+ speechRecognizer.onresult = (event) => {
+ let interimTranscript = '';
+ for (let i = event.resultIndex; i < event.results.length; i++) {
+ const transcript = event.results[i][0].transcript;
+ if (event.results[i].isFinal) {
+ interruptAudioPlayback('final');
+ processSpeechTranscript(transcript);
+ isUserSpeaking = false;
+ updateActivityIndicators();
+ queryStartTime = Date.now();
+ } else {
+ interimTranscript += transcript;
+ isUserSpeaking = true;
+ lastUserSpeechTimestamp = Date.now();
+ updateActivityIndicators();
+
+ if (interimTranscript.length > prefetchTextQuery.length + 5) {
+ cancelPrefetchRequests(prefetchTextQuery);
+ }
+ prefetchTextQuery = interimTranscript;
+ prefetchFirstAudioChunk(interimTranscript, voiceSelectionDropdown.value);

+ if (isRequestInProgress && shouldInterruptAudioPlayback(interimTranscript)) {
+ interruptAudioPlayback('interim');
+ }
+ }
  }
+ };
+
+ speechRecognizer.onerror = (event) => {
+ console.error('Speech recognition error:', event.error);
+ if (isSpeechRecognitionActive) speechRecognizer.start();
+ };
+
+ speechRecognizer.onend = () => {
+ isUserSpeaking = false;
+ updateActivityIndicators();

+ if (isSpeechRecognitionActive) speechRecognizer.start();
+ };
+
+ startStopButton.addEventListener('click', () => {
+ if (isSpeechRecognitionActive) {
+ speechRecognizer.stop();
+ isSpeechRecognitionActive = false;
+ startStopButton.innerHTML = '<svg id="microphoneIcon" xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M12 1a3 3 0 0 0-3 3v8a3 3 0 0 0 6 0V4a3 3 0 0 0-3-3z"></path><path d="M19 10v2a7 7 0 0 1-14 0v-2"></path><line x1="12" y1="19" x2="12" y2="23"></line><line x1="8" y1="23" x2="16" y2="23"></line></svg> Start Listening';
+ } else {
+ speechRecognizer.start();
+ isSpeechRecognitionActive = true;
+ startStopButton.innerHTML = '<svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M9 9h6v6h-6z"></path><path d="M12 1a3 3 0 0 0-3 3v8a3 3 0 0 0 6 0V4a3 3 0 0 0-3-3z"></path><path d="M19 10v2a7 7 0 0 1-14 0v-2"></path><line x1="12" y1="19" x2="12" y2="23"></line><line x1="8" y1="23" x2="16" y2="23"></line></svg> Stop Listening';
  }
+ });
+ } else {
+ alert('Your browser does not support the Web Speech API.');
+ }
+
+ setInterval(updateLatency, 100);
+
+ // Webcam Functions
+
+ async function startWebcam() {
+ try {
+ const stream = await navigator.mediaDevices.getUserMedia({ video: true });
+ video.srcObject = stream;
+ setInterval(captureAndProcessImage, 5000); // Adjust interval as needed
+ } catch (error) {
+ console.error("Error accessing webcam: ", error);
+ // Consider adding user feedback here, e.g., alert or display a message.
  }
+ }


+ async function captureAndProcessImage() {
+ const canvas = document.createElement('canvas');
+ canvas.width = video.videoWidth;
+ canvas.height = video.videoHeight;
+ const context = canvas.getContext('2d');
+ context.drawImage(video, 0, 0, canvas.width, canvas.height);

+ const blob = await new Promise(resolve => canvas.toBlob(resolve, 'image/png'));
+ await processWithGradio(blob);
+ }

+
+ async function processWithGradio(imageBlob) {
+ try {
+ const randomClient = clients[Math.floor(Math.random() * clients.length)];
+ app = await client(randomClient);
+ const handledFile = await handle_file(imageBlob);
+
+ const result = await app.predict("/process_image", [handledFile, "Detailed Caption"]);
+
+ const dataString = result.data[0]; // Assuming the caption is the first element in the response
+ lastCaption = dataString || ""; // Handle potential errors
+ } catch (error) {
+ console.error("Error processing with Gradio:", error);
+ // Add error handling here (e.g., display a message to the user).
+ lastCaption = ""; // Reset caption if there's an error.
  }
  }



+ // Modify sendQueryToAI to include the caption
+ async function sendQueryToAI(query) {
+ console.log("Sending query to AI:", query);
+ isRequestInProgress = true;
+ updateActivityIndicators();
+ firstResponseTextTimestamp = null;
+
+ const normalizedQuery = normalizeQueryText(query);
+ const cacheKey = generateCacheKey(normalizedQuery, modelSelectionDropdown.value, conversationHistory, modelSelectionDropdown.value);
+
+ queryStartTime = Date.now();
+
+ // Check prefetch cache
+ if (prefetchCache.has(cacheKey)) {
+ const cachedData = prefetchCache.get(cacheKey);
+ if (Date.now() - cachedData.timestamp < PREFETCH_CACHE_EXPIRATION) {
+ audioPlaybackQueue.push({ url: cachedData.url, isPrefetched: true });
+ playNextAudio();
+ } else {
+ prefetchCache.delete(cacheKey);
+ }
+ }
+
+ requestAbortController = new AbortController();
+
+ try {
+ const combinedQuery = `{USER: "${query}"}, ${lastCaption}, {USER: "${query}"}`;
+ await streamAndHandleAudioResponse(combinedQuery, voiceSelectionDropdown.value, requestAbortController.signal);
+ } catch (error) {
+ if (error.name !== 'AbortError') {
+ console.error("Error sending query to AI:", error);
+ }
+ } finally {
+ isRequestInProgress = false;
+ updateActivityIndicators();
+ }
+ };
+
+
+ // Initialize Webcam and Speech Recognition on Load
+ window.onload = () => {
+ startWebcam();
+ };