KingNish committed on
Commit 194daa5
1 Parent(s): 051a4de

Update script1.js

Files changed (1)
  1. script1.js +65 -117
script1.js CHANGED
@@ -4,8 +4,8 @@
 const USER_SPEECH_INTERRUPT_DELAY = 500;
 const TEXT_TO_SPEECH_API_ENDPOINT = "https://api.streamelements.com/kappa/v2/speech";
 const CHUNK_SIZE = 300;
-const MAX_PREFETCH_REQUESTS = 10;
-const PREFETCH_CACHE_EXPIRATION = 60000; // 1 minute
+const MAX_PREFETCH_REQUESTS = 5;
+const PREFETCH_CACHE_EXPIRATION = 30000; // 30 seconds
 const AUDIO_CACHE_EXPIRATION = 3600000; // 1 hour
 
 // DOM Elements
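Note on this hunk: the commit halves the prefetch concurrency cap (10 → 5) and the prefetch cache lifetime (60 s → 30 s). For reference, a sketch of how PREFETCH_CACHE_EXPIRATION typically gates a lookup — the helper name getCachedPrefetch is hypothetical, not part of this file:

    // Return a cached prefetch URL only while the entry is still fresh.
    const getCachedPrefetch = (cacheKey) => {
      const entry = prefetchCache.get(cacheKey);
      if (!entry) return null;
      if (Date.now() - entry.timestamp > PREFETCH_CACHE_EXPIRATION) {
        prefetchCache.delete(cacheKey); // stale entry: evict and report a miss
        return null;
      }
      return entry.url;
    };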
@@ -36,8 +36,7 @@ let audioPlaybackQueue = [];
 
 // Prefetching and Caching
 const prefetchCache = new Map();
-const pendingPrefetchRequests = new Map();
-const prefetchQueue = [];
+const pendingPrefetchRequests = new Set();
 let prefetchTextQuery = "";
 
 // Conversation History
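The Map previously held an AbortController per cache key; the Set only records that a key is in flight. Duplicate prefetches are still suppressed, but there is no longer a controller to abort (see the prefetch hunks below):

    pendingPrefetchRequests.add(cacheKey);    // mark the key as in flight
    pendingPrefetchRequests.has(cacheKey);    // true → a later duplicate prefetch is skipped
    pendingPrefetchRequests.delete(cacheKey); // clear once the request settles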
@@ -46,20 +45,6 @@ let conversationHistory = [];
 // Audio Caching
 const audioCache = new Map();
 
-// Webcam Integration
-import { client, handle_file } from 'https://cdn.jsdelivr.net/npm/@gradio/client/+esm';
-const video = document.getElementById('webcam');
-let app;
-let lastCaption = "";
-let isWebcamActive = false; // Flag for webcam state
-
-const clients = [
-  "multimodalart/Florence-2-l4",
-  "gokaygokay/Florence-2",
-  "multimodalart/Florence-2-l4-2",
-  "gokaygokay/Florence-2",
-];
-
 // Utility Functions
 
 // Normalize query text
@@ -72,7 +57,7 @@ const generateCacheKey = (normalizedQuery, voice, history, modelName) =>
 // Update activity indicators
 const updateActivityIndicators = (state = null) => {
   userActivityIndicator.textContent = isUserSpeaking ? "User: Speaking" : "User: Idle";
-
+
   if (isRequestInProgress && !currentAudio) {
     aiActivityIndicator.textContent = "AI: Processing...";
   } else if (currentAudio && !isUserSpeaking) {
@@ -157,7 +142,6 @@ const interruptAudioPlayback = (reason = 'unknown') => {
   }
 
   prefetchCache.clear();
-  prefetchQueue.length = 0;
   updateActivityIndicators();
 };
 
@@ -165,45 +149,32 @@ const interruptAudioPlayback = (reason = 'unknown') => {
 // Prefetching and Caching Functions
 
 // Prefetch and cache the first TTS audio chunk
-const prefetchFirstAudioChunk = (query, voice) => {
+const prefetchFirstAudioChunk = async (query, voice) => {
   const normalizedQuery = normalizeQueryText(query);
   const cacheKey = generateCacheKey(normalizedQuery, voice, conversationHistory, modelSelectionDropdown.value);
 
   if (pendingPrefetchRequests.has(cacheKey) || prefetchCache.has(cacheKey)) return;
 
-  prefetchQueue.push({ query: query.trim(), voice, cacheKey });
-  processPrefetchQueue();
-};
-
-// Process the prefetch queue
-const processPrefetchQueue = async () => {
-  while (prefetchQueue.length > 0 && pendingPrefetchRequests.size < MAX_PREFETCH_REQUESTS) {
-    const { query, voice, cacheKey } = prefetchQueue.shift();
-    const abortController = new AbortController();
-    pendingPrefetchRequests.set(cacheKey, abortController);
-
-    try {
-      const firstAudioUrl = await streamAndPrefetchAudio(query, voice, abortController.signal);
-
-      if (firstAudioUrl) prefetchCache.set(cacheKey, { url: firstAudioUrl, timestamp: Date.now() });
-    } catch (error) {
-      if (error.name !== 'AbortError') console.error("Error prefetching audio:", error);
-    } finally {
-      pendingPrefetchRequests.delete(cacheKey);
-      processPrefetchQueue();
-    }
+  pendingPrefetchRequests.add(cacheKey);
+
+  try {
+    const firstAudioUrl = await streamAndPrefetchAudio(query, voice);
+    if (firstAudioUrl) prefetchCache.set(cacheKey, { url: firstAudioUrl, timestamp: Date.now() });
+  } catch (error) {
+    if (error.name !== 'AbortError') console.error("Error prefetching audio:", error);
+  } finally {
+    pendingPrefetchRequests.delete(cacheKey);
   }
 };
 
+
 // Cancel pending prefetch requests
 const cancelPrefetchRequests = (query) => {
   const normalizedQuery = normalizeQueryText(query);
-
-  for (const [cacheKey, abortController] of pendingPrefetchRequests) {
-    if (cacheKey.startsWith(normalizedQuery)) {
-      abortController.abort();
-      pendingPrefetchRequests.delete(cacheKey);
+  for (const key of pendingPrefetchRequests) {
+    if (key.startsWith(normalizedQuery)) {
+      pendingPrefetchRequests.delete(key);
+      // Implement abort logic if needed for your fetch implementation
     }
   }
 };
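With the queue and processPrefetchQueue gone, prefetchFirstAudioChunk now fires immediately for every new key, and MAX_PREFETCH_REQUESTS (still declared at the top) is no longer read anywhere. If throttling is still wanted, a guard at the top of the function would restore it — a sketch, not part of this commit:

    if (pendingPrefetchRequests.size >= MAX_PREFETCH_REQUESTS) return; // drop instead of queueing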
@@ -219,7 +190,7 @@ async function sendQueryToAI(query) {
   firstResponseTextTimestamp = null;
 
   const normalizedQuery = normalizeQueryText(query);
-  const cacheKey = generateCacheKey(normalizedQuery, modelSelectionDropdown.value, conversationHistory, modelSelectionDropdown.value);
+  const cacheKey = generateCacheKey(normalizedQuery, voiceSelectionDropdown.value, conversationHistory, modelSelectionDropdown.value);
 
   queryStartTime = Date.now();
 
@@ -237,7 +208,7 @@ async function sendQueryToAI(query) {
   requestAbortController = new AbortController();
 
   try {
-    const combinedQuery = isWebcamActive ? `{USER: "${query}"}, ${lastCaption}, {USER: "${query}"}` : query;
+    const combinedQuery = `{USER: "${query}"}, ${lastCaption}, {USER: "${query}"}`;
     await streamAndHandleAudioResponse(combinedQuery, voiceSelectionDropdown.value, requestAbortController.signal);
   } catch (error) {
     if (error.name !== 'AbortError') {
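Because the isWebcamActive flag is gone, every query is now wrapped with the latest webcam caption, even before the first caption arrives (lastCaption starts as ""). For example, with query "what do you see" and lastCaption "A desk with a laptop", combinedQuery becomes:

    {USER: "what do you see"}, A desk with a laptop, {USER: "what do you see"}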
@@ -281,12 +252,12 @@ const streamAndHandleAudioResponse = async (query, voice, abortSignal) => {
 };
 
 // Stream AI response for prefetching
-const streamAndPrefetchAudio = async (query, voice, abortSignal) => {
-  const response = await fetchAIResponse(query, abortSignal);
+const streamAndPrefetchAudio = async (query, voice) => {
+  const response = await fetchAIResponse(query, undefined);
 
   if (!response.ok) throw new Error('Network response was not ok');
 
-  return handleStreamingResponseForPrefetch(response.body, voice, abortSignal);
+  return handleStreamingResponseForPrefetch(response.body, voice);
 };
 
 // Fetch AI response
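Passing undefined as the signal pairs with the Set change above: cancelPrefetchRequests can forget a key, but the underlying fetch now runs to completion. Keeping abort support alongside the Set would only need a companion map of controllers — a hypothetical sketch, not part of this commit:

    const prefetchControllers = new Map(); // cacheKey -> AbortController (hypothetical)
    const controller = new AbortController();
    prefetchControllers.set(cacheKey, controller);
    const response = await fetchAIResponse(query, controller.signal); // abortable again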
@@ -313,7 +284,7 @@ const fetchAIResponse = async (query, abortSignal) => {
 };
 
 // Handle the streaming response for prefetching
-const handleStreamingResponseForPrefetch = async (responseStream, voice, abortSignal) => {
+const handleStreamingResponseForPrefetch = async (responseStream, voice) => {
   const reader = responseStream.getReader();
   const decoder = new TextDecoder("utf-8");
   let buffer = "";
@@ -322,7 +293,6 @@ const handleStreamingResponseForPrefetch = async (responseStream, voice, abortSignal) => {
   while (true) {
     const { done, value } = await reader.read();
     if (done) break;
-    if (abortSignal.aborted) throw new DOMException('Request aborted', 'AbortError');
 
     const chunk = decoder.decode(value, { stream: true });
     buffer += chunk;
@@ -354,7 +324,6 @@ const handleStreamingResponse = async (responseStream, voice, abortSignal) => {
   const reader = responseStream.getReader();
   const decoder = new TextDecoder("utf-8");
   let buffer = "";
-  let initialChunksSent = 0;
   let fullResponseText = "";
   let fullResponseText2 = "";
   let textChunk = "";
@@ -387,25 +356,10 @@ const handleStreamingResponse = async (responseStream, voice, abortSignal) => {
       textChunk += textContent + " ";
       transcriptDiv.textContent = fullResponseText2;
 
-      if (initialChunksSent < 2) {
-        const audioUrl = await generateTextToSpeechAudio(textContent, voice);
-        if (audioUrl) {
-          audioPlaybackQueue.push({ url: audioUrl, isPrefetched: false });
-          if (!currentAudio) playNextAudio();
-        }
-        sentText += textContent + " ";
-        initialChunksSent++;
-      } else {
-        let unsentTextChunk = textChunk.replace(sentText, '').trim();
-
-        if (unsentTextChunk.length >= CHUNK_SIZE) {
-          const audioUrl = await generateTextToSpeechAudio(unsentTextChunk, voice);
-          if (audioUrl) {
-            audioPlaybackQueue.push({ url: audioUrl, isPrefetched: false });
-            if (!currentAudio) playNextAudio();
-          }
-          textChunk = "";
-        }
+      const audioUrl = await generateTextToSpeechAudio(textContent, voice); // Call TTS immediately
+      if (audioUrl) {
+        audioPlaybackQueue.push({ url: audioUrl, isPrefetched: false });
+        if (!currentAudio) playNextAudio();
       }
 
       if (fullResponseText !== '') {
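Every parsed text fragment now triggers its own TTS request, so the CHUNK_SIZE batching and sentText bookkeeping no longer apply in this loop; with many small stream chunks this multiplies calls to TEXT_TO_SPEECH_API_ENDPOINT. For reference, a request to that StreamElements endpoint is typically built like this — a sketch; the actual generateTextToSpeechAudio body is unchanged by this commit:

    const url = `${TEXT_TO_SPEECH_API_ENDPOINT}?voice=${encodeURIComponent(voice)}&text=${encodeURIComponent(text)}`;
    const response = await fetch(url);                           // responds with an audio body
    const audioUrl = URL.createObjectURL(await response.blob()); // playable object URL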
@@ -422,18 +376,6 @@ const handleStreamingResponse = async (responseStream, voice, abortSignal) => {
   } finally {
     reader.releaseLock();
 
-    let unsentTextChunk = textChunk.replace(sentText, '').trim();
-    if (unsentTextChunk !== "") {
-      const audioUrl = await generateTextToSpeechAudio(unsentTextChunk, voice);
-      if (audioUrl) {
-        audioPlaybackQueue.push({ url: audioUrl, isPrefetched: false });
-        if (!currentAudio) playNextAudio();
-      }
-    }
-
-    if (fullResponseText !== '') {
-      fullResponseText = '';
-    }
     if (fullResponseText2 !== '') {
       addToConversationHistory('assistant', fullResponseText2);
       fullResponseText2 = '';
@@ -487,12 +429,6 @@ if ('webkitSpeechRecognition' in window) {
     lastUserSpeechTimestamp = Date.now();
     updateActivityIndicators();
     startStopButton.innerHTML = '<svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M9 9h6v6h-6z"></path><path d="M12 1a3 3 0 0 0-3 3v8a3 3 0 0 0 6 0V4a3 3 0 0 0-3-3z"></path><path d="M19 10v2a7 7 0 0 1-14 0v-2"></path><line x1="12" y1="19" x2="12" y2="23"></line><line x1="8" y1="23" x2="16" y2="23"></line></svg> Stop Listening';
-
-    // Start webcam processing if not already active
-    if (!isWebcamActive) {
-      startWebcam();
-      isWebcamActive = true;
-    }
   };
 
   speechRecognizer.onresult = (event) => {
@@ -541,34 +477,39 @@ if ('webkitSpeechRecognition' in window) {
       speechRecognizer.stop();
       isSpeechRecognitionActive = false;
       startStopButton.innerHTML = '<svg id="microphoneIcon" xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M12 1a3 3 0 0 0-3 3v8a3 3 0 0 0 6 0V4a3 3 0 0 0-3-3z"></path><path d="M19 10v2a7 7 0 0 1-14 0v-2"></path><line x1="12" y1="19" x2="12" y2="23"></line><line x1="8" y1="23" x2="16" y2="23"></line></svg> Start Listening';
-
-      // Stop webcam processing
-      isWebcamActive = false;
+      clearInterval(imageCaptureInterval); // Stop webcam processing
     } else {
       speechRecognizer.start();
       isSpeechRecognitionActive = true;
       startStopButton.innerHTML = '<svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M9 9h6v6h-6z"></path><path d="M12 1a3 3 0 0 0-3 3v8a3 3 0 0 0 6 0V4a3 3 0 0 0-3-3z"></path><path d="M19 10v2a7 7 0 0 1-14 0v-2"></path><line x1="12" y1="19" x2="12" y2="23"></line><line x1="8" y1="23" x2="16" y2="23"></line></svg> Stop Listening';
+      imageCaptureInterval = setInterval(captureAndProcessImage, 5000); // Start webcam processing
     }
   });
 } else {
   alert('Your browser does not support the Web Speech API.');
 }
 
+setInterval(updateLatency, 100);
+
+
+
+// Webcam Integration
+import { client, handle_file } from 'https://cdn.jsdelivr.net/npm/@gradio/client/+esm';
+
+let app;
+let lastCaption = "";
+
+const clients = [
+  "multimodalart/Florence-2-l4",
+  "gokaygokay/Florence-2",
+  "multimodalart/Florence-2-l4-2",
+  "gokaygokay/Florence-2",
+];
 
-// Webcam Functions (optimized)
 async function startWebcam() {
   try {
     const stream = await navigator.mediaDevices.getUserMedia({ video: true });
-    video.srcObject = stream;
-
-    // Capture and process image every 5 seconds while webcam is active
-    const webcamInterval = setInterval(async () => {
-      if (!isWebcamActive) {
-        clearInterval(webcamInterval);
-        return;
-      }
-      await captureAndProcessImage();
-    }, 5000);
+    webcamVideo.srcObject = stream;
   } catch (error) {
     console.error("Error accessing webcam: ", error);
   }
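After this commit the capture interval is driven from two places: this click handler and a second listener added in window.onload (last hunk). Both fire on every click, and the onload listener re-checks isSpeechRecognitionActive after this handler has already toggled it, so the two can fight over the same interval. Clearing before setting is the usual guard — a sketch, not part of this commit:

    const startCaptioning = () => {
      clearInterval(imageCaptureInterval); // avoid stacking multiple 5-second intervals
      imageCaptureInterval = setInterval(captureAndProcessImage, 5000);
    };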
@@ -576,34 +517,41 @@ async function startWebcam() {
 
 async function captureAndProcessImage() {
   const canvas = document.createElement('canvas');
-  canvas.width = video.videoWidth;
-  canvas.height = video.videoHeight;
+  canvas.width = webcamVideo.videoWidth;
+  canvas.height = webcamVideo.videoHeight;
   const context = canvas.getContext('2d');
-  context.drawImage(video, 0, 0, canvas.width, canvas.height);
+  context.drawImage(webcamVideo, 0, 0, canvas.width, canvas.height);
 
-  const blob = await new Promise(resolve => canvas.toBlob(resolve, 'image/jpeg', 0.8)); // Use JPEG for smaller size
+  const blob = await new Promise(resolve => canvas.toBlob(resolve, 'image/png'));
   await processWithGradio(blob);
 }
 
-
 async function processWithGradio(imageBlob) {
   try {
     const randomClient = clients[Math.floor(Math.random() * clients.length)];
     app = await client(randomClient);
-
     const handledFile = await handle_file(imageBlob);
-    const result = await app.predict("/process_image", [handledFile, "More Detailed Caption"]);
+
+    const result = await app.predict("/process_image", [handledFile, "Detailed Caption"]);
 
     const dataString = result.data[0];
-    lastCaption = dataString || lastCaption;
+    lastCaption = dataString || lastCaption;
   } catch (error) {
     console.error("Error processing with Gradio:", error);
  }
 }
 
-
-setInterval(updateLatency, 100);
+let imageCaptureInterval; // Declare interval outside the event listener
 
 window.onload = () => {
-  startWebcam();
+  startWebcam();
+
+  startStopButton.addEventListener('click', () => {
+    // ... (start/stop speech recognition and webcam captioning)
+    if (isSpeechRecognitionActive) {
+      clearInterval(imageCaptureInterval); // Stop webcam processing
+    } else {
+      imageCaptureInterval = setInterval(captureAndProcessImage, 5000); // Start webcam processing
+    }
+  });
 };
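Two notes on the final hunks. First, webcamVideo is used but never declared in this commit (the old "const video = document.getElementById('webcam')" line was deleted), so it presumably relies on the browser exposing the video element's id as a global. Second, for reference, the @gradio/client calls above follow this shape — a minimal standalone sketch reusing the Space and route already hard-coded in the file:

    import { client, handle_file } from 'https://cdn.jsdelivr.net/npm/@gradio/client/+esm';

    const app = await client("gokaygokay/Florence-2");   // connect to a hosted Space
    const file = await handle_file(imageBlob);           // wrap a Blob for upload
    const result = await app.predict("/process_image", [file, "Detailed Caption"]);
    console.log(result.data[0]);                         // the caption string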
 