KingNish committed
Commit 8f2e652
1 Parent(s): 384e5e7

Update script1.js

Files changed (1): script1.js +46 -59
script1.js CHANGED
@@ -1,12 +1,12 @@
  // Constants and Configuration
  const USER_SPEECH_INTERRUPT_DELAY = 500;
- const TEXT_TO_SPEECH_API_ENDPOINT = "https://api.streamelements.com/kappa/v2/speech";
+ const TEXT_TO_SPEECH_API_ENDPOINT = "https://api.streamelements.com/kappa/v2/speech";
  const CHUNK_SIZE = 300;
- const MAX_PREFETCH_REQUESTS = 5; // Reduced to avoid overloading
+ const MAX_PREFETCH_REQUESTS = 5;
  const PREFETCH_CACHE_EXPIRATION = 60000; // 1 minute
  const AUDIO_CACHE_EXPIRATION = 3600000; // 1 hour
  const WEBCAM_INTERVAL = 5000;
- const MAX_HISTORY_LENGTH = 6; // Limit history for better performance
+ const MAX_HISTORY_LENGTH = 6;

  // DOM Elements
  const startStopButton = document.getElementById('startStopButton');
@@ -17,6 +17,7 @@ const responseTimeDisplay = document.getElementById('responseTime');
  const userActivityIndicator = document.getElementById('userIndicator');
  const aiActivityIndicator = document.getElementById('aiIndicator');
  const transcriptDiv = document.getElementById('transcript');
+ const video = document.getElementById('webcam');

  // Speech Recognition
  let speechRecognizer;
@@ -46,6 +47,22 @@ let conversationHistory = [];
  // Audio Caching
  const audioCache = new Map();

+ // Image Captioning State
+ let isCaptioningEnabled = false;
+ let lastCaption = "";
+
+ // Webcam Integration
+ import { client, handle_file } from 'https://cdn.jsdelivr.net/npm/@gradio/client/+esm';
+ const clients = [
+   "multimodalart/Florence-2-l4",
+   "gokaygokay/Florence-2",
+   "multimodalart/Florence-2-l4-2",
+   "gokaygokay/Florence-2",
+ ];
+ let app;
+ let webcamInterval;
+
+
  // Utility Functions

  // Normalize query text
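Note: the Spaces in `clients` are Florence-2 captioning endpoints ("gokaygokay/Florence-2" appears twice, presumably to weight load toward it). How a Space is picked and called is not shown in this diff; a minimal sketch with @gradio/client, where the random pick, the "/process_image" endpoint name, and the argument shape are assumptions, not code from this commit:

    // Hypothetical sketch only; the real processWithGradio is outside this diff.
    async function connectToCaptioner() {
      // Pick one of the configured Spaces at random to spread load.
      const space = clients[Math.floor(Math.random() * clients.length)];
      app = await client(space);
    }

    async function captionImage(imageBlob) {
      if (!app) await connectToCaptioner();
      // handle_file wraps the Blob in the payload format the Space expects.
      const result = await app.predict("/process_image", [handle_file(imageBlob)]);
      return result.data[0];
    }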
@@ -58,7 +75,7 @@ const generateCacheKey = (normalizedQuery, voice, history, modelName) =>
  // Update activity indicators
  const updateActivityIndicators = (state = null) => {
    userActivityIndicator.textContent = isUserSpeaking ? "User: Speaking" : "User: Idle";
-
+
    if (isRequestInProgress && !currentAudio) {
      aiActivityIndicator.textContent = "AI: Processing...";
    } else if (currentAudio && !isUserSpeaking) {
@@ -194,7 +211,6 @@ const cancelPrefetchRequests = (query) => {

  // Send a query to the AI
  async function sendQueryToAI(query) {
-   console.log("Sending query to AI:", query);
    isRequestInProgress = true;
    updateActivityIndicators();
    firstResponseTextTimestamp = null;
@@ -210,7 +226,6 @@ async function sendQueryToAI(query) {
    combinedQuery += `, {USER: "${query}"}`;

    await streamAndHandleAudioResponse(combinedQuery, voiceSelectionDropdown.value, requestAbortController.signal);
-
  } catch (error) {
    if (error.name !== 'AbortError') {
      console.error("Error sending query to AI:", error);
@@ -226,7 +241,7 @@ const processSpeechTranscript = (transcript) => {
    const trimmedTranscript = transcript.trimStart();
    if (trimmedTranscript !== '' && !isRequestInProgress) {
      activeQuery = trimmedTranscript;
-     addToConversationHistory('user', activeQuery); // Add history before sending
+     addToConversationHistory('user', activeQuery);
      sendQueryToAI(activeQuery);
    }
  };
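Note: `addToConversationHistory` itself is not part of this diff. Given `MAX_HISTORY_LENGTH = 6` in the constants above, it presumably caps the stored turns; a sketch under that assumption:

    // Assumed shape of the helper; the real implementation is outside this diff.
    function addToConversationHistory(role, content) {
      conversationHistory.push({ role, content });
      // Keep only the most recent MAX_HISTORY_LENGTH turns.
      if (conversationHistory.length > MAX_HISTORY_LENGTH) {
        conversationHistory.splice(0, conversationHistory.length - MAX_HISTORY_LENGTH);
      }
    }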
@@ -330,7 +345,6 @@ const handleStreamingResponse = async (responseStream, voice, abortSignal) => {
    let fullResponseText2 = "";
    let textChunk = "";

-
    try {
      while (true) {
        const { done, value } = await reader.read();
@@ -346,34 +360,38 @@ const handleStreamingResponse = async (responseStream, voice, abortSignal) => {
        buffer += chunk;
        const lines = buffer.split('\n');

-       for (const line of lines) { // Simplified loop
+       for (const line of lines) {
          if (line.startsWith('data: ')) {
            const textContent = line.substring(6).trim();
            if (textContent) {
              if (!firstResponseTextTimestamp) firstResponseTextTimestamp = Date.now();
-             fullResponseText += textContent + " "; // Accumulate full response
+
+             fullResponseText += textContent + " ";
              fullResponseText2 += textContent + " ";
              textChunk += textContent + " ";
              transcriptDiv.textContent = fullResponseText2;


              if (textChunk.length >= CHUNK_SIZE) {
-               const audioUrl = await generateTextToSpeechAudio(textChunk, voice);
-               if (audioUrl) {
-                 audioPlaybackQueue.push({ url: audioUrl });
-                 if (!currentAudio) playNextAudio();
-               }
-               textChunk = ""; // Clear after sending
+               const audioUrl = await generateTextToSpeechAudio(textChunk, voice);
+               if (audioUrl) {
+                 audioPlaybackQueue.push({ url: audioUrl });
+                 if (!currentAudio) playNextAudio();
+               }
+               textChunk = "";
              }
            }
          }
        }
+
+       buffer = lines[lines.length - 1];
      }
    } catch (error) {
      console.error("Error in handleStreamingResponse:", error);
    } finally {
-     // ... (Send any remaining textChunk)
-     if (textChunk !== "") {
+     reader.releaseLock();
+
+     if (textChunk !== "") { // Send any remaining text
        const audioUrl = await generateTextToSpeechAudio(textChunk, voice);
        if (audioUrl) {
          audioPlaybackQueue.push({ url: audioUrl });
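Note: the `buffer = lines[lines.length - 1];` added at the end of the read loop is the standard guard for `data:` events arriving split across network chunks: everything before the last newline is a complete line, while the trailing fragment is carried into the next read. A self-contained sketch of the pattern, in a slightly stricter variant that parses only complete lines (`onLine` is a hypothetical stand-in for the `data: ` handling above):

    async function drainStream(reader, onLine) {
      const decoder = new TextDecoder();
      let buffer = "";
      while (true) {
        const { done, value } = await reader.read();
        if (done) break;
        buffer += decoder.decode(value, { stream: true });
        const lines = buffer.split('\n');
        buffer = lines.pop(); // retain the possibly incomplete trailing line
        for (const line of lines) onLine(line); // only complete lines reach the parser
      }
      if (buffer) onLine(buffer); // flush whatever remains at end of stream
    }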
@@ -381,12 +399,9 @@ const handleStreamingResponse = async (responseStream, voice, abortSignal) => {
      }
    }

-   addToConversationHistory('assistant', fullResponseText2);
-   fullResponseText = "";
-   fullResponseText2 = "";
-
-   reader.releaseLock();
-
+   addToConversationHistory('assistant', fullResponseText2);
+   fullResponseText = "";
+   fullResponseText2 = "";
  }
};

@@ -484,20 +499,14 @@ if ('webkitSpeechRecognition' in window) {
      speechRecognizer.stop();
      isSpeechRecognitionActive = false;
      startStopButton.innerHTML = '<svg id="microphoneIcon" xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M12 1a3 3 0 0 0-3 3v8a3 3 0 0 0 6 0V4a3 3 0 0 0-3-3z"></path><path d="M19 10v2a7 7 0 0 1-14 0v-2"></path><line x1="12" y1="19" x2="12" y2="23"></line><line x1="8" y1="23" x2="16" y2="23"></line></svg> Start Listening';
-
-     // Stop webcam capture when speech recognition stops
-     clearInterval(webcamInterval);
-     video.srcObject = null;
-     lastCaption = "";
-     isCaptioningEnabled = false;
-
+     clearInterval(webcamInterval);
+     video.srcObject = null;
+     lastCaption = "";
    } else {
      speechRecognizer.start();
      isSpeechRecognitionActive = true;
      startStopButton.innerHTML = '<svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M9 9h6v6h-6z"></path><path d="M12 1a3 3 0 0 0-3 3v8a3 3 0 0 0 6 0V4a3 3 0 0 0-3-3z"></path><path d="M19 10v2a7 7 0 0 1-14 0v-2"></path><line x1="12" y1="19" x2="12" y2="23"></line><line x1="8" y1="23" x2="16" y2="23"></line></svg> Stop Listening';
-
-     // Start webcam capture when speech recognition starts
-     isCaptioningEnabled = true;
+     isCaptioningEnabled = true;
      startWebcam();
    }
  });
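Note: setting `video.srcObject = null` detaches the stream from the element, but in most browsers the camera stays live (indicator light on) until its tracks are stopped. This commit does not do that; if a full release were wanted, the usual addition would be:

    // Assumption: not in this commit; shown only as the standard full teardown.
    if (video.srcObject) {
      video.srcObject.getTracks().forEach((track) => track.stop());
    }
    video.srcObject = null;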
@@ -508,28 +517,13 @@ if ('webkitSpeechRecognition' in window) {
  setInterval(updateLatency, 100);


-
- // Webcam Integration
- import { client, handle_file } from 'https://cdn.jsdelivr.net/npm/@gradio/client/+esm';
-
- const video = document.getElementById('webcam');
- let app;
- let lastCaption = "";
-
- const clients = [
-   "multimodalart/Florence-2-l4",
-   "gokaygokay/Florence-2",
-   "multimodalart/Florence-2-l4-2",
-   "gokaygokay/Florence-2",
- ];
-
- let webcamInterval; // Store the interval ID
+ // Webcam Functions

  async function startWebcam() {
    try {
      const stream = await navigator.mediaDevices.getUserMedia({ video: true });
      video.srcObject = stream;
-     webcamInterval = setInterval(captureAndProcessImage, WEBCAM_INTERVAL); // Set interval only once
+     webcamInterval = setInterval(captureAndProcessImage, WEBCAM_INTERVAL);
    } catch (error) {
      console.error("Error accessing webcam: ", error);
    }
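Note: `captureAndProcessImage` is referenced here but defined outside the hunks shown. A plausible sketch of grabbing a frame from the `<video>` element as a Blob (the canvas approach and the JPEG choice are assumptions, not code from this commit):

    // Hypothetical reconstruction; the real function is not part of this diff.
    async function captureAndProcessImage() {
      if (!video.videoWidth) return; // webcam not delivering frames yet
      const canvas = document.createElement('canvas');
      canvas.width = video.videoWidth;
      canvas.height = video.videoHeight;
      canvas.getContext('2d').drawImage(video, 0, 0);
      const blob = await new Promise((resolve) => canvas.toBlob(resolve, 'image/jpeg'));
      if (blob) await processWithGradio(blob);
    }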
@@ -559,11 +553,4 @@ async function processWithGradio(imageBlob) {
    } catch (error) {
      console.error("Error processing with Gradio:", error);
    }
-  }
-
-  window.onload = () => {
-    // Start webcam only if speech recognition is active
-    if (isCaptioningEnabled) {
-      startWebcam();
-    }
-  };
+  }