KingNish committed on
Commit
37ed712
1 Parent(s): df40d37

Update script1.js

Files changed (1)
  1. script1.js +344 -296
script1.js CHANGED
@@ -1,108 +1,352 @@
  const startStopButton = document.getElementById('startStopButton');
  const voiceSelectionDropdown = document.getElementById('voiceSelect');
  const modelSelectionDropdown = document.getElementById('modelSelect');
- const noiseSuppressionCheckbox = document.getElementById('noiseSuppression');
  const responseTimeDisplay = document.getElementById('responseTime');
  const userActivityIndicator = document.getElementById('userIndicator');
  const aiActivityIndicator = document.getElementById('aiIndicator');
  const transcriptDiv = document.getElementById('transcript');

  let speechRecognizer;
  let activeQuery = null;
  let queryStartTime = 0;
- let completeTranscript = '';
  let isRequestInProgress = false;
  let isUserSpeaking = false;
- let isSpeechRecognitionActive = false;
- let userManuallyStoppedRecognizer = false;
  let requestAbortController = null;
- let partialTranscript = '';
- let lastUserSpeechTimestamp = null;
- let prefetchTextQuery = "";
  let firstResponseTextTimestamp = null;

- // Configuration
- const USER_SPEECH_INTERRUPT_DELAY = 500;
- const TEXT_TO_SPEECH_API_ENDPOINT = "https://api.streamelements.com/kappa/v2/speech";
- const CHUNK_SIZE = 300;
-
  // Audio Management
  let currentAudio = null;
  let audioPlaybackQueue = [];
- let prefetchQueue = [];

- // Enhanced Prefetching and Caching
- const prefetchCache = new Map();
  const pendingPrefetchRequests = new Map();
- const MAX_PREFETCH_REQUESTS = 10;
- const prefetchCacheExpiration = 60000; // 1 minute

- // Global Conversation History
  let conversationHistory = [];

  // Audio Caching
- const audioCache = new Map();
- const audioCacheExpiration = 3600000; // 1 hour

- // Normalize query text
  const normalizeQueryText = query => query.trim().toLowerCase().replace(/[^\w\s]/g, '');

  // Generate a cache key
- const generateCacheKey = (normalizedQuery, voice, history, modelName) =>
- `${normalizedQuery}-${voice}-${JSON.stringify(history)}-${modelName}`;

  // Prefetch and cache the first TTS audio chunk
  const prefetchFirstAudioChunk = (query, voice) => {
- const normalizedQuery = normalizeQueryText(query);
- const cacheKey = generateCacheKey(normalizedQuery, voice, conversationHistory, modelSelectionDropdown.value);

- if (pendingPrefetchRequests.has(cacheKey) || prefetchCache.has(cacheKey)) return;

- prefetchQueue.push({ query:query.trim(), voice, cacheKey });
- processPrefetchQueue();
  };

  // Process the prefetch queue
  const processPrefetchQueue = async () => {
- while (prefetchQueue.length > 0 && pendingPrefetchRequests.size < MAX_PREFETCH_REQUESTS) {
- const { query, voice, cacheKey } = prefetchQueue.shift();
- const abortController = new AbortController();
- pendingPrefetchRequests.set(cacheKey, abortController);

- const userSambanovaKey = document.getElementById('apiKey').value.trim() !== '' ? document.getElementById('apiKey').value : 'none';

- const url = '/stream_text';
- const requestBody = {
- query: query,
- history: JSON.stringify(conversationHistory),
- model: modelSelectionDropdown.value,
- api_key: userSambanovaKey
- };

- try {
- const response = await fetch(url, {
- method: 'POST',
- headers: {
- 'Accept': 'text/event-stream',
- 'Content-Type': 'application/json'
- },
- body: JSON.stringify(requestBody),
- signal: abortController.signal
- });

- if (!response.ok) throw new Error('Network response was not ok');

- const firstAudioUrl = await handleStreamingResponseForPrefetch(response.body, voice, abortController.signal);

- if (firstAudioUrl) prefetchCache.set(cacheKey, { url: firstAudioUrl, timestamp: Date.now() });

- } catch (error) {
- if (error.name !== 'AbortError') console.error("Error prefetching audio:", error);
- } finally {
- pendingPrefetchRequests.delete(cacheKey);
- processPrefetchQueue();
  }
  }
  };

  // Handle the streaming response for prefetching
@@ -126,8 +370,7 @@ const handleStreamingResponseForPrefetch = async (responseStream, voice, abortSi
  if (line.startsWith('data: ')) {
  const textContent = line.substring(6).trim();
  if (textContent) {
- const audioUrl = await generateTextToSpeechAudio(textContent, voice);
- return audioUrl;
  }
  }
  }
@@ -143,127 +386,6 @@ const handleStreamingResponseForPrefetch = async (responseStream, voice, abortSi
  return null;
  };

- // Play audio from the queue
- const playNextAudio = async () => {
- if (audioPlaybackQueue.length > 0) {
- const audioData = audioPlaybackQueue.shift();
- const audio = new Audio(audioData.url);
- updateActivityIndicators();
-
- const audioPromise = new Promise(resolve => {
- audio.onended = resolve;
- audio.onerror = resolve;
- });
- if (currentAudio) {
- currentAudio.pause();
- currentAudio.currentTime = 0;
- }
-
- currentAudio = audio;
- await audio.play();
- await audioPromise;
- playNextAudio();
- } else {
- updateActivityIndicators();
- }
- };
-
- // Generate Text-to-Speech audio with caching
- const generateTextToSpeechAudio = async (text, voice) => {
- const normalizedText = normalizeQueryText(text);
- const cacheKey = `${normalizedText}-${voice}`;
-
- if (audioCache.has(cacheKey)) {
- const cachedData = audioCache.get(cacheKey);
- if (Date.now() - cachedData.timestamp < audioCacheExpiration) {
- return cachedData.url;
- } else {
- audioCache.delete(cacheKey);
- }
- }
-
- try {
- const response = await fetch(`${TEXT_TO_SPEECH_API_ENDPOINT}?voice=${voice}&text=${encodeURIComponent(text)}`, { method: 'GET' });
- if (!response.ok) throw new Error('Network response was not ok');
- const audioBlob = await response.blob();
- const audioUrl = URL.createObjectURL(audioBlob);
-
- audioCache.set(cacheKey, { url: audioUrl, timestamp: Date.now() });
- return audioUrl;
- } catch (error) {
- console.error("Error generating TTS audio:", error);
- return null;
- }
- };
-
- // Send a query to the AI
- const sendQueryToAI = async (query) => {
- console.log("Sending query to AI:", query);
- isRequestInProgress = true;
- updateActivityIndicators();
- firstResponseTextTimestamp = null;
-
- const normalizedQuery = normalizeQueryText(query);
- const cacheKey = generateCacheKey(normalizedQuery, modelSelectionDropdown.value, conversationHistory, modelSelectionDropdown.value);
-
- queryStartTime = Date.now();
-
- if (prefetchCache.has(cacheKey)) {
- const cachedData = prefetchCache.get(cacheKey);
- if (Date.now() - cachedData.timestamp < prefetchCacheExpiration) {
- const prefetchedAudioUrl = cachedData.url;
- audioPlaybackQueue.push({ url: prefetchedAudioUrl, isPrefetched: true });
- playNextAudio();
- } else {
- prefetchCache.delete(cacheKey);
- }
- }
-
- requestAbortController = new AbortController();
-
- const userSambanovaKey = document.getElementById('apiKey').value.trim() !== '' ? document.getElementById('apiKey').value : 'none';
-
- const url = '/stream_text';
- const requestBody = {
- query: query,
- history: JSON.stringify(conversationHistory),
- model: modelSelectionDropdown.value,
- api_key: userSambanovaKey
- };
-
- try {
- const response = await fetch(url, {
- method: 'POST',
- headers: {
- 'Accept': 'text/event-stream',
- 'Content-Type': 'application/json'
- },
- body: JSON.stringify(requestBody),
- signal: requestAbortController.signal
- });
-
- if (!response.ok) {
- if (response.status === 429) {
- console.log("Rate limit hit, retrying in 1 second...");
- await new Promise(resolve => setTimeout(resolve, 1000));
- await sendQueryToAI(query);
- return;
- }
- throw new Error(`Network response was not ok: ${response.status}`);
- }
-
- console.log("Streaming audio response received");
- await handleStreamingResponse(response.body, voiceSelectionDropdown.value, requestAbortController.signal);
- } catch (error) {
- if (error.name !== 'AbortError') {
- console.error("Error sending query to AI:", error);
- }
- } finally {
- isRequestInProgress = false;
- updateActivityIndicators();
- }
- };
-
  // Handle the streaming audio response
  const handleStreamingResponse = async (responseStream, voice, abortSignal) => {
  const reader = responseStream.getReader();
@@ -273,7 +395,7 @@ const handleStreamingResponse = async (responseStream, voice, abortSignal) => {
  let fullResponseText = "";
  let fullResponseText2 = "";
  let textChunk = "";
- let sentText = "";

  try {
  while (true) {
@@ -283,7 +405,7 @@ const handleStreamingResponse = async (responseStream, voice, abortSignal) => {

  if (isUserSpeaking) {
  interruptAudioPlayback('user is speaking');
- break;
  }

  const chunk = decoder.decode(value, { stream: true });
@@ -300,7 +422,7 @@ const handleStreamingResponse = async (responseStream, voice, abortSignal) => {
  fullResponseText += textContent + " ";
  fullResponseText2 += textContent + " ";
  textChunk += textContent + " ";
- transcriptDiv.textContent = fullResponseText2;

  if (initialChunksSent < 2) {
  const audioUrl = await generateTextToSpeechAudio(textContent, voice);
@@ -308,7 +430,7 @@ const handleStreamingResponse = async (responseStream, voice, abortSignal) => {
  audioPlaybackQueue.push({ url: audioUrl, isPrefetched: false });
  if (!currentAudio) playNextAudio();
  }
- sentText += textContent + " ";
  initialChunksSent++;
  } else {
  let unsentTextChunk = textChunk.replace(sentText, '').trim();
@@ -319,12 +441,12 @@ const handleStreamingResponse = async (responseStream, voice, abortSignal) => {
  audioPlaybackQueue.push({ url: audioUrl, isPrefetched: false });
  if (!currentAudio) playNextAudio();
  }
- textChunk = "";
  }
  }

  if (fullResponseText !== '') {
- fullResponseText = '';
  }
  }
  }
@@ -356,30 +478,37 @@ const handleStreamingResponse = async (responseStream, voice, abortSignal) => {
  }
  };

- // Update activity indicators
- const updateActivityIndicators = (state = null) => {
- userActivityIndicator.textContent = isUserSpeaking ? "User: Speaking" : "User: Idle";
- userActivityIndicator.className = isUserSpeaking
- ? "indicator rounded-full px-4 py-2 text-white flex items-center transition-colors duration-300 bg-gradient-to-r from-blue-400 to-blue-600 hover:bg-gradient-to-r from-blue-500 to-blue-700"
- : "indicator rounded-full px-4 py-2 text-white flex items-center transition-colors duration-300 bg-gradient-to-r from-gray-300 to-gray-400 dark:from-gray-700 dark:to-gray-800 hover:bg-gradient-to-r from-gray-400 to-gray-500"; // Tailwind classes
-
- if (isRequestInProgress && !currentAudio) {
- aiActivityIndicator.textContent = "AI: Processing...";
- aiActivityIndicator.className = "indicator rounded-full px-4 py-2 text-white flex items-center transition-colors duration-300 bg-gradient-to-r from-purple-400 to-purple-600 hover:bg-gradient-to-r from-purple-500 to-purple-700"; // Tailwind class for thinking
- } else if (currentAudio && !isUserSpeaking) {
- aiActivityIndicator.textContent = state || "AI: Speaking";
- aiActivityIndicator.className = "indicator rounded-full px-4 py-2 text-white flex items-center transition-colors duration-300 bg-gradient-to-r from-green-400 to-green-600 hover:bg-gradient-to-r from-green-500 to-green-700"; // Tailwind class for speaking
- } else if (isUserSpeaking) {
- aiActivityIndicator.textContent = "AI: Listening";
- aiActivityIndicator.className = "indicator rounded-full px-4 py-2 text-white flex items-center transition-colors duration-300 bg-gradient-to-r from-yellow-400 to-yellow-600 hover:bg-gradient-to-r from-yellow-500 to-yellow-700"; // Tailwind class for listening
- } else {
- aiActivityIndicator.textContent = "AI: Idle";
- aiActivityIndicator.className = "indicator rounded-full px-4 py-2 text-white flex items-center transition-colors duration-300 bg-gradient-to-r from-gray-300 to-gray-400 dark:from-gray-700 dark:to-gray-800 hover:bg-gradient-to-r from-gray-400 to-gray-500"; // Tailwind classes
  }
  };


- // Initialize speech recognition
  if ('webkitSpeechRecognition' in window) {
  speechRecognizer = new webkitSpeechRecognition();
  Object.assign(speechRecognizer, {
@@ -391,10 +520,9 @@ if ('webkitSpeechRecognition' in window) {

  speechRecognizer.onstart = () => {
  console.log("Speech recognition started");
- completeTranscript = '';
  isUserSpeaking = true;
  lastUserSpeechTimestamp = Date.now();
- updateActivityIndicators();
  startStopButton.innerHTML = '<svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M9 9h6v6h-6z"></path><path d="M12 1a3 3 0 0 0-3 3v8a3 3 0 0 0 6 0V4a3 3 0 0 0-3-3z"></path><path d="M19 10v2a7 7 0 0 1-14 0v-2"></path><line x1="12" y1="19" x2="12" y2="23"></line><line x1="8" y1="23" x2="16" y2="23"></line></svg> Stop Listening';
  };

@@ -403,18 +531,16 @@ if ('webkitSpeechRecognition' in window) {
  for (let i = event.resultIndex; i < event.results.length; i++) {
  const transcript = event.results[i][0].transcript;
  if (event.results[i].isFinal) {
- completeTranscript += transcript;
  interruptAudioPlayback('final');
- processSpeechTranscript(completeTranscript);
- completeTranscript = '';
  isUserSpeaking = false;
- updateActivityIndicators();
  queryStartTime = Date.now();
  } else {
  interimTranscript += transcript;
  isUserSpeaking = true;
  lastUserSpeechTimestamp = Date.now();
- updateActivityIndicators();

  if (interimTranscript.length > prefetchTextQuery.length + 5) {
  cancelPrefetchRequests(prefetchTextQuery);
@@ -436,29 +562,19 @@ if ('webkitSpeechRecognition' in window) {

  speechRecognizer.onend = () => {
  isUserSpeaking = false;
- updateActivityIndicators();
-
- if (!isRequestInProgress && completeTranscript !== '') {
- processSpeechTranscript(completeTranscript);
- completeTranscript = '';
- }

- // Only restart if the user hasn't manually stopped the recognizer
- if (isSpeechRecognitionActive && !userManuallyStoppedRecognizer) {
- speechRecognizer.start();
- }
  };

  startStopButton.addEventListener('click', () => {
  if (isSpeechRecognitionActive) {
  speechRecognizer.stop();
  isSpeechRecognitionActive = false;
- userManuallyStoppedRecognizer = true; // Set to true when manually stopped
  startStopButton.innerHTML = '<svg id="microphoneIcon" xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M12 1a3 3 0 0 0-3 3v8a3 3 0 0 0 6 0V4a3 3 0 0 0-3-3z"></path><path d="M19 10v2a7 7 0 0 1-14 0v-2"></path><line x1="12" y1="19" x2="12" y2="23"></line><line x1="8" y1="23" x2="16" y2="23"></line></svg> Start Listening';
  } else {
  speechRecognizer.start();
  isSpeechRecognitionActive = true;
- userManuallyStoppedRecognizer = false; // Set to false when started
  startStopButton.innerHTML = '<svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M9 9h6v6h-6z"></path><path d="M12 1a3 3 0 0 0-3 3v8a3 3 0 0 0 6 0V4a3 3 0 0 0-3-3z"></path><path d="M19 10v2a7 7 0 0 1-14 0v-2"></path><line x1="12" y1="19" x2="12" y2="23"></line><line x1="8" y1="23" x2="16" y2="23"></line></svg> Stop Listening';
  }
  });
@@ -466,75 +582,7 @@ if ('webkitSpeechRecognition' in window) {
  alert('Your browser does not support the Web Speech API.');
  }

- // Add to conversation history
- const addToConversationHistory = (role, content) => {
- if (conversationHistory.length > 0 &&
- conversationHistory[conversationHistory.length - 1].role === 'assistant' &&
- conversationHistory[conversationHistory.length - 1].content === "") {
- conversationHistory.pop();
- }
-
- conversationHistory.push({ role, content });
-
- if (conversationHistory.length > 6) conversationHistory.splice(0, 2);
- };

- // Process the final speech transcript
- const processSpeechTranscript = (transcript) => {
- const trimmedTranscript = transcript.trimStart();
- if (trimmedTranscript !== '' && !isRequestInProgress) {
- activeQuery = trimmedTranscript;
- sendQueryToAI(activeQuery);
- addToConversationHistory('user', activeQuery);
- }
- };
-
- // Check if audio playback should be interrupted
- const shouldInterruptAudioPlayback = (interimTranscript) =>
- Date.now() - lastUserSpeechTimestamp > USER_SPEECH_INTERRUPT_DELAY || interimTranscript.length > 5;
-
- // Interrupt audio playback
- const interruptAudioPlayback = (reason = 'unknown') => {
- console.log(`Interrupting audio (reason: ${reason})...`);
- if (currentAudio) {
- currentAudio.pause();
- currentAudio.currentTime = 0;
- currentAudio = null;
- }
-
- audioPlaybackQueue.length = 0;
- isRequestInProgress = false;
-
- if (requestAbortController) {
- requestAbortController.abort();
- requestAbortController = null;
- }
-
- prefetchCache.clear();
- prefetchQueue.length = 0;
- updateActivityIndicators();
- };
-
- // Cancel pending prefetch requests
- const cancelPrefetchRequests = (query) => {
- const normalizedQuery = normalizeQueryText(query);
-
- for (const [cacheKey, abortController] of pendingPrefetchRequests) {
- if (cacheKey.startsWith(normalizedQuery)) {
- abortController.abort();
- pendingPrefetchRequests.delete(cacheKey);
- }
- }
- };
-
- // Update latency display
- const updateLatency = () => {
- if (firstResponseTextTimestamp) {
- const latency = firstResponseTextTimestamp - queryStartTime;
- responseTimeDisplay.textContent = `Latency: ${latency}ms`;
- } else {
- responseTimeDisplay.textContent = "Latency: 0ms";
- }
- };

- setInterval(updateLatency, 100);
+ // script1.js
+ import { client, handle_file } from 'https://cdn.jsdelivr.net/npm/@gradio/client/+esm';
+
+ const video = document.getElementById('webcam');
+ let gradioApp; // Declare gradioApp outside the function
+
+ const GRADIO_CLIENTS = [
+ "multimodalart/Florence-2-l4",
+ "gokaygokay/Florence-2",
+ "multimodalart/Florence-2-l4-2",
+ "gokaygokay/Florence-2",
+ ];
+
+ async function startWebcam() {
+ try {
+ const stream = await navigator.mediaDevices.getUserMedia({ video: true });
+ video.srcObject = stream;
+ } catch (error) {
+ console.error("Error accessing webcam:", error);
+ }
+ }
+
+ async function getCaption() {
+ if (!gradioApp) {
+ try {
+ const randomClient = GRADIO_CLIENTS[Math.floor(Math.random() * GRADIO_CLIENTS.length)];
+ gradioApp = await client(randomClient);
+ } catch (error) {
+ console.error("Error loading Gradio client:", error);
+ return "Error getting caption"; // Or some other default
+ }
+ }
+
+ try {
+ const canvas = document.createElement('canvas');
+ canvas.width = video.videoWidth;
+ canvas.height = video.videoHeight;
+ const context = canvas.getContext('2d');
+ context.drawImage(video, 0, 0, canvas.width, canvas.height);
+ const blob = await new Promise(resolve => canvas.toBlob(resolve, 'image/png'));
+ const handledFile = await handle_file(blob);
+
+ const result = await gradioApp.predict("/process_image", [handledFile, "More Detailed Caption"]);
+ return result.data[0];
+ } catch (error) {
+ console.error("Error getting caption:", error);
+ return "Error getting caption"; // Or handle the error differently
+ }
+ }
+
+
+ // Constants and Configuration
+ const USER_SPEECH_INTERRUPT_DELAY = 500;
+ const TEXT_TO_SPEECH_API_ENDPOINT = "https://api.streamelements.com/kappa/v2/speech";
+ const CHUNK_SIZE = 300;
+ const MAX_PREFETCH_REQUESTS = 10;
+ const PREFETCH_CACHE_EXPIRATION = 60000; // 1 minute
+ const AUDIO_CACHE_EXPIRATION = 3600000; // 1 hour
+
+ // DOM Elements
  const startStopButton = document.getElementById('startStopButton');
  const voiceSelectionDropdown = document.getElementById('voiceSelect');
  const modelSelectionDropdown = document.getElementById('modelSelect');
+ const noiseSuppressionCheckbox = document.getElementById('noiseSuppression');
  const responseTimeDisplay = document.getElementById('responseTime');
  const userActivityIndicator = document.getElementById('userIndicator');
  const aiActivityIndicator = document.getElementById('aiIndicator');
  const transcriptDiv = document.getElementById('transcript');

+ // Speech Recognition
  let speechRecognizer;
+ let isSpeechRecognitionActive = false;
+
+ // AI Interaction State
  let activeQuery = null;
  let queryStartTime = 0;
  let isRequestInProgress = false;
  let isUserSpeaking = false;
  let requestAbortController = null;
  let firstResponseTextTimestamp = null;

  // Audio Management
  let currentAudio = null;
  let audioPlaybackQueue = [];

+ // Prefetching and Caching
+ const prefetchCache = new Map();
  const pendingPrefetchRequests = new Map();
+ const prefetchQueue = [];
+ let prefetchTextQuery = "";

+ // Conversation History
  let conversationHistory = [];

  // Audio Caching
+ const audioCache = new Map();
+
+ // Utility Functions

+ // Normalize query text
  const normalizeQueryText = query => query.trim().toLowerCase().replace(/[^\w\s]/g, '');

  // Generate a cache key
+ const generateCacheKey = (normalizedQuery, voice, history, modelName) =>
+ `${normalizedQuery}-${voice}-${JSON.stringify(history)}-${modelName}`;
+
+ // Update activity indicators
+ const updateActivityIndicators = (state = null) => {
+ userActivityIndicator.textContent = isUserSpeaking ? "User: Speaking" : "User: Idle";
+
+ if (isRequestInProgress && !currentAudio) {
+ aiActivityIndicator.textContent = "AI: Processing...";
+ } else if (currentAudio && !isUserSpeaking) {
+ aiActivityIndicator.textContent = state || "AI: Speaking";
+ } else if (isUserSpeaking) {
+ aiActivityIndicator.textContent = "AI: Listening";
+ } else {
+ aiActivityIndicator.textContent = "AI: Idle";
+ }
+ };
+
+ // Update latency display
+ const updateLatency = () => {
+ if (firstResponseTextTimestamp) {
+ const latency = firstResponseTextTimestamp - queryStartTime;
+ responseTimeDisplay.textContent = `Latency: ${latency}ms`;
+ } else {
+ responseTimeDisplay.textContent = "Latency: 0ms";
+ }
+ };
+
+ // Add to conversation history
+ const addToConversationHistory = (role, content) => {
+ if (conversationHistory.length > 0 &&
+ conversationHistory[conversationHistory.length - 1].role === 'assistant' &&
+ conversationHistory[conversationHistory.length - 1].content === "") {
+ conversationHistory.pop();
+ }
+
+ conversationHistory.push({ role, content });
+
+ if (conversationHistory.length > 6) conversationHistory.splice(0, 2);
+ };
+
+ // Check if audio playback should be interrupted
+ const shouldInterruptAudioPlayback = (interimTranscript) =>
+ Date.now() - lastUserSpeechTimestamp > USER_SPEECH_INTERRUPT_DELAY || interimTranscript.length > 5;
+
+
+ // Audio Management Functions
+
+ // Play audio from the queue
+ const playNextAudio = async () => {
+ if (audioPlaybackQueue.length > 0) {
+ const audioData = audioPlaybackQueue.shift();
+ const audio = new Audio(audioData.url);
+ updateActivityIndicators();
+
+ const audioPromise = new Promise(resolve => {
+ audio.onended = resolve;
+ audio.onerror = resolve;
+ });
+ if (currentAudio) {
+ currentAudio.pause();
+ currentAudio.currentTime = 0;
+ }
+
+ currentAudio = audio;
+ await audio.play();
+ await audioPromise;
+ playNextAudio();
+ } else {
+ updateActivityIndicators();
+ }
+ };
+
+ // Interrupt audio playback
+ const interruptAudioPlayback = (reason = 'unknown') => {
+ console.log(`Interrupting audio (reason: ${reason})...`);
+ if (currentAudio) {
+ currentAudio.pause();
+ currentAudio.currentTime = 0;
+ currentAudio = null;
+ }
+
+ audioPlaybackQueue.length = 0;
+ isRequestInProgress = false;
+
+ if (requestAbortController) {
+ requestAbortController.abort();
+ requestAbortController = null;
+ }
+
+ prefetchCache.clear();
+ prefetchQueue.length = 0;
+ updateActivityIndicators();
+ };
+
+
+ // Prefetching and Caching Functions

  // Prefetch and cache the first TTS audio chunk
  const prefetchFirstAudioChunk = (query, voice) => {
+ const normalizedQuery = normalizeQueryText(query);
+ const cacheKey = generateCacheKey(normalizedQuery, voice, conversationHistory, modelSelectionDropdown.value);

+ if (pendingPrefetchRequests.has(cacheKey) || prefetchCache.has(cacheKey)) return;

+ prefetchQueue.push({ query: query.trim(), voice, cacheKey });
+ processPrefetchQueue();
  };

  // Process the prefetch queue
  const processPrefetchQueue = async () => {
+ while (prefetchQueue.length > 0 && pendingPrefetchRequests.size < MAX_PREFETCH_REQUESTS) {
+ const { query, voice, cacheKey } = prefetchQueue.shift();
+ const abortController = new AbortController();
+ pendingPrefetchRequests.set(cacheKey, abortController);

+ try {
+ const firstAudioUrl = await streamAndPrefetchAudio(query, voice, abortController.signal);

+ if (firstAudioUrl) prefetchCache.set(cacheKey, { url: firstAudioUrl, timestamp: Date.now() });

+ } catch (error) {
+ if (error.name !== 'AbortError') console.error("Error prefetching audio:", error);
+ } finally {
+ pendingPrefetchRequests.delete(cacheKey);
+ processPrefetchQueue();
+ }
+ }
+ };
+
+ // Cancel pending prefetch requests
+ const cancelPrefetchRequests = (query) => {
+ const normalizedQuery = normalizeQueryText(query);
+
+ for (const [cacheKey, abortController] of pendingPrefetchRequests) {
+ if (cacheKey.startsWith(normalizedQuery)) {
+ abortController.abort();
+ pendingPrefetchRequests.delete(cacheKey);
+ }
+ }
+ };

+ // AI Interaction Functions

+ // Modify sendQueryToAI to include the caption
+ async function sendQueryToAI(query) {
+ const caption = await getCaption();
+ const modifiedQuery = JSON.stringify({ USER: query, CAPTION: caption });

+ console.log("Sending query to AI:", modifiedQuery);
+ isRequestInProgress = true;
+ updateActivityIndicators();
+ firstResponseTextTimestamp = null;
+
+ const normalizedQuery = normalizeQueryText(query);
+ const cacheKey = generateCacheKey(normalizedQuery, modelSelectionDropdown.value, conversationHistory, modelSelectionDropdown.value);
+
+ queryStartTime = Date.now();
+
+ // Check prefetch cache
+ if (prefetchCache.has(cacheKey)) {
+ const cachedData = prefetchCache.get(cacheKey);
+ if (Date.now() - cachedData.timestamp < PREFETCH_CACHE_EXPIRATION) {
+ audioPlaybackQueue.push({ url: cachedData.url, isPrefetched: true });
+ playNextAudio();
+ } else {
+ prefetchCache.delete(cacheKey);
+ }
+ }
+
+ requestAbortController = new AbortController();
+
+ try {
+ await streamAndHandleAudioResponse(modifiedQuery, voiceSelectionDropdown.value, requestAbortController.signal);
+ } catch (error) {
+ if (error.name !== 'AbortError') {
+ console.error("Error sending query to AI:", error);
+ }
+ } finally {
+ isRequestInProgress = false;
+ updateActivityIndicators();
+ }
+ };
+
+ // Process the final speech transcript
+ const processSpeechTranscript = (transcript) => {
+ const trimmedTranscript = transcript.trimStart();
+ if (trimmedTranscript !== '' && !isRequestInProgress) {
+ activeQuery = trimmedTranscript;
+ sendQueryToAI(activeQuery);
+ addToConversationHistory('user', activeQuery);
+ }
+ };
+
+
+ // Network and Streaming Functions
+
+ // Stream AI response and handle audio
+ const streamAndHandleAudioResponse = async (query, voice, abortSignal) => {
+ const response = await fetchAIResponse(query, abortSignal);
+
+ if (!response.ok) {
+ if (response.status === 429) {
+ console.log("Rate limit hit, retrying in 1 second...");
+ await new Promise(resolve => setTimeout(resolve, 1000));
+ await sendQueryToAI(query);
+ return;
  }
+ throw new Error(`Network response was not ok: ${response.status}`);
  }
+
+ console.log("Streaming audio response received");
+ await handleStreamingResponse(response.body, voice, abortSignal);
+ };
+
+ // Stream AI response for prefetching
+ const streamAndPrefetchAudio = async (query, voice, abortSignal) => {
+ const response = await fetchAIResponse(query, abortSignal);
+
+ if (!response.ok) throw new Error('Network response was not ok');
+
+ return handleStreamingResponseForPrefetch(response.body, voice, abortSignal);
+ };
+
+ // Fetch AI response
+ const fetchAIResponse = async (query, abortSignal) => {
+ const userSambanovaKey = document.getElementById('apiKey').value.trim() !== '' ? document.getElementById('apiKey').value.trim() : 'none';
+
+ const url = '/stream_text';
+ const requestBody = {
+ query: query,
+ history: JSON.stringify(conversationHistory),
+ model: modelSelectionDropdown.value,
+ api_key: userSambanovaKey
+ };
+
+ return fetch(url, {
+ method: 'POST',
+ headers: {
+ 'Accept': 'text/event-stream',
+ 'Content-Type': 'application/json'
+ },
+ body: JSON.stringify(requestBody),
+ signal: abortSignal
+ });
  };

  // Handle the streaming response for prefetching

  if (line.startsWith('data: ')) {
  const textContent = line.substring(6).trim();
  if (textContent) {
+ return await generateTextToSpeechAudio(textContent, voice);
  }
  }
  }

  return null;
  };

  // Handle the streaming audio response
  const handleStreamingResponse = async (responseStream, voice, abortSignal) => {
  const reader = responseStream.getReader();

  let fullResponseText = "";
  let fullResponseText2 = "";
  let textChunk = "";
+ let sentText = "";

  try {
  while (true) {


  if (isUserSpeaking) {
  interruptAudioPlayback('user is speaking');
+ break;
  }

  const chunk = decoder.decode(value, { stream: true });

  fullResponseText += textContent + " ";
  fullResponseText2 += textContent + " ";
  textChunk += textContent + " ";
+ transcriptDiv.textContent = fullResponseText2;

  if (initialChunksSent < 2) {
  const audioUrl = await generateTextToSpeechAudio(textContent, voice);

  audioPlaybackQueue.push({ url: audioUrl, isPrefetched: false });
  if (!currentAudio) playNextAudio();
  }
+ sentText += textContent + " ";
  initialChunksSent++;
  } else {
  let unsentTextChunk = textChunk.replace(sentText, '').trim();

  audioPlaybackQueue.push({ url: audioUrl, isPrefetched: false });
  if (!currentAudio) playNextAudio();
  }
+ textChunk = "";
  }
  }

  if (fullResponseText !== '') {
+ fullResponseText = '';
  }
  }
  }

  }
  };

+ // Generate Text-to-Speech audio with caching
+ const generateTextToSpeechAudio = async (text, voice) => {
+ const normalizedText = normalizeQueryText(text);
+ const cacheKey = `${normalizedText}-${voice}`;
+
+ if (audioCache.has(cacheKey)) {
+ const cachedData = audioCache.get(cacheKey);
+ if (Date.now() - cachedData.timestamp < AUDIO_CACHE_EXPIRATION) {
+ return cachedData.url;
+ } else {
+ audioCache.delete(cacheKey);
+ }
+ }
+
+ try {
+ const response = await fetch(`${TEXT_TO_SPEECH_API_ENDPOINT}?voice=${voice}&text=${encodeURIComponent(text)}`, { method: 'GET' });
+ if (!response.ok) throw new Error('Network response was not ok');
+ const audioBlob = await response.blob();
+ const audioUrl = URL.createObjectURL(audioBlob);
+
+ audioCache.set(cacheKey, { url: audioUrl, timestamp: Date.now() });
+ return audioUrl;
+ } catch (error) {
+ console.error("Error generating TTS audio:", error);
+ return null;
  }
  };


+ // Speech Recognition Initialization
+
  if ('webkitSpeechRecognition' in window) {
  speechRecognizer = new webkitSpeechRecognition();
  Object.assign(speechRecognizer, {


  speechRecognizer.onstart = () => {
  console.log("Speech recognition started");
  isUserSpeaking = true;
  lastUserSpeechTimestamp = Date.now();
+ updateActivityIndicators();
  startStopButton.innerHTML = '<svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M9 9h6v6h-6z"></path><path d="M12 1a3 3 0 0 0-3 3v8a3 3 0 0 0 6 0V4a3 3 0 0 0-3-3z"></path><path d="M19 10v2a7 7 0 0 1-14 0v-2"></path><line x1="12" y1="19" x2="12" y2="23"></line><line x1="8" y1="23" x2="16" y2="23"></line></svg> Stop Listening';
  };

  for (let i = event.resultIndex; i < event.results.length; i++) {
  const transcript = event.results[i][0].transcript;
  if (event.results[i].isFinal) {
  interruptAudioPlayback('final');
+ processSpeechTranscript(transcript);
  isUserSpeaking = false;
+ updateActivityIndicators();
  queryStartTime = Date.now();
  } else {
  interimTranscript += transcript;
  isUserSpeaking = true;
  lastUserSpeechTimestamp = Date.now();
+ updateActivityIndicators();

  if (interimTranscript.length > prefetchTextQuery.length + 5) {
  cancelPrefetchRequests(prefetchTextQuery);


  speechRecognizer.onend = () => {
  isUserSpeaking = false;
+ updateActivityIndicators();

+ if (isSpeechRecognitionActive) speechRecognizer.start();
  };

  startStopButton.addEventListener('click', () => {
  if (isSpeechRecognitionActive) {
  speechRecognizer.stop();
  isSpeechRecognitionActive = false;
  startStopButton.innerHTML = '<svg id="microphoneIcon" xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M12 1a3 3 0 0 0-3 3v8a3 3 0 0 0 6 0V4a3 3 0 0 0-3-3z"></path><path d="M19 10v2a7 7 0 0 1-14 0v-2"></path><line x1="12" y1="19" x2="12" y2="23"></line><line x1="8" y1="23" x2="16" y2="23"></line></svg> Start Listening';
  } else {
  speechRecognizer.start();
  isSpeechRecognitionActive = true;
  startStopButton.innerHTML = '<svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M9 9h6v6h-6z"></path><path d="M12 1a3 3 0 0 0-3 3v8a3 3 0 0 0 6 0V4a3 3 0 0 0-3-3z"></path><path d="M19 10v2a7 7 0 0 1-14 0v-2"></path><line x1="12" y1="19" x2="12" y2="23"></line><line x1="8" y1="23" x2="16" y2="23"></line></svg> Stop Listening';
  }
  });

  alert('Your browser does not support the Web Speech API.');
  }

+ setInterval(updateLatency, 100);
+ window.onload = startWebcam;
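
For reviewers, here is a condensed, hedged sketch of the flow this commit wires in (webcam frame → Florence-2 caption via @gradio/client → caption bundled into the query). It only reuses identifiers and endpoints that appear in the diff above; the helper names captureFrame and buildCaptionedQuery are illustrative and are not part of the commit.

// Illustrative sketch only — not part of the commit; mirrors getCaption() and sendQueryToAI() above.
import { client, handle_file } from 'https://cdn.jsdelivr.net/npm/@gradio/client/+esm';

const video = document.getElementById('webcam');

// Grab the current webcam frame as a PNG blob (same canvas approach as getCaption).
async function captureFrame() {
  const canvas = document.createElement('canvas');
  canvas.width = video.videoWidth;
  canvas.height = video.videoHeight;
  canvas.getContext('2d').drawImage(video, 0, 0, canvas.width, canvas.height);
  return new Promise(resolve => canvas.toBlob(resolve, 'image/png'));
}

// Caption the frame with one of the Florence-2 Spaces listed in GRADIO_CLIENTS,
// then bundle the caption with the spoken query the same way sendQueryToAI does.
async function buildCaptionedQuery(userQuery) {
  const app = await client('gokaygokay/Florence-2');
  const frame = await handle_file(await captureFrame());
  const result = await app.predict('/process_image', [frame, 'More Detailed Caption']);
  return JSON.stringify({ USER: userQuery, CAPTION: result.data[0] });
}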