KingNish committed on
Commit 194daa5
1 Parent(s): 051a4de

Update script1.js

Files changed (1)
  1. script1.js +65 -117
script1.js CHANGED
@@ -4,8 +4,8 @@
 const USER_SPEECH_INTERRUPT_DELAY = 500;
 const TEXT_TO_SPEECH_API_ENDPOINT = "https://api.streamelements.com/kappa/v2/speech";
 const CHUNK_SIZE = 300;
-const MAX_PREFETCH_REQUESTS = 10;
-const PREFETCH_CACHE_EXPIRATION = 60000; // 1 minute
+const MAX_PREFETCH_REQUESTS = 5;
+const PREFETCH_CACHE_EXPIRATION = 30000; // 30 seconds
 const AUDIO_CACHE_EXPIRATION = 3600000; // 1 hour
 
 // DOM Elements
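Note on this hunk: the commit halves the prefetch concurrency cap (10 → 5) and the prefetch cache lifetime (60 s → 30 s). For reference, a sketch of how PREFETCH_CACHE_EXPIRATION typically gates a lookup — the helper name getCachedPrefetch is hypothetical, not part of this file:

    // Return a cached prefetch URL only while the entry is still fresh.
    const getCachedPrefetch = (cacheKey) => {
      const entry = prefetchCache.get(cacheKey);
      if (!entry) return null;
      if (Date.now() - entry.timestamp > PREFETCH_CACHE_EXPIRATION) {
        prefetchCache.delete(cacheKey); // stale entry: evict and report a miss
        return null;
      }
      return entry.url;
    };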
@@ -36,8 +36,7 @@ let audioPlaybackQueue = [];
 
 // Prefetching and Caching
 const prefetchCache = new Map();
-const pendingPrefetchRequests = new Map();
-const prefetchQueue = [];
+const pendingPrefetchRequests = new Set();
 let prefetchTextQuery = "";
 
 // Conversation History
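The Map previously held an AbortController per cache key; the Set only records that a key is in flight. Duplicate prefetches are still suppressed, but there is no longer a controller to abort (see the prefetch hunks below):

    pendingPrefetchRequests.add(cacheKey);    // mark the key as in flight
    pendingPrefetchRequests.has(cacheKey);    // true → a later duplicate prefetch is skipped
    pendingPrefetchRequests.delete(cacheKey); // clear once the request settles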
@@ -46,20 +45,6 @@ let conversationHistory = [];
 // Audio Caching
 const audioCache = new Map();
 
-// Webcam Integration
-import { client, handle_file } from 'https://cdn.jsdelivr.net/npm/@gradio/client/+esm';
-const video = document.getElementById('webcam');
-let app;
-let lastCaption = "";
-let isWebcamActive = false; // Flag for webcam state
-
-const clients = [
-  "multimodalart/Florence-2-l4",
-  "gokaygokay/Florence-2",
-  "multimodalart/Florence-2-l4-2",
-  "gokaygokay/Florence-2",
-];
-
 // Utility Functions
 
 // Normalize query text
@@ -72,7 +57,7 @@ const generateCacheKey = (normalizedQuery, voice, history, modelName) =>
 // Update activity indicators
 const updateActivityIndicators = (state = null) => {
   userActivityIndicator.textContent = isUserSpeaking ? "User: Speaking" : "User: Idle";
-
+
   if (isRequestInProgress && !currentAudio) {
     aiActivityIndicator.textContent = "AI: Processing...";
   } else if (currentAudio && !isUserSpeaking) {
@@ -157,7 +142,6 @@ const interruptAudioPlayback = (reason = 'unknown') => {
   }
 
   prefetchCache.clear();
-  prefetchQueue.length = 0;
   updateActivityIndicators();
 };
 
@@ -165,45 +149,32 @@ const interruptAudioPlayback = (reason = 'unknown') => {
 // Prefetching and Caching Functions
 
 // Prefetch and cache the first TTS audio chunk
-const prefetchFirstAudioChunk = (query, voice) => {
+const prefetchFirstAudioChunk = async (query, voice) => {
   const normalizedQuery = normalizeQueryText(query);
   const cacheKey = generateCacheKey(normalizedQuery, voice, conversationHistory, modelSelectionDropdown.value);
 
   if (pendingPrefetchRequests.has(cacheKey) || prefetchCache.has(cacheKey)) return;
 
-  prefetchQueue.push({ query: query.trim(), voice, cacheKey });
-  processPrefetchQueue();
-};
-
-// Process the prefetch queue
-const processPrefetchQueue = async () => {
-  while (prefetchQueue.length > 0 && pendingPrefetchRequests.size < MAX_PREFETCH_REQUESTS) {
-    const { query, voice, cacheKey } = prefetchQueue.shift();
-    const abortController = new AbortController();
-    pendingPrefetchRequests.set(cacheKey, abortController);
-
-    try {
-      const firstAudioUrl = await streamAndPrefetchAudio(query, voice, abortController.signal);
-
-      if (firstAudioUrl) prefetchCache.set(cacheKey, { url: firstAudioUrl, timestamp: Date.now() });
-    } catch (error) {
-      if (error.name !== 'AbortError') console.error("Error prefetching audio:", error);
-    } finally {
-      pendingPrefetchRequests.delete(cacheKey);
-      processPrefetchQueue();
-    }
+  pendingPrefetchRequests.add(cacheKey);
+
+  try {
+    const firstAudioUrl = await streamAndPrefetchAudio(query, voice);
+    if (firstAudioUrl) prefetchCache.set(cacheKey, { url: firstAudioUrl, timestamp: Date.now() });
+  } catch (error) {
+    if (error.name !== 'AbortError') console.error("Error prefetching audio:", error);
+  } finally {
+    pendingPrefetchRequests.delete(cacheKey);
   }
 };
 
+
 // Cancel pending prefetch requests
 const cancelPrefetchRequests = (query) => {
   const normalizedQuery = normalizeQueryText(query);
-
-  for (const [cacheKey, abortController] of pendingPrefetchRequests) {
-    if (cacheKey.startsWith(normalizedQuery)) {
-      abortController.abort();
-      pendingPrefetchRequests.delete(cacheKey);
+  for (const key of pendingPrefetchRequests) {
+    if (key.startsWith(normalizedQuery)) {
+      pendingPrefetchRequests.delete(key);
+      // Implement abort logic if needed for your fetch implementation
     }
   }
 };
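With the queue and processPrefetchQueue gone, prefetchFirstAudioChunk now fires immediately for every new key, and MAX_PREFETCH_REQUESTS (still declared at the top) is no longer read anywhere. If throttling is still wanted, a guard at the top of the function would restore it — a sketch, not part of this commit:

    if (pendingPrefetchRequests.size >= MAX_PREFETCH_REQUESTS) return; // drop instead of queueing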
@@ -219,7 +190,7 @@ async function sendQueryToAI(query) {
   firstResponseTextTimestamp = null;
 
   const normalizedQuery = normalizeQueryText(query);
-  const cacheKey = generateCacheKey(normalizedQuery, modelSelectionDropdown.value, conversationHistory, modelSelectionDropdown.value);
+  const cacheKey = generateCacheKey(normalizedQuery, voiceSelectionDropdown.value, conversationHistory, modelSelectionDropdown.value);
 
   queryStartTime = Date.now();
 
@@ -237,7 +208,7 @@ async function sendQueryToAI(query) {
   requestAbortController = new AbortController();
 
   try {
-    const combinedQuery = isWebcamActive ? `{USER: "${query}"}, ${lastCaption}, {USER: "${query}"}` : query;
+    const combinedQuery = `{USER: "${query}"}, ${lastCaption}, {USER: "${query}"}`;
     await streamAndHandleAudioResponse(combinedQuery, voiceSelectionDropdown.value, requestAbortController.signal);
   } catch (error) {
     if (error.name !== 'AbortError') {
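Because the isWebcamActive flag is gone, every query is now wrapped with the latest webcam caption, even before the first caption arrives (lastCaption starts as ""). For example, with query "what do you see" and lastCaption "A desk with a laptop", combinedQuery becomes:

    {USER: "what do you see"}, A desk with a laptop, {USER: "what do you see"}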
@@ -281,12 +252,12 @@ const streamAndHandleAudioResponse = async (query, voice, abortSignal) => {
 };
 
 // Stream AI response for prefetching
-const streamAndPrefetchAudio = async (query, voice, abortSignal) => {
-  const response = await fetchAIResponse(query, abortSignal);
+const streamAndPrefetchAudio = async (query, voice) => {
+  const response = await fetchAIResponse(query, undefined);
 
   if (!response.ok) throw new Error('Network response was not ok');
 
-  return handleStreamingResponseForPrefetch(response.body, voice, abortSignal);
+  return handleStreamingResponseForPrefetch(response.body, voice);
 };
 
 // Fetch AI response
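Passing undefined as the signal pairs with the Set change above: cancelPrefetchRequests can forget a key, but the underlying fetch now runs to completion. Keeping abort support alongside the Set would only need a companion map of controllers — a hypothetical sketch, not part of this commit:

    const prefetchControllers = new Map(); // cacheKey -> AbortController (hypothetical)
    const controller = new AbortController();
    prefetchControllers.set(cacheKey, controller);
    const response = await fetchAIResponse(query, controller.signal); // abortable again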
@@ -313,7 +284,7 @@ const fetchAIResponse = async (query, abortSignal) => {
 };
 
 // Handle the streaming response for prefetching
-const handleStreamingResponseForPrefetch = async (responseStream, voice, abortSignal) => {
+const handleStreamingResponseForPrefetch = async (responseStream, voice) => {
   const reader = responseStream.getReader();
   const decoder = new TextDecoder("utf-8");
   let buffer = "";
@@ -322,7 +293,6 @@ const handleStreamingResponseForPrefetch = async (responseStream, voice, abortSignal) => {
   while (true) {
     const { done, value } = await reader.read();
     if (done) break;
-    if (abortSignal.aborted) throw new DOMException('Request aborted', 'AbortError');
 
     const chunk = decoder.decode(value, { stream: true });
     buffer += chunk;
@@ -354,7 +324,6 @@ const handleStreamingResponse = async (responseStream, voice, abortSignal) => {
   const reader = responseStream.getReader();
   const decoder = new TextDecoder("utf-8");
   let buffer = "";
-  let initialChunksSent = 0;
   let fullResponseText = "";
   let fullResponseText2 = "";
   let textChunk = "";
@@ -387,25 +356,10 @@ const handleStreamingResponse = async (responseStream, voice, abortSignal) => {
       textChunk += textContent + " ";
       transcriptDiv.textContent = fullResponseText2;
 
-      if (initialChunksSent < 2) {
-        const audioUrl = await generateTextToSpeechAudio(textContent, voice);
-        if (audioUrl) {
-          audioPlaybackQueue.push({ url: audioUrl, isPrefetched: false });
-          if (!currentAudio) playNextAudio();
-        }
-        sentText += textContent + " ";
-        initialChunksSent++;
-      } else {
-        let unsentTextChunk = textChunk.replace(sentText, '').trim();
-
-        if (unsentTextChunk.length >= CHUNK_SIZE) {
-          const audioUrl = await generateTextToSpeechAudio(unsentTextChunk, voice);
-          if (audioUrl) {
-            audioPlaybackQueue.push({ url: audioUrl, isPrefetched: false });
-            if (!currentAudio) playNextAudio();
-          }
-          textChunk = "";
-        }
+      const audioUrl = await generateTextToSpeechAudio(textContent, voice); // Call TTS immediately
+      if (audioUrl) {
+        audioPlaybackQueue.push({ url: audioUrl, isPrefetched: false });
+        if (!currentAudio) playNextAudio();
       }
 
       if (fullResponseText !== '') {
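Every parsed text fragment now triggers its own TTS request, so the CHUNK_SIZE batching and sentText bookkeeping no longer apply in this loop; with many small stream chunks this multiplies calls to TEXT_TO_SPEECH_API_ENDPOINT. For reference, a request to that StreamElements endpoint is typically built like this — a sketch; the actual generateTextToSpeechAudio body is unchanged by this commit:

    const url = `${TEXT_TO_SPEECH_API_ENDPOINT}?voice=${encodeURIComponent(voice)}&text=${encodeURIComponent(text)}`;
    const response = await fetch(url);                           // responds with an audio body
    const audioUrl = URL.createObjectURL(await response.blob()); // playable object URL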
@@ -422,18 +376,6 @@ const handleStreamingResponse = async (responseStream, voice, abortSignal) => {
   } finally {
     reader.releaseLock();
 
-    let unsentTextChunk = textChunk.replace(sentText, '').trim();
-    if (unsentTextChunk !== "") {
-      const audioUrl = await generateTextToSpeechAudio(unsentTextChunk, voice);
-      if (audioUrl) {
-        audioPlaybackQueue.push({ url: audioUrl, isPrefetched: false });
-        if (!currentAudio) playNextAudio();
-      }
-    }
-
-    if (fullResponseText !== '') {
-      fullResponseText = '';
-    }
     if (fullResponseText2 !== '') {
       addToConversationHistory('assistant', fullResponseText2);
       fullResponseText2 = '';
@@ -487,12 +429,6 @@ if ('webkitSpeechRecognition' in window) {
     lastUserSpeechTimestamp = Date.now();
     updateActivityIndicators();
     startStopButton.innerHTML = '<svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M9 9h6v6h-6z"></path><path d="M12 1a3 3 0 0 0-3 3v8a3 3 0 0 0 6 0V4a3 3 0 0 0-3-3z"></path><path d="M19 10v2a7 7 0 0 1-14 0v-2"></path><line x1="12" y1="19" x2="12" y2="23"></line><line x1="8" y1="23" x2="16" y2="23"></line></svg> Stop Listening';
-
-    // Start webcam processing if not already active
-    if (!isWebcamActive) {
-      startWebcam();
-      isWebcamActive = true;
-    }
   };
 
   speechRecognizer.onresult = (event) => {
@@ -541,34 +477,39 @@ if ('webkitSpeechRecognition' in window) {
       speechRecognizer.stop();
       isSpeechRecognitionActive = false;
       startStopButton.innerHTML = '<svg id="microphoneIcon" xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M12 1a3 3 0 0 0-3 3v8a3 3 0 0 0 6 0V4a3 3 0 0 0-3-3z"></path><path d="M19 10v2a7 7 0 0 1-14 0v-2"></path><line x1="12" y1="19" x2="12" y2="23"></line><line x1="8" y1="23" x2="16" y2="23"></line></svg> Start Listening';
-
-      // Stop webcam processing
-      isWebcamActive = false;
+      clearInterval(imageCaptureInterval); // Stop webcam processing
     } else {
       speechRecognizer.start();
       isSpeechRecognitionActive = true;
       startStopButton.innerHTML = '<svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M9 9h6v6h-6z"></path><path d="M12 1a3 3 0 0 0-3 3v8a3 3 0 0 0 6 0V4a3 3 0 0 0-3-3z"></path><path d="M19 10v2a7 7 0 0 1-14 0v-2"></path><line x1="12" y1="19" x2="12" y2="23"></line><line x1="8" y1="23" x2="16" y2="23"></line></svg> Stop Listening';
+      imageCaptureInterval = setInterval(captureAndProcessImage, 5000); // Start webcam processing
     }
   });
 } else {
   alert('Your browser does not support the Web Speech API.');
 }
 
+setInterval(updateLatency, 100);
+
+
+
+// Webcam Integration
+import { client, handle_file } from 'https://cdn.jsdelivr.net/npm/@gradio/client/+esm';
+
+let app;
+let lastCaption = "";
+
+const clients = [
+  "multimodalart/Florence-2-l4",
+  "gokaygokay/Florence-2",
+  "multimodalart/Florence-2-l4-2",
+  "gokaygokay/Florence-2",
+];
 
-// Webcam Functions (optimized)
 async function startWebcam() {
   try {
     const stream = await navigator.mediaDevices.getUserMedia({ video: true });
-    video.srcObject = stream;
-
-    // Capture and process image every 5 seconds while webcam is active
-    const webcamInterval = setInterval(async () => {
-      if (!isWebcamActive) {
-        clearInterval(webcamInterval);
-        return;
-      }
-      await captureAndProcessImage();
-    }, 5000);
+    webcamVideo.srcObject = stream;
   } catch (error) {
     console.error("Error accessing webcam: ", error);
   }
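After this commit the capture interval is driven from two places: this click handler and a second listener added in window.onload (last hunk). Both fire on every click, and the onload listener re-checks isSpeechRecognitionActive after this handler has already toggled it, so the two can fight over the same interval. Clearing before setting is the usual guard — a sketch, not part of this commit:

    const startCaptioning = () => {
      clearInterval(imageCaptureInterval); // avoid stacking multiple 5-second intervals
      imageCaptureInterval = setInterval(captureAndProcessImage, 5000);
    };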
@@ -576,34 +517,41 @@ async function startWebcam() {
 
 async function captureAndProcessImage() {
   const canvas = document.createElement('canvas');
-  canvas.width = video.videoWidth;
-  canvas.height = video.videoHeight;
+  canvas.width = webcamVideo.videoWidth;
+  canvas.height = webcamVideo.videoHeight;
   const context = canvas.getContext('2d');
-  context.drawImage(video, 0, 0, canvas.width, canvas.height);
+  context.drawImage(webcamVideo, 0, 0, canvas.width, canvas.height);
 
-  const blob = await new Promise(resolve => canvas.toBlob(resolve, 'image/jpeg', 0.8)); // Use JPEG for smaller size
+  const blob = await new Promise(resolve => canvas.toBlob(resolve, 'image/png'));
   await processWithGradio(blob);
 }
 
-
 async function processWithGradio(imageBlob) {
   try {
     const randomClient = clients[Math.floor(Math.random() * clients.length)];
     app = await client(randomClient);
-
     const handledFile = await handle_file(imageBlob);
-    const result = await app.predict("/process_image", [handledFile, "More Detailed Caption"]);
+
+    const result = await app.predict("/process_image", [handledFile, "Detailed Caption"]);
 
     const dataString = result.data[0];
-    lastCaption = dataString || lastCaption;
+    lastCaption = dataString || lastCaption;
   } catch (error) {
     console.error("Error processing with Gradio:", error);
  }
 }
 
-
-setInterval(updateLatency, 100);
+let imageCaptureInterval; // Declare interval outside the event listener
 
 window.onload = () => {
-  startWebcam();
+  startWebcam();
+
+  startStopButton.addEventListener('click', () => {
+    // ... (start/stop speech recognition and webcam captioning)
+    if (isSpeechRecognitionActive) {
+      clearInterval(imageCaptureInterval); // Stop webcam processing
+    } else {
+      imageCaptureInterval = setInterval(captureAndProcessImage, 5000); // Start webcam processing
+    }
+  });
 };
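Two notes on the final hunks. First, webcamVideo is used but never declared in this commit (the old "const video = document.getElementById('webcam')" line was deleted), so it presumably relies on the browser exposing the video element's id as a global. Second, for reference, the @gradio/client calls above follow this shape — a minimal standalone sketch reusing the Space and route already hard-coded in the file:

    import { client, handle_file } from 'https://cdn.jsdelivr.net/npm/@gradio/client/+esm';

    const app = await client("gokaygokay/Florence-2");   // connect to a hosted Space
    const file = await handle_file(imageBlob);           // wrap a Blob for upload
    const result = await app.predict("/process_image", [file, "Detailed Caption"]);
    console.log(result.data[0]);                         // the caption string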
 