// Constants and Configuration const USER_SPEECH_INTERRUPT_DELAY = 500; const TEXT_TO_SPEECH_API_ENDPOINT = "https://api.streamelements.com/kappa/v2/speech"; const CHUNK_SIZE = 300; const MAX_PREFETCH_REQUESTS = 10; const PREFETCH_CACHE_EXPIRATION = 60000; // 1 minute const AUDIO_CACHE_EXPIRATION = 3600000; // 1 hour // DOM Elements const startStopButton = document.getElementById('startStopButton'); const voiceSelectionDropdown = document.getElementById('voiceSelect'); const modelSelectionDropdown = document.getElementById('modelSelect'); const noiseSuppressionCheckbox = document.getElementById('noiseSuppression'); const responseTimeDisplay = document.getElementById('responseTime'); const userActivityIndicator = document.getElementById('userIndicator'); const aiActivityIndicator = document.getElementById('aiIndicator'); const transcriptDiv = document.getElementById('transcript'); const webcamToggleButton = document.getElementById('webcamToggle'); // Speech Recognition let speechRecognizer; let isSpeechRecognitionActive = false; // AI Interaction State let activeQuery = null; let queryStartTime = 0; let isRequestInProgress = false; let isUserSpeaking = false; let requestAbortController = null; let firstResponseTextTimestamp = null; // Audio Management let currentAudio = null; let audioPlaybackQueue = []; // Prefetching and Caching const prefetchCache = new Map(); const pendingPrefetchRequests = new Map(); const prefetchQueue = []; let prefetchTextQuery = ""; // Conversation History let conversationHistory = []; // Audio Caching const audioCache = new Map(); // Webcam let isWebcamActive = false; let app; let lastCaption = ""; const clients = [ "multimodalart/Florence-2-l4", "gokaygokay/Florence-2", "multimodalart/Florence-2-l4-2", "gokaygokay/Florence-2", ]; // Utility Functions // Normalize query text const normalizeQueryText = query => query.trim().toLowerCase().replace(/[^\w\s]/g, ''); // Generate a cache key const generateCacheKey = (normalizedQuery, voice, history, modelName) => `${normalizedQuery}-${voice}-${JSON.stringify(history)}-${modelName}`; // Update activity indicators const updateActivityIndicators = (state = null) => { userActivityIndicator.textContent = isUserSpeaking ? "User: Speaking" : "User: Idle"; if (isRequestInProgress && !currentAudio) { aiActivityIndicator.textContent = "AI: Processing..."; } else if (currentAudio && !isUserSpeaking) { aiActivityIndicator.textContent = state || "AI: Speaking"; } else if (isUserSpeaking) { aiActivityIndicator.textContent = "AI: Listening"; } else { aiActivityIndicator.textContent = "AI: Idle"; } }; // Update latency display const updateLatency = () => { if (firstResponseTextTimestamp) { const latency = firstResponseTextTimestamp - queryStartTime; responseTimeDisplay.textContent = `Latency: ${latency}ms`; } else { responseTimeDisplay.textContent = "Latency: 0ms"; } }; // Add to conversation history const addToConversationHistory = (role, content) => { if (conversationHistory.length > 0 && conversationHistory[conversationHistory.length - 1].role === 'assistant' && conversationHistory[conversationHistory.length - 1].content === "") { conversationHistory.pop(); } conversationHistory.push({ role, content }); if (conversationHistory.length > 6) conversationHistory.splice(0, 2); }; // Audio Management Functions // Play audio from the queue const playNextAudio = async () => { if (audioPlaybackQueue.length > 0) { const audioData = audioPlaybackQueue.shift(); const audio = new Audio(audioData.url); updateActivityIndicators(); const audioPromise = new Promise(resolve => { audio.onended = resolve; audio.onerror = resolve; }); if (currentAudio) { currentAudio.pause(); currentAudio.currentTime = 0; } currentAudio = audio; await audio.play(); await audioPromise; playNextAudio(); } else { updateActivityIndicators(); } }; // Prefetching and Caching Functions // Prefetch and cache the first TTS audio chunk const prefetchFirstAudioChunk = (query, voice) => { const normalizedQuery = normalizeQueryText(query); const cacheKey = generateCacheKey(normalizedQuery, voice, conversationHistory, modelSelectionDropdown.value); if (pendingPrefetchRequests.has(cacheKey) || prefetchCache.has(cacheKey)) return; prefetchQueue.push({ query: query.trim(), voice, cacheKey }); processPrefetchQueue(); }; // Webcam Integration Functions const startWebcam = async () => { try { const stream = await navigator.mediaDevices.getUserMedia({ video: true }); document.getElementById('webcam').srcObject = stream; setInterval(captureAndProcessImage, 5000); } catch (error) { console.error("Error accessing webcam: ", error); } }; const stopWebcam = () => { const stream = document.getElementById('webcam').srcObject; if (stream) { const tracks = stream.getTracks(); tracks.forEach(track => track.stop()); } }; const captureAndProcessImage = async () => { if (!isWebcamActive) return; const canvas = document.createElement('canvas'); const video = document.getElementById('webcam'); canvas.width = video.videoWidth; canvas.height = video.videoHeight; const context = canvas.getContext('2d'); context.drawImage(video, 0, 0, canvas.width, canvas.height); const blob = await new Promise(resolve => canvas.toBlob(resolve, 'image/png')); await processWithGradio(blob); }; const processWithGradio = async (imageBlob) => { try { const randomClient = clients[Math.floor(Math.random() * clients.length)]; app = await client(randomClient); const handledFile = await handle_file(imageBlob); const result = await app.predict("/process_image", [handledFile, "Detailed Caption"]); const dataString = result.data[0]; lastCaption = dataString || lastCaption; } catch (error) { console.error("Error processing with Gradio:", error); } }; // Event Listeners startStopButton.addEventListener('click', () => { isSpeechRecognitionActive = !isSpeechRecognitionActive; if (isSpeechRecognitionActive) { speechRecognizer.start(); } else { speechRecognizer.stop(); } }); webcamToggleButton.addEventListener('click', () => { isWebcamActive = !isWebcamActive; if (isWebcamActive) { startWebcam(); } else { stopWebcam(); } }); // Speech Recognition Initialization if ('webkitSpeechRecognition' in window) { speechRecognizer = new webkitSpeechRecognition(); speechRecognizer.continuous = true; speechRecognizer.interimResults = true; speechRecognizer.onresult = (event) => { let interimTranscript = ''; for (let i = event.resultIndex; i < event.results.length; i++) { const transcript = event.results[i][0].transcript; if (event.results[i].isFinal) { processSpeechTranscript(transcript); isUserSpeaking = false; updateActivityIndicators(); queryStartTime = Date.now(); } else { interimTranscript += transcript; isUserSpeaking = true; updateActivityIndicators(); } } }; } setInterval(updateLatency, 100); window.onload = () => { startWebcam(); };