// Constants and Configuration
// NOTE: the trailing "| |" table residue that followed every statement has
// been removed; it made each line a syntax error.
const CHUNK_SIZE = 300;
const PREFETCH_CACHE_EXPIRATION = 60000; // 1 minute, in milliseconds
const AUDIO_CACHE_EXPIRATION = 3600000; // 1 hour, in milliseconds
// DOM Elements
// Looked up once at script load; each lookup yields null if the element with
// that id is missing from the page.
const startStopButton = document.getElementById('startStopButton');
const voiceSelectionDropdown = document.getElementById('voiceSelect');
const modelSelectionDropdown = document.getElementById('modelSelect');
const noiseSuppressionCheckbox = document.getElementById('noiseSuppression');
const responseTimeDisplay = document.getElementById('responseTime');
const userActivityIndicator = document.getElementById('userIndicator');
const aiActivityIndicator = document.getElementById('aiIndicator');
const transcriptDiv = document.getElementById('transcript');
const webcamToggleButton = document.getElementById('webcamToggle');
// Speech Recognition
let speechRecognizer; // stays undefined when the browser lacks webkitSpeechRecognition
let isSpeechRecognitionActive = false;

// AI Interaction State
let activeQuery = null;
let queryStartTime = 0; // subtracted from firstResponseTextTimestamp to compute latency
let isRequestInProgress = false;
let isUserSpeaking = false;
let requestAbortController = null;
let firstResponseTextTimestamp = null;

// Audio Management
let currentAudio = null; // Audio element most recently started by playNextAudio
let audioPlaybackQueue = []; // entries expose a `url` consumed by playNextAudio

// Prefetching and Caching
const prefetchCache = new Map();
const pendingPrefetchRequests = new Map();
const prefetchQueue = []; // entries have the shape { query, voice, cacheKey }
let prefetchTextQuery = "";

// Conversation History
let conversationHistory = []; // entries have the shape { role, content }

// Audio Caching
const audioCache = new Map();

// Webcam
let isWebcamActive = false;
let app; // Gradio client connection, (re)assigned by processWithGradio
let lastCaption = ""; // most recent non-empty caption produced by processWithGradio
// Gradio Space identifiers; processWithGradio picks one at random per request.
// NOTE(review): "gokaygokay/Florence-2" appears twice, so it is chosen twice
// as often as each of the others — confirm this weighting is intentional.
const clients = [
  "multimodalart/Florence-2-l4",
  "gokaygokay/Florence-2",
  "multimodalart/Florence-2-l4-2",
  "gokaygokay/Florence-2",
];
// Utility Functions

/**
 * Normalizes a spoken or typed query so that equivalent queries compare equal.
 *
 * The text is lower-cased, stripped of everything that is not a letter,
 * combining mark, digit, underscore or whitespace, and then trimmed.
 *
 * The character class is Unicode-aware: an ASCII-only `\w` would erase every
 * non-Latin letter, making unrelated non-English queries normalize to the
 * same text. Trimming happens last so punctuation that followed a space
 * ("is it ?") does not leave a trailing blank behind.
 *
 * @param {string} query - Raw query text; null/undefined is treated as "".
 * @returns {string} The normalized query.
 */
const normalizeQueryText = query =>
  String(query == null ? '' : query)
    .toLowerCase()
    .replace(/[^\p{L}\p{M}\p{N}_\s]/gu, '')
    .trim();
/**
 * Builds the string key under which a prefetched response is stored.
 *
 * The key joins the four inputs with "-" and serializes the history with
 * JSON.stringify, so any change to the conversation yields a different key.
 *
 * @param {string} normalizedQuery - Query text, already normalized.
 * @param {string} voice - Selected voice identifier.
 * @param {Array} history - Conversation history to serialize into the key.
 * @param {string} modelName - Selected model identifier.
 * @returns {string} The cache key.
 */
const generateCacheKey = (normalizedQuery, voice, history, modelName) =>
  `${normalizedQuery}-${voice}-${JSON.stringify(history)}-${modelName}`;
/**
 * Refreshes the user and AI status labels from the current global state.
 *
 * AI label precedence (first match wins):
 *   1. request in progress and no audio element  -> "AI: Processing..."
 *   2. audio element set and user not speaking   -> `state` or "AI: Speaking"
 *   3. user speaking                             -> "AI: Listening"
 *   4. otherwise                                 -> "AI: Idle"
 *
 * @param {?string} [state=null] - Optional label used in place of
 *   "AI: Speaking" while audio is set and the user is silent.
 */
const updateActivityIndicators = (state = null) => {
  userActivityIndicator.textContent = isUserSpeaking ? "User: Speaking" : "User: Idle";

  let aiLabel;
  if (isRequestInProgress && !currentAudio) {
    aiLabel = "AI: Processing...";
  } else if (currentAudio && !isUserSpeaking) {
    aiLabel = state || "AI: Speaking";
  } else if (isUserSpeaking) {
    aiLabel = "AI: Listening";
  } else {
    aiLabel = "AI: Idle";
  }
  aiActivityIndicator.textContent = aiLabel;
};
/**
 * Writes the current response latency into the latency readout.
 *
 * Latency is `firstResponseTextTimestamp - queryStartTime`; while no first
 * response timestamp is recorded (it is falsy) the readout shows 0ms.
 */
const updateLatency = () => {
  const latency = firstResponseTextTimestamp
    ? firstResponseTextTimestamp - queryStartTime
    : 0;
  responseTimeDisplay.textContent = `Latency: ${latency}ms`;
};
/**
 * Appends a message to the conversation history.
 *
 * If the most recent entry is an assistant message with empty content it is
 * discarded first, so the new message takes its place. Once the history
 * holds more than six entries, the two oldest are dropped.
 *
 * @param {string} role - Speaker of the message (e.g. 'assistant').
 * @param {string} content - Message text.
 */
const addToConversationHistory = (role, content) => {
  const lastEntry = conversationHistory[conversationHistory.length - 1];
  const isEmptyAssistantPlaceholder =
    lastEntry !== undefined && lastEntry.role === 'assistant' && lastEntry.content === "";
  if (isEmptyAssistantPlaceholder) {
    conversationHistory.pop();
  }

  conversationHistory.push({ role, content });

  if (conversationHistory.length > 6) {
    conversationHistory.splice(0, 2);
  }
};
// Audio Management Functions

/**
 * Plays every clip waiting in `audioPlaybackQueue`, one after another.
 *
 * Each queue entry must expose a `url` that `Audio` can load. Whatever is
 * still referenced by `currentAudio` is paused and rewound before the next
 * clip starts. The activity indicators are refreshed before each clip and
 * once more after the queue has been drained.
 *
 * A clip that fails to start (for example because `play()` rejects) is
 * logged and skipped so the remaining clips still play.
 *
 * @returns {Promise<void>} Resolves once the queue is empty.
 */
const playNextAudio = async () => {
  while (audioPlaybackQueue.length > 0) {
    const { url } = audioPlaybackQueue.shift();
    const audio = new Audio(url);
    updateActivityIndicators();

    // Settles on both outcomes so a clip that errors mid-playback cannot
    // stall the queue.
    const playbackFinished = new Promise(resolve => {
      audio.onended = resolve;
      audio.onerror = resolve;
    });

    if (currentAudio) {
      currentAudio.pause();
      currentAudio.currentTime = 0;
    }
    currentAudio = audio;

    try {
      await audio.play();
      await playbackFinished;
    } catch (error) {
      // play() returns a promise that rejects when playback cannot begin.
      console.error("Error playing audio: ", error);
    }

    // Drop the reference once this clip is done; otherwise
    // updateActivityIndicators keeps reporting "AI: Speaking" after playback.
    if (currentAudio === audio) {
      currentAudio = null;
    }
  }
  updateActivityIndicators();
};
// Prefetching and Caching Functions

/**
 * Schedules a prefetch of the response for `query` spoken with `voice`.
 *
 * The request is identified by a cache key built from the normalized query,
 * the voice, the current conversation history and the selected model.
 * Nothing happens when that key is already cached or in flight. A key that
 * is already waiting in `prefetchQueue` is not enqueued a second time, but
 * the queue processor is still invoked.
 *
 * @param {string} query - Raw query text.
 * @param {string} voice - Selected voice identifier.
 */
const prefetchFirstAudioChunk = (query, voice) => {
  const normalizedQuery = normalizeQueryText(query);
  const cacheKey = generateCacheKey(normalizedQuery, voice, conversationHistory, modelSelectionDropdown.value);
  if (pendingPrefetchRequests.has(cacheKey) || prefetchCache.has(cacheKey)) return;

  // Repeated calls with the same text would otherwise pile up identical
  // entries before the queue is processed.
  const isAlreadyQueued = prefetchQueue.some(entry => entry.cacheKey === cacheKey);
  if (!isAlreadyQueued) {
    prefetchQueue.push({ query: query.trim(), voice, cacheKey });
  }
  processPrefetchQueue();
};
// Webcam Integration Functions

// Handle of the periodic capture timer; null while no timer is running.
let webcamCaptureIntervalId = null;

// Stops every track of a media stream.
const stopStreamTracks = stream => {
  stream.getTracks().forEach(track => track.stop());
};

/**
 * Requests camera access, shows the stream in the #webcam element and makes
 * sure the 5-second capture timer is running.
 *
 * Safe to call repeatedly: a stream already attached to the element is
 * stopped before it is replaced, and only one capture timer is ever active.
 * Failures (permission denied, missing element, ...) are logged, not thrown.
 *
 * @returns {Promise<void>}
 */
const startWebcam = async () => {
  try {
    const video = document.getElementById('webcam');
    const stream = await navigator.mediaDevices.getUserMedia({ video: true });

    // Replacing srcObject without stopping the old tracks would leave the
    // previous stream running with nothing referencing it.
    if (video.srcObject) {
      stopStreamTracks(video.srcObject);
    }
    video.srcObject = stream;

    if (webcamCaptureIntervalId === null) {
      webcamCaptureIntervalId = setInterval(captureAndProcessImage, 5000);
    }
  } catch (error) {
    console.error("Error accessing webcam: ", error);
  }
};

/**
 * Stops the capture timer, stops all tracks of the stream attached to the
 * #webcam element and detaches it.
 */
const stopWebcam = () => {
  if (webcamCaptureIntervalId !== null) {
    clearInterval(webcamCaptureIntervalId);
    webcamCaptureIntervalId = null;
  }

  const video = document.getElementById('webcam');
  const stream = video.srcObject;
  if (stream) {
    stopStreamTracks(stream);
    video.srcObject = null;
  }
};
/**
 * Grabs the current frame of the #webcam video as a PNG blob and hands it to
 * processWithGradio. Does nothing while the webcam is flagged inactive, while
 * the video has no frame size yet, or when no blob could be produced.
 *
 * @returns {Promise<void>}
 */
const captureAndProcessImage = async () => {
  if (!isWebcamActive) return;

  const video = document.getElementById('webcam');
  const frameWidth = video.videoWidth;
  const frameHeight = video.videoHeight;
  // A video without loaded frame data reports 0x0; a canvas of that size
  // cannot yield a usable image.
  if (!frameWidth || !frameHeight) return;

  const canvas = document.createElement('canvas');
  canvas.width = frameWidth;
  canvas.height = frameHeight;
  canvas.getContext('2d').drawImage(video, 0, 0, frameWidth, frameHeight);

  const blob = await new Promise(resolve => canvas.toBlob(resolve, 'image/png'));
  // toBlob passes null to its callback when the image cannot be created.
  if (!blob) return;

  await processWithGradio(blob);
};
/**
 * Sends an image to a randomly chosen entry of `clients` and stores the
 * returned caption in `lastCaption`.
 *
 * The caption is read from the first element of the prediction's `data`
 * array; an empty or missing caption leaves `lastCaption` unchanged. Errors
 * are logged and swallowed so a failed request does not break the caller.
 *
 * NOTE(review): `client` and `handle_file` are not defined in this part of
 * the file — presumably imported from the Gradio JavaScript client; confirm.
 *
 * @param {Blob} imageBlob - Image to caption.
 * @returns {Promise<void>}
 */
const processWithGradio = async (imageBlob) => {
  try {
    const spaceId = clients[Math.floor(Math.random() * clients.length)];
    app = await client(spaceId);
    const handledFile = await handle_file(imageBlob);
    const result = await app.predict("/process_image", [handledFile, "Detailed Caption"]);
    // The source read `const dataString =[0];`, which assigned the literal
    // array [0]; the caption is the first element of the prediction data.
    const dataString = result && result.data ? result.data[0] : undefined;
    lastCaption = dataString || lastCaption;
  } catch (error) {
    console.error("Error processing with Gradio:", error);
  }
};
// Event Listeners

// Toggle speech recognition when the start/stop button is clicked.
startStopButton.addEventListener('click', () => {
  // speechRecognizer is only assigned when the browser exposes
  // webkitSpeechRecognition; without this guard the click would throw and
  // still flip the active flag.
  if (!speechRecognizer) {
    console.warn("Speech recognition is not available in this browser.");
    return;
  }

  isSpeechRecognitionActive = !isSpeechRecognitionActive;
  if (isSpeechRecognitionActive) {
    speechRecognizer.start();
  } else {
    speechRecognizer.stop();
  }
});
// Toggle the webcam when its button is clicked: flip the active flag, then
// start or stop the camera to match.
webcamToggleButton.addEventListener('click', () => {
  isWebcamActive = !isWebcamActive;
  if (isWebcamActive) {
    startWebcam();
  } else {
    stopWebcam();
  }
});
// Speech Recognition Initialization
// Only set up when the browser exposes webkitSpeechRecognition; otherwise
// speechRecognizer stays undefined.
if ('webkitSpeechRecognition' in window) {
  speechRecognizer = new webkitSpeechRecognition();
  speechRecognizer.continuous = true;
  speechRecognizer.interimResults = true;

  // Walk the results added since the last event. A final result is handed to
  // processSpeechTranscript and marks the user as idle; an interim result
  // marks the user as speaking.
  speechRecognizer.onresult = (event) => {
    for (let i = event.resultIndex; i < event.results.length; i++) {
      const result = event.results[i];
      if (result.isFinal) {
        processSpeechTranscript(result[0].transcript);
        isUserSpeaking = false;
        updateActivityIndicators();
        // NOTE(review): the right-hand side was missing in the source
        // (`queryStartTime =;`). Date.now() matches the millisecond
        // subtraction in updateLatency — confirm firstResponseTextTimestamp
        // is taken from the same clock.
        queryStartTime = Date.now();
      } else {
        isUserSpeaking = true;
        updateActivityIndicators();
      }
    }
  };
}
// Refresh the latency readout every 100 ms.
setInterval(updateLatency, 100);

// Start the webcam once the page has finished loading. Registered with
// addEventListener so it does not replace any other load handler.
window.addEventListener('load', () => {
  startWebcam();
});