// Live-Video-Chat / script1.js
// processWithGradio() below uses the Gradio JS client; it is assumed to be
// loaded as an ES module (adjust the CDN path/version to match your setup).
import { client, handle_file } from "https://cdn.jsdelivr.net/npm/@gradio/client/dist/index.min.js";
// Constants and Configuration
const USER_SPEECH_INTERRUPT_DELAY = 500;
const TEXT_TO_SPEECH_API_ENDPOINT = "https://api.streamelements.com/kappa/v2/speech";
const CHUNK_SIZE = 300;
const MAX_PREFETCH_REQUESTS = 10;
const PREFETCH_CACHE_EXPIRATION = 60000; // 1 minute
const AUDIO_CACHE_EXPIRATION = 3600000; // 1 hour
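// Note: several of these constants (USER_SPEECH_INTERRUPT_DELAY, CHUNK_SIZE,
// AUDIO_CACHE_EXPIRATION) are not referenced in this file; they appear to be
// consumed by the response-streaming logic defined elsewhere in the Space.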
// DOM Elements
const startStopButton = document.getElementById('startStopButton');
const voiceSelectionDropdown = document.getElementById('voiceSelect');
const modelSelectionDropdown = document.getElementById('modelSelect');
const noiseSuppressionCheckbox = document.getElementById('noiseSuppression');
const responseTimeDisplay = document.getElementById('responseTime');
const userActivityIndicator = document.getElementById('userIndicator');
const aiActivityIndicator = document.getElementById('aiIndicator');
const transcriptDiv = document.getElementById('transcript');
const webcamToggleButton = document.getElementById('webcamToggle');
// Speech Recognition
let speechRecognizer;
let isSpeechRecognitionActive = false;
// AI Interaction State
let activeQuery = null;
let queryStartTime = 0;
let isRequestInProgress = false;
let isUserSpeaking = false;
let requestAbortController = null;
let firstResponseTextTimestamp = null;
// Audio Management
let currentAudio = null;
let audioPlaybackQueue = [];
// Prefetching and Caching
const prefetchCache = new Map();
const pendingPrefetchRequests = new Map();
const prefetchQueue = [];
let prefetchTextQuery = "";
// Conversation History
let conversationHistory = [];
// Audio Caching
const audioCache = new Map();
// Webcam
let isWebcamActive = false;
let captureIntervalId = null; // handle for the periodic frame-capture timer
let app;
let lastCaption = "";
const clients = [
    "multimodalart/Florence-2-l4",
    "gokaygokay/Florence-2",
    "multimodalart/Florence-2-l4-2",
    "gokaygokay/Florence-2",
];
// Utility Functions
// Normalize query text
const normalizeQueryText = query => query.trim().toLowerCase().replace(/[^\w\s]/g, '');
// Generate a cache key
const generateCacheKey = (normalizedQuery, voice, history, modelName) =>
    `${normalizedQuery}-${voice}-${JSON.stringify(history)}-${modelName}`;
// Update activity indicators
const updateActivityIndicators = (state = null) => {
    userActivityIndicator.textContent = isUserSpeaking ? "User: Speaking" : "User: Idle";

    if (isRequestInProgress && !currentAudio) {
        aiActivityIndicator.textContent = "AI: Processing...";
    } else if (currentAudio && !isUserSpeaking) {
        aiActivityIndicator.textContent = state || "AI: Speaking";
    } else if (isUserSpeaking) {
        aiActivityIndicator.textContent = "AI: Listening";
    } else {
        aiActivityIndicator.textContent = "AI: Idle";
    }
};
// Update latency display
const updateLatency = () => {
    if (firstResponseTextTimestamp) {
        const latency = firstResponseTextTimestamp - queryStartTime;
        responseTimeDisplay.textContent = `Latency: ${latency}ms`;
    } else {
        responseTimeDisplay.textContent = "Latency: 0ms";
    }
};
// Add to conversation history
const addToConversationHistory = (role, content) => {
    // Drop a trailing empty assistant placeholder before appending.
    if (conversationHistory.length > 0 &&
        conversationHistory[conversationHistory.length - 1].role === 'assistant' &&
        conversationHistory[conversationHistory.length - 1].content === "") {
        conversationHistory.pop();
    }
    conversationHistory.push({ role, content });
    // Keep at most three user/assistant exchanges (six messages).
    if (conversationHistory.length > 6) conversationHistory.splice(0, 2);
};
// Audio Management Functions
// Play audio from the queue
const playNextAudio = async () => {
    if (audioPlaybackQueue.length > 0) {
        const audioData = audioPlaybackQueue.shift();
        const audio = new Audio(audioData.url);
        updateActivityIndicators();

        // Resolves when playback finishes (or the media element errors).
        const audioPromise = new Promise(resolve => {
            audio.onended = resolve;
            audio.onerror = resolve;
        });

        if (currentAudio) {
            currentAudio.pause();
            currentAudio.currentTime = 0;
        }

        currentAudio = audio;
        try {
            await audio.play();
            await audioPromise;
        } catch (error) {
            // Autoplay restrictions or a bad source; skip to the next chunk.
            console.error("Audio playback failed:", error);
        }
        playNextAudio();
    } else {
        // Queue drained: clear the handle so the indicator returns to idle.
        currentAudio = null;
        updateActivityIndicators();
    }
};
// Prefetching and Caching Functions
// Prefetch and cache the first TTS audio chunk
const prefetchFirstAudioChunk = (query, voice) => {
    const normalizedQuery = normalizeQueryText(query);
    const cacheKey = generateCacheKey(normalizedQuery, voice, conversationHistory, modelSelectionDropdown.value);

    if (pendingPrefetchRequests.has(cacheKey) || prefetchCache.has(cacheKey)) return;

    prefetchQueue.push({ query: query.trim(), voice, cacheKey });
    processPrefetchQueue();
};
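// processPrefetchQueue() is called above but not defined in this file. If it
// is not supplied by another script in the Space, the sketch below is one
// plausible implementation; it assumes the StreamElements endpoint accepts
// `voice` and `text` query parameters and responds with raw audio.
const processPrefetchQueue = async () => {
    while (prefetchQueue.length > 0 && pendingPrefetchRequests.size < MAX_PREFETCH_REQUESTS) {
        const { query, voice, cacheKey } = prefetchQueue.shift();
        const controller = new AbortController();
        pendingPrefetchRequests.set(cacheKey, controller);

        const url = `${TEXT_TO_SPEECH_API_ENDPOINT}?voice=${voice}&text=${encodeURIComponent(query)}`;
        try {
            const response = await fetch(url, { signal: controller.signal });
            const blob = await response.blob();
            prefetchCache.set(cacheKey, { url: URL.createObjectURL(blob), timestamp: Date.now() });
            // Evict the cached blob URL after the configured window.
            setTimeout(() => prefetchCache.delete(cacheKey), PREFETCH_CACHE_EXPIRATION);
        } catch (error) {
            console.error("Error prefetching TTS audio:", error);
        } finally {
            pendingPrefetchRequests.delete(cacheKey);
        }
    }
};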
// Webcam Integration Functions
const startWebcam = async () => {
    try {
        const stream = await navigator.mediaDevices.getUserMedia({ video: true });
        document.getElementById('webcam').srcObject = stream;
        // Avoid stacking timers if the webcam is toggled repeatedly.
        if (captureIntervalId) clearInterval(captureIntervalId);
        captureIntervalId = setInterval(captureAndProcessImage, 5000);
    } catch (error) {
        console.error("Error accessing webcam:", error);
    }
};
const stopWebcam = () => {
    if (captureIntervalId) {
        clearInterval(captureIntervalId);
        captureIntervalId = null;
    }
    const videoElement = document.getElementById('webcam');
    const stream = videoElement.srcObject;
    if (stream) {
        stream.getTracks().forEach(track => track.stop());
        videoElement.srcObject = null;
    }
};
const captureAndProcessImage = async () => {
    if (!isWebcamActive) return;

    const video = document.getElementById('webcam');
    // Skip frames captured before the stream has delivered any video data.
    if (!video.videoWidth || !video.videoHeight) return;

    const canvas = document.createElement('canvas');
    canvas.width = video.videoWidth;
    canvas.height = video.videoHeight;
    const context = canvas.getContext('2d');
    context.drawImage(video, 0, 0, canvas.width, canvas.height);

    const blob = await new Promise(resolve => canvas.toBlob(resolve, 'image/png'));
    if (blob) await processWithGradio(blob);
};
const processWithGradio = async (imageBlob) => {
    try {
        // Spread load across the mirrored Florence-2 Spaces.
        const randomClient = clients[Math.floor(Math.random() * clients.length)];
        app = await client(randomClient);

        const handledFile = await handle_file(imageBlob);
        const result = await app.predict("/process_image", [handledFile, "Detailed Caption"]);

        const dataString = result.data[0];
        lastCaption = dataString || lastCaption;
    } catch (error) {
        console.error("Error processing with Gradio:", error);
    }
};
// Event Listeners
startStopButton.addEventListener('click', () => {
    // Speech recognition may be unavailable (non-WebKit browsers).
    if (!speechRecognizer) return;

    isSpeechRecognitionActive = !isSpeechRecognitionActive;
    if (isSpeechRecognitionActive) {
        speechRecognizer.start();
    } else {
        speechRecognizer.stop();
    }
});
webcamToggleButton.addEventListener('click', () => {
    isWebcamActive = !isWebcamActive;
    if (isWebcamActive) {
        startWebcam();
    } else {
        stopWebcam();
    }
});
// Speech Recognition Initialization
if ('webkitSpeechRecognition' in window) {
    speechRecognizer = new webkitSpeechRecognition();
    speechRecognizer.continuous = true;
    speechRecognizer.interimResults = true;

    speechRecognizer.onresult = (event) => {
        let interimTranscript = '';
        for (let i = event.resultIndex; i < event.results.length; i++) {
            const transcript = event.results[i][0].transcript;
            if (event.results[i].isFinal) {
                processSpeechTranscript(transcript);
                isUserSpeaking = false;
                updateActivityIndicators();
                queryStartTime = Date.now();
            } else {
                interimTranscript += transcript;
                isUserSpeaking = true;
                updateActivityIndicators();
            }
        }
    };
} else {
    console.warn("webkitSpeechRecognition is not supported in this browser.");
}
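// processSpeechTranscript() is referenced by the recognizer callback but not
// defined in this file. If another script in the Space does not supply it, a
// minimal sketch could record the utterance and warm the TTS cache (the
// actual model request presumably happens elsewhere):
const processSpeechTranscript = (transcript) => {
    const query = transcript.trim();
    if (!query) return;
    addToConversationHistory('user', query);
    prefetchFirstAudioChunk(query, voiceSelectionDropdown.value);
};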
setInterval(updateLatency, 100);
window.onload = () => {
    // Start with the webcam flagged active so the capture loop's guard passes
    // and the first toggle click stops (rather than double-starts) the stream.
    isWebcamActive = true;
    startWebcam();
};