Update script1.js

script1.js (CHANGED: +46 -59)
@@ -1,12 +1,12 @@
 // Constants and Configuration
 const USER_SPEECH_INTERRUPT_DELAY = 500;
-const TEXT_TO_SPEECH_API_ENDPOINT = "https://api.streamelements.com/kappa/v2/speech";
+const TEXT_TO_SPEECH_API_ENDPOINT = "https://api.streamelements.com/kappa/v2/speech";
 const CHUNK_SIZE = 300;
-const MAX_PREFETCH_REQUESTS = 5;
+const MAX_PREFETCH_REQUESTS = 5;
 const PREFETCH_CACHE_EXPIRATION = 60000; // 1 minute
 const AUDIO_CACHE_EXPIRATION = 3600000; // 1 hour
 const WEBCAM_INTERVAL = 5000;
-const MAX_HISTORY_LENGTH = 6;
+const MAX_HISTORY_LENGTH = 6;

 // DOM Elements
 const startStopButton = document.getElementById('startStopButton');
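generateTextToSpeechAudio, the consumer of TEXT_TO_SPEECH_API_ENDPOINT, is defined elsewhere in the file and untouched by this commit. A minimal sketch of such a helper, assuming the endpoint accepts voice and text query parameters and responds with audio bytes:

// Sketch under stated assumptions; not the file's actual implementation.
async function generateTextToSpeechAudio(text, voice) {
  const url = `${TEXT_TO_SPEECH_API_ENDPOINT}?voice=${encodeURIComponent(voice)}&text=${encodeURIComponent(text)}`;
  try {
    const response = await fetch(url);
    if (!response.ok) return null;
    const audioBlob = await response.blob();
    return URL.createObjectURL(audioBlob); // object URL playable by an <audio> element
  } catch (error) {
    console.error("Error generating TTS audio:", error);
    return null;
  }
}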
@@ -17,6 +17,7 @@ const responseTimeDisplay = document.getElementById('responseTime');
 const userActivityIndicator = document.getElementById('userIndicator');
 const aiActivityIndicator = document.getElementById('aiIndicator');
 const transcriptDiv = document.getElementById('transcript');
+const video = document.getElementById('webcam');

 // Speech Recognition
 let speechRecognizer;
@@ -46,6 +47,22 @@ let conversationHistory = [];
 // Audio Caching
 const audioCache = new Map();

+// Image Captioning State
+let isCaptioningEnabled = false;
+let lastCaption = "";
+
+// Webcam Integration
+import { client, handle_file } from 'https://cdn.jsdelivr.net/npm/@gradio/client/+esm';
+const clients = [
+  "multimodalart/Florence-2-l4",
+  "gokaygokay/Florence-2",
+  "multimodalart/Florence-2-l4-2",
+  "gokaygokay/Florence-2",
+];
+let app;
+let webcamInterval;
+
+
 // Utility Functions

 // Normalize query text
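The clients array lists four Florence-2 Spaces (one listed twice) that the captioning path can rotate through; processWithGradio, further down in the file, performs the actual call. A sketch of how the imported client and handle_file are typically used against such a Space follows; the endpoint name and task prompt below are illustrative assumptions, not taken from this commit:

// Illustrative only: the real processWithGradio and its endpoint are outside this diff.
async function captionWithFlorence(imageBlob) {
  const spaceName = clients[Math.floor(Math.random() * clients.length)]; // spread load across mirrors
  app = await client(spaceName);
  const result = await app.predict("/process_image", [ // assumed endpoint name
    handle_file(imageBlob),                            // webcam frame
    "More Detailed Caption",                           // assumed Florence-2 task prompt
  ]);
  return result.data[0];
}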
@@ -58,7 +75,7 @@ const generateCacheKey = (normalizedQuery, voice, history, modelName) =>
 // Update activity indicators
 const updateActivityIndicators = (state = null) => {
   userActivityIndicator.textContent = isUserSpeaking ? "User: Speaking" : "User: Idle";
-
+
   if (isRequestInProgress && !currentAudio) {
     aiActivityIndicator.textContent = "AI: Processing...";
   } else if (currentAudio && !isUserSpeaking) {
@@ -194,7 +211,6 @@ const cancelPrefetchRequests = (query) => {

 // Send a query to the AI
 async function sendQueryToAI(query) {
-  console.log("Sending query to AI:", query);
   isRequestInProgress = true;
   updateActivityIndicators();
   firstResponseTextTimestamp = null;
@@ -210,7 +226,6 @@ async function sendQueryToAI(query) {
     combinedQuery += `, {USER: "${query}"}`;

     await streamAndHandleAudioResponse(combinedQuery, voiceSelectionDropdown.value, requestAbortController.signal);
-
   } catch (error) {
     if (error.name !== 'AbortError') {
       console.error("Error sending query to AI:", error);
@@ -226,7 +241,7 @@ const processSpeechTranscript = (transcript) => {
   const trimmedTranscript = transcript.trimStart();
   if (trimmedTranscript !== '' && !isRequestInProgress) {
     activeQuery = trimmedTranscript;
-    addToConversationHistory('user', activeQuery);
+    addToConversationHistory('user', activeQuery);
     sendQueryToAI(activeQuery);
   }
 };
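addToConversationHistory is now called for the user turn here and for the assistant turn in the streaming handler below; its body is outside this diff. Given MAX_HISTORY_LENGTH = 6 from the constants, a plausible sketch:

// Sketch only; the real helper is not shown in this commit.
function addToConversationHistory(role, content) {
  conversationHistory.push({ role, content });
  // Drop the oldest turns so the prompt stays bounded.
  if (conversationHistory.length > MAX_HISTORY_LENGTH) {
    conversationHistory.splice(0, conversationHistory.length - MAX_HISTORY_LENGTH);
  }
}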
@@ -330,7 +345,6 @@ const handleStreamingResponse = async (responseStream, voice, abortSignal) => {
   let fullResponseText2 = "";
   let textChunk = "";

-
   try {
     while (true) {
       const { done, value } = await reader.read();
@@ -346,34 +360,38 @@ const handleStreamingResponse = async (responseStream, voice, abortSignal) => {
       buffer += chunk;
       const lines = buffer.split('\n');

-      for (const line of lines) {
+      for (const line of lines) {
         if (line.startsWith('data: ')) {
           const textContent = line.substring(6).trim();
           if (textContent) {
             if (!firstResponseTextTimestamp) firstResponseTextTimestamp = Date.now();
-
+
+            fullResponseText += textContent + " ";
             fullResponseText2 += textContent + " ";
             textChunk += textContent + " ";
             transcriptDiv.textContent = fullResponseText2;


             if (textChunk.length >= CHUNK_SIZE) {
-
-
-
-
-
-
+              const audioUrl = await generateTextToSpeechAudio(textChunk, voice);
+              if (audioUrl) {
+                audioPlaybackQueue.push({ url: audioUrl });
+                if (!currentAudio) playNextAudio();
+              }
+              textChunk = "";
             }
           }
         }
       }
+
+      buffer = lines[lines.length - 1];
     }
   } catch (error) {
     console.error("Error in handleStreamingResponse:", error);
   } finally {
-
-
+    reader.releaseLock();
+
+    if (textChunk !== "") { // Send any remaining text
       const audioUrl = await generateTextToSpeechAudio(textChunk, voice);
       if (audioUrl) {
         audioPlaybackQueue.push({ url: audioUrl });
|
|
381 |
}
|
382 |
}
|
383 |
|
384 |
-
addToConversationHistory('assistant', fullResponseText2);
|
385 |
-
fullResponseText = "";
|
386 |
-
fullResponseText2 = "";
|
387 |
-
|
388 |
-
reader.releaseLock();
|
389 |
-
|
390 |
}
|
391 |
};
|
392 |
|
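audioPlaybackQueue and playNextAudio are defined elsewhere in the file, outside this diff. The queue discipline visible in these hunks (push a URL, start playback only when nothing is currently playing) is typically completed by chaining on the audio element's ended event; a sketch under that assumption:

// Sketch only; the file's actual playNextAudio is not shown in this diff.
function playNextAudio() {
  const next = audioPlaybackQueue.shift();
  if (!next) {
    currentAudio = null; // queue drained; nothing is playing
    updateActivityIndicators();
    return;
  }
  currentAudio = new Audio(next.url);
  currentAudio.onended = playNextAudio; // chain to drain the queue one clip at a time
  currentAudio.play();
}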
@@ -484,20 +499,14 @@ if ('webkitSpeechRecognition' in window) {
     speechRecognizer.stop();
     isSpeechRecognitionActive = false;
     startStopButton.innerHTML = '<svg id="microphoneIcon" xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M12 1a3 3 0 0 0-3 3v8a3 3 0 0 0 6 0V4a3 3 0 0 0-3-3z"></path><path d="M19 10v2a7 7 0 0 1-14 0v-2"></path><line x1="12" y1="19" x2="12" y2="23"></line><line x1="8" y1="23" x2="16" y2="23"></line></svg> Start Listening';
-
-
-
-    video.srcObject = null;
-    lastCaption = "";
-    isCaptioningEnabled = false;
-
+    clearInterval(webcamInterval);
+    video.srcObject = null;
+    lastCaption = "";
   } else {
     speechRecognizer.start();
     isSpeechRecognitionActive = true;
     startStopButton.innerHTML = '<svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M9 9h6v6h-6z"></path><path d="M12 1a3 3 0 0 0-3 3v8a3 3 0 0 0 6 0V4a3 3 0 0 0-3-3z"></path><path d="M19 10v2a7 7 0 0 1-14 0v-2"></path><line x1="12" y1="19" x2="12" y2="23"></line><line x1="8" y1="23" x2="16" y2="23"></line></svg> Stop Listening';
-
-    // Start webcam capture when speech recognition starts
-    isCaptioningEnabled = true;
+    isCaptioningEnabled = true;
     startWebcam();
   }
 });
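Clearing the interval and setting video.srcObject = null stops the capture loop and detaches the preview, but the camera itself stays active until the MediaStream's tracks are stopped. A fuller teardown, not part of this change and shown only as a sketch, would be:

// Hypothetical helper; the commit itself only clears the interval and the element.
function stopWebcam() {
  clearInterval(webcamInterval);
  if (video.srcObject) {
    video.srcObject.getTracks().forEach((track) => track.stop()); // release the camera
    video.srcObject = null;
  }
  lastCaption = "";
}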
@@ -508,28 +517,13 @@ if ('webkitSpeechRecognition' in window) {
 setInterval(updateLatency, 100);


-
-// Webcam Integration
-import { client, handle_file } from 'https://cdn.jsdelivr.net/npm/@gradio/client/+esm';
-
-const video = document.getElementById('webcam');
-let app;
-let lastCaption = "";
-
-const clients = [
-  "multimodalart/Florence-2-l4",
-  "gokaygokay/Florence-2",
-  "multimodalart/Florence-2-l4-2",
-  "gokaygokay/Florence-2",
-];
-
-let webcamInterval; // Store the interval ID
+// Webcam Functions

 async function startWebcam() {
   try {
     const stream = await navigator.mediaDevices.getUserMedia({ video: true });
     video.srcObject = stream;
-    webcamInterval = setInterval(captureAndProcessImage, WEBCAM_INTERVAL);
+    webcamInterval = setInterval(captureAndProcessImage, WEBCAM_INTERVAL);
   } catch (error) {
     console.error("Error accessing webcam: ", error);
   }
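captureAndProcessImage, scheduled here every WEBCAM_INTERVAL (5 seconds), is defined between these hunks and unchanged by this commit. Grabbing a frame from a <video> element conventionally goes through a canvas; a sketch of what it plausibly does:

// Illustrative frame grab; the file's real captureAndProcessImage is not in this diff.
async function captureAndProcessImage() {
  if (!isCaptioningEnabled || !video.videoWidth) return; // no active stream yet
  const canvas = document.createElement('canvas');
  canvas.width = video.videoWidth;
  canvas.height = video.videoHeight;
  canvas.getContext('2d').drawImage(video, 0, 0);
  const blob = await new Promise((resolve) => canvas.toBlob(resolve, 'image/jpeg'));
  if (blob) await processWithGradio(blob);
}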
@@ -559,11 +553,4 @@ async function processWithGradio(imageBlob) {
   } catch (error) {
     console.error("Error processing with Gradio:", error);
   }
-}
-
-window.onload = () => {
-  // Start webcam only if speech recognition is active
-  if (isCaptioningEnabled) {
-    startWebcam();
-  }
-};
+}