Spaces:
Running
Running
Update script1.js
Browse files- script1.js +41 -26
script1.js
CHANGED
@@ -46,6 +46,20 @@ let conversationHistory = [];
|
|
46 |
// Audio Caching
|
47 |
const audioCache = new Map();
|
48 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
49 |
// Utility Functions
|
50 |
|
51 |
// Normalize query text
|
@@ -58,7 +72,7 @@ const generateCacheKey = (normalizedQuery, voice, history, modelName) =>
|
|
58 |
// Update activity indicators
|
59 |
const updateActivityIndicators = (state = null) => {
|
60 |
userActivityIndicator.textContent = isUserSpeaking ? "User: Speaking" : "User: Idle";
|
61 |
-
|
62 |
if (isRequestInProgress && !currentAudio) {
|
63 |
aiActivityIndicator.textContent = "AI: Processing...";
|
64 |
} else if (currentAudio && !isUserSpeaking) {
|
@@ -223,7 +237,7 @@ async function sendQueryToAI(query) {
|
|
223 |
requestAbortController = new AbortController();
|
224 |
|
225 |
try {
|
226 |
-
const combinedQuery = `{USER: "${query}"}, ${lastCaption}, {USER: "${query}"}
|
227 |
await streamAndHandleAudioResponse(combinedQuery, voiceSelectionDropdown.value, requestAbortController.signal);
|
228 |
} catch (error) {
|
229 |
if (error.name !== 'AbortError') {
|
@@ -473,6 +487,12 @@ if ('webkitSpeechRecognition' in window) {
|
|
473 |
lastUserSpeechTimestamp = Date.now();
|
474 |
updateActivityIndicators();
|
475 |
startStopButton.innerHTML = '<svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M9 9h6v6h-6z"></path><path d="M12 1a3 3 0 0 0-3 3v8a3 3 0 0 0 6 0V4a3 3 0 0 0-3-3z"></path><path d="M19 10v2a7 7 0 0 1-14 0v-2"></path><line x1="12" y1="19" x2="12" y2="23"></line><line x1="8" y1="23" x2="16" y2="23"></line></svg> Stop Listening';
|
|
|
|
|
|
|
|
|
|
|
|
|
476 |
};
|
477 |
|
478 |
speechRecognizer.onresult = (event) => {
|
@@ -521,6 +541,9 @@ if ('webkitSpeechRecognition' in window) {
|
|
521 |
speechRecognizer.stop();
|
522 |
isSpeechRecognitionActive = false;
|
523 |
startStopButton.innerHTML = '<svg id="microphoneIcon" xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M12 1a3 3 0 0 0-3 3v8a3 3 0 0 0 6 0V4a3 3 0 0 0-3-3z"></path><path d="M19 10v2a7 7 0 0 1-14 0v-2"></path><line x1="12" y1="19" x2="12" y2="23"></line><line x1="8" y1="23" x2="16" y2="23"></line></svg> Start Listening';
|
|
|
|
|
|
|
524 |
} else {
|
525 |
speechRecognizer.start();
|
526 |
isSpeechRecognitionActive = true;
|
@@ -531,29 +554,21 @@ if ('webkitSpeechRecognition' in window) {
|
|
531 |
alert('Your browser does not support the Web Speech API.');
|
532 |
}
|
533 |
|
534 |
-
setInterval(updateLatency, 100);
|
535 |
-
|
536 |
-
|
537 |
-
|
538 |
-
// Webcam Integration
|
539 |
-
import { client, handle_file } from 'https://cdn.jsdelivr.net/npm/@gradio/client/+esm';
|
540 |
-
|
541 |
-
const video = document.getElementById('webcam');
|
542 |
-
let app;
|
543 |
-
let lastCaption = "";
|
544 |
-
|
545 |
-
const clients = [
|
546 |
-
"multimodalart/Florence-2-l4",
|
547 |
-
"gokaygokay/Florence-2",
|
548 |
-
"multimodalart/Florence-2-l4-2",
|
549 |
-
"gokaygokay/Florence-2",
|
550 |
-
];
|
551 |
|
|
|
552 |
async function startWebcam() {
|
553 |
try {
|
554 |
const stream = await navigator.mediaDevices.getUserMedia({ video: true });
|
555 |
video.srcObject = stream;
|
556 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
557 |
} catch (error) {
|
558 |
console.error("Error accessing webcam: ", error);
|
559 |
}
|
@@ -566,25 +581,25 @@ async function captureAndProcessImage() {
|
|
566 |
const context = canvas.getContext('2d');
|
567 |
context.drawImage(video, 0, 0, canvas.width, canvas.height);
|
568 |
|
569 |
-
const blob = await new Promise(resolve => canvas.toBlob(resolve, 'image/
|
570 |
await processWithGradio(blob);
|
571 |
}
|
572 |
|
|
|
573 |
async function processWithGradio(imageBlob) {
|
574 |
try {
|
575 |
const randomClient = clients[Math.floor(Math.random() * clients.length)];
|
576 |
app = await client(randomClient);
|
577 |
-
const handledFile = await handle_file(imageBlob);
|
578 |
|
|
|
579 |
const result = await app.predict("/process_image", [handledFile, "More Detailed Caption"]);
|
580 |
|
581 |
const dataString = result.data[0];
|
582 |
-
lastCaption = dataString || lastCaption;
|
583 |
} catch (error) {
|
584 |
console.error("Error processing with Gradio:", error);
|
585 |
}
|
586 |
}
|
587 |
|
588 |
-
|
589 |
-
|
590 |
-
};
|
|
|
46 |
// Audio Caching
|
47 |
const audioCache = new Map();
|
48 |
|
49 |
+
// ---- Webcam Integration: shared state ----
import { client, handle_file } from 'https://cdn.jsdelivr.net/npm/@gradio/client/+esm';

// <video> element that mirrors the live webcam stream.
const video = document.getElementById('webcam');

// Most recently connected Gradio client (reassigned on each caption request).
let app;

// Latest caption returned by the Florence-2 captioning space.
let lastCaption = "";

// Whether the webcam capture loop is currently running.
let isWebcamActive = false;

// Candidate Gradio spaces; one is chosen at random per caption request.
// NOTE(review): "gokaygokay/Florence-2" appears twice — presumably to weight
// the random pick toward that space; confirm the duplication is intentional.
const clients = [
  "multimodalart/Florence-2-l4",
  "gokaygokay/Florence-2",
  "multimodalart/Florence-2-l4-2",
  "gokaygokay/Florence-2",
];
|
62 |
+
|
63 |
// Utility Functions
|
64 |
|
65 |
// Normalize query text
|
|
|
72 |
// Update activity indicators
|
73 |
const updateActivityIndicators = (state = null) => {
|
74 |
userActivityIndicator.textContent = isUserSpeaking ? "User: Speaking" : "User: Idle";
|
75 |
+
|
76 |
if (isRequestInProgress && !currentAudio) {
|
77 |
aiActivityIndicator.textContent = "AI: Processing...";
|
78 |
} else if (currentAudio && !isUserSpeaking) {
|
|
|
237 |
requestAbortController = new AbortController();
|
238 |
|
239 |
try {
|
240 |
+
const combinedQuery = isWebcamActive ? `{USER: "${query}"}, ${lastCaption}, {USER: "${query}"}` : query;
|
241 |
await streamAndHandleAudioResponse(combinedQuery, voiceSelectionDropdown.value, requestAbortController.signal);
|
242 |
} catch (error) {
|
243 |
if (error.name !== 'AbortError') {
|
|
|
487 |
lastUserSpeechTimestamp = Date.now();
|
488 |
updateActivityIndicators();
|
489 |
startStopButton.innerHTML = '<svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M9 9h6v6h-6z"></path><path d="M12 1a3 3 0 0 0-3 3v8a3 3 0 0 0 6 0V4a3 3 0 0 0-3-3z"></path><path d="M19 10v2a7 7 0 0 1-14 0v-2"></path><line x1="12" y1="19" x2="12" y2="23"></line><line x1="8" y1="23" x2="16" y2="23"></line></svg> Stop Listening';
|
490 |
+
|
491 |
+
// Start webcam processing if not already active
|
492 |
+
if (!isWebcamActive) {
|
493 |
+
startWebcam();
|
494 |
+
isWebcamActive = true;
|
495 |
+
}
|
496 |
};
|
497 |
|
498 |
speechRecognizer.onresult = (event) => {
|
|
|
541 |
speechRecognizer.stop();
|
542 |
isSpeechRecognitionActive = false;
|
543 |
startStopButton.innerHTML = '<svg id="microphoneIcon" xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M12 1a3 3 0 0 0-3 3v8a3 3 0 0 0 6 0V4a3 3 0 0 0-3-3z"></path><path d="M19 10v2a7 7 0 0 1-14 0v-2"></path><line x1="12" y1="19" x2="12" y2="23"></line><line x1="8" y1="23" x2="16" y2="23"></line></svg> Start Listening';
|
544 |
+
|
545 |
+
// Stop webcam processing
|
546 |
+
isWebcamActive = false;
|
547 |
} else {
|
548 |
speechRecognizer.start();
|
549 |
isSpeechRecognitionActive = true;
|
|
|
554 |
alert('Your browser does not support the Web Speech API.');
|
555 |
}
|
556 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
557 |
|
558 |
+
// Webcam Functions (optimized)
|
559 |
async function startWebcam() {
|
560 |
try {
|
561 |
const stream = await navigator.mediaDevices.getUserMedia({ video: true });
|
562 |
video.srcObject = stream;
|
563 |
+
|
564 |
+
// Capture and process image every 5 seconds while webcam is active
|
565 |
+
const webcamInterval = setInterval(async () => {
|
566 |
+
if (!isWebcamActive) {
|
567 |
+
clearInterval(webcamInterval);
|
568 |
+
return;
|
569 |
+
}
|
570 |
+
await captureAndProcessImage();
|
571 |
+
}, 5000);
|
572 |
} catch (error) {
|
573 |
console.error("Error accessing webcam: ", error);
|
574 |
}
|
|
|
581 |
const context = canvas.getContext('2d');
|
582 |
context.drawImage(video, 0, 0, canvas.width, canvas.height);
|
583 |
|
584 |
+
const blob = await new Promise(resolve => canvas.toBlob(resolve, 'image/jpeg', 0.8)); // Use JPEG for smaller size
|
585 |
await processWithGradio(blob);
|
586 |
}
|
587 |
|
588 |
+
|
589 |
/**
 * Send one captured webcam frame to a Florence-2 Gradio space and refresh
 * the module-level `lastCaption` with the returned description.
 *
 * A space is picked at random from `clients` on every call, the blob is
 * uploaded via `handle_file`, and the "/process_image" endpoint is asked
 * for a "More Detailed Caption". A falsy result keeps the previous caption.
 * Any failure is logged and otherwise swallowed so the periodic capture
 * loop keeps running.
 *
 * @param {Blob} imageBlob - Frame captured from the webcam canvas.
 */
async function processWithGradio(imageBlob) {
  try {
    const pick = Math.floor(Math.random() * clients.length);
    app = await client(clients[pick]);

    const uploaded = await handle_file(imageBlob);
    const result = await app.predict("/process_image", [uploaded, "More Detailed Caption"]);

    const caption = result.data[0];
    if (caption) {
      lastCaption = caption;
    }
  } catch (error) {
    console.error("Error processing with Gradio:", error);
  }
}
|
603 |
|
604 |
+
|
605 |
+
setInterval(updateLatency, 100);
|
|