KingNish commited on
Commit
8580e13
·
verified ·
1 Parent(s): 194daa5

Update script1.js

Browse files
Files changed (1) hide show
  1. script1.js +68 -384
script1.js CHANGED
@@ -1,11 +1,9 @@
1
- // script1.js
2
-
3
  // Constants and Configuration
4
  const USER_SPEECH_INTERRUPT_DELAY = 500;
5
  const TEXT_TO_SPEECH_API_ENDPOINT = "https://api.streamelements.com/kappa/v2/speech";
6
  const CHUNK_SIZE = 300;
7
- const MAX_PREFETCH_REQUESTS = 5;
8
- const PREFETCH_CACHE_EXPIRATION = 30000; // 30 seconds
9
  const AUDIO_CACHE_EXPIRATION = 3600000; // 1 hour
10
 
11
  // DOM Elements
@@ -17,6 +15,7 @@ const responseTimeDisplay = document.getElementById('responseTime');
17
  const userActivityIndicator = document.getElementById('userIndicator');
18
  const aiActivityIndicator = document.getElementById('aiIndicator');
19
  const transcriptDiv = document.getElementById('transcript');
 
20
 
21
  // Speech Recognition
22
  let speechRecognizer;
@@ -36,7 +35,8 @@ let audioPlaybackQueue = [];
36
 
37
  // Prefetching and Caching
38
  const prefetchCache = new Map();
39
- const pendingPrefetchRequests = new Set();
 
40
  let prefetchTextQuery = "";
41
 
42
  // Conversation History
@@ -45,9 +45,21 @@ let conversationHistory = [];
45
  // Audio Caching
46
  const audioCache = new Map();
47
 
 
 
 
 
 
 
 
 
 
 
 
 
48
  // Utility Functions
49
 
50
- // Normalize query text
51
  const normalizeQueryText = query => query.trim().toLowerCase().replace(/[^\w\s]/g, '');
52
 
53
  // Generate a cache key
@@ -57,7 +69,6 @@ const generateCacheKey = (normalizedQuery, voice, history, modelName) =>
57
  // Update activity indicators
58
  const updateActivityIndicators = (state = null) => {
59
  userActivityIndicator.textContent = isUserSpeaking ? "User: Speaking" : "User: Idle";
60
-
61
  if (isRequestInProgress && !currentAudio) {
62
  aiActivityIndicator.textContent = "AI: Processing...";
63
  } else if (currentAudio && !isUserSpeaking) {
@@ -92,11 +103,6 @@ const addToConversationHistory = (role, content) => {
92
  if (conversationHistory.length > 6) conversationHistory.splice(0, 2);
93
  };
94
 
95
- // Check if audio playback should be interrupted
96
- const shouldInterruptAudioPlayback = (interimTranscript) =>
97
- Date.now() - lastUserSpeechTimestamp > USER_SPEECH_INTERRUPT_DELAY || interimTranscript.length > 5;
98
-
99
-
100
  // Audio Management Functions
101
 
102
  // Play audio from the queue
@@ -124,319 +130,97 @@ const playNextAudio = async () => {
124
  }
125
  };
126
 
127
- // Interrupt audio playback
128
- const interruptAudioPlayback = (reason = 'unknown') => {
129
- console.log(`Interrupting audio (reason: ${reason})...`);
130
- if (currentAudio) {
131
- currentAudio.pause();
132
- currentAudio.currentTime = 0;
133
- currentAudio = null;
134
- }
135
-
136
- audioPlaybackQueue.length = 0;
137
- isRequestInProgress = false;
138
-
139
- if (requestAbortController) {
140
- requestAbortController.abort();
141
- requestAbortController = null;
142
- }
143
-
144
- prefetchCache.clear();
145
- updateActivityIndicators();
146
- };
147
-
148
-
149
  // Prefetching and Caching Functions
150
 
151
  // Prefetch and cache the first TTS audio chunk
152
- const prefetchFirstAudioChunk = async (query, voice) => {
153
  const normalizedQuery = normalizeQueryText(query);
154
  const cacheKey = generateCacheKey(normalizedQuery, voice, conversationHistory, modelSelectionDropdown.value);
155
 
156
  if (pendingPrefetchRequests.has(cacheKey) || prefetchCache.has(cacheKey)) return;
157
 
158
- pendingPrefetchRequests.add(cacheKey);
159
-
160
- try {
161
- const firstAudioUrl = await streamAndPrefetchAudio(query, voice);
162
- if (firstAudioUrl) prefetchCache.set(cacheKey, { url: firstAudioUrl, timestamp: Date.now() });
163
- } catch (error) {
164
- if (error.name !== 'AbortError') console.error("Error prefetching audio:", error);
165
- } finally {
166
- pendingPrefetchRequests.delete(cacheKey);
167
- }
168
- };
169
-
170
-
171
- // Cancel pending prefetch requests
172
- const cancelPrefetchRequests = (query) => {
173
- const normalizedQuery = normalizeQueryText(query);
174
- for (const key of pendingPrefetchRequests) {
175
- if (key.startsWith(normalizedQuery)) {
176
- pendingPrefetchRequests.delete(key);
177
- // Implement abort logic if needed for your fetch implementation
178
- }
179
- }
180
  };
181
 
182
-
183
- // AI Interaction Functions
184
-
185
- // Send a query to the AI
186
- async function sendQueryToAI(query) {
187
- console.log("Sending query to AI:", query);
188
- isRequestInProgress = true;
189
- updateActivityIndicators();
190
- firstResponseTextTimestamp = null;
191
-
192
- const normalizedQuery = normalizeQueryText(query);
193
- const cacheKey = generateCacheKey(normalizedQuery, voiceSelectionDropdown.value, conversationHistory, modelSelectionDropdown.value);
194
-
195
- queryStartTime = Date.now();
196
-
197
- // Check prefetch cache
198
- if (prefetchCache.has(cacheKey)) {
199
- const cachedData = prefetchCache.get(cacheKey);
200
- if (Date.now() - cachedData.timestamp < PREFETCH_CACHE_EXPIRATION) {
201
- audioPlaybackQueue.push({ url: cachedData.url, isPrefetched: true });
202
- playNextAudio();
203
- } else {
204
- prefetchCache.delete(cacheKey);
205
- }
206
- }
207
-
208
- requestAbortController = new AbortController();
209
-
210
  try {
211
- const combinedQuery = `{USER: "${query}"}, ${lastCaption}, {USER: "${query}"}`;
212
- await streamAndHandleAudioResponse(combinedQuery, voiceSelectionDropdown.value, requestAbortController.signal);
 
213
  } catch (error) {
214
- if (error.name !== 'AbortError') {
215
- console.error("Error sending query to AI:", error);
216
- }
217
- } finally {
218
- isRequestInProgress = false;
219
- updateActivityIndicators();
220
  }
221
  };
222
 
223
- // Process the final speech transcript
224
- const processSpeechTranscript = (transcript) => {
225
- const trimmedTranscript = transcript.trimStart();
226
- if (trimmedTranscript !== '' && !isRequestInProgress) {
227
- activeQuery = trimmedTranscript;
228
- sendQueryToAI(activeQuery);
229
- addToConversationHistory('user', activeQuery);
230
  }
231
  };
232
 
 
 
233
 
234
- // Network and Streaming Functions
235
-
236
- // Stream AI response and handle audio
237
- const streamAndHandleAudioResponse = async (query, voice, abortSignal) => {
238
- const response = await fetchAIResponse(query, abortSignal);
239
-
240
- if (!response.ok) {
241
- if (response.status === 429) {
242
- console.log("Rate limit hit, retrying in 1 second...");
243
- await new Promise(resolve => setTimeout(resolve, 1000));
244
- await sendQueryToAI(query);
245
- return;
246
- }
247
- throw new Error(`Network response was not ok: ${response.status}`);
248
- }
249
-
250
- console.log("Streaming audio response received");
251
- await handleStreamingResponse(response.body, voice, abortSignal);
252
- };
253
-
254
- // Stream AI response for prefetching
255
- const streamAndPrefetchAudio = async (query, voice) => {
256
- const response = await fetchAIResponse(query, undefined);
257
-
258
- if (!response.ok) throw new Error('Network response was not ok');
259
-
260
- return handleStreamingResponseForPrefetch(response.body, voice);
261
- };
262
-
263
- // Fetch AI response
264
- const fetchAIResponse = async (query, abortSignal) => {
265
- const userSambanovaKey = document.getElementById('apiKey').value.trim() !== '' ? document.getElementById('apiKey').value.trim() : 'none';
266
-
267
- const url = '/stream_text';
268
- const requestBody = {
269
- query: query,
270
- history: JSON.stringify(conversationHistory),
271
- model: modelSelectionDropdown.value,
272
- api_key: userSambanovaKey
273
- };
274
-
275
- return fetch(url, {
276
- method: 'POST',
277
- headers: {
278
- 'Accept': 'text/event-stream',
279
- 'Content-Type': 'application/json'
280
- },
281
- body: JSON.stringify(requestBody),
282
- signal: abortSignal
283
- });
284
- };
285
-
286
- // Handle the streaming response for prefetching
287
- const handleStreamingResponseForPrefetch = async (responseStream, voice) => {
288
- const reader = responseStream.getReader();
289
- const decoder = new TextDecoder("utf-8");
290
- let buffer = "";
291
-
292
- try {
293
- while (true) {
294
- const { done, value } = await reader.read();
295
- if (done) break;
296
-
297
- const chunk = decoder.decode(value, { stream: true });
298
- buffer += chunk;
299
- const lines = buffer.split('\n');
300
-
301
- for (let i = 0; i < lines.length - 1; i++) {
302
- const line = lines[i];
303
- if (line.startsWith('data: ')) {
304
- const textContent = line.substring(6).trim();
305
- if (textContent) {
306
- return await generateTextToSpeechAudio(textContent, voice);
307
- }
308
- }
309
- }
310
-
311
- buffer = lines[lines.length - 1];
312
- }
313
- } catch (error) {
314
- console.error("Error in handleStreamingResponseForPrefetch:", error);
315
- } finally {
316
- reader.releaseLock();
317
- }
318
 
319
- return null;
 
320
  };
321
 
322
- // Handle the streaming audio response
323
- const handleStreamingResponse = async (responseStream, voice, abortSignal) => {
324
- const reader = responseStream.getReader();
325
- const decoder = new TextDecoder("utf-8");
326
- let buffer = "";
327
- let fullResponseText = "";
328
- let fullResponseText2 = "";
329
- let textChunk = "";
330
- let sentText = "";
331
-
332
  try {
333
- while (true) {
334
- const { done, value } = await reader.read();
335
- if (done) break;
336
- if (abortSignal.aborted) throw new DOMException('Request aborted', 'AbortError');
337
-
338
- if (isUserSpeaking) {
339
- interruptAudioPlayback('user is speaking');
340
- break;
341
- }
342
 
343
- const chunk = decoder.decode(value, { stream: true });
344
- buffer += chunk;
345
- const lines = buffer.split('\n');
346
-
347
- for (let i = 0; i < lines.length - 1; i++) {
348
- const line = lines[i];
349
- if (line.startsWith('data: ')) {
350
- const textContent = line.substring(6).trim();
351
- if (textContent) {
352
- if (!firstResponseTextTimestamp) firstResponseTextTimestamp = Date.now();
353
-
354
- fullResponseText += textContent + " ";
355
- fullResponseText2 += textContent + " ";
356
- textChunk += textContent + " ";
357
- transcriptDiv.textContent = fullResponseText2;
358
-
359
- const audioUrl = await generateTextToSpeechAudio(textContent, voice); // Call TTS immediately
360
- if (audioUrl) {
361
- audioPlaybackQueue.push({ url: audioUrl, isPrefetched: false });
362
- if (!currentAudio) playNextAudio();
363
- }
364
-
365
- if (fullResponseText !== '') {
366
- fullResponseText = '';
367
- }
368
- }
369
- }
370
- }
371
 
372
- buffer = lines[lines.length - 1];
373
- }
374
  } catch (error) {
375
- console.error("Error in handleStreamingResponse:", error);
376
- } finally {
377
- reader.releaseLock();
378
-
379
- if (fullResponseText2 !== '') {
380
- addToConversationHistory('assistant', fullResponseText2);
381
- fullResponseText2 = '';
382
- }
383
  }
384
  };
385
 
386
- // Generate Text-to-Speech audio with caching
387
- const generateTextToSpeechAudio = async (text, voice) => {
388
- const normalizedText = normalizeQueryText(text);
389
- const cacheKey = `${normalizedText}-${voice}`;
390
-
391
- if (audioCache.has(cacheKey)) {
392
- const cachedData = audioCache.get(cacheKey);
393
- if (Date.now() - cachedData.timestamp < AUDIO_CACHE_EXPIRATION) {
394
- return cachedData.url;
395
- } else {
396
- audioCache.delete(cacheKey);
397
- }
398
  }
 
399
 
400
- try {
401
- const response = await fetch(`${TEXT_TO_SPEECH_API_ENDPOINT}?voice=${voice}&text=${encodeURIComponent(text)}`, { method: 'GET' });
402
- if (!response.ok) throw new Error('Network response was not ok');
403
- const audioBlob = await response.blob();
404
- const audioUrl = URL.createObjectURL(audioBlob);
405
-
406
- audioCache.set(cacheKey, { url: audioUrl, timestamp: Date.now() });
407
- return audioUrl;
408
- } catch (error) {
409
- console.error("Error generating TTS audio:", error);
410
- return null;
411
  }
412
- };
413
-
414
 
415
  // Speech Recognition Initialization
416
-
417
  if ('webkitSpeechRecognition' in window) {
418
  speechRecognizer = new webkitSpeechRecognition();
419
- Object.assign(speechRecognizer, {
420
- continuous: true,
421
- interimResults: true,
422
- language: 'en-US',
423
- maxAlternatives: 3
424
- });
425
-
426
- speechRecognizer.onstart = () => {
427
- console.log("Speech recognition started");
428
- isUserSpeaking = true;
429
- lastUserSpeechTimestamp = Date.now();
430
- updateActivityIndicators();
431
- startStopButton.innerHTML = '<svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M9 9h6v6h-6z"></path><path d="M12 1a3 3 0 0 0-3 3v8a3 3 0 0 0 6 0V4a3 3 0 0 0-3-3z"></path><path d="M19 10v2a7 7 0 0 1-14 0v-2"></path><line x1="12" y1="19" x2="12" y2="23"></line><line x1="8" y1="23" x2="16" y2="23"></line></svg> Stop Listening';
432
- };
433
 
434
  speechRecognizer.onresult = (event) => {
435
  let interimTranscript = '';
436
  for (let i = event.resultIndex; i < event.results.length; i++) {
437
  const transcript = event.results[i][0].transcript;
438
  if (event.results[i].isFinal) {
439
- interruptAudioPlayback('final');
440
  processSpeechTranscript(transcript);
441
  isUserSpeaking = false;
442
  updateActivityIndicators();
@@ -444,114 +228,14 @@ if ('webkitSpeechRecognition' in window) {
444
  } else {
445
  interimTranscript += transcript;
446
  isUserSpeaking = true;
447
- lastUserSpeechTimestamp = Date.now();
448
  updateActivityIndicators();
449
-
450
- if (interimTranscript.length > prefetchTextQuery.length + 5) {
451
- cancelPrefetchRequests(prefetchTextQuery);
452
- }
453
- prefetchTextQuery = interimTranscript;
454
- prefetchFirstAudioChunk(interimTranscript, voiceSelectionDropdown.value);
455
-
456
- if (isRequestInProgress && shouldInterruptAudioPlayback(interimTranscript)) {
457
- interruptAudioPlayback('interim');
458
- }
459
  }
460
  }
461
  };
462
-
463
- speechRecognizer.onerror = (event) => {
464
- console.error('Speech recognition error:', event.error);
465
- if (isSpeechRecognitionActive) speechRecognizer.start();
466
- };
467
-
468
- speechRecognizer.onend = () => {
469
- isUserSpeaking = false;
470
- updateActivityIndicators();
471
-
472
- if (isSpeechRecognitionActive) speechRecognizer.start();
473
- };
474
-
475
- startStopButton.addEventListener('click', () => {
476
- if (isSpeechRecognitionActive) {
477
- speechRecognizer.stop();
478
- isSpeechRecognitionActive = false;
479
- startStopButton.innerHTML = '<svg id="microphoneIcon" xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M12 1a3 3 0 0 0-3 3v8a3 3 0 0 0 6 0V4a3 3 0 0 0-3-3z"></path><path d="M19 10v2a7 7 0 0 1-14 0v-2"></path><line x1="12" y1="19" x2="12" y2="23"></line><line x1="8" y1="23" x2="16" y2="23"></line></svg> Start Listening';
480
- clearInterval(imageCaptureInterval); // Stop webcam processing
481
- } else {
482
- speechRecognizer.start();
483
- isSpeechRecognitionActive = true;
484
- startStopButton.innerHTML = '<svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M9 9h6v6h-6z"></path><path d="M12 1a3 3 0 0 0-3 3v8a3 3 0 0 0 6 0V4a3 3 0 0 0-3-3z"></path><path d="M19 10v2a7 7 0 0 1-14 0v-2"></path><line x1="12" y1="19" x2="12" y2="23"></line><line x1="8" y1="23" x2="16" y2="23"></line></svg> Stop Listening';
485
- imageCaptureInterval = setInterval(captureAndProcessImage, 5000); // Start webcam processing
486
- }
487
- });
488
- } else {
489
- alert('Your browser does not support the Web Speech API.');
490
  }
491
 
492
  setInterval(updateLatency, 100);
493
 
494
-
495
-
496
- // Webcam Integration
497
- import { client, handle_file } from 'https://cdn.jsdelivr.net/npm/@gradio/client/+esm';
498
-
499
- let app;
500
- let lastCaption = "";
501
-
502
- const clients = [
503
- "multimodalart/Florence-2-l4",
504
- "gokaygokay/Florence-2",
505
- "multimodalart/Florence-2-l4-2",
506
- "gokaygokay/Florence-2",
507
- ];
508
-
509
- async function startWebcam() {
510
- try {
511
- const stream = await navigator.mediaDevices.getUserMedia({ video: true });
512
- webcamVideo.srcObject = stream;
513
- } catch (error) {
514
- console.error("Error accessing webcam: ", error);
515
- }
516
- }
517
-
518
- async function captureAndProcessImage() {
519
- const canvas = document.createElement('canvas');
520
- canvas.width = webcamVideo.videoWidth;
521
- canvas.height = webcamVideo.videoHeight;
522
- const context = canvas.getContext('2d');
523
- context.drawImage(webcamVideo, 0, 0, canvas.width, canvas.height);
524
-
525
- const blob = await new Promise(resolve => canvas.toBlob(resolve, 'image/png'));
526
- await processWithGradio(blob);
527
- }
528
-
529
- async function processWithGradio(imageBlob) {
530
- try {
531
- const randomClient = clients[Math.floor(Math.random() * clients.length)];
532
- app = await client(randomClient);
533
- const handledFile = await handle_file(imageBlob);
534
-
535
- const result = await app.predict("/process_image", [handledFile, "Detailed Caption"]);
536
-
537
- const dataString = result.data[0];
538
- lastCaption = dataString || lastCaption;
539
- } catch (error) {
540
- console.error("Error processing with Gradio:", error);
541
- }
542
- }
543
-
544
- let imageCaptureInterval; // Declare interval outside the event listener
545
-
546
  window.onload = () => {
547
  startWebcam();
548
-
549
- startStopButton.addEventListener('click', () => {
550
- // ... (start/stop speech recognition and webcam captioning)
551
- if (isSpeechRecognitionActive) {
552
- clearInterval(imageCaptureInterval); // Stop webcam processing
553
- } else {
554
- imageCaptureInterval = setInterval(captureAndProcessImage, 5000); // Start webcam processing
555
- }
556
- });
557
- };
 
 
 
1
  // Constants and Configuration
2
  const USER_SPEECH_INTERRUPT_DELAY = 500;
3
  const TEXT_TO_SPEECH_API_ENDPOINT = "https://api.streamelements.com/kappa/v2/speech";
4
  const CHUNK_SIZE = 300;
5
+ const MAX_PREFETCH_REQUESTS = 10;
6
+ const PREFETCH_CACHE_EXPIRATION = 60000; // 1 minute
7
  const AUDIO_CACHE_EXPIRATION = 3600000; // 1 hour
8
 
9
  // DOM Elements
 
15
  const userActivityIndicator = document.getElementById('userIndicator');
16
  const aiActivityIndicator = document.getElementById('aiIndicator');
17
  const transcriptDiv = document.getElementById('transcript');
18
+ const webcamToggleButton = document.getElementById('webcamToggle');
19
 
20
  // Speech Recognition
21
  let speechRecognizer;
 
35
 
36
  // Prefetching and Caching
37
  const prefetchCache = new Map();
38
+ const pendingPrefetchRequests = new Map();
39
+ const prefetchQueue = [];
40
  let prefetchTextQuery = "";
41
 
42
  // Conversation History
 
45
  // Audio Caching
46
  const audioCache = new Map();
47
 
48
+ // Webcam
49
+ let isWebcamActive = false;
50
+ let app;
51
+ let lastCaption = "";
52
+
53
+ const clients = [
54
+ "multimodalart/Florence-2-l4",
55
+ "gokaygokay/Florence-2",
56
+ "multimodalart/Florence-2-l4-2",
57
+ "gokaygokay/Florence-2",
58
+ ];
59
+
60
  // Utility Functions
61
 
62
+ // Normalize query text
63
  const normalizeQueryText = query => query.trim().toLowerCase().replace(/[^\w\s]/g, '');
64
 
65
  // Generate a cache key
 
69
  // Update activity indicators
70
  const updateActivityIndicators = (state = null) => {
71
  userActivityIndicator.textContent = isUserSpeaking ? "User: Speaking" : "User: Idle";
 
72
  if (isRequestInProgress && !currentAudio) {
73
  aiActivityIndicator.textContent = "AI: Processing...";
74
  } else if (currentAudio && !isUserSpeaking) {
 
103
  if (conversationHistory.length > 6) conversationHistory.splice(0, 2);
104
  };
105
 
 
 
 
 
 
106
  // Audio Management Functions
107
 
108
  // Play audio from the queue
 
130
  }
131
  };
132
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
133
  // Prefetching and Caching Functions
134
 
135
  // Prefetch and cache the first TTS audio chunk
136
+ const prefetchFirstAudioChunk = (query, voice) => {
137
  const normalizedQuery = normalizeQueryText(query);
138
  const cacheKey = generateCacheKey(normalizedQuery, voice, conversationHistory, modelSelectionDropdown.value);
139
 
140
  if (pendingPrefetchRequests.has(cacheKey) || prefetchCache.has(cacheKey)) return;
141
 
142
+ prefetchQueue.push({ query: query.trim(), voice, cacheKey });
143
+ processPrefetchQueue();
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
144
  };
145
 
146
+ // Webcam Integration Functions
147
+ const startWebcam = async () => {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
148
  try {
149
+ const stream = await navigator.mediaDevices.getUserMedia({ video: true });
150
+ document.getElementById('webcam').srcObject = stream;
151
+ setInterval(captureAndProcessImage, 5000);
152
  } catch (error) {
153
+ console.error("Error accessing webcam: ", error);
 
 
 
 
 
154
  }
155
  };
156
 
157
+ const stopWebcam = () => {
158
+ const stream = document.getElementById('webcam').srcObject;
159
+ if (stream) {
160
+ const tracks = stream.getTracks();
161
+ tracks.forEach(track => track.stop());
 
 
162
  }
163
  };
164
 
165
+ const captureAndProcessImage = async () => {
166
+ if (!isWebcamActive) return;
167
 
168
+ const canvas = document.createElement('canvas');
169
+ const video = document.getElementById('webcam');
170
+ canvas.width = video.videoWidth;
171
+ canvas.height = video.videoHeight;
172
+ const context = canvas.getContext('2d');
173
+ context.drawImage(video, 0, 0, canvas.width, canvas.height);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
174
 
175
+ const blob = await new Promise(resolve => canvas.toBlob(resolve, 'image/png'));
176
+ await processWithGradio(blob);
177
  };
178
 
179
+ const processWithGradio = async (imageBlob) => {
 
 
 
 
 
 
 
 
 
180
  try {
181
+ const randomClient = clients[Math.floor(Math.random() * clients.length)];
182
+ app = await client(randomClient);
183
+ const handledFile = await handle_file(imageBlob);
 
 
 
 
 
 
184
 
185
+ const result = await app.predict("/process_image", [handledFile, "Detailed Caption"]);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
186
 
187
+ const dataString = result.data[0];
188
+ lastCaption = dataString || lastCaption;
189
  } catch (error) {
190
+ console.error("Error processing with Gradio:", error);
 
 
 
 
 
 
 
191
  }
192
  };
193
 
194
+ // Event Listeners
195
+ startStopButton.addEventListener('click', () => {
196
+ isSpeechRecognitionActive = !isSpeechRecognitionActive;
197
+ if (isSpeechRecognitionActive) {
198
+ speechRecognizer.start();
199
+ } else {
200
+ speechRecognizer.stop();
 
 
 
 
 
201
  }
202
+ });
203
 
204
+ webcamToggleButton.addEventListener('click', () => {
205
+ isWebcamActive = !isWebcamActive;
206
+ if (isWebcamActive) {
207
+ startWebcam();
208
+ } else {
209
+ stopWebcam();
 
 
 
 
 
210
  }
211
+ });
 
212
 
213
  // Speech Recognition Initialization
 
214
  if ('webkitSpeechRecognition' in window) {
215
  speechRecognizer = new webkitSpeechRecognition();
216
+ speechRecognizer.continuous = true;
217
+ speechRecognizer.interimResults = true;
 
 
 
 
 
 
 
 
 
 
 
 
218
 
219
  speechRecognizer.onresult = (event) => {
220
  let interimTranscript = '';
221
  for (let i = event.resultIndex; i < event.results.length; i++) {
222
  const transcript = event.results[i][0].transcript;
223
  if (event.results[i].isFinal) {
 
224
  processSpeechTranscript(transcript);
225
  isUserSpeaking = false;
226
  updateActivityIndicators();
 
228
  } else {
229
  interimTranscript += transcript;
230
  isUserSpeaking = true;
 
231
  updateActivityIndicators();
 
 
 
 
 
 
 
 
 
 
232
  }
233
  }
234
  };
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
235
  }
236
 
237
  setInterval(updateLatency, 100);
238
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
239
  window.onload = () => {
240
  startWebcam();
241
+ };