/*
 * Hugging Face Space: KingNish / Live-Video-Chat (status: Running)
 * Duplicated from KingNish/Voicee
 * File: Live-Video-Chat / script1.js (7.62 kB)
 * Last commit: "Update script1.js" by KingNish — 8580e13, verified, 4 months ago
 */

	// Constants and Configuration
	const USER_SPEECH_INTERRUPT_DELAY = 500;
	const TEXT_TO_SPEECH_API_ENDPOINT = "https://api.streamelements.com/kappa/v2/speech";
	const CHUNK_SIZE = 300;
	const MAX_PREFETCH_REQUESTS = 10;
	// Cache lifetimes, spelled out as seconds * milliseconds so the unit is visible.
	const PREFETCH_CACHE_EXPIRATION = 60 * 1000; // 1 minute
	const AUDIO_CACHE_EXPIRATION = 60 * 60 * 1000; // 1 hour

	// DOM Elements
	// Every binding is looked up once, at script load, by its element id.
	// The two lists are positional: the n-th name receives the element for the n-th id.
	const [
		startStopButton,
		voiceSelectionDropdown,
		modelSelectionDropdown,
		noiseSuppressionCheckbox,
		responseTimeDisplay,
		userActivityIndicator,
		aiActivityIndicator,
		transcriptDiv,
		webcamToggleButton,
	] = [
		'startStopButton',
		'voiceSelect',
		'modelSelect',
		'noiseSuppression',
		'responseTime',
		'userIndicator',
		'aiIndicator',
		'transcript',
		'webcamToggle',
	].map(elementId => document.getElementById(elementId));

	// Speech Recognition
	// Recognizer instance; stays undefined unless the speech-recognition initialization assigns it.
	let speechRecognizer;
	// Flipped on every start/stop button click to track the requested recognition state.
	let isSpeechRecognitionActive = false;

	// AI Interaction State
	// NOTE(review): not referenced in the visible code — presumably the query currently being answered; confirm.
	let activeQuery = null;
	// Date.now() value recorded when a final speech result arrives; baseline for the latency display.
	let queryStartTime = 0;
	// Read by updateActivityIndicators: shows "AI: Processing..." while this is true and no audio is set.
	let isRequestInProgress = false;
	// Set to true on interim speech results and back to false on a final result.
	let isUserSpeaking = false;
	// NOTE(review): not referenced in the visible code — presumably an AbortController for the in-flight request; confirm.
	let requestAbortController = null;
	// Read by updateLatency; while falsy the display shows "Latency: 0ms".
	let firstResponseTextTimestamp = null;

	// Audio Management
	// Audio element most recently started by playNextAudio.
	let currentAudio = null;
	// Pending clips; playNextAudio shifts entries off the front and reads their `url`.
	let audioPlaybackQueue = [];

	// Prefetching and Caching
	// Keyed by the string from generateCacheKey; checked before a prefetch is queued.
	const prefetchCache = new Map();
	// Keyed by the same cache key; also checked before a prefetch is queued.
	const pendingPrefetchRequests = new Map();
	// Entries of { query, voice, cacheKey } pushed by prefetchFirstAudioChunk.
	const prefetchQueue = [];
	// NOTE(review): not referenced in the visible code; purpose to be confirmed against its users.
	let prefetchTextQuery = "";

	// Conversation History
	// List of { role, content } entries maintained by addToConversationHistory.
	let conversationHistory = [];

	// Audio Caching
	// NOTE(review): not referenced in the visible code; key/value shape to be confirmed against its users.
	const audioCache = new Map();

	// Webcam
	// Flipped by the webcam toggle button; captureAndProcessImage returns early while this is false.
	let isWebcamActive = false;
	// Handle returned by client(...) in processWithGradio; reassigned on every call.
	let app;
	// Most recent non-empty caption obtained by processWithGradio.
	let lastCaption = "";

	// Identifiers passed to client(...) by processWithGradio, which picks one uniformly at random.
	// NOTE(review): "gokaygokay/Florence-2" is listed twice, so it is selected twice as often as
	// each other entry — confirm whether this weighting is intentional.
	const clients = [
	"multimodalart/Florence-2-l4",
	"gokaygokay/Florence-2",
	"multimodalart/Florence-2-l4-2",
	"gokaygokay/Florence-2",
	];

	// Utility Functions

	/**
	 * Normalize query text so that trivially different spellings share one cache key.
	 *
	 * Trims the ends, lower-cases, and removes every character that is not a
	 * letter, combining mark, number, underscore or whitespace. The character
	 * class is Unicode-aware: an ASCII-only `\w` would strip all non-Latin
	 * letters, collapsing e.g. every CJK or Cyrillic query to the same
	 * (often empty) string and making unrelated queries collide in the cache.
	 *
	 * @param {string} query - Raw query text.
	 * @returns {string} The normalized text.
	 */
	const normalizeQueryText = query =>
		query.trim().toLowerCase().replace(/[^\p{L}\p{M}\p{N}_\s]/gu, '');

	/**
	 * Generate a cache key.
	 *
	 * Joins the normalized query, the voice, the JSON-serialized history and the
	 * model name with "-" separators, in that order.
	 *
	 * @param {string} normalizedQuery - Query text, already normalized by the caller.
	 * @param {string} voice - Selected voice.
	 * @param {*} history - Conversation history; serialized with JSON.stringify.
	 * @param {string} modelName - Selected model.
	 * @returns {string} The composed key.
	 */
	const generateCacheKey = (normalizedQuery, voice, history, modelName) => {
		const serializedHistory = JSON.stringify(history);
		return `${normalizedQuery}-${voice}-${serializedHistory}-${modelName}`;
	};

	/**
	 * Update activity indicators.
	 *
	 * Writes the user status from `isUserSpeaking`, then derives the AI status
	 * from `isRequestInProgress`, `currentAudio` and `isUserSpeaking`, checked in
	 * this priority order: processing, speaking, listening, idle.
	 *
	 * @param {?string} [state=null] - Optional label shown instead of
	 *   "AI: Speaking" while audio is set and the user is not speaking.
	 */
	const updateActivityIndicators = (state = null) => {
		userActivityIndicator.textContent = isUserSpeaking ? "User: Speaking" : "User: Idle";

		let aiStatus;
		if (isRequestInProgress && !currentAudio) {
			aiStatus = "AI: Processing...";
		} else if (currentAudio && !isUserSpeaking) {
			// Was the escaped token sequence `\|\|`, which is not valid JavaScript.
			aiStatus = state || "AI: Speaking";
		} else if (isUserSpeaking) {
			aiStatus = "AI: Listening";
		} else {
			aiStatus = "AI: Idle";
		}
		aiActivityIndicator.textContent = aiStatus;
	};

	/**
	 * Update latency display.
	 *
	 * Shows the gap between `queryStartTime` and `firstResponseTextTimestamp`,
	 * or 0 while no first-response timestamp is set.
	 */
	const updateLatency = () => {
		const elapsedMs = firstResponseTextTimestamp
			? firstResponseTextTimestamp - queryStartTime
			: 0;
		responseTimeDisplay.textContent = `Latency: ${elapsedMs}ms`;
	};

	/**
	 * Add to conversation history.
	 *
	 * Drops a trailing assistant entry whose content is the empty string, appends
	 * the new entry, and removes the two oldest entries once the list holds more
	 * than six.
	 *
	 * @param {string} role - Role of the speaker for the new entry.
	 * @param {string} content - Text of the new entry.
	 */
	const addToConversationHistory = (role, content) => {
		const tailIndex = conversationHistory.length - 1;
		const endsWithEmptyAssistantTurn =
			tailIndex >= 0 &&
			conversationHistory[tailIndex].role === 'assistant' &&
			conversationHistory[tailIndex].content === "";

		if (endsWithEmptyAssistantTurn) {
			conversationHistory.pop();
		}

		conversationHistory.push({ role, content });

		if (conversationHistory.length > 6) {
			conversationHistory.splice(0, 2);
		}
	};

	// Audio Management Functions

	/**
	 * Play audio from the queue.
	 *
	 * Takes the next entry off `audioPlaybackQueue`, stops and rewinds whatever
	 * `currentAudio` holds, plays the new clip, waits for it to end or error, and
	 * then starts the following entry. With an empty queue it only refreshes the
	 * activity indicators.
	 *
	 * NOTE(review): a rejected `play()` promise is not caught here, so the chain
	 * stops and the rejection propagates to the caller — confirm that is intended.
	 */
	const playNextAudio = async () => {
		if (audioPlaybackQueue.length === 0) {
			updateActivityIndicators();
			return;
		}

		const { url } = audioPlaybackQueue.shift();
		const nextClip = new Audio(url);
		updateActivityIndicators();

		// Register the completion handlers before playback starts so neither event is missed.
		const clipFinished = new Promise(done => {
			nextClip.onended = done;
			nextClip.onerror = done;
		});

		if (currentAudio) {
			currentAudio.pause();
			currentAudio.currentTime = 0;
		}
		currentAudio = nextClip;

		await nextClip.play();
		await clipFinished;
		// Deliberately not awaited: the returned promise settles after this clip only.
		playNextAudio();
	};

	// Prefetching and Caching Functions

	/**
	 * Prefetch and cache the first TTS audio chunk.
	 *
	 * Builds the cache key from the normalized query, the voice, the current
	 * conversation history and the selected model. If that key is already cached
	 * or already being fetched, nothing happens; otherwise the request is queued
	 * and the prefetch queue is processed.
	 *
	 * @param {string} query - Raw query text; queued trimmed but not normalized.
	 * @param {string} voice - Voice to request the audio with.
	 */
	const prefetchFirstAudioChunk = (query, voice) => {
		const normalizedQuery = normalizeQueryText(query);
		const cacheKey = generateCacheKey(normalizedQuery, voice, conversationHistory, modelSelectionDropdown.value);

		// Was the escaped token sequence `\|\|`, which is not valid JavaScript.
		if (pendingPrefetchRequests.has(cacheKey) || prefetchCache.has(cacheKey)) return;

		prefetchQueue.push({ query: query.trim(), voice, cacheKey });
		processPrefetchQueue();
	};

	// Webcam Integration Functions

	// Id of the periodic capture timer created by startWebcam; null while none is scheduled.
	let webcamCaptureTimerId = null;

	/**
	 * Start the webcam preview and schedule a frame capture every 5 seconds.
	 *
	 * Any stream and capture timer left over from an earlier start are released
	 * first, so repeated starts (e.g. page load followed by the toggle button)
	 * neither stack timers nor leave an orphaned camera stream running.
	 * Failures to obtain the camera are logged, not thrown.
	 *
	 * @returns {Promise<void>}
	 */
	const startWebcam = async () => {
		try {
			const stream = await navigator.mediaDevices.getUserMedia({ video: true });
			// Release whatever a previous start left behind before installing the new stream.
			stopWebcam();
			document.getElementById('webcam').srcObject = stream;
			webcamCaptureTimerId = setInterval(captureAndProcessImage, 5000);
		} catch (error) {
			console.error("Error accessing webcam: ", error);
		}
	};

	/**
	 * Stop the webcam: cancel the periodic capture timer, stop every track of
	 * the stream attached to the preview element, and detach the stream.
	 * Safe to call when nothing is running.
	 */
	const stopWebcam = () => {
		if (webcamCaptureTimerId !== null) {
			clearInterval(webcamCaptureTimerId);
			webcamCaptureTimerId = null;
		}

		const video = document.getElementById('webcam');
		const stream = video.srcObject;
		if (stream) {
			stream.getTracks().forEach(track => track.stop());
			// Detach the ended stream so the element no longer holds on to it.
			video.srcObject = null;
		}
	};

	/**
	 * Capture the current webcam frame as a PNG and send it for captioning.
	 *
	 * Does nothing while `isWebcamActive` is false, while the video element has
	 * no frame dimensions yet (which would produce an empty canvas), or when the
	 * canvas cannot be encoded.
	 *
	 * @returns {Promise<void>}
	 */
	const captureAndProcessImage = async () => {
		if (!isWebcamActive) return;

		const video = document.getElementById('webcam');
		// videoWidth/videoHeight are 0 until the element has frame data to draw.
		if (!video.videoWidth || !video.videoHeight) return;

		const canvas = document.createElement('canvas');
		canvas.width = video.videoWidth;
		canvas.height = video.videoHeight;
		const context = canvas.getContext('2d');
		context.drawImage(video, 0, 0, canvas.width, canvas.height);

		const blob = await new Promise(resolve => canvas.toBlob(resolve, 'image/png'));
		// toBlob hands the callback null when the image could not be created.
		if (!blob) return;

		await processWithGradio(blob);
	};

	/**
	 * Send an image to a randomly chosen entry of `clients` and store the
	 * returned caption in `lastCaption`.
	 *
	 * An empty or missing caption leaves the previous `lastCaption` untouched.
	 * Any failure is logged and swallowed so the periodic capture keeps running.
	 *
	 * @param {Blob} imageBlob - Image to caption.
	 * @returns {Promise<void>}
	 */
	const processWithGradio = async (imageBlob) => {
		try {
			const randomClient = clients[Math.floor(Math.random() * clients.length)];
			app = await client(randomClient);
			const handledFile = await handle_file(imageBlob);

			const result = await app.predict("/process_image", [handledFile, "Detailed Caption"]);

			const dataString = result.data[0];
			// Was the escaped token sequence `\|\|`, which is not valid JavaScript.
			lastCaption = dataString || lastCaption;
		} catch (error) {
			console.error("Error processing with Gradio:", error);
		}
	};

	// Event Listeners

	// Toggle speech recognition on each click of the start/stop button.
	startStopButton.addEventListener('click', () => {
		// speechRecognizer is only assigned when the browser supports speech recognition;
		// without this guard the click would throw a TypeError on `undefined.start()`.
		if (!speechRecognizer) {
			console.warn("Speech recognition is not available; ignoring start/stop request.");
			return;
		}

		const wasActive = isSpeechRecognitionActive;
		isSpeechRecognitionActive = !wasActive;
		try {
			if (isSpeechRecognitionActive) {
				speechRecognizer.start();
			} else {
				speechRecognizer.stop();
			}
		} catch (error) {
			// Keep the flag in step with the recognizer when the call is rejected
			// (e.g. start() on a recognizer that is already started).
			isSpeechRecognitionActive = wasActive;
			console.error("Error toggling speech recognition: ", error);
		}
	});

	// Flip the webcam flag on each click, then start or stop the webcam to match.
	webcamToggleButton.addEventListener('click', () => {
		isWebcamActive = !isWebcamActive;
		const applyWebcamState = isWebcamActive ? startWebcam : stopWebcam;
		applyWebcamState();
	});

	// Speech Recognition Initialization
	// Use the standard constructor when the browser provides it and fall back to
	// the WebKit-prefixed one otherwise.
	const SpeechRecognitionImpl = window.SpeechRecognition || window.webkitSpeechRecognition;

	if (SpeechRecognitionImpl) {
		speechRecognizer = new SpeechRecognitionImpl();
		speechRecognizer.continuous = true;
		speechRecognizer.interimResults = true;

		// Final results are handed to processSpeechTranscript; interim results only
		// mark the user as speaking. The statement order inside each branch is kept
		// as-is because the indicator refresh reads the flag set just before it.
		speechRecognizer.onresult = (event) => {
			for (let i = event.resultIndex; i < event.results.length; i++) {
				const result = event.results[i];
				if (result.isFinal) {
					processSpeechTranscript(result[0].transcript);
					isUserSpeaking = false;
					updateActivityIndicators();
					queryStartTime = Date.now();
				} else {
					isUserSpeaking = true;
					updateActivityIndicators();
				}
			}
		};
	} else {
		console.warn("Speech recognition is not supported in this browser.");
	}

	// Refresh the latency readout every 100 ms.
	const LATENCY_REFRESH_INTERVAL_MS = 100;
	setInterval(updateLatency, LATENCY_REFRESH_INTERVAL_MS);

	// Start the webcam preview as soon as the page has finished loading.
	window.onload = function handleWindowLoad() {
		startWebcam();
	};