# speech-analyzer / optimized.py
import base64
import cv2
import face_recognition
import gradio as gr
import moviepy.editor as mp
import os
import time
import torchaudio
from fastai.vision.all import load_learner
from transformers import WhisperProcessor, WhisperForConditionalGeneration, pipeline
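
# Text emotion and sentiment classifiers, created once at module load so they are reused across requests.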
emotion_pipeline = pipeline("text-classification", model="cardiffnlp/twitter-roberta-base-emotion")
sentiment_pipeline = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english")
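
# Pre-trained fastai gaze classifier; process_frame below feeds it 128x128 grayscale face crops
# and expects the labels 'on_camera' / 'off_camera' counted in video_processing.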
model = load_learner("gaze-recognizer-v3.pkl")
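
# Extract the audio track from a video and write it to "audio.wav" for transcription.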
def extract_audio(video_path):
    clip = mp.VideoFileClip(video_path)
    clip.audio.write_audiofile("audio.wav")
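
# Thin wrappers around the Hugging Face pipelines; each returns a list of {'label', 'score'} dicts.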
def analyze_emotion(text):
    result = emotion_pipeline(text)
    return result
def analyze_sentiment(text):
    result = sentiment_pipeline(text)
    return result
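
# Transcribe the video's audio with openai/whisper-tiny: extract the audio, resample to 16 kHz, then decode.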
def get_transcription(path):
    extract_audio(path)
    waveform, sample_rate = torchaudio.load("audio.wav")
    # Whisper expects 16 kHz audio
    resampler = torchaudio.transforms.Resample(sample_rate, 16000)
    waveform = resampler(waveform)[0]
    processor = WhisperProcessor.from_pretrained("openai/whisper-tiny")
    # Use a distinct local name so the global fastai gaze model is not shadowed
    whisper_model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny")
    whisper_model.config.forced_decoder_ids = None
    input_features = processor(waveform.squeeze(dim=0), sampling_rate=16000, return_tensors="pt").input_features
    predicted_ids = whisper_model.generate(input_features)
    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)
    return transcription[0]
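
# Detect faces in a single frame and classify the first detected face with the fastai gaze model;
# returns the predicted label, or None if no face is found.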
def process_frame(frame):
    gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
    face_locations = face_recognition.face_locations(gray)
    if len(face_locations) > 0:
        for top, right, bottom, left in face_locations:
            face_image = gray[top:bottom, left:right]
            resized_face_image = cv2.resize(face_image, (128, 128))
            result = model.predict(resized_face_image)
            return result[0]
    return None
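
# Main entry point: accepts a file path or a base64-encoded video, transcribes the audio,
# samples roughly one frame every three seconds (24 fps assumed) to count on/off-camera gaze,
# and analyzes the emotion of the transcription.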
def video_processing(video_file, encoded_video):
    # A base64-encoded video takes precedence over the file path argument
    if encoded_video != "":
        decoded_file_data = base64.b64decode(encoded_video)
        with open("temp_video.mp4", "wb") as f:
            f.write(decoded_file_data)
        video_file = "temp_video.mp4"

    transcription = get_transcription(video_file)
    print(transcription)

    video_capture = cv2.VideoCapture(video_file)
    on_camera = 0
    off_camera = 0
    total = 0
    emotions = []

    while True:
        # Skip ahead roughly 3 seconds of video (24 fps assumed); only the last frame read is analyzed
        for _ in range(24 * 3):
            ret, frame = video_capture.read()
            if not ret:
                break
        if not ret:
            break

        result = process_frame(frame)
        if result:
            if result == 'on_camera':
                on_camera += 1
            elif result == 'off_camera':
                off_camera += 1
            total += 1

    # The transcription is the same for every sampled frame, so its emotion is analyzed once
    emotion_results = analyze_emotion(transcription)
    emotions.append(emotion_results)

    video_capture.release()
    cv2.destroyAllWindows()

    if os.path.exists("temp_video.mp4"):
        os.remove("temp_video.mp4")

    gaze_percentage = on_camera / total * 100 if total > 0 else 0
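
# Illustrative usage (hypothetical file name):
#   video_processing("example.mp4", "")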