import datetime
import gc
import os
import time

import gradio as gr
import librosa
import torch
from transformers import Wav2Vec2ForCTC, Wav2Vec2ProcessorWithLM

# Load the pretrained MMS model and the Amharic LM-boosted processor.
# (pyctcdecode must be installed for Wav2Vec2ProcessorWithLM, even though
# it is not imported directly here.)
model = Wav2Vec2ForCTC.from_pretrained("facebook/mms-1b-all")
processor = Wav2Vec2ProcessorWithLM.from_pretrained("jlonsako/mms-1b-all-AmhLM")
model.load_adapter("amh")  # load the Amharic adapter once at startup, not per request

# Define functions

# Convert a time offset into a .sbv timestamp
def format_time(seconds):
    # .sbv expects H:MM:SS.mmm; timedelta renders whole seconds as H:MM:SS,
    # so append the milliseconds ourselves
    ms = int(round((seconds % 1) * 1000))
    return f"{datetime.timedelta(seconds=int(seconds))}.{ms:03d}"

# Convert video/audio into a 16 kHz mono wav file
def preprocessAudio(audioFile):
    # Quote the path so ffmpeg copes with spaces; -ac 1 downmixes to mono
    os.system(f'ffmpeg -y -i "{audioFile.name}" -ac 1 -ar 16000 ./audio.wav')

# Transcribe!!!
def Transcribe(file):
    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    start_time = time.time()

    preprocessAudio(file)
    block_size = 30  # 30-second chunks of audio

    transcripts = []
    # frame_length/hop_length of 16000 samples is 1 s at 16 kHz, so each
    # block of 30 frames covers 30 s of audio
    stream = librosa.stream(
        "./audio.wav",
        block_length=block_size,
        frame_length=16000,
        hop_length=16000
    )

    model.to(device)
    print(f"Model loaded to {device}: Entering transcription phase")

    # Code for timestamping
    encoding_start = 0
    sbv_file = open("subtitle.sbv", "w")

    for speech_segment in stream:
        if speech_segment.ndim > 1:
            # Average the channels rather than summing them, which can clip
            speech_segment = speech_segment.mean(axis=1)
        input_values = processor(
            speech_segment, sampling_rate=16_000, return_tensors="pt"
        ).input_values.to(device)
        with torch.no_grad():
            logits = model(input_values).logits
        # batch_decode runs the pyctcdecode beam search with the Amharic LM
        transcription = processor.batch_decode(logits.cpu().numpy()).text
        transcripts.append(transcription[0])

        # Generate timestamps; the final block may be shorter than
        # block_size, so measure the real duration of this segment
        encoding_end = encoding_start + len(speech_segment) / 16000
        formatted_start = format_time(encoding_start)
        formatted_end = format_time(encoding_end)

        # Write to the .sbv file
        sbv_file.write(f"{formatted_start},{formatted_end}\n")
        sbv_file.write(f"{transcription[0]}\n\n")
        encoding_start = encoding_end

        # Free memory between blocks
        del input_values
        del logits
        del transcription
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
        gc.collect()

    # Join all transcripts into a single transcript
    transcript = ' '.join(transcripts)
    sbv_file.close()
    end_time = time.time()
    os.remove("./audio.wav")
    print(f"The script ran for {end_time - start_time} seconds.")
    return "./subtitle.sbv"

demo = gr.Interface(fn=Transcribe, inputs=gr.File(), outputs="file")
# Alternative Blocks UI, commented out:
# with gr.Blocks() as demo:
#     file_output = gr.Textbox()
#     upload_button = gr.UploadButton("Click to Upload a sermon",
#         file_types=["video", "audio"], file_count="multiple")
#     upload_button.upload(Transcribe, upload_button, file_output)
demo.launch()
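
# Usage sketch (assumptions: the script is saved as app.py and ffmpeg is on
# the PATH; kenlm and pyctcdecode are required by Wav2Vec2ProcessorWithLM):
#   pip install torch transformers gradio librosa soundfile pyctcdecode kenlm
#   python app.py
# Gradio prints a local URL; upload an audio or video file there and the app
# returns subtitle.sbv with timestamped 30-second captions.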