import soundfile as sf
import datetime
from pyctcdecode import BeamSearchDecoderCTC
import torch
import os
import time
import gc
import gradio as gr
import librosa
from transformers import Wav2Vec2ForCTC, Wav2Vec2ProcessorWithLM, AutoModelForSeq2SeqLM, AutoTokenizer
from numba import cuda
from google.cloud import translate  # Cloud Translation API v3 client, required by translate_text

# load pretrained model
model = Wav2Vec2ForCTC.from_pretrained("facebook/mms-1b-all")
processor = Wav2Vec2ProcessorWithLM.from_pretrained("jlonsako/mms-1b-all-AmhLM")

#Define Functions

#convert a time offset in seconds into an .sbv timestamp
def format_time(seconds):
    # str(timedelta) yields H:MM:SS[.ffffff]; SBV timestamps keep '.' before the
    # fractional part and use ',' only to separate the start and end times
    return str(datetime.timedelta(seconds=seconds))

#function to send text strings to be translated into english
def translate_text(
    text: str = "ሃሌሉያ አሁን ድምፅ የሚወጣል ጥሩ እግዛብሔርን እንዴት",
    project_id: str = "noble-feat-390914"
) -> translate.TranslateTextResponse:
    """Translate Amharic text to English."""

    client = translate.TranslationServiceClient()
    location = "global"
    parent = f"projects/{project_id}/locations/{location}"

    # Translate text from Amharic to English
    # Detail on supported types can be found here:
    # https://cloud.google.com/translate/docs/supported-formats
    response = client.translate_text(
        request={
            "parent": parent,
            "contents": [text],
            "mime_type": "text/plain",  # mime types: text/plain, text/html
            "source_language_code": "am",
            "target_language_code": "en",
        }
    )

    # Display the translation for each input text provided
    #for translation in response.translations:
    #    print(f"Translated text: {translation.translated_text}")
    return response

#Convert Video/Audio into 16K wav file
def preprocessAudio(audioFile):
    os.system(f'ffmpeg -y -i "{audioFile.name}" -ar 16000 ./audio.wav')

#Transcribe!!!
def Transcribe(file):
    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    start_time = time.time()
    model.load_adapter("amh")

    preprocessAudio(file)
    #os.system(f"ffmpeg -y -i ./July3_2023_Sermon.mov -ar 16000 ./audio.wav")

    block_size = 30  # seconds of audio per block

    transcripts = []
    # Stream the wav file in non-overlapping 30 s blocks at 16 kHz
    stream = librosa.stream(
        "./audio.wav",
        block_length=block_size,
        frame_length=16000,
        hop_length=16000
    )

    model.to(device)
    print(f"Model loaded to {device}: entering transcription phase")

    #Code for timestamping
    encoding_start = 0
    sbv_file = open("subtitle.sbv", "w")

    for speech_segment in stream:
        if len(speech_segment.shape) > 1:
            # Downmix stereo to mono by averaging the two channels
            speech_segment = (speech_segment[:, 0] + speech_segment[:, 1]) / 2
        input_values = processor(speech_segment, sampling_rate=16_000, return_tensors="pt").input_values.to(device)
        with torch.no_grad():
            logits = model(input_values).logits
        if len(logits.shape) == 1:
            # Ensure the logits have a batch dimension before decoding
            logits = logits.unsqueeze(0)
        transcription = processor.batch_decode(logits.cpu().numpy()).text
        transcripts.append(transcription[0])

        #Generate timestamps
        encoding_end = encoding_start + block_size
        formatted_start = format_time(encoding_start)
        formatted_end = format_time(encoding_end)

        #Write to the .sbv file
        sbv_file.write(f"{formatted_start},{formatted_end}\n")
        sbv_file.write(f"{transcription[0]}\n\n")
        encoding_start = encoding_end

        # Freeing up memory
        del input_values
        del logits
        del transcription
        torch.cuda.empty_cache()
        gc.collect()

    # Join all transcripts into a single transcript
    transcript = ' '.join(transcripts)
    sbv_file.close()

    end_time = time.time()
    os.remove("./audio.wav")
    print(f"The script ran for {end_time - start_time} seconds.")
    return "subtitle.sbv"

demo = gr.Interface(fn=Transcribe, inputs=gr.File(), outputs=gr.File())
#with gr.Blocks() as demo:
#    file_output = gr.Textbox()
#    upload_button = gr.UploadButton("Click to Upload a sermon",
#        file_types=["video", "audio"], file_count="multiple")
#    upload_button.upload(Transcribe, upload_button, file_output)
demo.launch()