Spaces:

jlonsako
/

Amh-Transcribe

Sleeping

App Files Files Community

jtlonsako committed on Jul 11, 2023

Commit

a95e64c

•

1 Parent(s): 2a01fc3

Test 2

Browse files

Files changed (1) hide show

app.py +138 -0

app.py ADDED Viewed

	@@ -0,0 +1,138 @@

+import soundfile as sf
+import datetime
+from pyctcdecode import BeamSearchDecoderCTC
+import torch
+import os
+import time
+import gc
+import gradio as gr
+import librosa
+from transformers import Wav2Vec2ForCTC, Wav2Vec2ProcessorWithLM, AutoModelForSeq2SeqLM, AutoTokenizer
+from GPUtil import showUtilization as gpu_usage
+from numba import cuda
+from google.cloud import translate
+# Load the pretrained acoustic model and its processor once at import time; Transcribe() reuses both.
+model = Wav2Vec2ForCTC.from_pretrained("facebook/mms-1b-all")  # CTC checkpoint; Transcribe() calls load_adapter("amh") on it before use
+processor = Wav2Vec2ProcessorWithLM.from_pretrained("jlonsako/mms-1b-all-AmhLM")  # used by Transcribe() to build input_values and to batch_decode logits
+#Define Functions
+#convert time into .sbv format
+def format_time(seconds):
+    """Render a time offset as an SBV (SubViewer) timestamp.
+
+    SBV timestamps have the form ``H:MM:SS.mmm``: a period before the
+    milliseconds and exactly three fractional digits. The comma is the
+    separator between the start and end timestamps on a cue line, so it
+    must never appear inside a single timestamp.
+
+    Args:
+        seconds: Non-negative offset in seconds (int or float).
+
+    Returns:
+        The offset formatted as ``H:MM:SS.mmm``, e.g. ``0:00:30.000``.
+
+    Raises:
+        ValueError: If ``seconds`` is negative.
+    """
+    if seconds < 0:
+        raise ValueError(f"seconds must be non-negative, got {seconds!r}")
+    # Work in whole milliseconds so rounding can never yield e.g. "59.1000".
+    total_ms = int(round(float(seconds) * 1000))
+    hours, remainder = divmod(total_ms, 3_600_000)
+    minutes, remainder = divmod(remainder, 60_000)
+    secs, millis = divmod(remainder, 1000)
+    return f"{hours}:{minutes:02d}:{secs:02d}.{millis:03d}"
+#function to send text strings to be translated into english
+def translate_text(
+    text: str = "ሃሌሉያ አሁን ድምፅ የሚወጣል ጥሩ እግዛብሔርን  እንዴት",
+    project_id: str = "noble-feat-390914",
+    source_language_code: str = "am",
+    target_language_code: str = "en-US",
+) -> "translate.TranslateTextResponse":
+    """Translate a text string with the Google Cloud Translation service.
+
+    With the default language codes this translates Amharic ("am") into
+    US English ("en-US").
+
+    Args:
+        text: The plain-text string to translate.
+        project_id: Google Cloud project used to build the request parent.
+        source_language_code: Language code of ``text``.
+        target_language_code: Language code to translate into.
+
+    Returns:
+        The response object returned by ``client.translate_text``; the
+        translated strings are available on its ``translations`` entries
+        (``translation.translated_text``).
+    """
+    client = translate.TranslationServiceClient()
+    location = "global"
+    parent = f"projects/{project_id}/locations/{location}"
+    # Detail on supported types can be found here:
+    # https://cloud.google.com/translate/docs/supported-formats
+    response = client.translate_text(
+        request={
+            "parent": parent,
+            "contents": [text],
+            "mime_type": "text/plain",  # mime types: text/plain, text/html
+            "source_language_code": source_language_code,
+            "target_language_code": target_language_code,
+        }
+    )
+    return response
+#Convert Video/Audio into 16K wav file
+def preprocessAudio(audioFile):
+    """Convert an uploaded audio/video file to ``./audio.wav`` at 16 kHz.
+
+    ffmpeg is invoked with an argument list rather than a shell string, so
+    paths containing spaces or shell metacharacters are passed through
+    verbatim and cannot be interpreted as extra commands.
+
+    Args:
+        audioFile: Object whose ``name`` attribute is the path of the file
+            to convert.
+
+    Raises:
+        subprocess.CalledProcessError: If ffmpeg exits with a non-zero
+            status (previously the failure was silently ignored).
+        FileNotFoundError: If the ``ffmpeg`` executable cannot be found.
+    """
+    # Imported locally so this function does not depend on the file-level imports.
+    import subprocess
+    subprocess.run(
+        ["ffmpeg", "-y", "-i", audioFile.name, "-ar", "16000", "./audio.wav"],
+        check=True,
+    )
+#Transcribe!!!
+def Transcribe(file):
+    """Transcribe an uploaded audio/video file into an SBV subtitle file.
+
+    The upload is converted to ``./audio.wav`` by ``preprocessAudio``, read
+    back in fixed-size blocks, and each block is run through the
+    module-level ``model`` and ``processor``. One cue per block is written
+    to ``subtitle.sbv``. The intermediate wav file is removed afterwards,
+    whether or not transcription succeeded.
+
+    Args:
+        file: Uploaded file object; passed unchanged to ``preprocessAudio``.
+
+    Returns:
+        The path of the subtitle file that was written, ``"subtitle.sbv"``.
+    """
+    device = "cuda:0" if torch.cuda.is_available() else "cpu"
+    start_time = time.time()
+    model.load_adapter("amh")
+    # Each block is block_size frames of 16000 samples with no overlap
+    # (hop_length == frame_length). audio.wav is produced at 16 kHz, so one
+    # block spans block_size seconds of audio.
+    block_size = 30
+    try:
+        preprocessAudio(file)
+        stream = librosa.stream(
+            "./audio.wav",
+            block_length=block_size,
+            frame_length=16000,
+            hop_length=16000
+        )
+        model.to(device)
+        print("Model loaded to gpu: Entering transcription phase")
+        #Code for timestamping
+        encoding_start = 0
+        # Explicit UTF-8 because the transcript is Amharic text; the
+        # with-block closes the file even if a block fails mid-run.
+        with open("subtitle.sbv", "w", encoding="utf-8") as sbv_file:
+            for speech_segment in stream:
+                # NOTE(review): fallback for blocks with more than one
+                # dimension; presumably the stream yields 1-D mono audio,
+                # in which case this branch is never taken - confirm.
+                if len(speech_segment.shape) > 1:
+                    speech_segment = speech_segment[:,0] + speech_segment[:,1]
+                input_values = processor(speech_segment, sampling_rate=16_000, return_tensors="pt").input_values.to(device)
+                with torch.no_grad():
+                    logits = model(input_values).logits
+                if len(logits.shape) == 1:
+                    logits = logits.unsqueeze(0)
+                transcription = processor.batch_decode(logits.cpu().numpy()).text
+                #Generate timestamps
+                # NOTE(review): every cue is stamped as a full block_size
+                # seconds, so a shorter final block would get an end time
+                # past the end of the audio - confirm whether that matters.
+                encoding_end = encoding_start + block_size
+                formatted_start = format_time(encoding_start)
+                formatted_end = format_time(encoding_end)
+                #Write to the .sbv file
+                sbv_file.write(f"{formatted_start},{formatted_end}\n")
+                sbv_file.write(f"{transcription[0]}\n\n")
+                encoding_start = encoding_end
+                # Freeing up memory before the next block is processed
+                del input_values
+                del logits
+                del transcription
+                torch.cuda.empty_cache()
+                gc.collect()
+    finally:
+        # Always drop the intermediate wav so a failed run cannot leave a
+        # stale file behind for the next request to pick up.
+        if os.path.exists("./audio.wav"):
+            os.remove("./audio.wav")
+    end_time = time.time()
+    print(f"The script ran for {end_time - start_time} seconds.")
+    return "subtitle.sbv"
+demo = gr.Interface(fn=Transcribe, inputs=gr.File(), outputs=gr.File())  # one uploaded file in; the subtitle file path returned by Transcribe out
+#with gr.Blocks() as demo:
+    #file_output = gr.Textbox()
+    #upload_button = gr.UploadButton("Click to Upload a sermon",
+    #                                    file_types=["video", "audio"], file_count="multiple")
+    #upload_button.upload(Transcribe, upload_button, file_output)
+demo.launch()  # runs at import time: there is no __main__ guard around the launch