jtlonsako committed on
Commit a95e64c • 1 Parent(s): 2a01fc3
Files changed (1)
  1. app.py +138 -0
app.py ADDED
@@ -0,0 +1,138 @@
+ import datetime
+ import gc
+ import os
+ import time
+ 
+ import gradio as gr
+ import librosa
+ import torch
+ from google.cloud import translate
+ from transformers import Wav2Vec2ForCTC, Wav2Vec2ProcessorWithLM
+ 
+ 
+ # Load the pretrained MMS checkpoint and the Amharic processor, which bundles
+ # a language-model-backed beam-search decoder (pyctcdecode) for the CTC output
+ model = Wav2Vec2ForCTC.from_pretrained("facebook/mms-1b-all")
+ processor = Wav2Vec2ProcessorWithLM.from_pretrained("jlonsako/mms-1b-all-AmhLM")
+ 
+ 
+ # Define functions
+ 
+ # Convert a time offset in seconds into the h:mm:ss.mmm timestamp format used by .sbv files
+ def format_time(seconds):
+     td = datetime.timedelta(seconds=seconds)
+     hours, remainder = divmod(int(td.total_seconds()), 3600)
+     minutes, secs = divmod(remainder, 60)
+     milliseconds = int(td.microseconds / 1000)
+     return f"{hours}:{minutes:02}:{secs:02}.{milliseconds:03}"
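+ # For illustration: format_time(90) returns "0:01:30.000"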
+ 
+ # Send a text string to the Cloud Translation API to be translated into English
+ def translate_text(
+     text: str = "ሃሌሉያ አሁን ድምፅ የሚወጣል ጥሩ እግዛብሔርን እንዴት",  # sample Amharic text
+     project_id: str = "noble-feat-390914"
+ ) -> translate.TranslateTextResponse:
+     """Translate Amharic text into English."""
+ 
+     client = translate.TranslationServiceClient()
+ 
+     location = "global"
+ 
+     parent = f"projects/{project_id}/locations/{location}"
+ 
+     # Translate text from Amharic to English
+     # Detail on supported types can be found here:
+     # https://cloud.google.com/translate/docs/supported-formats
+     response = client.translate_text(
+         request={
+             "parent": parent,
+             "contents": [text],
+             "mime_type": "text/plain",  # mime types: text/plain, text/html
+             "source_language_code": "am",
+             "target_language_code": "en-US",
+         }
+     )
+ 
+     return response
+ 
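+ # Hypothetical usage (assumes Google Cloud credentials are configured in the
+ # environment, e.g. via GOOGLE_APPLICATION_CREDENTIALS):
+ #   english = translate_text("ሰላም").translations[0].translated_text
+ 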
+ # Convert video/audio input into a 16 kHz wav file for the model
+ def preprocessAudio(audioFile):
+     # Quote the path so filenames containing spaces survive the shell call
+     os.system(f'ffmpeg -y -i "{audioFile.name}" -ar 16000 ./audio.wav')
+ 
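+ # Note: this assumes the ffmpeg binary is available on PATH; the resampled
+ # ./audio.wav is read back by librosa in Transcribe below.
+ 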
+ # Transcribe!!!
+ def Transcribe(file):
+     device = "cuda:0" if torch.cuda.is_available() else "cpu"
+     start_time = time.time()
+     # Activate the Amharic adapter weights in the MMS model
+     model.load_adapter("amh")
+ 
+     preprocessAudio(file)
+     block_size = 30  # seconds of audio per streamed block
+ 
+     transcripts = []
+     stream = librosa.stream(
+         "./audio.wav",
+         block_length=block_size,
+         frame_length=16000,
+         hop_length=16000
+     )
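+     # With frame_length = hop_length = 16000, each block covers
+     # frame_length + (block_length - 1) * hop_length = 480000 samples, i.e. 30 s at 16 kHz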
+ 
+     model.to(device)
+     print(f"Model loaded to {device}: entering transcription phase")
+ 
+     # Track block offsets for timestamping
+     encoding_start = 0
+     sbv_file = open("subtitle.sbv", "w")
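+     # Each .sbv cue is a "start,end" line (e.g. 0:00:00.000,0:00:30.000)
+     # followed by the caption text and a blank line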
+ 
+     for speech_segment in stream:
+         if len(speech_segment.shape) > 1:
+             # Downmix stereo to mono by averaging the two channels
+             speech_segment = (speech_segment[:, 0] + speech_segment[:, 1]) / 2
+         input_values = processor(speech_segment, sampling_rate=16_000, return_tensors="pt").input_values.to(device)
+         with torch.no_grad():
+             logits = model(input_values).logits
+         if len(logits.shape) == 1:
+             # Defensive: restore the batch dimension before decoding
+             logits = logits.unsqueeze(0)
+         # Beam-search decode with the language model
+         transcription = processor.batch_decode(logits.cpu().numpy()).text
+         transcripts.append(transcription[0])
+ 
+         # Generate timestamps; the final block may be shorter than block_size
+         encoding_end = encoding_start + len(speech_segment) / 16000
+         formatted_start = format_time(encoding_start)
+         formatted_end = format_time(encoding_end)
+ 
+         # Write the cue to the .sbv file
+         sbv_file.write(f"{formatted_start},{formatted_end}\n")
+         sbv_file.write(f"{transcription[0]}\n\n")
+         encoding_start = encoding_end
+ 
+         # Free memory between blocks
+         del input_values
+         del logits
+         del transcription
+         torch.cuda.empty_cache()
+         gc.collect()
+ 
+     # Join all block transcripts into a single transcript
+     transcript = ' '.join(transcripts)
+     sbv_file.close()
+ 
+     end_time = time.time()
+     os.remove("./audio.wav")
+     print(f"The script ran for {end_time - start_time} seconds.")
+     return "subtitle.sbv"
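+ 
+ # Expose the transcriber as a simple Gradio app: upload an audio/video file,
+ # download the generated .sbv subtitle file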
+ demo = gr.Interface(fn=Transcribe, inputs=gr.File(), outputs=gr.File())
+ demo.launch()
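+ 
+ # Dependency sketch (assumed, not pinned by this commit): torch, transformers,
+ # pyctcdecode (with kenlm), gradio, librosa, google-cloud-translate, plus a
+ # system ffmpeg install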