Spaces:

jlonsako
/

Amh-Transcribe

Sleeping

App Files Files Community

jlonsako committed on Jul 12, 2023

Commit

da7f7e0

•

1 Parent(s): f09834e

Update to chunking and half precision

Browse files

Files changed (1) hide show

app.py +51 -20

app.py CHANGED Viewed

@@ -31,11 +31,15 @@ def Transcribe(file):
     device = "cuda:0" if torch.cuda.is_available() else "cpu"
     start_time = time.time()
     model.load_adapter("amh")
     preprocessAudio(file)
-    block_size = 30 #30 second chunks of audio
     transcripts = []
     stream = librosa.stream(
         "./audioToConvert.wav",
         block_length=block_size,
@@ -43,9 +47,8 @@ def Transcribe(file):
         hop_length=16000
     )
-    model.half()
     model.to(device)
-    print(f"Model loaded to {device}: Entering transcription phase")
     #Code for timestamping
     encoding_start = 0
@@ -54,42 +57,70 @@ def Transcribe(file):
     for speech_segment in stream:
         if len(speech_segment.shape) > 1:
             speech_segment = speech_segment[:,0] + speech_segment[:,1]
-        input_values = processor(speech_segment, sampling_rate=16_000, return_tensors="pt").input_values.to(device)
         input_values = input_values.half()
         with torch.no_grad():
             logits = model(input_values).logits
-        if len(logits.shape) == 1:
-            logits = logits.unsqueeze(0)
-        transcription = processor.batch_decode(logits.cpu().numpy()).text
-        transcripts.append(transcription[0])
-        #Generate timestamps
-        encoding_end = encoding_start + block_size
-        formatted_start = format_time(encoding_start)
-        formatted_end = format_time(encoding_end)
-        #Write to the .sbv file
-        sbv_file.write(f"{formatted_start},{formatted_end}\n")
-        sbv_file.write(f"{transcription[0]}\n\n")
-        encoding_start = encoding_end
         # Freeing up memory
         del input_values
         del logits
-        del transcription
         torch.cuda.empty_cache()
         gc.collect()
     # Join all transcripts into a single transcript
     transcript = ' '.join(transcripts)
     sbv_file.close()
     end_time = time.time()
-    os.system("rm ./audio.wav")
     print(f"The script ran for {end_time - start_time} seconds.")
     return("./subtitle.sbv")
 demo = gr.Interface(fn=Transcribe, inputs=gr.File(), outputs="file")
 demo.launch()

     device = "cuda:0" if torch.cuda.is_available() else "cpu"
     start_time = time.time()
     model.load_adapter("amh")
+    model.half()
     preprocessAudio(file)
+    block_size = 30
+    batch_size = 22  # or whatever number you choose
     transcripts = []
+    speech_segments = []
     stream = librosa.stream(
         "./audioToConvert.wav",
         block_length=block_size,
         hop_length=16000
     )
     model.to(device)
+    print("Model loaded to gpu: Entering transcription phase")
     #Code for timestamping
     encoding_start = 0
     for speech_segment in stream:
         if len(speech_segment.shape) > 1:
             speech_segment = speech_segment[:,0] + speech_segment[:,1]
+        speech_segments.append(speech_segment)
+        if len(speech_segments) == batch_size:
+            input_values = processor(speech_segments, sampling_rate=16_000, return_tensors="pt", padding=True).input_values.to(device)
+            input_values = input_values.half()
+            with torch.no_grad():
+                logits = model(input_values).logits
+            if len(logits.shape) == 1:
+                logits = logits.unsqueeze(0)
+            #predicted_ids = torch.argmax(logits, dim=-1)
+            transcriptions = processor.batch_decode(logits.cpu().numpy()).text
+            transcripts.extend(transcriptions)
+            # Write to the .sbv file
+            for i, transcription in enumerate(transcriptions):
+                encoding_start = (i * block_size)
+                encoding_end = encoding_start + block_size
+                formatted_start = format_time(encoding_start)
+                formatted_end = format_time(encoding_end)
+                sbv_file.write(f"{formatted_start},{formatted_end}\n")
+                sbv_file.write(f"{transcription}\n\n")
+            # Clear the batch
+            speech_segments = []
+            # Freeing up memory
+            del input_values
+            del logits
+            del transcriptions
+            torch.cuda.empty_cache()
+            gc.collect()
+    if speech_segments:
+        input_values = processor(speech_segments, sampling_rate=16_000, return_tensors="pt", padding=True).input_values.to(device)
         input_values = input_values.half()
         with torch.no_grad():
             logits = model(input_values).logits
+        transcriptions = processor.batch_decode(logits.cpu().numpy()).text
+        transcripts.extend(transcriptions)
+        for i in range(len(speech_segments)):
+            encoding_end = encoding_start + block_size
+            formatted_start = format_time(encoding_start)
+            formatted_end = format_time(encoding_end)
+            sbv_file.write(f"{formatted_start},{formatted_end}\n")
+            sbv_file.write(f"{transcriptions[i]}\n\n")
+            encoding_start = encoding_end
         # Freeing up memory
         del input_values
         del logits
+        del transcriptions
         torch.cuda.empty_cache()
         gc.collect()
     # Join all transcripts into a single transcript
     transcript = ' '.join(transcripts)
     sbv_file.close()
     end_time = time.time()
     print(f"The script ran for {end_time - start_time} seconds.")
     return("./subtitle.sbv")
 demo = gr.Interface(fn=Transcribe, inputs=gr.File(), outputs="file")
 demo.launch()