Spaces:

jlonsako
/

Amh-Transcribe

Sleeping

App Files Files Community

jlonsako committed on Jul 23, 2023

Commit

a08d5e7

•

1 Parent(s): d031248

Update app.py

Browse files

Files changed (1) hide show

app.py +122 -1

app.py CHANGED Viewed

@@ -67,7 +67,7 @@ def preprocessAudio(audioFile):
     if isinstance(audioFile, str):  # If audioFile is a string (filepath)
         os.system(f"ffmpeg -y -i {audioFile} -ar 16000 ./audioToConvert.wav")
     else:  # If audioFile is an object with a name attribute
-        os.system(f"ffmpeg -y -i {audioFile.name} -ar 16000 ./{audioFile.name}.wav")
 #Transcribe!!!
 def Transcribe(file):
@@ -191,6 +191,127 @@ def Transcribe(file):
         error_log.write(f"Exception occurred: {e}")
         error_log.close()
 demo = gr.Blocks()
 with demo:

     if isinstance(audioFile, str):  # If audioFile is a string (filepath)
         os.system(f"ffmpeg -y -i {audioFile} -ar 16000 ./audioToConvert.wav")
     else:  # If audioFile is an object with a name attribute
+        os.system(f"ffmpeg -y -i {audioFile.name} -ar 16000 ./audioToConvert.wav")
 #Transcribe!!!
 def Transcribe(file):
         error_log.write(f"Exception occurred: {e}")
         error_log.close()
+#Transcribe!!!
+def TranscribeMic(file):
+    try:
+        device = "cuda:0" if torch.cuda.is_available() else "cpu"
+        start_time = time.time()
+        model.load_adapter("amh")
+        processor.tokenizer.set_target_lang("amh")
+        preprocessAudio(file)
+        block_size = 30
+        batch_size = 1
+        transcripts = []
+        speech_segments = []
+        stream = librosa.stream(
+            "./audioToConvert.wav",
+            block_length=block_size,
+            frame_length=16000,
+            hop_length=16000
+        )
+        model.to(device)
+        print(f"Model loaded to {device}: Entering transcription phase")
+        #Code for timestamping
+        encoding_start = 0
+        encoding_end = 0
+        sbv_file = open(f"microphone_subtitle.sbv", "w")
+        transcription_file = open(f"microphone_transcription.txt", "w")
+        # Create an empty list to hold batches
+        batch = []
+        for speech_segment in stream:
+            if len(speech_segment.shape) > 1:
+                speech_segment = speech_segment[:,0] + speech_segment[:,1]
+            # Add the current speech segment to the batch
+            batch.append(speech_segment)
+            # If the batch is full, process it
+            if len(batch) == batch_size:
+                # Concatenate all segments in the batch along the time axis
+                input_values = processor(batch, sampling_rate=16_000, return_tensors="pt", padding=True)
+                input_values = input_values.to(device)
+                with torch.no_grad():
+                    logits = model(**input_values).logits
+                if len(logits.shape) == 1:
+                    logits = logits.unsqueeze(0)
+                beam_search_result = beam_search_decoder(logits.to("cpu"))
+                # Transcribe each segment in the batch
+                for i in range(batch_size):
+                    transcription = " ".join(beam_search_result[i][0].words).strip()
+                    transcripts.append(transcription)
+                    encoding_end = encoding_start + block_size
+                    formatted_start = format_time(encoding_start)
+                    formatted_end = format_time(encoding_end)
+                    sbv_file.write(f"{formatted_start},{formatted_end}\n")
+                    sbv_file.write(f"{transcription}\n\n")
+                    encoding_start = encoding_end
+                # Freeing up memory
+                del input_values
+                del logits
+                del transcription
+                torch.cuda.empty_cache()
+                gc.collect()
+                # Clear the batch
+                batch = []
+        if batch:
+                # Concatenate all segments in the batch along the time axis
+                input_values = processor(batch, sampling_rate=16_000, return_tensors="pt", padding=True)
+                input_values = input_values.to(device)
+                with torch.no_grad():
+                    logits = model(**input_values).logits
+                if len(logits.shape) == 1:
+                    logits = logits.unsqueeze(0)
+                beam_search_result = beam_search_decoder(logits.to("cpu"))
+                # Transcribe each segment in the batch
+                for i in range(len(batch)):
+                    transcription = " ".join(beam_search_result[i][0].words).strip()
+                    print(transcription)
+                    transcripts.append(transcription)
+                    encoding_end = encoding_start + block_size
+                    formatted_start = format_time(encoding_start)
+                    formatted_end = format_time(encoding_end)
+                    sbv_file.write(f"{formatted_start},{formatted_end}\n")
+                    sbv_file.write(f"{transcription}\n\n")
+                    encoding_start = encoding_end
+                # Freeing up memory
+                del input_values
+                del logits
+                del transcription
+                torch.cuda.empty_cache()
+                gc.collect()
+        # Join all transcripts into a single transcript
+        transcript = ' '.join(transcripts)
+        transcription_file.write(f"{transcript}")
+        sbv_file.close()
+        transcription_file.close()
+        end_time = time.time()
+        print(f"The script ran for {end_time - start_time} seconds.")
+        return([f"./microphone_subtitle.sbv", f"./microphone_transcription.txt"])
+    except Exception as e:
+        error_log = open("error_log.txt", "w")
+        error_log.write(f"Exception occurred: {e}")
+        error_log.close()
 demo = gr.Blocks()
 with demo: