import spaces
import gradio as gr
import json
import torch
import numpy as np
import librosa
from accelerate.utils.imports import is_cuda_available
from iso639 import iter_langs
from ctc_forced_aligner import (
    load_alignment_model,
    generate_emissions,
    preprocess_text,
    get_alignments,
    get_spans,
    postprocess_results,
)

device = "cuda" if is_cuda_available() else "cpu"
dtype = torch.float16 if is_cuda_available() else torch.float32

alignment_model, alignment_tokenizer = load_alignment_model(
    device,
    dtype=dtype,
)


def process_alignment(audio_waveform, text, language="eng"):
    print(f"{audio_waveform.shape=}, {text=}, {language=}")

    # Generate emissions
    emissions, stride = generate_emissions(
        alignment_model, audio_waveform, batch_size=16
    )

    # Preprocess text
    tokens_starred, text_starred = preprocess_text(
        text,
        romanize=True,
        language=language,
    )

    # Get alignments
    segments, scores, blank_id = get_alignments(
        emissions,
        tokens_starred,
        alignment_tokenizer,
    )

    # Get spans and word timestamps
    spans = get_spans(tokens_starred, segments, blank_id)
    word_timestamps = postprocess_results(text_starred, spans, stride, scores)

    return word_timestamps


def trim_audio(audio_array, sample_rate, word_timestamps):
    start_time = int(word_timestamps[0]["start"] * sample_rate)
    end_time = int(word_timestamps[-1]["end"] * sample_rate)
    print(f"{start_time=}, {end_time=}")
    trimmed_audio = audio_array[start_time:end_time]
    return (sample_rate, trimmed_audio)


def get_language_choices():
    return [f"{lang.pt3} - {lang.name}" for lang in iter_langs() if lang.pt3]


@spaces.GPU
def align(audio, text, language="eng - English"):
    # Extract the ISO 639-3 code from the selected language
    iso_code = language.split(" - ")[0]

    # Convert the input audio to 16kHz mono
    sample_rate, audio_array = audio
    audio_array = (
        audio_array.astype(np.float32) / 32768.0
    )  # Convert to float32 and normalize
    print(f"{sample_rate=}, {audio_array.shape=}")
    if len(audio_array.shape) > 1:
        audio_array = audio_array.mean(axis=1)  # Convert to mono if stereo
    audio_array = librosa.resample(audio_array, orig_sr=sample_rate, target_sr=16000)

    # Convert to torch tensor and move to the correct device
    audio_waveform = torch.from_numpy(audio_array).to(device=device, dtype=dtype)

    # Process the alignment
    word_timestamps = process_alignment(audio_waveform, text, iso_code)

    # Trim the audio
    trimmed_audio = trim_audio(audio_array, 16000, word_timestamps)

    # Create JSON output
    output_json = {
        "input_text": text,
        "word_timestamps": word_timestamps,
        "language": language,
    }

    return trimmed_audio, json.dumps(output_json, indent=2)


@spaces.GPU
def align_result_only(audio, text, language="eng - English"):
    # Extract the ISO 639-3 code from the selected language
    iso_code = language.split(" - ")[0]

    # Convert the input audio to 16kHz mono
    sample_rate, audio_array = audio
    audio_array = (
        audio_array.astype(np.float32) / 32768.0
    )  # Convert to float32 and normalize
    print(f"{sample_rate=}, {audio_array.shape=}")
    if len(audio_array.shape) > 1:
        audio_array = audio_array.mean(axis=1)  # Convert to mono if stereo
    audio_array = librosa.resample(audio_array, orig_sr=sample_rate, target_sr=16000)

    # Convert to torch tensor and move to the correct device
    audio_waveform = torch.from_numpy(audio_array).to(device=device, dtype=dtype)

    # Process the alignment
    word_timestamps = process_alignment(audio_waveform, text, iso_code)

    # Create JSON output
    output_json = {
        "input_text": text,
        "word_timestamps": word_timestamps,
        "language": language,
    }

    return json.dumps(output_json, indent=2)
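
# Illustrative sketch (not part of the original app): the pipeline can be
# exercised without the UI. postprocess_results() returns a list of per-word
# dicts carrying at least "start" and "end" times in seconds (trim_audio()
# above relies on exactly those keys). Assuming a 16 kHz mono file on disk:
#
#   wav, _ = librosa.load("examples/example2.wav", sr=16000, mono=True)
#   waveform = torch.from_numpy(wav).to(device=device, dtype=dtype)
#   words = process_alignment(waveform, "SOME TRANSCRIPT", language="eng")
#   print(words[0]["start"], words[-1]["end"])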
# Create Gradio blocks
with gr.Blocks() as demo:
    gr.Markdown("# Forced Alignment")
    gr.Markdown(
        """
This tool aligns audio with text and provides word-level timestamps.

## How to use:
1. Upload an audio file or record audio
2. Enter the corresponding text
3. Select the language
4. Click 'Get Alignment and Trimmed Audio' (or 'Get Alignment Only') to run the alignment
"""
    )

    with gr.Row():
        with gr.Column():
            audio_input = gr.Audio(label="Input Audio")
            text_input = gr.Textbox(label="Input Text")
            language_input = gr.Dropdown(
                choices=get_language_choices(), label="Language", value="eng - English"
            )
            submit_button = gr.Button(
                "Get Alignment and Trimmed Audio", variant="primary"
            )
            submit_button_result_only = gr.Button(
                "Get Alignment Only", variant="secondary"
            )
        with gr.Column():
            audio_output = gr.Audio(label="Trimmed Output Audio")
            json_output = gr.JSON(label="Alignment Results")

    submit_button.click(
        fn=align,
        inputs=[audio_input, text_input, language_input],
        outputs=[audio_output, json_output],
    )

    submit_button_result_only.click(
        fn=align_result_only,
        inputs=[audio_input, text_input, language_input],
        outputs=[json_output],
    )

    gr.Markdown("## Examples")
    gr.Examples(
        examples=[
            ["examples/example1.mp3", "我們搭上公車要回台北了", "zho - Chinese"],
            [
                "examples/example2.wav",
                "ON SATURDAY MORNINGS WHEN THE SODALITY MET IN THE CHAPEL TO RECITE THE LITTLE OFFICE HIS PLACE WAS A CUSHIONED KNEELING DESK AT THE RIGHT OF THE ALTAR FROM WHICH HE LED HIS WING OF BOYS THROUGH THE RESPONSES",
                "eng - English",
            ],
        ],
        inputs=[audio_input, text_input, language_input],
    )

# Launch the demo
if __name__ == "__main__":
    demo.launch()
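
# Note (assumption, standard Gradio launch options): demo.launch() serves the
# UI on a local URL; demo.launch(share=True) would also create a temporary
# public link, and server_name="0.0.0.0" exposes it on the local network.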