Spaces:

JacobLinCool
/

forced-alignment

Running on Zero

App Files Files Community

github-actions[bot] committed on 13 days ago

Commit

ad6b6ad

0 Parent(s):

Sync from https://github.com/JacobLinCool/forced-alignment-app

Browse files

Files changed (10) hide show

.gitattributes +5 -0
.github/workflows/sync.yml +26 -0
.gitignore +164 -0
LICENSE +21 -0
README.md +16 -0
app.py +196 -0
examples/example1.mp3 +3 -0
examples/example2.wav +3 -0
headers.yaml +8 -0
requirements.txt +6 -0

.gitattributes ADDED Viewed

	@@ -0,0 +1,5 @@

+# Auto detect text files and perform LF normalization
+* text=auto
+*.mp3 filter=lfs diff=lfs merge=lfs -text
+*.wav filter=lfs diff=lfs merge=lfs -text

.github/workflows/sync.yml ADDED Viewed

	@@ -0,0 +1,26 @@

+# Workflow that runs on every push to main and hands the repository to the
+# JacobLinCool/huggingface-sync action (presumably mirroring it to the Space
+# named below — confirm against that action's documentation).
+name: Sync to Hugging Face Spaces
+on:
+    push:
+        branches:
+            - main
+jobs:
+    sync:
+        name: Sync
+        runs-on: ubuntu-latest
+        steps:
+            # LFS is enabled on checkout because .gitattributes tracks *.mp3 and
+            # *.wav (the example audio) through Git LFS.
+            - name: Checkout Repository
+              uses: actions/checkout@v4
+              with:
+                  lfs: true
+            # HF_TOKEN must be defined as a repository secret; headers.yaml is
+            # passed as the action's configuration input.
+            - name: Sync to Hugging Face Spaces
+              uses: JacobLinCool/huggingface-sync@v1
+              with:
+                  github: ${{ secrets.GITHUB_TOKEN }}
+                  user: jacoblincool # Hugging Face username or organization name
+                  space: forced-alignment # Hugging Face space name
+                  token: ${{ secrets.HF_TOKEN }} # Hugging Face token
+                  configuration: headers.yaml

.gitignore ADDED Viewed

	@@ -0,0 +1,164 @@

+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+# C extensions
+*.so
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+# Translations
+*.mo
+*.pot
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+# Flask stuff:
+instance/
+.webassets-cache
+# Scrapy stuff:
+.scrapy
+# Sphinx documentation
+docs/_build/
+# PyBuilder
+.pybuilder/
+target/
+# Jupyter Notebook
+.ipynb_checkpoints
+# IPython
+profile_default/
+ipython_config.py
+# pyenv
+#   For a library or package, you might want to ignore these files since the code is
+#   intended to run in multiple environments; otherwise, check them in:
+# .python-version
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+# poetry
+#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+# pdm
+#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+#   in version control.
+#   https://pdm.fming.dev/#use-with-ide
+.pdm.toml
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+# SageMath parsed files
+*.sage.py
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+# Spyder project settings
+.spyderproject
+.spyproject
+# Rope project settings
+.ropeproject
+# mkdocs documentation
+/site
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+# Pyre type checker
+.pyre/
+# pytype static type analyzer
+.pytype/
+# Cython debug symbols
+cython_debug/
+# PyCharm
+#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+#  and can be added to the global gitignore or merged into this file.  For a more nuclear
+#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/
+# pixi environments
+.pixi
+*.egg-info

LICENSE ADDED Viewed

	@@ -0,0 +1,21 @@

+MIT License
+Copyright (c) 2024 JacobLinCool
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

README.md ADDED Viewed

	@@ -0,0 +1,16 @@

+---
+title: Forced Alignment
+emoji: 🎯
+colorFrom: red
+colorTo: gray
+sdk: gradio
+app_file: app.py
+pinned: false
+license: mit
+---
+# Forced Alignment
+Gradio app to do forced alignment.
+<https://huggingface.co/spaces/JacobLinCool/forced-alignment>

app.py ADDED Viewed

	@@ -0,0 +1,196 @@

+import spaces
+import gradio as gr
+import json
+import torch
+import numpy as np
+import librosa
+from accelerate.utils.imports import is_cuda_available
+from iso639 import iter_langs
+from ctc_forced_aligner import (
+    load_alignment_model,
+    generate_emissions,
+    preprocess_text,
+    get_alignments,
+    get_spans,
+    postprocess_results,
+)
+# Use half precision on CUDA when it is available; otherwise fall back to
+# full precision on the CPU.
+device, dtype = (
+    ("cuda", torch.float16) if is_cuda_available() else ("cpu", torch.float32)
+)
+# Load the alignment model and its tokenizer once at import time so that every
+# request reuses the same instances.
+alignment_model, alignment_tokenizer = load_alignment_model(device, dtype=dtype)
+def process_alignment(audio_waveform, text, language="eng"):
+    """Align ``text`` against ``audio_waveform`` and return word timestamps.
+    Chains the ctc_forced_aligner helpers using the module-level
+    ``alignment_model`` and ``alignment_tokenizer``.
+    Args:
+        audio_waveform: Audio samples handed to ``generate_emissions``; must
+            expose ``.shape`` (callers in this file pass a torch tensor
+            resampled to 16 kHz).
+        text: Transcript to align with the audio.
+        language: Language code forwarded to ``preprocess_text`` (callers in
+            this file pass an ISO 639-3 code).
+    Returns:
+        The value produced by ``postprocess_results`` — presumably a list of
+        per-word dicts with ``"start"``/``"end"`` times, since ``trim_audio``
+        indexes it that way (confirm against ctc_forced_aligner).
+    """
+    print(f"{audio_waveform.shape=}, {text=}, {language=}")
+    # Model emissions for the audio, together with the stride value that
+    # postprocess_results needs later.
+    frame_emissions, model_stride = generate_emissions(
+        alignment_model, audio_waveform, batch_size=16
+    )
+    # Romanize the transcript and turn it into tokens for the aligner.
+    starred_tokens, starred_text = preprocess_text(
+        text, romanize=True, language=language
+    )
+    aligned_segments, alignment_scores, blank_token_id = get_alignments(
+        frame_emissions, starred_tokens, alignment_tokenizer
+    )
+    # Spans are computed from the aligned segments and then converted into
+    # word-level timestamps.
+    return postprocess_results(
+        starred_text,
+        get_spans(starred_tokens, aligned_segments, blank_token_id),
+        model_stride,
+        alignment_scores,
+    )
+def trim_audio(audio_array, sample_rate, word_timestamps):
+    """Crop audio to the span between the first word's start and the last word's end.
+    Args:
+        audio_array: Sliceable sequence of samples (sliced along its first axis).
+        sample_rate: Number of samples per second in ``audio_array``.
+        word_timestamps: Sequence of dicts carrying ``"start"`` and ``"end"``
+            times in seconds, in chronological order.
+    Returns:
+        A ``(sample_rate, trimmed_audio)`` tuple. When ``word_timestamps`` is
+        empty there is nothing to trim against, so the audio is returned
+        unchanged instead of raising ``IndexError``.
+    """
+    if len(word_timestamps) == 0:
+        return (sample_rate, audio_array)
+    # Despite the names these are sample indices (seconds * sample rate); the
+    # names are kept because they appear verbatim in the log line below.
+    start_time = int(word_timestamps[0]["start"] * sample_rate)
+    end_time = int(word_timestamps[-1]["end"] * sample_rate)
+    print(f"{start_time=}, {end_time=}")
+    return (sample_rate, audio_array[start_time:end_time])
+def get_language_choices():
+    """Build the language dropdown labels, formatted ``"<pt3 code> - <name>"``.
+    Entries yielded by ``iter_langs()`` whose ``pt3`` (ISO 639-3) code is empty
+    are left out.
+    """
+    choices = []
+    for lang in iter_langs():
+        if not lang.pt3:
+            continue
+        choices.append(f"{lang.pt3} - {lang.name}")
+    return choices
+# Sample rate (Hz) that input audio is resampled to before alignment.
+_TARGET_SAMPLE_RATE = 16000
+def _to_mono_16k(audio):
+    """Convert a ``(sample_rate, samples)`` tuple to mono float32 audio at 16 kHz."""
+    sample_rate, audio_array = audio
+    # Dividing by 2**15 assumes 16-bit PCM samples from gr.Audio
+    # (NOTE(review): confirm for other sample dtypes).
+    audio_array = audio_array.astype(np.float32) / 32768.0
+    print(f"{sample_rate=}, {audio_array.shape=}")
+    if audio_array.ndim > 1:
+        audio_array = audio_array.mean(axis=1)  # Average the channels to get mono
+    return librosa.resample(
+        audio_array, orig_sr=sample_rate, target_sr=_TARGET_SAMPLE_RATE
+    )
+def _align_request(audio, text, language):
+    """Validate one UI request and run forced alignment on it.
+    Args:
+        audio: ``(sample_rate, numpy array)`` tuple from ``gr.Audio``, or
+            ``None`` when the user supplied no audio.
+        text: Transcript entered by the user.
+        language: Dropdown label such as ``"eng - English"``.
+    Returns:
+        ``(audio_array, word_timestamps)`` where ``audio_array`` is the mono
+        16 kHz float32 audio that was aligned.
+    Raises:
+        gr.Error: If audio or text is missing, so the UI shows a readable
+            message instead of an unpacking/indexing traceback.
+    """
+    if audio is None:
+        raise gr.Error("Please upload or record an audio clip first.")
+    if not text or not text.strip():
+        raise gr.Error("Please enter the text to align.")
+    # The part of the label before " - " is the ISO 639-3 code.
+    iso_code = language.split(" - ")[0]
+    audio_array = _to_mono_16k(audio)
+    # Move the samples to the device/dtype the alignment model was loaded with.
+    audio_waveform = torch.from_numpy(audio_array).to(device=device, dtype=dtype)
+    return audio_array, process_alignment(audio_waveform, text, iso_code)
+def _result_json(text, word_timestamps, language):
+    """Serialize the alignment result shown in the JSON output component."""
+    return json.dumps(
+        {
+            "input_text": text,
+            "word_timestamps": word_timestamps,
+            "language": language,
+        },
+        indent=2,
+    )
+@spaces.GPU
+def align(audio, text, language="eng - English"):
+    """Align ``text`` to ``audio`` and also return the audio trimmed to the spoken span.
+    Args:
+        audio: ``(sample_rate, numpy array)`` tuple from ``gr.Audio``.
+        text: Transcript to align.
+        language: Dropdown label in ``"<ISO 639-3 code> - <name>"`` form.
+    Returns:
+        ``(trimmed_audio, json_string)`` where ``trimmed_audio`` is the
+        ``(16000, samples)`` tuple from ``trim_audio`` and ``json_string``
+        holds the input text, word timestamps and language label.
+    Raises:
+        gr.Error: If audio or text is missing.
+    """
+    audio_array, word_timestamps = _align_request(audio, text, language)
+    trimmed_audio = trim_audio(audio_array, _TARGET_SAMPLE_RATE, word_timestamps)
+    return trimmed_audio, _result_json(text, word_timestamps, language)
+@spaces.GPU
+def align_result_only(audio, text, language="eng - English"):
+    """Align ``text`` to ``audio`` and return only the JSON result string.
+    Takes the same arguments as ``align`` and raises the same ``gr.Error``
+    on missing input, but skips the audio trimming step.
+    """
+    _, word_timestamps = _align_request(audio, text, language)
+    return _result_json(text, word_timestamps, language)
+# Build the Gradio UI. Components are laid out in the order they are created
+# inside the nested context managers, so statement order here is significant.
+with gr.Blocks() as demo:
+    gr.Markdown("# Forced Alignment")
+    # Step 4 names the actual button labels defined further down.
+    gr.Markdown(
+        """
+    This tool aligns audio with text and provides word-level timestamps.
+    ## How to use:
+    1. Upload an audio file or record audio
+    2. Enter the corresponding text
+    3. Select the language
+    4. Click 'Get Alignment and Trimmed Audio' or 'Get Alignment Only' to get the alignment results
+    """
+    )
+    with gr.Row():
+        # Left column: inputs and the two action buttons.
+        with gr.Column():
+            audio_input = gr.Audio(label="Input Audio")
+            text_input = gr.Textbox(label="Input Text")
+            language_input = gr.Dropdown(
+                choices=get_language_choices(), label="Language", value="eng - English"
+            )
+            submit_button = gr.Button(
+                "Get Alignment and Trimmed Audio", variant="primary"
+            )
+            submit_button_result_only = gr.Button(
+                "Get Alignment Only", variant="secondary"
+            )
+        # Right column: trimmed audio and the alignment JSON.
+        with gr.Column():
+            audio_output = gr.Audio(label="Trimmed Output Audio")
+            json_output = gr.JSON(label="Alignment Results")
+    # The primary button fills both outputs; the secondary one only the JSON.
+    submit_button.click(
+        fn=align,
+        inputs=[audio_input, text_input, language_input],
+        outputs=[audio_output, json_output],
+    )
+    submit_button_result_only.click(
+        fn=align_result_only,
+        inputs=[audio_input, text_input, language_input],
+        outputs=[json_output],
+    )
+    gr.Markdown("## Examples")
+    # Each example row supplies audio path, transcript and language label, in
+    # the same order as ``inputs``.
+    gr.Examples(
+        examples=[
+            ["examples/example1.mp3", "我們搭上公車要回台北了", "zho - Chinese"],
+            [
+                "examples/example2.wav",
+                "ON SATURDAY MORNINGS WHEN THE SODALITY MET IN THE CHAPEL TO RECITE THE LITTLE OFFICE HIS PLACE WAS A CUSHIONED KNEELING DESK AT THE RIGHT OF THE ALTAR FROM WHICH HE LED HIS WING OF BOYS THROUGH THE RESPONSES",
+                "eng - English",
+            ],
+        ],
+        inputs=[audio_input, text_input, language_input],
+    )
+# Launch the demo only when run as a script, not when imported.
+if __name__ == "__main__":
+    demo.launch()

examples/example1.mp3 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b460bb7d06ed482472cabccda799b3c812080d2d6bbb6e65ff7e9c721b90cee0
+size 31054

examples/example2.wav ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:097baaee33039db64dd47e7f1bf9880530be8e4f693d70dc022059c7fef2cc69
+size 398284

headers.yaml ADDED Viewed

	@@ -0,0 +1,8 @@

+title: Forced Alignment
+emoji: 🎯
+colorFrom: red
+colorTo: gray
+sdk: gradio
+app_file: app.py
+pinned: false
+license: mit

requirements.txt ADDED Viewed

	@@ -0,0 +1,6 @@

+spaces
+gradio
+accelerate
+iso639-lang
+librosa
+git+https://github.com/MahmoudAshraf97/ctc-forced-aligner.git