ayymen committed on
Commit
143d264
1 Parent(s): afb069e

Initial commit

135.wav ADDED
Binary file (171 kB).
 
Dockerfile ADDED
@@ -0,0 +1,54 @@
+ #!/usr/bin/env bash
+ # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ # Use this script to install KenLM, the OpenSeq2Seq decoder, the Flashlight decoder, and the OpenGRM Ngram tool in the container
+
+ # How to use? Build it from the NeMo root folder:
+ # 1. git clone https://github.com/NVIDIA/NeMo.git && cd NeMo
+ # 2. DOCKER_BUILDKIT=1 docker build -t nemo:23.03.1 -f ./scripts/installers/Dockerfile.ngramtools .
+
+ FROM nvcr.io/nvidia/nemo:24.01.speech
+
+ WORKDIR /workspace/nemo
+
+ COPY ./install_beamsearch_decoders.sh /workspace/nemo/scripts/asr_language_modeling/ngram_lm/install_beamsearch_decoders.sh
+
+ RUN /bin/bash scripts/asr_language_modeling/ngram_lm/install_beamsearch_decoders.sh
+
+ RUN --mount=target=/tmp/packages.txt,source=packages.txt apt-get update && xargs -r -a /tmp/packages.txt apt-get install -y && rm -rf /var/lib/apt/lists/*
+
+ RUN --mount=target=/tmp/pre-requirements.txt,source=pre-requirements.txt pip install --no-cache-dir -r /tmp/pre-requirements.txt
+
+ RUN --mount=target=/tmp/requirements.txt,source=requirements.txt pip install --no-cache-dir -r /tmp/requirements.txt
+
+ WORKDIR /code
+
+ # Set up a new user named "user" with user ID 1000
+ RUN useradd -m -u 1000 user
+
+ # Switch to the "user" user
+ USER user
+
+ # Set home to the user's home directory
+ ENV HOME=/home/user \
+     PATH=/home/user/.local/bin:$PATH
+
+ # Set the working directory to the user's home directory
+ WORKDIR $HOME/app
+
+ # Copy the current directory contents into the container at $HOME/app, setting the owner to the user
+ COPY --chown=user . $HOME/app
+
+ CMD ["python", "app.py"]
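Note: a minimal sketch of building and serving this Space image locally; the image tag is illustrative, Gradio apps listen on port 7860 by default, and the --gpus flag assumes the NVIDIA container toolkit is installed (app.py falls back to CPU otherwise):

    # build from the repository root, where this Dockerfile lives
    docker build -t asr-ngram-demo .
    # run the demo and expose the Gradio port
    docker run --rm --gpus all -p 7860:7860 asr-ngram-demo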
README.md CHANGED
@@ -1,5 +1,5 @@
  ---
- title: ASR N Gram Language Modeling
+ title: ASR N-gram Language Modeling
  emoji: 👀
  colorFrom: purple
  colorTo: gray
app.py ADDED
@@ -0,0 +1,185 @@
+ from nemo.collections.asr.models import EncDecCTCModelBPE
+ from omegaconf import open_dict
+ # import yt_dlp as youtube_dl  # only needed if the YouTube tab below is re-enabled
+ import os
+ import tempfile
+ import torch
+ import gradio as gr
+ from pydub import AudioSegment
+ import time
+
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+ MODEL_NAME = "ayymen/stt_zgh_fastconformer_ctc_small"
+ YT_LENGTH_LIMIT_S = 3600
+
+ model = EncDecCTCModelBPE.from_pretrained(model_name=MODEL_NAME).to(device)
+
+ with open_dict(model.cfg):
+     model.cfg.decoding.strategy = "beam"
+     model.cfg.decoding.beam.beam_size = 256  # desired beam size
+     model.cfg.decoding.beam.beam_alpha = 1.5  # desired beam alpha
+     model.cfg.decoding.beam.beam_beta = 1.5  # desired beam beta
+     model.cfg.decoding.beam.kenlm_path = "kenlm.bin"  # path to the KenLM binary file
+
+ model.change_decoding_strategy(model.cfg.decoding)
+
+ model.eval()
+
+ def get_transcripts(audio_path):
+     audio = AudioSegment.from_file(audio_path)
+     # check whether the audio is already mono 16 kHz
+     if audio.channels != 1 or audio.frame_rate != 16000:
+         audio = audio.set_channels(1).set_frame_rate(16000)  # convert to mono 16 kHz
+         with tempfile.TemporaryDirectory() as tmpdirname:
+             audio_path = os.path.join(tmpdirname, "audio.wav")
+             audio.export(audio_path, format="wav")
+             text = model.transcribe([audio_path])[0]
+     else:
+         text = model.transcribe([audio_path])[0]
+     return text
+
+ '''
+ article = (
+     "<p style='text-align: center'>"
+     "<a href='https://huggingface.co/nvidia/parakeet-rnnt-1.1b' target='_blank'>🎙️ Learn more about Parakeet model</a> | "
+     "<a href='https://arxiv.org/abs/2305.05084' target='_blank'>📚 FastConformer paper</a> | "
+     "<a href='https://github.com/NVIDIA/NeMo' target='_blank'>🧑‍💻 Repository</a>"
+     "</p>"
+ )
+ '''
+
+ EXAMPLES = [
+     ["135.wav"],
+     ["common_voice_zgh_37837257.mp3"]
+ ]
+
+ """
+ YT_EXAMPLES = [
+     ["https://www.youtube.com/shorts/CSgTSE50MHY"],
+     ["https://www.youtube.com/shorts/OxQtqOyAFLE"]
+ ]
+ """
+
+ def _return_yt_html_embed(yt_url):
+     video_id = yt_url.split("?v=")[-1]
+     if "youtube.com/shorts/" in video_id:
+         video_id = video_id.split("/")[-1]
+     HTML_str = (
+         f'<center> <iframe width="500" height="320" src="https://www.youtube.com/embed/{video_id}"> </iframe>'
+         " </center>"
+     )
+     return HTML_str
+
+ def download_yt_audio(yt_url, filename):
+     info_loader = youtube_dl.YoutubeDL()
+
+     try:
+         info = info_loader.extract_info(yt_url, download=False)
+     except youtube_dl.utils.DownloadError as err:
+         raise gr.Error(str(err))
+
+     file_length = info["duration_string"]
+     file_h_m_s = file_length.split(":")
+     file_h_m_s = [int(sub_length) for sub_length in file_h_m_s]
+
+     if len(file_h_m_s) == 1:
+         file_h_m_s.insert(0, 0)
+     if len(file_h_m_s) == 2:
+         file_h_m_s.insert(0, 0)
+     file_length_s = file_h_m_s[0] * 3600 + file_h_m_s[1] * 60 + file_h_m_s[2]
+
+     if file_length_s > YT_LENGTH_LIMIT_S:
+         yt_length_limit_hms = time.strftime("%HH:%MM:%SS", time.gmtime(YT_LENGTH_LIMIT_S))
+         file_length_hms = time.strftime("%HH:%MM:%SS", time.gmtime(file_length_s))
+         raise gr.Error(f"Maximum YouTube length is {yt_length_limit_hms}, got a YouTube video of length {file_length_hms}.")
+
+     ydl_opts = {"outtmpl": filename, "format": "worstvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best"}
+
+     with youtube_dl.YoutubeDL(ydl_opts) as ydl:
+         try:
+             ydl.download([yt_url])
+         except youtube_dl.utils.ExtractorError as err:
+             raise gr.Error(str(err))
+
+
+ def yt_transcribe(yt_url, max_filesize=75.0):
+     html_embed_str = _return_yt_html_embed(yt_url)
+
+     with tempfile.TemporaryDirectory() as tmpdirname:
+         filepath = os.path.join(tmpdirname, "video.mp4")
+         download_yt_audio(yt_url, filepath)
+         audio = AudioSegment.from_file(filepath)
+         audio = audio.set_channels(1).set_frame_rate(16000)  # convert to mono 16 kHz
+         wav_filepath = os.path.join(tmpdirname, "audio.wav")
+         audio.export(wav_filepath, format="wav")
+         text = get_transcripts(wav_filepath)
+
+     return html_embed_str, text
+
+
+ demo = gr.Blocks()
+
+ mf_transcribe = gr.Interface(
+     fn=get_transcripts,
+     inputs=[
+         gr.Audio(sources="microphone", type="filepath")
+     ],
+     outputs="text",
+     title="Transcribe Audio",
+     description=(
+         "Transcribe microphone or audio inputs with the click of a button! Demo uses the"
+         f" checkpoint [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and [NVIDIA NeMo](https://github.com/NVIDIA/NeMo) to transcribe audio files"
+         " of arbitrary length."
+     ),
+     allow_flagging="never",
+ )
+
+ file_transcribe = gr.Interface(
+     fn=get_transcripts,
+     inputs=[
+         gr.Audio(sources="upload", type="filepath", label="Audio file"),
+     ],
+     outputs="text",
+     examples=EXAMPLES,
+     title="Transcribe Audio",
+     description=(
+         "Transcribe microphone or audio inputs with the click of a button! Demo uses the"
+         f" checkpoint [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and [NVIDIA NeMo](https://github.com/NVIDIA/NeMo) to transcribe audio files"
+         " of arbitrary length."
+     ),
+     allow_flagging="never",
+ )
+
+ """
+ youtube_transcribe = gr.Interface(
+     fn=yt_transcribe,
+     inputs=[
+         gr.Textbox(lines=1, placeholder="Paste the URL to a YouTube video here", label="YouTube URL"),
+     ],
+     outputs=["html", "text"],
+     examples=YT_EXAMPLES,
+     title="Transcribe Audio",
+     description=(
+         "Transcribe microphone or audio inputs with the click of a button! Demo uses the"
+         f" checkpoint [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and [NVIDIA NeMo](https://github.com/NVIDIA/NeMo) to transcribe audio files"
+         " of arbitrary length."
+     ),
+     allow_flagging="never",
+ )
+ """
+
+ with demo:
+     gr.TabbedInterface(
+         [
+             mf_transcribe,
+             file_transcribe,
+             # youtube_transcribe
+         ],
+         [
+             "Microphone",
+             "Audio file",
+             # "Youtube Video"
+         ]
+     )
+
+ demo.launch()
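Note: a minimal sketch of running the demo outside the container, assuming an environment that already provides nemo_toolkit[asr] plus the ffmpeg and libsndfile1 system packages (the Space gets these from the NeMo base image; requirements.txt keeps nemo-toolkit commented out for that reason):

    pip install -r pre-requirements.txt
    pip install -r requirements.txt
    python app.py   # starts the Gradio TabbedInterface defined above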
common_voice_zgh_37837257.mp3 ADDED
Binary file (28.1 kB).
 
install_beamsearch_decoders.sh ADDED
@@ -0,0 +1,70 @@
+ #!/usr/bin/env bash
+ # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ # Use this script to install KenLM, the OpenSeq2Seq decoder, and the Flashlight decoder
+ shopt -s expand_aliases
+
+ NEMO_PATH=/workspace/nemo # Path to NeMo folder: /workspace/nemo if you use NeMo/Dockerfile
+ if [ "$#" -eq 1 ]; then
+   NEMO_PATH=$1
+ fi
+ KENLM_MAX_ORDER=10 # Maximum order of the KenLM model, also specified in setup_os2s_decoders.py
+
+ if [ -d "$NEMO_PATH" ]; then
+   echo "The folder '$NEMO_PATH' exists."
+ else
+   echo "Error: The folder '$NEMO_PATH' does not exist. Specify it as the first command-line positional argument!"
+   exit 1
+ fi
+ cd $NEMO_PATH
+
+ if [ $(id -u) -eq 0 ]; then
+   alias aptupdate='apt-get update'
+   alias b2install='./b2'
+ else
+   alias aptupdate='sudo apt-get update'
+   alias b2install='sudo ./b2'
+ fi
+
+ aptupdate && apt-get upgrade -y
+ # apt-get install -y swig liblzma-dev && rm -rf /var/lib/apt/lists/* # liblzma needed for the Flashlight decoder
+
+ # install the Boost package for KenLM
+ wget https://boostorg.jfrog.io/artifactory/main/release/1.80.0/source/boost_1_80_0.tar.bz2 --no-check-certificate && tar --bzip2 -xf $NEMO_PATH/boost_1_80_0.tar.bz2 && cd boost_1_80_0 && ./bootstrap.sh && b2install --layout=tagged link=static,shared threading=multi,single install -j4 && cd .. || echo FAILURE
+ export BOOST_ROOT=$NEMO_PATH/boost_1_80_0
+
+ git clone https://github.com/NVIDIA/OpenSeq2Seq
+ cd OpenSeq2Seq
+ git checkout ctc-decoders
+ cd ..
+ mv OpenSeq2Seq/decoders $NEMO_PATH/
+ rm -rf OpenSeq2Seq
+ cd $NEMO_PATH/decoders
+ cp $NEMO_PATH/scripts/installers/setup_os2s_decoders.py ./setup.py
+ ./setup.sh
+
+ # install KenLM
+ cd $NEMO_PATH/decoders/kenlm/build && cmake -DKENLM_MAX_ORDER=$KENLM_MAX_ORDER .. && make -j2
+ cd $NEMO_PATH/decoders/kenlm
+ python setup.py install --max_order=$KENLM_MAX_ORDER
+ export KENLM_LIB=$NEMO_PATH/decoders/kenlm/build/bin
+ export KENLM_ROOT=$NEMO_PATH/decoders/kenlm
+ cd ..
+
+ # install Flashlight
+ # git clone https://github.com/flashlight/text && cd text
+ # python setup.py bdist_wheel
+ # pip install dist/*.whl
+ # cd ..
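Note: NEMO_PATH defaults to /workspace/nemo (the layout of the NeMo Docker image, as used by the Dockerfile above); a NeMo checkout elsewhere can be passed as the first positional argument. The path below is illustrative:

    bash scripts/asr_language_modeling/ngram_lm/install_beamsearch_decoders.sh /path/to/NeMo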
kenlm.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5e3106cf41031192efb8bd1c615f2f40fc12b9d9ae7132b5b6e16b50aa8e4b83
+ size 69178189
packages.txt ADDED
@@ -0,0 +1,2 @@
+ ffmpeg
+ libsndfile1
pre-requirements.txt ADDED
@@ -0,0 +1 @@
+ Cython
requirements.txt ADDED
@@ -0,0 +1,7 @@
+ Cython
+ huggingface-hub==0.23.2
+ # nemo-toolkit[asr]==2.0.0rc1
+ # numpy<2.0.0
+ # ipython
+ # yt_dlp
+ gradio