nizarzerouale committed
Upload 10 files
add speech to text
- models/speech_to_text/main.py +63 -0
- models/speech_to_text/requirements.txt +3 -0
- models/speech_to_text/transcriber/__init__.py +1 -0
- models/speech_to_text/transcriber/__pycache__/__init__.cpython-312.pyc +0 -0
- models/speech_to_text/transcriber/__pycache__/audio.cpython-312.pyc +0 -0
- models/speech_to_text/transcriber/__pycache__/model.cpython-312.pyc +0 -0
- models/speech_to_text/transcriber/__pycache__/utils.cpython-312.pyc +0 -0
- models/speech_to_text/transcriber/audio.py +35 -0
- models/speech_to_text/transcriber/model.py +16 -0
- models/speech_to_text/transcriber/utils.py +19 -0
models/speech_to_text/main.py
ADDED
@@ -0,0 +1,63 @@
import argparse
import os

from transcriber.model import load_model_and_processor
from transcriber.audio import preprocess_audio, transcribe_audio
from transcriber.utils import get_audio_files_from_directory

def transcribe_multiple_files(model, processor, audio_files, target_sr=16000):
    """
    Transcribes multiple audio files.

    Parameters:
        model: The Whisper model.
        processor: The processor used for preparing the input features.
        audio_files (list): List of paths to audio files.
        target_sr (int): The target sampling rate for the audio.

    Returns:
        results (dict): Dictionary mapping file names to their transcriptions.
    """
    results = {}

    for file_path in audio_files:
        print(f"Processing file: {file_path}")
        audio = preprocess_audio(file_path, target_sr=target_sr)
        transcription = transcribe_audio(model, processor, audio, target_sr=target_sr)
        results[file_path] = transcription
        print(f"Transcription for {file_path}: {transcription}")

    return results

def main():
    # Argument parser to accept a directory or a single audio file as input
    parser = argparse.ArgumentParser(description="Transcribe audio files using Whisper.")
    parser.add_argument('input_path', type=str, help="Path to the audio file or directory containing audio files.")
    args = parser.parse_args()

    # Load the model and processor once
    model, processor = load_model_and_processor()

    # Check whether the input is a directory or a single file
    input_path = args.input_path
    audio_files = []

    if os.path.isdir(input_path):
        # Get all audio files from the directory
        audio_files = get_audio_files_from_directory(input_path)
    elif os.path.isfile(input_path):
        # A single file path was provided
        audio_files = [input_path]
    else:
        print(f"Invalid input path: {input_path}")
        return

    # Transcribe all audio files
    transcriptions = transcribe_multiple_files(model, processor, audio_files)

    # Print a summary; the transcriptions could also be written to a file here
    for file, transcription in transcriptions.items():
        print(f"File: {file}, Transcription: {transcription}")

if __name__ == "__main__":
    main()
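For reference, a minimal sketch of driving these helpers without the CLI; the samples/clip.wav path is a made-up example, everything else comes from the modules in this commit. The CLI equivalent would be python main.py samples/clip.wav, or a directory path.

from transcriber.model import load_model_and_processor
from transcriber.audio import preprocess_audio, transcribe_audio

# Load the model and processor once and reuse them across files.
model, processor = load_model_and_processor()

# "samples/clip.wav" is a hypothetical file used only for illustration.
audio = preprocess_audio("samples/clip.wav", target_sr=16000)
print(transcribe_audio(model, processor, audio, target_sr=16000))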
models/speech_to_text/requirements.txt
ADDED
@@ -0,0 +1,3 @@
torch
transformers
librosa
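One environment note: librosa itself pulls in soundfile and audioread for decoding, which covers .wav out of the box; .mp3 decoding can additionally depend on the system's libsndfile or ffmpeg build, so it is worth verifying mp3 support in the target environment before relying on it.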
models/speech_to_text/transcriber/__init__.py
ADDED
@@ -0,0 +1 @@
# transcriber/__init__.py
models/speech_to_text/transcriber/__pycache__/__init__.cpython-312.pyc
ADDED
Binary file (200 Bytes).
models/speech_to_text/transcriber/__pycache__/audio.cpython-312.pyc
ADDED
Binary file (1.9 kB).
models/speech_to_text/transcriber/__pycache__/model.cpython-312.pyc
ADDED
Binary file (937 Bytes).
models/speech_to_text/transcriber/__pycache__/utils.cpython-312.pyc
ADDED
Binary file (1.32 kB).
models/speech_to_text/transcriber/audio.py
ADDED
@@ -0,0 +1,35 @@
import librosa
import torch

def preprocess_audio(file_path, target_sr=16000):
    """
    Loads and resamples audio from the specified file.

    Parameters:
        file_path (str): Path to the audio file.
        target_sr (int): Target sampling rate. Defaults to 16000 Hz.

    Returns:
        resampled_audio (np.ndarray): Resampled audio data.
    """
    audio_input, sample_rate = librosa.load(file_path, sr=None)  # Keep the original sample rate
    resampled_audio = librosa.resample(audio_input, orig_sr=sample_rate, target_sr=target_sr)
    return resampled_audio

def transcribe_audio(model, processor, audio, target_sr=16000):
    """
    Transcribes the given audio using the Whisper model.

    Parameters:
        model: The Whisper model.
        processor: The processor used for preparing the input features.
        audio (np.ndarray): The resampled audio data.
        target_sr (int): The target sampling rate for the audio.

    Returns:
        transcription (str): The transcribed text from the audio.
    """
    input_features = processor(audio, sampling_rate=target_sr, return_tensors="pt").input_features
    with torch.no_grad():
        predicted_ids = model.generate(input_features)
    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
    return transcription
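A side note on preprocess_audio: librosa can also resample at load time via the sr argument, so the load-then-resample pair above could be collapsed into one call. A sketch of that equivalent variant (the _one_step name is just for illustration):

import librosa

def preprocess_audio_one_step(file_path, target_sr=16000):
    # sr=target_sr tells librosa to resample while loading.
    audio, _ = librosa.load(file_path, sr=target_sr)
    return audio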
models/speech_to_text/transcriber/model.py
ADDED
@@ -0,0 +1,16 @@
from transformers import WhisperProcessor, WhisperForConditionalGeneration

def load_model_and_processor(model_name="openai/whisper-base"):
    """
    Loads the Whisper model and processor.

    Parameters:
        model_name (str): The model to load. Defaults to 'openai/whisper-base'.

    Returns:
        model (WhisperForConditionalGeneration): Loaded Whisper model.
        processor (WhisperProcessor): Loaded processor for the model.
    """
    model = WhisperForConditionalGeneration.from_pretrained(model_name)
    processor = WhisperProcessor.from_pretrained(model_name)
    return model, processor
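Since load_model_and_processor takes the checkpoint name as a parameter, other Whisper sizes drop in directly; a sketch of loading a larger checkpoint and using a GPU when present (the size choice here is illustrative):

import torch
from transcriber.model import load_model_and_processor

model, processor = load_model_and_processor("openai/whisper-small")
device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device)
# Note: transcribe_audio in audio.py builds input_features on the CPU, so they
# would need a matching .to(device) before model.generate() on a GPU.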
models/speech_to_text/transcriber/utils.py
ADDED
@@ -0,0 +1,19 @@
import os

def get_audio_files_from_directory(directory, extensions=(".wav", ".mp3")):
    """
    Retrieves all audio files from a specified directory.

    Parameters:
        directory (str): The directory to search for audio files.
        extensions (tuple): Valid audio file extensions.

    Returns:
        audio_files (list): List of paths to audio files.
    """
    audio_files = []
    for root, _, files in os.walk(directory):
        for file in files:
            if any(file.endswith(ext) for ext in extensions):
                audio_files.append(os.path.join(root, file))
    return audio_files
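A quick usage sketch for this helper; the recordings directory and the .flac extension are assumptions for illustration:

from transcriber.utils import get_audio_files_from_directory

# "recordings" is a hypothetical directory name.
files = get_audio_files_from_directory("recordings", extensions=(".wav", ".mp3", ".flac"))
for path in files:
    print(path)

Matching is case-sensitive, so files named with .WAV would be skipped unless that spelling is added to extensions.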