nizarzerouale committed on
Commit
5397a6e
1 Parent(s): 3d1492d

Upload 10 files

Browse files

add speech to text

models/speech_to_text/main.py ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import torch
3
+ import os
4
+ from transcriber.model import load_model_and_processor
5
+ from transcriber.audio import preprocess_audio, transcribe_audio
6
+ from transcriber.utils import get_audio_files_from_directory
7
+
8
def transcribe_multiple_files(model, processor, audio_files, target_sr=16000):
    """
    Transcribe a batch of audio files with a shared model/processor pair.

    Parameters:
        model: The Whisper model.
        processor: The processor used for preparing the input features.
        audio_files (list): Paths of the audio files to transcribe.
        target_sr (int): Target sampling rate for the audio. Defaults to 16000.

    Returns:
        dict: Mapping of each file path to its transcription text.
    """
    transcriptions = {}

    for path in audio_files:
        print(f"Processing file: {path}")
        # Resample to the rate the model expects, then run inference.
        waveform = preprocess_audio(path, target_sr=target_sr)
        text = transcribe_audio(model, processor, waveform, target_sr=target_sr)
        transcriptions[path] = text
        print(f"Transcription for {path}: {text}")

    return transcriptions
31
+
32
def main():
    """
    CLI entry point: transcribe one audio file, or every audio file in a
    directory, using Whisper.

    Accepts a single positional argument (a file path or a directory path),
    resolves it to a list of audio files, then loads the model once and
    transcribes each file.
    """
    # Argument parser to accept directory or audio files as input
    parser = argparse.ArgumentParser(description="Transcribe audio files using Whisper.")
    parser.add_argument('input_path', type=str, help="Path to the audio file or directory containing audio files.")
    args = parser.parse_args()
    input_path = args.input_path

    # Validate the input path BEFORE loading the model, so an invalid path
    # fails fast instead of after a slow (possibly network-bound) model load.
    if os.path.isdir(input_path):
        # Get all audio files from the directory
        audio_files = get_audio_files_from_directory(input_path)
    elif os.path.isfile(input_path):
        # Single file path provided
        audio_files = [input_path]
    else:
        print(f"Invalid input path: {input_path}")
        return

    # Load model and processor once and reuse them for every file.
    model, processor = load_model_and_processor()

    # Transcribe all audio files
    transcriptions = transcribe_multiple_files(model, processor, audio_files)

    # Print a final summary of all transcriptions.
    for file, transcription in transcriptions.items():
        print(f"File: {file}, Transcription: {transcription}")

if __name__ == "__main__":
    main()
models/speech_to_text/requirements.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ torch
2
+ transformers
3
+ librosa
models/speech_to_text/transcriber/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ # transcriber/__init__.py
models/speech_to_text/transcriber/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (200 Bytes). View file
 
models/speech_to_text/transcriber/__pycache__/audio.cpython-312.pyc ADDED
Binary file (1.9 kB). View file
 
models/speech_to_text/transcriber/__pycache__/model.cpython-312.pyc ADDED
Binary file (937 Bytes). View file
 
models/speech_to_text/transcriber/__pycache__/utils.cpython-312.pyc ADDED
Binary file (1.32 kB). View file
 
models/speech_to_text/transcriber/audio.py ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import librosa
2
+ import torch
3
def preprocess_audio(file_path, target_sr=16000):
    """
    Load an audio file and resample it to the target sampling rate.

    Parameters:
        file_path (str): Path to the audio file.
        target_sr (int): Target sampling rate. Defaults to 16000 Hz.

    Returns:
        resampled_audio (np.ndarray): Audio samples at ``target_sr``.
    """
    # Load at the file's native rate (sr=None keeps the original sample rate).
    audio_input, sample_rate = librosa.load(file_path, sr=None)
    # Skip the (expensive) resample pass entirely when the file is already
    # at the target rate — the original code resampled unconditionally.
    if sample_rate == target_sr:
        return audio_input
    return librosa.resample(audio_input, orig_sr=sample_rate, target_sr=target_sr)
17
+
18
def transcribe_audio(model, processor, audio, target_sr=16000):
    """
    Run Whisper inference on a single pre-processed audio array.

    Parameters:
        model: The Whisper model.
        processor: The processor used for preparing the input features.
        audio (np.ndarray): The resampled audio samples.
        target_sr (int): Sampling rate the audio was resampled to.

    Returns:
        str: The transcribed text for the audio.
    """
    # Convert raw samples into the model's expected input features.
    features = processor(audio, sampling_rate=target_sr, return_tensors="pt").input_features
    # Inference only — no gradients needed.
    with torch.no_grad():
        token_ids = model.generate(features)
    decoded = processor.batch_decode(token_ids, skip_special_tokens=True)
    return decoded[0]
models/speech_to_text/transcriber/model.py ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import WhisperProcessor, WhisperForConditionalGeneration
2
+
3
def load_model_and_processor(model_name="openai/whisper-base"):
    """
    Instantiate a Whisper model together with its matching processor.

    Parameters:
        model_name (str): Hugging Face model identifier to load.
            Defaults to 'openai/whisper-base'.

    Returns:
        model (WhisperForConditionalGeneration): Loaded Whisper model.
        processor (WhisperProcessor): Loaded processor for the model.
    """
    # Both objects are resolved from the same model name so they stay in sync.
    return (
        WhisperForConditionalGeneration.from_pretrained(model_name),
        WhisperProcessor.from_pretrained(model_name),
    )
models/speech_to_text/transcriber/utils.py ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+
3
def get_audio_files_from_directory(directory, extensions=(".wav", ".mp3")):
    """
    Recursively collect audio files from a directory.

    Parameters:
        directory (str): The directory to search for audio files.
        extensions (tuple | list): Valid audio file extensions.
            Defaults to ('.wav', '.mp3').

    Returns:
        audio_files (list): Paths to all matching audio files.
    """
    # A tuple default replaces the original mutable list default (the classic
    # shared-mutable-default pitfall); callers passing a list still work.
    suffixes = tuple(extensions)
    audio_files = []
    for root, _, files in os.walk(directory):
        for file in files:
            # str.endswith accepts a tuple of suffixes directly — no need
            # for a per-extension any(...) loop.
            if file.endswith(suffixes):
                audio_files.append(os.path.join(root, file))
    return audio_files