Spaces:
Sleeping
Sleeping
File size: 3,389 Bytes
95261ed |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 |
# External programs
import whisper
class WhisperContainer:
def __init__(self, model_name: str, device: str = None):
self.model_name = model_name
self.device = device
# Will be created on demand
self.model = None
def get_model(self):
if self.model is None:
print("Loading model " + self.model_name)
self.model = whisper.load_model(self.model_name, device=self.device)
return self.model
def create_callback(self, language: str = None, task: str = None, initial_prompt: str = None, **decodeOptions: dict):
"""
Create a WhisperCallback object that can be used to transcript audio files.
Parameters
----------
language: str
The target language of the transcription. If not specified, the language will be inferred from the audio content.
task: str
The task - either translate or transcribe.
initial_prompt: str
The initial prompt to use for the transcription.
decodeOptions: dict
Additional options to pass to the decoder. Must be pickleable.
Returns
-------
A WhisperCallback object.
"""
return WhisperCallback(self, language=language, task=task, initial_prompt=initial_prompt, **decodeOptions)
# This is required for multiprocessing
def __getstate__(self):
return { "model_name": self.model_name, "device": self.device }
def __setstate__(self, state):
self.model_name = state["model_name"]
self.device = state["device"]
self.model = None
class WhisperCallback:
def __init__(self, model_container: WhisperContainer, language: str = None, task: str = None, initial_prompt: str = None, **decodeOptions: dict):
self.model_container = model_container
self.language = language
self.task = task
self.initial_prompt = initial_prompt
self.decodeOptions = decodeOptions
def invoke(self, audio, segment_index: int, prompt: str, detected_language: str):
"""
Peform the transcription of the given audio file or data.
Parameters
----------
audio: Union[str, np.ndarray, torch.Tensor]
The audio file to transcribe, or the audio data as a numpy array or torch tensor.
segment_index: int
The target language of the transcription. If not specified, the language will be inferred from the audio content.
task: str
The task - either translate or transcribe.
prompt: str
The prompt to use for the transcription.
detected_language: str
The detected language of the audio file.
Returns
-------
The result of the Whisper call.
"""
model = self.model_container.get_model()
return model.transcribe(audio, \
language=self.language if self.language else detected_language, task=self.task, \
initial_prompt=self._concat_prompt(self.initial_prompt, prompt) if segment_index == 0 else prompt, \
**self.decodeOptions)
def _concat_prompt(self, prompt1, prompt2):
if (prompt1 is None):
return prompt2
elif (prompt2 is None):
return prompt1
else:
return prompt1 + " " + prompt2 |