|
""" |
|
Speech-to-text module based on Vosk for SillyTavern Extras |
|
- Vosk website: https://alphacephei.com/vosk/ |
|
- Vosk api: https://github.com/alphacep/vosk-api |
|
|
|
Authors: |
|
- Tony Ribeiro (https://github.com/Tony-sama) |
|
|
|
Models are saved into user cache folder, example: C:/Users/toto/.cache/vosk |
|
|
|
References: |
|
- Code adapted from: https://github.com/alphacep/vosk-api/blob/master/python/example/test_simple.py |
|
""" |
|
from flask import jsonify, abort, request |
|
|
|
import wave |
|
from vosk import Model, KaldiRecognizer, SetLogLevel |
|
import soundfile |
|
|
|
DEBUG_PREFIX = "<stt vosk module>" |
|
RECORDING_FILE_PATH = "stt_test.wav" |
|
|
|
model = None |
|
|
|
SetLogLevel(-1) |
|
|
|
def load_model(file_path=None): |
|
""" |
|
Load given vosk model from file or default to en-us model. |
|
Download model to user cache folder, example: C:/Users/toto/.cache/vosk |
|
""" |
|
|
|
if file_path is None: |
|
return Model(lang="en-us") |
|
else: |
|
return Model(file_path) |
|
|
|
def process_audio(): |
|
""" |
|
Transcript request audio file to text using Whisper |
|
""" |
|
|
|
if model is None: |
|
print(DEBUG_PREFIX,"Vosk model not initialized yet.") |
|
return "" |
|
|
|
try: |
|
file = request.files.get('AudioFile') |
|
file.save(RECORDING_FILE_PATH) |
|
|
|
|
|
data, samplerate = soundfile.read(RECORDING_FILE_PATH) |
|
soundfile.write(RECORDING_FILE_PATH, data, samplerate) |
|
|
|
wf = wave.open(RECORDING_FILE_PATH, "rb") |
|
if wf.getnchannels() != 1 or wf.getsampwidth() != 2 or wf.getcomptype() != "NONE": |
|
print("Audio file must be WAV format mono PCM.") |
|
abort(500, DEBUG_PREFIX+" Audio file must be WAV format mono PCM.") |
|
|
|
rec = KaldiRecognizer(model, wf.getframerate()) |
|
|
|
|
|
|
|
while True: |
|
data = wf.readframes(4000) |
|
if len(data) == 0: |
|
break |
|
if rec.AcceptWaveform(data): |
|
break |
|
|
|
transcript = rec.Result()[14:-3] |
|
print(DEBUG_PREFIX, "Transcripted from request audio file:", transcript) |
|
return jsonify({"transcript": transcript}) |
|
|
|
except Exception as e: |
|
print(e) |
|
abort(500, DEBUG_PREFIX+" Exception occurs while processing audio") |