# deepugaur's picture
# Update app.py
# 52d87a4 verified
import librosa
import numpy as np
def preprocess_audio(file_path, sample_rate=16000):
    """Load an audio file and compute its mel spectrogram.

    Args:
        file_path: Path of the audio file; passed straight to
            ``librosa.load``.
        sample_rate: Sampling rate the audio is resampled to while loading.
            Defaults to 16000, the value this function previously
            hard-coded, so existing callers are unaffected.

    Returns:
        The array produced by ``librosa.feature.melspectrogram`` using its
        default settings.
    """
    # Use the rate reported by librosa rather than the requested one so the
    # spectrogram always matches the waveform that was actually loaded.
    waveform, loaded_rate = librosa.load(file_path, sr=sample_rate)
    return librosa.feature.melspectrogram(y=waveform, sr=loaded_rate)
from transformers import Wav2Vec2ForCTC, Wav2Vec2Tokenizer, MarianMTModel, MarianTokenizer
# Load pre-trained models (executed once, at import time).
#
# Speech recognition needs a checkpoint that was fine-tuned with a CTC head
# and ships a vocabulary. "facebook/wav2vec2-large-xlsr-53" is only the
# self-supervised pre-training checkpoint, so it cannot provide a tokenizer
# or meaningful transcriptions. The translator below is English -> Hindi,
# so an English ASR checkpoint is used.
SPEECH_TO_TEXT_CHECKPOINT = "facebook/wav2vec2-base-960h"
TRANSLATION_CHECKPOINT = "Helsinki-NLP/opus-mt-en-hi"

speech_to_text_model = Wav2Vec2ForCTC.from_pretrained(SPEECH_TO_TEXT_CHECKPOINT)
speech_to_text_tokenizer = Wav2Vec2Tokenizer.from_pretrained(SPEECH_TO_TEXT_CHECKPOINT)
translation_model = MarianMTModel.from_pretrained(TRANSLATION_CHECKPOINT)
translation_tokenizer = MarianTokenizer.from_pretrained(TRANSLATION_CHECKPOINT)
def translate_audio(file_path):
    """Transcribe the speech in an audio file and translate the transcript.

    The file is decoded to a waveform, transcribed with the module-level
    ``speech_to_text_model`` (greedy arg-max decoding of the CTC logits), and
    the transcript is translated with the module-level ``translation_model``.

    Args:
        file_path: Path of the audio file; read with ``librosa.load``.

    Returns:
        The translated text, or an empty string when the recogniser produced
        no text (there is nothing meaningful to translate in that case).
    """
    # Imported locally because this file has no module-level torch import.
    import torch

    # The acoustic model consumes raw audio samples, so decode the file first;
    # passing the path string (or a mel spectrogram) is not valid input.
    waveform, _ = librosa.load(file_path, sr=16000)

    # Convert to text
    audio_input = speech_to_text_tokenizer(waveform, return_tensors="pt").input_values
    with torch.no_grad():  # inference only: skip building the autograd graph
        logits = speech_to_text_model(audio_input).logits
    predicted_ids = logits.argmax(dim=-1)
    transcription = speech_to_text_tokenizer.batch_decode(predicted_ids)[0]

    if not transcription.strip():
        return ""

    # Translate text
    translation_input = translation_tokenizer(transcription, return_tensors="pt")
    translated_output = translation_model.generate(**translation_input)
    return translation_tokenizer.batch_decode(translated_output, skip_special_tokens=True)[0]
import datetime
# India Standard Time is a fixed UTC+05:30 offset with no daylight saving.
_IST = datetime.timezone(datetime.timedelta(hours=5, minutes=30), "IST")


def should_translate(now=None):
    """Return True when the IST wall-clock hour is 18 (6 PM) or later.

    The check is made in India Standard Time so it matches the "6 PM IST"
    message shown to users, independent of the server's local timezone.

    Args:
        now: Optional ``datetime.datetime`` to evaluate instead of the
            current time. Timezone-aware values are converted to IST; naive
            values are taken to already be IST wall-clock time.

    Returns:
        ``True`` from 18:00 up to midnight IST, otherwise ``False``.
    """
    if now is None:
        now = datetime.datetime.now(tz=_IST)
    elif now.tzinfo is not None:
        now = now.astimezone(_IST)
    return now.hour >= 18
def handle_translation(file_path):
    """Translate the audio file if the time gate allows it.

    Args:
        file_path: Path of the audio file, forwarded to ``translate_audio``.

    Returns:
        The result of ``translate_audio(file_path)`` when
        ``should_translate()`` is true; otherwise a fixed notice string.
    """
    # Guard clause: refuse early when the time gate is closed.
    if not should_translate():
        return "Translation is only available after 6 PM IST."
    return translate_audio(file_path)