import numpy as np
import torch
import gradio as gr
from transformers import WhisperProcessor, WhisperForConditionalGeneration

# Choose a suitable Kannada speech-to-text model from Hugging Face.
# vasista22/whisper-kannada-tiny is a Whisper checkpoint, so it is loaded
# with the Whisper classes rather than the Wav2Vec2 CTC classes.
model_name = "vasista22/whisper-kannada-tiny"  # Replace with your preferred model

processor = WhisperProcessor.from_pretrained(model_name)
model = WhisperForConditionalGeneration.from_pretrained(model_name)
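
# Optional (an assumption, not required by the rest of the script): move the
# model to a GPU when one is available; inputs created below would then need
# .to(model.device) before calling generate().
# model.to("cuda" if torch.cuda.is_available() else "cpu")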

def transcribe_kannada(audio_data, sampling_rate=16000):
    """
    Transcribes recorded Kannada audio using the specified Hugging Face model.

    Args:
        audio_data: A 1-D NumPy array of mono audio samples in [-1, 1].
        sampling_rate: Sampling rate of audio_data in Hz. Whisper expects
            16 kHz input, so resample beforehand if necessary.

    Returns:
        The transcribed text in Kannada.
    """
    inputs = processor(audio_data, sampling_rate=sampling_rate, return_tensors="pt")

    with torch.no_grad():
        # Whisper is an encoder-decoder model, so transcription uses
        # generate() rather than a CTC argmax over logits. If needed, the
        # output language can be pinned with forced_decoder_ids from
        # processor.get_decoder_prompt_ids(language="kn", task="transcribe").
        predicted_ids = model.generate(inputs.input_features)

    return processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
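
# Standalone usage (a sketch; assumes a 16 kHz mono WAV on disk and that the
# optional soundfile package is installed):
#   import soundfile as sf
#   samples, sr = sf.read("kannada_sample.wav")
#   print(transcribe_kannada(samples, sampling_rate=sr))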

def record_and_transcribe(audio):
    """
    Handles a Gradio microphone recording: scales it to [-1, 1], mixes it down
    to mono, resamples it for the model, transcribes it, and also returns the
    audio reversed for playback.

    Args:
        audio: A (sample_rate, data) tuple, as produced by gr.Audio(type="numpy").

    Returns:
        A (transcription, reversed_audio) tuple matching the interface outputs.
    """
    if audio is None:
        return "", None

    sample_rate, data = audio

    # Gradio delivers integer PCM; scale to float samples in [-1, 1].
    if np.issubdtype(data.dtype, np.integer):
        data = data.astype(np.float32) / np.iinfo(data.dtype).max
    else:
        data = data.astype(np.float32)

    # Mix stereo recordings down to a single channel.
    if data.ndim > 1:
        data = data.mean(axis=1)

    # Resample to the 16 kHz the model expects (simple linear interpolation).
    if sample_rate != 16000:
        target_len = int(len(data) * 16000 / sample_rate)
        data = np.interp(
            np.linspace(0, len(data) - 1, num=target_len),
            np.arange(len(data)),
            data,
        ).astype(np.float32)

    transcription = transcribe_kannada(data, sampling_rate=16000)

    # Reversed copy for the optional audio output.
    reversed_audio = (16000, data[::-1])

    return transcription, reversed_audio



# Note: gr.Audio does not accept a `normalization` argument, so scaling the
# samples to [-1, 1] is done manually in record_and_transcribe above.

input_audio = gr.Audio(
    sources=["microphone"],
    type="numpy",  # Specify audio format as NumPy array
    label="Record Kannada Audio",
)

text_output = gr.Textbox(label="Transcription (ಕನ್ನಡ)")
audio_output = gr.Audio(label="Reversed Audio (Optional)", type="numpy")

demo = gr.Interface(
    fn=record_and_transcribe,
    inputs=input_audio,
    outputs=[text_output, audio_output],
    description="Kannada Speech-to-Text and Reverse Audio",
)


if __name__ == "__main__":
    demo.launch(share=True)
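
# To run this script, the standard PyPI packages are assumed:
#   pip install gradio transformers torch numpy
# share=True in launch() also creates a temporary public Gradio link.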