File size: 3,607 Bytes
48b9b5d
3ccf873
 
 
c7a4f81
3ccf873
 
 
 
 
 
 
48b9b5d
 
 
 
 
 
75b7975
 
48b9b5d
 
 
 
75b7975
 
48b9b5d
 
 
 
75b7975
48b9b5d
 
 
8b4aa8a
75b7975
48b9b5d
 
 
 
 
 
 
 
 
 
 
 
 
79c8857
 
48b9b5d
 
 
 
 
79c8857
48b9b5d
 
79c8857
c092255
 
 
 
 
 
79c8857
 
c092255
 
79c8857
11efa99
 
 
48b9b5d
 
 
79c8857
 
 
 
 
 
3ccf873
48b9b5d
3ccf873
79c8857
3ccf873
48b9b5d
 
 
79c8857
 
 
 
 
 
 
 
48b9b5d
 
 
 
 
 
 
 
 
 
 
 
 
 
79c8857
 
48b9b5d
 
 
 
 
 
f09d2ab
48b9b5d
 
7dc8950
f5ddb49
48b9b5d
 
79c8857
 
 
 
 
 
 
 
f09d2ab
79c8857
 
 
 
 
 
 
 
 
 
 
f09d2ab
 
 
 
79c8857
 
 
48b9b5d
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
import gradio as gr
import numpy as np
import torch
import transformers
from pathlib import Path
from transformers import pipeline
from transformers.utils import logging

# Logging

# Uncomment the next line to surface the debug traces emitted by this module.
#logging.set_verbosity_debug()
logger = logging.get_logger("transformers")

# Pipelines

## Automatic Speech Recognition
## https://huggingface.co/docs/transformers/task_summary#automatic-speech-recognition
## Requires ffmpeg to be installed
asr_model = "openai/whisper-tiny"
# Half precision on the first CUDA device when available, full precision on CPU otherwise.
if torch.cuda.is_available():
    asr_device, asr_torch_dtype = "cuda:0", torch.float16
else:
    asr_device, asr_torch_dtype = "cpu", torch.float32
asr = pipeline(
    "automatic-speech-recognition",
    model=asr_model,
    torch_dtype=asr_torch_dtype,
    device=asr_device,
)

## Token Classification / Named Entity Recognition
## https://huggingface.co/docs/transformers/task_summary#token-classification
tc_model = "dslim/distilbert-NER"
if torch.cuda.is_available():
    tc_device = 0
else:
    tc_device = "cpu"
tc = pipeline(
    "token-classification",  # ner
    model=tc_model,
    device=tc_device,
)

# ---

# Transformers

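# https://www.gradio.app/main/docs/gradio/audio#behavior
# As an input component, gr.Audio passes the audio to the function as either
# (depending on its `type` setting):
# - a str filepath,
# - or a tuple of (sample rate in Hz, audio data as numpy array).
# As an output component, it expects audio data in any of these formats:
# - a str or pathlib.Path filepath,
# - or a URL to an audio file,
# - or a bytes object (recommended for streaming),
# - or a tuple of (sample rate in Hz, audio data as numpy array).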
def transcribe(audio: str | Path | bytes | tuple[int, np.ndarray] | None):
    """Transcribe audio to text with the module-level ASR pipeline.

    Args:
        audio: One of
            - a ``(sample rate in Hz, samples)`` tuple,
            - a ``str`` / ``Path`` pointing at an audio file (a ``str`` may
              also be a URL),
            - the ``bytes`` content of an audio file,
            - ``None`` when no audio was provided.

    Returns:
        The transcribed text; ``"..."`` when ``audio`` is ``None``; an empty
        string when the input cannot be handled (unsupported type, empty
        waveform, or a tuple while ``asr_model`` is not a Whisper model).
    """
    logger.debug(">Transcribe")

    if audio is None:
        return "..."

    text = ""

    # https://huggingface.co/docs/transformers/main_classes/pipelines#transformers.AutomaticSpeechRecognitionPipeline.__call__
    if isinstance(audio, (str, Path, bytes)):
        # File names, URLs and raw file bytes are decoded by the pipeline
        # itself (through ffmpeg); it only needs a plain str for paths.
        source = str(audio) if isinstance(audio, Path) else audio
        text = asr(source)["text"]
    elif asr_model.startswith("openai/whisper") and isinstance(audio, tuple):
        # Whisper input format for tuple differs from the (rate, samples)
        # tuple provided by the gradio audio component.
        sampling_rate, raw = audio
        raw = np.asarray(raw)

        # Convert to mono if stereo
        if raw.ndim > 1:
            raw = raw.mean(axis=1)

        # The pipeline documents float32/float64 waveforms. The previous
        # dtype check against asr_torch_dtype could never be true, so
        # float32 was always what got used; make that explicit.
        # astype() copies, so the caller's array is never modified below.
        raw = raw.astype(np.float32)

        if raw.size:
            # Peak-normalise, but leave silent audio untouched: dividing by
            # a zero peak would fill the waveform with NaN.
            peak = np.max(np.abs(raw))
            if peak > 0:
                raw /= peak

            inputs = {"sampling_rate": sampling_rate, "raw": raw}

            logger.debug(inputs)

            text = asr(inputs)["text"]

    logger.debug(text)

    return text

def tokenize(text: str):
    """Run the token-classification pipeline on ``text``.

    Args:
        text: The text to analyse.

    Returns:
        A dict with the original ``text`` and the ``entities`` returned by
        the module-level ``tc`` pipeline.
    """
    logger.debug(">Tokenize")

    found = tc(text)
    logger.debug(found)

    # TODO Add Text Classification for sentiment analysis
    return dict(text=text, entities=found)

def classify(text: str) -> None:
    """Placeholder for text classification.

    Currently only emits a debug trace; ``text`` is not used and the
    function returns ``None``.
    """
    logger.debug(">Classify")

def transcribe_tokenize(*arg):
    """Transcribe audio, then run token classification on the transcript.

    Args:
        *arg: Either the audio value as a single positional argument (how
            the Gradio interface calls it), or the sample rate and the
            samples as two positionals.

    Returns:
        Whatever ``tokenize`` returns for the transcript.
    """
    # A lone positional IS the audio value (possibly None). Forwarding the
    # packed *arg tuple would hand transcribe() a 1-tuple: its None guard
    # would be skipped and unpacking (rate, samples) would raise ValueError.
    # Two positionals keep working as a (rate, samples) tuple.
    audio = arg[0] if len(arg) == 1 else (arg or None)
    return tokenize(transcribe(audio))

# ---

# Gradio

## Interfaces

# https://www.gradio.app/main/docs/gradio/audio
# Audio can come from an uploaded file or from the microphone.
input_audio = gr.Audio(sources=["upload", "microphone"], show_share_button=False)

## App

# Tab 1: audio -> transcript -> highlighted entities
asrner_app = gr.Interface(
    fn=transcribe_tokenize,
    inputs=[input_audio],
    outputs=[gr.HighlightedText()],
    title="ASR>NER",
    description="Transcribe, Tokenize, Classify",
    flagging_mode="never",
)

# Tab 2: typed text -> highlighted entities
ner_app = gr.Interface(
    fn=tokenize,
    inputs=[gr.Textbox()],
    outputs=[gr.HighlightedText()],
    title="NER",
    description="Tokenize, Classify",
    flagging_mode="never",
)

# Both interfaces side by side, each tab labelled with its interface title.
gradio_app = gr.TabbedInterface(
    interface_list=[asrner_app, ner_app],
    tab_names=[app.title for app in (asrner_app, ner_app)],
    title="ASRNERSBX",
)

## Start!
gradio_app.launch()