yuweiiizz commited on
Commit
7b71b1d
·
1 Parent(s): 40531ab

Add application file

Browse files
README.md CHANGED
@@ -1,12 +1,10 @@
1
  ---
2
- title: Whisper Taiwanese
3
- emoji: 📊
4
- colorFrom: blue
5
- colorTo: blue
6
  sdk: gradio
7
- sdk_version: 4.44.1
8
  app_file: app.py
9
  pinned: false
10
- ---
11
-
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
  ---
2
+ title: Whisper Small Taiwanese Lora
3
+ emoji: 🎤
4
+ colorFrom: indigo
5
+ colorTo: pink
6
  sdk: gradio
7
+ sdk_version: 4.29.0
8
  app_file: app.py
9
  pinned: false
10
+ ---
 
 
app.py ADDED
@@ -0,0 +1,176 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import spaces
2
+ import torch
3
+ import gradio as gr
4
+ import os
5
+ from peft import PeftModel, PeftConfig
6
+
7
+ import tempfile
8
+ from transformers import (
9
+ pipeline,
10
+ WhisperForConditionalGeneration,
11
+ WhisperTokenizer,
12
+ WhisperProcessor,
13
+ )
14
+ from yt_dlp import YoutubeDL
15
+
16
# Demo settings.
peft_model_id = "yuweiiizz/whisper-small-taiwanese-lora"  # adapter id loaded through PEFT
language, task = "Chinese", "transcribe"  # forwarded to Whisper when loading and generating
yt_length_limit_s = 600  # maximum YouTube video length: 10 minutes

# Compute device used for inference (fixed value, not detected at runtime).
device = "cuda"
print(f"Using device: {device}")
25
+
26
# Read the cookies text from the COOKIES environment variable (a Space secret).
cookies_content = os.getenv('COOKIES')

# Startup is aborted when the secret is missing or empty.
if not cookies_content:
    raise ValueError("COOKIES secret is not set.")

# Persist the cookies to a temp file so its path can be handed to yt-dlp.
# The temp directory is shared, so the file is created owner-only (0o600)
# instead of with the default, usually world-readable, permissions.
cookies_temp_path = os.path.join(tempfile.gettempdir(), 'cookies.txt')
_cookies_fd = os.open(cookies_temp_path, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o600)
with os.fdopen(_cookies_fd, 'w', encoding='utf-8') as f:
    # The mode passed to os.open only applies to newly created files;
    # tighten a pre-existing file before the secret is written into it.
    os.chmod(cookies_temp_path, 0o600)
    f.write(cookies_content)
36
+
37
# Model and processor initialisation.
# The adapter config names the base checkpoint; load that checkpoint first,
# then wrap it with the PEFT adapter weights.
peft_config = PeftConfig.from_pretrained(peft_model_id)
model = PeftModel.from_pretrained(
    WhisperForConditionalGeneration.from_pretrained(
        peft_config.base_model_name_or_path,
        device_map=device,
    ),
    peft_model_id,
)

# Tokenizer and processor come from the same base checkpoint and share the
# language/task settings.
_whisper_options = {"language": language, "task": task}
tokenizer = WhisperTokenizer.from_pretrained(
    peft_config.base_model_name_or_path, **_whisper_options
)
processor = WhisperProcessor.from_pretrained(
    peft_config.base_model_name_or_path, **_whisper_options
)
feature_extractor = processor.feature_extractor

# Speech-recognition pipeline shared by both transcription entry points.
_asr_pipeline_kwargs = {
    "model": model,
    "tokenizer": tokenizer,
    "feature_extractor": feature_extractor,
    "max_new_tokens": 128,
    "chunk_length_s": 15,
}
pipe = pipeline("automatic-speech-recognition", **_asr_pipeline_kwargs)
53
+
54
# Transcribe an audio file.
@spaces.GPU
def transcribe(microphone=None, file_upload=None):
    """Transcribe a recorded or uploaded audio file with the ASR pipeline.

    Args:
        microphone: Audio recorded from the microphone; takes priority when
            both inputs are given.
        file_upload: Uploaded audio, used only when ``microphone`` is falsy.

    Returns:
        The transcribed text, prefixed with a warning line when both inputs
        were supplied, or an error message when neither was.
    """
    # Guard clause: nothing to transcribe.
    if not (microphone or file_upload):
        return "錯誤:您必須至少使用麥克風或上傳一個音訊檔案。"

    notice = ""
    if microphone and file_upload:
        notice = "警告:您同時使用了麥克風與上傳音訊檔案,將只會使用麥克風錄製的檔案。\n"

    # The microphone recording wins whenever it is present.
    audio_input = microphone or file_upload

    generation_options = {"task": task, "language": language}
    transcription = pipe(audio_input, generate_kwargs=generation_options)
    return notice + transcription["text"]
68
+
69
# Transcribe a YouTube video.
@spaces.GPU
def yt_transcribe(yt_url):
    """Download a video's audio with yt-dlp, transcribe it and build SRT subtitles.

    Args:
        yt_url: URL passed to yt-dlp.

    Returns:
        A 3-tuple ``(html_embed, srt_text, srt_path)``: an HTML snippet that
        embeds the video, the subtitles as SRT text, and the path of the SRT
        file written to the temp directory. On any failure the tuple is
        ``(error_message, "", None)``.
    """
    import html

    # NOTE(review): the module-level ``yt_length_limit_s`` is not enforced
    # here, while the UI description advertises arbitrary-length videos;
    # confirm which of the two is intended.
    audio_path = None
    try:
        import stable_whisper

        # Download the best audio stream and convert it to MP3.
        ydl_opts = {
            'format': 'bestaudio/best',
            'noplaylist': True,
            'quiet': True,
            'cookiefile': cookies_temp_path,
            'outtmpl': os.path.join(tempfile.gettempdir(), '%(id)s.%(ext)s'),
            'postprocessors': [{
                'key': 'FFmpegExtractAudio',
                'preferredcodec': 'mp3',
                'preferredquality': '192',
            }],
        }
        with YoutubeDL(ydl_opts) as ydl:
            info_dict = ydl.extract_info(yt_url, download=True)
            # The post-processor replaces the downloaded extension with .mp3.
            audio_path = os.path.splitext(ydl.prepare_filename(info_dict))[0] + ".mp3"

        # Check the video ID before spending time on the transcription.
        video_id = info_dict.get('id', None)
        if not video_id:
            return "錯誤:無法解析 YouTube 影片 ID。", "", None

        # Inference callback handed to stable_whisper.
        def inference(audio, **kwargs) -> list:
            pipe_output = pipe(
                audio,
                generate_kwargs={"task": task, "language": language},
                return_timestamps=True,
            )
            segments = []
            for chunk in pipe_output['chunks']:
                start, end = chunk['timestamp']
                # Either timestamp can be None; fall back to 0 for the start
                # and to start + 5 seconds for the end so that a missing
                # start can no longer raise a TypeError.
                start = start or 0
                segments.append({"start": start, "end": end or start + 5, "text": chunk['text']})
            return segments

        # Transcribe through stable_whisper.
        result = stable_whisper.transcribe_any(inference, audio_path, vad=True)

        # Embed the video; the ID is escaped because it ends up inside an HTML attribute.
        html_embed = (
            '<center><iframe width="500" height="320" '
            f'src="https://www.youtube.com/embed/{html.escape(str(video_id), quote=True)}">'
            '</iframe></center>'
        )

        # Format the subtitles and save them as an SRT file. The encoding is
        # explicit because the text is Chinese and the platform default may
        # not be able to represent it.
        srt_text = result.to_srt_vtt(word_level=False)
        srt_path = os.path.join(tempfile.gettempdir(), f"{video_id}.srt")
        with open(srt_path, 'w', encoding='utf-8') as srt_file:
            srt_file.write(srt_text)

        return html_embed, srt_text, srt_path
    except Exception as e:
        return f"錯誤:處理 YouTube 影片時發生錯誤。錯誤詳情:{str(e)}", "", None
    finally:
        # Best-effort cleanup of the downloaded audio on success and failure alike.
        if audio_path and os.path.exists(audio_path):
            try:
                os.remove(audio_path)
            except OSError:
                pass
120
+
121
# Example audio files (one single-element row per file).
example_paths = [
    [f"examples/{stem}.mp3"]
    for stem in (
        "common_voice_1",
        "common_voice_2",
        "common_voice_3",
        "dictionary_1",
        "dictionary_2",
        "dictionary_3",
    )
]

# Markdown table describing each example file.
example_info = """
| Example File | 台語漢字 | 華語 | 拼音 |
|--------------|------------|------|------|
| common_voice_1.mp3 | 我欲學臺語 | 我要學臺語 | guá beh o̍h Tâi-gí |
| common_voice_2.mp3 | 有這款的代誌,我攏毋知 | 有這種事情,我都不知道 | ū tsit-khuán ê tāi-tsì, guá lóng m̄ tsai |
| common_voice_3.mp3 | 豐原 | 豐原 | Hong-guân |
| dictionary_1.mp3 | 你今仔日下晡去佗愛交代清楚。 | 你今天下午到哪去要說明清楚。 | Lí kin-á-ji̍t ē-poo khì toh ài kau-tài tshing-tshó. |
| dictionary_2.mp3 | 𠢕眩船的人愛食眩船藥仔。 | 容易暈船的人要吃暈船藥。 | Gâu hîn-tsûn ê lâng ài tsia̍h hîn-tsûn io̍h-á |
| dictionary_3.mp3 | 三分天註定,七分靠拍拚。 | 三分天注定,七分靠努力。 | Sann hun thinn tsù-tiānn, tshit hun khò phah-piànn. |
"""

# Resolve the example files to absolute paths next to this script.
script_dir = os.path.dirname(os.path.abspath(__file__))
examples = [os.path.join(script_dir, relative_path) for (relative_path,) in example_paths]
145
+
146
# Gradio interface.
demo = gr.Blocks()

# Tab 1: transcribe a recorded or uploaded audio file.
mf_transcribe = gr.Interface(
    fn=transcribe,
    inputs=gr.Audio(label="audio", type="filepath"),
    outputs="text",
    title="Whisper 台語演示: 語音轉錄",
    # The Markdown link was missing its closing ")" and did not render as a link.
    description=f"演示使用 `PEFT-LoRA` fine-tuned checkpoint [{peft_model_id}](https://huggingface.co/{peft_model_id}) 轉錄任意長度的音訊檔案",
    allow_flagging="manual",
    examples=examples,
    article=example_info,
)

# Tab 2: transcribe a YouTube video.
# NOTE: this rebinds the module-level name ``yt_transcribe`` from the function
# to the Interface that wraps it; the name is kept for backward compatibility.
yt_transcribe = gr.Interface(
    fn=yt_transcribe,
    inputs=[gr.Textbox(lines=1, placeholder="在此處貼上 YouTube 影片的 URL", label="YouTube URL")],
    outputs=[
        gr.HTML(label="YouTube Video Embed"),
        gr.Textbox(label="轉錄稿"),
        gr.File(label="下載 SRT 檔案"),
    ],
    title="Whisper 台語演示: Youtube轉錄",
    # Same Markdown link fix as in the first tab.
    description=f"演示使用 `PEFT-LoRA` fine-tuned checkpoint [{peft_model_id}](https://huggingface.co/{peft_model_id}) 轉錄任意長度的Youtube影片",
    allow_flagging="manual",
)

# Combine both interfaces into one tabbed app and start it.
with demo:
    gr.TabbedInterface([mf_transcribe, yt_transcribe], ["語音轉錄", "Youtube轉錄"])

demo.launch(share=True)
examples/common_voice_1.mp3 ADDED
Binary file (16.5 kB). View file
 
examples/common_voice_2.mp3 ADDED
Binary file (31.4 kB). View file
 
examples/common_voice_3.mp3 ADDED
Binary file (9.98 kB). View file
 
examples/dictionary_1.mp3 ADDED
Binary file (49.3 kB). View file
 
examples/dictionary_2.mp3 ADDED
Binary file (46.5 kB). View file
 
examples/dictionary_3.mp3 ADDED
Binary file (55.8 kB). View file
 
requirements.txt ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ accelerate==0.27.2
2
+ datasets==2.18.0
3
+ evaluate==0.4.1
4
+ gradio==4.20.1
5
+ jiwer==3.0.3
6
+ librosa==0.10.1
7
+ moviepy==1.0.3
8
+ peft==0.10.0
9
+ pysrt==1.1.2
10
+ soundfile==0.12.1
11
+ stable-ts==2.17.3
12
+ streamlit==1.33.0
13
+ tensorboard==2.16.2
14
+ transformers==4.38.2
15
+ torch==2.1.2
16
+ watchdog==4.0.0
17
+ yt-dlp==2023.11.16