import spaces
import torch
import gradio as gr
import os
from peft import PeftModel, PeftConfig
import tempfile
from transformers import (
    pipeline,
    WhisperForConditionalGeneration,
    WhisperTokenizer,
    WhisperProcessor,
)
from yt_dlp import YoutubeDL

# Compute device (CUDA is assumed to be available on the Space)
device = "cuda"
print(f"Using device: {device}")

# Configuration
yt_length_limit_s = 600  # Cap YouTube videos at a maximum length of 10 minutes
peft_model_id = "yuweiiizz/whisper-small-taiwanese-lora"
language = "Chinese"
task = "transcribe"
commit_hash = "170eaa15cc6390681d827230ea0ae69414cfe560"

# Read the cookies content from the environment
cookies_content = os.getenv('COOKIES')

# If cookies are provided, write them to a temporary file for yt-dlp
if cookies_content:
    cookies_temp_path = os.path.join(tempfile.gettempdir(), 'cookies.txt')
    with open(cookies_temp_path, 'w') as f:
        f.write(cookies_content)
else:
    raise ValueError("COOKIES secret is not set.")

# Initialize the model and processor
peft_config = PeftConfig.from_pretrained(peft_model_id)
model = WhisperForConditionalGeneration.from_pretrained(peft_config.base_model_name_or_path, device_map=device)
model = PeftModel.from_pretrained(model, peft_model_id, revision=commit_hash)
tokenizer = WhisperTokenizer.from_pretrained(peft_config.base_model_name_or_path, language=language, task=task)
processor = WhisperProcessor.from_pretrained(peft_config.base_model_name_or_path, language=language, task=task)
feature_extractor = processor.feature_extractor

pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=tokenizer,
    feature_extractor=feature_extractor,
    max_new_tokens=128,
    chunk_length_s=10,
)

# Transcribe a microphone recording or an uploaded audio file
@spaces.GPU
def transcribe(microphone=None, file_upload=None):
    if microphone and file_upload:
        warn_output = (
            "Warning: you used the microphone and uploaded an audio file at the same time; "
            "only the microphone recording will be used.\n"
        )
        file = microphone
    elif microphone or file_upload:
        warn_output = ""
        file = microphone if microphone else file_upload
    else:
        return "Error: you must either use the microphone or upload an audio file."
    text = pipe(file, generate_kwargs={"task": task, "language": language})["text"]
    return warn_output + text


def split_text_by_punctuation(chunks):
    new_chunks = []
    for chunk in chunks:
        text = chunk['text']
        start_time = chunk['start']
        end_time = chunk['end']
        # Split the text into sentences at the fullwidth Chinese comma
        sentences = [s for s in text.split(',')]
        # Spread the chunk's duration across the sentences by character count
        total_duration = end_time - start_time
        total_length = sum(len(s) for s in sentences)
        if total_length == 0:
            continue  # nothing to align in this chunk; avoid division by zero
        current_start = start_time
        for sentence in sentences:
            # Each sentence's duration is proportional to its share of the characters
            sentence_length = len(sentence)
            duration_ratio = sentence_length / total_length
            sentence_duration = total_duration * duration_ratio
            current_end = min(current_start + sentence_duration, end_time)
            # Only emit a segment whose end time is after its start time,
            # so segments never overlap
            if current_end > current_start:
                new_chunks.append({
                    "start": current_start,
                    "end": current_end,
                    "text": sentence
                })
            current_start = current_end
    return new_chunks


# Transcription callback handed to stable_whisper.transcribe_any below
@spaces.GPU
def inference(audio, **kwargs) -> list:
    pipe_output = pipe(audio, generate_kwargs={"task": task, "language": language}, return_timestamps=True)
    chunks = [
        {
            "start": c['timestamp'][0] or 0,
            "end": c['timestamp'][1] or c['timestamp'][0] + 5,
            "text": c['text']
        }
        for c in pipe_output['chunks']
    ]
    # Re-split the chunks into sentences at punctuation marks
    new_chunks = split_text_by_punctuation(chunks)
    return new_chunks


# Transcribe a YouTube video
def yt_transcribe(yt_url):
    try:
        import stable_whisper

        # Download the audio with yt-dlp
        ydl_opts = {
            'format': 'bestaudio/best',
            'noplaylist': True,
            'quiet': True,
            'cookiefile': cookies_temp_path,
            'outtmpl': os.path.join(tempfile.gettempdir(), '%(id)s.%(ext)s'),
            'postprocessors': [{
                'key': 'FFmpegExtractAudio',
                'preferredcodec': 'mp3',
                'preferredquality': '192',
            }],
        }
        with YoutubeDL(ydl_opts) as ydl:
            info_dict = ydl.extract_info(yt_url, download=True)
            audio_path = ydl.prepare_filename(info_dict)
            audio_path = os.path.splitext(audio_path)[0] + ".mp3"

        # Transcribe with stable_whisper, which adds VAD-based segmentation
        result = stable_whisper.transcribe_any(inference, audio_path, vad=True)
        os.remove(audio_path)

        # Read the video ID from the download metadata
        video_id = info_dict.get('id', None)
        if not video_id:
            return "Error: unable to determine the YouTube video ID.", "", None

        # Embed the YouTube video (standard iframe embed; the 500x320
        # dimensions are illustrative assumptions)
        html_embed = f'<iframe width="500" height="320" src="https://www.youtube.com/embed/{video_id}"></iframe>'
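        # A minimal completion sketch, assuming yt_transcribe returns
        # (embed HTML, transcript text, subtitle file) to mirror the
        # (message, "", None) error returns above; srt_path and the
        # to_srt_vtt() subtitle export are assumptions, not the original code.
        srt_path = os.path.join(tempfile.gettempdir(), f'{video_id}.srt')
        result.to_srt_vtt(srt_path)  # stable_whisper writes an SRT file to the temp dir
        return html_embed, result.text, srt_path
    except Exception as e:
        # Surface download/transcription failures to the UI instead of crashing
        return f"Error: {e}", "", None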