import gradio as gr
import openai
import yt_dlp
import os
import io
import tempfile
from pydub import AudioSegment


def split_audio(file_path, chunk_length_ms):
    """Split an audio file into consecutive chunks of chunk_length_ms milliseconds."""
    audio = AudioSegment.from_file(file_path)
    duration = len(audio)
    chunks = []
    start_time = 0
    while start_time < duration:
        end_time = min(start_time + chunk_length_ms, duration)
        chunks.append(audio[start_time:end_time])
        start_time += chunk_length_ms
    return chunks


def split_string_by_tokens(text, max_tokens=500):
    """Split text into chunks of at most max_tokens whitespace-separated words
    (a rough proxy for tokens)."""
    words = text.split()
    chunks = []
    current_chunk = []
    for word in words:
        current_chunk.append(word)
        if len(current_chunk) >= max_tokens:
            chunks.append(' '.join(current_chunk))
            current_chunk = []
    if current_chunk:
        chunks.append(' '.join(current_chunk))
    return chunks


openai.api_key = os.environ['OPENAI_API_KEY']


def asr(url):
    """Download the audio of a video, transcribe it with Whisper, and translate it
    into Chinese with gpt-3.5-turbo. Yields (status/translation, transcript) pairs
    so the Gradio outputs update as work progresses."""
    # Remove any leftover audio files from a previous run
    os.system("rm *audio_download*")

    # yt-dlp options: download the best audio stream to a fixed file name
    ydl_opts = {
        'format': 'bestaudio/best',
        'outtmpl': 'audio_downloaded.%(ext)s',
        'continuedl': False,  # always start a fresh download instead of resuming a partial one
    }
    ydl = yt_dlp.YoutubeDL(ydl_opts)

    # Download the audio; extract_info raises DownloadError on failure
    try:
        info_dict = ydl.extract_info(url, download=True)
    except yt_dlp.utils.DownloadError:
        info_dict = None
    if info_dict is not None:
        audio_file_name = "audio_downloaded.{}".format(info_dict["ext"])
    else:
        # Yield the error so it reaches the UI; a plain return inside a
        # generator would be silently discarded.
        yield "下载音频发生错误,请确认链接再试一次。", "Error downloading the audio. Check the URL and try again."
        return

    yield "下载视频完成. 开始分割视频...", ""  # "Download finished. Splitting the audio..."

    chunks = split_audio(audio_file_name, chunk_length_ms=30 * 1000)
    transcripts = []
    for idx, chunk in enumerate(chunks):
        # Export the chunk to a temporary WAV file and send it to Whisper
        with tempfile.NamedTemporaryFile(mode="wb", suffix=".wav", delete=False) as temp_file:
            temp_file_path = temp_file.name
            chunk.export(temp_file_path, format="wav")
        with open(temp_file_path, "rb") as temp_file:
            transcript = openai.Audio.transcribe("whisper-1", temp_file)
        os.remove(temp_file_path)
        transcripts.append(transcript["text"])
        # "Please wait while speech recognition finishes... (idx/total)"
        yield "请耐心等待语音识别完成...({}/{})".format(idx + 1, len(chunks)), " ".join(transcripts)

    # Remove the downloaded audio now that transcription is done
    os.system("rm {}".format(audio_file_name))

    translations = []
    full_transcript = " ".join(transcripts)
    # Split the transcript into roughly 500-token pieces so each fits in one prompt
    transcript_chunks = split_string_by_tokens(full_transcript, max_tokens=500)
    # "Speech recognition finished, starting translation... (0/total)"
    yield "语音识别完成, 开始翻译...(0/{})".format(len(transcript_chunks)), full_transcript

    for idx, transcript in enumerate(transcript_chunks):
        output = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            messages=[
                {"role": "user", "content": "Transcript: {transcript}. \n Translate the video conversation transcript into fluent Chinese. Chinese: ".format(transcript=transcript)},
            ],
            stream=True,
        )
        for event in output:
            # Each streamed event carries an incremental "delta"; the final event
            # has no "content" key, so fall back to an empty string.
            delta = event["choices"][0].get("delta", {})
            translations.append(delta.get("content", ""))
            # "Please wait for the translation: (idx/total)..." followed by the text so far
            yield "请耐心等候翻译:({}/{})...".format(idx + 1, len(transcript_chunks)) + "".join(translations), " ".join(transcripts)

    full_translation = "".join(translations)
    yield full_translation, full_transcript


title = """ 轻声细译"""

# Create an instruction input component
instruction = """