"""Gradio app that transcribes or translates audio/video files with Whisper.

The user uploads a media file, picks a model size, a task and an output
format, and receives either a plain-text file or an SRT subtitle file.
"""
import datetime
import os
import tempfile

import gradio as gr
import srt
import whisper
from moviepy.editor import VideoFileClip

# Load the Whisper models once at startup.
# NOTE: every size in the list is loaded eagerly, so startup time and memory
# use grow with this list; trim it if the host cannot hold all of them.
model_sizes = ['tiny', 'base', 'small', 'medium', 'large']
models = {size: whisper.load_model(size) for size in model_sizes}

# Task options
tasks = ['transcribe', 'translate']

# Output format options, keyed by task
output_formats = {
    'transcribe': ['Transcription (.txt)', 'Subtitles (.srt)'],
    'translate': ['Translation (.txt)', 'Translated Subtitles (.srt)']
}

# Language options
languages = ['Auto-detect', 'en', 'zh', 'fr', 'es', 'de', 'ja', 'ko']


def is_video_file(file_path):
    """Return True if *file_path* ends in a known video extension.

    The check is case-insensitive and looks only at the file name, not at
    the file contents.
    """
    video_extensions = ('.mp4', '.avi', '.mov', '.mkv')
    ext = os.path.splitext(file_path)[-1].lower()
    return ext in video_extensions


def extract_audio_from_video(video_path, audio_path=None):
    """Write the audio track of *video_path* to an MP3 file.

    Args:
        video_path: Path of the video file to read.
        audio_path: Destination MP3 path. Defaults to the video path with
            its extension replaced by ``.mp3``.

    Returns:
        The path of the written MP3 file.

    Raises:
        ValueError: If the video has no audio track.
    """
    if audio_path is None:
        # splitext only strips a real extension, so a dot in a directory
        # name cannot truncate the path the way rsplit('.', 1) could.
        audio_path = os.path.splitext(video_path)[0] + '.mp3'

    video = VideoFileClip(video_path)
    try:
        if video.audio is None:
            raise ValueError(f"The file {video_path} has no audio track.")
        video.audio.write_audiofile(audio_path, codec='mp3')
    finally:
        # Always release the clip's reader, even when extraction fails.
        video.close()
    return audio_path


def generate_output(file_path, model_size, task, output_format, language):
    """Transcribe or translate a media file and write the result to disk.

    Args:
        file_path: Path of the uploaded audio or video file.
        model_size: Key into ``models`` selecting the pre-loaded model.
        task: Task name passed through to ``model.transcribe``.
        output_format: Selected format label; any label containing
            ``'Subtitles'`` produces an ``.srt`` file, anything else a
            ``.txt`` file.
        language: Language code, or ``"Auto-detect"`` to pass ``None``.

    Returns:
        Path of the written output file, placed next to the input file.

    Raises:
        gr.Error: If no file was provided.
        FileNotFoundError: If *file_path* does not exist.
        ValueError: If a video input has no audio track.
    """
    # The click handler passes None when nothing has been uploaded.
    if not file_path:
        raise gr.Error("Please upload a video or audio file first.")

    # Ensure that the file exists
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"The file {file_path} does not exist.")

    # Select the pre-loaded model
    model = models[model_size]

    # Extracted audio lives in a scratch directory so it is removed as soon
    # as transcription finishes instead of piling up beside the uploads.
    with tempfile.TemporaryDirectory() as scratch_dir:
        if is_video_file(file_path):
            audio_path = extract_audio_from_video(
                file_path, os.path.join(scratch_dir, 'audio.mp3')
            )
        else:
            audio_path = file_path

        # Transcribe or translate the audio
        result = model.transcribe(
            audio_path,
            task=task,
            language=None if language == "Auto-detect" else language
        )

    segments = result['segments']
    base_filename = os.path.splitext(file_path)[0]

    if 'Subtitles' in output_format:
        # Generate SRT content: one cue per segment, numbered from 1.
        subtitles = [
            srt.Subtitle(
                index=index,
                start=datetime.timedelta(seconds=segment['start']),
                end=datetime.timedelta(seconds=segment['end']),
                content=segment['text'].strip(),
            )
            for index, segment in enumerate(segments, start=1)
        ]
        content = srt.compose(subtitles)
        output_file = base_filename + '.srt'
    else:
        # Generate TXT content. Segment text usually carries its own
        # leading whitespace, so strip before joining to avoid doubled
        # spaces and a leading blank.
        content = " ".join(segment['text'].strip() for segment in segments)
        output_file = base_filename + '.txt'

    with open(output_file, "w", encoding='utf-8') as out:
        out.write(content)

    return output_file


def update_output_format(task):
    """Return a dropdown update listing the output formats for *task*.

    The first format of the task becomes the selected value.
    """
    choices = output_formats[task]
    # gr.update is used instead of gr.Dropdown.update, which newer Gradio
    # releases no longer provide.
    return gr.update(choices=choices, value=choices[0])


with gr.Blocks() as demo:
    gr.Markdown("# 📼 Video Transcription and Subtitles Generator")
    gr.Markdown("Upload a video or audio file to get the transcription or subtitles.")

    with gr.Row():
        file_input = gr.File(
            label="Upload Video or Audio File",
            file_types=['video', 'audio'],
            type='filepath'
        )

    with gr.Row():
        model_size_input = gr.Dropdown(
            label="Select Whisper Model Size",
            choices=model_sizes,
            value='small'
        )
        task_input = gr.Dropdown(
            label="Select Task",
            choices=tasks,
            value='transcribe'
        )
        output_format_input = gr.Dropdown(
            label="Select Output Format",
            choices=output_formats['transcribe'],
            value=output_formats['transcribe'][0]
        )
        language_input = gr.Dropdown(
            label="Select Original Language (Optional)",
            choices=languages,
            value='Auto-detect'
        )

    # Keep the format choices in sync with the selected task.
    task_input.change(
        fn=update_output_format,
        inputs=task_input,
        outputs=output_format_input
    )

    submit_button = gr.Button("Generate")
    output_file = gr.File(label="Download Output File")

    submit_button.click(
        fn=generate_output,
        inputs=[
            file_input,
            model_size_input,
            task_input,
            output_format_input,
            language_input
        ],
        outputs=output_file
    )

# Launch only when run as a script, so importing this module does not start
# a server.
if __name__ == "__main__":
    demo.launch()