import gradio as gr
from faster_whisper import WhisperModel
import logging
import ffmpeg  # Make sure to install ffmpeg-python
from transformers import MarianMTModel, MarianTokenizer
import pandas as pd

# Configure logging for debugging purposes
logging.basicConfig()
logging.getLogger("faster_whisper").setLevel(logging.DEBUG)

# Fetch and parse language options from the provided URL
url = "https://huggingface.co/Lenylvt/LanguageISO/resolve/main/iso.md"
df = pd.read_csv(url, delimiter="|", skiprows=2, header=None).dropna(axis=1, how='all')
df.columns = ['ISO 639-1', 'ISO 639-2', 'Language Name', 'Native Name']
df['ISO 639-1'] = df['ISO 639-1'].str.strip()

# Prepare (label, value) pairs for the dropdowns: show the language name,
# pass the ISO 639-1 code to the handler
language_options = [(f"{row['Language Name']} ({row['ISO 639-1']})", row['ISO 639-1'])
                    for _, row in df.iterrows()]


def format_timestamp(seconds):
    """Convert seconds to an SRT timestamp (HH:MM:SS,mmm)."""
    millis = int(round(seconds * 1000))
    hours, millis = divmod(millis, 3_600_000)
    minutes, millis = divmod(millis, 60_000)
    secs, millis = divmod(millis, 1_000)
    return f"{hours:02d}:{minutes:02d}:{secs:02d},{millis:03d}"


def extract_audio(video_path):
    """Extract audio from video to a temporary 16 kHz mono WAV file."""
    output_audio_path = '/tmp/audio.wav'
    ffmpeg.input(video_path).output(output_audio_path, acodec='pcm_s16le', ac=1, ar='16k').run(quiet=True, overwrite_output=True)
    return output_audio_path


def transcribe_and_optionally_translate(video_file, source_language, target_language, model_size, allow_modification):
    audio_file = extract_audio(video_file)

    # Transcription (faster-whisper takes the language code via `language=`)
    device = "cpu"         # GPU: "cuda" - CPU: "cpu"
    compute_type = "int8"  # GPU: "float16" or "int8" - CPU: "int8"
    model = WhisperModel(model_size, device=device, compute_type=compute_type)
    segments, _ = model.transcribe(audio_file, language=source_language)
    segments = [(segment.start, segment.end, segment.text.strip()) for segment in segments]

    # Translation, segment by segment so the timestamps stay aligned.
    # Helsinki-NLP publishes opus-mt models only for supported language pairs.
    if source_language != target_language:
        model_name = f"Helsinki-NLP/opus-mt-{source_language}-{target_language}"
        tokenizer = MarianTokenizer.from_pretrained(model_name)
        mt_model = MarianMTModel.from_pretrained(model_name)
        translated = []
        for start, end, text in segments:
            batch = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
            output = mt_model.generate(**batch)
            translated.append((start, end, tokenizer.decode(output[0], skip_special_tokens=True)))
        segments = translated

    # Assemble an SRT-formatted transcript so it can be burned in later
    transcript = "\n".join(
        f"{i}\n{format_timestamp(start)} --> {format_timestamp(end)}\n{text}\n"
        for i, (start, end, text) in enumerate(segments, start=1)
    )
    return transcript, allow_modification


def add_hard_subtitle_to_video(input_video, transcript):
    """Burn hard subtitles into the video."""
    temp_subtitle_path = '/tmp/subtitle.srt'
    with open(temp_subtitle_path, 'w', encoding='utf-8') as file:
        file.write(transcript)  # The transcript is SRT-formatted
    output_video_path = '/tmp/output_video.mp4'
    ffmpeg.input(input_video).output(output_video_path, vf=f"subtitles={temp_subtitle_path}").run(quiet=True, overwrite_output=True)
    return output_video_path


# Gradio handler
def process_video(video, source_language, target_language, model_size='base', allow_modification=False, modified_transcript=None):
    transcript, can_modify = transcribe_and_optionally_translate(video, source_language, target_language, model_size, allow_modification)
    if can_modify and modified_transcript:
        transcript = modified_transcript  # Use the modified transcript if provided

    # Add hard subtitles to the video
    output_video = add_hard_subtitle_to_video(video, transcript)
    return output_video
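
# A minimal sketch of exercising the pipeline without the UI, assuming a local
# video named "sample.mp4" exists (hypothetical file name; any short clip works):
#
#     output_path = process_video("sample.mp4", "en", "fr", model_size="base")
#     print(output_path)  # /tmp/output_video.mp4 with burned-in French subtitles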

# Set up the Gradio app
app = gr.Interface(
    fn=process_video,
    inputs=[
        gr.Video(label="Upload Video"),
        gr.Dropdown(choices=language_options, label="Source Language"),
        gr.Dropdown(choices=language_options, label="Target Language"),
        gr.Dropdown(choices=["base", "small", "medium", "large", "large-v2", "large-v3"], label="Model Size"),
        gr.Checkbox(label="Allow Transcript Modification?", value=False),
        gr.TextArea(label="Modified Transcript (if allowed)"),
    ],
    outputs=gr.Video(label="Processed Video with Hard Subtitles"),
    title="Video Transcription and Translation Tool",
    description="Transcribe or translate your video content. Optionally, edit the transcription before adding hard subtitles.",
)

if __name__ == "__main__":
    app.launch()
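
# A minimal sketch of calling the running app programmatically with
# `gradio_client` (hypothetical usage; assumes the app is served locally on
# Gradio's default port). The positional arguments mirror the `inputs` list
# above, and gr.Interface exposes its endpoint as "/predict".
#
#     from gradio_client import Client, handle_file
#
#     client = Client("http://127.0.0.1:7860/")
#     result = client.predict(
#         handle_file("sample.mp4"),  # hypothetical local video file
#         "en",    # source language (ISO 639-1 code)
#         "fr",    # target language
#         "base",  # Whisper model size
#         False,   # allow transcript modification
#         "",      # modified transcript (ignored when modification is off)
#         api_name="/predict",
#     )
#     print(result)  # path to the subtitled video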