# Spaces: Sleeping  (hosting-page status header captured along with the source; not program code)
import gradio as gr | |
import whisper | |
import os | |
from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer | |
from docx import Document | |
from reportlab.pdfgen import canvas | |
from reportlab.pdfbase.ttfonts import TTFont | |
from reportlab.pdfbase import pdfmetrics | |
from reportlab.lib.pagesizes import A4 | |
import arabic_reshaper | |
from bidi.algorithm import get_display | |
from pptx import Presentation | |
import subprocess | |
import shlex | |
# Whisper model choices: UI display label -> model identifier handed to
# whisper.load_model(). Insertion order is the order shown in the dropdown.
whisper_models = dict(
    [
        ("Tiny (Fast, Less Accurate)", "tiny"),
        ("Base (Medium Speed, Medium Accuracy)", "base"),
        ("Small (Good Speed, Good Accuracy)", "small"),
        ("Medium (Slow, High Accuracy)", "medium"),
        ("Large (Very Slow, Highest Accuracy)", "large"),
    ]
)
# Build the M2M100 tokenizer/model pair for English -> target translation
def load_translation_model(target_language):
    """Load the M2M100 tokenizer and model configured for one target language.

    The tokenizer's source language is always set to English ("en") and its
    target language to ``target_language``.

    Args:
        target_language: Language code, one of fa, es, fr, de, it, pt, ar,
            zh, hi, ja, ko, ru.

    Returns:
        A ``(tokenizer, translation_model)`` tuple.

    Raises:
        ValueError: If ``target_language`` is not one of the supported codes.
    """
    supported_codes = {
        "fa",  # Persian (Farsi)
        "es",  # Spanish
        "fr",  # French
        "de",  # German
        "it",  # Italian
        "pt",  # Portuguese
        "ar",  # Arabic
        "zh",  # Chinese
        "hi",  # Hindi
        "ja",  # Japanese
        "ko",  # Korean
        "ru",  # Russian
    }
    if target_language not in supported_codes:
        raise ValueError(f"Translation model for {target_language} not supported")

    # Both the tokenizer and the model come from the same checkpoint.
    checkpoint = "facebook/m2m100_418M"
    tokenizer = M2M100Tokenizer.from_pretrained(checkpoint)
    translation_model = M2M100ForConditionalGeneration.from_pretrained(checkpoint)

    tokenizer.src_lang = "en"
    tokenizer.tgt_lang = target_language
    return tokenizer, translation_model
def translate_text(text, tokenizer, model):
    """Translate ``text`` into the tokenizer's configured target language.

    Args:
        text: The text to translate.
        tokenizer: Tokenizer whose ``tgt_lang`` names the output language.
        model: Generation model paired with ``tokenizer``.

    Returns:
        The decoded first generated sequence, with special tokens removed.

    Raises:
        RuntimeError: If tokenizing, generating or decoding fails for any
            reason; the message carries the underlying error text.
    """
    try:
        encoded = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
        # Forcing the first generated token to the target-language id is how
        # the output language is selected.
        target_token_id = tokenizer.get_lang_id(tokenizer.tgt_lang)
        generated = model.generate(**encoded, forced_bos_token_id=target_token_id)
        first_sequence = generated[0]
        return tokenizer.decode(first_sequence, skip_special_tokens=True)
    except Exception as e:
        raise RuntimeError(f"Error during translation: {e}")
# Helper function to format timestamps in SRT format
def format_timestamp(seconds):
    """Format a time offset in seconds as an SRT timestamp ``HH:MM:SS,mmm``.

    The value is converted to whole milliseconds by rounding before it is
    split into fields. Truncating the fractional part instead loses a
    millisecond to floating-point error (1.001 would render as ``,000``),
    and rounding first also lets a value such as 59.9996 carry cleanly into
    the next second.

    Args:
        seconds: Offset from the start of the media, in seconds (int or
            float). Negative values are clamped to zero.

    Returns:
        The timestamp string, e.g. ``"01:01:01,500"`` for 3661.5.
    """
    total_ms = max(0, int(round(seconds * 1000)))
    hours, remainder = divmod(total_ms, 3_600_000)
    minutes, remainder = divmod(remainder, 60_000)
    secs, millis = divmod(remainder, 1000)
    return f"{hours:02d}:{minutes:02d}:{secs:02d},{millis:03d}"
# Write transcription segments to a SubRip (.srt) subtitle file
def write_srt(transcription, output_file, tokenizer=None, translation_model=None):
    """Write the segments of ``transcription`` to ``output_file`` as SRT cues.

    Each segment becomes one numbered cue (starting at 1) with its start and
    end times formatted by ``format_timestamp``. When ``translation_model`` is
    given, every segment's text is passed through ``translate_text`` first.

    Args:
        transcription: Mapping with a ``'segments'`` list; each segment
            provides ``'start'``, ``'end'`` and ``'text'``.
        output_file: Path of the SRT file to create or overwrite.
        tokenizer: Tokenizer handed to ``translate_text`` when translating.
        translation_model: Model handed to ``translate_text``; when falsy the
            original text is written untranslated.
    """
    # Always write UTF-8: translated text (e.g. Persian) is not representable
    # in many platform default encodings, and relying on the locale would make
    # the output, or even success, depend on the machine.
    with open(output_file, "w", encoding="utf-8") as f:
        for index, segment in enumerate(transcription['segments'], start=1):
            text = segment['text']
            if translation_model:
                text = translate_text(text, tokenizer, translation_model)
            start_time = format_timestamp(segment['start'])
            end_time = format_timestamp(segment['end'])
            f.write(f"{index}\n")
            f.write(f"{start_time} --> {end_time}\n")
            f.write(f"{text.strip()}\n\n")
# Embedding subtitles into video (hardsub)
def embed_hardsub_in_video(video_file, srt_file, output_video):
    """Burn the subtitles in ``srt_file`` into ``video_file`` using ffmpeg.

    The video is re-encoded with libx264 (CRF 23, "medium" preset) and written
    to ``output_video``, overwriting it if it already exists.

    Args:
        video_file: Path of the input video.
        srt_file: Path of the SRT subtitle file to render onto the frames.
        output_video: Path of the video file to produce.

    Raises:
        RuntimeError: If ffmpeg cannot be started, exceeds the 300-second
            timeout, or exits with a non-zero status (the message then
            contains ffmpeg's stderr).
    """
    # Pass an argument list rather than building a string and re-splitting it:
    # paths containing quotes or backslashes then reach ffmpeg unchanged.
    # NOTE(review): the path is still embedded in an ffmpeg filter expression,
    # which has its own escaping rules (e.g. for "'" and ":"); such paths are
    # not escaped here - confirm whether they can occur.
    command = [
        "ffmpeg",
        "-y",  # never block on the interactive "overwrite?" prompt
        "-i", video_file,
        "-vf", f"subtitles='{srt_file}'",
        "-c:v", "libx264",
        "-crf", "23",
        "-preset", "medium",
        output_video,
    ]
    try:
        process = subprocess.run(command, capture_output=True, text=True, timeout=300)
    except subprocess.TimeoutExpired as e:
        raise RuntimeError("ffmpeg process timed out.") from e
    except Exception as e:
        raise RuntimeError(f"Error running ffmpeg: {e}") from e

    # Checked outside the try block so this error is not caught and re-wrapped
    # as "Error running ffmpeg: ffmpeg error: ...".
    if process.returncode != 0:
        raise RuntimeError(f"ffmpeg error: {process.stderr}")
# Helper function to write Word documents
def write_word(transcription, output_file, tokenizer=None, translation_model=None, target_language=None):
    """Save the transcription as a Word document, one numbered paragraph per segment.

    Args:
        transcription: Mapping with a ``'segments'`` list whose items provide
            ``'text'``.
        output_file: Path of the .docx file to write.
        tokenizer: Tokenizer handed to ``translate_text`` when translating.
        translation_model: Model handed to ``translate_text``; when falsy the
            original text is used.
        target_language: Language code of the output text; ``"fa"`` turns on
            the right-to-left flag for every paragraph.
    """
    document = Document()
    is_rtl = target_language == "fa"

    for number, segment in enumerate(transcription['segments'], start=1):
        body = segment['text']
        if translation_model:
            body = translate_text(body, tokenizer, translation_model)

        paragraph = document.add_paragraph(f"{number}. {body.strip()}")
        if not is_rtl:
            continue
        # NOTE(review): confirm that paragraph_format actually exposes a
        # right_to_left property in the installed python-docx version; if it
        # does not, this assignment has no effect on the document (or fails).
        paragraph.paragraph_format.right_to_left = True

    document.save(output_file)
def _register_pdf_font(font_name, font_path, fallback='Helvetica'):
    """Register a TrueType font if its file exists and return the usable font name.

    Returns ``font_name`` after registering it, or ``fallback`` when the file
    is missing, so callers never select a font that was not registered.
    """
    if os.path.exists(font_path):
        pdfmetrics.registerFont(TTFont(font_name, font_path))
        return font_name
    return fallback


# Helper function to write PDF documents
def write_pdf(transcription, output_file, tokenizer=None, translation_model=None):
    """Render the transcription into an A4 PDF, one numbered line per segment.

    Lines are placed 20 points apart inside a 50-point margin; a new page is
    started before a line would fall below the bottom margin. When the
    tokenizer's target language is Persian or Arabic the text is reshaped,
    reordered for display and right-aligned; otherwise it is left-aligned.
    Long lines are not wrapped.

    Fonts are loaded from ``B-NAZANIN.TTF`` and ``Arial.ttf`` next to this
    file. If a font file is missing, the built-in Helvetica is used instead of
    failing on an unregistered font name.
    NOTE(review): Helvetica presumably cannot render Persian/Arabic glyphs, so
    the fallback only avoids the crash - confirm the font files are deployed.

    Args:
        transcription: Mapping with a ``'segments'`` list whose items provide
            ``'text'``.
        output_file: Path of the PDF file to write.
        tokenizer: Tokenizer handed to ``translate_text``; its ``tgt_lang``
            decides the text direction when translating.
        translation_model: Model handed to ``translate_text``; when falsy the
            original text is used and rendered left-to-right.

    Returns:
        ``output_file``.
    """
    page_width, page_height = A4
    margin = 50
    line_height = 20
    font_size = 12

    pdf = canvas.Canvas(output_file, pagesize=A4)
    app_dir = os.path.dirname(os.path.abspath(__file__))
    rtl_font = _register_pdf_font('B-Nazanin', os.path.join(app_dir, 'B-NAZANIN.TTF'))
    ltr_font = _register_pdf_font('Arial', os.path.join(app_dir, 'Arial.ttf'))

    # The target language cannot change between segments, so decide the text
    # direction once instead of on every iteration.
    target_language = tokenizer.tgt_lang if translation_model else None
    is_rtl = target_language in ['fa', 'ar']

    y_position = page_height - margin
    for i, segment in enumerate(transcription['segments']):
        text = segment['text']
        if translation_model:
            text = translate_text(text, tokenizer, translation_model)
        line = f"{i + 1}. {text.strip()}"

        # Break the page BEFORE drawing so nothing lands below the bottom
        # margin and every page starts at the same height.
        if y_position < margin:
            pdf.showPage()
            y_position = page_height - margin

        # The font is set for every line, so it is also in effect on pages
        # started by showPage() above.
        if is_rtl:
            reshaped_text = arabic_reshaper.reshape(line)
            bidi_text = get_display(reshaped_text)
            pdf.setFont(rtl_font, font_size)
            pdf.drawRightString(page_width - margin, y_position, bidi_text)
        else:
            pdf.setFont(ltr_font, font_size)
            pdf.drawString(margin, y_position, line)
        y_position -= line_height

    pdf.save()
    return output_file
def _add_text_slide(ppt, body_text, title="Transcription"):
    """Append one slide (layout index 5) with ``title`` and a slide-sized text box holding ``body_text``."""
    slide = ppt.slides.add_slide(ppt.slide_layouts[5])
    slide.shapes.title.text = title
    textbox = slide.shapes.add_textbox(left=0, top=0, width=ppt.slide_width, height=ppt.slide_height)
    textbox.text = body_text
    return slide


# Helper function to write PowerPoint slides
def write_ppt(transcription, output_file, tokenizer=None, translation_model=None):
    """Save the transcription as a PowerPoint deck of numbered lines.

    Numbered lines are accumulated until adding the next one would push the
    slide's text past 400 characters; the accumulated text then becomes one
    slide. A single line longer than the limit gets a slide of its own rather
    than leaving an empty slide in front of it. Every slide is titled
    "Transcription". A transcription with no segments produces a presentation
    with no slides.

    Args:
        transcription: Mapping with a ``'segments'`` list whose items provide
            ``'text'``.
        output_file: Path of the .pptx file to write.
        tokenizer: Tokenizer handed to ``translate_text`` when translating.
        translation_model: Model handed to ``translate_text``; when falsy the
            original text is used.
    """
    ppt = Presentation()
    max_chars_per_slide = 400
    text_buffer = ""

    for i, segment in enumerate(transcription['segments']):
        text = segment['text']
        if translation_model:
            text = translate_text(text, tokenizer, translation_model)
        line = f"{i + 1}. {text.strip()}\n"

        # Only flush a non-empty buffer: an over-long first line must not
        # produce a blank slide.
        if text_buffer and len(text_buffer) + len(line) > max_chars_per_slide:
            _add_text_slide(ppt, text_buffer.strip())
            text_buffer = line
        else:
            text_buffer += line

    if text_buffer:
        _add_text_slide(ppt, text_buffer.strip())
    ppt.save(output_file)
# Transcribing video and generating output
def transcribe_video(video_file, language, target_language, model_name, output_format):
    """Transcribe a video with Whisper and export the result in one format.

    Inputs are validated before any model is loaded, so a missing upload or an
    unknown output format fails immediately instead of after transcription.
    Output files are written next to the uploaded video, using its path
    without the extension as the base name.

    The intermediate SRT file is produced only for the "SRT" and
    "Video with Hardsub" formats. The document writers translate the segments
    themselves, so writing an unused SRT as well would translate everything
    twice.

    Args:
        video_file: The upload from the UI: either a path string or an object
            whose ``name`` attribute is the path.
        language: Spoken-language code passed to Whisper.
        target_language: Subtitle language code; anything other than ``"en"``
            enables translation.
        model_name: A key of ``whisper_models`` (the UI display label).
        output_format: One of "SRT", "Video with Hardsub", "Word", "PDF" or
            "PowerPoint".

    Returns:
        Path of the generated file.

    Raises:
        ValueError: If no video was provided or ``output_format`` is unknown.
        KeyError: If ``model_name`` is not a key of ``whisper_models``.
        RuntimeError: If loading the translation model or burning in the
            subtitles fails.
    """
    if video_file is None:
        raise ValueError("No video file provided. Please upload a video file.")
    if output_format not in ("SRT", "Video with Hardsub", "Word", "PDF", "PowerPoint"):
        raise ValueError("Invalid output format selected.")

    # Accept both upload shapes: an object exposing the path as .name, or the
    # path string itself.
    video_file_path = getattr(video_file, "name", video_file)

    actual_model_name = whisper_models[model_name]  # Map user selection to model name
    model = whisper.load_model(actual_model_name)
    result = model.transcribe(video_file_path, language=language)
    video_name = os.path.splitext(video_file_path)[0]

    if target_language != "en":
        try:
            tokenizer, translation_model = load_translation_model(target_language)
        except Exception as e:
            raise RuntimeError(f"Error loading translation model: {e}") from e
    else:
        tokenizer, translation_model = None, None

    if output_format in ("SRT", "Video with Hardsub"):
        srt_file = f"{video_name}.srt"
        write_srt(result, srt_file, tokenizer, translation_model)
        if output_format == "SRT":
            return srt_file

        output_video = f"{video_name}_with_subtitles.mp4"
        try:
            embed_hardsub_in_video(video_file_path, srt_file, output_video)
        except Exception as e:
            raise RuntimeError(f"Error embedding subtitles in video: {e}") from e
        return output_video

    if output_format == "Word":
        word_file = f"{video_name}.docx"
        write_word(result, word_file, tokenizer, translation_model, target_language)
        return word_file

    if output_format == "PDF":
        pdf_file = f"{video_name}.pdf"
        write_pdf(result, pdf_file, tokenizer, translation_model)
        return pdf_file

    # Only "PowerPoint" remains after the validation at the top.
    ppt_file = f"{video_name}.pptx"
    write_ppt(result, ppt_file, tokenizer, translation_model)
    return ppt_file
# Gradio Interface setup: the input components are listed in the same order
# as the parameters of transcribe_video.
_video_input = gr.File(label="Upload Video File")
_source_language_input = gr.Dropdown(
    label="Select Original Video Language",
    choices=["en", "es", "fr", "de", "it", "pt"],
    value="en",
)
_target_language_input = gr.Dropdown(
    label="Select Subtitle Translation Language",
    choices=["en", "fa", "es", "de", "fr", "it", "pt"],
    value="fa",
)
_model_input = gr.Dropdown(
    label="Select Whisper Model",
    choices=list(whisper_models.keys()),
    value="Tiny (Fast, Less Accurate)",
)
_format_input = gr.Radio(
    label="Choose Output Format",
    choices=["SRT", "Video with Hardsub", "Word", "PDF", "PowerPoint"],
    value="Video with Hardsub",
)

iface = gr.Interface(
    fn=transcribe_video,
    inputs=[
        _video_input,
        _source_language_input,
        _target_language_input,
        _model_input,
        _format_input,
    ],
    outputs=gr.File(label="Download File"),
    title="Video Subtitle Generator with Translation & Multi-Format Output",
    description=(
        "This tool allows you to generate subtitles from a video file, translate the subtitles into multiple languages using M2M100, "
        "and export them in various formats including SRT, hardcoded subtitles in video, Word, PDF, or PowerPoint."
    ),
    theme="compact",
    live=False,
)

# Run the interface (share=True asks Gradio for a public share link)
iface.launch(share=True)