"""Gradio app: Italian speech transcription, summarization and translation.

Audio (an uploaded/recorded file or the audio track of a YouTube video) is
transcribed, summarized in Italian and translated to English. A third tab
saves the English translation as a PDF and reads it back as speech.
"""

import os

import gradio as gr
import pytube as pt
from fpdf import FPDF
from gtts import gTTS
from huggingface_hub import HfFolder
from pdfminer.high_level import extract_text
from transformers import pipeline

# Initialize pipelines for transcription, summarization, and translation
transcription_pipe = pipeline(model="SaladSlayer00/another_local", token=HfFolder.get_token())
summarizer = pipeline("summarization", model="it5/it5-efficient-small-el32-news-summarization")
translator = pipeline("translation", model="Helsinki-NLP/opus-mt-it-en")


def process_audio(file_path):
    """Transcribe an audio file, then summarize and translate the transcript.

    Args:
        file_path: Path of the audio file handed to the transcription pipeline.

    Returns:
        A ``(text, summary, translation)`` tuple of strings. Both the summary
        and the translation are computed from the full transcript.
    """
    text = transcription_pipe(file_path)["text"]
    summary = summarizer(text, min_length=25, max_length=50)[0]["summary_text"]
    translation = translator(text)[0]["translation_text"]
    return text, summary, translation


def download_youtube_audio(yt_url):
    """Download the first audio-only stream of a YouTube video.

    Args:
        yt_url: URL of the YouTube video.

    Returns:
        Path of the downloaded file, as reported by pytube.

    Raises:
        ValueError: If the video exposes no audio-only stream.
    """
    yt = pt.YouTube(yt_url)
    stream = yt.streams.filter(only_audio=True).first()
    if stream is None:
        # .first() yields None when the filter matches nothing; fail with a
        # clear message instead of an AttributeError on the next line.
        raise ValueError(f"No audio-only stream available for {yt_url!r}")
    # NOTE(review): the download call was missing from this statement in the
    # file; restored with pytube's Stream.download — confirm.
    file_path = stream.download(filename="temp_audio.mp3")
    return file_path


def youtube_transcription(yt_url):
    """Download a YouTube video's audio and run it through ``process_audio``.

    Args:
        yt_url: URL of the YouTube video.

    Returns:
        The ``(text, summary, translation)`` tuple from ``process_audio``.
    """
    audio_path = download_youtube_audio(yt_url)
    try:
        return process_audio(audio_path)
    finally:
        # Clean up the downloaded file even when processing fails.
        os.remove(audio_path)


def transcribe_and_process(rec=None, file=None):
    """Process a recording if given, otherwise an uploaded file.

    Args:
        rec: Path of a recorded audio clip, or None.
        file: Path of an uploaded audio file, or None. Only used when
            ``rec`` is None.

    Returns:
        The ``(text, summary, translation)`` tuple from ``process_audio``, or
        a single message string when neither input was provided.
    """
    audio = rec if rec is not None else file
    if audio is None:
        return "Provide a recording or a file."
    return process_audio(audio)


def save_text_to_pdf(text, filename="output.pdf"):
    """Write ``text`` to a single-font PDF document.

    Args:
        text: Text to place in the PDF.
        filename: Destination path of the PDF.

    Returns:
        ``filename``, for convenient chaining.
    """
    pdf = FPDF()
    pdf.add_page()
    # Set font: Arial, regular, 12
    pdf.set_font("Arial", size=12)
    # The built-in Arial font can only encode Latin-1; substitute "?" for
    # anything else (e.g. curly quotes) rather than failing on output.
    printable = text.encode("latin-1", "replace").decode("latin-1")
    # Width 0 stretches the cell to the right margin; 10 is the line height.
    pdf.multi_cell(0, 10, printable)
    pdf.output(filename)
    return filename


def _synthesize_speech(text, audio_file):
    """Render ``text`` as English speech with gTTS and save it to ``audio_file``."""
    # NOTE(review): gTTS writes MP3 data; the callers' ".wav" file names are
    # kept unchanged so their return values stay the same.
    gTTS(text=text, lang="en", slow=False).save(audio_file)
    return audio_file


def pdf_to_text(file_path):
    """Extract the text of a PDF and synthesize it as speech.

    Despite its name, this returns an audio file path, not the text.

    Args:
        file_path: Path of the PDF to read.

    Returns:
        Path of the generated audio file (``"tts_audio.wav"``).
    """
    text = extract_text(file_path)
    # The gTTS object was previously created but never saved, so the returned
    # path pointed at a file that did not exist.
    return _synthesize_speech(text, "tts_audio.wav")


def audio_to_pdf(file_path):
    """Translate Italian audio to English, save it as a PDF and read it aloud.

    Args:
        file_path: Path of the audio file to process.

    Returns:
        A ``(translation, pdf_file, tts_audio_file)`` tuple: the English
        text, the path of the PDF holding it, and the path of the speech
        audio generated from that PDF.
    """
    _, _, translation = process_audio(file_path)
    pdf_file = save_text_to_pdf(translation)
    tts_audio_file = pdf_to_text(pdf_file)  # Generate TTS audio from the PDF
    return translation, pdf_file, tts_audio_file


def pdf_to_audio(file_path):
    """Extract the text of a PDF and synthesize it as speech.

    Args:
        file_path: Path of the PDF to read.

    Returns:
        Path of the generated audio file (``"output_audio.wav"``).
    """
    text = extract_text(file_path)
    return _synthesize_speech(text, "output_audio.wav")


app = gr.Blocks()

with app:
    gr.Markdown("### Whisper Small Italian Transcription, Summarization, and Translation")
    gr.Markdown("Talk, upload an audio file or enter a YouTube URL for processing.")

    with gr.Tab("Audio Processing"):
        gr.Markdown("### Example Audio Files")
        gr.Audio("/La_Casa.mp3", label="Short Audio 1")
        gr.Audio("/La_Neve.mp3", label="Short Audio 2")
        gr.Audio("/La_Lettera.mp3", label="Long Audio 3")
        gr.Audio("/Le_Feste.mp3", label="Long Audio 4")
        with gr.Row():
            audio_input = gr.Audio(label="Upload Audio or Record", type="filepath")
            audio_process_button = gr.Button("Process Audio")
        audio_transcription = gr.Textbox(label="Transcription")
        audio_summary = gr.Textbox(label="Summary")
        audio_translation = gr.Textbox(label="Translation")
        # NOTE(review): handler assumed to be process_audio (one filepath in,
        # three strings out) — confirm; transcribe_and_process also fits the
        # single input but returns a lone string when the input is empty.
        audio_process_button.click(
            fn=process_audio,
            inputs=audio_input,
            outputs=[audio_transcription, audio_summary, audio_translation],
        )

    with gr.Tab("YouTube Processing"):
        gr.Markdown("### Example YouTube URLs")
        # NOTE(review): the link targets below are empty in this file, so
        # these render as broken Markdown links — fill in the URLs.
        gr.Markdown("1. [The House](")
        gr.Markdown("2. [Introduction](")
        gr.Markdown("3. [Where Are You From?](")
        gr.Markdown("4. [The Colors](")
        with gr.Row():
            yt_input = gr.Textbox(label="YouTube URL")
            yt_process_button = gr.Button("Process YouTube Video")
        yt_transcription = gr.Textbox(label="Transcription")
        yt_summary = gr.Textbox(label="Summary")
        yt_translation = gr.Textbox(label="Translation")
        yt_process_button.click(
            fn=youtube_transcription,
            inputs=yt_input,
            outputs=[yt_transcription, yt_summary, yt_translation],
        )

    with gr.Tab("Italian Audio to English PDF"):
        gr.Markdown("### Example Audio Files")
        gr.Audio("/La_Casa.mp3", label="Short Audio 1")
        gr.Audio("/La_Neve.mp3", label="Short Audio 2")
        gr.Audio("/La_Lettera.mp3", label="Long Audio 3")
        gr.Audio("/Le_Feste.mp3", label="Long Audio 4")
        with gr.Row():
            audio_input = gr.Audio(label="Upload Italian Audio", type="filepath")
            translate_process_button = gr.Button("Translate and Save as PDF")
        translation_textbox = gr.Textbox(label="Translation")
        pdf_download = gr.File(label="Download PDF")
        tts_audio = gr.Audio(label="TTS Audio")
        translate_process_button.click(
            fn=audio_to_pdf,
            inputs=audio_input,
            outputs=[translation_textbox, pdf_download, tts_audio],
        )

app.launch()