Files changed (1)
  1. app.py +47 -135
app.py CHANGED
@@ -1,135 +1,47 @@
- import spaces
- import torch
- import gradio as gr
- import tempfile
- import os
- import uuid
- import scipy.io.wavfile
- import time
- import numpy as np
- from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, WhisperTokenizer, pipeline
- import subprocess
- subprocess.run(
-     "pip install flash-attn --no-build-isolation",
-     env={"FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"},
-     shell=True,
- )
-
- device = "cuda" if torch.cuda.is_available() else "cpu"
- torch_dtype = torch.float16
- MODEL_NAME = "openai/whisper-large-v3-turbo"
-
- model = AutoModelForSpeechSeq2Seq.from_pretrained(
-     MODEL_NAME, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True, attn_implementation="flash_attention_2"
- )
- model.to(device)
-
- processor = AutoProcessor.from_pretrained(MODEL_NAME)
- tokenizer = WhisperTokenizer.from_pretrained(MODEL_NAME)
-
- pipe = pipeline(
-     task="automatic-speech-recognition",
-     model=model,
-     tokenizer=tokenizer,
-     feature_extractor=processor.feature_extractor,
-     chunk_length_s=10,
-     torch_dtype=torch_dtype,
-     device=device,
- )
-
- @spaces.GPU
- def transcribe(inputs, previous_transcription):
-     start_time = time.time()
-     try:
-         filename = f"{uuid.uuid4().hex}.wav"
-         sample_rate, audio_data = inputs
-         scipy.io.wavfile.write(filename, sample_rate, audio_data)
-
-         transcription = pipe(filename)["text"]
-         previous_transcription += transcription
-
-         end_time = time.time()
-         latency = end_time - start_time
-         return previous_transcription, f"{latency:.2f}"
-     except Exception as e:
-         print(f"Error during Transcription: {e}")
-         return previous_transcription, "Error"
-
- @spaces.GPU
- def translate_and_transcribe(inputs, previous_transcription, target_language):
-     start_time = time.time()
-     try:
-         filename = f"{uuid.uuid4().hex}.wav"
-         sample_rate, audio_data = inputs
-         scipy.io.wavfile.write(filename, sample_rate, audio_data)
-
-         translation = pipe(filename, generate_kwargs={"task": "translate", "language": target_language})["text"]
-
-         previous_transcription += translation
-
-         end_time = time.time()
-         latency = end_time - start_time
-         return previous_transcription, f"{latency:.2f}"
-     except Exception as e:
-         print(f"Error during Translation and Transcription: {e}")
-         return previous_transcription, "Error"
-
- def clear():
-     return ""
-
- with gr.Blocks() as microphone:
-     with gr.Column():
-         gr.Markdown(f"# Realtime Whisper Large V3 Turbo: \n Transcribe Audio in Realtime. This Demo uses the Checkpoint [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers.\n Note: The first token takes about 5 seconds. After that, it works flawlessly.")
-         with gr.Row():
-             input_audio_microphone = gr.Audio(streaming=True)
-             output = gr.Textbox(label="Transcription", value="")
-             latency_textbox = gr.Textbox(label="Latency (seconds)", value="0.0", scale=0)
-         with gr.Row():
-             clear_button = gr.Button("Clear Output")
-
-         input_audio_microphone.stream(transcribe, [input_audio_microphone, output], [output, latency_textbox], time_limit=45, stream_every=2, concurrency_limit=None)
-         clear_button.click(clear, outputs=[output])
-
- with gr.Blocks() as file:
-     with gr.Column():
-         gr.Markdown(f"# Realtime Whisper Large V3 Turbo: \n Transcribe Audio in Realtime. This Demo uses the Checkpoint [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers.\n Note: The first token takes about 5 seconds. After that, it works flawlessly.")
-         with gr.Row():
-             input_audio_microphone = gr.Audio(sources="upload", type="numpy")
-             output = gr.Textbox(label="Transcription", value="")
-             latency_textbox = gr.Textbox(label="Latency (seconds)", value="0.0", scale=0)
-         with gr.Row():
-             submit_button = gr.Button("Submit")
-             clear_button = gr.Button("Clear Output")
-
-         submit_button.click(transcribe, [input_audio_microphone, output], [output, latency_textbox], concurrency_limit=None)
-         clear_button.click(clear, outputs=[output])
-
- # with gr.Blocks() as translate:
- #     with gr.Column():
- #         gr.Markdown(f"# Realtime Whisper Large V3 Turbo (Translation): \n Transcribe and Translate Audio in Realtime. This Demo uses the Checkpoint [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers.\n Note: The first token takes about 5 seconds. After that, it works flawlessly.")
- #         with gr.Row():
- #             input_audio_microphone = gr.Audio(streaming=True)
- #             output = gr.Textbox(label="Transcription and Translation", value="")
- #             latency_textbox = gr.Textbox(label="Latency (seconds)", value="0.0", scale=0)
- #             target_language_dropdown = gr.Dropdown(
- #                 choices=["english", "french", "hindi", "spanish", "russian"],
- #                 label="Target Language",
- #                 value="<|es|>"
- #             )
- #         with gr.Row():
- #             clear_button = gr.Button("Clear Output")
-
- #         input_audio_microphone.stream(
- #             translate_and_transcribe,
- #             [input_audio_microphone, output, target_language_dropdown],
- #             [output, latency_textbox],
- #             time_limit=45,
- #             stream_every=2,
- #             concurrency_limit=None
- #         )
- #         clear_button.click(clear, outputs=[output])
-
- with gr.Blocks() as demo:
-     gr.TabbedInterface([microphone, file], ["Microphone", "Transcribe from file"])
-
- demo.launch()
+ import gradio as gr
+ import easyocr
+ import cv2
+ import numpy as np
+ from PIL import Image
+
+ # Create an EasyOCR Reader
+ reader = easyocr.Reader(['en'])
+
+
+ def process_image(image):
+     # Convert the input image to a numpy array (compatible with OpenCV)
+     image_np = np.array(image)
+
+     # Gradio supplies the image in RGB order; EasyOCR expects OpenCV-style BGR for numpy input, so swap channels
+     image_bgr = cv2.cvtColor(image_np, cv2.COLOR_RGB2BGR)
+
+     # Use EasyOCR to read text from the image
+     result = reader.readtext(image_bgr)
+
+     # Draw bounding boxes around detected text (on the original RGB array)
+     for (bbox, text, prob) in result:
+         (top_left, top_right, bottom_right, bottom_left) = bbox
+         top_left = tuple(map(int, top_left))
+         bottom_right = tuple(map(int, bottom_right))
+         cv2.rectangle(image_np, top_left, bottom_right, (0, 255, 0), 2)
+
+     # image_np is still in RGB order, so it can be wrapped directly for display
+     result_image = Image.fromarray(image_np)
+
+     # Combine detected text and their confidence scores
+     detected_text = "\n".join([f"Detected text: {text}, Confidence: {prob:.2f}" for (_, text, prob) in result])
+
+     return result_image, detected_text
+
+
+ # Gradio Interface
+ interface = gr.Interface(
+     fn=process_image,
+     inputs="image",
+     outputs=["image", "text"],
+     title="OCR with EasyOCR",
+     description="Upload an image, and the system will detect text using EasyOCR and display it."
+ )
+
+ # Launch the interface
+ interface.launch()
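
For reference, a minimal sketch of how the new process_image function could be exercised locally, outside the Gradio UI; the image path is a placeholder. Each entry returned by reader.readtext is a (bbox, text, confidence) tuple, with bbox holding the four corner points starting from the top left.

from PIL import Image

# Placeholder test image; .convert("RGB") guards against RGBA or grayscale inputs
img = Image.open("sample.png").convert("RGB")

annotated, report = process_image(img)
annotated.save("annotated.png")  # copy of the image with green bounding boxes drawn
print(report)                    # one "Detected text: ..., Confidence: ..." line per detection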