Files changed (1)
  1. app.py +47 -135
app.py CHANGED
@@ -1,135 +1,47 @@
- import spaces
- import torch
- import gradio as gr
- import tempfile
- import os
- import uuid
- import scipy.io.wavfile
- import time
- import numpy as np
- from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, WhisperTokenizer, pipeline
- import subprocess
- subprocess.run(
-     "pip install flash-attn --no-build-isolation",
-     env={"FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"},
-     shell=True,
- )
-
- device = "cuda" if torch.cuda.is_available() else "cpu"
- torch_dtype = torch.float16
- MODEL_NAME = "openai/whisper-large-v3-turbo"
-
- model = AutoModelForSpeechSeq2Seq.from_pretrained(
-     MODEL_NAME, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True, attn_implementation="flash_attention_2"
- )
- model.to(device)
-
- processor = AutoProcessor.from_pretrained(MODEL_NAME)
- tokenizer = WhisperTokenizer.from_pretrained(MODEL_NAME)
-
- pipe = pipeline(
-     task="automatic-speech-recognition",
-     model=model,
-     tokenizer=tokenizer,
-     feature_extractor=processor.feature_extractor,
-     chunk_length_s=10,
-     torch_dtype=torch_dtype,
-     device=device,
- )
-
- @spaces.GPU
- def transcribe(inputs, previous_transcription):
-     start_time = time.time()
-     try:
-         filename = f"{uuid.uuid4().hex}.wav"
-         sample_rate, audio_data = inputs
-         scipy.io.wavfile.write(filename, sample_rate, audio_data)
-
-         transcription = pipe(filename)["text"]
-         previous_transcription += transcription
-
-         end_time = time.time()
-         latency = end_time - start_time
-         return previous_transcription, f"{latency:.2f}"
-     except Exception as e:
-         print(f"Error during Transcription: {e}")
-         return previous_transcription, "Error"
-
- @spaces.GPU
- def translate_and_transcribe(inputs, previous_transcription, target_language):
-     start_time = time.time()
-     try:
-         filename = f"{uuid.uuid4().hex}.wav"
-         sample_rate, audio_data = inputs
-         scipy.io.wavfile.write(filename, sample_rate, audio_data)
-
-         translation = pipe(filename, generate_kwargs={"task": "translate", "language": target_language})["text"]
-
-         previous_transcription += translation
-
-         end_time = time.time()
-         latency = end_time - start_time
-         return previous_transcription, f"{latency:.2f}"
-     except Exception as e:
-         print(f"Error during Translation and Transcription: {e}")
-         return previous_transcription, "Error"
-
- def clear():
-     return ""
-
- with gr.Blocks() as microphone:
-     with gr.Column():
-         gr.Markdown(f"# Realtime Whisper Large V3 Turbo: \n Transcribe Audio in Realtime. This Demo uses the Checkpoint [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers.\n Note: The first token takes about 5 seconds. After that, it works flawlessly.")
-         with gr.Row():
-             input_audio_microphone = gr.Audio(streaming=True)
-             output = gr.Textbox(label="Transcription", value="")
-             latency_textbox = gr.Textbox(label="Latency (seconds)", value="0.0", scale=0)
-         with gr.Row():
-             clear_button = gr.Button("Clear Output")
-
-         input_audio_microphone.stream(transcribe, [input_audio_microphone, output], [output, latency_textbox], time_limit=45, stream_every=2, concurrency_limit=None)
-         clear_button.click(clear, outputs=[output])
-
- with gr.Blocks() as file:
-     with gr.Column():
-         gr.Markdown(f"# Realtime Whisper Large V3 Turbo: \n Transcribe Audio in Realtime. This Demo uses the Checkpoint [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers.\n Note: The first token takes about 5 seconds. After that, it works flawlessly.")
-         with gr.Row():
-             input_audio_microphone = gr.Audio(sources="upload", type="numpy")
-             output = gr.Textbox(label="Transcription", value="")
-             latency_textbox = gr.Textbox(label="Latency (seconds)", value="0.0", scale=0)
-         with gr.Row():
-             submit_button = gr.Button("Submit")
-             clear_button = gr.Button("Clear Output")
-
-         submit_button.click(transcribe, [input_audio_microphone, output], [output, latency_textbox], concurrency_limit=None)
-         clear_button.click(clear, outputs=[output])
-
- # with gr.Blocks() as translate:
- #     with gr.Column():
- #         gr.Markdown(f"# Realtime Whisper Large V3 Turbo (Translation): \n Transcribe and Translate Audio in Realtime. This Demo uses the Checkpoint [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers.\n Note: The first token takes about 5 seconds. After that, it works flawlessly.")
- #         with gr.Row():
- #             input_audio_microphone = gr.Audio(streaming=True)
- #             output = gr.Textbox(label="Transcription and Translation", value="")
- #             latency_textbox = gr.Textbox(label="Latency (seconds)", value="0.0", scale=0)
- #             target_language_dropdown = gr.Dropdown(
- #                 choices=["english", "french", "hindi", "spanish", "russian"],
- #                 label="Target Language",
- #                 value="<|es|>"
- #             )
- #         with gr.Row():
- #             clear_button = gr.Button("Clear Output")
-
- #         input_audio_microphone.stream(
- #             translate_and_transcribe,
- #             [input_audio_microphone, output, target_language_dropdown],
- #             [output, latency_textbox],
- #             time_limit=45,
- #             stream_every=2,
- #             concurrency_limit=None
- #         )
- #         clear_button.click(clear, outputs=[output])
-
- with gr.Blocks() as demo:
-     gr.TabbedInterface([microphone, file], ["Microphone", "Transcribe from file"])
-
- demo.launch()
+ import gradio as gr
+ import easyocr
+ import cv2
+ import numpy as np
+ from PIL import Image
+
+ # Create an EasyOCR Reader
+ reader = easyocr.Reader(['en'])
+
+
+ def process_image(image):
+     # Convert the input image to a numpy array (compatible with OpenCV)
+     image_np = np.array(image)
+
+     # Gradio supplies the image in RGB order; EasyOCR expects OpenCV-style BGR for numpy input, so swap channels
+     image_bgr = cv2.cvtColor(image_np, cv2.COLOR_RGB2BGR)
+
+     # Use EasyOCR to read text from the image
+     result = reader.readtext(image_bgr)
+
+     # Draw bounding boxes around detected text (on the original RGB array)
+     for (bbox, text, prob) in result:
+         (top_left, top_right, bottom_right, bottom_left) = bbox
+         top_left = tuple(map(int, top_left))
+         bottom_right = tuple(map(int, bottom_right))
+         cv2.rectangle(image_np, top_left, bottom_right, (0, 255, 0), 2)
+
+     # image_np is still in RGB order, so it can be wrapped directly for display
+     result_image = Image.fromarray(image_np)
+
+     # Combine detected text and their confidence scores
+     detected_text = "\n".join([f"Detected text: {text}, Confidence: {prob:.2f}" for (_, text, prob) in result])
+
+     return result_image, detected_text
+
+
+ # Gradio Interface
+ interface = gr.Interface(
+     fn=process_image,
+     inputs="image",
+     outputs=["image", "text"],
+     title="OCR with EasyOCR",
+     description="Upload an image, and the system will detect text using EasyOCR and display it."
+ )
+
+ # Launch the interface
+ interface.launch()
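
For reference, a minimal sketch of how the new process_image function could be exercised locally, outside the Gradio UI; the image path is a placeholder. Each entry returned by reader.readtext is a (bbox, text, confidence) tuple, with bbox holding the four corner points starting from the top left.

from PIL import Image

# Placeholder test image; .convert("RGB") guards against RGBA or grayscale inputs
img = Image.open("sample.png").convert("RGB")

annotated, report = process_image(img)
annotated.save("annotated.png")  # copy of the image with green bounding boxes drawn
print(report)                    # one "Detected text: ..., Confidence: ..." line per detection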