stupidog04 committed on
Commit 38e090f
1 Parent(s): 18bc0db

Update app.py

Files changed (1)
  1. app.py +35 -23
app.py CHANGED
@@ -1,6 +1,6 @@
 import numpy as np
 import PIL
-from PIL import Image, ImageDraw
+from PIL import Image, ImageDraw, ImageFont
 import gradio as gr
 import torch
 import easyocr
@@ -50,8 +50,9 @@ def inference(video, lang, time_step):
     frame_rate = vidcap.get(cv2.CAP_PROP_FPS)
     output_frames = []
     temporal_profiles = []
-    max_boxes = 10
-
+    max_boxes = 6
+    compress_mp4 = True
+
     # Get the positions of the largest boxes in the first frame
     while success and not bounds:
         if count == 0:
@@ -89,30 +90,42 @@ def inference(video, lang, time_step):
     frames_total = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))

     # Define the codec and create VideoWriter object.
-    temp = f"{Path(output).stem}_temp{Path(output).suffix}"
-    output_video = cv2.VideoWriter(
-        temp, cv2.VideoWriter_fourcc(*"mp4v"), fps, (width, height)
-    )
-    # output_video = cv2.VideoWriter(output, cv2.VideoWriter_fourcc(*"mp4v"), fps, (width, height))
+    if compress_mp4:
+        temp = f"{Path(output).stem}_temp{Path(output).suffix}"
+        output_video = cv2.VideoWriter(
+            temp, cv2.VideoWriter_fourcc(*"mp4v"), fps, (width, height)
+        )
+    else:
+        output_video = cv2.VideoWriter(output, cv2.VideoWriter_fourcc(*"mp4v"), fps, (width, height))
     for frame in output_frames:
         output_video.write(frame)
+
+    # Draw boxes with box indices in the first frame of the output video
+    im = Image.fromarray(output_frames[0])
+    draw = ImageDraw.Draw(im)
+    font_size = 30
+    font_path = "/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf"
+    for i, box in enumerate(largest_boxes):
+        draw.text((box_position(box)), f"Box {i+1}", fill='red', font=ImageFont.truetype(font_path, font_size))
+
     output_video.release()
     vidcap.release()

-    # Compressing the video for smaller size and web compatibility.
-    os.system(
-        f"ffmpeg -y -i {temp} -c:v libx264 -b:v 5000k -minrate 1000k -maxrate 8000k -pass 1 -c:a aac -f mp4 /dev/null && ffmpeg -y -i {temp} -c:v libx264 -b:v 5000k -minrate 1000k -maxrate 8000k -pass 2 -c:a aac -movflags faststart {output}"
-    )
-    os.system(f"rm -rf {temp} ffmpeg2pass-0.log ffmpeg2pass-0.log.mbtree")
+    if compress_mp4:
+        # Compressing the video for smaller size and web compatibility.
+        os.system(
+            f"ffmpeg -y -i {temp} -c:v libx264 -b:v 5000k -minrate 1000k -maxrate 8000k -pass 1 -c:a aac -f mp4 /dev/null && ffmpeg -y -i {temp} -c:v libx264 -b:v 5000k -minrate 1000k -maxrate 8000k -pass 2 -c:a aac -movflags faststart {output}"
+        )
+        os.system(f"rm -rf {temp} ffmpeg2pass-0.log ffmpeg2pass-0.log.mbtree")

     # Format temporal profiles as a DataFrame
-    df = pd.DataFrame(columns=["Box", "Time (s)", "Text"])
+    df_list = []
     for i, profile in enumerate(temporal_profiles):
         for t, text in profile:
-            df = df.append({"Box": f"Box {i+1}", "Time (s)": t, "Text": text}, ignore_index=True)
-
-    return output, df
-
+            df_list.append({"Box": f"Box {i+1}", "Time (s)": t, "Text": text})
+        df_list.append({"Box": f"", "Time (s)": "", "Text": ""})
+    df = pd.concat([pd.DataFrame(df_list)])
+    return output, im, df


 title = '🖼️Video to Multilingual OCR👁️Gradio'
@@ -120,8 +133,7 @@ description = 'Multilingual OCR which works conveniently on all devices in multi
 article = "<p style='text-align: center'></p>"

 examples = [
-    #['PleaseRepeatLouder.jpg',['ja']],['ProhibitedInWhiteHouse.JPG',['en']],['BeautyIsTruthTruthisBeauty.JPG',['en']],
-    ['20-Books.jpg',['en']],['COVID.png',['en']],['chinese.jpg',['ch_sim', 'en']],['japanese.jpg',['ja', 'en']],['Hindi.jpeg',['hi', 'en']]
+    ['test.mp4',['en']]
 ]

 css = ".output_image, .input_image {height: 40rem !important; width: 100% !important;}"
@@ -140,19 +152,19 @@ choices = [
 gr.Interface(
     inference,
     [
-        # gr.inputs.Image(type='file', label='Input Image'),
         gr.inputs.Video(label='Input Video'),
         gr.inputs.CheckboxGroup(choices, type="value", default=['en'], label='Language'),
         gr.inputs.Number(label='Time Step (in seconds)', default=1.0)
     ],
     [
         gr.outputs.Video(label='Output Video'),
-        gr.outputs.Dataframe(headers=['Box', 'Time (s)', 'Text'])
+        gr.outputs.Image(label='Output Preview', type='numpy'),
+        gr.outputs.Dataframe(headers=['Box', 'Time (s)', 'Text'], type='pandas')
     ],
     title=title,
     description=description,
     article=article,
-    # examples=examples,
+    examples=examples,
     css=css,
     enable_queue=True
 ).launch(debug=True)
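A few notes on the main changes, with small illustrative sketches in Python.

The new 'Output Preview' image is built by converting the first processed frame to a PIL image and stamping a red "Box N" label on each tracked region; the commit relies on largest_boxes and a box_position() helper defined elsewhere in app.py. The sketch below is only an approximation of the same idea, assuming EasyOCR-style four-corner boxes labelled at their top-left corner. It falls back to PIL's built-in font because the hard-coded DejaVuSans path only exists on typical Debian/Ubuntu images, and it converts the OpenCV BGR frame to RGB first (the commit passes the frame to Image.fromarray unchanged).

import os
import cv2
import numpy as np
from PIL import Image, ImageDraw, ImageFont

def label_boxes(frame_bgr, boxes, font_size=30):
    """Return a PIL image with 'Box N' drawn at each box's top-left corner.

    frame_bgr: HxWx3 uint8 array as produced by cv2.VideoCapture.read().
    boxes: iterable of four-point polygons, e.g. EasyOCR bounding boxes.
    """
    # OpenCV frames are BGR; convert so the preview colours are not swapped.
    im = Image.fromarray(cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2RGB))
    draw = ImageDraw.Draw(im)

    font_path = "/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf"  # assumption: Debian/Ubuntu font layout
    font = (ImageFont.truetype(font_path, font_size)
            if os.path.exists(font_path) else ImageFont.load_default())

    for i, box in enumerate(boxes):
        x, y = box[0]  # top-left corner of the quadrilateral
        draw.text((x, y), f"Box {i+1}", fill="red", font=font)
    return im

# Hypothetical usage with a dummy frame and a single box.
frame = np.zeros((240, 320, 3), dtype=np.uint8)
preview = label_boxes(frame, [[(10, 10), (100, 10), (100, 40), (10, 40)]])
preview.save("preview.png")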
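When compress_mp4 is enabled, the frames are first written to a temporary mp4v file and then re-encoded with two ffmpeg passes: pass 1 only gathers rate statistics (its output is discarded to /dev/null) and pass 2 writes the final H.264 file with -movflags faststart for web playback, after which the temporary file and the ffmpeg2pass logs are removed. A rough equivalent, sketched with subprocess instead of os.system and with placeholder file names; the bitrate settings are copied from the commit:

import subprocess
from pathlib import Path

def compress_for_web(temp_path, output_path):
    """Two-pass H.264 encode of temp_path into output_path (settings as in the commit)."""
    common = ["-c:v", "libx264", "-b:v", "5000k", "-minrate", "1000k",
              "-maxrate", "8000k", "-c:a", "aac"]
    # Pass 1: analysis only, discard the encoded output.
    subprocess.run(["ffmpeg", "-y", "-i", temp_path, *common,
                    "-pass", "1", "-f", "mp4", "/dev/null"], check=True)
    # Pass 2: real encode, moov atom up front for fast web start.
    subprocess.run(["ffmpeg", "-y", "-i", temp_path, *common,
                    "-pass", "2", "-movflags", "faststart", output_path], check=True)
    # Clean up the intermediate file and the ffmpeg pass logs.
    for leftover in (temp_path, "ffmpeg2pass-0.log", "ffmpeg2pass-0.log.mbtree"):
        Path(leftover).unlink(missing_ok=True)

# Hypothetical usage:
# compress_for_web("result_temp.mp4", "result.mp4")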
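The per-box text timelines are now collected into a plain list of row dicts and converted to a DataFrame once, replacing the old DataFrame.append calls (deprecated since pandas 1.4 and removed in 2.0). A small sketch with made-up profile data; note that pd.concat([pd.DataFrame(df_list)]), as written in the commit, produces the same result as pd.DataFrame(df_list):

import pandas as pd

# Made-up temporal profiles: one list of (time, text) samples per box.
temporal_profiles = [
    [(0.0, "STOP"), (1.0, "STOP")],
    [(0.0, "ONE WAY")],
]

df_list = []
for i, profile in enumerate(temporal_profiles):
    for t, text in profile:
        df_list.append({"Box": f"Box {i+1}", "Time (s)": t, "Text": text})
    # Blank separator row between boxes, mirroring the commit's formatting.
    df_list.append({"Box": "", "Time (s)": "", "Text": ""})

df = pd.DataFrame(df_list)  # equivalent to pd.concat([pd.DataFrame(df_list)])
print(df.to_string(index=False))

With these changes inference() returns three values (the output video path, the labelled preview image, and this DataFrame), which line up with the Video, Image, and Dataframe output components now declared in gr.Interface.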