Video-to-Multilingual-OCR

Runtime error

App Files Files Community

Video-to-Multilingual-OCR / app.py

stupidog04

Update app.py

38e090f over 1 year ago

raw

history blame

6.49 kB

	import numpy as np
	import PIL
	from PIL import Image, ImageDraw, ImageFont
	import gradio as gr
	import torch
	import easyocr
	import os
	from pathlib import Path
	import cv2
	import pandas as pd


	#torch.hub.download_url_to_file('https://github.com/AaronCWacker/Yggdrasil/blob/main/images/BeautyIsTruthTruthisBeauty.JPG', 'BeautyIsTruthTruthisBeauty.JPG')
	#torch.hub.download_url_to_file('https://github.com/AaronCWacker/Yggdrasil/blob/main/images/PleaseRepeatLouder.jpg', 'PleaseRepeatLouder.jpg')
	#torch.hub.download_url_to_file('https://github.com/AaronCWacker/Yggdrasil/blob/main/images/ProhibitedInWhiteHouse.JPG', 'ProhibitedInWhiteHouse.JPG')

	# Fetch the sample images at import time. Each entry pairs a source URL
	# with the local filename it is saved under.
	for _sample_url, _sample_name in (
	    ('https://raw.githubusercontent.com/AaronCWacker/Yggdrasil/master/images/20-Books.jpg', '20-Books.jpg'),
	    ('https://github.com/JaidedAI/EasyOCR/raw/master/examples/english.png', 'COVID.png'),
	    ('https://github.com/JaidedAI/EasyOCR/raw/master/examples/chinese.jpg', 'chinese.jpg'),
	    ('https://github.com/JaidedAI/EasyOCR/raw/master/examples/japanese.jpg', 'japanese.jpg'),
	    ('https://i.imgur.com/mwQFd7G.jpeg', 'Hindi.jpeg'),
	):
	    torch.hub.download_url_to_file(_sample_url, _sample_name)

	def draw_boxes(image, bounds, color='yellow', width=2):
	    """Outline every OCR detection on ``image`` and return that same image.

	    Args:
	        image: Image accepted by ``ImageDraw.Draw``; it is drawn on in place.
	        bounds: Iterable of detections whose first element is a sequence of
	            four corner points (the shape ``inference`` gets back from
	            ``reader.readtext``).
	        color: Line colour passed to ``ImageDraw.line`` as ``fill``.
	        width: Line width in pixels.

	    Returns:
	        The ``image`` object that was passed in, now carrying the outlines.
	    """
	    draw = ImageDraw.Draw(image)
	    for bound in bounds:
	        # Normalise each corner to a tuple so the polyline is one homogeneous
	        # list of (x, y) points rather than a mix of nested points and flat
	        # coordinates, which not every Pillow release accepts.
	        p0, p1, p2, p3 = (tuple(point) for point in bound[0])
	        # Repeat the first corner to close the quadrilateral.
	        draw.line([p0, p1, p2, p3, p0], fill=color, width=width)
	    return image

	def box_size(box):
	    """Return the area spanned by corners 0 and 2 of a detection.

	    ``box[0]`` holds the corner points. Anything that does not have exactly
	    four corners is given an area of 0.
	    """
	    corners = box[0]
	    if len(corners) != 4:
	        return 0
	    # Corners 0 and 2 are treated as the two ends of the diagonal.
	    (left, top), (right, bottom) = corners[0], corners[2]
	    span_x = abs(left - right)
	    span_y = abs(top - bottom)
	    return span_x * span_y

	def box_position(box):
	    """Return the (x, y) midpoint between corners 0 and 2 of a detection."""
	    corners = box[0]
	    first, third = corners[0], corners[2]
	    mid_x = (first[0] + third[0]) / 2
	    mid_y = (first[1] + third[1]) / 2
	    return mid_x, mid_y


	def _sampling_stride(frame_rate, time_step):
	    """Return how many frames lie between two OCR passes (never less than 1)."""
	    try:
	        stride = int(frame_rate * time_step)
	    except (TypeError, ValueError, OverflowError):
	        # time_step was not a number, or the product was not finite.
	        return 1
	    # A product below one frame would otherwise become a modulo-by-zero.
	    return max(stride, 1)


	def _annotate_frame(frame_bgr, bounds):
	    """Return an RGB array of ``frame_bgr`` with the OCR boxes drawn on it."""
	    # OpenCV decodes frames as BGR while PIL treats arrays as RGB; convert
	    # first so named colours and the preview image come out as intended.
	    rgb = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2RGB)
	    return np.array(draw_boxes(PIL.Image.fromarray(rgb), bounds))


	def _write_video(path, frames_rgb, fps):
	    """Encode the RGB ``frames_rgb`` to ``path`` with the mp4v codec."""
	    # Take the size from the frames themselves so it always matches what is
	    # handed to the writer.
	    height, width = frames_rgb[0].shape[:2]
	    writer = cv2.VideoWriter(path, cv2.VideoWriter_fourcc(*"mp4v"), fps, (width, height))
	    try:
	        for frame in frames_rgb:
	            writer.write(cv2.cvtColor(frame, cv2.COLOR_RGB2BGR))
	    finally:
	        writer.release()


	def _compress_mp4(source, target):
	    """Re-encode ``source`` into ``target`` with two-pass libx264 via ffmpeg.

	    If ffmpeg is missing or fails, the uncompressed ``source`` is moved to
	    ``target`` so the caller still gets a playable file.
	    """
	    import subprocess  # local import: not part of the module's import header

	    common = [
	        "ffmpeg", "-y", "-i", source,
	        "-c:v", "libx264", "-b:v", "5000k", "-minrate", "1000k", "-maxrate", "8000k",
	    ]
	    try:
	        subprocess.run(common + ["-pass", "1", "-c:a", "aac", "-f", "mp4", os.devnull], check=True)
	        subprocess.run(common + ["-pass", "2", "-c:a", "aac", "-movflags", "faststart", target], check=True)
	    except (OSError, subprocess.CalledProcessError):
	        os.replace(source, target)
	    finally:
	        for leftover in (source, "ffmpeg2pass-0.log", "ffmpeg2pass-0.log.mbtree"):
	            try:
	                os.remove(leftover)
	            except OSError:
	                # Best-effort cleanup: the file may never have been created.
	                pass


	def _load_label_font(size):
	    """Load the DejaVu Sans label font, falling back to PIL's built-in font."""
	    try:
	        return ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf", size)
	    except OSError:
	        return ImageFont.load_default()


	def inference(video, lang, time_step):
	    """Run OCR over sampled frames of a video and track text in the largest boxes.

	    The first sampled frame that contains text defines up to six reference
	    boxes (the largest by ``box_size``). On every later sampled frame, each
	    detection whose centre lies within 50 pixels of a reference centre is
	    recorded against that reference box together with its timestamp.

	    Args:
	        video: Source passed to ``cv2.VideoCapture``.
	        lang: Language list passed to ``easyocr.Reader``.
	        time_step: Seconds between two OCR passes; multiplied by the frame
	            rate to get the sampling stride.

	    Returns:
	        A tuple ``(output, im, df)``: the path of the annotated video, a PIL
	        preview image with the reference boxes labelled, and a DataFrame with
	        columns ``Box``, ``Time (s)`` and ``Text``.

	    Raises:
	        ValueError: If no frame could be read from ``video``.
	    """
	    # NOTE(review): the output path is fixed, so concurrent calls would
	    # overwrite each other's result.
	    output = 'results.mp4'
	    max_boxes = 6
	    compress_mp4 = True
	    match_radius = 50  # pixels between centres for a detection to match a reference box

	    reader = easyocr.Reader(lang)
	    vidcap = cv2.VideoCapture(video)

	    largest_boxes = []
	    positions = None  # None until a sampled frame with text has been found
	    temporal_profiles = []
	    output_frames = []
	    fallback_frame = None

	    try:
	        frame_rate = vidcap.get(cv2.CAP_PROP_FPS)
	        if not frame_rate > 0:
	            # No usable rate reported; assume 30 fps so timestamps and the
	            # writer stay valid.
	            frame_rate = 30.0
	        stride = _sampling_stride(frame_rate, time_step)

	        count = 0
	        success, frame = vidcap.read()
	        while success:
	            if count % stride == 0:
	                bounds = reader.readtext(frame)
	                if positions is None:
	                    # Still looking for the reference frame. Keep the latest
	                    # annotated candidate so there is something to return
	                    # even if no later frame is sampled.
	                    fallback_frame = _annotate_frame(frame, bounds)
	                    if bounds:
	                        largest_boxes = sorted(bounds, key=box_size, reverse=True)[:max_boxes]
	                        positions = [np.array(box_position(b)) for b in largest_boxes]
	                        temporal_profiles = [[] for _ in largest_boxes]
	                else:
	                    timestamp = count / frame_rate
	                    for box in bounds:
	                        center = np.array(box_position(box))
	                        # Assign the detection to the first reference box in range.
	                        for i, position in enumerate(positions):
	                            if np.linalg.norm(center - position) < match_radius:
	                                temporal_profiles[i].append((timestamp, box[1]))
	                                break
	                    output_frames.append(_annotate_frame(frame, bounds))
	            success, frame = vidcap.read()
	            count += 1
	    finally:
	        vidcap.release()

	    if not output_frames:
	        if fallback_frame is None:
	            raise ValueError(f"No frames could be read from {video!r}")
	        output_frames.append(fallback_frame)

	    # Write the sampled frames, optionally re-encoding them for smaller size
	    # and web compatibility.
	    if compress_mp4:
	        temp = f"{Path(output).stem}_temp{Path(output).suffix}"
	        _write_video(temp, output_frames, frame_rate)
	        _compress_mp4(temp, output)
	    else:
	        _write_video(output, output_frames, frame_rate)

	    # Label the reference boxes on a preview built from the first output frame.
	    im = Image.fromarray(output_frames[0])
	    draw = ImageDraw.Draw(im)
	    font = _load_label_font(30)
	    for i, box in enumerate(largest_boxes):
	        draw.text(box_position(box), f"Box {i+1}", fill='red', font=font)

	    # Format temporal profiles as a DataFrame; a blank row separates boxes.
	    rows = []
	    for i, profile in enumerate(temporal_profiles):
	        rows.extend({"Box": f"Box {i+1}", "Time (s)": t, "Text": text} for t, text in profile)
	        rows.append({"Box": "", "Time (s)": "", "Text": ""})
	    df = pd.DataFrame(rows, columns=["Box", "Time (s)", "Text"])
	    return output, im, df


	# ---- Gradio front end: static texts, language options and the interface ----
	title = "🖼️Video to Multilingual OCR👁️Gradio"
	description = "Multilingual OCR which works conveniently on all devices in multiple languages."
	article = "<p style='text-align: center'></p>"
	css = ".output_image, .input_image {height: 40rem !important; width: 100% !important;}"

	# Language codes offered in the checkbox group.
	choices = ["ch_sim", "ch_tra", "de", "en", "es", "ja", "hi", "ru"]

	# NOTE(review): this row supplies two values (video, languages) while the
	# interface declares three inputs - confirm the time step is meant to be omitted.
	examples = [["test.mp4", ["en"]]]

	# Inputs and outputs are listed in the order of inference()'s parameters
	# and return values.
	_input_components = [
	    gr.inputs.Video(label="Input Video"),
	    gr.inputs.CheckboxGroup(choices, type="value", default=["en"], label="Language"),
	    gr.inputs.Number(label="Time Step (in seconds)", default=1.0),
	]
	_output_components = [
	    gr.outputs.Video(label="Output Video"),
	    gr.outputs.Image(label="Output Preview", type="numpy"),
	    gr.outputs.Dataframe(headers=["Box", "Time (s)", "Text"], type="pandas"),
	]

	demo = gr.Interface(
	    fn=inference,
	    inputs=_input_components,
	    outputs=_output_components,
	    title=title,
	    description=description,
	    article=article,
	    examples=examples,
	    css=css,
	    enable_queue=True,
	)
	demo.launch(debug=True)