ritwikm
/

gandhi-gpt

Text Generation

text-generation-inference

Inference Endpoints

Model card Files Files and versions Community

gandhi-gpt / code /myocr.py

ritwikm's picture

added code files

b7c468b over 2 years ago

history blame contribute delete

2.53 kB

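	'''
	Downloads each volume of the Collected Works of Mahatma Gandhi as a PDF,
	renders every page to an image, runs Tesseract OCR on it, and writes the
	indentation-preserving text of each volume to text_files/<volume>.txt.
	'''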

	import pytesseract
	from pytesseract import Output
	from PIL import Image
	import pandas as pd
	from tqdm import tqdm
	import os.path

	import fitz

	import subprocess

	def do_indent(df):
	    """Rebuild OCR word rows into text that approximates the page layout.

	    Args:
	        df: DataFrame of word-level OCR data (in this file it is built from
	            ``pytesseract.image_to_data``). The columns read here are
	            ``conf``, ``text``, ``block_num``, ``par_num``, ``line_num``,
	            ``left``, ``top`` and ``width``.

	    Returns:
	        str: The text of every block, with blocks ordered by the ``top``
	        value of their first row. A newline is emitted whenever the
	        paragraph or line number changes and after each block; every word
	        is followed by one space, and extra spaces are inserted before a
	        word whose ``left`` offset (measured in estimated character widths)
	        lies beyond the current column. Returns ``""`` when no rows survive
	        the blank filter.
	    """
	    # Drop rows with confidence -1 and rows with blank text. The confidence
	    # may arrive as the string '-1' or as a number depending on how the
	    # frame was produced, so compare numerically to catch both forms
	    # (comparing against the string alone never matches numeric data).
	    conf = pd.to_numeric(df['conf'], errors='coerce')
	    words = df[(conf != -1) & (df['text'] != ' ') & (df['text'] != '')]
	    if words.empty:
	        return ""

	    # Sort blocks vertically by the 'top' of each block's first row.
	    block_order = (
	        words.groupby('block_num').first().sort_values('top').index.tolist()
	    )

	    pieces = []
	    for block in block_order:
	        curr = words[words['block_num'] == block]

	        # Estimate the average character width from words longer than
	        # three characters.
	        long_words = curr[curr['text'].str.len() > 3]
	        char_w = (long_words['width'] / long_words['text'].str.len()).mean()
	        # NaN (no long words in the block) or 0 cannot be used as a divisor:
	        # NaN never triggers indentation and 0 would overflow int(), so
	        # indentation is skipped for such blocks.
	        can_indent = bool(char_w > 0)

	        prev_par, prev_line, prev_left = 0, 0, 0
	        for _, ln in curr.iterrows():
	            # Start a new output line when the paragraph or line changes.
	            if prev_par != ln['par_num']:
	                pieces.append('\n')
	                prev_par = ln['par_num']
	                prev_line = ln['line_num']
	                prev_left = 0
	            elif prev_line != ln['line_num']:
	                pieces.append('\n')
	                prev_line = ln['line_num']
	                prev_left = 0

	            added = 0  # number of padding spaces inserted before this word
	            if can_indent and ln['left'] / char_w > prev_left + 1:
	                added = int(ln['left'] / char_w) - prev_left
	                pieces.append(' ' * added)
	            pieces.append(ln['text'] + ' ')
	            # Track the current column: padding + word + trailing space.
	            prev_left += len(ln['text']) + added + 1
	        pieces.append('\n')
	    return ''.join(pieces)

	# Driver script: for each volume number, download the PDF, OCR every page
	# and write the recovered text to text_files/<volume>.txt.
	text_file_path = 'text_files/'
	start_page = 0  # pages with an index below this are skipped in every volume

	# Settings that never change between pages or volumes.
	pdffile = "file.pdf"        # download target, overwritten for each volume
	page_image = "outfile.png"  # scratch image, overwritten for each page
	mat = fitz.Matrix(5, 5)     # zoom factor used when rendering a page
	custom_config = r'-c preserve_interword_spaces=1 --oem 1 --psm 1 -l eng+ita'

	# Create the output directory up front so a missing directory cannot fail
	# the write after a whole volume has already been OCR'd.
	os.makedirs(text_file_path, exist_ok=True)

	for h in range(1, 99):
	    tfile = text_file_path + str(h) + '.txt'
	    url = "http://www.gandhiashramsevagram.org/gandhi-literature/mahatma-gandhi-collected-works-volume-"+str(h)+".pdf"

	    # check=True stops the run when the download fails instead of going on
	    # to open whatever file.pdf happens to contain.
	    subprocess.run(["wget", url, "-O", pdffile], check=True)

	    page_texts = []
	    doc = fitz.open(pdffile)
	    # https://stackoverflow.com/questions/46184239/extract-a-page-from-a-pdf-as-a-jpeg
	    try:
	        for i in tqdm(range(start_page, len(doc)), desc=str(h)+'/98'):
	            page = doc.load_page(i)  # page index within the document
	            pix = page.get_pixmap(matrix=mat)
	            pix.save(page_image)
	            # Close the image handle as soon as OCR is done with it.
	            with Image.open(page_image) as img:
	                d = pytesseract.image_to_data(img, config=custom_config, output_type=Output.DICT)
	            df = pd.DataFrame(d)
	            page_texts.append(do_indent(df))
	    finally:
	        # Release the PDF before the next volume overwrites file.pdf.
	        doc.close()

	    # Explicit UTF-8 so non-ASCII characters in the OCR output are written
	    # the same way regardless of the platform's default encoding.
	    with open(tfile, 'w', encoding='utf-8') as f:
	        f.write(''.join(page_texts))