"""Scrape the Collected Works of Mahatma Gandhi and OCR them to text files.

For every volume the script downloads the PDF with ``wget``, renders each
page to a PNG with PyMuPDF (``fitz``), runs Tesseract on the image and writes
the recognised text, with approximate horizontal indentation restored, to
``text_files/<volume>.txt``.
"""
import os.path
import subprocess

import fitz
import pandas as pd
import pytesseract
from PIL import Image
from pytesseract import Output
from tqdm import tqdm

# Directory (with trailing slash) that receives one ``<volume>.txt`` per volume.
text_file_path = 'text_files/'
# Zero-based index of the first page to OCR in every volume.
start_page = 0

FIRST_VOLUME = 1
LAST_VOLUME = 98
VOLUME_URL = (
    "http://www.gandhiashramsevagram.org/gandhi-literature/"
    "mahatma-gandhi-collected-works-volume-{}.pdf"
)
# Scratch files, overwritten for every volume / page.
PDF_PATH = "file.pdf"
PAGE_IMAGE_PATH = "outfile.png"
# Zoom factor applied on both axes when rendering a page for OCR.
ZOOM = 5
TESSERACT_CONFIG = r'-c preserve_interword_spaces=1 --oem 1 --psm 1 -l eng+ita'


def do_indent(df: pd.DataFrame) -> str:
    """Rebuild page text from OCR word rows, approximating the indentation.

    Args:
        df: Word-level OCR table as produced by
            ``pytesseract.image_to_data(..., output_type=Output.DICT)``.
            The columns read are ``conf``, ``text``, ``block_num``, ``top``,
            ``par_num``, ``line_num``, ``left`` and ``width``.

    Returns:
        The text of all blocks, ordered top to bottom.  Every new paragraph
        or line starts with a newline, every word is followed by one space,
        and every block is terminated by a newline.  Words are padded with
        spaces so that they start roughly at the column matching their
        ``left`` pixel offset.  An empty table yields ``""``.
    """
    if df.empty:
        return ""

    # Drop rows that carry no word.  ``conf`` may arrive as strings or as
    # numbers depending on the pytesseract version, so normalise it before
    # comparing with -1.
    conf = pd.to_numeric(df["conf"], errors="coerce")
    words = df[(conf != -1) & (df["text"] != " ") & (df["text"] != "")]
    if words.empty:
        return ""

    # Order blocks by the vertical position of their first word.
    block_order = words.groupby("block_num").first().sort_values("top").index

    parts = []
    for block in block_order:
        block_words = words[words["block_num"] == block]

        # Average pixel width of one character, estimated from words longer
        # than three characters.  NaN when the block has no such word.
        long_words = block_words[block_words["text"].str.len() > 3]
        char_w = (long_words["width"] / long_words["text"].str.len()).mean()
        # False for NaN and for a degenerate zero width: no padding then.
        can_indent = char_w > 0

        prev_par = prev_line = 0
        column = 0  # characters already emitted on the current line
        rows = zip(
            block_words["par_num"],
            block_words["line_num"],
            block_words["left"],
            block_words["text"],
        )
        for par, line, left, word in rows:
            # Start a new output line whenever the paragraph or line changes.
            if par != prev_par or line != prev_line:
                parts.append("\n")
                prev_par, prev_line = par, line
                column = 0

            pad = 0  # number of spaces needed to reach the word's column
            if can_indent and left / char_w > column + 1:
                pad = int(left / char_w) - column
                parts.append(" " * pad)
            parts.append(word + " ")
            column += len(word) + pad + 1
        parts.append("\n")
    return "".join(parts)


def download_volume(volume: int, dest: str = PDF_PATH) -> str:
    """Download the PDF of *volume* to *dest* with ``wget`` and return *dest*.

    Raises:
        subprocess.CalledProcessError: if ``wget`` exits with a non-zero
            status, so a failed download is reported before the PDF is opened.
    """
    url = VOLUME_URL.format(volume)
    subprocess.run(["wget", url, "-O", dest], check=True)
    return dest


def ocr_page(page) -> str:
    """Render one PyMuPDF *page* to ``PAGE_IMAGE_PATH``, OCR it, return text."""
    # https://stackoverflow.com/questions/46184239/extract-a-page-from-a-pdf-as-a-jpeg
    pix = page.get_pixmap(matrix=fitz.Matrix(ZOOM, ZOOM))
    pix.save(PAGE_IMAGE_PATH)
    with Image.open(PAGE_IMAGE_PATH) as image:
        data = pytesseract.image_to_data(
            image, config=TESSERACT_CONFIG, output_type=Output.DICT
        )
    return do_indent(pd.DataFrame(data))


def scrape_volume(volume: int) -> str:
    """Download *volume* and return the OCR text of its pages, concatenated.

    Pages before ``start_page`` are skipped.
    """
    pdf_path = download_volume(volume)
    page_texts = []
    with fitz.open(pdf_path) as doc:
        # Clamp so that a negative start_page cannot index pages from the end.
        page_indices = range(max(start_page, 0), len(doc))
        for page_index in tqdm(page_indices, desc=f"{volume}/{LAST_VOLUME}"):
            page_texts.append(ocr_page(doc.load_page(page_index)))
    return "".join(page_texts)


def main() -> None:
    """OCR every volume and write ``<text_file_path><volume>.txt`` for each."""
    # Create the output directory up front so that hours of OCR are not lost
    # on the final write.
    os.makedirs(text_file_path, exist_ok=True)
    for volume in range(FIRST_VOLUME, LAST_VOLUME + 1):
        volume_text = scrape_volume(volume)
        with open(f"{text_file_path}{volume}.txt", "w", encoding="utf-8") as out:
            out.write(volume_text)


if __name__ == "__main__":
    main()