# File size: 2,533 Bytes

# b7c468b

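'''
Scrape the Collected Works of Mahatma Gandhi: download each volume's PDF,
OCR every page with Tesseract, and write the text (with its horizontal
indentation approximated by spaces) to one text file per volume.
'''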

import pytesseract
from pytesseract import Output
from PIL import Image
import pandas as pd
from tqdm import tqdm
import os.path

import fitz

import subprocess

def _mean_char_width(words):
    """Estimate the average pixel width of one character in *words*.

    Words longer than three characters are preferred because very short
    tokens give noisy width/length ratios; if the block has no such word,
    every word is used instead so short-word blocks still get indented.

    Returns None when no positive, finite estimate can be made (for example
    when every reported width is zero).
    """
    lengths = words['text'].str.len()
    usable = lengths > 3
    if not usable.any():
        usable = lengths > 0
    char_w = (words.loc[usable, 'width'] / lengths[usable]).mean()
    if pd.notna(char_w) and 0 < char_w < float('inf'):
        return char_w
    return None


def do_indent(df):
    """Rebuild page text from OCR word boxes, approximating indentation.

    Args:
        df: DataFrame with one row per OCR element and the columns
            ``block_num``, ``par_num``, ``line_num``, ``left``, ``top``,
            ``width``, ``conf`` and ``text``.

    Returns:
        The page text as a string. Blocks are emitted top to bottom; every
        paragraph/line change starts a new line, each word is followed by a
        single space, and leading/inner gaps are padded with spaces according
        to the word's ``left`` offset divided by the block's mean character
        width. Returns ``""`` when there is nothing to emit.
    """
    if df.empty:
        return ""

    # clean up blanks: rows without a recognised word carry conf -1, which
    # arrives as the string '-1' or as a number depending on the producer,
    # so compare numerically; also drop tokens made only of whitespace
    conf = pd.to_numeric(df['conf'], errors='coerce')
    has_text = df['text'].str.strip() != ''
    words = df[(conf != -1) & has_text]
    if words.empty:
        return ""

    # sort blocks vertically by the top coordinate of their first word;
    # a stable sort keeps equal tops in block-number order
    block_order = (
        words.groupby('block_num')['top'].first().sort_values(kind='stable').index
    )

    pieces = []
    for block in block_order:
        curr = words[words['block_num'] == block]
        char_w = _mean_char_width(curr)
        # par_num / line_num of 0 never match a real word, so the first word
        # of every block always opens a fresh line
        prev_par, prev_line = 0, 0
        col = 0  # number of characters already emitted on the current line
        for par, line, left, word in zip(
            curr['par_num'], curr['line_num'], curr['left'], curr['text']
        ):
            # add new line when necessary
            if par != prev_par or line != prev_line:
                pieces.append('\n')
                prev_par, prev_line = par, line
                col = 0

            pad = 0  # num of spaces that should be added
            if char_w is not None and left / char_w > col + 1:
                pad = int(left / char_w) - col
                pieces.append(' ' * pad)
            pieces.append(word + ' ')
            col += len(word) + pad + 1
        pieces.append('\n')
    return ''.join(pieces)

# Directory that receives one "<volume>.txt" file per scraped volume.
text_file_path = 'text_files/'
# Index of the first PDF page to OCR; earlier pages are skipped.
start_page = 0

# Create the output directory up front so a missing folder cannot discard a
# whole volume's worth of OCR work when the result is finally written.
os.makedirs(text_file_path, exist_ok=True)

# Tesseract options: keep inter-word spacing, LSTM engine (--oem 1),
# automatic page segmentation with orientation detection (--psm 1),
# English + Italian language models.
custom_config = r'-c preserve_interword_spaces=1 --oem 1 --psm 1 -l eng+ita'

for h in range(1,99):
    tfile = text_file_path+str(h)+'.txt'
    url = "http://www.gandhiashramsevagram.org/gandhi-literature/mahatma-gandhi-collected-works-volume-"+str(h)+".pdf"
    pdffile = "file.pdf"
    # Pass the command as an argument list and fail loudly on a bad download
    # instead of going on to OCR a stale or empty file.pdf.
    subprocess.run(["wget", url, "-O", pdffile], check=True)

    doc = fitz.open(pdffile)
    # https://stackoverflow.com/questions/46184239/extract-a-page-from-a-pdf-as-a-jpeg
    page_texts = []
    try:
        mat = fitz.Matrix(5, 5) # zoom factor
        for i in tqdm(range(start_page, len(doc)), desc=str(h)+'/98'):
            page = doc.load_page(i)  # number of page
            pix = page.get_pixmap(matrix=mat)
            page_image = "outfile.png"
            pix.save(page_image)
            # Close the image handle once Tesseract has read it.
            with Image.open(page_image) as img:
                d = pytesseract.image_to_data(img, config=custom_config, output_type=Output.DICT)
            df = pd.DataFrame(d)
            page_texts.append(do_indent(df))
    finally:
        # Release the PDF before the next volume overwrites file.pdf.
        doc.close()

    # Explicit UTF-8 so accented OCR output does not depend on the locale.
    with open(tfile, 'w', encoding='utf-8') as f:
        f.write("".join(page_texts))