File size: 2,533 Bytes
b7c468b |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 |
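'''
Scrape the Collected Works of Mahatma Gandhi: download each volume's PDF,
render every page to an image, OCR it with Tesseract, and save the
indentation-preserving text of each volume to its own text file.
'''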
import pytesseract
from pytesseract import Output
from PIL import Image
import pandas as pd
from tqdm import tqdm
import os.path
import fitz
import subprocess
def do_indent(df):
    '''
    Rebuild page text from Tesseract word data, approximating the original
    horizontal layout with leading/inter-word spaces.

    Parameters:
        df: pandas DataFrame with the columns ``conf``, ``text``,
            ``block_num``, ``par_num``, ``line_num``, ``left``, ``top`` and
            ``width`` (the layout produced from
            ``pytesseract.image_to_data(..., output_type=Output.DICT)``).

    Returns:
        str: the words of every block, blocks ordered top to bottom. A
        newline is emitted whenever the paragraph or line number changes and
        after every block; each word is followed by a single space. An empty
        frame yields ``""``.
    '''
    # Nothing to lay out (this also covers a frame with no columns at all).
    if df.empty:
        return ""

    # clean up blanks
    # conf may hold the string '-1' or a numeric -1 (this appears to depend
    # on the pytesseract version), so compare numerically; comparing against
    # the string alone never matches numeric values.
    conf = pd.to_numeric(df.conf, errors='coerce')
    df1 = df[(conf != -1) & (df.text != ' ') & (df.text != '')]

    # sort blocks vertically
    sorted_blocks = df1.groupby('block_num').first().sort_values('top').index.tolist()

    pieces = []  # joined once at the end instead of repeated string +=
    for block in sorted_blocks:
        curr = df1[df1['block_num'] == block]

        # Estimate the width of one character from words longer than three
        # characters in this block.
        sel = curr[curr.text.str.len() > 3]
        char_w = (sel.width / sel.text.str.len()).mean()
        # No usable estimate (no long words -> NaN, or zero width): emit the
        # words without indentation instead of dividing by zero.
        can_indent = bool(pd.notna(char_w) and char_w > 0)

        # prev_left counts the characters already emitted on the current line.
        prev_par, prev_line, prev_left = 0, 0, 0
        columns = zip(curr['par_num'], curr['line_num'], curr['left'], curr['text'])
        for par, line, left, word in columns:
            # add new line when necessary
            if prev_par != par:
                pieces.append('\n')
                prev_par = par
                prev_line = line
                prev_left = 0
            elif prev_line != line:
                pieces.append('\n')
                prev_line = line
                prev_left = 0

            added = 0  # num of spaces that should be added
            if can_indent and left / char_w > prev_left + 1:
                # Pad so the word starts at the column implied by its pixel
                # offset from the left edge.
                added = int(left / char_w) - prev_left
                pieces.append(' ' * added)
            pieces.append(word + ' ')
            prev_left += len(word) + added + 1
        pieces.append('\n')
    return ''.join(pieces)
# Driver: for each volume 1..98, download the PDF, OCR every page and write
# the combined text to text_files/<volume>.txt.
text_file_path = 'text_files/'
start_page = 0  # pages with an index below this are skipped in every volume

# Loop-invariant settings, built once instead of once per page.
custom_config = r'-c preserve_interword_spaces=1 --oem 1 --psm 1 -l eng+ita'
pdffile = "file.pdf"        # scratch file, overwritten for every volume
page_image = "outfile.png"  # scratch file, overwritten for every page

# The output directory is not guaranteed to exist; without it the write at
# the end of the first volume would fail after all the OCR work was done.
os.makedirs(text_file_path, exist_ok=True)

for h in range(1, 99):
    tfile = text_file_path + str(h) + '.txt'
    url = "http://www.gandhiashramsevagram.org/gandhi-literature/mahatma-gandhi-collected-works-volume-" + str(h) + ".pdf"

    # check=True stops on a failed download instead of going on to open
    # whatever wget left behind in the scratch file.
    subprocess.run(["wget", url, "-O", pdffile], check=True)

    doc = fitz.open(pdffile)
    try:
        # https://stackoverflow.com/questions/46184239/extract-a-page-from-a-pdf-as-a-jpeg
        mat = fitz.Matrix(5, 5)  # zoom factor
        page_texts = []  # one entry per OCR'd page, joined once below
        for i in tqdm(range(len(doc)), total=len(doc), desc=str(h)+'/98'):
            if i < start_page:
                continue
            page = doc.load_page(i)  # number of page
            pix = page.get_pixmap(matrix=mat)
            pix.save(page_image)
            # Close the image handle as soon as Tesseract is done with it.
            with Image.open(page_image) as img:
                d = pytesseract.image_to_data(img, config=custom_config, output_type=Output.DICT)
            df = pd.DataFrame(d)
            page_texts.append(do_indent(df))
    finally:
        # Release the PDF even if rendering or OCR fails part-way through.
        doc.close()

    file_text = "".join(page_texts)
    # Explicit UTF-8: the OCR languages include Italian, so the text can
    # contain characters outside the platform's default encoding.
    with open(tfile, 'w', encoding='utf-8') as f:
        f.write(file_text)
|