|
''' |
|
This script downloads the PDF volumes of the Collected Works of Mahatma Gandhi, OCRs every page with Tesseract, and writes one text file per volume.
|
''' |
|
|
|
import pytesseract |
|
from pytesseract import Output |
|
from PIL import Image |
|
import pandas as pd |
|
from tqdm import tqdm |
|
import os.path |
|
|
|
import fitz |
|
|
|
import subprocess |
|
|
|
def do_indent(df):
    """Rebuild page text from Tesseract word data, approximating the layout.

    Words are grouped by ``block_num`` and the blocks are emitted from top to
    bottom, ordered by the ``top`` value of each block's first word.  Inside a
    block a newline is started whenever ``par_num`` or ``line_num`` changes,
    and a word is preceded by extra spaces when its ``left`` pixel offset,
    divided by the block's estimated character width, lies more than one
    column beyond the text already written on that line.

    Args:
        df: pandas DataFrame with the columns ``conf``, ``text``,
            ``block_num``, ``par_num``, ``line_num``, ``left``, ``top`` and
            ``width``.

    Returns:
        str: The reconstructed text.  Each block is followed by a newline;
        the result is an empty string when no usable words remain.
    """
    # Compare confidences numerically: they may arrive as ints, floats or
    # strings, and comparing a numeric column with the string '-1' never
    # filters anything out.
    conf = pd.to_numeric(df['conf'], errors='coerce')
    raw_text = df['text']
    # Drop missing and whitespace-only tokens (not just '' and ' ').
    has_text = raw_text.notna() & (raw_text.astype(str).str.strip() != '')
    words = df[(conf != -1) & has_text].copy()
    words['text'] = words['text'].astype(str)

    block_order = (
        words.groupby('block_num').first().sort_values('top').index.tolist()
    )

    pieces = []
    for block in block_order:
        block_words = words[words['block_num'] == block]

        # Estimate the average character width (in pixels) from words longer
        # than three characters only.
        long_words = block_words[block_words['text'].str.len() > 3]
        char_w = (long_words['width'] / long_words['text'].str.len()).mean()
        # Without a positive, finite width the column of a word cannot be
        # computed (division by zero / NaN), so indentation is skipped.
        can_indent = bool(pd.notna(char_w)) and char_w > 0

        prev_par, prev_line, column = 0, 0, 0
        for _, word in block_words.iterrows():
            if prev_par != word['par_num']:
                pieces.append('\n')
                prev_par = word['par_num']
                prev_line = word['line_num']
                column = 0
            elif prev_line != word['line_num']:
                pieces.append('\n')
                prev_line = word['line_num']
                column = 0

            padding = 0
            if can_indent:
                target = word['left'] / char_w
                # Pad only when the word starts more than one column past the
                # text already emitted on this line.
                if target > column + 1:
                    padding = int(target) - column
                    pieces.append(' ' * padding)

            token = word['text']
            pieces.append(token + ' ')
            column += len(token) + padding + 1

        # NOTE(review): the source's indentation was lost; this newline is
        # assumed to terminate every block rather than the whole page.
        pieces.append('\n')

    return ''.join(pieces)
|
|
|
# Directory (with trailing slash) that receives one "<volume>.txt" per volume.
text_file_path = 'text_files/'
# Index of the first page of every PDF that is OCRed; earlier pages are skipped.
start_page = 0

# Tesseract command-line options, passed through pytesseract unchanged.
custom_config = r'-c preserve_interword_spaces=1 --oem 1 --psm 1 -l eng+ita'

# Make sure the output directory exists before the first volume is written.
os.makedirs(text_file_path, exist_ok=True)

# Volumes are numbered 1..98.
for h in range(1, 99):
    tfile = text_file_path + str(h) + '.txt'
    url = "http://www.gandhiashramsevagram.org/gandhi-literature/mahatma-gandhi-collected-works-volume-" + str(h) + ".pdf"
    pdffile = "file.pdf"

    # Download with an explicit argument list and fail loudly when wget
    # reports an error, instead of OCRing a missing or truncated file.
    subprocess.run(["wget", url, "-O", pdffile], check=True)

    page_texts = []
    doc = fitz.open(pdffile)
    try:
        # Render every page at 5x scale on both axes before OCR.
        mat = fitz.Matrix(5, 5)
        for i in tqdm(range(start_page, len(doc)), desc=str(h) + '/98'):
            page = doc.load_page(i)
            pix = page.get_pixmap(matrix=mat)
            # The rendered page goes through a scratch PNG that is
            # overwritten on every iteration.
            png_path = "outfile.png"
            pix.save(png_path)
            with Image.open(png_path) as img:
                d = pytesseract.image_to_data(img, config=custom_config, output_type=Output.DICT)
            df = pd.DataFrame(d)
            page_texts.append(do_indent(df))
    finally:
        # Release the PDF handle even when rendering or OCR fails.
        doc.close()

    file_text = "".join(page_texts)
    # Write UTF-8 explicitly so non-ASCII OCR output does not depend on the
    # platform's default encoding.
    with open(tfile, 'w', encoding='utf-8') as f:
        f.write(file_text)
|
|
|
|
|
|
|
|