ask_my_thesis / OCR_directory.sh
jordyvl's picture
First commit
e0a78f5
raw
history blame contribute delete
463 Bytes
# pdftk thesis.pdf burst
# Using pdftotext (or pdf2txt.py), extract the text of each page PDF in assets/pdfs and store it in assets/txts under the same basename.
# Text-extraction pass, currently disabled: for every word produced by the
# assets/pdfs/*.pdf glob the loop only prints an empty line. The two extractor
# invocations that were tried are kept below, commented out, for reference.
for pdf in assets/pdfs/*.pdf; do
  printf '\n'
  # pdftotext $pdf assets/txts/$(basename $pdf .pdf).txt
  # pdf2txt.py -o assets/txts/$(basename $pdf .pdf).txt $pdf
done
# Rasterize every *.pdf in a directory to a same-named .png with ImageMagick's
# `convert`.
#
# Arguments:
#   $1 - directory holding the input PDFs  (default: assets/pdfs)
#   $2 - directory receiving the PNGs      (default: assets/pngs)
# Returns:
#   0 when there was nothing to convert or every conversion succeeded,
#   1 when the output directory could not be created or any `convert` failed.
#
# NOTE(review): the inputs look like single-page PDFs (see the `pdftk ... burst`
# remark at the top of the file); for a multi-page PDF `convert` would
# presumably write several numbered PNGs instead of one - confirm if relevant.
pdfs_to_pngs() {
  local src_dir="${1:-assets/pdfs}"
  local dst_dir="${2:-assets/pngs}"
  local pdf name status=0

  # Expand the glob into the positional parameters so the "no match" case can
  # be detected once, before anything is created or converted. Without this
  # guard the literal pattern "…/*.pdf" would be handed to `convert`.
  set -- "$src_dir"/*.pdf
  [ -e "$1" ] || return 0

  # The original assumed the output directory already existed.
  mkdir -p -- "$dst_dir" || return 1

  for pdf in "$@"; do
    # Strip the directory part and the .pdf suffix without forking `basename`.
    name=${pdf##*/}
    name=${name%.pdf}

    # Option list and order are unchanged from the original command; only the
    # file operands are quoted so names with spaces or glob characters survive.
    # A failed conversion is remembered but does not stop the remaining files.
    convert -density 100 -quality 100 -colorspace RGB -alpha remove -alpha off \
      "$pdf" "$dst_dir/$name.png" || status=1
  done

  return "$status"
}

pdfs_to_pngs