Spaces:
Runtime error
Runtime error
davidpengg
committed on
Commit
•
43e0ac1
1
Parent(s):
6b79276
init
Browse files- app.py +32 -0
- download_pdf.py +43 -0
- requirements.txt +3 -0
app.py
ADDED
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
Gradio App
|
3 |
+
David Peng
|
4 |
+
20230621
|
5 |
+
"""
|
6 |
+
import base64
|
7 |
+
import gradio as gr
|
8 |
+
from download_pdf import download
|
9 |
+
|
10 |
+
# Sample landing page offered as a one-click example under the URL box.
examples = [
    "https://indianculture.gov.in/reports-proceedings/report-village-and-cottage-industries-national-committee-development-backward",
]

# Single-column UI: heading, URL textbox, trigger button, resulting file,
# and the example list that pre-fills the textbox.
with gr.Blocks() as app:
    gr.Markdown("# <p align='center'>Extract PDF from indianculture[dot]gov[dot]in</p>")

    landing_page_url = gr.Textbox(label="Landing Page URL")
    landing_page_url_btrn = gr.Button(value="Extract PDF")
    pdf_file = gr.File(label="PDF")

    gr.Examples(examples=examples, inputs=landing_page_url, outputs=pdf_file)

    # Clicking the button passes the textbox value to download() and shows
    # whatever it returns in the file component.
    landing_page_url_btrn.click(
        fn=download,
        inputs=landing_page_url,
        outputs=pdf_file,
    )

app.launch()
|
download_pdf.py
ADDED
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
Extract PDF from indianculture[dot]gov[dot]in
|
3 |
+
David Peng
|
4 |
+
20230621
|
5 |
+
"""
|
6 |
+
import requests
|
7 |
+
from bs4 import BeautifulSoup as bs
|
8 |
+
from urllib.parse import unquote
|
9 |
+
import time
|
10 |
+
import os
|
11 |
+
|
12 |
+
DEFAULT_TIMEOUT = 10
|
13 |
+
RETURN_CODE = 0
|
14 |
+
|
15 |
+
# script borrowed from https://github.com/lalitaalaalitah/Scrape_IndianCulture.Gov.In_Release
|
16 |
+
def download(book_page_url, max_attempts=5, retry_delay=10):
    """Download the PDF embedded in a landing page and return its file name.

    The landing page is fetched, the first ``<iframe class="pdf">`` is
    located, and the PDF address is taken from the ``file=`` part of the
    iframe's ``src``. The PDF is then fetched with ``curl`` into the current
    working directory.

    Args:
        book_page_url: URL of the landing page that embeds the PDF viewer.
        max_attempts: Maximum number of tries for fetching the landing page.
        retry_delay: Seconds to wait between two landing-page attempts.

    Returns:
        The name of the downloaded file (the last path segment of the PDF
        address), or ``None`` when the page could not be fetched, no usable
        PDF iframe was found, or ``curl`` could not be run successfully.
    """
    # Imported locally so this function does not depend on edits to the
    # module-level import list.
    import shlex
    import subprocess

    # Fetch the landing page with a bounded number of retries. An unbounded
    # loop would block the caller forever on a permanently failing URL.
    book_page_get = None
    for attempt in range(max_attempts):
        if attempt:
            time.sleep(retry_delay)
        try:
            response = requests.get(book_page_url, timeout=DEFAULT_TIMEOUT)
        except (
            requests.exceptions.MissingSchema,
            requests.exceptions.InvalidSchema,
            requests.exceptions.InvalidURL,
        ):
            # A malformed URL will never succeed; retrying is pointless.
            return None
        except requests.exceptions.RequestException:
            continue
        if response.status_code == 200:
            book_page_get = response
            break
    if book_page_get is None:
        return None

    parsed_book_page = bs(book_page_get.content, 'html.parser')
    class_pdf_in_page = parsed_book_page.find_all('iframe', class_='pdf')
    if not class_pdf_in_page:
        return None

    # Only the first matching iframe is used.
    src_each_item = class_pdf_in_page[0].get('src')
    if not src_each_item:
        return None

    pdf_address = src_each_item.split('file=')[-1]
    cleaned_pdf_address = unquote(pdf_address)
    pdf_name = cleaned_pdf_address.split('/')[-1]

    # The address comes from scraped HTML: require an absolute http(s) URL so
    # it can never be interpreted by curl as a command-line option, and a
    # non-empty file name to write to.
    if not cleaned_pdf_address.startswith(('http://', 'https://')) or not pdf_name:
        return None

    # Argument list instead of a shell string: nothing from the page is ever
    # parsed by a shell. Header values are the same as before.
    cmd_for_curl = [
        'curl', cleaned_pdf_address,
        '-H', 'User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:87.0) Gecko/20100101 Firefox/87.0',
        '-H', 'Accept: */*',
        '-H', 'Accept-Language: en-US,en;q=0.8,sa;q=0.5,hi;q=0.3',
        '--compressed',
        '-H', 'Referer: https://www.indianculture.gov.in/libraries/pdf.js/web/viewer.html?file=https%3A%2F%2Fwww.indianculture.gov.in%2Fsystem%2Ffiles%2FdigitalFilesICWeb%2Figncarepository%2F963%2Fignca-19280-rb.pdf',
        '-H', 'DNT: 1',
        '-H', 'Connection: keep-alive',
        '-H', 'TE: Trailers',
        '--output', pdf_name,
    ]

    print(shlex.join(cmd_for_curl))
    try:
        completed = subprocess.run(cmd_for_curl, check=False)
    except OSError:
        # curl is not installed or could not be started.
        return None
    if completed.returncode != 0:
        return None
    return pdf_name
|
requirements.txt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
gradio==3.35.2
|
2 |
+
bs4==0.0.1
|
3 |
+
requests==2.31.0
|