davidpengg committed on
Commit
43e0ac1
1 Parent(s): 6b79276
Files changed (3) hide show
  1. app.py +32 -0
  2. download_pdf.py +43 -0
  3. requirements.txt +3 -0
app.py ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Gradio App
3
+ David Peng
4
+ 20230621
5
+ """
6
+ import base64
7
+ import gradio as gr
8
+ from download_pdf import download
9
+
10
# Sample landing page offered as a one-click example in the UI.
examples = [
    "https://indianculture.gov.in/reports-proceedings/report-village-and-cottage-industries-national-committee-development-backward"
]

# Single-column layout: URL box, trigger button, then the resulting file.
with gr.Blocks() as app:
    gr.Markdown("# <p align='center'>Extract PDF from indianculture[dot]gov[dot]in</p>")
    url_box = gr.Textbox(label="Landing Page URL")
    extract_button = gr.Button(value="Extract PDF")
    pdf_output = gr.File(label="PDF")
    gr.Examples(examples=examples, inputs=url_box, outputs=pdf_output)

    # Clicking the button passes the textbox content to download() and
    # routes whatever it returns into the File component.
    extract_button.click(fn=download, inputs=url_box, outputs=pdf_output)

app.launch()
download_pdf.py ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Extract PDF from indianculture[dot]gov[dot]in
3
+ David Peng
4
+ 20230621
5
+ """
6
+ import requests
7
+ from bs4 import BeautifulSoup as bs
8
+ from urllib.parse import unquote
9
+ import time
10
+ import os
11
+
12
+ DEFAULT_TIMEOUT = 10
13
+ RETURN_CODE = 0
14
+
15
+ # script borrowed from https://github.com/lalitaalaalitah/Scrape_IndianCulture.Gov.In_Release
16
+ def download(book_page_url):
17
+ while RETURN_CODE == 0 :
18
+ try:
19
+ book_page_get = requests.get(book_page_url, timeout=DEFAULT_TIMEOUT)
20
+ except:
21
+ continue
22
+ if book_page_get.status_code == 200:
23
+ break
24
+ time.sleep(10)
25
+ book_page_get = requests.get(book_page_url)
26
+ parsed_book_page = bs(book_page_get.content, 'html.parser')
27
+ class_pdf_in_page = parsed_book_page.find_all('iframe', class_='pdf')
28
+
29
+ if len(class_pdf_in_page) >= 1:
30
+ # assume there is just 1 right now
31
+ pdf_item = class_pdf_in_page[0]
32
+ src_each_item = pdf_item['src']
33
+ pdf_address = src_each_item.split('file=')[-1]
34
+ cleaned_pdf_address = unquote(pdf_address)
35
+ pdf_name = cleaned_pdf_address.split('/')[-1]
36
+
37
+ cmd_for_curl = 'curl ' + cleaned_pdf_address + " -H 'User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:87.0) Gecko/20100101 Firefox/87.0' -H 'Accept: */*' -H 'Accept-Language: en-US,en;q=0.8,sa;q=0.5,hi;q=0.3' --compressed -H 'Referer: https://www.indianculture.gov.in/libraries/pdf.js/web/viewer.html?file=https%3A%2F%2Fwww.indianculture.gov.in%2Fsystem%2Ffiles%2FdigitalFilesICWeb%2Figncarepository%2F963%2Fignca-19280-rb.pdf' -H 'DNT: 1' -H 'Connection: keep-alive' -H 'TE: Trailers'" + " --output " + pdf_name
38
+
39
+ print(cmd_for_curl)
40
+ os.system(cmd_for_curl)
41
+ return pdf_name
42
+ else:
43
+ return None
requirements.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ gradio==3.35.2
2
+ bs4==0.0.1
3
+ requests==2.31.0