awacke1 committed on
Commit
524da2b
·
1 Parent(s): 972fa90

Update backup.py

Browse files
Files changed (1) hide show
  1. backup.py +22 -19
backup.py CHANGED
@@ -1,12 +1,14 @@
1
  import streamlit as st
2
  import requests
3
- from bs4 import BeautifulSoup
4
  import os
5
  import urllib
6
  import base64
 
 
 
7
 
8
  def download_file(url, local_filename):
9
- if url.startswith('http://') or url.startswith('https://'): # add this line
10
  try:
11
  with requests.get(url, stream=True) as r:
12
  r.raise_for_status()
@@ -15,37 +17,24 @@ def download_file(url, local_filename):
15
  f.write(chunk)
16
  return local_filename
17
  except requests.exceptions.HTTPError as err:
18
- print(f"HTTP error occurred: {err}") # or use logging
19
-
20
 
21
def download_html_and_files(url):
    """Fetch *url*, mirror its linked files locally, and save the page as index.html."""
    page_markup = requests.get(url).text
    document = BeautifulSoup(page_markup, 'html.parser')

    # Reduce the URL to scheme + netloc so relative hrefs resolve from the site root.
    parsed = urllib.parse.urlparse(url)
    base_url = urllib.parse.urlunparse(
        parsed._replace(path='', params='', query='', fragment=''))

    for anchor in document.find_all('a'):
        absolute = urllib.parse.urljoin(base_url, anchor.get('href'))
        filename = urllib.parse.urlparse(absolute).path.split('/')[-1]
        if not filename:
            continue  # URL ends in '/', so there is no file component to save
        anchor['href'] = filename
        download_file(absolute, filename)

    # Write the link-rewritten page next to the downloaded files.
    with open("index.html", "w") as file:
        file.write(str(document))
36
 
37
-
38
def list_files(directory_path='.'):
    """Return the names of regular files directly inside *directory_path*."""
    entries = os.listdir(directory_path)
    return [name for name in entries
            if os.path.isfile(os.path.join(directory_path, name))]
40
-
41
def main():
    """Streamlit entry point: prompt for a URL and bulk-download its content."""
    sidebar = st.sidebar
    sidebar.title('Bulk Download Tool')
    target_url = sidebar.text_input('Please enter a URL to bulk download text and files')
    if not sidebar.button('📥 Get All the Content'):
        return
    download_html_and_files(target_url)
    sidebar.write('Download complete. Here are the files you can download:')
    for downloaded in list_files():
        sidebar.markdown(get_download_link(downloaded), unsafe_allow_html=True)
49
 
50
  def get_download_link(file):
51
  with open(file, "rb") as f:
@@ -54,5 +43,19 @@ def get_download_link(file):
54
  href = f'<a href="data:file/octet-stream;base64,{b64}" download=\'{file}\'>Click to download {file}</a>'
55
  return href
56
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
57
  if __name__ == "__main__":
58
  main()
 
1
  import streamlit as st
2
  import requests
 
3
  import os
4
  import urllib
5
  import base64
6
+ from bs4 import BeautifulSoup
7
+
8
+ EXCLUDED_FILES = ['app.py', 'requirements.txt', 'pre-requirements.txt', 'packages.txt', 'README.md','.gitattributes', "backup.py","Dockerfile"]
9
 
10
  def download_file(url, local_filename):
11
+ if url.startswith('http://') or url.startswith('https://'):
12
  try:
13
  with requests.get(url, stream=True) as r:
14
  r.raise_for_status()
 
17
  f.write(chunk)
18
  return local_filename
19
  except requests.exceptions.HTTPError as err:
20
+ print(f"HTTP error occurred: {err}")
 
21
 
22
def download_html_and_files(url):
    """Download a page's HTML, save its linked files locally, and rewrite links.

    Fetches *url*, walks every ``<a>`` tag, downloads each linked file into
    the current directory, rewrites the anchor's href to the local filename,
    and finally writes the rewritten page to ``index.html``.
    """
    # `import urllib` alone does not bind the `parse` submodule; it only
    # happens to work because requests imports urllib.parse. Import it
    # explicitly so this function does not depend on that side effect.
    import urllib.parse

    html_content = requests.get(url).text
    soup = BeautifulSoup(html_content, 'html.parser')

    # Strip path/params/query/fragment so relative links resolve from the site root.
    base_url = urllib.parse.urlunparse(
        urllib.parse.urlparse(url)._replace(path='', params='', query='', fragment=''))

    for link in soup.find_all('a'):
        href = link.get('href')
        if href is None:
            # Anchors without an href (e.g. named anchors) would make
            # urljoin raise — skip them instead of crashing the whole run.
            continue
        file_url = urllib.parse.urljoin(base_url, href)
        local_filename = urllib.parse.urlparse(file_url).path.split('/')[-1]
        if local_filename:  # skip directory-style URLs that end in '/'
            link['href'] = local_filename
            download_file(file_url, local_filename)

    # Persist the rewritten page; explicit UTF-8 keeps non-ASCII pages intact
    # regardless of the platform's locale encoding.
    with open("index.html", "w", encoding="utf-8") as file:
        file.write(str(soup))
34
 
 
35
def list_files(directory_path='.'):
    """Return regular files in *directory_path*, minus the app's own files.

    Filters out everything named in the module-level EXCLUDED_FILES list so
    the tool never offers its own source/config files for download.
    """
    candidates = os.listdir(directory_path)
    return [name for name in candidates
            if os.path.isfile(os.path.join(directory_path, name))
            and name not in EXCLUDED_FILES]
 
 
 
 
 
 
 
 
38
 
39
  def get_download_link(file):
40
  with open(file, "rb") as f:
 
43
  href = f'<a href="data:file/octet-stream;base64,{b64}" download=\'{file}\'>Click to download {file}</a>'
44
  return href
45
 
46
def show_download_links():
    """Render a sidebar download link for every downloadable file."""
    sidebar = st.sidebar
    sidebar.write('Here are the files you can download:')
    for name in list_files():
        link_html = get_download_link(name)
        sidebar.markdown(link_html, unsafe_allow_html=True)
50
+
51
def main():
    """Streamlit entry point: collect a URL and offer bulk-download controls."""
    panel = st.sidebar
    panel.title('Web Datasets Bulk Downloader')
    target = panel.text_input('Please enter a Web URL to bulk download text and files')
    if panel.button('📥 Get All the Content'):
        download_html_and_files(target)
        show_download_links()
    if panel.button('📂 Show Download Links'):
        show_download_links()
59
+
60
  if __name__ == "__main__":
61
  main()