awacke1 committed
Commit ba33805 · Parent: 4cb0725

Update backup.py

Files changed (1): backup.py (+68 -22)
backup.py CHANGED
@@ -4,12 +4,15 @@ import os
 import urllib
 import base64
 from bs4 import BeautifulSoup
+import hashlib
+import json
 
 EXCLUDED_FILES = ['app.py', 'requirements.txt', 'pre-requirements.txt', 'packages.txt', 'README.md','.gitattributes', "backup.py","Dockerfile"]
 
-# Create a history.txt file if it doesn't exist yet
-with open("history.txt", "a+") as f:
-    f.close()
+# Create a history.json file if it doesn't exist yet
+if not os.path.exists("history.json"):
+    with open("history.json", "w") as f:
+        json.dump({}, f)
 
 def download_file(url, local_filename):
     if url.startswith('http://') or url.startswith('https://'):
@@ -23,54 +26,97 @@ def download_file(url, local_filename):
     except requests.exceptions.HTTPError as err:
         print(f"HTTP error occurred: {err}")
 
-def download_html_and_files(url):
+def download_html_and_files(url, subdir):
     html_content = requests.get(url).text
     soup = BeautifulSoup(html_content, 'html.parser')
     base_url = urllib.parse.urlunparse(urllib.parse.urlparse(url)._replace(path='', params='', query='', fragment=''))
+
     for link in soup.find_all('a'):
         file_url = urllib.parse.urljoin(base_url, link.get('href'))
-        local_filename = urllib.parse.urlparse(file_url).path.split('/')[-1]
-        if local_filename:
+        local_filename = os.path.join(subdir, urllib.parse.urlparse(file_url).path.split('/')[-1])
+
+        # Skip if the local filename is a directory
+        if not local_filename.endswith('/') and local_filename != subdir:
             link['href'] = local_filename
             download_file(file_url, local_filename)
-    with open("index.html", "w") as file:
+
+    # Save the modified HTML content
+    with open(os.path.join(subdir, "index.html"), "w") as file:
        file.write(str(soup))
 
 def list_files(directory_path='.'):
     files = [f for f in os.listdir(directory_path) if os.path.isfile(os.path.join(directory_path, f))]
     return [f for f in files if f not in EXCLUDED_FILES]
 
+
+def show_file_operations(file_path):
+    st.write(f"File: {os.path.basename(file_path)}")
+
+    # Edit button
+    if st.button(f"✏️ Edit {os.path.basename(file_path)}"):
+        with open(file_path, "r") as f:
+            file_content = f.read()
+        file_content = st.text_area("Edit the file content:", value=file_content, height=250)
+        if st.button(f"💾 Save {os.path.basename(file_path)}"):
+            with open(file_path, "w") as f:
+                f.write(file_content)
+            st.success(f"File {os.path.basename(file_path)} saved!")
+
+    # Delete button
+    if st.button(f"🗑️ Delete {os.path.basename(file_path)}"):
+        os.remove(file_path)
+        st.markdown(f"🎉 File {os.path.basename(file_path)} deleted!")
+
+def show_download_links(subdir):
+    st.write(f'Files for {subdir}:')
+    for file in list_files(subdir):
+        file_path = os.path.join(subdir, file)
+        if os.path.isfile(file_path):
+            st.markdown(get_download_link(file_path), unsafe_allow_html=True)
+            show_file_operations(file_path)
+        else:
+            st.write(f"File not found: {file}")
+
+
 def get_download_link(file):
     with open(file, "rb") as f:
         bytes = f.read()
     b64 = base64.b64encode(bytes).decode()
-    href = f'<a href="data:file/octet-stream;base64,{b64}" download=\'{file}\'>Click to download {file}</a>'
+    href = f'<a href="data:file/octet-stream;base64,{b64}" download=\'{os.path.basename(file)}\'>Click to download {os.path.basename(file)}</a>'
     return href
 
-def show_download_links():
-    st.sidebar.write('Here are the files you can download:')
-    for file in list_files():
-        st.sidebar.markdown(get_download_link(file), unsafe_allow_html=True)
 
 def main():
     st.sidebar.title('Web Datasets Bulk Downloader')
     url = st.sidebar.text_input('Please enter a Web URL to bulk download text and files')
 
-    # Save the history of URL entered as a text file
+    # Load history
+    with open("history.json", "r") as f:
+        history = json.load(f)
+
+    # Save the history of URL entered as a json file
     if url:
-        with open("history.txt", "a") as f:
-            f.write(url + "\n")
+        subdir = hashlib.md5(url.encode()).hexdigest()
+        if not os.path.exists(subdir):
+            os.makedirs(subdir)
+        if url not in history:
+            history[url] = subdir
+            with open("history.json", "w") as f:
+                json.dump(history, f)
 
     if st.sidebar.button('📥 Get All the Content'):
-        download_html_and_files(url)
-        show_download_links()
+        download_html_and_files(url, history[url])
+        show_download_links(history[url])
+
     if st.sidebar.button('📂 Show Download Links'):
-        show_download_links()
+        for subdir in history.values():
+            show_download_links(subdir)
 
     # Display history as markdown
-    with open("history.txt", "r") as f:
-        history = f.read()
-        st.markdown(f"### History\n\n{history}")
+    with st.expander("URL History and Downloaded Files"):
+        for url, subdir in history.items():
+            st.markdown(f"#### {url}")
+            show_download_links(subdir)
 
 if __name__ == "__main__":
-    main()
+    main()
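
For reference, the crawl in download_html_and_files is a one-level scrape: fetch the page, resolve each anchor's href against the page's scheme://host root, download the target into the per-URL subdirectory, and rewrite the anchor to point at the local copy. Below is a minimal standalone sketch of just the URL-resolution step, using the same stdlib calls as the diff; the page URL and href are hypothetical.

import urllib.parse

page_url = "https://example.com/datasets/index.html"  # hypothetical example

# Strip path/params/query/fragment to get scheme://host, exactly as the
# diff's base_url line does.
base_url = urllib.parse.urlunparse(
    urllib.parse.urlparse(page_url)._replace(path='', params='', query='', fragment=''))
print(base_url)  # https://example.com

# Resolve a relative href against that root and keep the last path segment
# as the local filename, mirroring the loop body in download_html_and_files.
href = "files/train.csv"  # hypothetical anchor target
file_url = urllib.parse.urljoin(base_url, href)
local_name = urllib.parse.urlparse(file_url).path.split('/')[-1]
print(file_url)    # https://example.com/files/train.csv
print(local_name)  # train.csv

Note that joining against the stripped base means relative links resolve from the site root rather than the page's own directory, which is the behavior the diff keeps.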
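
The commit's central change is the cache layout: each URL gets its own subdirectory named by the MD5 hex digest of the URL, and history.json persists the URL-to-subdirectory map across runs. A self-contained sketch of that round trip, reusing the diff's file name and hashing scheme (the demo URL and record_url helper are hypothetical):

import hashlib
import json
import os

HISTORY_FILE = "history.json"  # same filename the commit introduces

def record_url(url: str) -> str:
    # Load the existing URL -> subdir map; start empty on first run.
    history = {}
    if os.path.exists(HISTORY_FILE):
        with open(HISTORY_FILE) as f:
            history = json.load(f)

    # Stable, filesystem-safe folder name derived from the URL,
    # exactly as in the diff: hashlib.md5(url.encode()).hexdigest()
    subdir = hashlib.md5(url.encode()).hexdigest()
    os.makedirs(subdir, exist_ok=True)

    if url not in history:
        history[url] = subdir
        with open(HISTORY_FILE, "w") as f:
            json.dump(history, f)
    return subdir

# Hypothetical example; repeated calls reuse the same folder.
print(record_url("https://example.com/data/"))

Because the same URL always hashes to the same folder, re-entering a URL reuses its cache, and the '📂 Show Download Links' button can re-list every earlier download by walking history.values().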
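
The download links themselves are data URIs rather than served files: get_download_link base64-encodes the file bytes into an <a> tag that Streamlit renders through st.markdown(..., unsafe_allow_html=True). A small sketch of the same encoding outside Streamlit, using a hypothetical demo file:

import base64
import os

def get_download_link(path):
    # Embed the file's bytes as a base64 data URI inside an <a> tag,
    # mirroring get_download_link in the diff.
    with open(path, "rb") as f:
        b64 = base64.b64encode(f.read()).decode()
    name = os.path.basename(path)
    return (f'<a href="data:file/octet-stream;base64,{b64}" '
            f"download='{name}'>Click to download {name}</a>")

# Hypothetical demo file, just to show the shape of the output.
with open("demo.txt", "w") as f:
    f.write("hello")
print(get_download_link("demo.txt"))

This keeps downloads dependency-free, at the cost of embedding the whole file in the page, so it suits small files best.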