SR05 committed on
Commit 2f0a65b
1 Parent(s): 065e3e9

Update loading_file.py

Files changed (1)
  1. loading_file.py +55 -40
loading_file.py CHANGED
@@ -1,45 +1,60 @@
  import requests
  from bs4 import BeautifulSoup
- from io import BytesIO
  import streamlit as st

  # URL of the website to scrape
- url = "https://www.ireland.ie/en/india/newdelhi/services/visas/processing-times-and-decisions/"
-
- # Headers for the HTTP request
- headers = {
-     "User-Agent": (
-         "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
-         "(KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36"
-     )
- }
-
- @st.cache_data(ttl=3600, max_entries=1)
- def load_data_file():
-     response = requests.get(url, headers=headers)
-     if response.status_code == 200:
-         soup = BeautifulSoup(response.content, 'html.parser')
-         links = soup.find_all('a')
-
-         # Look for the link to the .ods file
-         file_url = None
-         file_name = None
-         for link in links:
-             link_text = link.get_text(strip=True)
-             if "Visa decisions made from 1 January 2024 to" in link_text:
-                 file_url = link.get('href')
-                 file_name = link_text
-                 break
-
-         if file_url:
-             if not file_url.startswith('http'):
-                 file_url = requests.compat.urljoin(url, file_url)
-
-             file_response = requests.get(file_url, headers=headers)
-             if file_response.status_code == 200:
-                 return BytesIO(file_response.content), file_name
-             else:
-                 st.error(f"Failed to download the file. Status code: {file_response.status_code}")
-     else:
-         st.error(f"Failed to retrieve the webpage. Status code: {response.status_code}")
-     return None, None
+ import pandas as pd
  import requests
  from bs4 import BeautifulSoup
  import streamlit as st
+ from io import BytesIO

  # URL of the website to scrape
+ URL = "https://www.ireland.ie/en/india/newdelhi/services/visas/processing-times-and-decisions/"
+
+ # Cache to improve performance
+ @st.cache_data(ttl=3600)
+ def fetch_and_process_file():
+     """Fetches the .ods file from the web and processes it into a DataFrame."""
+     headers = {
+         "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36"
+     }
+     response = requests.get(URL, headers=headers)
+     if response.status_code != 200:
+         st.error(f"Failed to fetch webpage. Status code: {response.status_code}")
+         return None
+
+     # Extract the file link
+     soup = BeautifulSoup(response.content, "html.parser")
+     file_link = None
+     for link in soup.find_all("a"):
+         if "Visa decisions made from 1 January 2024 to" in link.text:
+             file_link = link["href"]
+             if not file_link.startswith("http"):
+                 file_link = requests.compat.urljoin(URL, file_link)
+             break
+
+     if not file_link:
+         st.error("Could not find the .ods file link on the webpage.")
+         return None
+
+     # Fetch the .ods file
+     file_response = requests.get(file_link, headers=headers)
+     if file_response.status_code != 200:
+         st.error(f"Failed to fetch the .ods file. Status code: {file_response.status_code}")
+         return None
+
+     # Process the .ods file
+     df = pd.read_excel(BytesIO(file_response.content), engine="odf")
+     df.drop(columns=["Unnamed: 0", "Unnamed: 1"], inplace=True, errors="ignore")
+     df.dropna(how="all", inplace=True)
+     df.reset_index(drop=True, inplace=True)
+
+     # Rename and restructure columns
+     for idx, row in df.iterrows():
+         if row["Unnamed: 2"] == "Application Number" and row["Unnamed: 3"] == "Decision":
+             df.columns = ["Application Number", "Decision"]
+             df = df.iloc[idx + 1:]
+             break
+
+     # Preprocess the DataFrame
+     df["Application Number"] = df["Application Number"].astype(str)
+     return df
+
+ # Load the DataFrame
+ df = fetch_and_process_file()
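
Note: pd.read_excel(..., engine="odf") requires the odfpy package to be installed alongside pandas. Below is a minimal sketch of how the cached loader might be consumed from the app's main script; the lookup widget and labels are assumptions for illustration, not part of this commit:

import streamlit as st

from loading_file import fetch_and_process_file

# Hypothetical consumer: look up a single application's decision
df = fetch_and_process_file()  # served from st.cache_data for up to an hour
if df is not None:
    app_number = st.text_input("Application number").strip()
    if app_number:
        match = df[df["Application Number"] == app_number]
        if not match.empty:
            st.success(f"Decision: {match.iloc[0]['Decision']}")
        else:
            st.warning("No decision found for that application number.")

Because the loader is wrapped in @st.cache_data(ttl=3600), repeated lookups within the hour reuse the parsed DataFrame instead of re-downloading and re-parsing the .ods file.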