Spaces:

SR05
/

Delhi_Irish_visa_decisions

Running

App Files Files Community

SR05 committed 4 days ago

Commit

2f0a65b

•

1 Parent(s): 065e3e9

Update loading_file.py

Browse files

Files changed (1) hide show

loading_file.py +55 -40

loading_file.py CHANGED Viewed

@@ -1,45 +1,60 @@
 import requests
 from bs4 import BeautifulSoup
-from io import BytesIO
 import streamlit as st
 # URL of the website to scrape
-url = "https://www.ireland.ie/en/india/newdelhi/services/visas/processing-times-and-decisions/"
-# Browser-like User-Agent header passed to the requests.get calls below.
-# NOTE(review): presumably needed because the site rejects the default
-# python-requests User-Agent -- confirm.
-headers = {
-    "User-Agent": (
-        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
-        "(KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36"
-    )
-}
-# Cache the result for an hour (ttl=3600), keeping a single cached entry.
-@st.cache_data(ttl=3600, max_entries=1)
-def load_data_file():
-    """Download the visa-decisions file linked from the page at ``url``.
-
-    Returns:
-        A ``(BytesIO, link_text)`` tuple with the downloaded file content and
-        the text of the matching link, or ``(None, None)`` when the page
-        request fails, no matching link is found, or the download fails.
-        A failed request is reported with ``st.error``; a missing link is
-        not reported.
-    """
-    response = requests.get(url, headers=headers)
-    if response.status_code == 200:
-        soup = BeautifulSoup(response.content, 'html.parser')
-        links = soup.find_all('a')
-        # Look for the link to the .ods file
-        file_url = None
-        file_name = None
-        for link in links:
-            link_text = link.get_text(strip=True)
-            # Only the first link whose text contains this phrase is used.
-            if "Visa decisions made from 1 January 2024 to" in link_text:
-                file_url = link.get('href')
-                file_name = link_text
-                break
-        if file_url:
-            # Resolve a relative href against the page URL.
-            if not file_url.startswith('http'):
-                file_url = requests.compat.urljoin(url, file_url)
-            file_response = requests.get(file_url, headers=headers)
-            if file_response.status_code == 200:
-                return BytesIO(file_response.content), file_name
-            else:
-                st.error(f"Failed to download the file. Status code: {file_response.status_code}")
-    else:
-        st.error(f"Failed to retrieve the webpage. Status code: {response.status_code}")
-    # Reached on every failure path, including "no matching link found".
-    return None, None

+import pandas as pd
 import requests
 from bs4 import BeautifulSoup
 import streamlit as st
+from io import BytesIO
 # Page that is scraped for the link to the visa-decisions .ods file
+URL = "https://www.ireland.ie/en/india/newdelhi/services/visas/processing-times-and-decisions/"
+# Cache to improve performance
+@st.cache_data(ttl=3600)
+def fetch_and_process_file():
+    """Fetches the .ods file from the web and processes it into a DataFrame."""
+    headers = {
+        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36"
+    }
+    response = requests.get(URL, headers=headers)
+    if response.status_code != 200:
+        st.error(f"Failed to fetch webpage. Status code: {response.status_code}")
+        return None
+    # Extract the file link
+    soup = BeautifulSoup(response.content, "html.parser")
+    file_link = None
+    for link in soup.find_all("a"):
+        if "Visa decisions made from 1 January 2024 to" in link.text:
+            file_link = link["href"]
+            if not file_link.startswith("http"):
+                file_link = requests.compat.urljoin(URL, file_link)
+            break
+    if not file_link:
+        st.error("Could not find the .ods file link on the webpage.")
+        return None
+    # Fetch the .ods file
+    file_response = requests.get(file_link, headers=headers)
+    if file_response.status_code != 200:
+        st.error(f"Failed to fetch the .ods file. Status code: {file_response.status_code}")
+        return None
+    # Process the .ods file
+    df = pd.read_excel(BytesIO(file_response.content), engine="odf")
+    df.drop(columns=["Unnamed: 0", "Unnamed: 1"], inplace=True, errors="ignore")
+    df.dropna(how="all", inplace=True)
+    df.reset_index(drop=True, inplace=True)
+    # Rename and restructure columns
+    for idx, row in df.iterrows():
+        if row["Unnamed: 2"] == "Application Number" and row["Unnamed: 3"] == "Decision":
+            df.columns = ["Application Number", "Decision"]
+            df = df.iloc[idx + 1 :]
+            break
+    # Preprocess the DataFrame
+    df["Application Number"] = df["Application Number"].astype(str)
+    return df
+# Load the DataFrame when this module is imported; fetch_and_process_file()
+# returns None on failure, so `df` may be None.
+df = fetch_and_process_file()