Robzy commited on
Commit
0ef6b8e
1 Parent(s): 938a35d
Files changed (3) hide show
  1. README.md +8 -2
  2. app_streamlit.py +2 -2
  3. merge_df.py +39 -15
README.md CHANGED
@@ -12,7 +12,7 @@ short_description: Air quality forecasting for Lahore, Pakistan!
12
 
13
  # Air Quality Monitoring for Lahore, Pakistan
14
 
15
- ### Dashboard link: https://huggingface.co/spaces/Robzy/hbg-weather
16
 
17
  # Architecture & pipeline
18
 
@@ -53,4 +53,10 @@ Weather data features:
53
 
54
  HuggingFace's Streamlit Spaces is used to display the hindcast, forecast, and real air quality using an interactive line graph. GitHub Actions is used to call the feature and inference pipeline daily by levraging schedulin.
55
 
56
- Note that backfilling, feature group creation and model training is only performed once
 
 
 
 
 
 
 
12
 
13
  # Air Quality Monitoring for Lahore, Pakistan
14
 
15
+ ### [Dashboard link](https://huggingface.co/spaces/Robzy/hbg-weather)
16
 
17
  # Architecture & pipeline
18
 
 
53
 
54
  HuggingFace's Streamlit Spaces is used to display the hindcast, forecast, and real air quality using an interactive line graph. GitHub Actions is used to call the feature and inference pipeline daily by levraging schedulin.
55
 
56
+ Note that backfilling, feature group creation and model training is only performed once
57
+
58
+
59
+
60
+ ### Aknowledments
61
+
62
+ * [HuggingFace restart scheduler](https://huggingface.co/spaces/davanstrien/restart/blob/main/app.py)
app_streamlit.py CHANGED
@@ -51,9 +51,9 @@ st.plotly_chart(fig)
51
 
52
  HF_TOKEN = os.getenv("HF_TOKEN")
53
  def restart():
54
- restart_space("Robzy/hgb-weather", token=HF_TOKEN)
55
 
56
  time_start = datetime.now()
57
  scheduler = BackgroundScheduler()
58
  job = scheduler.add_job(restart, "interval", minutes=2)
59
- scheduler.start()
 
51
 
52
  HF_TOKEN = os.getenv("HF_TOKEN")
53
  def restart():
54
+ restart_space("davanstrien/restart", token=HF_TOKEN)
55
 
56
  time_start = datetime.now()
57
  scheduler = BackgroundScheduler()
58
  job = scheduler.add_job(restart, "interval", minutes=2)
59
+ scheduler.start()
merge_df.py CHANGED
@@ -2,6 +2,11 @@ import datetime
2
  import pandas as pd
3
  import hopsworks
4
  import os
 
 
 
 
 
5
 
6
  os.environ['HOPSWORKS_PROJECT'] = os.getenv('HOPSWORKS_PROJECT')
7
  os.environ['HOPSWORKS_API_KEY'] = os.getenv('HOPSWORKS_API_KEY')
@@ -9,6 +14,16 @@ os.environ['HOPSWORKS_API_KEY'] = os.getenv('HOPSWORKS_API_KEY')
9
  project = hopsworks.login()
10
  fs = project.get_feature_store()
11
 
 
 
 
 
 
 
 
 
 
 
12
 
13
  def get_merged_dataframe():
14
 
@@ -26,28 +41,37 @@ def get_merged_dataframe():
26
  version=1,
27
  )
28
 
29
- selected_features = air_quality_fg.select_all(['pm25'])
30
- selected_features = selected_features.read()
31
- predicted_data = monitor_fg.read()
 
32
 
33
- #filter columns
34
- selected_features = selected_features[['date', 'pm25']]
35
- predicted_data = predicted_data[['date','predicted_pm25']]
36
- #predicted_data = predicted_data.rename(columns={"predicted_pm25" : "pm25"})
37
- predicted_data = predicted_data.sort_values(by=['date'], ascending=True)
38
 
39
- #merge the dataframes
40
- selected_features = selected_features.reset_index(drop=True)
41
- predicted_data = predicted_data.reset_index(drop=True)
42
 
43
- outcome_df = selected_features[['date', 'pm25']]
44
- preds_df = predicted_data[['date', 'predicted_pm25']]
 
 
 
 
 
 
45
 
46
 
 
 
47
 
48
- combined_df = pd.concat([selected_features, predicted_data], axis=0, ignore_index=True)
 
 
49
  combined_df['date'] = pd.to_datetime(combined_df['date'], utc=True).dt.tz_convert(None).astype('datetime64[ns]')
50
 
51
- return combined_df
 
 
 
 
52
 
53
  print(get_merged_dataframe())
 
2
  import pandas as pd
3
  import hopsworks
4
  import os
5
+ import datetime
6
+ from xgboost import XGBRegressor
7
+ import pandas as pd
8
+ import hopsworks
9
+ import os
10
 
11
  os.environ['HOPSWORKS_PROJECT'] = os.getenv('HOPSWORKS_PROJECT')
12
  os.environ['HOPSWORKS_API_KEY'] = os.getenv('HOPSWORKS_API_KEY')
 
14
  project = hopsworks.login()
15
  fs = project.get_feature_store()
16
 
17
+ project = hopsworks.login()
18
+ fs = project.get_feature_store()
19
+
20
+ mr = project.get_model_registry()
21
+ retrieved_model = mr.get_model(
22
+ name="air_quality_xgboost_model",
23
+ version=1,
24
+ )
25
+ saved_model_dir = retrieved_model.download()
26
+
27
 
28
  def get_merged_dataframe():
29
 
 
41
  version=1,
42
  )
43
 
44
+ weather_fg = fs.get_feature_group(
45
+ name='weather',
46
+ version=1,
47
+ )
48
 
49
+ retrieved_xgboost_model = XGBRegressor()
50
+ retrieved_xgboost_model.load_model(saved_model_dir + "/model.json")
 
 
 
51
 
 
 
 
52
 
53
+ selected_features = air_quality_fg.select_all(['pm25', 'past_air_quality']).join(weather_fg.select(['temperature_2m_mean', 'precipitation_sum', 'wind_speed_10m_max', 'wind_direction_10m_dominant']), on=['city'])
54
+ selected_features = selected_features.read()
55
+ selected_features['date'] = pd.to_datetime(selected_features['date'], utc=True).dt.tz_convert(None).astype('datetime64[ns]')
56
+
57
+ predicted_data = monitor_fg.read()
58
+ predicted_data = predicted_data[['date','predicted_pm25']]
59
+ predicted_data['date'] = predicted_data['date'].dt.tz_convert(None).astype('datetime64[ns]')
60
+ predicted_data = predicted_data.sort_values(by=['date'], ascending=True).reset_index(drop=True)
61
 
62
 
63
+ #get historical predicted pm25
64
+ selected_features['predicted_pm25'] = retrieved_xgboost_model.predict(selected_features[['past_air_quality','temperature_2m_mean', 'precipitation_sum', 'wind_speed_10m_max', 'wind_direction_10m_dominant']])
65
 
66
+ #merge data
67
+ selected_features = selected_features[['date', 'pm25', 'predicted_pm25']]
68
+ combined_df = pd.merge(selected_features, predicted_data,on='date', how='outer')
69
  combined_df['date'] = pd.to_datetime(combined_df['date'], utc=True).dt.tz_convert(None).astype('datetime64[ns]')
70
 
71
+ # Combine the predicted_pm25_x and predicted_pm25_y columns into one
72
+ combined_df['predicted_pm25'] = combined_df['predicted_pm25_x'].combine_first(combined_df['predicted_pm25_y'])
73
+
74
+ # Drop the individual columns after merging
75
+ combined_df = combined_df.drop(columns=['predicted_pm25_x', 'predicted_pm25_y'])
76
 
77
  print(get_merged_dataframe())