Spaces:
Running
Running
import pandas as pd | |
from typing import Any, Dict, List | |
import datetime | |
import pandas as pd | |
import hopsworks | |
from hsfs.feature import Feature | |
def get_historical_data_for_date(date: str, feature_view, weather_fg, model) -> pd.DataFrame:
    """
    Fetch the recorded PM2.5 values for a single day.

    The day is requested from ``feature_view.training_data`` as the window
    starting at ``date`` and ending one day later; the labels frame that call
    returns supplies the ``pm25`` column.

    Args:
        date (str): The day to fetch, in the format "%Y-%m-%d".
        feature_view: Object whose ``training_data(start_time=..., end_time=...,
            statistics_config=...)`` returns a ``(features, labels)`` pair of
            DataFrames. The features frame is modified in place.
        weather_fg: Not used by this function.
        model: Not used by this function.

    Returns:
        pd.DataFrame: Columns ``date`` ("%Y-%m-%d" strings) and ``pm25``,
        sorted by date with a fresh 0..n-1 index.
    """
    day = datetime.datetime.strptime(date, "%Y-%m-%d").date()
    following_day = day + datetime.timedelta(days=1)

    frame, targets = feature_view.training_data(
        start_time=day,
        end_time=following_day,
        statistics_config=False,
    )

    def _as_day_string(stamp):
        return stamp.strftime('%Y-%m-%d')

    # The explicit cast is a workaround: the column is expected to hold
    # datetimes already, but is converted here before being formatted.
    frame['date'] = pd.to_datetime(frame['date'])
    frame['pm25'] = targets['pm25']
    frame['date'] = frame['date'].apply(_as_day_string)

    selected = frame[['date', 'pm25']]
    ordered = selected.sort_values('date')
    return ordered.reset_index(drop=True)
def get_historical_data_in_date_range(date_start: str, date_end: str, feature_view, weather_fg, model) -> pd.DataFrame:
    """
    Retrieve recorded PM2.5 values for a date range from a feature view.

    All rows are read through ``feature_view.query.read()`` and filtered in
    memory.

    Args:
        date_start (str): The start date in the format "%Y-%m-%d" (inclusive).
        date_end (str): The end date in the format "%Y-%m-%d". The bound is
            compared as a timestamp, so rows stamped later than midnight on
            this day fall outside the range.
        feature_view: Object whose ``query.read()`` returns a DataFrame with
            ``date`` and ``pm25`` columns. The frame it returns is not modified.
        weather_fg: Not used by this function.
        model: Not used by this function.

    Returns:
        pd.DataFrame: Columns ``date`` ("%Y-%m-%d" strings) and ``pm25`` for the
        rows inside the range, sorted by date with a fresh 0..n-1 index.
    """
    batch_data = feature_view.query.read()

    # Cast once so both the comparison and the formatting below work whether
    # the column arrives as datetimes or as date strings.
    dates = pd.to_datetime(batch_data['date'])

    # pandas parses the string bounds when comparing against a datetime column.
    in_range = (dates >= date_start) & (dates <= date_end)

    # Copy the selection so the column assignment writes to an independent
    # frame instead of a slice of the frame that was read.
    selected = batch_data.loc[in_range, ['date', 'pm25']].copy()
    selected['date'] = dates[in_range].dt.strftime('%Y-%m-%d')

    return selected.sort_values('date').reset_index(drop=True)
def get_future_data_for_date(date: str, feature_view, weather_fg, model) -> pd.DataFrame:
    """
    Predict PM2.5 for a single date from the weather feature group.

    The whole feature group is read and filtered in memory, then every column
    except ``date`` and ``city`` is passed to ``model.predict``.

    Args:
        date (str): The date in the format "%Y-%m-%d".
        feature_view: Not used by this function.
        weather_fg: Object whose ``read()`` returns a DataFrame with ``date``
            and ``city`` columns plus the model's input columns. The frame it
            returns is not modified.
        model: Object whose ``predict(frame)`` returns one value per row.

    Returns:
        pd.DataFrame: Columns ``date`` (timezone-naive datetimes) and ``pm25``
        (the predictions), sorted by date with a fresh 0..n-1 index. Empty when
        no row matches ``date``.
    """
    target_day = datetime.datetime.strptime(date, "%Y-%m-%d")

    fg_data = weather_fg.read()

    # Strip any timezone before comparing: an equality test between a tz-aware
    # column and a naive datetime evaluates to False for every row, which
    # would silently produce an empty result.
    dates = pd.to_datetime(fg_data['date']).dt.tz_localize(None)

    # Filters are applied in memory rather than pushed down to the feature group.
    on_day = dates == target_day

    # Copy so the assignments below write to an independent frame instead of a
    # slice of the frame that was read.
    df = fg_data[on_day].copy()
    df['date'] = dates[on_day]

    if df.empty:
        # Nothing to score; skip the model call, since some estimators raise
        # on zero-row input.
        return df[['date']].assign(pm25=pd.Series(dtype='float64')).reset_index(drop=True)

    batch_data = df.drop(['date', 'city'], axis=1)
    df['pm25'] = model.predict(batch_data)

    return df[['date', 'pm25']].sort_values('date').reset_index(drop=True)
def get_future_data_in_date_range(date_start: str, date_end: str, feature_view, weather_fg, model) -> pd.DataFrame:
    """
    Predict PM2.5 for every row of the weather feature group inside a date range.

    The whole feature group is read and filtered in memory, then every column
    except ``date`` and ``city`` is passed to ``model.predict``.

    Args:
        date_start (str): The start date in the format "%Y-%m-%d" (inclusive).
        date_end (str): The end date in the format "%Y-%m-%d". ``None`` is
            accepted and means "same as ``date_start``". The bound is midnight
            at the start of this day, so later timestamps on it are excluded.
        feature_view: Not used by this function.
        weather_fg: Object whose ``read()`` returns a DataFrame with ``date``
            and ``city`` columns plus the model's input columns. Its ``date``
            column is overwritten with timezone-naive datetimes.
        model: Object whose ``predict(frame)`` returns one value per row.

    Returns:
        pd.DataFrame: Columns ``date`` (timezone-naive datetimes) and ``pm25``
        (the predictions), sorted by date with a fresh 0..n-1 index. Empty when
        no row falls inside the range.
    """
    date_start_dt = datetime.datetime.strptime(date_start, "%Y-%m-%d")
    if date_end is None:
        date_end = date_start
    date_end_dt = datetime.datetime.strptime(date_end, "%Y-%m-%d")

    fg_data = weather_fg.read()

    # The bounds above are tz-naive; drop any timezone from the column so the
    # ordering comparisons below do not raise on tz-aware data.
    fg_data['date'] = pd.to_datetime(fg_data['date']).dt.tz_localize(None)

    # Filters are applied in memory rather than pushed down to the feature group.
    in_range = (fg_data['date'] >= date_start_dt) & (fg_data['date'] <= date_end_dt)

    # Copy so adding the prediction column writes to an independent frame
    # instead of a slice of fg_data.
    df = fg_data[in_range].copy()

    if df.empty:
        # Nothing to score; skip the model call, since some estimators raise
        # on zero-row input.
        return df[['date']].assign(pm25=pd.Series(dtype='float64')).reset_index(drop=True)

    batch_data = df.drop(['date', 'city'], axis=1)
    df['pm25'] = model.predict(batch_data)

    return df[['date', 'pm25']].sort_values('date').reset_index(drop=True)