Spaces:

Robzy
/

hbg-weather

Running

App Files Files Community

Robzy committed 15 days ago

Commit

35ffba0

•

1 Parent(s): 8e5eb32

starting to write scripts

Browse files

Files changed (14) hide show

.gitignore +3 -0
backfill.py +62 -0
functions/__pycache__/air_quality_data_retrieval.cpython-312.pyc +0 -0
functions/__pycache__/context_engineering.cpython-312.pyc +0 -0
functions/__pycache__/llm_chain.cpython-312.pyc +0 -0
functions/__pycache__/util.cpython-312.pyc +0 -0
functions/air_quality_data_retrieval.py +115 -0
functions/context_engineering.py +248 -0
functions/llm_chain.py +246 -0
functions/util.py +311 -0
infer.py +52 -0
requirements-llm.txt +11 -0
requirements.txt +21 -0
training.py +0 -0

.gitignore ADDED Viewed

	@@ -0,0 +1,3 @@

+.venv
+.env
+.cache.sqlite

backfill.py ADDED Viewed

	@@ -0,0 +1,62 @@

+import datetime
+import requests
+import pandas as pd
+import hopsworks
+import datetime
+from pathlib import Path
+from functions import util
+import json
+import re
+import os
+import warnings
+import pandas as pd
# --- Hopsworks connection ---------------------------------------------------
# Both settings are read from the environment; os.getenv returns None when a
# variable is not set.
api_key = os.getenv('HOPSWORKS_API_KEY')
project_name = os.getenv('HOPSWORKS_PROJECT')
project = hopsworks.login(project=project_name, api_key_value=api_key)
fs = project.get_feature_store()

# --- Secrets: the AQI token and the sensor location (a JSON document) -------
secrets = util.secrets_api(project.name)
AQI_API_KEY = secrets.get_secret("AQI_API_KEY").value
location_str = secrets.get_secret("SENSOR_LOCATION_JSON").value
location = json.loads(location_str)
country, city, street = location['country'], location['city'], location['street']
aqicn_url = location['aqicn_url']
latitude, longitude = location['latitude'], location['longitude']

today = datetime.date.today()

# --- Feature groups ---------------------------------------------------------
air_quality_fg = fs.get_feature_group(name='air_quality', version=1)
weather_fg = fs.get_feature_group(name='weather', version=1)

# --- Today's PM2.5 reading, with the date column reduced to plain dates -----
aq_today_df = util.get_pm25(aqicn_url, country, city, street, today, AQI_API_KEY)
aq_today_df['date'] = pd.to_datetime(aq_today_df['date']).dt.date

# --- Weather forecast -------------------------------------------------------
hourly_df = util.get_hourly_weather_forecast(city, latitude, longitude).set_index('date')

# Only one prediction per day is made, so the hourly forecast is reduced to the
# rows stamped 12:00 (the 11:59-12:01 window selects exactly those rows).
daily_df = hourly_df.between_time('11:59', '12:01').reset_index()
# Going through .dt.date and back to datetime drops the time-of-day component.
daily_df['date'] = pd.to_datetime(pd.to_datetime(daily_df['date']).dt.date)
daily_df['city'] = city

functions/__pycache__/air_quality_data_retrieval.cpython-312.pyc ADDED Viewed

Binary file (5.82 kB). View file

functions/__pycache__/context_engineering.cpython-312.pyc ADDED Viewed

Binary file (9.77 kB). View file

functions/__pycache__/llm_chain.cpython-312.pyc ADDED Viewed

Binary file (7.77 kB). View file

functions/__pycache__/util.cpython-312.pyc ADDED Viewed

Binary file (16.8 kB). View file

functions/air_quality_data_retrieval.py ADDED Viewed

	@@ -0,0 +1,115 @@

+import pandas as pd
+from typing import Any, Dict, List
+import datetime
+import pandas as pd
+import hopsworks
+from hsfs.feature import Feature
def get_historical_data_for_date(date: str, feature_view, weather_fg, model) -> pd.DataFrame:
    """
    Fetch the recorded PM2.5 value(s) for a single calendar day.

    Args:
        date (str): Day to look up, formatted as "%Y-%m-%d".
        feature_view: Feature view whose ``training_data`` method supplies
            the features and labels for the requested window.
        weather_fg: Not used by this function.
        model: Not used by this function.

    Returns:
        pd.DataFrame: Columns ``date`` ("%Y-%m-%d" strings) and ``pm25``,
        sorted by date with a fresh integer index.
    """
    day = datetime.datetime.strptime(date, "%Y-%m-%d").date()
    next_day = day + datetime.timedelta(days=1)

    features, labels = feature_view.training_data(
        start_time=day,
        end_time=next_day,
        statistics_config=False,
    )

    # Workaround: the column should already be datetime-typed, but it is cast
    # explicitly so that strftime below is always available.
    features['date'] = pd.to_datetime(features['date'])
    features['pm25'] = labels['pm25']
    features['date'] = features['date'].apply(lambda ts: ts.strftime('%Y-%m-%d'))

    selected = features[['date', 'pm25']]
    return selected.sort_values('date').reset_index(drop=True)
def get_historical_data_in_date_range(date_start: str, date_end: str, feature_view,  weather_fg, model) -> pd.DataFrame:
    """
    Retrieve recorded PM2.5 values for an inclusive date range.

    Args:
        date_start (str): First day of the range, formatted as "%Y-%m-%d".
        date_end (str): Last day of the range, formatted as "%Y-%m-%d".
            ``None`` is treated as a single-day range starting and ending
            on ``date_start``.
        feature_view: Feature view; all rows are read through
            ``feature_view.query.read()`` and filtered in memory.
        weather_fg: Not used by this function.
        model: Not used by this function.

    Returns:
        pd.DataFrame: Columns ``date`` ("%Y-%m-%d" strings) and ``pm25``,
        sorted by date with a fresh integer index.
    """
    # Same fallback as get_future_data_in_date_range uses for a missing end date.
    if date_end is None:
        date_end = date_start

    history = feature_view.query.read()
    in_range = (history['date'] >= date_start) & (history['date'] <= date_end)

    # .copy() makes the filtered rows an independent frame, so rewriting the
    # date column below never writes into a slice of `history`.
    batch_data = history.loc[in_range].copy()
    batch_data['date'] = batch_data['date'].apply(lambda x: x.strftime('%Y-%m-%d'))
    return batch_data[['date', 'pm25']].sort_values('date').reset_index(drop=True)
def get_future_data_for_date(date: str, feature_view,  weather_fg, model) -> pd.DataFrame:
    """
    Predict PM2.5 for a single day from the weather feature group.

    Args:
        date (str): Day to predict, formatted as "%Y-%m-%d".
        feature_view: Not used by this function.
        weather_fg: Weather feature group; its ``read()`` result is filtered
            in memory.
        model: Model whose ``predict`` receives the weather columns (every
            column except ``date`` and ``city``).

    Returns:
        pd.DataFrame: Columns ``date`` (timezone-naive datetimes) and
        ``pm25`` (predictions), sorted by date with a fresh integer index.
    """
    target_day = datetime.datetime.strptime(date, "%Y-%m-%d")

    fg_data = weather_fg.read()
    # Strip any timezone so the column compares cleanly with the naive
    # `target_day` (same normalisation get_future_data_in_date_range applies).
    # assign() returns a new frame, leaving the object from read() untouched.
    fg_data = fg_data.assign(date=pd.to_datetime(fg_data['date']).dt.tz_localize(None))

    # Filtering is done in memory; .copy() lets the prediction column be added
    # without writing into a slice of fg_data.
    df = fg_data[fg_data['date'] == target_day].copy()
    batch_data = df.drop(['date', 'city'], axis=1)
    df['pm25'] = model.predict(batch_data)
    return df[['date', 'pm25']].sort_values('date').reset_index(drop=True)
def get_future_data_in_date_range(date_start: str, date_end: str, feature_view,  weather_fg, model) -> pd.DataFrame:
    """
    Predict PM2.5 for every day in an inclusive date range.

    Args:
        date_start (str): First day of the range, formatted as "%Y-%m-%d".
        date_end (str): Last day of the range, formatted as "%Y-%m-%d".
            ``None`` means a single-day range equal to ``date_start``.
        feature_view: Not used by this function.
        weather_fg: Weather feature group; its ``read()`` result is filtered
            in memory.
        model: Model whose ``predict`` receives the weather columns (every
            column except ``date`` and ``city``).

    Returns:
        pd.DataFrame: Columns ``date`` (timezone-naive datetimes) and
        ``pm25`` (predictions), sorted by date with a fresh integer index.
    """
    if date_end is None:
        date_end = date_start
    range_start = datetime.datetime.strptime(date_start, "%Y-%m-%d")
    range_end = datetime.datetime.strptime(date_end, "%Y-%m-%d")

    fg_data = weather_fg.read()
    # tz-aware and tz-naive datetimes cannot be ordered against each other, so
    # the column is made naive first. assign() builds a new frame instead of
    # overwriting the column on the object returned by read().
    fg_data = fg_data.assign(date=pd.to_datetime(fg_data['date']).dt.tz_localize(None))

    # Filtering is done in memory; .copy() lets the prediction column be added
    # without writing into a slice of fg_data.
    in_range = (fg_data['date'] >= range_start) & (fg_data['date'] <= range_end)
    df = fg_data.loc[in_range].copy()
    batch_data = df.drop(['date', 'city'], axis=1)
    df['pm25'] = model.predict(batch_data)
    return df[['date', 'pm25']].sort_values('date').reset_index(drop=True)

functions/context_engineering.py ADDED Viewed

	@@ -0,0 +1,248 @@

+import xml.etree.ElementTree as ET
+import re
+import inspect
+from typing import get_type_hints
+import json
+import datetime
+import torch
+import sys
+import pandas as pd
+from openai import OpenAI
+from functions.air_quality_data_retrieval import (
+    get_historical_data_for_date,
+    get_historical_data_in_date_range,
+    get_future_data_in_date_range,
+    get_future_data_for_date,
+)
+from typing import Any, Dict, List
def get_type_name(t: Any) -> str:
    """
    Return a printable name for a type annotation.

    Annotations whose string form contains "list" or "dict" are returned as
    that full string, so parameterised forms such as ``list[int]`` keep their
    arguments. Everything else is reported by ``__name__``; objects that have
    no ``__name__`` fall back to their string form instead of raising
    AttributeError.
    """
    name = str(t)
    if "list" in name or "dict" in name:
        return name
    return getattr(t, "__name__", name)
def serialize_function_to_json(func: Any) -> str:
    """
    Describe a function as a JSON document for a function-calling prompt.

    The document holds the function's name, its docstring, one entry per
    parameter giving the parameter's annotated type (``NoneType`` when the
    parameter is not annotated), and the return type name.

    Args:
        func: The function to describe.

    Returns:
        str: The description serialised with ``json.dumps(..., indent=2)``.
        ``"returns"`` is ``"void"`` when the function has no return
        annotation.
    """
    signature = inspect.signature(func)
    type_hints = get_type_hints(func)

    # get_type_hints turns an explicit "-> None" into NoneType, so a missing
    # key really means "no annotation". The placeholder must be handled here:
    # calling .__name__ on the string 'void' would raise AttributeError.
    return_hint = type_hints.get('return')
    if return_hint is None:
        returns = 'void'
    else:
        returns = getattr(return_hint, '__name__', str(return_hint))

    function_info = {
        "name": func.__name__,
        "description": func.__doc__,
        "parameters": {
            "type": "object",
            "properties": {}
        },
        "returns": returns
    }
    for name in signature.parameters:
        param_type = get_type_name(type_hints.get(name, type(None)))
        function_info["parameters"]["properties"][name] = {"type": param_type}
    return json.dumps(function_info, indent=2)
def get_function_calling_prompt(user_query):
    """
    Build the ChatML prompt that asks the LLM to pick a data-retrieval function.

    The system section lists the JSON description of each of the four
    retrieval functions, the routing rules, today's weekday and date, the
    required answer format and three examples. The user's query is appended
    as the user turn and the prompt ends with an open assistant turn.

    Args:
        user_query: The user's question, inserted verbatim.

    Returns:
        str: The complete prompt.
    """
    fn = """{"name": "function_name", "arguments": {"arg_1": "value_1", "arg_2": value_2, ...}}"""
    example = """{"name": "get_historical_data_in_date_range", "arguments": {"date_start": "2024-01-10", "date_end": "2024-01-14"}}"""
    # Read the clock once so the weekday and the date cannot disagree when the
    # call straddles midnight.
    today = datetime.date.today()
    # Two rules differ from a plain restatement of the function list:
    #  - historical *ranges* are routed to get_historical_data_in_date_range,
    #    which is what Example 3 demonstrates;
    #  - weather_fg is named as a parameter the model must not supply, because
    #    invoke_function passes it (with feature_view and model) by keyword.
    prompt = f"""<|im_start|>system
You are a helpful assistant with access to the following functions:
{serialize_function_to_json(get_historical_data_for_date)}
{serialize_function_to_json(get_historical_data_in_date_range)}
{serialize_function_to_json(get_future_data_for_date)}
{serialize_function_to_json(get_future_data_in_date_range)}
###INSTRUCTIONS:
- You need to choose one function to use and retrieve paramenters for this function from the user input.
- If the user query contains 'will', and specifies a single day or date, use get_future_data_in_date_range function
- If the user query contains 'will', and specifies a range of days or dates, use get_future_data_in_date_range function.
- If the user query is for future data, but only includes a single day or date, use the get_future_data_in_date_range function,
- If the user query contains 'today' or 'yesterday', use get_historical_data_for_date function.
- If the user query contains 'tomorrow', use get_future_data_in_date_range function.
- If the user query is for historical data, and specifies a range of days or dates, use get_historical_data_in_date_range function.
- If the user says a day of the week, assume the date of that day is when that day next arrives.
- Do not include feature_view, weather_fg and model parameters.
- Provide dates STRICTLY in the YYYY-MM-DD format.
- Generate an 'No Function needed' string if the user query does not require function calling.
IMPORTANT: Today is {today.strftime("%A")}, {today}.
To use one of there functions respond STRICTLY with:
<onefunctioncall>
    <functioncall> {fn} </functioncall>
</onefunctioncall>
###EXAMPLES
EXAMPLE 1:
- User: Hi!
- AI Assiatant: No Function needed.
EXAMPLE 2:
- User: Is this Air Quality level good or bad?
- AI Assiatant: No Function needed.
EXAMPLE 3:
- User: When and what was the minimum air quality from 2024-01-10 till 2024-01-14?
- AI Assistant:
<onefunctioncall>
    <functioncall> {example} </functioncall>
</onefunctioncall>
<|im_end|>
<|im_start|>user
{user_query}
<|im_end|>
<|im_start|>assistant"""
    return prompt
def generate_hermes(user_query: str, model_llm, tokenizer) -> str:
    """
    Ask the local LLM which retrieval function to call for ``user_query``.

    Args:
        user_query (str): The user's question.
        model_llm: Causal language model exposing ``generate`` and ``device``.
        tokenizer: Tokenizer paired with ``model_llm``.

    Returns:
        str: Only the newly generated text; the prompt tokens are sliced off
        and special tokens are skipped while decoding.
    """
    encoded = tokenizer(
        get_function_calling_prompt(user_query),
        return_tensors="pt",
    ).to(model_llm.device)
    # Number of prompt tokens, used below to drop the echoed prompt.
    prompt_length = encoded.input_ids.numel()

    with torch.inference_mode():
        output_ids = model_llm.generate(
            **encoded,
            use_cache=True,
            do_sample=True,
            temperature=0.2,
            top_p=1.0,
            top_k=0,
            max_new_tokens=512,
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=tokenizer.eos_token_id,
        )

    completion_ids = output_ids.squeeze()[prompt_length:]
    return tokenizer.decode(completion_ids, skip_special_tokens=True)
def function_calling_with_openai(user_query: str, client) -> str:
    """
    Ask an OpenAI chat model which retrieval function to call.

    The system message is the function-calling prompt cut off just before
    its user turn; the query itself is sent as the user message.

    Args:
        user_query (str): The user's query or prompt.
        client: Client object exposing ``chat.completions.create``.

    Returns:
        str: The stripped reply of the first choice, or "" when the response
        carries no choices or no message.
    """
    system_prompt = get_function_calling_prompt(user_query).split('<|im_start|>user')[0]
    chat = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_query},
    ]
    completion = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=chat,
    )

    if not (completion and completion.choices):
        return ""
    message = completion.choices[0].message
    if not message:
        return ""
    return message.content.strip()
def extract_function_calls(completion: str) -> List[Dict[str, Any]]:
    """
    Extract the JSON function calls from an LLM completion.

    Looks for the first ``<onefunctioncall>...</onefunctioncall>`` section and
    decodes the JSON payload of every ``<functioncall>`` element inside it.

    Args:
        completion (str): Raw text produced by the LLM.

    Returns:
        A list with one decoded dict per valid ``<functioncall>`` payload, or
        ``None`` when the completion holds no ``<onefunctioncall>`` section.
        Payloads that are empty or not valid JSON are skipped, so the list
        may be empty.
    """
    wrapper_match = re.search(
        r"<onefunctioncall>.*?</onefunctioncall>", completion.strip(), re.DOTALL
    )
    if not wrapper_match:
        return None
    wrapper = wrapper_match.group(0)

    try:
        payloads = [node.text for node in ET.fromstring(wrapper).findall("functioncall")]
    except ET.ParseError:
        # JSON may legitimately contain '&' or '<', which is not well-formed
        # XML; fall back to a plain-text scan of the same tags.
        payloads = re.findall(r"<functioncall>(.*?)</functioncall>", wrapper, re.DOTALL)

    calls = []
    for payload in payloads:
        if not payload or not payload.strip():
            continue
        try:
            calls.append(json.loads(payload))
        except json.JSONDecodeError:
            # The model produced malformed JSON; a missing call is handled by
            # the caller the same way as "no function needed".
            continue
    return calls
def invoke_function(function, feature_view, weather_fg, model) -> pd.DataFrame:
    """
    Call the data-retrieval function named in an LLM function call.

    Args:
        function: Dict with a ``'name'`` key (the function to call) and an
            ``'arguments'`` key (keyword arguments for it).
        feature_view: Passed through to the retrieval function.
        weather_fg: Passed through to the retrieval function.
        model: Passed through to the retrieval function.

    Returns:
        The retrieval function's result. A DataFrame result has its ``pm25``
        column rounded to two decimals; a string result is returned as is.

    Raises:
        ValueError: If ``function['name']`` is not one of the four
            data-retrieval functions.
    """
    # The name comes from model output, which is driven by user input, so it
    # is checked against a fixed allowlist before anything is looked up in
    # this module's namespace.
    allowed_functions = (
        "get_historical_data_for_date",
        "get_historical_data_in_date_range",
        "get_future_data_for_date",
        "get_future_data_in_date_range",
    )
    injected_parameters = ("feature_view", "weather_fg", "model")

    function_name = function['name']
    if function_name not in allowed_functions:
        raise ValueError(
            f"{function_name!r} is not an allowed data-retrieval function"
        )

    # Drop any injected parameter the model echoed back; passing it twice
    # would raise "got multiple values for keyword argument".
    arguments = {
        key: value
        for key, value in function['arguments'].items()
        if key not in injected_parameters
    }

    function_output = getattr(sys.modules[__name__], function_name)(
        **arguments,
        feature_view=feature_view,
        weather_fg=weather_fg,
        model=model,
    )
    if isinstance(function_output, str):
        return function_output

    # Round the 'pm25' value to 2 decimal places
    function_output['pm25'] = function_output['pm25'].apply(round, ndigits=2)
    return function_output
def get_context_data(user_query: str, feature_view, weather_fg, model_air_quality, model_llm=None, tokenizer=None, client=None) -> str:
    """
    Turn a user query into the air-quality context text for the answer prompt.

    Args:
        user_query (str): The user query.
        feature_view: Feature view handed to the retrieval function.
        weather_fg: Weather feature group handed to the retrieval function.
        model_air_quality: Air-quality model handed to the retrieval function.
        model_llm: Local LLM, used for function selection when no ``client``
            is given.
        tokenizer: Tokenizer for ``model_llm``.
        client: When truthy, function selection goes through
            ``function_calling_with_openai`` instead of the local LLM.

    Returns:
        str: One "Date: ...; Air Quality: ..." line per row under an
        "Air Quality Measurements:" header; the retrieval function's own
        result when it is not a DataFrame; or '' when the LLM selected no
        function.
    """
    # Step 1: let an LLM decide which retrieval function (if any) to call.
    if client:
        completion = function_calling_with_openai(user_query, client)
    else:
        completion = generate_hermes(user_query, model_llm, tokenizer)

    # Step 2: parse the decision; nothing parsed means no context.
    functions = extract_function_calls(completion)
    if not functions:
        return ''

    # Step 3: run the first selected function only.
    data = invoke_function(functions[0], feature_view, weather_fg, model_air_quality)
    if not isinstance(data, pd.DataFrame):
        return data

    measurement_lines = [
        f'Date: {row["date"]}; Air Quality: {row["pm25"]}'
        for _, row in data.iterrows()
    ]
    return 'Air Quality Measurements:\n' + '\n'.join(measurement_lines)

functions/llm_chain.py ADDED Viewed

	@@ -0,0 +1,246 @@

+import transformers
+from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, AutoConfig, AutoModel
+from langchain.llms import HuggingFacePipeline
+from langchain.prompts import PromptTemplate
+from langchain.chains.llm import LLMChain
+from langchain.memory import ConversationBufferWindowMemory
+import torch
+import datetime
+from typing import Any, Dict, Union
+from functions.context_engineering import get_context_data
+import os
+from safetensors.torch import load_model, save_model
def load_model(model_id: str = "teknium/OpenHermes-2.5-Mistral-7B") -> tuple:
    """
    Load the LLM and its tokenizer, using on-disk copies when they exist.

    The tokenizer is kept under ./mistral/tokenizer and the model under
    /tmp/mistral/model. Whichever is missing is fetched by ``model_id`` and
    then saved to its path.

    Args:
        model_id (str, optional): Identifier of the pre-trained model.
            Defaults to "teknium/OpenHermes-2.5-Mistral-7B".

    Returns:
        tuple: ``(model_llm, tokenizer)``.
    """
    tokenizer_path = "./mistral/tokenizer"
    if os.path.isdir(tokenizer_path):
        tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
    else:
        tokenizer = AutoTokenizer.from_pretrained(model_id)
        tokenizer.save_pretrained(tokenizer_path)

    # Pad with the unknown token, on the right-hand side.
    tokenizer.pad_token = tokenizer.unk_token
    tokenizer.padding_side = "right"

    # 4-bit NF4 quantisation with double quantisation and bfloat16 compute.
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16,
    )

    model_path = "/tmp/mistral/model"
    if not os.path.exists(model_path):
        # First run: fetch by id with the quantisation config, then save.
        model_llm = AutoModelForCausalLM.from_pretrained(
            model_id,
            device_map="auto",
            quantization_config=bnb_config,
        )
        model_llm.save_pretrained(model_path)
    else:
        print("Loading model from disk")
        model_llm = AutoModelForCausalLM.from_pretrained(model_path)

    # Keep the model's pad token id in step with the tokenizer's.
    model_llm.config.pad_token_id = tokenizer.pad_token_id
    return model_llm, tokenizer
def get_prompt_template():
    """
    Return the ChatML prompt template used to answer air-quality questions.

    Returns:
        str: Template text with three placeholders: ``{context}``,
        ``{date_today}`` and ``{question}``. It ends with an open assistant
        turn.
    """
    template_lines = (
        "<|im_start|>system",
        "You are one of the best air quality experts in the world.",
        "###INSTRUCTIONS:",
        "- If you don't know the answer, you will respond politely that you cannot help.",
        "- Use the context table with air quality indicators for city provided by user to generate your answer.",
        "- You answer should be at least one sentence.",
        "- Do not show any calculations to the user.",
        "- Make sure that you use correct air quality indicators for the corresponding date.",
        "- Add a rich analysis of the air quality level, such as whether it is safe, whether to go for a walk, etc.",
        "- Do not mention in your answer that you are using context table.",
        "<|im_end|>",
        "### CONTEXT:",
        "{context}",
        "IMPORTANT: Today is {date_today}.",
        "<|im_start|>user",
        "{question}<|im_end|>",
        "<|im_start|>assistant",
    )
    return "\n".join(template_lines)
def get_llm_chain(model_llm, tokenizer):
    """
    Wrap the model and tokenizer in a LangChain ``LLMChain``.

    Args:
        model_llm: The pre-trained language model for text generation.
        tokenizer: The tokenizer corresponding to the language model.

    Returns:
        LLMChain: Chain that fills the template from ``get_prompt_template``
        (inputs: context, question, date_today) and runs it through a
        text-generation pipeline.
    """
    # Sampling settings for the answer-generation pipeline.
    generation_pipeline = transformers.pipeline(
        task="text-generation",
        model=model_llm,
        tokenizer=tokenizer,
        use_cache=True,
        do_sample=True,
        temperature=0.4,
        top_p=1.0,
        top_k=0,
        max_new_tokens=512,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.eos_token_id,
    )

    # Adapter that lets LangChain drive the transformers pipeline.
    langchain_llm = HuggingFacePipeline(pipeline=generation_pipeline)

    answer_prompt = PromptTemplate(
        input_variables=["context", "question", "date_today"],
        template=get_prompt_template(),
    )

    return LLMChain(
        llm=langchain_llm,
        prompt=answer_prompt,
        verbose=False,
    )
def generate_response(
    user_query: str,
    feature_view,
    weather_fg,
    model_air_quality,
    model_llm,
    tokenizer,
    llm_chain=None,
    verbose: bool = False,
) -> str:
    """
    Generate response to user query using LLM chain and context data.

    Args:
        user_query (str): The user's query.
        feature_view: Feature view for data retrieval.
        weather_fg: Weather feature group for data retrieval.
        model_air_quality: Model for predicting air quality.
        model_llm: Language model for text generation.
        tokenizer: Tokenizer for processing text.
        llm_chain: LLM chain used to produce the answer. When ``None``, a
            chain is built from ``model_llm`` and ``tokenizer`` with
            ``get_llm_chain``.
        verbose (bool): Whether to print today's date and the retrieved
            context. Defaults to False.

    Returns:
        str: The text following the last '<|im_start|>assistant' marker in
        the chain's output.
    """
    # The parameter defaults to None, so build the chain on demand rather
    # than failing on `None.invoke(...)` below.
    if llm_chain is None:
        llm_chain = get_llm_chain(model_llm, tokenizer)

    # Get context data based on user query
    context = get_context_data(
        user_query,
        feature_view,
        weather_fg,
        model_air_quality,
        model_llm=model_llm,
        tokenizer=tokenizer,
    )

    # Read the clock once so weekday and date always belong to the same day.
    today = datetime.date.today()
    date_today = f'{today.strftime("%A")}, {today}'

    if verbose:
        print(f"🗓️ Today's date: {date_today}")
        print(f'📖 {context}')

    # Invoke the language model chain with relevant context
    model_output = llm_chain.invoke({
        "context": context,
        "date_today": date_today,
        "question": user_query,
    })

    # The generated text echoes the prompt; keep only the assistant's part.
    return model_output['text'].split('<|im_start|>assistant')[-1]
def generate_response_openai(
    user_query: str,
    feature_view,
    weather_fg,
    model_air_quality,
    client,
    verbose=True,
):
    """
    Answer a user query with an OpenAI chat model, grounded in context data.

    Args:
        user_query (str): The user's query.
        feature_view: Feature view for data retrieval.
        weather_fg: Weather feature group for data retrieval.
        model_air_quality: Model for predicting air quality.
        client: Client object exposing ``chat.completions.create``; also used
            for the function-selection step.
        verbose: Whether to print today's date and the retrieved context.
            Defaults to True.

    Returns:
        str: The stripped reply of the first choice, or "" when the response
        carries no choices or no message.
    """
    context = get_context_data(
        user_query,
        feature_view,
        weather_fg,
        model_air_quality,
        client=client,
    )

    # Weekday name followed by the ISO date.
    date_today = f'{datetime.date.today().strftime("%A")}, {datetime.date.today()}'

    if verbose:
        print(f"🗓️ Today's date: {date_today}")
        print(f'📖 {context}')

    # Only the part of the template before the user turn becomes the system
    # message; it still holds the context and date placeholders.
    system_template = get_prompt_template().split('<|im_start|>user')[0]
    system_message = system_template.format(
        context=context,
        date_today=date_today
    )

    completion = client.chat.completions.create(
        model="gpt-4-0125-preview",
        messages=[
            {"role": "system", "content": system_message},
            {"role": "user", "content": user_query},
        ]
    )

    if not (completion and completion.choices):
        return ""
    message = completion.choices[0].message
    if not message:
        return ""
    return message.content.strip()

functions/util.py ADDED Viewed

	@@ -0,0 +1,311 @@

+import os
+import datetime
+import time
+import requests
+import pandas as pd
+import json
+from geopy.geocoders import Nominatim
+import matplotlib.pyplot as plt
+from matplotlib.patches import Patch
+from matplotlib.ticker import MultipleLocator
+import openmeteo_requests
+import requests_cache
+from retry_requests import retry
+import hopsworks
+import hsfs
+from pathlib import Path
def get_historical_weather(city, start_date,  end_date, latitude, longitude):
    """
    Download daily historical weather from the Open-Meteo archive API.

    Args:
        city: Written into the ``city`` column of the result.
        start_date: Sent to the API as ``start_date``.
        end_date: Sent to the API as ``end_date``.
        latitude: Sent to the API as ``latitude``.
        longitude: Sent to the API as ``longitude``.

    Returns:
        pd.DataFrame: Columns ``date``, ``temperature_2m_mean``,
        ``precipitation_sum``, ``wind_speed_10m_max``,
        ``wind_direction_10m_dominant`` and ``city``; rows with missing
        values are dropped.
    """
    # Open-Meteo client backed by a cached session with retries.
    cached = requests_cache.CachedSession('.cache', expire_after = -1)
    client = openmeteo_requests.Client(
        session = retry(cached, retries = 5, backoff_factor = 0.2)
    )

    # Variables come back in the order requested, so this list also gives
    # the position of each variable in the response.
    daily_variables = [
        "temperature_2m_mean",
        "precipitation_sum",
        "wind_speed_10m_max",
        "wind_direction_10m_dominant",
    ]
    request_params = {
        "latitude": latitude,
        "longitude": longitude,
        "start_date": start_date,
        "end_date": end_date,
        "daily": daily_variables
    }
    responses = client.weather_api(
        "https://archive-api.open-meteo.com/v1/archive", params=request_params
    )

    # Only the first location in the response is processed.
    response = responses[0]
    print(f"Coordinates {response.Latitude()}°N {response.Longitude()}°E")
    print(f"Elevation {response.Elevation()} m asl")
    print(f"Timezone {response.Timezone()} {response.TimezoneAbbreviation()}")
    print(f"Timezone difference to GMT+0 {response.UtcOffsetSeconds()} s")

    daily = response.Daily()
    columns = {"date": pd.date_range(
        start = pd.to_datetime(daily.Time(), unit = "s"),
        end = pd.to_datetime(daily.TimeEnd(), unit = "s"),
        freq = pd.Timedelta(seconds = daily.Interval()),
        inclusive = "left"
    )}
    for position, variable in enumerate(daily_variables):
        columns[variable] = daily.Variables(position).ValuesAsNumpy()

    weather = pd.DataFrame(data = columns).dropna()
    weather['city'] = city
    return weather
def get_hourly_weather_forecast(city, latitude, longitude):
    """
    Download the hourly weather forecast from the Open-Meteo ECMWF endpoint.

    Args:
        city: Not used by this function.
        latitude: Sent to the API as ``latitude``.
        longitude: Sent to the API as ``longitude``.

    Returns:
        pd.DataFrame: Columns ``date``, ``temperature_2m_mean``,
        ``precipitation_sum``, ``wind_speed_10m_max`` and
        ``wind_direction_10m_dominant``; rows with missing values are
        dropped. The hourly values are stored under the daily column names.
    """
    # Open-Meteo client backed by a cached session (3600 s expiry) with retries.
    cached = requests_cache.CachedSession('.cache', expire_after = 3600)
    client = openmeteo_requests.Client(
        session = retry(cached, retries = 5, backoff_factor = 0.2)
    )

    request_params = {
        "latitude": latitude,
        "longitude": longitude,
        "hourly": ["temperature_2m", "precipitation", "wind_speed_10m", "wind_direction_10m"]
    }
    responses = client.weather_api(
        "https://api.open-meteo.com/v1/ecmwf", params=request_params
    )

    # Only the first location in the response is processed.
    response = responses[0]
    print(f"Coordinates {response.Latitude()}°N {response.Longitude()}°E")
    print(f"Elevation {response.Elevation()} m asl")
    print(f"Timezone {response.Timezone()} {response.TimezoneAbbreviation()}")
    print(f"Timezone difference to GMT+0 {response.UtcOffsetSeconds()} s")

    hourly = response.Hourly()
    # Output column for each requested variable, in request order.
    output_columns = (
        "temperature_2m_mean",
        "precipitation_sum",
        "wind_speed_10m_max",
        "wind_direction_10m_dominant",
    )
    columns = {"date": pd.date_range(
        start = pd.to_datetime(hourly.Time(), unit = "s"),
        end = pd.to_datetime(hourly.TimeEnd(), unit = "s"),
        freq = pd.Timedelta(seconds = hourly.Interval()),
        inclusive = "left"
    )}
    for position, column in enumerate(output_columns):
        columns[column] = hourly.Variables(position).ValuesAsNumpy()

    return pd.DataFrame(data = columns).dropna()
def get_city_coordinates(city_name: str):
    """
    Look up a city's latitude and longitude with the Nominatim geocoder.

    Args:
        city_name (str): Name of the city to geocode.

    Returns:
        tuple: ``(latitude, longitude)``, each rounded to 2 decimal places.

    Raises:
        ValueError: If the geocoder returns no result for ``city_name``.
    """
    geolocator = Nominatim(user_agent="MyApp")
    location = geolocator.geocode(city_name)
    # No match yields None; report that clearly instead of failing with an
    # AttributeError on `None.latitude`.
    if location is None:
        raise ValueError(f"Could not find coordinates for city {city_name!r}")
    return round(location.latitude, 2), round(location.longitude, 2)
def trigger_request(url: str, timeout: float = 30.0):
    """
    GET ``url`` and return the decoded JSON body.

    Args:
        url (str): Address to request.
        timeout (float): Seconds to wait for the server before giving up.
            Without a timeout a stalled connection would block forever.

    Returns:
        The JSON content of the response.

    Raises:
        requests.exceptions.RequestException: If the status code is not 200
            (the status code is the exception argument), or if the request
            itself fails or times out.
    """
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        print("Failed to retrieve data. Status Code:", response.status_code)
        raise requests.exceptions.RequestException(response.status_code)
    return response.json()
def get_pm25(aqicn_url: str, country: str, city: str, street: str, day: datetime.date, AQI_API_KEY: str):
    """
    Fetch the PM2.5 reading for a sensor and return it as a one-row DataFrame.

    The sensor's own URL is tried first. While the API answers
    "Unknown station", two fallback feed URLs are tried in turn:
    country/street, then country/city/street.

    Args:
        aqicn_url (str): Base URL of the sensor feed.
        country (str): Written to the result and used in the fallback URLs.
        city (str): Written to the result and used in the second fallback URL.
        street (str): Written to the result and used in the fallback URLs.
        day (datetime.date): Stored in the ``date`` column.
        AQI_API_KEY (str): API token appended to every request.

    Returns:
        pd.DataFrame: One row with columns ``pm25`` (float32), ``country``,
        ``city``, ``street``, ``date`` (datetime) and ``url``.

    Raises:
        requests.exceptions.RequestException: If the final response status
            is not 'ok'.
    """
    token_suffix = f"/?token={AQI_API_KEY}"
    data = trigger_request(f"{aqicn_url}{token_suffix}")

    fallback_urls = (
        f"https://api.waqi.info/feed/{country}/{street}{token_suffix}",
        f"https://api.waqi.info/feed/{country}/{city}/{street}{token_suffix}",
    )
    for fallback_url in fallback_urls:
        if data['data'] != "Unknown station":
            break
        data = trigger_request(fallback_url)

    if data['status'] != 'ok':
        print("Error: There may be an incorrect  URL for your Sensor or it is not contactable right now. The API response does not contain data.  Error message:", data['data'])
        raise requests.exceptions.RequestException(data['data'])

    readings = data['data']
    aq_today_df = pd.DataFrame()
    # A missing pm25 entry yields None, which becomes NaN after the cast.
    aq_today_df['pm25'] = [readings['iaqi'].get('pm25', {}).get('v', None)]
    aq_today_df['pm25'] = aq_today_df['pm25'].astype('float32')
    aq_today_df['country'] = country
    aq_today_df['city'] = city
    aq_today_df['street'] = street
    aq_today_df['date'] = day
    aq_today_df['date'] = pd.to_datetime(aq_today_df['date'])
    aq_today_df['url'] = aqicn_url
    return aq_today_df
def plot_air_quality_forecast(city: str, street: str, df: pd.DataFrame, file_path: str, hindcast=False):
    """
    Plot predicted PM2.5 on a log-scale chart and save it to ``file_path``.

    Args:
        city (str): Shown in the chart title.
        street (str): Shown in the chart title.
        df (pd.DataFrame): Must hold ``date`` and ``predicted_pm25``; also
            ``pm25`` when ``hindcast`` is True.
        file_path (str): Where the figure is saved.
        hindcast: When True, the actual ``pm25`` values are drawn as well.

    Returns:
        The ``matplotlib.pyplot`` module, after the figure has been saved.
    """
    fig, ax = plt.subplots(figsize=(10, 6))
    day = pd.to_datetime(df['date']).dt.date

    ax.plot(day, df['predicted_pm25'], label='Predicted PM2.5', color='red', linewidth=2, marker='o', markersize=5, markerfacecolor='blue')

    # Logarithmic y-axis with fixed ticks shown as plain numbers.
    ax.set_yscale('log')
    ax.set_yticks([0, 10, 25, 50, 100, 250, 500])
    ax.get_yaxis().set_major_formatter(plt.ScalarFormatter())
    ax.set_ylim(bottom=1)

    ax.set_title(f"PM2.5 Predicted (Logarithmic Scale) for {city}, {street}")
    ax.set_xlabel('Date')
    ax.set_ylabel('PM2.5')

    # (band colour, category label, (lower bound, upper bound))
    categories = [
        ('green', 'Good', (0, 49)),
        ('yellow', 'Moderate', (50, 99)),
        ('orange', 'Unhealthy for Some', (100, 149)),
        ('red', 'Unhealthy', (150, 199)),
        ('purple', 'Very Unhealthy', (200, 299)),
        ('darkred', 'Hazardous', (300, 500)),
    ]
    for band_color, _, (low, high) in categories:
        ax.axhspan(low, high, color=band_color, alpha=0.3)

    category_patches = [
        Patch(color=band_color, label=f"{label}: {low}-{high}")
        for band_color, label, (low, high) in categories
    ]
    legend1 = ax.legend(handles=category_patches, loc='upper right', title="Air Quality Categories", fontsize='x-small')

    # With more than 11 points, thin the x-axis labels to roughly ten.
    point_count = len(df.index)
    if point_count > 11:
        ax.xaxis.set_major_locator(MultipleLocator(point_count / 10))

    plt.xticks(rotation=45)

    if hindcast == True:
        ax.plot(day, df['pm25'], label='Actual PM2.5', color='black', linewidth=2, marker='^', markersize=5, markerfacecolor='grey')
        # Creating the line legend replaces the category legend on the axes,
        # so the category legend is added back as a separate artist.
        ax.legend(loc='upper left', fontsize='x-small')
        ax.add_artist(legend1)

    plt.tight_layout()
    plt.savefig(file_path)
    return plt
+def delete_feature_groups(fs, name):
+    try:
+        for fg in fs.get_feature_groups(name):
+            fg.delete()
+            print(f"Deleted {fg.name}/{fg.version}")
+    except hsfs.client.exceptions.RestAPIError:
+        print(f"No {name} feature group found")
+def delete_feature_views(fs, name):
+    try:
+        for fv in fs.get_feature_views(name):
+            fv.delete()
+            print(f"Deleted {fv.name}/{fv.version}")
+    except hsfs.client.exceptions.RestAPIError:
+        print(f"No {name} feature view found")
+def delete_models(mr, name):
+    models = mr.get_models(name)
+    if not models:
+        print(f"No {name} model found")
+    for model in models:
+        model.delete()
+        print(f"Deleted model {model.name}/{model.version}")
+def delete_secrets(proj, name):
+    secrets = secrets_api(proj.name)
+    try:
+        secret = secrets.get_secret(name)
+        secret.delete()
+        print(f"Deleted secret {name}")
+    except hopsworks.client.exceptions.RestAPIError:
+        print(f"No {name} secret found")
+# WARNING - this will wipe out all your feature data and models
+def purge_project(proj):
+    fs = proj.get_feature_store()
+    mr = proj.get_model_registry()
+    # Delete Feature Views before deleting the feature groups
+    delete_feature_views(fs, "air_quality_fv")
+    # Delete ALL Feature Groups
+    delete_feature_groups(fs, "air_quality")
+    delete_feature_groups(fs, "weather")
+    delete_feature_groups(fs, "aq_predictions")
+    # Delete all Models
+    delete_models(mr, "air_quality_xgboost_model")
+    delete_secrets(proj, "SENSOR_LOCATION_JSON")
+def secrets_api(proj):
+    host = "c.app.hopsworks.ai"
+    api_key = os.environ.get('HOPSWORKS_API_KEY')
+    conn = hopsworks.connection(host=host, project=proj, api_key_value=api_key)
+    return conn.get_secrets_api()
+def check_file_path(file_path):
+    my_file = Path(file_path)
+    if my_file.is_file() == False:
+        print(f"Error. File not found at the path: {file_path} ")
+    else:
+        print(f"File successfully found at the path: {file_path}")
+def backfill_predictions_for_monitoring(weather_fg, air_quality_df, monitor_fg, model):
+    features_df = weather_fg.read()
+    features_df = features_df.sort_values(by=['date'], ascending=True)
+    features_df = features_df.tail(10)
+    features_df['predicted_pm25'] = model.predict(features_df[['temperature_2m_mean', 'precipitation_sum', 'wind_speed_10m_max', 'wind_direction_10m_dominant']])
+    air_quality_df['date'] = pd.to_datetime(air_quality_df['date'])
+    features_df['date'] = features_df['date'].dt.tz_convert(None).astype('datetime64[ns]')
+    df = pd.merge(features_df, air_quality_df[['date','pm25','street','country']], on="date")
+    df['days_before_forecast_day'] = 1
+    hindcast_df = df
+    df = df.drop('pm25', axis=1)
+    monitor_fg.insert(df, write_options={"wait_for_job": True})
+    return hindcast_df

infer.py ADDED Viewed

	@@ -0,0 +1,52 @@

+import datetime
+import pandas as pd
+from xgboost import XGBRegressor
+import hopsworks
+import json
+from functions import util
+import os
+# Set up: log in to Hopsworks with credentials from the environment and read the stored secrets
+api_key = os.getenv('HOPSWORKS_API_KEY')
+project_name = os.getenv('HOPSWORKS_PROJECT')
+project = hopsworks.login(project=project_name, api_key_value=api_key)
+fs = project.get_feature_store()
+secrets = util.secrets_api(project.name)
+AQI_API_KEY = secrets.get_secret("AQI_API_KEY").value
+location_str = secrets.get_secret("SENSOR_LOCATION_JSON").value
+location = json.loads(location_str)  # the secret value is a JSON string
+today = datetime.datetime.now() - datetime.timedelta(0)  # timedelta(0) is a zero offset, so this is simply "now"
+feature_view = fs.get_feature_view(
+    name='air_quality_fv',
+    version=1,
+)
+# Retrieve model: download version 1 from the model registry and load it into an XGBRegressor
+mr = project.get_model_registry()
+retrieved_model = mr.get_model(
+    name="air_quality_xgboost_model",
+    version=1,
+)
+saved_model_dir = retrieved_model.download()
+retrieved_xgboost_model = XGBRegressor()
+retrieved_xgboost_model.load_model(saved_model_dir + "/model.json")  # model.json is read from the downloaded directory
+# Retrieve features: weather rows whose date is at or after the current timestamp
+weather_fg = fs.get_feature_group(
+    name='weather',
+    version=1,
+)
+today_timestamp = pd.to_datetime(today)
+batch_data = weather_fg.filter(weather_fg.date >= today_timestamp ).read()
+batch_data['predicted_pm25'] = retrieved_xgboost_model.predict(
+    batch_data[['temperature_2m_mean', 'precipitation_sum', 'wind_speed_10m_max', 'wind_direction_10m_dominant']])

requirements-llm.txt ADDED Viewed

	@@ -0,0 +1,11 @@

+# LLM libraries
+gradio==3.40.1
+getpass4==0.0.14.1
+transformers==4.38.2
+langchain==0.1.10
+bitsandbytes==0.42.0
+accelerate==0.27.2
+# OpenAI
+openai==1.14.3

requirements.txt ADDED Viewed

	@@ -0,0 +1,21 @@

+# Feature store and model registry
+hopsworks
+# Resolve city names from (longitude, latitude) coordinates
+geopy==2.4.1
+# Read weather data. Unpinned version - if we don't update, we won't get the weather data
+openmeteo-requests
+# Be more efficient when making REST (HTTP) requests
+requests-cache==1.2.0
+retry-requests==2.0.0
+# ML framework libraries
+xgboost==2.0.3
+scikit-learn==1.4.1.post1
+# Plot charts
+matplotlib==3.8.3
+python-dotenv

training.py ADDED Viewed

File without changes