# poc/experimentation_mlops/mlops/ingest_request.py
"""
This module defines the 'ingest' step of the time series forecasting flow:

- ``ingest_request``: Downloads the zipped raw dataset from a URL, extracts
  it, and logs the contained CSV as an MLflow artifact ('data-csv-dir').
"""
import pandas as pd
import os
import tempfile
import click
import mlflow
import gdown
import requests
import zipfile
@click.command(
    help="Downloads the dataset and saves it as an mlflow artifact "
    "called 'data-csv-dir'."
)
@click.option("--url", default="https://drive.google.com/uc?id=1H8RHsrgYMd6VC23_OJqrN6o_mL78pWpx")
def ingest_request(url) -> None:
    """
    Download the dataset archive from ``url``, extract it, and log the
    contained CSV as an MLflow artifact.

    :param url: URL of the zipped dataset file (a Google Drive direct-download
        link by default).
    :return: None. The extracted CSV is logged to the active MLflow run under
        the artifact path 'data-csv-dir'.
    :raises FileNotFoundError: if the expected CSV is not present in the
        downloaded archive.
    """
    with mlflow.start_run():
        local_dir = tempfile.mkdtemp()
        local_filename = os.path.join(local_dir, "news-data.zip")
        print(f"Downloading {url} to {local_filename}")
        # gdown handles Google Drive's virus-scan confirmation pages,
        # which a plain requests.get cannot follow.
        gdown.download(url, local_filename, quiet=False)
        print(f"Extracting {local_filename} into {local_dir}")
        with zipfile.ZipFile(local_filename, "r") as zip_ref:
            zip_ref.extractall(local_dir)
        data_file = os.path.join(local_dir, "2week_news_data.csv")
        # Fail early with a clear message rather than an opaque mlflow error
        # if the archive layout is not what we expect.
        if not os.path.isfile(data_file):
            raise FileNotFoundError(
                f"Expected dataset file not found after extraction: {data_file}"
            )
        print(f"Uploading data: {data_file}")
        mlflow.log_artifact(data_file, "data-csv-dir")
# CLI entry point: run the ingest command when the module is executed directly.
if __name__ == "__main__":
    ingest_request()