poc / experimentation_mlops /example /load_raw_data.py
ryanrahmadifa
Added files
79e1719
raw
history blame
1.29 kB
"""
Downloads the MovieLens dataset and saves it as an artifact
"""
import os
import tempfile
import zipfile
import click
import requests
import mlflow
@click.command(
    help="Downloads the MovieLens dataset and saves it as an mlflow artifact "
    "called 'ratings-csv-dir'."
)
@click.option("--url", default="http://files.grouplens.org/datasets/movielens/ml-20m.zip")
def load_raw_data(url):
    """Download the MovieLens archive and log its ratings file to MLflow.

    Fetches the zip at ``url`` into a temporary directory, extracts it, and
    logs ``ml-20m/ratings.csv`` under the artifact path ``ratings-csv-dir``
    of a newly started MLflow run. The temporary directory is removed once
    the artifact has been logged.

    Args:
        url: Location of the zip archive. The archive is expected to unpack
            into an ``ml-20m`` directory that contains ``ratings.csv``.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection failures or timeouts.
        zipfile.BadZipFile: If the downloaded file is not a valid zip archive.
    """
    # The temporary directory is entered last so it is cleaned up first,
    # while the MLflow run is still active; this avoids leaving the archive
    # and the extracted dataset behind in the system temp location.
    with mlflow.start_run(), tempfile.TemporaryDirectory() as local_dir:
        local_filename = os.path.join(local_dir, "ml-20m.zip")
        print(f"Downloading {url} to {local_filename}")
        # (connect, read) timeouts: the read timeout bounds the gap between
        # received bytes, not the whole transfer, so large files still work.
        with requests.get(url, stream=True, timeout=(10, 60)) as r:
            # Fail here on an HTTP error instead of saving the error page
            # and hitting a confusing BadZipFile during extraction.
            r.raise_for_status()
            with open(local_filename, "wb") as f:
                for chunk in r.iter_content(chunk_size=64 * 1024):
                    if chunk:  # filter out keep-alive new chunks
                        f.write(chunk)

        extracted_dir = os.path.join(local_dir, "ml-20m")
        print(f"Extracting {local_filename} into {extracted_dir}")
        with zipfile.ZipFile(local_filename, "r") as zip_ref:
            zip_ref.extractall(local_dir)

        ratings_file = os.path.join(extracted_dir, "ratings.csv")
        print(f"Uploading ratings: {ratings_file}")
        mlflow.log_artifact(ratings_file, "ratings-csv-dir")
# Script entry point: only run the click command when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    load_raw_data()