# {% include 'templates/license_header' %}

import os
from typing import Optional

import click
from pipelines import (
    feature_engineering,
    inference,
    breast_cancer_training,
    breast_cancer_deployment_pipeline,
)
from zenml.client import Client
from zenml.logger import get_logger

logger = get_logger(__name__)
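

# CLI wiring for `main` (a sketch): `click` is imported above but no command is
# defined here, so these options are an assumption that simply mirrors the
# function signature one-to-one and lets the pipeline flags below be toggled
# from the command line.
@click.command()
@click.option("--train-dataset-name", default="dataset_trn", type=str)
@click.option("--train-dataset-version-name", default=None, type=str)
@click.option("--test-dataset-name", default="dataset_tst", type=str)
@click.option("--test-dataset-version-name", default=None, type=str)
@click.option("--config", default=None, type=str)
@click.option("--feature-pipeline", is_flag=True, default=False)
@click.option("--training-pipeline", is_flag=True, default=False)
@click.option("--inference-pipeline", is_flag=True, default=False)
@click.option("--deployment-pipeline", is_flag=True, default=False)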
def main(
    train_dataset_name: str = "dataset_trn",
    train_dataset_version_name: Optional[str] = None,
    test_dataset_name: str = "dataset_tst",
    test_dataset_version_name: Optional[str] = None,
    config: Optional[str] = None,
    feature_pipeline: bool = False,
    training_pipeline: bool = False,
    inference_pipeline: bool = False,
    deployment_pipeline: bool = False,
):
"""Main entry point for the pipeline execution. | |
This entrypoint is where everything comes together: | |
* configuring pipeline with the required parameters | |
(some of which may come from command line arguments, but most | |
of which comes from the YAML config files) | |
* launching the pipeline | |
""" | |
    config_folder = os.path.join(
        os.path.dirname(os.path.realpath(__file__)),
        "configs",
    )
    client = Client()

    # Execute Feature Engineering Pipeline
    if feature_pipeline:
        pipeline_args = {}
        pipeline_args["config_path"] = os.path.join(
            config_folder, "feature_engineering.yaml"
        )
        run_args_feature = {}
        feature_engineering.with_options(**pipeline_args)(**run_args_feature)
        logger.info("Feature Engineering pipeline finished successfully!")

        train_dataset_artifact = client.get_artifact_version(
            train_dataset_name
        )
        test_dataset_artifact = client.get_artifact_version(test_dataset_name)
        logger.info(
            "The latest feature engineering pipeline produced the following "
            f"artifacts: \n\n1. Train Dataset - Name: {train_dataset_name}, "
            f"Version Name: {train_dataset_artifact.version} \n2. Test Dataset: "
            f"Name: {test_dataset_name}, Version Name: {test_dataset_artifact.version}"
        )

    # Execute Training Pipeline
    if training_pipeline:
        pipeline_args = {}
        if config is None:
            pipeline_args["config_path"] = os.path.join(config_folder, "training.yaml")
        else:
            pipeline_args["config_path"] = os.path.join(config_folder, config)
        run_args_train = {}

        # If a dataset version name is specified, use versioned artifacts
        if train_dataset_version_name or test_dataset_version_name:
            # However, both train and test dataset versions must be specified
            assert (
                train_dataset_version_name is not None
                and test_dataset_version_name is not None
            )
            train_dataset_artifact = client.get_artifact_version(
                train_dataset_name, train_dataset_version_name
            )
            test_dataset_artifact = client.get_artifact_version(
                test_dataset_name, test_dataset_version_name
            )
            # Use the versioned artifacts as pipeline inputs
            run_args_train["train_dataset_id"] = train_dataset_artifact.id
            run_args_train["test_dataset_id"] = test_dataset_artifact.id

        # Optional remote-execution configuration (an assumption: the active
        # ZenML stack has a SageMaker step operator and a Docker-enabled
        # orchestrator). The step code itself stays the same; only the
        # configuration changes.
        from zenml.config import DockerSettings
        from steps import model_trainer  # assumed location of the trainer step

        step_args = {}
        # Configure which step operator should be used; an ml.m5.large
        # instance is plenty for this dataset.
        step_args["settings"] = {
            "step_operator.sagemaker": {
                "estimator_args": {"instance_type": "ml.m5.large"}
            }
        }
        # Update the step. This could also be done in YAML.
        model_trainer = model_trainer.with_options(**step_args)

        # Extra requirements to install into the Docker image for remote runs.
        docker_settings = DockerSettings(
            requirements=[
                "pyarrow",
                "scikit-learn==1.1.1",
            ],
        )
        pipeline_args["enable_cache"] = True
        pipeline_args["settings"] = {"docker": docker_settings}

        breast_cancer_training.with_options(**pipeline_args)(**run_args_train)
        logger.info("Training pipeline finished successfully!")

    # Execute Inference Pipeline
    if inference_pipeline:
        pipeline_args = {}
        if config is None:
            pipeline_args["config_path"] = os.path.join(config_folder, "inference.yaml")
        else:
            pipeline_args["config_path"] = os.path.join(config_folder, config)
        run_args_inference = {}
        inference.with_options(**pipeline_args)(**run_args_inference)
        logger.info("Inference pipeline finished successfully!")

    # Execute Deployment Pipeline
    if deployment_pipeline:
        pipeline_args = {}
        pipeline_args["config_path"] = os.path.join(config_folder, "deployment.yaml")
        run_args_deployment = {}
        breast_cancer_deployment_pipeline.with_options(**pipeline_args)(
            **run_args_deployment
        )
        logger.info("Deployment pipeline finished successfully!")


if __name__ == "__main__":
    main()
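
# Example invocations (a sketch; the flag names assume the click options above):
#   python run.py --feature-pipeline
#   python run.py --training-pipeline
#   python run.py --training-pipeline --train-dataset-version-name=1 --test-dataset-version-name=1
#   python run.py --inference-pipeline
#   python run.py --deployment-pipeline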