RefurnishAI

Sleeping

App Files Files Community

RefurnishAI / custom_nodes /comfy_mtb /extern /frame_interpolation /training /train.py

multimodalart HF staff

Squashing commit

4450790 verified 26 days ago

raw

history blame

4.76 kB

	# Copyright 2022 Google LLC

	# Licensed under the Apache License, Version 2.0 (the "License");
	# you may not use this file except in compliance with the License.
	# You may obtain a copy of the License at

	# https://www.apache.org/licenses/LICENSE-2.0

	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.
	# ==============================================================================
	r"""The training loop for frame interpolation.

	gin_config: The gin configuration file containing model, losses and datasets.

	To run on GPUs:
	python3 -m frame_interpolation.training.train \
	--gin_config <path to network.gin> \
	--base_folder <base folder for all training runs> \
	--label <descriptive label for the run>

	To debug the training loop on CPU:
	python3 -m frame_interpolation.training.train \
	--gin_config <path to config.gin> \
	--base_folder /tmp
	--label test_run \
	--mode cpu

	The training output directory will be created at <base_folder>/<label>.
	"""
	import os

	from . import augmentation_lib
	from . import data_lib
	from . import eval_lib
	from . import metrics_lib
	from . import model_lib
	from . import train_lib
	from absl import app
	from absl import flags
	from absl import logging
	import gin.tf
	from ..losses import losses

	# Reduce tensorflow logs to ERRORs only.
	os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
	import tensorflow as tf # pylint: disable=g-import-not-at-top
	tf.get_logger().setLevel('ERROR')


	_GIN_CONFIG = flags.DEFINE_string('gin_config', None, 'Gin config file.')
	_LABEL = flags.DEFINE_string('label', 'run0',
	'Descriptive label for this run.')
	_BASE_FOLDER = flags.DEFINE_string('base_folder', None,
	'Path to checkpoints/summaries.')
	_MODE = flags.DEFINE_enum('mode', 'gpu', ['cpu', 'gpu'],
	'Distributed strategy approach.')


	@gin.configurable('training')
	class TrainingOptions(object):
	"""Training-related options."""

	def __init__(self, learning_rate: float, learning_rate_decay_steps: int,
	learning_rate_decay_rate: int, learning_rate_staircase: int,
	num_steps: int):
	self.learning_rate = learning_rate
	self.learning_rate_decay_steps = learning_rate_decay_steps
	self.learning_rate_decay_rate = learning_rate_decay_rate
	self.learning_rate_staircase = learning_rate_staircase
	self.num_steps = num_steps


	def main(argv):
	if len(argv) > 1:
	raise app.UsageError('Too many command-line arguments.')

	output_dir = os.path.join(_BASE_FOLDER.value, _LABEL.value)
	logging.info('Creating output_dir @ %s ...', output_dir)

	# Copy config file to <base_folder>/<label>/config.gin.
	tf.io.gfile.makedirs(output_dir)
	tf.io.gfile.copy(
	_GIN_CONFIG.value, os.path.join(output_dir, 'config.gin'), overwrite=True)

	gin.external_configurable(
	tf.keras.optimizers.schedules.PiecewiseConstantDecay,
	module='tf.keras.optimizers.schedules')

	gin_configs = [_GIN_CONFIG.value]
	gin.parse_config_files_and_bindings(
	config_files=gin_configs, bindings=None, skip_unknown=True)

	training_options = TrainingOptions() # pylint: disable=no-value-for-parameter

	learning_rate = tf.keras.optimizers.schedules.ExponentialDecay(
	training_options.learning_rate,
	training_options.learning_rate_decay_steps,
	training_options.learning_rate_decay_rate,
	training_options.learning_rate_staircase,
	name='learning_rate')

	# Initialize data augmentation functions
	augmentation_fns = augmentation_lib.data_augmentations()

	saved_model_folder = os.path.join(_BASE_FOLDER.value, _LABEL.value,
	'saved_model')
	train_folder = os.path.join(_BASE_FOLDER.value, _LABEL.value, 'train')
	eval_folder = os.path.join(_BASE_FOLDER.value, _LABEL.value, 'eval')

	train_lib.train(
	strategy=train_lib.get_strategy(_MODE.value),
	train_folder=train_folder,
	saved_model_folder=saved_model_folder,
	n_iterations=training_options.num_steps,
	create_model_fn=model_lib.create_model,
	create_losses_fn=losses.training_losses,
	create_metrics_fn=metrics_lib.create_metrics_fn,
	dataset=data_lib.create_training_dataset(
	augmentation_fns=augmentation_fns),
	learning_rate=learning_rate,
	eval_loop_fn=eval_lib.eval_loop,
	eval_folder=eval_folder,
	eval_datasets=data_lib.create_eval_datasets() or None)


	if __name__ == '__main__':
	app.run(main)