---
# Recorded results grouped under the `test` key. Each sequence entry is one
# metric record; this file currently holds a single record.
test:
  - dataset_name: gsm8k
    dataset_type: text
    split_type: test
    metric_type: perplexity
    # Numeric (float) metric value, kept unquoted so it stays a number.
    value: 1.726
    # Explicit empty string (not null). NOTE(review): presumably a display
    # format hint for `value` - confirm against the consumer.
    format: ''
    # NOTE(review): presumably marks this as the headline metric - confirm.
    primary: true