// task_group_split_kernel and the tail of its host-side launch code.
//
// Visible part of the kernel:
//   - Parameters: group (int*), value (bool*), output (bool*), and the
//     integer sizes batch_size, task_num and group_num.
//   - `group` and `value` are advanced by blockIdx.x * task_num, so each block
//     reads from its own task_num-element offset into those arrays
//     (presumably one block per batch entry -- TODO confirm against the
//     launch configuration).
//   - `temp` is an `extern __shared__` bool array, i.e. its size is supplied
//     by the launch configuration, which is not visible here.
//   - `split` is a block-shared flag that thread 0 initializes to false.
//   - A loop over `i` begins at threadIdx.x; its condition, step and body are
//     not visible.
//
// Visible part of the host code:
//   - A launch passes (group, value, output, batch_size, task_num, group_num)
//     and then checks cudaGetLastError() through GRL_CHECK_CUDA.
//
// NOTE(review): the text between `for(int i=threadIdx.x; i` and `>>(` is
// missing. It looks as if everything from the loop condition's `<` up to the
// `<<<...>>>` launch configuration was stripped, which would take the rest of
// the kernel body and the launcher's signature with it. Restore this file
// from version control before changing any logic here.
#include "task_group_split.h" __global__ void task_group_split_kernel( int* group, bool* value, bool* output, const int batch_size, const int task_num, const int group_num) { group += blockIdx.x * task_num; value += blockIdx.x * task_num; extern __shared__ bool temp[]; __shared__ bool split; if(threadIdx.x == 0) split = false; for(int i=threadIdx.x; i>>( group, value, output, batch_size, task_num, group_num); GRL_CHECK_CUDA(cudaGetLastError()); };