// GreedRL/csrc/task_group_priority.cu
#include "task_group_priority.h"

#include <limits>
// One block per batch element: marks each task whose priority is not the
// minimum priority among its group's unfinished tasks (value[i] == false).
__global__ void task_group_priority_kernel(
    int* group, int* priority, bool* value, bool* output,
    int batch_size, int task_num, int group_num)
{
    // Offset the pointers to this block's batch element.
    group += blockIdx.x * task_num;
    priority += blockIdx.x * task_num;
    value += blockIdx.x * task_num;
    output += blockIdx.x * task_num;

    // Dynamic shared memory: one minimum-priority slot per group.
    extern __shared__ int temp[];
    for (int i = threadIdx.x; i < group_num; i += blockDim.x) {
        temp[i] = std::numeric_limits<int>::max();
    }
    __syncthreads();

    // Reduce the minimum priority of the unfinished tasks in each group.
    for (int i = threadIdx.x; i < task_num; i += blockDim.x) {
        if (value[i]) {
            continue;
        }
        int g = group[i];
        int p = priority[i];
        atomicMin(&temp[g], p);
    }
    __syncthreads();

    // A task is masked (output = true) unless its priority equals its group's minimum.
    for (int i = threadIdx.x; i < task_num; i += blockDim.x) {
        int g = group[i];
        output[i] = priority[i] != temp[g];
    }
}
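// Worked example of the kernel above, assuming a single batch element with
// task_num = 4 and group_num = 2:
//   group    = [0, 0, 1, 1]
//   priority = [2, 1, 5, 3]
//   value    = [false, false, true, false]
// The per-group minima over unfinished tasks are {0: 1, 1: 3}, so
//   output   = [true, false, true, false]
// i.e. only the lowest-priority pending task of each group stays unmasked.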
// Templated variant: one thread per batch element, each using a private slice
// of dynamic shared memory (blockDim.x * NG entries of _Tp) as its per-group
// minimum table. It only clears entries whose priority equals the group
// minimum, so `result` is presumably pre-initialized to true by the caller.
template<typename _Tg, typename _Tp>
__global__ void cuda_do_task_group_priority(
    const torch::PackedTensorAccessor<_Tg, 2, torch::RestrictPtrTraits> group,
    const torch::PackedTensorAccessor<_Tp, 2, torch::RestrictPtrTraits> priority,
    const torch::PackedTensorAccessor<bool, 2, torch::RestrictPtrTraits> value,
    torch::PackedTensorAccessor<bool, 2, torch::RestrictPtrTraits> result,
    const _Tg NG)
{
    const int NP = group.size(0);  // batch size
    const int NT = group.size(1);  // tasks per batch element
    const int p = blockIdx.x * blockDim.x + threadIdx.x;
    if (p < NP)
    {
        // This thread's private per-group minimum table in shared memory.
        extern __shared__ char _temp[];
        auto temp = reinterpret_cast<_Tp*>(_temp) + threadIdx.x * NG;
        for (_Tg g = 0; g < NG; g++) {
            temp[g] = std::numeric_limits<_Tp>::max();
        }
        // Minimum priority among the unfinished tasks of each group.
        for (int t = 0; t < NT; t++) {
            if (value[p][t]) {
                continue;
            }
            _Tg g = group[p][t];
            _Tp _p = priority[p][t];
            if (_p < temp[g]) {
                temp[g] = _p;
            }
        }
        // Unmask the tasks whose priority equals their group's minimum.
        for (int t = 0; t < NT; t++) {
            _Tg g = group[p][t];
            if (priority[p][t] == temp[g]) {
                result[p][t] = false;
            }
        }
    }
}
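// Illustrative sketch (not part of the original file) of how the templated
// kernel above could be launched from torch::Tensor inputs. The function name
// `task_group_priority_torch_cuda`, the int/int instantiation, and the fixed
// block size of 32 threads are assumptions for illustration only.
void task_group_priority_torch_cuda(
    const torch::Tensor& group, const torch::Tensor& priority,
    const torch::Tensor& value, torch::Tensor& result, const int group_num)
{
    const int NP = group.size(0);
    const int threads = 32;
    const int blocks = (NP + threads - 1) / threads;
    // Each thread needs group_num slots of the priority type in shared memory.
    const size_t shared_mem = threads * group_num * priority.element_size();
    // The kernel only clears entries, so start from an all-true mask.
    result.fill_(true);
    cuda_do_task_group_priority<int, int><<<blocks, threads, shared_mem>>>(
        group.packed_accessor<int, 2, torch::RestrictPtrTraits>(),
        priority.packed_accessor<int, 2, torch::RestrictPtrTraits>(),
        value.packed_accessor<bool, 2, torch::RestrictPtrTraits>(),
        result.packed_accessor<bool, 2, torch::RestrictPtrTraits>(),
        group_num);
    GRL_CHECK_CUDA(cudaGetLastError());
}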
// Host launcher: one block of 256 threads per batch element, with group_num
// ints of dynamic shared memory for the per-group minimum table.
void task_group_priority_cuda(
    int* group, int* priority, bool* value, bool* output,
    const int batch_size, const int task_num, const int group_num, const int device)
{
    const int shared_mem = group_num * sizeof(int);
    GRL_CHECK_CUDA(cudaSetDevice(device));
    task_group_priority_kernel<<<batch_size, 256, shared_mem>>>(
        group, priority, value, output, batch_size, task_num, group_num);
    GRL_CHECK_CUDA(cudaGetLastError());
}
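// Illustrative usage sketch (assumed, not from the original source): invoking
// the raw-pointer launcher from contiguous int32/bool CUDA tensors. The
// wrapper name `task_group_priority_example` is hypothetical.
void task_group_priority_example(
    torch::Tensor group, torch::Tensor priority,
    torch::Tensor value, torch::Tensor output, const int group_num)
{
    task_group_priority_cuda(
        group.data_ptr<int>(), priority.data_ptr<int>(),
        value.data_ptr<bool>(), output.data_ptr<bool>(),
        group.size(0), group.size(1), group_num, group.get_device());
}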