#include "task_group_priority.h"

// std::numeric_limits is used in device code below; this requires <limits> and
// compilation with --expt-relaxed-constexpr (or an INT_MAX-style constant instead).
#include <limits>

// For each batch element (one CUDA block per element), mark every task whose
// priority is not the minimum priority of its group. Tasks whose value flag is
// set are skipped when computing the group minima.
__global__ void task_group_priority_kernel(
        int* group, int* priority, bool* value, bool* output,
        int batch_size, int task_num, int group_num)
{
    // Offset the raw pointers to this block's batch element.
    group += blockIdx.x * task_num;
    priority += blockIdx.x * task_num;
    value += blockIdx.x * task_num;
    output += blockIdx.x * task_num;

    // One shared-memory slot per group, holding that group's minimum priority.
    extern __shared__ int temp[];

    for(int i=threadIdx.x; i<group_num; i+=blockDim.x)
    {
        temp[i] = std::numeric_limits<int>::max();
    }

    __syncthreads();

    // Reduce each group's minimum priority over tasks whose value flag is not set.
    for(int i=threadIdx.x; i<task_num; i+=blockDim.x){
        if(value[i]){
            continue;
        }
        int g = group[i];
        int p = priority[i];
        atomicMin(&temp[g], p);
    }

    __syncthreads();

    // A task is flagged (true) unless it holds its group's minimum priority.
    for(int i=threadIdx.x; i<task_num; i+=blockDim.x){
        int g = group[i];
        output[i] = priority[i] != temp[g];
    }
}
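// Worked example (single batch element, group_num = 2):
//   group    = [0, 0, 1, 1]
//   priority = [2, 5, 1, 3]
//   value    = [false, false, false, false]
// yields temp = [2, 1] and output = [false, true, false, true]: only the task
// with the lowest priority value in each group is left unflagged.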

// Per-thread variant: each thread handles one batch element p and keeps its own
// NG minimum-priority slots in dynamic shared memory. result[p][t] is only ever
// written to false (for tasks that hold their group's minimum priority among
// tasks with an unset value flag); all other entries are left untouched.
template<typename _Tg, typename _Tp>
__global__ void cuda_do_task_group_priority(
        const torch::PackedTensorAccessor<_Tg,2,torch::RestrictPtrTraits> group,
        const torch::PackedTensorAccessor<_Tp,2,torch::RestrictPtrTraits> priority,
        const torch::PackedTensorAccessor<bool,2,torch::RestrictPtrTraits> value,
        torch::PackedTensorAccessor<bool,2,torch::RestrictPtrTraits> result,
        const _Tg NG)
{
    const int NP = group.size(0);   // number of batch elements
    const int NT = group.size(1);   // number of tasks per element
    const int p = blockIdx.x * blockDim.x + threadIdx.x;
    if(p < NP)
    {
        // This thread's NG-slot slice of the dynamic shared memory.
        extern __shared__ char _temp[];
        auto temp = reinterpret_cast<_Tp*>(_temp);
        temp += (threadIdx.x * NG);
        for(_Tg g=0; g<NG; g++){
            temp[g] = std::numeric_limits<_Tp>::max();
        }

        // Minimum priority per group, over tasks whose value flag is not set.
        for(int t=0; t<NT; t++){
            if(value[p][t]){
                continue;
            }
            _Tg g = group[p][t];
            _Tp _p = priority[p][t];
            if(_p < temp[g]){
                temp[g] = _p;
            }
        }

        // Clear the flag for tasks that match their group's minimum priority.
        for(int t=0; t<NT; t++){
            _Tg g = group[p][t];
            if(priority[p][t]==temp[g]){
                result[p][t] = false;
            }
        }
    }
}
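
// The templated kernel above has no launch site in this file. The sketch below is
// a minimal, hedged illustration of how it could be dispatched: the function name
// launch_task_group_priority and the 1-D launch configuration are assumptions, not
// part of the original interface. It assumes result is pre-filled with true and
// that the torch headers are reachable via task_group_priority.h.
template<typename _Tg, typename _Tp>
void launch_task_group_priority(
        const torch::Tensor& group, const torch::Tensor& priority,
        const torch::Tensor& value, torch::Tensor& result, const _Tg NG)
{
    const int NP = group.size(0);
    if(NP == 0){
        return;
    }
    const int threads = 128;
    const int blocks = (NP + threads - 1) / threads;
    // Each thread needs NG slots of _Tp in dynamic shared memory.
    const size_t shared_mem = static_cast<size_t>(threads) * NG * sizeof(_Tp);

    cuda_do_task_group_priority<_Tg,_Tp><<<blocks, threads, shared_mem>>>(
        group.packed_accessor<_Tg,2,torch::RestrictPtrTraits>(),
        priority.packed_accessor<_Tp,2,torch::RestrictPtrTraits>(),
        value.packed_accessor<bool,2,torch::RestrictPtrTraits>(),
        result.packed_accessor<bool,2,torch::RestrictPtrTraits>(),
        NG);
}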



// Host-side launcher for the raw-pointer kernel: one block of 256 threads per
// batch element, with one int of dynamic shared memory per group.
void task_group_priority_cuda(
        int* group, int* priority, bool* value, bool* output,
        const int batch_size, const int task_num, const int group_num, const int device)
{
    const int shared_mem = group_num * sizeof(int);

    GRL_CHECK_CUDA(cudaSetDevice(device));

    task_group_priority_kernel<<<batch_size, 256, shared_mem>>>(
        group, priority, value, output, batch_size, task_num, group_num);

    GRL_CHECK_CUDA(cudaGetLastError());
}