// File size: 1,681 Bytes (revision 462dacf)
#ifndef _util_cuh
#define _util_cuh
#include <cuda_runtime.h>
#include <cuda_fp16.h>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
// Error code stored in _cuda_err by _alloc_check when its argument evaluates
// to zero/false. Resolves to hipErrorUnknown when USE_ROCM is defined and to
// cudaErrorApiFailureBase otherwise.
#ifndef USE_ROCM
#define cudaUnspecified cudaErrorApiFailureBase
#else
#define cudaUnspecified hipErrorUnknown
#endif
// Evaluate `fn`, store its result in `_cuda_err`, and jump to the
// `_cuda_fail` label when that result is anything other than cudaSuccess.
// The enclosing function must provide both an assignable `_cuda_err`
// variable and a `_cuda_fail:` label.
#define _cuda_check(fn)                 \
    do                                  \
    {                                   \
        _cuda_err = (fn);               \
        if (cudaSuccess != _cuda_err)   \
            goto _cuda_fail;            \
    } while (false)
// Treat a zero/false/null result of `fn` as a failure: set `_cuda_err` to
// cudaUnspecified and jump to the `_cuda_fail` label. Any other result sets
// `_cuda_err` to cudaSuccess and falls through.
// The enclosing function must provide both an assignable `_cuda_err`
// variable and a `_cuda_fail:` label.
#define _alloc_check(fn)                    \
    do                                      \
    {                                       \
        if (fn)                             \
        {                                   \
            _cuda_err = cudaSuccess;        \
        }                                   \
        else                                \
        {                                   \
            _cuda_err = cudaUnspecified;    \
            goto _cuda_fail;                \
        }                                   \
    } while (false)
// Clone host -> device.
//
// Allocates a device buffer of `num` elements of type T with cudaMalloc and
// fills it by copying `num * sizeof(T)` bytes from the host address `ptr`.
//
// Returns the new device pointer on success; the caller owns it and is
// expected to release it with cudaFree. Returns nullptr if `num` is negative,
// if the allocation fails, or if the copy fails. When the copy fails the
// freshly allocated device buffer is released before returning, so nothing
// leaks.
template <typename T>
T* cuda_clone(const void* ptr, int num)
{
    // A negative count would otherwise wrap to an enormous size_t below.
    if (num < 0) return nullptr;

    const size_t bytes = static_cast<size_t>(num) * sizeof(T);
    T* cuda_ptr = nullptr;

    if (cudaMalloc(&cuda_ptr, bytes) != cudaSuccess) return nullptr;

    if (cudaMemcpy(cuda_ptr, ptr, bytes, cudaMemcpyHostToDevice) != cudaSuccess)
    {
        // Do not leak the device allocation on a failed upload.
        cudaFree(cuda_ptr);
        return nullptr;
    }

    // The synchronize result is not checked; the pointer is returned either way.
    cudaDeviceSynchronize();
    return cuda_ptr;
}
// Clone device -> host.
//
// Allocates a host buffer of `num` elements of type T with malloc and fills
// it by copying `num * sizeof(T)` bytes from the device address `ptr`.
//
// Returns the new host pointer on success; the caller owns it and is expected
// to release it with free. Returns NULL/nullptr if `num` is negative, if
// malloc returns NULL, or if the copy fails. When the copy fails the host
// buffer is freed before returning, so nothing leaks.
template <typename T>
T* cpu_clone(const void* ptr, int num)
{
    // A negative count would otherwise wrap to an enormous size_t below.
    if (num < 0) return nullptr;

    const size_t bytes = static_cast<size_t>(num) * sizeof(T);

    T* cpu_ptr = static_cast<T*>(malloc(bytes));
    if (cpu_ptr == nullptr) return nullptr;

    if (cudaMemcpy(cpu_ptr, ptr, bytes, cudaMemcpyDeviceToHost) != cudaSuccess)
    {
        // Do not leak the host allocation on a failed download.
        free(cpu_ptr);
        return nullptr;
    }

    // The synchronize result is not checked; the pointer is returned either way.
    cudaDeviceSynchronize();
    return cpu_ptr;
}
// Host-side helper that builds a __half2 from two __half values.
// The 16-bit storage of `h1` is placed in the first (x) lane and that of
// `h2` in the second (y) lane of a ushort2, whose bytes are then
// reinterpreted as a __half2.
__host__ inline __half2 pack_half2(__half h1, __half h2)
{
    ushort2 lanes;
    lanes.x = *reinterpret_cast<unsigned short*>(&h1);
    lanes.y = *reinterpret_cast<unsigned short*>(&h2);

    __half2* packed = reinterpret_cast<__half2*>(&lanes);
    return *packed;
}
#endif // _util_cuh