- What is a GPU?
- Graphics Processing Unit
- Designed for 3D computer graphics
- Computations mostly involve vector and matrix operations
- Scientists found cumbersome ways to exploit this for general-purpose computing (OpenGL, DirectX)
- CUDA (2006), OpenCL (2009), OpenACC (2012), …
“CUDA® is a parallel computing platform and programming model developed by NVIDIA for general computing on graphical processing units (GPUs). With CUDA, developers are able to dramatically speed up computing applications by harnessing the power of GPUs.”
- The result overwrites y; the R interface below will instead fill a separate output vector, because R does not typically modify objects directly

void daxpy(int n, double a, double *x, double *y) {
  for (int i = 0; i < n; i++) {
    y[i] = a * x[i] + y[i];
  }
}
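As a quick illustration (not part of the original slides), a minimal C driver for this routine might look as follows; the sizes and values are made up for the example:

#include <stdio.h>
#include <stdlib.h>

void daxpy(int n, double a, double *x, double *y);

int main(void) {
  int n = 5;
  double a = 2.0;
  double *x = malloc(n * sizeof(double));
  double *y = malloc(n * sizeof(double));
  for (int i = 0; i < n; i++) {
    x[i] = i;    // x = 0, 1, 2, 3, 4
    y[i] = 1.0;  // y = 1, 1, 1, 1, 1
  }
  daxpy(n, a, x, y);  // y now holds a*x + y = 1, 3, 5, 7, 9
  for (int i = 0; i < n; i++) {
    printf("%g\n", y[i]);
  }
  free(x);
  free(y);
  return 0;
}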
__global__ void cuda_daxpy_kernel(int n, double a, double *d_x, double *d_y) {
  int i = blockDim.x * blockIdx.x + threadIdx.x;
  if (i < n) {
    d_y[i] = a * d_x[i] + d_y[i];
  }
}
- The loop body is executed n times, each in a separate thread
- gridDim and blockDim come from somewhere else ("magic")
- The guard i < n is needed because more than n threads may be launched
- The kernel is declared __global__ and has a return type of void
- The d_ prefix means "data on device"
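As an aside not in the original slides, a common way to decouple a kernel from the exact launch configuration is a grid-stride loop; a sketch of that variant of the same kernel:

__global__ void cuda_daxpy_kernel_strided(int n, double a, double *d_x, double *d_y) {
  // Each thread starts at its global index and advances by the total number of
  // launched threads, so any grid size covers all n elements.
  for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < n;
       i += gridDim.x * blockDim.x) {
    d_y[i] = a * d_x[i] + d_y[i];
  }
}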
void cuda_daxpy(int n, double a, double *h_x, double *h_y, double *h_out) {
  // Allocate device memory
  double *d_x, *d_y;
  cudaMalloc(&d_x, n * sizeof(double));
  cudaMalloc(&d_y, n * sizeof(double));
  // Copy data from host to device
  cudaMemcpy(d_x, h_x, n * sizeof(double), cudaMemcpyHostToDevice);
  cudaMemcpy(d_y, h_y, n * sizeof(double), cudaMemcpyHostToDevice);
  // Run device function (kernel)
  int threadsPerBlock = 256;
  int blocksPerGrid = (n + threadsPerBlock - 1) / threadsPerBlock;
  cuda_daxpy_kernel<<<blocksPerGrid, threadsPerBlock>>>(n, a, d_x, d_y);
  // Copy data from device to host
  cudaMemcpy(h_out, d_y, n * sizeof(double), cudaMemcpyDeviceToHost);
  // Free device memory
  cudaFree(d_x);
  cudaFree(d_y);
}
- cudaMalloc allocates memory on the device
- cudaMemcpy copies from host to device or from device to host
- cudaFree frees allocated memory on the device
- <<<blocksPerGrid, threadsPerBlock>>> is special syntax for launching a kernel (i.e., a special compiler is needed)
- The h_ prefix means "data on host"
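One detail the slide code glosses over: these calls return a cudaError_t. A minimal checking pattern (an addition for illustration, assuming <stdio.h> is included) could look like:

#define CUDA_CHECK(call)                                          \
  do {                                                            \
    cudaError_t err = (call);                                     \
    if (err != cudaSuccess) {                                     \
      fprintf(stderr, "CUDA error at %s:%d: %s\n",                \
              __FILE__, __LINE__, cudaGetErrorString(err));       \
    }                                                             \
  } while (0)

// Usage: CUDA_CHECK(cudaMalloc(&d_x, n * sizeof(double)));
// After a kernel launch: CUDA_CHECK(cudaGetLastError());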
#include <Rinternals.h>

extern "C" SEXP C_daxpy(SEXP a, SEXP x, SEXP y) {
  // Allocate memory for result
  R_xlen_t n = Rf_xlength(y);
  SEXP out = PROTECT(Rf_allocVector(REALSXP, n));
  // Call host function code
  cuda_daxpy(n, Rf_asReal(a), REAL(x), REAL(y), REAL(out));
  // Return result
  UNPROTECT(1); // out
  return out;
}
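Optionally (not shown in the original slides), the entry point can be registered with R so that .Call resolves it cleanly after dyn.load. A sketch, assuming it lives in the same source file as C_daxpy and the shared library is named daxpy.so (so the initializer must be called R_init_daxpy):

#include <R_ext/Rdynload.h>

static const R_CallMethodDef callMethods[] = {
  {"C_daxpy", (DL_FUNC) &C_daxpy, 3},  // name, function pointer, argument count
  {NULL, NULL, 0}
};

extern "C" void R_init_daxpy(DllInfo *info) {
  R_registerRoutines(info, NULL, callMethods, NULL, NULL);
  R_useDynamicSymbols(info, FALSE);  // only registered symbols are visible
}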
- Use a development node with a GPU (dev-intel14-k20, dev-intel16-k80, dev-amd20-v100) or request a GPU using the --gres=gpu:[type:]1 option (type is optional and can be k20, k80, or v100)
- A CUDA module is already loaded, providing nvcc, nsys, nsight-sys, nvidia-smi, …
nvcc -shared -Xcompiler -fPIC \
  -I/mnt/research/quantgen/tools/software/R/4.0.3/lib64/R/include \
  -arch=compute_35 \
  -o daxpy.so daxpy.cu
dyn.load("daxpy.so")
a <- 2.0
x <- rnorm(1e8, 10)
y <- rnorm(1e8)
res <- .Call("C_daxpy", a, x, y)
nsys profile --stats=true Rscript daxpy.R  # inspect the resulting report in nsight-sys
- Same implementation as BGData's rayOLS
- Simulated height phenotype
- Chromosome 22 of the UK Biobank (409,629 x 9,927)
- Times in milliseconds
expr        min         lq       mean     median         uq       max neval
 CPU 39926.4360 40038.1847 40174.8242 40133.1558 40249.4981 40663.422    25
 GPU   334.4782   343.9245   538.7745   345.6474   346.3567  5188.819    25