- What is a GPU?
- Graphical Processing Unit
- 3D computer graphics
- Mostly computations involve vector and matrix operations
- Scientists find cumbersome ways to exploit this (OpenGL, DirectX)
- CUDA (2006), OpenCL (2009), OpenACC (2012), …
Apr 9, 2021
“CUDA® is a parallel computing platform and programming model developed by NVIDIA for general computing on graphical processing units (GPUs). With CUDA, developers are able to dramatically speed up computing applications by harnessing the power of GPUs.”
Note: daxpy overwrites y in place; the R wrapper below instead returns a new vector, because R does not typically modify objects directly
/*
 * daxpy: y := a * x + y (double-precision "a times x plus y"), on the CPU.
 *
 * n: number of elements in x and y
 * a: scalar multiplier
 * x: input vector of length n (read-only, hence const)
 * y: input/output vector of length n, updated in place
 *
 * A non-positive n is a no-op.
 */
void daxpy(int n, double a, const double *x, double *y) {
    for (int i = 0; i < n; i++) {
        y[i] = a * x[i] + y[i];
    }
}
/*
 * CUDA kernel: d_y := a * d_x + d_y, one element per thread.
 *
 * Expects a 1-D grid of 1-D blocks.  Threads whose global index falls past
 * the end of the vectors do nothing, since the grid rarely divides n evenly.
 * d_x and d_y must point to device memory holding at least n doubles.
 */
__global__ void cuda_daxpy_kernel(int n, double a, double *d_x, double *d_y) {
    int idx = blockDim.x * blockIdx.x + threadIdx.x;
    if (idx < n) {
        d_y[idx] = a * d_x[idx] + d_y[idx];
    }
}
n
times, each in a separate thread. gridDim
and blockDim
come from somewhere else (“magic”). n
__global__
and has a return type of void
d_
// Note: the "d_" prefix means "data on device".
#include <stdio.h>
#include <stdlib.h>

// Abort with a diagnostic if a CUDA runtime call fails.  CUDA errors are
// otherwise silent and sticky: every later call fails mysteriously.
#define CUDA_CHECK(call)                                                      \
    do {                                                                      \
        cudaError_t err_ = (call);                                            \
        if (err_ != cudaSuccess) {                                            \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,     \
                    cudaGetErrorString(err_));                                \
            abort();                                                          \
        }                                                                     \
    } while (0)

/*
 * Host wrapper for cuda_daxpy_kernel: computes h_out = a * h_x + h_y on the GPU.
 *
 * n:     number of elements in each vector
 * a:     scalar multiplier
 * h_x:   host input vector of length n
 * h_y:   host input vector of length n
 * h_out: host output vector of length n (receives the result)
 *
 * Copies the inputs to the device, launches the kernel, and copies the
 * result back into h_out.  Every CUDA API call is checked; a failure aborts.
 */
void cuda_daxpy(int n, double a, double *h_x, double *h_y, double *h_out) {
    // Allocate device memory
    double *d_x, *d_y;
    CUDA_CHECK(cudaMalloc(&d_x, n * sizeof(double)));
    CUDA_CHECK(cudaMalloc(&d_y, n * sizeof(double)));

    // Copy data from host to device
    CUDA_CHECK(cudaMemcpy(d_x, h_x, n * sizeof(double), cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_y, h_y, n * sizeof(double), cudaMemcpyHostToDevice));

    // Run device function (kernel); ceil-divide so the grid covers all n elements
    int threadsPerBlock = 256;
    int blocksPerGrid = (n + (threadsPerBlock - 1)) / threadsPerBlock;
    cuda_daxpy_kernel<<<blocksPerGrid, threadsPerBlock>>>(n, a, d_x, d_y);
    // Kernel launches do not return an error code directly; check explicitly.
    CUDA_CHECK(cudaGetLastError());

    // Copy data from device to host (this blocking copy also waits for the kernel)
    CUDA_CHECK(cudaMemcpy(h_out, d_y, n * sizeof(double), cudaMemcpyDeviceToHost));

    // Free device memory
    CUDA_CHECK(cudaFree(d_x));
    CUDA_CHECK(cudaFree(d_y));
}
cudaMalloc
to allocate memory on device. cudaMemcpy
to copy from host to device or device to host. cudaFree
to free up allocated memory on device. <<<blocksPerGrid, threadsPerBlock>>>
is special syntax for launching a kernel (i.e., a special compiler is needed). h_
prefix means “data on host”#include <Rinternals.h> extern "C" SEXP C_daxpy(SEXP a, SEXP x, SEXP y) { // Allocate memory for result R_xlen_t n = Rf_xlength(y); SEXP out = PROTECT(Rf_allocVector(REALSXP, n)); // Call host function code cuda_daxpy(n, Rf_asReal(a), REAL(x), REAL(y), REAL(out)); // Return result UNPROTECT(1); // out return out; }
dev-intel14-k20
, dev-intel16-k80
, dev-amd20-v100
) or request a GPU using the --gres=gpu:[type:]1
option (type
is optional and can be k20
, k80
, or v100
). CUDA
module is already loaded: nvcc
, nsys
, nsight-sys
, nvidia-smi
, …nvcc -shared -Xcompiler -fPIC -I/mnt/research/quantgen/tools/software/R/4.0.3/lib64/R/include -arch=compute_35 -o daxpy.so daxpy.cu
# Load the compiled CUDA shared library (built with nvcc; see compile command)
dyn.load("daxpy.so")

# Scalar multiplier and two random input vectors of 1e8 doubles
a <- 2.0
x <- rnorm(1e8, 10)
y <- rnorm(1e8)

# res <- a * x + y, computed on the GPU through the C entry point C_daxpy
res <- .Call("C_daxpy", a, x, y)
# Profile the R script with Nsight Systems; --stats=true prints a summary,
# and the generated report can be inspected in the nsight-sys GUI
nsys profile --stats=true Rscript daxpy.R # nsight-sys
Same implementation as BGData rayOLS
Simulated height
Chromosome 22 of UK Biobank (409,629 x 9,927)
Time in milliseconds
expr min lq mean median uq max neval CPU 39926.4360 40038.1847 40174.8242 40133.1558 40249.4981 40663.422 25 GPU 334.4782 343.9245 538.7745 345.6474 346.3567 5188.819 25