Advanced CUDA Programming
Memory management
- Use the following calls:
  - cudaMalloc: Allocate device memory
  - cudaFree: Free device memory
  - cudaMemcpy: Copy memory between host and device
Example: Addition on device
__global__ void add(int *a, int *b, int *c) {
// Since this runs on the device, a, b, and c must all point to
// device memory
*c = *a + *b;
}
int main(void) {
int a, b, c;
// host copies of a, b, c
int *d_a, *d_b, *d_c; // device copies of a, b, c
int size = sizeof(int);
// Allocate space for device copies of a, b, c
cudaMalloc((void **)&d_a, size);
cudaMalloc((void **)&d_b, size);
cudaMalloc((void **)&d_c, size);
// Setup input values
a = 2;
b = 7;
// Copy inputs to device
cudaMemcpy(d_a, &a, size, cudaMemcpyHostToDevice);
cudaMemcpy(d_b, &b, size, cudaMemcpyHostToDevice);
// Launch add() kernel on GPU
add<<<1, 1>>>(d_a, d_b, d_c);
// Copy result back to host
cudaMemcpy(&c, d_c, size, cudaMemcpyDeviceToHost);
// Cleanup
cudaFree(d_a);
cudaFree(d_b);
cudaFree(d_c);
return 0;
}
Memory in GPU and CPU
- Host code and device code cannot dereference a pointer to memory that is not on their own side:
- Device pointers point to GPU memory; they can be passed to and from host code, but they cannot be dereferenced by the host
- Use the CUDA runtime API (cudaMalloc, cudaFree, cudaMemcpy) to manage device memory (a minimal sketch follows this list)
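A minimal sketch of this rule (the square kernel and the cudaGetErrorString check are illustrative additions, not part of the lecture example): the device pointer d_x may only be dereferenced inside the kernel, and the host must copy the value back before reading it.
#include <stdio.h>
__global__ void square(int *x) {
// Dereferencing the device pointer is legal here, because this code runs on the device
*x = (*x) * (*x);
}
int main(void) {
int h_x = 5; // host copy
int *d_x; // device pointer: refers to GPU memory
cudaError_t err = cudaMalloc((void **)&d_x, sizeof(int));
if (err != cudaSuccess) {
printf("cudaMalloc failed: %s\n", cudaGetErrorString(err));
return 1;
}
cudaMemcpy(d_x, &h_x, sizeof(int), cudaMemcpyHostToDevice);
square<<<1, 1>>>(d_x);
// printf("%d\n", *d_x); // WRONG: the host cannot dereference a device pointer
cudaMemcpy(&h_x, d_x, sizeof(int), cudaMemcpyDeviceToHost); // correct: copy the result back first
printf("%d\n", h_x); // prints 25
cudaFree(d_x);
return 0;
}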
Heterogeneous programming
- CUDA programs are heterogeneous because both the host CPU and the device GPU are used
- The code mixes serial parts, executed by the host CPU, with parallel kernels, executed by the GPU (a minimal skeleton is sketched after this list)
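A minimal skeleton of that split, assuming device-side printf and a cudaDeviceSynchronize call (both standard CUDA, but not part of the lecture notes):
#include <stdio.h>
__global__ void device_part(void) {
printf("parallel code: running on the GPU\n"); // device-side printf, one line per thread
}
int main(void) {
printf("serial code: running on the host CPU\n");
device_part<<<1, 4>>>(); // kernel launch: 4 threads execute in parallel on the GPU
cudaDeviceSynchronize(); // the host waits here until the kernel has finished
printf("serial code again: back on the host CPU\n");
return 0;
}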
Parallel computing with CUDA
Parallel Programming
- Kernel: executed on GPU, as an array of threads in parallel
- Threads: execute the same code, but can take different paths
- Each thread has an ID, used to select data and make control decisions (see the indexing sketch after this list)
- Blocks: threads are grouped into blocks; each block is scheduled onto a streaming multiprocessor
- Grids: blocks are grouped into a grid
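A minimal sketch of how these IDs combine into a global element index (the scale kernel and the launch sizes are illustrative assumptions):
__global__ void scale(float *d_data, int n) {
// blockIdx.x = which block, blockDim.x = threads per block, threadIdx.x = position inside the block
int i = blockIdx.x * blockDim.x + threadIdx.x;
if (i < n) { // guard: the grid may contain more threads than elements
d_data[i] = 2.0f * d_data[i];
}
}
// Example launch: a grid of 2 blocks with 3 threads each covers n = 6 elements
// scale<<<2, 3>>>(d_data, 6);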
Iterations
#include <stdio.h>
__global__ void add(int *a, int *b, int *c) {
int bid = blockIdx.x;
printf("bid: %d\n", bid);
if (bid < 6) { // guard the index; a single-thread for loop could be used instead (see the sketch after this example)
c[bid] = a[bid] + b[bid];
printf("c: %d\n", c[bid]);
}
}
int main(void) {
int a[6], b[6], c[6];
int *dev_a, *dev_b, *dev_c;
// allocate memory to device
cudaMalloc((void **)&dev_a, 6 * sizeof(int));
cudaMalloc((void **)&dev_b, 6 * sizeof(int));
cudaMalloc((void **)&dev_c, 6 * sizeof(int));
// Fill arrays "a" and "b" with values on the host
for (int i = 0; i < 6; i++) {
a[i] = i;
b[i] = i * i;
}
// Copy arrays "a" and "b" to the device
cudaMemcpy(dev_a, a, 6 * sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(dev_b, b, 6 * sizeof(int), cudaMemcpyHostToDevice);
// Launch the kernel
add<<<6, 1>>>(dev_a, dev_b, dev_c); // one block per array element (4 blocks would leave c[4] and c[5] uncomputed)
// Copy the array "c" from the device to the host
cudaMemcpy(c, dev_c, 6 * sizeof(int), cudaMemcpyDeviceToHost);
// Print the array "c"
for (int i = 0; i < 6; i++) {
printf("%d\n", c[i]);
}
// Free memory allocated to the device
cudaFree(dev_a);
cudaFree(dev_b);
cudaFree(dev_c);
return 0;
} // End main
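As the comment in add() hints, the same result could come from a single thread that loops over all six elements; a minimal sketch of that alternative (correct but serial, so it gives up the parallelism):
__global__ void add_loop(int *a, int *b, int *c, int n) {
// One thread iterates over the whole array instead of one block per element
for (int i = 0; i < n; i++) {
c[i] = a[i] + b[i];
}
}
// Launched with a single block and a single thread:
// add_loop<<<1, 1>>>(dev_a, dev_b, dev_c, 6);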
Assumptions:
- Assume CUDA threads execute on a physically separate device from the host
- Assume the host and the device each have their own separate memory space in DRAM
Thread vs. Block
- The number of threads per block is limited, while the number of blocks is not (excess blocks wait until earlier blocks finish)
- Threads within a block can communicate through shared memory; blocks do not share memory with each other and can only communicate through global memory (see the shared-memory sketch below)
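A minimal sketch of block-level shared memory (the reversal kernel and the fixed size of 64 are illustrative assumptions): every thread in a block sees the same tile array, while threads in other blocks cannot.
__global__ void reverse_in_block(int *d_data) {
__shared__ int tile[64]; // shared memory: visible to all threads of this block only
int t = threadIdx.x;
tile[t] = d_data[t]; // each thread writes one element into shared memory
__syncthreads(); // wait until every thread in the block has written its element
d_data[t] = tile[63 - t]; // now it is safe to read another thread's element
}
// Launch with one block of 64 threads: reverse_in_block<<<1, 64>>>(d_data);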