
Advanced CUDA Programming

Memory management

  • Use the following CUDA runtime calls:
    • cudaMalloc: allocate device memory
    • cudaFree: free device memory
    • cudaMemcpy: copy memory between host and device

Example: Addition on device

__global__ void add(int *a, int *b, int *c) {
  // Since this runs on the device, a, b and c must
  // point to device memory
  *c = *a + *b;
}

int main(void) {
  int a, b, c;          // host copies of a, b, c
  int *d_a, *d_b, *d_c; // device copies of a, b, c
  int size = sizeof(int);
  // Allocate space for device copies of a, b, c
  cudaMalloc((void **)&d_a, size);
  cudaMalloc((void **)&d_b, size);
  cudaMalloc((void **)&d_c, size);
  // Setup input values
  a = 2;
  b = 7;
  // Copy inputs to device
  cudaMemcpy(d_a, &a, size, cudaMemcpyHostToDevice);
  cudaMemcpy(d_b, &b, size, cudaMemcpyHostToDevice);
  // Launch add() kernel on GPU
  add<<<1, 1>>>(d_a, d_b, d_c);
  // Copy result back to host
  cudaMemcpy(&c, d_c, size, cudaMemcpyDeviceToHost);
  // Cleanup
  cudaFree(d_a);
  cudaFree(d_b);
  cudaFree(d_c);
  return 0;
}

Memory in GPU and CPU

  • Host and device cannot dereference a pointer that points to memory on the other side:
    • Device pointers point to GPU memory; they can be passed to and from the host, but cannot be dereferenced by the host
  • Use the CUDA runtime API (cudaMalloc, cudaFree, cudaMemcpy) to manage device memory, as in the sketch below
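
A minimal sketch of that rule, with illustrative names h_x and d_x: the host allocates device memory and moves data with cudaMemcpy, but never dereferences the device pointer itself.

int h_x = 5;
int *d_x; // device pointer: fine to pass around on the host, not to dereference
cudaMalloc((void **)&d_x, sizeof(int));
// *d_x = h_x;   // WRONG: dereferencing a device pointer on the host is invalid
cudaMemcpy(d_x, &h_x, sizeof(int), cudaMemcpyHostToDevice); // correct way to write it
cudaFree(d_x);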

Heterogeneous programming

  • "Heterogeneous" because both the host (CPU) and the device (GPU) are used
  • A program therefore mixes serial code, executed by the host CPU, with parallel kernel code, executed by the GPU (see the skeleton below)
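
A skeletal sketch of that split (the kernel name and sizes are made up for illustration): serial code runs on the host CPU before and after the kernel launch, and the kernel body runs in parallel on the GPU.

__global__ void kernel(float *data) {
  // parallel part: every launched thread runs this body on the GPU
}

int main(void) {
  float *d_data;
  cudaMalloc((void **)&d_data, 256 * sizeof(float)); // serial host code (CPU)
  kernel<<<2, 128>>>(d_data);                        // parallel part (GPU)
  cudaDeviceSynchronize();                           // serial host code resumes (CPU)
  cudaFree(d_data);
  return 0;
}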

Parallel Computing with CUDA

Parallel Programming

  • Kernel: a function executed on the GPU as an array of threads running in parallel
  • Threads: all execute the same code, but can take different paths
    • Each thread has an ID, used to select data and make control decisions (see the indexing sketch after this list)
  • Blocks: threads are grouped into blocks; each block is scheduled onto one streaming multiprocessor
  • Grids: blocks are grouped into a grid
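
As referenced in the list above, a small sketch (the function name, element count n and block size are assumptions) of how each thread combines blockIdx, blockDim and threadIdx into a unique global index:

__global__ void scale(float *v, float factor, int n) {
  // each thread computes its own global index and handles one element
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) { // guard: the last block may have more threads than elements
    v[i] = v[i] * factor;
  }
}

// launch enough blocks of 256 threads to cover n elements, e.g.:
// scale<<<(n + 255) / 256, 256>>>(d_v, 2.0f, n);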

Iterations

#include <stdio.h>
__global__ void add(int *a, int *b, int *c) {
  int bid = blockIdx.x;
  printf("bid: %d\n", bid);
  if (bid < 6) { // a for loop over the elements would also work here
    c[bid] = a[bid] + b[bid];
    printf("c: %d\n", c[bid]);
  }
}

int main(void) {
  int a[6], b[6], c[6];
  int *dev_a, *dev_b, *dev_c;
  // allocate memory to device
  cudaMalloc((void **)&dev_a, 6 * sizeof(int));
  cudaMalloc((void **)&dev_b, 6 * sizeof(int));
  cudaMalloc((void **)&dev_c, 6 * sizeof(int));
  // Fill arrays "a" and "b" with values on the host
  for (int i = 0; i < 6; i++) {
    a[i] = i;
    b[i] = i * i;
  }
  // Copy arrays "a" and "b" to the device
  cudaMemcpy(dev_a, a, 6 * sizeof(int), cudaMemcpyHostToDevice);
  cudaMemcpy(dev_b, b, 6 * sizeof(int), cudaMemcpyHostToDevice);
  // Launch the kernel
  add<<<6, 1>>>(dev_a, dev_b, dev_c); // one block per element
  // Copy the array "c" from the device to the host
  cudaMemcpy(c, dev_c, 6 * sizeof(int), cudaMemcpyDeviceToHost);

  // Print the array "c"
  for (int i = 0; i < 6; i++) {
    printf("%d\n", c[i]);
  }
  // Free memory allocated to the device
  cudaFree(dev_a);
  cudaFree(dev_b);
  cudaFree(dev_c);
  return 0;
} // End main

Assumptions:

  • Assume CUDA threads execute on a physically separate device from the host
  • Assume the host and the device each have their own separate memory space in DRAM

Thread vs. Block

  • The number of threads per block is limited, while the number of blocks is not (extra blocks simply wait until earlier blocks finish)
  • Threads within a block can cooperate through shared memory; blocks have no shared memory between them and can only communicate through global memory (see the sketch below)
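
A minimal sketch of the shared-memory point, assuming the kernel is launched with 256 threads per block (the name blockSum and the sizes are illustrative): threads in one block cooperate through __shared__ memory and __syncthreads(), while anything another block should see has to go through global memory.

__global__ void blockSum(int *in, int *out) {
  __shared__ int buf[256];              // visible only to threads of this block
  int tid = threadIdx.x;
  buf[tid] = in[blockIdx.x * blockDim.x + tid];
  __syncthreads();                      // wait until the whole block has written buf
  // tree reduction inside the block
  for (int stride = blockDim.x / 2; stride > 0; stride /= 2) {
    if (tid < stride) {
      buf[tid] += buf[tid + stride];
    }
    __syncthreads();
  }
  if (tid == 0) {
    out[blockIdx.x] = buf[0];           // other blocks can only read this from global memory
  }
}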

Examples and tasks