diff --git a/2-2-cuda.md b/2-2-cuda.md
new file mode 100644
index 0000000..e1e374e
--- /dev/null
+++ b/2-2-cuda.md
@@ -0,0 +1,138 @@
+# CUDA
+
+## Definition
+
+- Compute Unified Device Architecture
+- A general-purpose **parallel** programming platform and model
+
+## Scalability with CUDA
+
+- A GPU has an array of Streaming Multiprocessors (SMs)
+- A program can be partitioned into blocks that execute independently
+- This allows performance to scale with the number of GPU multiprocessors
+
+## Architecture
+
+### Kernel
+
+- CUDA extends C by allowing the programmer to define C functions called
+  kernels
+  - A kernel is executed in parallel by N different CUDA threads, rather than
+    only once in serial
+  - Defined with the specifier `__global__`
+- The number of blocks and threads is given using the execution configuration
+  `<<<numBlocks, threadsPerBlock>>>`, and a kernel is launched with
+  `func_name<<<numBlocks, threadsPerBlock>>>(params);`
+  - Each argument takes an integer or a `dim3`, so they can also be called the
+    Dimension of Blocks and the Dimension of Threads
+- Each thread is given an ID, accessible through the `threadIdx` variable
+- Example:
+  ```c
+  // Add arrays A and B of size N together, store the result in C
+  #define N 2
+  // Kernel definition
+  __global__ void VecAdd(float* A, float* B, float* C) {
+    // Each thread performs one pair-wise addition
+    int i = threadIdx.x;
+    // A, B and C must point to memory allocated on the GPU
+    C[i] = A[i] + B[i];
+  }
+  int main() {
+    ...
+    // Kernel invocation with N threads
+    VecAdd<<<1, N>>>(A, B, C);
+    ...
+  }
+  ```
+
+### Device code and Host code
+
+- The compiler `nvcc` separates source code into host code and device code
+  - Device code (kernels): definitions marked with `__global__`, called from
+    host code (with `func<<<blocks, threads>>>(a, b)`), and run on the device
+    (GPU)
+  - Host code: normal C code, processed by a system compiler such as `gcc` or
+    `clang`
+- If device code is present, the compiled program executes on both the CPU and
+  the GPU
+
+### Thread hierarchy
+
+- `threadIdx` is a 3-component vector
+- It enables threads to be identified within 1-, 2-, or 3-dimensional blocks,
+  which can in turn form a **grid** of thread blocks
+- The number of threads typically depends on the size of the data being
+  processed
+- Features:
+  - There is a maximum number of threads per block (1024 on current GPUs),
+    because all threads of a block reside on the same processor core and share
+    its limited resources
+  - A kernel can be executed by multiple equally shaped thread blocks, so the
+    total number of threads is the number of threads per block times the
+    number of blocks
+- Example:
+  ```c
+  #define N 10
+  // Kernel definition
+  __global__ void MatAdd(float A[N][N], float B[N][N], float C[N][N]) {
+    int i = threadIdx.x;
+    int j = threadIdx.y;
+    C[i][j] = A[i][j] + B[i][j];
+  }
+  int main() {
+    ...
+    // Kernel invocation with one block of N * N * 1 threads
+    int numBlocks = 1;
+    dim3 threadsPerBlock(N, N);
+    MatAdd<<<numBlocks, threadsPerBlock>>>(A, B, C);
+    ...
+  }
+  ```
+- A complete, runnable example is in `assets/code/test1.cu` at the end of this
+  diff
+
+### Block Hierarchy
+
+- Blocks can be organized into 1-, 2-, or 3-dimensional **grids**, whose size
+  typically depends on the input data
+- Accessing
+  - Block index within the grid: `blockIdx`
+  - Block dimensions (threads per block): `blockDim`
+- Example: multi-block kernel:
+  ```c
+  #define N 16
+  // Kernel definition
+  __global__ void MatAdd(float A[N][N], float B[N][N],
+                         float C[N][N]) {
+    int i = blockIdx.x * blockDim.x + threadIdx.x;
+    int j = blockIdx.y * blockDim.y + threadIdx.y;
+    if (i < N && j < N)
+      C[i][j] = A[i][j] + B[i][j];
+  }
+  int main() {
+    ...
+    // Kernel invocation
+    dim3 threadsPerBlock(16, 16);
+    dim3 numBlocks(N / threadsPerBlock.x, N / threadsPerBlock.y);
+    MatAdd<<<numBlocks, threadsPerBlock>>>(A, B, C);
+    ...
+  }
+  ```
+
+### Memory Hierarchy
+
+- Thread: registers and local memory
+- Block of threads: shared memory, visible to all threads of the block (see
+  the sketch after this list)
+- All blocks: global memory
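+- A minimal sketch of the per-block shared memory level (the kernel name, the
+  size and the reversal task are illustrative, not taken from the course
+  material): each thread stages one element into a `__shared__` array, the
+  block synchronizes with `__syncthreads()`, then writes the data back to
+  global memory in reversed order
+  ```c
+  #include <stdio.h>
+  #define N 64
+  // Illustrative kernel: reverse N elements within one block via shared memory
+  __global__ void reverseInBlock(int *d) {
+    __shared__ int s[N];  // one copy per block, shared by all of its threads
+    int t = threadIdx.x;
+    s[t] = d[t];          // stage global memory into shared memory
+    __syncthreads();      // wait until every thread of the block has loaded
+    d[t] = s[N - 1 - t];  // write back in reversed order
+  }
+  int main(void) {
+    int h[N], *dev;
+    for (int i = 0; i < N; i++) h[i] = i;
+    cudaMalloc((void **)&dev, N * sizeof(int));
+    cudaMemcpy(dev, h, N * sizeof(int), cudaMemcpyHostToDevice);
+    reverseInBlock<<<1, N>>>(dev);  // one block of N threads
+    cudaMemcpy(h, dev, N * sizeof(int), cudaMemcpyDeviceToHost);
+    printf("h[0] = %d\n", h[0]);    // prints 63
+    cudaFree(dev);
+    return 0;
+  }
+  ```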
+
+### GPU vs. CPU
+
+- A GPU has many more stream processors than a CPU has cores, and each of them
+  devotes far more of its hardware to ALUs
+- TODO: see the code in `assets/code`, work through it, and index the arrays
+
+## Coordinating with CPU
+
+- GPU kernel launches are **asynchronous**: control returns to the CPU
+  immediately
+- The CPU needs to **synchronize** before using the results
+  - `cudaMemcpy()`: blocks the CPU until the copy (and the GPU work issued
+    before it) has completed
+  - `cudaMemcpyAsync()`: starts the copy but does not block the CPU
+  - `cudaDeviceSynchronize()`: blocks the CPU until all previously issued CUDA
+    calls have completed
+- A minimal sketch of this launch-then-synchronize pattern is added as
+  `assets/code/test2.cu` at the end of this diff
diff --git a/assets/code/a.out b/assets/code/a.out
new file mode 100755
index 0000000..b6d11fd
Binary files /dev/null and b/assets/code/a.out differ
diff --git a/assets/code/test.cu b/assets/code/test.cu
new file mode 100644
index 0000000..3f4b7b7
--- /dev/null
+++ b/assets/code/test.cu
@@ -0,0 +1,10 @@
+#include <cstdio>
+__global__ void mykernel(void) {
+  return;
+}
+
+int main(void) {
+  mykernel<<<1,1>>>();
+  std::printf("Hello, World!\n");
+  return 0;
+}
diff --git a/assets/code/test1.cu b/assets/code/test1.cu
new file mode 100644
index 0000000..e89b45d
--- /dev/null
+++ b/assets/code/test1.cu
@@ -0,0 +1,39 @@
+#include <stdio.h>
+#define N 10
+// 1. Define the kernel
+__global__ void add(int *a, int *b, int *c) {
+  int tid = blockIdx.x; // handle the data at this index
+  if (tid < N)
+    c[tid] = a[tid] + b[tid];
+}
+
+// 2. Declare the main method
+int main(void) {
+  int a[N], b[N], c[N];
+  int *dev_a, *dev_b, *dev_c;
+  // 3. allocate the memory on the GPU
+  cudaMalloc((void **)&dev_a, N * sizeof(int));
+  cudaMalloc((void **)&dev_b, N * sizeof(int));
+  cudaMalloc((void **)&dev_c, N * sizeof(int));
+  // 4. fill the arrays 'a' and 'b' on the CPU
+  for (int i = 0; i < N; i++) {
+    a[i] = -i;
+    b[i] = i * i;
+  }
+  // 5. copy the arrays 'a' and 'b' to the GPU
+  cudaMemcpy(dev_a, a, N * sizeof(int), cudaMemcpyHostToDevice);
+  cudaMemcpy(dev_b, b, N * sizeof(int), cudaMemcpyHostToDevice);
+  // 6. launch the kernel on the GPU, one block per element
+  add<<<N, 1>>>(dev_a, dev_b, dev_c);
+  // 7. copy the array 'c' back from the GPU to the CPU
+  cudaMemcpy(c, dev_c, N * sizeof(int), cudaMemcpyDeviceToHost);
+  // 8. display the results on the CPU
+  for (int i = 0; i < N; i++) {
+    printf("%d + %d = %d\n", a[i], b[i], c[i]);
+  }
+  // 9. free the memory allocated on the GPU
+  cudaFree(dev_a);
+  cudaFree(dev_b);
+  cudaFree(dev_c);
+  return 0;
+}
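diff --git a/assets/code/test2.cu b/assets/code/test2.cu
new file mode 100644
--- /dev/null
+++ b/assets/code/test2.cu
@@ -0,0 +1,30 @@
+// Illustrative sketch of the CPU/GPU coordination described in 2-2-cuda.md:
+// the kernel launch is asynchronous, so the host synchronizes before using
+// the results. The kernel, N and the scale factor are arbitrary placeholders,
+// not taken from the course material.
+#include <stdio.h>
+#define N 8
+
+__global__ void scale(int *a, int factor) {
+  int i = blockIdx.x * blockDim.x + threadIdx.x;
+  if (i < N)
+    a[i] *= factor;
+}
+
+int main(void) {
+  int h[N], *dev_a;
+  for (int i = 0; i < N; i++) h[i] = i;
+  cudaMalloc((void **)&dev_a, N * sizeof(int));
+  cudaMemcpy(dev_a, h, N * sizeof(int), cudaMemcpyHostToDevice);
+  // The launch returns immediately; the GPU runs the kernel in the background
+  scale<<<1, N>>>(dev_a, 10);
+  // Block the CPU until all previously issued CUDA calls have completed
+  cudaDeviceSynchronize();
+  // The blocking cudaMemcpy would also have waited for the kernel to finish
+  cudaMemcpy(h, dev_a, N * sizeof(int), cudaMemcpyDeviceToHost);
+  for (int i = 0; i < N; i++)
+    printf("%d ", h[i]);  // 0 10 20 ... 70
+  printf("\n");
+  cudaFree(dev_a);
+  return 0;
+}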