finish cuda programming, took around 1.8 hrs

2-2-cuda.md

# CUDA

## Definition

- Compute Unified Device Architecture
- A general-purpose **parallel** programming platform and model

## Scalability with CUDA

- A GPU has an array of Streaming Multiprocessors (SMs)
- A program can be partitioned into blocks that execute independently
- This allows performance to scale with the number of GPU multiprocessors (see
  the grid-stride sketch below)
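
A common pattern that makes this scaling concrete is a grid-stride loop (a
minimal illustrative sketch, not from the original notes; the kernel name and
parameters are made up): the same kernel is correct for any launch
configuration, so the runtime can spread its blocks across however many SMs
the GPU has.

```c
// Grid-stride loop: each thread handles elements i, i + stride, i + 2*stride, ...
// so the kernel works for any <<<blocks, threads>>> configuration.
__global__ void scale(float *x, int n, float factor) {
    int stride = gridDim.x * blockDim.x;
    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += stride) {
        x[i] = x[i] * factor;
    }
}
```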

## Architecture

### Kernel

- CUDA extends C by allowing the programmer to define C functions, called
  kernels
- A kernel is executed in parallel by N different CUDA threads, rather than
  only once in serial
- Defined with the specifier `__global__`
- The number of threads is given using `<<<NBlocks, NThreads>>>`, and used as
  `func_name<<<NBlocks, NThreads>>>(params);`
  - These take an integer or a `dim3`, so they can also be called the
    Dimension of Blocks and the Dimension of Threads
- Each thread is given an ID, accessible through the `threadIdx` variable
- Example:
```c
// Add arrays A and B of size N together, store the result in C
#define N 2

// Kernel definition
__global__ void VecAdd(float* A, float* B, float* C) {
    // Each thread performs one pairwise addition
    int i = threadIdx.x;
    // A, B, and C must point to memory allocated on the GPU
    C[i] = A[i] + B[i];
}

int main() {
    ...
    // Kernel invocation with N threads
    VecAdd<<<1, N>>>(A, B, C);
    ...
}
```

### Device code and Host code

- The compiler `nvcc` separates source code into host code and device code
- Device code (kernels): definitions marked with `__global__`, called from
  host code (with `func<<<x, y>>>(a, b)`), and run on the device (GPU)
- Host code: normal C code, processed by a system compiler like `gcc` or
  `clang`
- If device code definitions are used, the compiled program executes on both
  the CPU and the GPU, as the sketch below shows
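
A minimal sketch of this split (it mirrors `assets/code/test.cu`, added later
in this commit); building it with `nvcc test.cu` lets `nvcc` send the kernel
to the device compiler and the rest to the system compiler:

```c
#include <cstdio>

// Device code: marked with __global__, compiled for the GPU
__global__ void mykernel(void) {
}

// Host code: plain C/C++, handed to the system compiler
int main(void) {
    mykernel<<<1, 1>>>();            // host code launching device code
    std::printf("Hello, World!\n");
    return 0;
}
```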

### Thread hierarchy

- `threadIdx` is a 3-component vector
- This enables threads to be identified in 1-3 dimensional blocks, which can
  form a **grid** of thread blocks
- The number of threads depends on the size of the data being processed
- Features:
  - There is a maximum number of threads per block, because all threads of a
    block reside on the same processor core
  - A kernel can be executed by multiple equally shaped thread blocks, so the
    total number of threads is threads per block times the number of blocks
- Example:
```c
#define N 10

// Kernel definition
__global__ void MatAdd(float A[N][N], float B[N][N], float C[N][N]) {
    int i = threadIdx.x;
    int j = threadIdx.y;
    C[i][j] = A[i][j] + B[i][j];
}

int main() {
    ...
    // Kernel invocation with one block of N * N * 1 threads
    int numBlocks = 1;
    dim3 threadsPerBlock(N, N);
    MatAdd<<<numBlocks, threadsPerBlock>>>(A, B, C);
    ...
}
```
- A complete example (the original block was left empty; this is a sketch
  following the pattern of `assets/code/test1.cu` at the end of this commit):

```c
#define N 10

// Kernel: one thread per element
__global__ void VecAdd(float *A, float *B, float *C) {
    int i = threadIdx.x;
    C[i] = A[i] + B[i];
}

int main() {
    float A[N], B[N], C[N];
    float *dA, *dB, *dC;

    // Allocate device (GPU) memory
    cudaMalloc((void **)&dA, N * sizeof(float));
    cudaMalloc((void **)&dB, N * sizeof(float));
    cudaMalloc((void **)&dC, N * sizeof(float));

    // Fill A and B on the CPU, then copy them to the GPU
    for (int i = 0; i < N; i++) {
        A[i] = (float)i;
        B[i] = (float)(i * i);
    }
    cudaMemcpy(dA, A, N * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(dB, B, N * sizeof(float), cudaMemcpyHostToDevice);

    // Launch one block of N threads
    VecAdd<<<1, N>>>(dA, dB, dC);

    // Blocks until the kernel has finished, then copies the result back
    cudaMemcpy(C, dC, N * sizeof(float), cudaMemcpyDeviceToHost);

    cudaFree(dA);
    cudaFree(dB);
    cudaFree(dC);
    return 0;
}
```

### Block Hierarchy

- Blocks can be arranged into 1-3 dimensional **grids**; the grid size depends
  on the input data
- Accessing:
  - Block index: `blockIdx`
  - Block dimension: `blockDim`
- Example: a multi-block kernel:
```c
#define N 16

// Kernel definition
__global__ void MatAdd(float A[N][N], float B[N][N], float C[N][N]) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    int j = blockIdx.y * blockDim.y + threadIdx.y;
    if (i < N && j < N)
        C[i][j] = A[i][j] + B[i][j];
}

int main() {
    ...
    // Kernel invocation
    dim3 threadsPerBlock(16, 16);
    dim3 numBlocks(N / threadsPerBlock.x, N / threadsPerBlock.y);
    MatAdd<<<numBlocks, threadsPerBlock>>>(A, B, C);
    ...
}
```

### Memory Hierarchy

- Thread: registers, local memory
- Block of threads: shared memory (see the sketch below)
- All blocks: global memory
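
A minimal illustrative sketch of the middle level (the kernel name and sizes
are made up): each block stages its chunk of global memory in `__shared__`
memory, which only that block's threads can access, and synchronizes before
reading it back.

```c
#define BLOCK 64

// Each block reverses its own BLOCK-sized chunk of d in place, e.g. launched
// as reverseChunk<<<n / BLOCK, BLOCK>>>(d) with n a multiple of BLOCK.
__global__ void reverseChunk(float *d) {
    __shared__ float s[BLOCK];      // shared memory, one copy per block
    int t = threadIdx.x;
    int base = blockIdx.x * BLOCK;

    s[t] = d[base + t];             // global -> shared
    __syncthreads();                // wait until the whole block has written
    d[base + t] = s[BLOCK - 1 - t]; // shared -> global, reversed
}
```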

### GPU vs. CPU

- A GPU has more streaming processors than a CPU has cores, and each processor
  has many more ALUs
- TODO: see the code in `assets/code`, work through it, and index the array

## Coordinating with CPU

- GPU kernel launches are **asynchronous**: control returns to the CPU
  immediately
- The CPU needs to **synchronize** before using the results (see the sketch
  below):
  - `cudaMemcpy()`: blocks and copies
  - `cudaMemcpyAsync()`: copies without blocking
  - `cudaDeviceSynchronize()`: blocks the CPU until all preceding CUDA calls
    have completed
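
A minimal sketch putting these together (`step` is a hypothetical kernel, not
from the notes): the launch returns immediately, and the host blocks
explicitly before reading the result back.

```c
// step is a hypothetical kernel standing in for real device work
__global__ void step(float *x) {
    x[threadIdx.x] += 1.0f;
}

int main(void) {
    float x[256] = {0};
    float *d_x;
    cudaMalloc((void **)&d_x, sizeof(x));
    cudaMemcpy(d_x, x, sizeof(x), cudaMemcpyHostToDevice);

    step<<<1, 256>>>(d_x);    // returns to the CPU immediately (asynchronous)
    // ... the CPU is free to do unrelated work here ...
    cudaDeviceSynchronize();  // block until the kernel has completed

    // cudaMemcpy(...DeviceToHost) would also block until the kernel finished
    cudaMemcpy(x, d_x, sizeof(x), cudaMemcpyDeviceToHost);
    cudaFree(d_x);
    return 0;
}
```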

assets/code/test.cu

```c
#include <cstdio>

// Device code: an empty kernel
__global__ void mykernel(void) {
    return;
}

// Host code: launches the kernel, then prints from the CPU
int main(void) {
    mykernel<<<1, 1>>>();
    std::printf("Hello, World!\n");
    return 0;
}
```

assets/code/test1.cu

```c
#include <stdio.h>
#define N 10

// 1. Define the kernel
__global__ void add(int *a, int *b, int *c) {
    int tid = blockIdx.x; // handle the data at this index
    if (tid < N)
        c[tid] = a[tid] + b[tid];
}

// 2. Declare the main method
int main(void) {
    int a[N], b[N], c[N];
    int *dev_a, *dev_b, *dev_c;

    // 3. allocate the memory on the GPU
    cudaMalloc((void **)&dev_a, N * sizeof(int));
    cudaMalloc((void **)&dev_b, N * sizeof(int));
    cudaMalloc((void **)&dev_c, N * sizeof(int));

    // 4. fill the arrays 'a' and 'b' on the CPU
    for (int i = 0; i < N; i++) {
        a[i] = -i;
        b[i] = i * i;
    }

    // 5. copy the arrays 'a' and 'b' to the GPU
    cudaMemcpy(dev_a, a, N * sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(dev_b, b, N * sizeof(int), cudaMemcpyHostToDevice);

    // 6. launch the kernel on the GPU, one block per element
    add<<<N, 1>>>(dev_a, dev_b, dev_c);

    // 7. copy the array 'c' back from the GPU to the CPU
    cudaMemcpy(c, dev_c, N * sizeof(int), cudaMemcpyDeviceToHost);

    // 8. display the results on the CPU
    for (int i = 0; i < N; i++) {
        printf("%d + %d = %d\n", a[i], b[i], c[i]);
    }

    // 9. free the memory allocated on the GPU
    cudaFree(dev_a);
    cudaFree(dev_b);
    cudaFree(dev_c);
    return 0;
}
```