CUDA advanced Took 1hr 50 mins
This commit is contained in:
parent eeb097662a
commit b3a8881805

@@ -42,6 +42,7 @@
...
}
```
- Note to self: run `nvcc` with `-arch=native`; otherwise `nvcc` builds for a default GPU architecture that may not match the installed card, and the kernel will not run.
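  A minimal compile-and-run sketch for that flag; the file name `test.cu` is just a placeholder:

  ```sh
  nvcc -arch=native test.cu -o test
  ./test
  ```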
### Device code and Host code
2-3-advanced-cuda.md (new file, 127 lines)
@@ -0,0 +1,127 @@
# Advanced CUDA Programming

## Memory management

- Use the following calls:
  - `cudaMalloc`: allocate device memory
  - `cudaFree`: free device memory
  - `cudaMemcpy`: copy memory between host and device
### Example: Addition on device

```c
__global__ void add(int *a, int *b, int *c) {
  // Since this runs on the device, a, b and c must all point to
  // device memory
  *c = *a + *b;
}

int main(void) {
  int a, b, c;          // host copies of a, b, c
  int *d_a, *d_b, *d_c; // device copies of a, b, c
  int size = sizeof(int);

  // Allocate space for device copies of a, b, c
  cudaMalloc((void **)&d_a, size);
  cudaMalloc((void **)&d_b, size);
  cudaMalloc((void **)&d_c, size);

  // Setup input values
  a = 2;
  b = 7;

  // Copy inputs to device
  cudaMemcpy(d_a, &a, size, cudaMemcpyHostToDevice);
  cudaMemcpy(d_b, &b, size, cudaMemcpyHostToDevice);

  // Launch add() kernel on GPU
  add<<<1, 1>>>(d_a, d_b, d_c);

  // Copy result back to host
  cudaMemcpy(&c, d_c, size, cudaMemcpyDeviceToHost);

  // Cleanup
  cudaFree(d_a);
  cudaFree(d_b);
  cudaFree(d_c);
  return 0;
}
```
### Memory in GPU and CPU

- Neither side can dereference a pointer that points to memory it does not own (see the sketch below):
  - Device pointers point to GPU memory; they can be passed to and from the host,
    but cannot be dereferenced by the host
  - Use the memory-management APIs to manage device memory: [link](#memory-management)
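A minimal fragment illustrating that rule; this is my own sketch, reusing the `add` kernel from the example above:

```c
int *d_x; // device pointer, declared in host code
cudaMalloc((void **)&d_x, sizeof(int));

// *d_x = 5;  // WRONG: the host cannot dereference a device pointer
int h_x = 5;
cudaMemcpy(d_x, &h_x, sizeof(int), cudaMemcpyHostToDevice); // OK: copy through the API
add<<<1, 1>>>(d_x, d_x, d_x); // OK: pass the device pointer to a kernel
cudaFree(d_x);
```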
## Heterogeneous programming

- The model is heterogeneous because both the host CPU and the device GPU are used
- A program contains serial code, executed by the host CPU, and parallel kernel code,
  executed by the GPU
## Parallel computing with CUDA

### Parallel Programming

- Kernel: executed on the GPU as an array of threads running in parallel
- Threads: all execute the same code, but can take different paths
  - Each thread has an ID, used to select data and make control decisions
    (see the indexing sketch below)
- Blocks: threads are grouped into blocks; each block is scheduled onto a
  streaming multiprocessor
- Grids: blocks are grouped into a grid
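A common way to build a global index from these IDs; this is my own sketch (the kernel name `add_indexed` and the 128-thread block size are arbitrary), not part of the notes:

```c
__global__ void add_indexed(int *a, int *b, int *c, int n) {
  // Global index: block number times threads per block, plus position in the block
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) { // guard against extra threads in the last block
    c[i] = a[i] + b[i];
  }
}

// Launch with enough blocks to cover n elements, e.g. 128 threads per block:
// add_indexed<<<(n + 127) / 128, 128>>>(d_a, d_b, d_c, n);
```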
### Iterations

```c
#include <stdio.h>

__global__ void add(int *a, int *b, int *c) {
  int bid = blockIdx.x;
  printf("bid: %d\n", bid);
  if (bid < 6) { // students could also use a for loop here
    c[bid] = a[bid] + b[bid];
    printf("c: %d\n", c[bid]);
  }
}

int main(void) {
  int a[6], b[6], c[6];
  int *dev_a, *dev_b, *dev_c;

  // Allocate memory on the device
  cudaMalloc((void **)&dev_a, 6 * sizeof(int));
  cudaMalloc((void **)&dev_b, 6 * sizeof(int));
  cudaMalloc((void **)&dev_c, 6 * sizeof(int));

  // Fill arrays "a" and "b" with values on the host
  for (int i = 0; i < 6; i++) {
    a[i] = i;
    b[i] = i * i;
  }

  // Copy arrays "a" and "b" to the device
  cudaMemcpy(dev_a, a, 6 * sizeof(int), cudaMemcpyHostToDevice);
  cudaMemcpy(dev_b, b, 6 * sizeof(int), cudaMemcpyHostToDevice);

  // Launch the kernel: one block per element (6 blocks, 1 thread each)
  add<<<6, 1>>>(dev_a, dev_b, dev_c);

  // Copy the array "c" from the device to the host
  cudaMemcpy(c, dev_c, 6 * sizeof(int), cudaMemcpyDeviceToHost);

  // Print the array "c"
  for (int i = 0; i < 6; i++) {
    printf("%d\n", c[i]);
  }

  // Free memory allocated on the device
  cudaFree(dev_a);
  cudaFree(dev_b);
  cudaFree(dev_c);
  return 0;
}
```
### Assumptions

- CUDA threads are assumed to execute on a physically separate device from the host
- The host and the device are assumed to have their own separate memory spaces in DRAM
### Thread vs. Block

- The number of threads per block is limited, while the number of blocks is not
  (excess blocks simply wait until earlier blocks finish)
- Threads within a block share memory, while separate blocks do not share memory
  and can only communicate through global memory (see the sketch below)
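A small sketch of block-level shared memory; this is my own illustration (the kernel name `block_sum` and the block size are arbitrary), not part of the notes:

```c
#define THREADS 4

// Each block sums THREADS consecutive elements of "in" into one entry of "out".
__global__ void block_sum(int *in, int *out) {
  __shared__ int buf[THREADS]; // visible only to the threads of this block
  int tid = threadIdx.x;
  buf[tid] = in[blockIdx.x * THREADS + tid];
  __syncthreads(); // wait until every thread in the block has written buf

  if (tid == 0) { // one thread per block produces the block's result
    int sum = 0;
    for (int i = 0; i < THREADS; i++) sum += buf[i];
    out[blockIdx.x] = sum; // blocks exchange results only via global memory
  }
}

// Launch example: one output element per block
// block_sum<<<num_blocks, THREADS>>>(d_in, d_out);
```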
## Examples and tasks
Binary file not shown.

@@ -1,5 +1,5 @@
#include <stdio.h>
-#define N 10
+#define N 16
// 1. Define the kernel
__global__ void add(int *a, int *b, int *c) {
  int tid = blockIdx.x; // handle the data at this index
assets/code/test2.cu (new file, 31 lines)
@@ -0,0 +1,31 @@
#include <stdio.h>

__global__ void add(int *a, int *b, int *c) {
  *c = *a + *b;
}

int main(void) {
  int a, b, c;          // host copies of a, b, c
  int *d_a, *d_b, *d_c; // device copies of a, b, c
  int size = sizeof(int);

  // Allocate space for device copies of a, b, c
  cudaMalloc((void **)&d_a, size);
  cudaMalloc((void **)&d_b, size);
  cudaMalloc((void **)&d_c, size);

  // Setup input values
  a = 2;
  b = 7;

  // Copy inputs to device
  cudaMemcpy(d_a, &a, size, cudaMemcpyHostToDevice);
  cudaMemcpy(d_b, &b, size, cudaMemcpyHostToDevice);

  // Launch add() kernel on GPU
  add<<<1, 1>>>(d_a, d_b, d_c);

  // Copy result back to host
  cudaMemcpy(&c, d_c, size, cudaMemcpyDeviceToHost);
  printf("result c: %d", c);

  // Cleanup
  cudaFree(d_a);
  cudaFree(d_b);
  cudaFree(d_c);
  return 0;
}
assets/code/test3.cu (new file, 42 lines)
@@ -0,0 +1,42 @@
#include <stdio.h>
#define N 6

__global__ void add(int *a, int *b, int *c) {
  int bid = blockIdx.x;
  printf("bid: %d\n", bid);
  if (bid < N) { // students could also use a for loop here
    c[bid] = a[bid] + b[bid];
    printf("c: %d\n", c[bid]);
  }
}

int main(void) {
  int a[N], b[N], c[N];
  int *dev_a, *dev_b, *dev_c;

  // Allocate memory on the device
  cudaMalloc((void **)&dev_a, N * sizeof(int));
  cudaMalloc((void **)&dev_b, N * sizeof(int));
  cudaMalloc((void **)&dev_c, N * sizeof(int));

  // Fill arrays "a" and "b" with values on the host
  for (int i = 0; i < N; i++) {
    a[i] = i;
    b[i] = i * i;
  }

  // Copy arrays "a" and "b" to the device
  cudaMemcpy(dev_a, a, N * sizeof(int), cudaMemcpyHostToDevice);
  cudaMemcpy(dev_b, b, N * sizeof(int), cudaMemcpyHostToDevice);

  // Launch the kernel: 12 blocks of 1 thread; the bid < N guard skips the extra blocks
  add<<<12, 1>>>(dev_a, dev_b, dev_c);

  // Copy the array "c" from the device to the host
  cudaMemcpy(c, dev_c, N * sizeof(int), cudaMemcpyDeviceToHost);

  // Print the array "c"
  for (int i = 0; i < N; i++) {
    printf("%d\n", c[i]);
  }

  // Free memory allocated on the device
  cudaFree(dev_a);
  cudaFree(dev_b);
  cudaFree(dev_c);
  return 0;
} // End main
mdsf.json (new file, 162 lines)
@@ -0,0 +1,162 @@
{
  "$schema": "https://raw.githubusercontent.com/hougesen/mdsf/main/schemas/v0.3.2/mdsf.schema.json",
  "format_finished_document": false,
  "javascript_runtime": "node",
  "languages": {
    "c": "clang-format",
    "clojure": [["cljstyle", "joker"]],
    "cpp": "clang-format",
    "crystal": "crystal:format",
    "csharp": [["csharpier", "clang-format"]],
    "css": [["prettier", "stylelint"]],
    "d": "dfmt",
    "dart": "dart:format",
    "elixir": "mix:format",
    "elm": "elm-format",
    "erlang": [["erlfmt", "efmt"]],
    "gleam": "gleam:format",
    "go": [["gci", "goimports-reviser", "goimports"], ["gofumpt", "gofmt", "crlfmt"]],
    "haskell": [["fourmolu", "ormolu", "hindent", "stylish-haskell"]],
    "html": [["prettier", "djlint"]],
    "java": [["google-java-format", "clang-format"]],
    "javascript": [["prettier", "biome:format", "deno:fmt", "clang-format", "standardjs"]],
    "json": [["prettier", "biome:format", "deno:fmt", "clang-format"]],
    "kotlin": "ktfmt",
    "lua": [["stylua", "luaformatter"]],
    "nim": "nimpretty",
    "ocaml": [["ocamlformat", "ocp-indent"]],
    "python": [["usort", "isort"], ["ruff:format", "blue", "black", "yapf", "autopep8", "pyink"]],
    "roc": "roc:format",
    "ruby": [["rubocop", "rufo", "rubyfmt", "standardrb"]],
    "rust": "rustfmt",
    "shell": [["shfmt", "beautysh"]],
    "sql": [["sql-formatter", "sqlfluff:format", "sqlfmt"]],
    "swift": [["swift-format", "swiftformat"]],
    "toml": "taplo",
    "typescript": [["prettier", "biome:format", "deno:fmt"]],
    "yaml": [["prettier", "yamlfmt", "yamlfix"]],
    "zig": "zig:fmt"
  },
  "custom_file_extensions": {}
}