diff --git a/2-2-cuda.md b/2-2-cuda.md index e1e374e..b4311de 100644 --- a/2-2-cuda.md +++ b/2-2-cuda.md @@ -42,6 +42,7 @@ ... } ``` +- Note to self: run `nvcc` with `-arch=native`, otherwise it won't work. ### Device code and Host code diff --git a/2-3-advanced-cuda.md b/2-3-advanced-cuda.md new file mode 100644 index 0000000..44b489a --- /dev/null +++ b/2-3-advanced-cuda.md @@ -0,0 +1,127 @@ +# Advanced CUDA Programming + +## Memory management + +- Use the following calls: + - `cudaMalloc`: Allocate memory + - `cudaFree`: Free memory + - `cudaMemcpy`: Copy memory + +### Example: Addition on device + +```c +__global__ void add(int *a, int *b, int *c) { + // Since this runs on device, we need them to be in + // device memory + *c = *a + *b; +} + +int main(void) { + int a, b, c; + // host copies of a, b, c + int *d_a, *d_b, *d_c; // device copies of a, b, c + int size = sizeof(int); + // Allocate space for device copies of a, b, c + cudaMalloc((void **)&d_a, size); + cudaMalloc((void **)&d_b, size); + cudaMalloc((void **)&d_c, size); + // Setup input values + a = 2; + b = 7; + // Copy inputs to device + cudaMemcpy(d_a, &a, size, cudaMemcpyHostToDevice); + cudaMemcpy(d_b, &b, size, cudaMemcpyHostToDevice); + // Launch add() kernel on GPU + add<<<1, 1>>>(d_a, d_b, d_c); + // Copy result back to host + cudaMemcpy(&c, d_c, size, cudaMemcpyDeviceToHost); + // Cleanup + cudaFree(d_a); + cudaFree(d_b); + cudaFree(d_c); + return 0; +} +``` + +### Memory in GPU and CPU + +- Neither host nor device can dereference a pointer that points to memory not in its own address space: + - Device pointers point to GPU memory, can be passed from or to host, but + can't be dereferenced by Host +- Use APIs to manage device memory: [link](#memory-management) + +## Heterogeneous programming + +- Because both host CPU and device GPU are used +- The program contains serial code, executed by the host CPU, and parallel + kernel code, executed by the GPU + +## Parallel computing with CUDA + +### Parallel Programming + +- Kernel: 
executed on GPU, as an array of threads in parallel +- Threads: execute the same code, but can take different paths + - Thread has an ID, to select data and control decisions +- Blocks: Threads are grouped into blocks, each block is scheduled on a + streaming multiprocessor +- Grids: Blocks are grouped into grids + +### Iterations + +```c +#include <stdio.h> +__global__ void add(int *a, int *b, int *c) { + int bid = blockIdx.x; + printf("bid: %d\n", bid); + if (bid < 6) { // The students can also use a "for loop" here + c[bid] = a[bid] + b[bid]; + printf("c: %d\n", c[bid]); + } +} + +int main(void) { + int a[6], b[6], c[6]; + int *dev_a, *dev_b, *dev_c; + // allocate memory to device + cudaMalloc((void **)&dev_a, 6 * sizeof(int)); + cudaMalloc((void **)&dev_b, 6 * sizeof(int)); + cudaMalloc((void **)&dev_c, 6 * sizeof(int)); + // Fill arrays "a" and "b" with values on the host + for (int i = 0; i < 6; i++) { + a[i] = i; + b[i] = i * i; + } + // Copy arrays "a" and "b" to the device + cudaMemcpy(dev_a, a, 6 * sizeof(int), cudaMemcpyHostToDevice); + cudaMemcpy(dev_b, b, 6 * sizeof(int), cudaMemcpyHostToDevice); + // Launch the kernel (one block per element; 4 blocks would leave c[4], c[5] uncomputed) + add<<<6, 1>>>(dev_a, dev_b, dev_c); + // Copy the array "c" from the device to the host + cudaMemcpy(c, dev_c, 6 * sizeof(int), cudaMemcpyDeviceToHost); + + // Print the array "c" + for (int i = 0; i < 6; i++) { + printf("%d\n", c[i]); + } + // Free memory allocated to the device + cudaFree(dev_a); + cudaFree(dev_b); + cudaFree(dev_c); + return 0; +} // End main +``` + +### Assumptions: + +- Assume CUDA threads execute on a physically separate device from the host +- Assume both the host and the device have their own separate memory spaces in DRAM + +### Thread vs. Block + +- The number of threads per block is limited, while the number of blocks is not + (blocks can wait until previous blocks finish) +- Threads in a block have shared memory, while blocks don't have shared memory, + only global memory. 
+ +## Examples and tasks diff --git a/assets/code/a.out b/assets/code/a.out index b6d11fd..10f82e0 100755 Binary files a/assets/code/a.out and b/assets/code/a.out differ diff --git a/assets/code/test1.cu b/assets/code/test1.cu index e89b45d..c02bf55 100644 --- a/assets/code/test1.cu +++ b/assets/code/test1.cu @@ -1,5 +1,5 @@ #include <stdio.h> -#define N 10 +#define N 16 // 1. Define the kernel __global__ void add(int *a, int *b, int *c) { int tid = blockIdx.x; // handle the data at this index diff --git a/assets/code/test2.cu b/assets/code/test2.cu new file mode 100644 index 0000000..3327992 --- /dev/null +++ b/assets/code/test2.cu @@ -0,0 +1,31 @@ +#include <stdio.h> +__global__ void add(int *a, int *b, int *c) { + *c = *a + *b; +} + +int main(void) { + int a, b, c; + // host copies of a, b, c + int *d_a, *d_b, *d_c; // device copies of a, b, c + int size = sizeof(int); + // Allocate space for device copies of a, b, c + cudaMalloc((void **)&d_a, size); + cudaMalloc((void **)&d_b, size); + cudaMalloc((void **)&d_c, size); + // Setup input values + a = 2; + b = 7; + // Copy inputs to device + cudaMemcpy(d_a, &a, size, cudaMemcpyHostToDevice); + cudaMemcpy(d_b, &b, size, cudaMemcpyHostToDevice); + // Launch add() kernel on GPU + add<<<1, 1>>>(d_a, d_b, d_c); + // Copy result back to host + cudaMemcpy(&c, d_c, size, cudaMemcpyDeviceToHost); + printf("result c: %d", c); + // Cleanup + cudaFree(d_a); + cudaFree(d_b); + cudaFree(d_c); + return 0; +} diff --git a/assets/code/test3.cu b/assets/code/test3.cu new file mode 100644 index 0000000..ae2d001 --- /dev/null +++ b/assets/code/test3.cu @@ -0,0 +1,42 @@ +#include <stdio.h> +#define N 6 + +__global__ void add(int *a, int *b, int *c) { + int bid = blockIdx.x; + printf("bid: %d\n", bid); + if (bid < N) { // The students can also use a "for loop" here + c[bid] = a[bid] + b[bid]; + printf("c: %d\n", c[bid]); + } +} + +int main(void) { + int a[N], b[N], c[N]; + int *dev_a, *dev_b, *dev_c; + // allocate memory to device + cudaMalloc((void **)&dev_a, N * 
sizeof(int)); + cudaMalloc((void **)&dev_b, N * sizeof(int)); + cudaMalloc((void **)&dev_c, N * sizeof(int)); + // Fill arrays "a" and "b" with values on the host + for (int i = 0; i < N; i++) { + a[i] = i; + b[i] = i * i; + } + // Copy arrays "a" and "b" to the device + cudaMemcpy(dev_a, a, N * sizeof(int), cudaMemcpyHostToDevice); + cudaMemcpy(dev_b, b, N * sizeof(int), cudaMemcpyHostToDevice); + // Launch the kernel + add<<<12, 1>>>(dev_a, dev_b, dev_c); + // Copy the array "c" from the device to the host + cudaMemcpy(c, dev_c, N * sizeof(int), cudaMemcpyDeviceToHost); + + // Print the array "c" + for (int i = 0; i < N; i++) { + printf("%d\n", c[i]); + } + // Free memory allocated to the device + cudaFree(dev_a); + cudaFree(dev_b); + cudaFree(dev_c); + return 0; +} // End main diff --git a/mdsf.json b/mdsf.json new file mode 100644 index 0000000..15ddb3f --- /dev/null +++ b/mdsf.json @@ -0,0 +1,162 @@ +{ + "$schema": "https://raw.githubusercontent.com/hougesen/mdsf/main/schemas/v0.3.2/mdsf.schema.json", + "format_finished_document": false, + "javascript_runtime": "node", + "languages": { + "c": "clang-format", + "clojure": [ + [ + "cljstyle", + "joker" + ] + ], + "cpp": "clang-format", + "crystal": "crystal:format", + "csharp": [ + [ + "csharpier", + "clang-format" + ] + ], + "css": [ + [ + "prettier", + "stylelint" + ] + ], + "d": "dfmt", + "dart": "dart:format", + "elixir": "mix:format", + "elm": "elm-format", + "erlang": [ + [ + "erlfmt", + "efmt" + ] + ], + "gleam": "gleam:format", + "go": [ + [ + "gci", + "goimports-reviser", + "goimports" + ], + [ + "gofumpt", + "gofmt", + "crlfmt" + ] + ], + "haskell": [ + [ + "fourmolu", + "ormolu", + "hindent", + "stylish-haskell" + ] + ], + "html": [ + [ + "prettier", + "djlint" + ] + ], + "java": [ + [ + "google-java-format", + "clang-format" + ] + ], + "javascript": [ + [ + "prettier", + "biome:format", + "deno:fmt", + "clang-format", + "standardjs" + ] + ], + "json": [ + [ + "prettier", + "biome:format", + 
"deno:fmt", + "clang-format" + ] + ], + "kotlin": "ktfmt", + "lua": [ + [ + "stylua", + "luaformatter" + ] + ], + "nim": "nimpretty", + "ocaml": [ + [ + "ocamlformat", + "ocp-indent" + ] + ], + "python": [ + [ + "usort", + "isort" + ], + [ + "ruff:format", + "blue", + "black", + "yapf", + "autopep8", + "pyink" + ] + ], + "roc": "roc:format", + "ruby": [ + [ + "rubocop", + "rufo", + "rubyfmt", + "standardrb" + ] + ], + "rust": "rustfmt", + "shell": [ + [ + "shfmt", + "beautysh" + ] + ], + "sql": [ + [ + "sql-formatter", + "sqlfluff:format", + "sqlfmt" + ] + ], + "swift": [ + [ + "swift-format", + "swiftformat" + ] + ], + "toml": "taplo", + "typescript": [ + [ + "prettier", + "biome:format", + "deno:fmt" + ] + ], + "yaml": [ + [ + "prettier", + "yamlfmt", + "yamlfix" + ] + ], + "zig": "zig:fmt" + }, + "custom_file_extensions": {} +} \ No newline at end of file