CUDA advanced Took 1hr 50 mins

This commit is contained in:
Ryan 2024-12-29 16:45:17 +08:00
parent eeb097662a
commit b3a8881805
7 changed files with 364 additions and 1 deletions

View file

@ -42,6 +42,7 @@
... ...
} }
``` ```
- Note to self: compile with `nvcc -arch=native` so the binary is built for this machine's GPU compute capability; without it the kernels may fail to launch on this card.
### Device code and Host code ### Device code and Host code

127
2-3-advanced-cuda.md Normal file
View file

@ -0,0 +1,127 @@
# Advanced CUDA Programming
## Memory management
- Use the following calls:
- `cudaMalloc`: Allocate memory
- `cudaFree`: Free memory
- `cudaMemcpy`: Copy memory
### Example: Addition on device
```c
// Device-side scalar addition: stores *a + *b into *c.
// Runs on the GPU, so all three pointers must reference device
// memory (allocated with cudaMalloc) -- never host addresses.
__global__ void add(int *a, int *b, int *c) {
    c[0] = a[0] + b[0];
}
// Host driver: allocates device memory, runs add() on one value pair,
// and copies the result back. Exit status is 0 on success, 1 if the
// kernel launch failed.
int main(void) {
    int a, b, c;           // host copies of a, b, c
    int *d_a, *d_b, *d_c;  // device copies of a, b, c
    int size = sizeof(int);
    // Allocate space for device copies of a, b, c
    cudaMalloc((void **)&d_a, size);
    cudaMalloc((void **)&d_b, size);
    cudaMalloc((void **)&d_c, size);
    // Setup input values
    a = 2;
    b = 7;
    // Copy inputs to device
    cudaMemcpy(d_a, &a, size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, &b, size, cudaMemcpyHostToDevice);
    // Launch add() kernel on GPU: 1 block of 1 thread
    add<<<1, 1>>>(d_a, d_b, d_c);
    // Kernel launches do not return an error directly; ask the runtime
    cudaError_t err = cudaGetLastError();
    // Copy result back to host. cudaMemcpy blocks until the kernel has
    // finished, so c (= 2 + 7 = 9) is valid afterwards.
    cudaMemcpy(&c, d_c, size, cudaMemcpyDeviceToHost);
    // Cleanup
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);
    return err == cudaSuccess ? 0 : 1;
}
```
### Memory in GPU and CPU
- Host code and device code cannot dereference a pointer into the other's
memory space:
- Device pointers point to GPU memory; they can be passed to and from the
host, but must not be dereferenced by host code
- use APIs to manage device memory: [link](#memory-management)
## Heterogeneous programming
- Because both host CPU and device GPU are used
- The code has serial code, executed by host CPU, and parallel kernel and code
executed by the GPU
## Parallel computing With CUDA
### Parallel Programming
- Kernel: executed on GPU, as an array of threads in parallel
- Threads: execute the same code, but can take different paths
- Thread has an ID, to select data and control decisions
- Blocks: Threads are grouped into blocks; each block is scheduled onto a
streaming multiprocessor (SM)
- Grids: Blocks are grouped into grids
### Iterations
```c
#include <stdio.h>
// Element-wise vector addition, one element per block: block i computes
// c[i] = a[i] + b[i]. All pointers must reference device memory.
__global__ void add(int *a, int *b, int *c) {
    int blk = blockIdx.x;  // the block index doubles as the data index
    printf("bid: %d\n", blk);
    // Bounds guard: only the first 6 blocks own a data element, so a
    // larger grid is harmless. (The students can also use a "for loop here".)
    if (blk >= 6) return;
    c[blk] = a[blk] + b[blk];
    printf("c: %d\n", c[blk]);
}
// Host driver for the block-per-element add() kernel over 6 ints.
// Returns 0 on success, 1 if the kernel launch fails.
int main(void) {
    int a[6], b[6], c[6];        // host copies
    int *dev_a, *dev_b, *dev_c;  // device copies
    // allocate memory to device
    cudaMalloc((void **)&dev_a, 6 * sizeof(int));
    cudaMalloc((void **)&dev_b, 6 * sizeof(int));
    cudaMalloc((void **)&dev_c, 6 * sizeof(int));
    // Fill arrays "a" and "b" with values on the host
    for (int i = 0; i < 6; i++) {
        a[i] = i;
        b[i] = i * i;
    }
    // Copy arrays "a" and "b" to the device
    cudaMemcpy(dev_a, a, 6 * sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(dev_b, b, 6 * sizeof(int), cudaMemcpyHostToDevice);
    // Launch one block per element. BUG FIX: the original launched only
    // 4 blocks for 6 elements, so c[4] and c[5] were never computed and
    // the host printed uninitialized values for them.
    add<<<6, 1>>>(dev_a, dev_b, dev_c);
    // Launches do not report errors directly; check before trusting dev_c
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        fprintf(stderr, "kernel launch failed: %s\n", cudaGetErrorString(err));
        return 1;
    }
    // Copy the array "c" from the device to the host
    // (blocking, so the kernel has completed by the time it returns)
    cudaMemcpy(c, dev_c, 6 * sizeof(int), cudaMemcpyDeviceToHost);
    // Print the array "c"
    for (int i = 0; i < 6; i++) {
        printf("%d\n", c[i]);
    }
    // Free memory allocated to the device
    cudaFree(dev_a);
    cudaFree(dev_b);
    cudaFree(dev_c);
    return 0;
} // End main
```
### Assumptions:
- Assume CUDA threads execute on a physically separate device from the host
- Assume both the host and the device have their own separate memory spaces in DRAM
### Thread vs. Block
- The number of threads per block is limited, while the number of blocks is not
(later blocks can wait until earlier blocks finish)
- Threads within a block have access to shared memory, while different blocks
do not share memory — they communicate only through global memory.
## Examples and tasks

Binary file not shown.

View file

@ -1,5 +1,5 @@
#include <stdio.h> #include <stdio.h>
#define N 10 #define N 16
// 1. Define the kernel // 1. Define the kernel
__global__ void add(int *a, int *b, int *c) { __global__ void add(int *a, int *b, int *c) {
int tid = blockIdx.x; // handle the data at this index int tid = blockIdx.x; // handle the data at this index

31
assets/code/test2.cu Normal file
View file

@ -0,0 +1,31 @@
#include <stdio.h>
// Device-side scalar addition: stores *a + *b into *c.
// All three pointers must reference device memory.
__global__ void add(int *a, int *b, int *c) {
    c[0] = a[0] + b[0];
}
// Host driver: runs add() on one value pair (2 + 7) and prints the result.
// Returns 0 on success, 1 if the kernel launch fails.
int main(void) {
    int a, b, c;           // host copies of a, b, c
    int *d_a, *d_b, *d_c;  // device copies of a, b, c
    int size = sizeof(int);
    // Allocate space for device copies of a, b, c
    cudaMalloc((void **)&d_a, size);
    cudaMalloc((void **)&d_b, size);
    cudaMalloc((void **)&d_c, size);
    // Setup input values
    a = 2;
    b = 7;
    // Copy inputs to device
    cudaMemcpy(d_a, &a, size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, &b, size, cudaMemcpyHostToDevice);
    // Launch add() kernel on GPU
    add<<<1, 1>>>(d_a, d_b, d_c);
    // Launches do not report errors directly; check before trusting d_c
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        fprintf(stderr, "kernel launch failed: %s\n", cudaGetErrorString(err));
        return 1;
    }
    // Copy result back to host (blocking, so the kernel has completed)
    cudaMemcpy(&c, d_c, size, cudaMemcpyDeviceToHost);
    // FIX: terminate the output line with '\n' so it prints cleanly
    printf("result c: %d\n", c);
    // Cleanup
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);
    return 0;
}

42
assets/code/test3.cu Normal file
View file

@ -0,0 +1,42 @@
#include <stdio.h>
#define N 6
// Element-wise vector addition, one element per block: block i computes
// c[i] = a[i] + b[i]. All pointers must reference device memory.
__global__ void add(int *a, int *b, int *c) {
    int blk = blockIdx.x;  // the block index doubles as the data index
    printf("bid: %d\n", blk);
    // Bounds guard: blocks past the first N own no data element, so a
    // larger grid is harmless. (The students can also use a "for loop here".)
    if (blk >= N) return;
    c[blk] = a[blk] + b[blk];
    printf("c: %d\n", c[blk]);
}
// Host driver for the block-per-element add() kernel over N ints.
// Returns 0 on success, 1 if the kernel launch fails.
int main(void) {
    int a[N], b[N], c[N];        // host copies
    int *dev_a, *dev_b, *dev_c;  // device copies
    // allocate memory to device
    cudaMalloc((void **)&dev_a, N * sizeof(int));
    cudaMalloc((void **)&dev_b, N * sizeof(int));
    cudaMalloc((void **)&dev_c, N * sizeof(int));
    // Fill arrays "a" and "b" with values on the host
    for (int i = 0; i < N; i++) {
        a[i] = i;
        b[i] = i * i;
    }
    // Copy arrays "a" and "b" to the device
    cudaMemcpy(dev_a, a, N * sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(dev_b, b, N * sizeof(int), cudaMemcpyHostToDevice);
    // Launch one block per element. FIX: the grid size was hard-coded
    // to 12 — wasteful for N = 6, and it would silently compute only
    // the first 12 elements if N were ever raised above 12. Tie the
    // grid size to N instead.
    add<<<N, 1>>>(dev_a, dev_b, dev_c);
    // Launches do not report errors directly; check before trusting dev_c
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        fprintf(stderr, "kernel launch failed: %s\n", cudaGetErrorString(err));
        return 1;
    }
    // Copy the array "c" from the device to the host
    // (blocking, so the kernel has completed by the time it returns)
    cudaMemcpy(c, dev_c, N * sizeof(int), cudaMemcpyDeviceToHost);
    // Print the array "c"
    for (int i = 0; i < N; i++) {
        printf("%d\n", c[i]);
    }
    // Free memory allocated to the device
    cudaFree(dev_a);
    cudaFree(dev_b);
    cudaFree(dev_c);
    return 0;
} // End main

162
mdsf.json Normal file
View file

@ -0,0 +1,162 @@
{
"$schema": "https://raw.githubusercontent.com/hougesen/mdsf/main/schemas/v0.3.2/mdsf.schema.json",
"format_finished_document": false,
"javascript_runtime": "node",
"languages": {
"c": "clang-format",
"clojure": [
[
"cljstyle",
"joker"
]
],
"cpp": "clang-format",
"crystal": "crystal:format",
"csharp": [
[
"csharpier",
"clang-format"
]
],
"css": [
[
"prettier",
"stylelint"
]
],
"d": "dfmt",
"dart": "dart:format",
"elixir": "mix:format",
"elm": "elm-format",
"erlang": [
[
"erlfmt",
"efmt"
]
],
"gleam": "gleam:format",
"go": [
[
"gci",
"goimports-reviser",
"goimports"
],
[
"gofumpt",
"gofmt",
"crlfmt"
]
],
"haskell": [
[
"fourmolu",
"ormolu",
"hindent",
"stylish-haskell"
]
],
"html": [
[
"prettier",
"djlint"
]
],
"java": [
[
"google-java-format",
"clang-format"
]
],
"javascript": [
[
"prettier",
"biome:format",
"deno:fmt",
"clang-format",
"standardjs"
]
],
"json": [
[
"prettier",
"biome:format",
"deno:fmt",
"clang-format"
]
],
"kotlin": "ktfmt",
"lua": [
[
"stylua",
"luaformatter"
]
],
"nim": "nimpretty",
"ocaml": [
[
"ocamlformat",
"ocp-indent"
]
],
"python": [
[
"usort",
"isort"
],
[
"ruff:format",
"blue",
"black",
"yapf",
"autopep8",
"pyink"
]
],
"roc": "roc:format",
"ruby": [
[
"rubocop",
"rufo",
"rubyfmt",
"standardrb"
]
],
"rust": "rustfmt",
"shell": [
[
"shfmt",
"beautysh"
]
],
"sql": [
[
"sql-formatter",
"sqlfluff:format",
"sqlfmt"
]
],
"swift": [
[
"swift-format",
"swiftformat"
]
],
"toml": "taplo",
"typescript": [
[
"prettier",
"biome:format",
"deno:fmt"
]
],
"yaml": [
[
"prettier",
"yamlfmt",
"yamlfix"
]
],
"zig": "zig:fmt"
},
"custom_file_extensions": {}
}