CUDA advanced Took 1hr 50 mins
This commit is contained in:
parent eeb097662a
commit b3a8881805

@@ -42,6 +42,7 @@
...
}
```
- Note to self: run `nvcc` with `-arch=native`; otherwise `nvcc` builds for a default GPU architecture that may not match the installed card, and the kernel will not run.
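  A minimal compile-and-run sketch for that flag; the file name `test.cu` is just a placeholder:

  ```sh
  nvcc -arch=native test.cu -o test
  ./test
  ```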
### Device code and Host code
2-3-advanced-cuda.md (new file, 127 lines)
@@ -0,0 +1,127 @@
# Advanced CUDA Programming

## Memory management

- Use the following calls:
  - `cudaMalloc`: allocate device memory
  - `cudaFree`: free device memory
  - `cudaMemcpy`: copy memory between host and device
### Example: Addition on device

```c
__global__ void add(int *a, int *b, int *c) {
  // Since this runs on the device, a, b and c must all point to
  // device memory
  *c = *a + *b;
}

int main(void) {
  int a, b, c;          // host copies of a, b, c
  int *d_a, *d_b, *d_c; // device copies of a, b, c
  int size = sizeof(int);

  // Allocate space for device copies of a, b, c
  cudaMalloc((void **)&d_a, size);
  cudaMalloc((void **)&d_b, size);
  cudaMalloc((void **)&d_c, size);

  // Setup input values
  a = 2;
  b = 7;

  // Copy inputs to device
  cudaMemcpy(d_a, &a, size, cudaMemcpyHostToDevice);
  cudaMemcpy(d_b, &b, size, cudaMemcpyHostToDevice);

  // Launch add() kernel on GPU
  add<<<1, 1>>>(d_a, d_b, d_c);

  // Copy result back to host
  cudaMemcpy(&c, d_c, size, cudaMemcpyDeviceToHost);

  // Cleanup
  cudaFree(d_a);
  cudaFree(d_b);
  cudaFree(d_c);
  return 0;
}
```
### Memory in GPU and CPU

- Neither side can dereference a pointer that points to memory it does not own (see the sketch below):
  - Device pointers point to GPU memory; they can be passed to and from the host,
    but cannot be dereferenced by the host
  - Use the memory-management APIs to manage device memory: [link](#memory-management)
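A minimal fragment illustrating that rule; this is my own sketch, reusing the `add` kernel from the example above:

```c
int *d_x; // device pointer, declared in host code
cudaMalloc((void **)&d_x, sizeof(int));

// *d_x = 5;  // WRONG: the host cannot dereference a device pointer
int h_x = 5;
cudaMemcpy(d_x, &h_x, sizeof(int), cudaMemcpyHostToDevice); // OK: copy through the API
add<<<1, 1>>>(d_x, d_x, d_x); // OK: pass the device pointer to a kernel
cudaFree(d_x);
```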
## Heterogeneous programming

- The model is heterogeneous because both the host CPU and the device GPU are used
- A program contains serial code, executed by the host CPU, and parallel kernel code,
  executed by the GPU
## Parallel computing with CUDA

### Parallel Programming

- Kernel: executed on the GPU as an array of threads running in parallel
- Threads: all execute the same code, but can take different paths
  - Each thread has an ID, used to select data and make control decisions
    (see the indexing sketch below)
- Blocks: threads are grouped into blocks; each block is scheduled onto a
  streaming multiprocessor
- Grids: blocks are grouped into a grid
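A common way to build a global index from these IDs; this is my own sketch (the kernel name `add_indexed` and the 128-thread block size are arbitrary), not part of the notes:

```c
__global__ void add_indexed(int *a, int *b, int *c, int n) {
  // Global index: block number times threads per block, plus position in the block
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) { // guard against extra threads in the last block
    c[i] = a[i] + b[i];
  }
}

// Launch with enough blocks to cover n elements, e.g. 128 threads per block:
// add_indexed<<<(n + 127) / 128, 128>>>(d_a, d_b, d_c, n);
```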
### Iterations

```c
#include <stdio.h>

__global__ void add(int *a, int *b, int *c) {
  int bid = blockIdx.x;
  printf("bid: %d\n", bid);
  if (bid < 6) { // students could also use a for loop here
    c[bid] = a[bid] + b[bid];
    printf("c: %d\n", c[bid]);
  }
}

int main(void) {
  int a[6], b[6], c[6];
  int *dev_a, *dev_b, *dev_c;

  // Allocate memory on the device
  cudaMalloc((void **)&dev_a, 6 * sizeof(int));
  cudaMalloc((void **)&dev_b, 6 * sizeof(int));
  cudaMalloc((void **)&dev_c, 6 * sizeof(int));

  // Fill arrays "a" and "b" with values on the host
  for (int i = 0; i < 6; i++) {
    a[i] = i;
    b[i] = i * i;
  }

  // Copy arrays "a" and "b" to the device
  cudaMemcpy(dev_a, a, 6 * sizeof(int), cudaMemcpyHostToDevice);
  cudaMemcpy(dev_b, b, 6 * sizeof(int), cudaMemcpyHostToDevice);

  // Launch the kernel: one block per element (6 blocks, 1 thread each)
  add<<<6, 1>>>(dev_a, dev_b, dev_c);

  // Copy the array "c" from the device to the host
  cudaMemcpy(c, dev_c, 6 * sizeof(int), cudaMemcpyDeviceToHost);

  // Print the array "c"
  for (int i = 0; i < 6; i++) {
    printf("%d\n", c[i]);
  }

  // Free memory allocated on the device
  cudaFree(dev_a);
  cudaFree(dev_b);
  cudaFree(dev_c);
  return 0;
}
```
### Assumptions

- CUDA threads are assumed to execute on a physically separate device from the host
- The host and the device are assumed to have their own separate memory spaces in DRAM
### Thread vs. Block

- The number of threads per block is limited, while the number of blocks is not
  (excess blocks simply wait until earlier blocks finish)
- Threads within a block share memory, while separate blocks do not share memory
  and can only communicate through global memory (see the sketch below)
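A small sketch of block-level shared memory; this is my own illustration (the kernel name `block_sum` and the block size are arbitrary), not part of the notes:

```c
#define THREADS 4

// Each block sums THREADS consecutive elements of "in" into one entry of "out".
__global__ void block_sum(int *in, int *out) {
  __shared__ int buf[THREADS]; // visible only to the threads of this block
  int tid = threadIdx.x;
  buf[tid] = in[blockIdx.x * THREADS + tid];
  __syncthreads(); // wait until every thread in the block has written buf

  if (tid == 0) { // one thread per block produces the block's result
    int sum = 0;
    for (int i = 0; i < THREADS; i++) sum += buf[i];
    out[blockIdx.x] = sum; // blocks exchange results only via global memory
  }
}

// Launch example: one output element per block
// block_sum<<<num_blocks, THREADS>>>(d_in, d_out);
```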
## Examples and tasks
Binary file not shown.

@@ -1,5 +1,5 @@
#include <stdio.h>
-#define N 10
+#define N 16
// 1. Define the kernel
__global__ void add(int *a, int *b, int *c) {
  int tid = blockIdx.x; // handle the data at this index
assets/code/test2.cu (new file, 31 lines)
@@ -0,0 +1,31 @@
#include <stdio.h>

__global__ void add(int *a, int *b, int *c) {
  *c = *a + *b;
}

int main(void) {
  int a, b, c;          // host copies of a, b, c
  int *d_a, *d_b, *d_c; // device copies of a, b, c
  int size = sizeof(int);

  // Allocate space for device copies of a, b, c
  cudaMalloc((void **)&d_a, size);
  cudaMalloc((void **)&d_b, size);
  cudaMalloc((void **)&d_c, size);

  // Setup input values
  a = 2;
  b = 7;

  // Copy inputs to device
  cudaMemcpy(d_a, &a, size, cudaMemcpyHostToDevice);
  cudaMemcpy(d_b, &b, size, cudaMemcpyHostToDevice);

  // Launch add() kernel on GPU
  add<<<1, 1>>>(d_a, d_b, d_c);

  // Copy result back to host
  cudaMemcpy(&c, d_c, size, cudaMemcpyDeviceToHost);
  printf("result c: %d", c);

  // Cleanup
  cudaFree(d_a);
  cudaFree(d_b);
  cudaFree(d_c);
  return 0;
}
assets/code/test3.cu (new file, 42 lines)
@@ -0,0 +1,42 @@
#include <stdio.h>
#define N 6

__global__ void add(int *a, int *b, int *c) {
  int bid = blockIdx.x;
  printf("bid: %d\n", bid);
  if (bid < N) { // students could also use a for loop here
    c[bid] = a[bid] + b[bid];
    printf("c: %d\n", c[bid]);
  }
}

int main(void) {
  int a[N], b[N], c[N];
  int *dev_a, *dev_b, *dev_c;

  // Allocate memory on the device
  cudaMalloc((void **)&dev_a, N * sizeof(int));
  cudaMalloc((void **)&dev_b, N * sizeof(int));
  cudaMalloc((void **)&dev_c, N * sizeof(int));

  // Fill arrays "a" and "b" with values on the host
  for (int i = 0; i < N; i++) {
    a[i] = i;
    b[i] = i * i;
  }

  // Copy arrays "a" and "b" to the device
  cudaMemcpy(dev_a, a, N * sizeof(int), cudaMemcpyHostToDevice);
  cudaMemcpy(dev_b, b, N * sizeof(int), cudaMemcpyHostToDevice);

  // Launch the kernel: 12 blocks of 1 thread; the bid < N guard skips the extra blocks
  add<<<12, 1>>>(dev_a, dev_b, dev_c);

  // Copy the array "c" from the device to the host
  cudaMemcpy(c, dev_c, N * sizeof(int), cudaMemcpyDeviceToHost);

  // Print the array "c"
  for (int i = 0; i < N; i++) {
    printf("%d\n", c[i]);
  }

  // Free memory allocated on the device
  cudaFree(dev_a);
  cudaFree(dev_b);
  cudaFree(dev_c);
  return 0;
} // End main
mdsf.json (new file, 162 lines)
@@ -0,0 +1,162 @@
{
  "$schema": "https://raw.githubusercontent.com/hougesen/mdsf/main/schemas/v0.3.2/mdsf.schema.json",
  "format_finished_document": false,
  "javascript_runtime": "node",
  "languages": {
    "c": "clang-format",
    "clojure": [["cljstyle", "joker"]],
    "cpp": "clang-format",
    "crystal": "crystal:format",
    "csharp": [["csharpier", "clang-format"]],
    "css": [["prettier", "stylelint"]],
    "d": "dfmt",
    "dart": "dart:format",
    "elixir": "mix:format",
    "elm": "elm-format",
    "erlang": [["erlfmt", "efmt"]],
    "gleam": "gleam:format",
    "go": [["gci", "goimports-reviser", "goimports"], ["gofumpt", "gofmt", "crlfmt"]],
    "haskell": [["fourmolu", "ormolu", "hindent", "stylish-haskell"]],
    "html": [["prettier", "djlint"]],
    "java": [["google-java-format", "clang-format"]],
    "javascript": [["prettier", "biome:format", "deno:fmt", "clang-format", "standardjs"]],
    "json": [["prettier", "biome:format", "deno:fmt", "clang-format"]],
    "kotlin": "ktfmt",
    "lua": [["stylua", "luaformatter"]],
    "nim": "nimpretty",
    "ocaml": [["ocamlformat", "ocp-indent"]],
    "python": [["usort", "isort"], ["ruff:format", "blue", "black", "yapf", "autopep8", "pyink"]],
    "roc": "roc:format",
    "ruby": [["rubocop", "rufo", "rubyfmt", "standardrb"]],
    "rust": "rustfmt",
    "shell": [["shfmt", "beautysh"]],
    "sql": [["sql-formatter", "sqlfluff:format", "sqlfmt"]],
    "swift": [["swift-format", "swiftformat"]],
    "toml": "taplo",
    "typescript": [["prettier", "biome:format", "deno:fmt"]],
    "yaml": [["prettier", "yamlfmt", "yamlfix"]],
    "zig": "zig:fmt"
  },
  "custom_file_extensions": {}
}