# Advanced CUDA Programming

## Memory management
- Use the following calls:
  - `cudaMalloc`: allocate device memory
  - `cudaFree`: free device memory
  - `cudaMemcpy`: copy memory between host and device

### Example: Addition on device
```c
__global__ void add(int *a, int *b, int *c) {
  // Since this runs on the device, a, b, and c must point
  // to device memory
  *c = *a + *b;
}

int main(void) {
  int a, b, c;          // host copies of a, b, c
  int *d_a, *d_b, *d_c; // device copies of a, b, c
  int size = sizeof(int);

  // Allocate space for device copies of a, b, c
  cudaMalloc((void **)&d_a, size);
  cudaMalloc((void **)&d_b, size);
  cudaMalloc((void **)&d_c, size);

  // Setup input values
  a = 2;
  b = 7;

  // Copy inputs to device
  cudaMemcpy(d_a, &a, size, cudaMemcpyHostToDevice);
  cudaMemcpy(d_b, &b, size, cudaMemcpyHostToDevice);

  // Launch add() kernel on GPU
  add<<<1, 1>>>(d_a, d_b, d_c);

  // Copy result back to host
  cudaMemcpy(&c, d_c, size, cudaMemcpyDeviceToHost);

  // Cleanup
  cudaFree(d_a);
  cudaFree(d_b);
  cudaFree(d_c);
  return 0;
}
```
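In practice the runtime calls above are checked: `cudaMalloc`, `cudaMemcpy`, and `cudaFree` all return a `cudaError_t`. A minimal sketch of that pattern, dropped into the example above (it assumes `<stdio.h>` is included; not part of the original example):

```c
// Hypothetical error check around the first allocation above
cudaError_t err = cudaMalloc((void **)&d_a, size);
if (err != cudaSuccess) {
  fprintf(stderr, "cudaMalloc failed: %s\n", cudaGetErrorString(err));
  return 1;
}
```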
### Memory in GPU and CPU
- Host and device code cannot dereference a pointer into memory that is not on
  their side:
  - Device pointers point to GPU memory; they can be passed to and from the
    host, but they cannot be dereferenced by the host
  - Use the runtime API to manage device memory: [link](#memory-management)
    (a sketch follows below)
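A minimal sketch of that rule (hypothetical code, not from the original notes): the host must move data through `cudaMemcpy` instead of dereferencing a device pointer.

```c
int main(void) {
  int x = 5, y = 0;
  int *d_x;
  cudaMalloc((void **)&d_x, sizeof(int));

  // WRONG: d_x points to GPU memory, so the host must not dereference it
  // y = *d_x; // undefined behavior on the host, typically a crash

  // RIGHT: move the data through the runtime API, then read the host copy
  cudaMemcpy(d_x, &x, sizeof(int), cudaMemcpyHostToDevice);
  cudaMemcpy(&y, d_x, sizeof(int), cudaMemcpyDeviceToHost);

  cudaFree(d_x);
  return 0; // y now holds 5
}
```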
## Heterogeneous programming
- Called heterogeneous because both the host CPU and the device GPU are used
- The code contains serial code, executed by the host CPU, and parallel kernel
  code, executed by the GPU
## Parallel Computing with CUDA

### Parallel Programming
- Kernel: executed on the GPU as an array of threads in parallel
- Threads: all execute the same code, but can take different paths
  - Each thread has an ID, used to select data and make control decisions (see
    the indexing sketch below)
- Blocks: threads are grouped into blocks; each block is scheduled onto a
  streaming multiprocessor
- Grids: blocks are grouped into grids
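How the thread ID is typically used: the block and thread indices are combined into one global element index. A minimal sketch of the standard pattern (the kernel name, the `n` parameter, and the launch geometry are assumptions, not from the notes):

```c
__global__ void add_elems(int *a, int *b, int *c, int n) {
  // Combine block and thread IDs into a global element index
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) // guard threads that fall past the end of the arrays
    c[i] = a[i] + b[i];
}

// Launched with enough 256-thread blocks to cover n elements:
// add_elems<<<(n + 255) / 256, 256>>>(d_a, d_b, d_c, n);
```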
### Iterations
```c
#include <stdio.h>

__global__ void add(int *a, int *b, int *c) {
  int bid = blockIdx.x;
  printf("bid: %d\n", bid);
  if (bid < 6) { // The students can also use a "for loop" here
    c[bid] = a[bid] + b[bid];
    printf("c: %d\n", c[bid]);
  }
}

int main(void) {
  int a[6], b[6], c[6];
  int *dev_a, *dev_b, *dev_c;

  // Allocate memory on the device
  cudaMalloc((void **)&dev_a, 6 * sizeof(int));
  cudaMalloc((void **)&dev_b, 6 * sizeof(int));
  cudaMalloc((void **)&dev_c, 6 * sizeof(int));

  // Fill arrays "a" and "b" with values on the host
  for (int i = 0; i < 6; i++) {
    a[i] = i;
    b[i] = i * i;
  }

  // Copy arrays "a" and "b" to the device
  cudaMemcpy(dev_a, a, 6 * sizeof(int), cudaMemcpyHostToDevice);
  cudaMemcpy(dev_b, b, 6 * sizeof(int), cudaMemcpyHostToDevice);

  // Launch the kernel with one block per element
  // (6 blocks, not 4, so every element of "c" is computed)
  add<<<6, 1>>>(dev_a, dev_b, dev_c);

  // Copy the array "c" from the device to the host
  cudaMemcpy(c, dev_c, 6 * sizeof(int), cudaMemcpyDeviceToHost);

  // Print the array "c"
  for (int i = 0; i < 6; i++) {
    printf("%d\n", c[i]);
  }

  // Free memory allocated on the device
  cudaFree(dev_a);
  cudaFree(dev_b);
  cudaFree(dev_c);
  return 0;
} // End main
```
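The "for loop" variant hinted at in the kernel comment would look roughly like this (a sketch using the same arrays, launched with a single block and thread so one GPU thread walks the elements serially):

```c
__global__ void add_loop(int *a, int *b, int *c) {
  // One thread iterates over all six elements instead of
  // one block handling one element each
  for (int i = 0; i < 6; i++)
    c[i] = a[i] + b[i];
}

// Launched as: add_loop<<<1, 1>>>(dev_a, dev_b, dev_c);
```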
### Assumptions
- Assume CUDA threads execute on a physically separate device from the host
- Assume the host and the device each have their own separate memory space in
  DRAM
### Thread vs. Block
- Threads per block are limited, while the number of blocks is not (blocks can
  wait until previous blocks finish)
- Threads in a block have shared memory, while blocks have no shared memory
  between them, only global memory (see the sketch below)
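A minimal sketch of that difference (hypothetical kernel, not from the notes; assumes a launch with exactly 256 threads per block): `__shared__` memory is visible only within one block, so per-block results must be handed to other blocks through global memory.

```c
__global__ void block_sum(int *in, int *out) {
  __shared__ int tile[256]; // shared memory: visible only to this block
  int t = threadIdx.x;
  tile[t] = in[blockIdx.x * blockDim.x + t];
  __syncthreads(); // wait until every thread in the block has written

  // Thread 0 sums this block's tile; other blocks cannot read "tile",
  // so the per-block result goes back out through global memory
  if (t == 0) {
    int s = 0;
    for (int i = 0; i < blockDim.x; i++)
      s += tile[i];
    out[blockIdx.x] = s;
  }
}
```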
## Examples and tasks