finish cuda programming, took around 1.8 hrs
This commit is contained in:
parent
557be68e19
commit
eeb097662a
138
2-2-cuda.md
Normal file
138
2-2-cuda.md
Normal file
|
@ -0,0 +1,138 @@
|
||||||
|
# CUDA
|
||||||
|
|
||||||
|
## Definition
|
||||||
|
|
||||||
|
- Compute Unified Device Architecture
|
||||||
|
- Is a general purpose **parallel** programming platform and model
|
||||||
|
|
||||||
|
## Scalability with CUDA
|
||||||
|
|
||||||
|
- GPU is has an array of Streaming Multiprocessors
|
||||||
|
- A program can be partitioned into blocks that execute independently
|
||||||
|
- Allow performance to scale with the number of GPU multiprocessors
|
||||||
|
|
||||||
|
## Architecture
|
||||||
|
|
||||||
|
### Kernel
|
||||||
|
|
||||||
|
- CUDA extends C by allow to define C functions, which is called a kernel
|
||||||
|
- Kernel is executed in parallel, by N different CUDA threads, rather than
|
||||||
|
only once in serial
|
||||||
|
- Defined with specifier `__global__`
|
||||||
|
- Number of threads is given using `<<<NBlocks, NThreads>>>`, and used with
|
||||||
|
`func_name<<<NBlocks, NThreads>>>(params);`
|
||||||
|
- They take an integer or dim3, so it cal also be called Dimension of
|
||||||
|
Blocks, or Dimension of Threads
|
||||||
|
- Thread number is given an ID, accessible by `threadIdx` variable
|
||||||
|
- Example:
|
||||||
|
```c
|
||||||
|
// Add array A and B of size N together, store it in C
|
||||||
|
#define N 2
|
||||||
|
// Kernel definition
|
||||||
|
__global__ void VecAdd(float* A, float* B, float* C) {
|
||||||
|
// Each thread performs one pair wise addition
|
||||||
|
int i = threadIdx.x;
|
||||||
|
// The array is a float, meaning we need to allocate memory on GPU
|
||||||
|
C[i] = A[i] + B[i];
|
||||||
|
}
|
||||||
|
int main() {
|
||||||
|
...
|
||||||
|
// Kernel invocation with N threads
|
||||||
|
VecAdd<<<1, N>>>(A, B, C);
|
||||||
|
...
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### Device code and Host code
|
||||||
|
|
||||||
|
- Compiler `nvcc` separates source code into host and device code
|
||||||
|
- Device code (kernels): definition marked with `__global__`, called from
|
||||||
|
host code (with `func<<<x, y>>>(a, b)`), and run on device (GPU)
|
||||||
|
- Host code: normal C code, processed by system compiler like `gcc` or
|
||||||
|
`clang`
|
||||||
|
- If the device code definitions are used, the compiled code execute on both CPU
|
||||||
|
and GPU
|
||||||
|
|
||||||
|
### Thread hierarchy
|
||||||
|
|
||||||
|
- `threadIdx` is a 3 component vector
|
||||||
|
- Enables threads to be identified in 1-3 dimension blocks, which can form a
|
||||||
|
**grid** of thread blocks
|
||||||
|
- Number of threads depend on the size of data being processed
|
||||||
|
- Features:
|
||||||
|
- Maximum number of threads per block: because they reside on the same
|
||||||
|
block, on the same processor core
|
||||||
|
- A kernel can be executed by multiple equally shaped thread blocks, so the
|
||||||
|
total number of threads is threads per block times blocks
|
||||||
|
- Example:
|
||||||
|
```c
|
||||||
|
#define N 10
|
||||||
|
// Kernel definition
|
||||||
|
__global__ void MatAdd(float A[N][N], float B[N][N], float C[N][N]) {
|
||||||
|
int i = threadIdx.x;
|
||||||
|
int j = threadIdx.y;
|
||||||
|
C[i][j] = A[i][j] + B[i][j];
|
||||||
|
}
|
||||||
|
int main() {
|
||||||
|
...
|
||||||
|
// Kernel invocation with one block of N * N * 1 threads
|
||||||
|
int numBlocks = 1;
|
||||||
|
dim3 threadsPerBlock(N, N);
|
||||||
|
MatAdd<<<numBlocks, threadsPerBlock>>>(A, B, C);
|
||||||
|
...
|
||||||
|
}
|
||||||
|
```
|
||||||
|
- A complete example:
|
||||||
|
|
||||||
|
```c
|
||||||
|
|
||||||
|
```
|
||||||
|
|
||||||
|
### Block Hierarchy
|
||||||
|
|
||||||
|
- Blocks can be represented in 1-3 dimensional **grids**, size is dependent on
|
||||||
|
the input data
|
||||||
|
- Accessing
|
||||||
|
- Block index: `blockIdx`
|
||||||
|
- Dimension: `blockDim`
|
||||||
|
- Example: multi block kernel:
|
||||||
|
```c
|
||||||
|
#define N 16
|
||||||
|
// Kernel definition
|
||||||
|
__global__ void MatAdd(float A[N][N], float B[N][N],
|
||||||
|
float C[N][N]) {
|
||||||
|
int i = blockIdx.x * blockDim.x + threadIdx.x;
|
||||||
|
int j = blockIdx.y * blockDim.y + threadIdx.y;
|
||||||
|
if (i < N && j < N)
|
||||||
|
C[i][j] = A[i][j] + B[i][j];
|
||||||
|
}
|
||||||
|
int main() {
|
||||||
|
...
|
||||||
|
// Kernel invocation
|
||||||
|
dim3 threadsPerBlock(16, 16);
|
||||||
|
dim3 numBlocks(N / threadsPerBlock.x, N / threadsPerBlock.y);
|
||||||
|
MatAdd<<<numBlocks, threadsPerBlock>>>(A, B, C);
|
||||||
|
...
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### Memory Hierarchy
|
||||||
|
|
||||||
|
- Thread: Registers, Local memory
|
||||||
|
- Block of threads: Shared memory
|
||||||
|
- All Blocks: Global memory
|
||||||
|
|
||||||
|
### GPU vs. CPU
|
||||||
|
|
||||||
|
- GPU has more stream processors, and each processor has much more ALUs
|
||||||
|
- TODO: see code in assets/code, and figure shit out, and index array
|
||||||
|
|
||||||
|
## Coordinating with CPU
|
||||||
|
|
||||||
|
- GPU kernel launch are **asynchronous**, while control returns to CPU
|
||||||
|
immediately
|
||||||
|
- CPU need to **synchronize** before using the results
|
||||||
|
- `cudaMemcpy()`: block and copy
|
||||||
|
- `cudaMemcpyAsync()`: copy but don't block
|
||||||
|
- `cudaDeviceSynchronize()`: Block the CPU until all CUDA calls before have
|
||||||
|
completed
|
BIN
assets/code/a.out
Executable file
BIN
assets/code/a.out
Executable file
Binary file not shown.
10
assets/code/test.cu
Normal file
10
assets/code/test.cu
Normal file
|
@ -0,0 +1,10 @@
|
||||||
|
#include <cstdio>
|
||||||
|
__global__ void mykernel(void) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
int main(void) {
|
||||||
|
mykernel<<<1,1>>>();
|
||||||
|
std::printf("Hello, World!\n");
|
||||||
|
return 0;
|
||||||
|
}
|
39
assets/code/test1.cu
Normal file
39
assets/code/test1.cu
Normal file
|
@ -0,0 +1,39 @@
|
||||||
|
#include <stdio.h>
|
||||||
|
#define N 10
|
||||||
|
// 1. Define the kernel
|
||||||
|
__global__ void add(int *a, int *b, int *c) {
|
||||||
|
int tid = blockIdx.x; // handle the data at this index
|
||||||
|
if (tid < N)
|
||||||
|
c[tid] = a[tid] + b[tid];
|
||||||
|
}
|
||||||
|
|
||||||
|
// 2. Declare the main method
|
||||||
|
int main(void) {
|
||||||
|
int a[N], b[N], c[N];
|
||||||
|
int *dev_a, *dev_b, *dev_c;
|
||||||
|
// 3. allocate the memory on the GPU
|
||||||
|
cudaMalloc((void **)&dev_a, N * sizeof(int));
|
||||||
|
cudaMalloc((void **)&dev_b, N * sizeof(int));
|
||||||
|
cudaMalloc((void **)&dev_c, N * sizeof(int));
|
||||||
|
// 4. fill the arrays 'a' and 'b' on the CPU
|
||||||
|
for (int i = 0; i < N; i++) {
|
||||||
|
a[i] = -i;
|
||||||
|
b[i] = i * i;
|
||||||
|
}
|
||||||
|
// 5. copy the arrays 'a' and 'b' to the GPU
|
||||||
|
cudaMemcpy(dev_a, a, N * sizeof(int), cudaMemcpyHostToDevice);
|
||||||
|
cudaMemcpy(dev_b, b, N * sizeof(int), cudaMemcpyHostToDevice);
|
||||||
|
// 6. launch the kernel on the GPU
|
||||||
|
add<<<N, 1>>>(dev_a, dev_b, dev_c);
|
||||||
|
// 7. copy the array 'c' back from the GPU to the CPU
|
||||||
|
cudaMemcpy(c, dev_c, N * sizeof(int), cudaMemcpyDeviceToHost);
|
||||||
|
// 8. the results through the CPU
|
||||||
|
for (int i = 0; i < N; i++) {
|
||||||
|
printf("%d + %d = %d\n", a[i], b[i], c[i]);
|
||||||
|
}
|
||||||
|
// 9. free the memory allocated on the GPU
|
||||||
|
cudaFree(dev_a);
|
||||||
|
cudaFree(dev_b);
|
||||||
|
cudaFree(dev_c);
|
||||||
|
return 0;
|
||||||
|
}
|
Loading…
Reference in a new issue