// CUDA example: element-wise addition of two N-element integer vectors on the GPU.
#include <stdio.h>
#include <stdlib.h>
#define N 10
// 1. Define the kernel
|
||
|
__global__ void add(int *a, int *b, int *c) {
|
||
|
int tid = blockIdx.x; // handle the data at this index
|
||
|
if (tid < N)
|
||
|
c[tid] = a[tid] + b[tid];
|
||
|
}
|
||
|
|
||
|
// 2. Declare the main method
|
||
|
int main(void) {
|
||
|
int a[N], b[N], c[N];
|
||
|
int *dev_a, *dev_b, *dev_c;
|
||
|
// 3. allocate the memory on the GPU
|
||
|
cudaMalloc((void **)&dev_a, N * sizeof(int));
|
||
|
cudaMalloc((void **)&dev_b, N * sizeof(int));
|
||
|
cudaMalloc((void **)&dev_c, N * sizeof(int));
|
||
|
// 4. fill the arrays 'a' and 'b' on the CPU
|
||
|
for (int i = 0; i < N; i++) {
|
||
|
a[i] = -i;
|
||
|
b[i] = i * i;
|
||
|
}
|
||
|
// 5. copy the arrays 'a' and 'b' to the GPU
|
||
|
cudaMemcpy(dev_a, a, N * sizeof(int), cudaMemcpyHostToDevice);
|
||
|
cudaMemcpy(dev_b, b, N * sizeof(int), cudaMemcpyHostToDevice);
|
||
|
// 6. launch the kernel on the GPU
|
||
|
add<<<N, 1>>>(dev_a, dev_b, dev_c);
|
||
|
// 7. copy the array 'c' back from the GPU to the CPU
|
||
|
cudaMemcpy(c, dev_c, N * sizeof(int), cudaMemcpyDeviceToHost);
|
||
|
// 8. the results through the CPU
|
||
|
for (int i = 0; i < N; i++) {
|
||
|
printf("%d + %d = %d\n", a[i], b[i], c[i]);
|
||
|
}
|
||
|
// 9. free the memory allocated on the GPU
|
||
|
cudaFree(dev_a);
|
||
|
cudaFree(dev_b);
|
||
|
cudaFree(dev_c);
|
||
|
return 0;
|
||
|
}
|