#include <stdio.h>

#define N 16

// 1. Define the kernel
__global__ void add(int *a, int *b, int *c) {
    int tid = blockIdx.x; // handle the data at this index
    if (tid < N)
        c[tid] = a[tid] + b[tid];
}

// 2. Declare the main method
int main(void) {
    int a[N], b[N], c[N];
    int *dev_a, *dev_b, *dev_c;

    // 3. allocate the memory on the GPU
    cudaMalloc((void **)&dev_a, N * sizeof(int));
    cudaMalloc((void **)&dev_b, N * sizeof(int));
    cudaMalloc((void **)&dev_c, N * sizeof(int));

    // 4. fill the arrays 'a' and 'b' on the CPU
    for (int i = 0; i < N; i++) {
        a[i] = -i;
        b[i] = i * i;
    }

    // 5. copy the arrays 'a' and 'b' to the GPU
    cudaMemcpy(dev_a, a, N * sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(dev_b, b, N * sizeof(int), cudaMemcpyHostToDevice);

    // 6. launch the kernel on the GPU: N blocks of one thread each,
    //    so blockIdx.x indexes the elements
    add<<<N, 1>>>(dev_a, dev_b, dev_c);

    // 7. copy the array 'c' back from the GPU to the CPU
    cudaMemcpy(c, dev_c, N * sizeof(int), cudaMemcpyDeviceToHost);

    // 8. display the results on the CPU
    for (int i = 0; i < N; i++) {
        printf("%d + %d = %d\n", a[i], b[i], c[i]);
    }

    // 9. free the memory allocated on the GPU
    cudaFree(dev_a);
    cudaFree(dev_b);
    cudaFree(dev_c);

    return 0;
}
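
/*
 * Optional follow-up (not part of the listing above): the cudaMalloc and
 * cudaMemcpy calls there ignore their return codes. A minimal sketch of
 * error checking, assuming only the standard cudaError_t /
 * cudaGetErrorString runtime API; the HANDLE_ERROR macro name is
 * illustrative, not something defined by CUDA itself.
 */
#include <stdio.h>
#include <stdlib.h>

// Wrap any CUDA runtime call: on failure, print the error string with the
// file and line where it happened, then exit.
#define HANDLE_ERROR(call)                                              \
    do {                                                                \
        cudaError_t err = (call);                                       \
        if (err != cudaSuccess) {                                       \
            printf("%s in %s at line %d\n",                             \
                   cudaGetErrorString(err), __FILE__, __LINE__);        \
            exit(EXIT_FAILURE);                                         \
        }                                                               \
    } while (0)

// Usage example, replacing the unchecked allocation in step 3:
//     HANDLE_ERROR(cudaMalloc((void **)&dev_a, N * sizeof(int)));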