#include <stdio.h>
#include <stdlib.h>

// Number of elements in each of the vectors a, b and c.
#define N 6

// Evaluate a CUDA runtime call and terminate the program with file/line
// context if it does not return cudaSuccess. Runtime errors are otherwise
// silent and make every later call fail in confusing ways.
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t err_ = (call);                                         \
        if (err_ != cudaSuccess) {                                         \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,  \
                    cudaGetErrorString(err_));                             \
            exit(EXIT_FAILURE);                                            \
        }                                                                  \
    } while (0)

// Element-wise vector addition: c[i] = a[i] + b[i] for i in [0, N).
//
// Each thread handles at most one element, selected by its flat global index
// (blockIdx.x * blockDim.x + threadIdx.x). Threads whose index is >= N only
// print their block id and do no addition, so the grid may be larger than N.
// For every element to be processed, the launch must provide at least N
// threads in total.
//
// a, b: device pointers to N input ints.
// c:    device pointer to N output ints.
//
// The device-side printf calls are debugging output; every thread prints its
// block id, and threads that compute an element also print the result.
__global__ void add(int *a, int *b, int *c)
{
    int bid = blockIdx.x;
    printf("bid: %d\n", bid);

    // Flat global index; equal to blockIdx.x when launched with one thread
    // per block.
    int idx = blockIdx.x * blockDim.x + threadIdx.x;

    // Bounds check: the grid can contain more threads than there are
    // elements. (A loop over several elements per thread could be used here
    // instead of one element per thread.)
    if (idx < N) {
        c[idx] = a[idx] + b[idx];
        printf("c: %d\n", c[idx]);
    }
}

// Host driver: fills a[i] = i and b[i] = i * i, adds them on the device,
// copies the result back and prints one value of c per line.
// Returns 0 on success; exits with EXIT_FAILURE if any CUDA call fails.
int main(void)
{
    const size_t bytes = N * sizeof(int);
    // 12 blocks of 1 thread: more blocks than the N = 6 elements, so the
    // kernel's bounds check is what keeps the extra blocks from touching
    // memory past the end of the arrays.
    const int numBlocks = 12;
    const int threadsPerBlock = 1;

    int a[N], b[N], c[N];
    int *dev_a = NULL, *dev_b = NULL, *dev_c = NULL;

    // Allocate the three vectors on the device.
    CUDA_CHECK(cudaMalloc((void **)&dev_a, bytes));
    CUDA_CHECK(cudaMalloc((void **)&dev_b, bytes));
    CUDA_CHECK(cudaMalloc((void **)&dev_c, bytes));

    // Fill arrays "a" and "b" with values on the host.
    for (int i = 0; i < N; i++) {
        a[i] = i;
        b[i] = i * i;
    }

    // Copy arrays "a" and "b" to the device.
    CUDA_CHECK(cudaMemcpy(dev_a, a, bytes, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(dev_b, b, bytes, cudaMemcpyHostToDevice));

    // Launch the kernel. A launch does not return an error code itself:
    // cudaGetLastError() reports bad launch configurations, and the
    // synchronize reports faults raised while the kernel was running.
    add<<<numBlocks, threadsPerBlock>>>(dev_a, dev_b, dev_c);
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaDeviceSynchronize());

    // Copy the array "c" from the device to the host.
    CUDA_CHECK(cudaMemcpy(c, dev_c, bytes, cudaMemcpyDeviceToHost));

    // Print the array "c".
    for (int i = 0; i < N; i++) {
        printf("%d\n", c[i]);
    }

    // Free memory allocated on the device.
    CUDA_CHECK(cudaFree(dev_a));
    CUDA_CHECK(cudaFree(dev_b));
    CUDA_CHECK(cudaFree(dev_c));

    return 0;
} // End main