#include <stdio.h>
#define N 6

/*
 * Element-wise integer addition on the device: c[i] = a[i] + b[i].
 *
 * Launch layout: the element index is taken from blockIdx.x alone, so each
 * block handles one element. Blocks whose index is >= N do nothing beyond
 * the first debug print, which makes it safe to launch more blocks than N.
 *
 * a, b: input arrays, read at indices below N.
 * c:    output array, written at indices below N.
 *
 * The printf calls are debugging output.
 */
__global__ void add(int *a, int *b, int *c) {
  const int element = blockIdx.x;
  printf("bid: %d\n", element);

  // Guard clause: surplus blocks have no element to process.
  // (Students can also use a for loop here instead of one block per element.)
  if (element >= N) {
    return;
  }

  const int sum = a[element] + b[element];
  c[element] = sum;
  printf("c: %d\n", sum);
}

// Returns true when `status` is cudaSuccess; otherwise reports the failed
// step (`what`) and the runtime's error string on stderr and returns false.
static bool cudaOk(cudaError_t status, const char *what) {
  if (status == cudaSuccess) {
    return true;
  }
  fprintf(stderr, "CUDA error during %s: %s\n", what,
          cudaGetErrorString(status));
  return false;
}

/*
 * Host driver: fills a[i] = i and b[i] = i * i, adds them on the device with
 * the `add` kernel, and prints the N results.
 *
 * Returns 0 on success, 1 if any CUDA runtime call or the kernel launch
 * reports an error (the results are not printed in that case).
 */
int main(void) {
  int a[N], b[N], c[N];
  // Null-initialised so the cudaFree calls at the end are safe even when an
  // earlier step failed before the pointer was allocated.
  int *dev_a = NULL, *dev_b = NULL, *dev_c = NULL;
  const size_t bytes = N * sizeof(int);

  // Fill arrays "a" and "b" with values on the host
  for (int i = 0; i < N; i++) {
    a[i] = i;
    b[i] = i * i;
  }

  // Allocate device memory and copy the inputs over. The && chain stops at
  // the first failing call, so later steps never run on a bad state.
  bool ok =
      cudaOk(cudaMalloc((void **)&dev_a, bytes), "cudaMalloc(dev_a)") &&
      cudaOk(cudaMalloc((void **)&dev_b, bytes), "cudaMalloc(dev_b)") &&
      cudaOk(cudaMalloc((void **)&dev_c, bytes), "cudaMalloc(dev_c)") &&
      cudaOk(cudaMemcpy(dev_a, a, bytes, cudaMemcpyHostToDevice),
             "copy of a to device") &&
      cudaOk(cudaMemcpy(dev_b, b, bytes, cudaMemcpyHostToDevice),
             "copy of b to device");

  if (ok) {
    // Launch the kernel with 12 blocks of 1 thread. That is more blocks than
    // N; the kernel's own index guard keeps the extra blocks from touching
    // the arrays.
    add<<<12, 1>>>(dev_a, dev_b, dev_c);

    // A kernel launch returns no status itself: launch errors are picked up
    // with cudaGetLastError(), and errors raised while the kernel runs are
    // reported by the blocking copy back to the host.
    ok = cudaOk(cudaGetLastError(), "add kernel launch") &&
         cudaOk(cudaMemcpy(c, dev_c, bytes, cudaMemcpyDeviceToHost),
                "copy of c to host");
  }

  if (ok) {
    // Print the array "c"
    for (int i = 0; i < N; i++) {
      printf("%d\n", c[i]);
    }
  }

  // Free memory allocated to the device (a null pointer is a no-op)
  cudaFree(dev_a);
  cudaFree(dev_b);
  cudaFree(dev_c);
  return ok ? 0 : 1;
} // End main