EBU6502_cloud_computing_notes/assets/code/test3.cu

#include <stdio.h>
#define N 6

// Element-wise vector addition: c[i] = a[i] + b[i], one element per block.
//
// The element index is taken from blockIdx.x alone (threadIdx is not used),
// so each block handles the element whose index equals its block number.
// Blocks whose index is N or greater only print their id and then exit,
// which makes it safe to launch more blocks than there are elements.
//
// a, b : input arrays, read at indices 0..N-1
// c    : output array, written at indices 0..N-1
__global__ void add(int *a, int *b, int *c) {
  const int block = blockIdx.x;
  printf("bid: %d\n", block);

  // Surplus blocks have no element to process
  if (block >= N) {
    return;
  }

  const int sum = a[block] + b[block];
  c[block] = sum;
  printf("c: %d\n", sum);
}

// Reports a failed CUDA runtime call on stderr.
// Returns true when `err` is cudaSuccess and false otherwise, so the caller
// can stop instead of carrying on with invalid device state.
// `what` is a short label for the step that was attempted.
static bool cudaOk(cudaError_t err, const char *what) {
  if (err == cudaSuccess) {
    return true;
  }
  fprintf(stderr, "CUDA error (%s): %s\n", what, cudaGetErrorString(err));
  return false;
}

// Host driver: fills a[i] = i and b[i] = i * i, adds them on the device with
// the `add` kernel, copies the result back and prints it one value per line.
// Returns 0 on success and 1 if any CUDA call or the kernel launch failed.
int main(void) {
  int a[N], b[N], c[N];
  // Start as NULL so the shared cleanup is safe when an allocation fails
  int *dev_a = NULL, *dev_b = NULL, *dev_c = NULL;
  const size_t bytes = N * sizeof(int);
  int status = 1; // stays non-zero unless every step below succeeds

  // Fill arrays "a" and "b" with values on the host
  for (int i = 0; i < N; i++) {
    a[i] = i;
    b[i] = i * i;
  }

  // Single-pass block: `break` skips to the cleanup on the first failure
  do {
    // allocate memory to device
    if (!cudaOk(cudaMalloc((void **)&dev_a, bytes), "cudaMalloc dev_a")) break;
    if (!cudaOk(cudaMalloc((void **)&dev_b, bytes), "cudaMalloc dev_b")) break;
    if (!cudaOk(cudaMalloc((void **)&dev_c, bytes), "cudaMalloc dev_c")) break;

    // Copy arrays "a" and "b" to the device
    if (!cudaOk(cudaMemcpy(dev_a, a, bytes, cudaMemcpyHostToDevice),
                "cudaMemcpy a -> dev_a")) break;
    if (!cudaOk(cudaMemcpy(dev_b, b, bytes, cudaMemcpyHostToDevice),
                "cudaMemcpy b -> dev_b")) break;

    // Launch the kernel: 12 blocks of 1 thread each. That is more blocks than
    // the N elements; the kernel's own bounds check idles the surplus blocks.
    add<<<12, 1>>>(dev_a, dev_b, dev_c);
    // A launch returns no status itself; configuration errors show up here
    if (!cudaOk(cudaGetLastError(), "add kernel launch")) break;

    // Copy the array "c" from the device to the host. This blocking copy
    // also waits for the kernel, so execution errors are reported here.
    if (!cudaOk(cudaMemcpy(c, dev_c, bytes, cudaMemcpyDeviceToHost),
                "cudaMemcpy dev_c -> c")) break;

    // Print the array "c"
    for (int i = 0; i < N; i++) {
      printf("%d\n", c[i]);
    }
    status = 0;
  } while (0);

  // Free memory allocated to the device (cudaFree on NULL does nothing)
  cudaFree(dev_a);
  cudaFree(dev_b);
  cudaFree(dev_c);
  return status;
} // End main
CUDA advanced Took 1hr 50 mins 2024-12-29 16:45:17 +08:00			`#include <stdio.h>`
			`#define N 6`

			`__global__ void add(int *a, int *b, int *c) {`
			`int bid = blockIdx.x;`
			`printf("bid: %d\n", bid);`
			`if (bid < N) { // The students can also use a "for loop here"`
			`c[bid] = a[bid] + b[bid];`
			`printf("c: %d\n", c[bid]);`
			`}`
			`}`

			`int main(void) {`
			`int a[N], b[N], c[N];`
			`int *dev_a, *dev_b, *dev_c;`
			`// allocate memory to device`
			`cudaMalloc((void **)&dev_a, N * sizeof(int));`
			`cudaMalloc((void **)&dev_b, N * sizeof(int));`
			`cudaMalloc((void **)&dev_c, N * sizeof(int));`
			`// Fill arrays "a" and "b" with values on the host`
			`for (int i = 0; i < N; i++) {`
			`a[i] = i;`
			`b[i] = i * i;`
			`}`
			`// Copy arrays "a" and "b" to the device`
			`cudaMemcpy(dev_a, a, N * sizeof(int), cudaMemcpyHostToDevice);`
			`cudaMemcpy(dev_b, b, N * sizeof(int), cudaMemcpyHostToDevice);`
			`// Launch the kernel`
			`add<<<12, 1>>>(dev_a, dev_b, dev_c);`
			`// Copy the array "c" from the device to the host`
			`cudaMemcpy(c, dev_c, N * sizeof(int), cudaMemcpyDeviceToHost);`

			`// Print the array "c"`
			`for (int i = 0; i < N; i++) {`
			`printf("%d\n", c[i]);`
			`}`
			`// Free memory allocated to the device`
			`cudaFree(dev_a);`
			`cudaFree(dev_b);`
			`cudaFree(dev_c);`
			`return 0;`
			`} // End main`