EBU6502_cloud_computing_notes/assets/code/test1.cu

#include <stdio.h>
#define N 16
// 1. Define the kernel
// Element-wise vector addition on the device: c[i] = a[i] + b[i] for i < N.
//
// Launch layout: 1D grid of 1D blocks, one element per thread. Any
// configuration that supplies at least N threads in total covers the whole
// array; the <<<N, 1>>> launch (one single-thread block per element) is one
// such configuration. No shared memory is used.
//
// a, b, c: dereferenced inside the kernel, so they must be device-accessible
//          pointers to at least N ints each.
__global__ void add(int *a, int *b, int *c) {
  // Flat global index; reduces to blockIdx.x when blockDim.x == 1.
  int tid = blockIdx.x * blockDim.x + threadIdx.x;
  if (tid < N) // surplus threads past the end of the data do nothing
    c[tid] = a[tid] + b[tid];
}

// Reports a failed CUDA runtime call on stderr.
// status: return code of the call; what: short label used in the message.
// Returns true when status is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t status, const char *what) {
  if (status == cudaSuccess)
    return true;
  fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
  return false;
}

// 2. Declare the main method
// Adds two N-element host arrays on the GPU and prints every sum.
// Returns 0 on success, 1 if any CUDA call failed (details go to stderr).
int main(void) {
  const size_t bytes = N * sizeof(int);
  int a[N], b[N], c[N];
  // NULL-initialised so the unconditional cudaFree calls at the end are
  // harmless for buffers that were never allocated.
  int *dev_a = NULL, *dev_b = NULL, *dev_c = NULL;

  // 3. allocate the memory on the GPU (&& stops at the first failure)
  bool ok = cudaOk(cudaMalloc((void **)&dev_a, bytes), "cudaMalloc(dev_a)") &&
            cudaOk(cudaMalloc((void **)&dev_b, bytes), "cudaMalloc(dev_b)") &&
            cudaOk(cudaMalloc((void **)&dev_c, bytes), "cudaMalloc(dev_c)");

  // 4. fill the arrays 'a' and 'b' on the CPU
  for (int i = 0; i < N; i++) {
    a[i] = -i;
    b[i] = i * i;
  }

  // 5. copy the arrays 'a' and 'b' to the GPU
  ok = ok &&
       cudaOk(cudaMemcpy(dev_a, a, bytes, cudaMemcpyHostToDevice),
              "cudaMemcpy(a -> dev_a)") &&
       cudaOk(cudaMemcpy(dev_b, b, bytes, cudaMemcpyHostToDevice),
              "cudaMemcpy(b -> dev_b)");

  if (ok) {
    // 6. launch the kernel on the GPU: N blocks of one thread each
    add<<<N, 1>>>(dev_a, dev_b, dev_c);
    // A launch returns no status itself; configuration errors are only
    // visible through cudaGetLastError().
    ok = cudaOk(cudaGetLastError(), "add kernel launch") &&
         // 7. copy the array 'c' back from the GPU to the CPU; this blocking
         // copy also reports errors raised while the kernel was running.
         cudaOk(cudaMemcpy(c, dev_c, bytes, cudaMemcpyDeviceToHost),
                "cudaMemcpy(dev_c -> c)");
  }

  // 8. display the results on the CPU (only when 'c' really holds them)
  if (ok) {
    for (int i = 0; i < N; i++) {
      printf("%d + %d = %d\n", a[i], b[i], c[i]);
    }
  }

  // 9. free the memory allocated on the GPU; each free runs regardless of
  // earlier failures, and a failing free also marks the run as failed.
  ok = cudaOk(cudaFree(dev_a), "cudaFree(dev_a)") && ok;
  ok = cudaOk(cudaFree(dev_b), "cudaFree(dev_b)") && ok;
  ok = cudaOk(cudaFree(dev_c), "cudaFree(dev_c)") && ok;
  return ok ? 0 : 1;
}
finish cuda programming, took around 1.8 hrs 2024-12-29 12:40:01 +08:00			`#include <stdio.h>`
CUDA advanced Took 1hr 50 mins 2024-12-29 16:45:17 +08:00			`#define N 16`
finish cuda programming, took around 1.8 hrs 2024-12-29 12:40:01 +08:00			`// 1. Define the kernel`
			`__global__ void add(int *a, int *b, int *c) {`
			`int tid = blockIdx.x; // handle the data at this index`
			`if (tid < N)`
			`c[tid] = a[tid] + b[tid];`
			`}`

			`// 2. Declare the main method`
			`int main(void) {`
			`int a[N], b[N], c[N];`
			`int *dev_a, *dev_b, *dev_c;`
			`// 3. allocate the memory on the GPU`
			`cudaMalloc((void **)&dev_a, N * sizeof(int));`
			`cudaMalloc((void **)&dev_b, N * sizeof(int));`
			`cudaMalloc((void **)&dev_c, N * sizeof(int));`
			`// 4. fill the arrays 'a' and 'b' on the CPU`
			`for (int i = 0; i < N; i++) {`
			`a[i] = -i;`
			`b[i] = i * i;`
			`}`
			`// 5. copy the arrays 'a' and 'b' to the GPU`
			`cudaMemcpy(dev_a, a, N * sizeof(int), cudaMemcpyHostToDevice);`
			`cudaMemcpy(dev_b, b, N * sizeof(int), cudaMemcpyHostToDevice);`
			`// 6. launch the kernel on the GPU`
			`add<<<N, 1>>>(dev_a, dev_b, dev_c);`
			`// 7. copy the array 'c' back from the GPU to the CPU`
			`cudaMemcpy(c, dev_c, N * sizeof(int), cudaMemcpyDeviceToHost);`
			`// 8. display the results on the CPU`
			`for (int i = 0; i < N; i++) {`
			`printf("%d + %d = %d\n", a[i], b[i], c[i]);`
			`}`
			`// 9. free the memory allocated on the GPU`
			`cudaFree(dev_a);`
			`cudaFree(dev_b);`
			`cudaFree(dev_c);`
			`return 0;`
			`}`