// Element-wise addition of two small integer arrays on the GPU.
// Each CUDA block handles the array element selected by its block index.
#include <stdio.h>
|
||
|
#define N 6
|
||
|
|
||
|
/*
 * add: stores a[i] + b[i] into c[i], where i is the index of the calling block.
 *
 * Only blockIdx.x is used to pick the element; threadIdx is not consulted.
 * Blocks whose index is N or larger print their id and then exit without
 * reading or writing any of the arrays.
 *
 * a, b : input arrays, read at index blockIdx.x
 * c    : output array, written at index blockIdx.x
 *
 * The printf calls trace the block id and the value stored in c.
 */
__global__ void add(int *a, int *b, int *c) {
  const int block = blockIdx.x;
  printf("bid: %d\n", block);

  // Nothing to do for blocks past the end of the arrays.
  // (The students can also use a for loop here instead of this guard.)
  if (block >= N) {
    return;
  }

  c[block] = a[block] + b[block];
  printf("c: %d\n", c[block]);
}
|
||
|
|
||
|
// Reports a failed CUDA runtime call on stderr.
// `what` names the operation for the diagnostic message.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool checkCuda(cudaError_t status, const char *what) {
  if (status == cudaSuccess) {
    return true;
  }
  fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
  return false;
}

/*
 * Host driver for the add kernel.
 *
 * Fills a[i] = i and b[i] = i * i on the host, copies both arrays to the
 * device, launches add with 12 blocks of 1 thread, copies the result back
 * and prints one element of c per line.
 *
 * Every CUDA runtime call and the kernel launch are checked. On the first
 * failure a message is written to stderr, the remaining steps are skipped,
 * the device buffers are released and the program exits with status 1.
 * Returns 0 on success.
 */
int main(void) {
  int a[N], b[N], c[N];
  // Start from NULL so the cleanup below is valid even if an allocation fails.
  int *dev_a = NULL, *dev_b = NULL, *dev_c = NULL;
  const size_t bytes = N * sizeof(int);

  // Fill arrays "a" and "b" with values on the host
  for (int i = 0; i < N; i++) {
    a[i] = i;
    b[i] = i * i;
  }

  // Allocate memory on the device
  bool ok = checkCuda(cudaMalloc((void **)&dev_a, bytes), "cudaMalloc(dev_a)") &&
            checkCuda(cudaMalloc((void **)&dev_b, bytes), "cudaMalloc(dev_b)") &&
            checkCuda(cudaMalloc((void **)&dev_c, bytes), "cudaMalloc(dev_c)");

  // Copy arrays "a" and "b" to the device
  if (ok) {
    ok = checkCuda(cudaMemcpy(dev_a, a, bytes, cudaMemcpyHostToDevice),
                   "cudaMemcpy(a -> dev_a)") &&
         checkCuda(cudaMemcpy(dev_b, b, bytes, cudaMemcpyHostToDevice),
                   "cudaMemcpy(b -> dev_b)");
  }

  // Launch the kernel
  if (ok) {
    // More blocks than elements are launched; the kernel's own bounds check
    // makes the surplus blocks exit without touching memory.
    add<<<12, 1>>>(dev_a, dev_b, dev_c);
    // A launch does not return a status: configuration errors are reported
    // by cudaGetLastError(), execution errors by the next synchronization.
    ok = checkCuda(cudaGetLastError(), "add kernel launch") &&
         checkCuda(cudaDeviceSynchronize(), "add kernel execution");
  }

  // Copy the array "c" from the device to the host
  if (ok) {
    ok = checkCuda(cudaMemcpy(c, dev_c, bytes, cudaMemcpyDeviceToHost),
                   "cudaMemcpy(dev_c -> c)");
  }

  // Print the array "c" (only when it actually holds the kernel's results)
  if (ok) {
    for (int i = 0; i < N; i++) {
      printf("%d\n", c[i]);
    }
  }

  // Free memory allocated to the device; cudaFree accepts NULL pointers.
  cudaFree(dev_a);
  cudaFree(dev_b);
  cudaFree(dev_c);

  return ok ? 0 : 1;
} // End main
|