#include <stdio.h>
#include <stdlib.h>

// Evaluates a CUDA runtime call and, if it did not return cudaSuccess,
// reports the failing location and error string on stderr and terminates
// the process with a failure status.
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t err_ = (call);                                         \
        if (err_ != cudaSuccess) {                                         \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,  \
                    cudaGetErrorString(err_));                             \
            exit(EXIT_FAILURE);                                            \
        }                                                                  \
    } while (0)

// Kernel: computes *d = (*a + *b) / *c on single float values.
//
// All four arguments are dereferenced inside the kernel, so they must be
// pointers the device can access. The kernel does no thread indexing: every
// launched thread prints the debug line and writes the same value to *d, so
// it is meant to be launched as <<<1, 1>>>. No shared memory is used.
__global__ void calculate(float *a, float *b, float *c, float *d) {
    printf("test test\n");  // device-side debug output

    float sum = *a + *b;
    *d = sum / *c;
}

// Host driver: uploads three scalars, runs `calculate` with one thread,
// downloads the result and prints it.
int main(void) {
    // Host inputs (float literals avoid implicit double -> float narrowing).
    float a = 3.0f, b = 7.0f, c = 2.0f;
    float d = 0.0f;

    // Device copies of the scalars above.
    float *a_dev = NULL, *b_dev = NULL, *c_dev = NULL, *d_dev = NULL;
    const size_t float_size = sizeof(float);

    // Allocate one float on the device for each operand and the result.
    CUDA_CHECK(cudaMalloc((void **)&a_dev, float_size));
    CUDA_CHECK(cudaMalloc((void **)&b_dev, float_size));
    CUDA_CHECK(cudaMalloc((void **)&c_dev, float_size));
    CUDA_CHECK(cudaMalloc((void **)&d_dev, float_size));

    // Copy the inputs from host to device.
    CUDA_CHECK(cudaMemcpy(a_dev, &a, float_size, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(b_dev, &b, float_size, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(c_dev, &c, float_size, cudaMemcpyHostToDevice));

    // Launch with a single block of a single thread.
    calculate<<<1, 1>>>(a_dev, b_dev, c_dev, d_dev);
    // A launch returns no status itself; query it explicitly.
    CUDA_CHECK(cudaGetLastError());
    // Wait for the kernel so that execution errors are reported here rather
    // than being attributed to the copy below.
    CUDA_CHECK(cudaDeviceSynchronize());

    // Copy the result back and print it.
    CUDA_CHECK(cudaMemcpy(&d, d_dev, float_size, cudaMemcpyDeviceToHost));
    printf("d is: %f\n", d);

    // Release device memory.
    CUDA_CHECK(cudaFree(a_dev));
    CUDA_CHECK(cudaFree(b_dev));
    CUDA_CHECK(cudaFree(c_dev));
    CUDA_CHECK(cudaFree(d_dev));

    return 0;
}