// Sums a 16-element float array on the GPU with a shared-memory tree reduction and prints the result.
#include <stdio.h>
#include <stdlib.h>
/**
 * Sums the first 16 floats of `a` and stores the total in `b[0]`.
 *
 * The values are staged in a 16-entry shared-memory buffer and combined by a
 * tree reduction (strides 8, 4, 2, 1) with a block barrier after every step.
 *
 * Launch layout: intended for a single 1-D block; `sum<<<1, 16>>>` maps one
 * thread to each element. Other 1-D block sizes are handled too: every loop
 * below strides by blockDim.x, so no thread indexes past the 16-entry buffer
 * and no element is skipped when fewer than 16 threads are launched.
 * Only threadIdx.x is used for indexing, so if several blocks are launched
 * each one recomputes the same sum and writes it to b[0].
 *
 * Shared memory: 16 floats, statically allocated.
 *
 * @param a  device pointer to at least 16 readable floats
 * @param b  device pointer to at least 1 writable float
 */
__global__ void sum(float *a, float *b) {
    const int kCount = 16;  // number of elements reduced by this kernel
    __shared__ float sData[kCount];

    const int t_id = threadIdx.x;
    const int step = blockDim.x;

    // Stage the input in shared memory. Striding by the block width keeps
    // every index below kCount regardless of how many threads were launched.
    for (int i = t_id; i < kCount; i += step) {
        sData[i] = a[i];
    }
    __syncthreads();

    // Tree reduction: each pass folds the upper half of the live range onto
    // the lower half. Writes go to [0, half) and reads come from
    // [half, 2 * half), so threads never touch each other's slots in a pass.
    for (int half = kCount / 2; half > 0; half /= 2) {
        for (int i = t_id; i < half; i += step) {
            sData[i] = sData[i] + sData[i + half];
        }
        // Outside the per-thread loop so every thread in the block reaches it.
        __syncthreads();
    }

    // Thread 0 publishes the final total.
    if (t_id == 0) {
        b[0] = sData[0];
    }
}
/**
 * Reports a failed CUDA runtime call on stderr.
 *
 * @param status  return code of the call being checked
 * @param what    short label for the call, used in the error message
 * @return true when `status` is cudaSuccess, false otherwise
 */
static bool cudaSucceeded(cudaError_t status, const char *what) {
    if (status == cudaSuccess) {
        return true;
    }
    fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
    return false;
}

/**
 * Fills a 16-element host array with a[i] = i * (i + 1), copies it to the
 * device, runs the `sum` kernel with one block of 16 threads, copies the
 * single-float result back and prints it as "b: <value>".
 *
 * Every CUDA runtime call is checked; on failure a message is written to
 * stderr, the device buffers are released and the process exits with
 * EXIT_FAILURE.
 *
 * @param argc  unused
 * @param argv  unused
 * @return EXIT_SUCCESS when the result was computed and printed, otherwise
 *         EXIT_FAILURE
 */
int main(int argc, char *argv[]) {
    (void)argc;
    (void)argv;

    const int kCount = 16;

    float a[kCount];
    for (int i = 0; i < kCount; i++) {
        a[i] = (float)(i * (i + 1));
    }
    float b[1] = {0.0f};

    // Start as NULL so the unconditional cudaFree calls below are harmless
    // when an allocation was never reached or failed.
    float *aGpu = NULL;
    float *bGpu = NULL;

    // && short-circuits, so nothing runs after the first failing call.
    bool ok =
        cudaSucceeded(cudaMalloc((void **)&aGpu, kCount * sizeof(float)),
                      "cudaMalloc(aGpu)") &&
        cudaSucceeded(cudaMalloc((void **)&bGpu, 1 * sizeof(float)),
                      "cudaMalloc(bGpu)") &&
        cudaSucceeded(cudaMemcpy(aGpu, a, kCount * sizeof(float),
                                 cudaMemcpyHostToDevice),
                      "cudaMemcpy(host -> device)");

    if (ok) {
        sum<<<1, kCount>>>(aGpu, bGpu);
        // A launch returns no status itself: configuration errors surface via
        // cudaGetLastError(), and faults raised while the kernel runs surface
        // at the blocking device-to-host copy that follows.
        ok = cudaSucceeded(cudaGetLastError(), "sum kernel launch") &&
             cudaSucceeded(cudaMemcpy(b, bGpu, 1 * sizeof(float),
                                      cudaMemcpyDeviceToHost),
                           "cudaMemcpy(device -> host)");
    }

    if (ok) {
        printf("b: %f\n", b[0]);
    }

    // Release device memory on both the success and the failure path.
    cudaFree(bGpu);
    cudaFree(aGpu);

    return ok ? EXIT_SUCCESS : EXIT_FAILURE;
}
|