数组求和（共享内存并行归约）

main.cu

#include <stdio.h>

// Sums the first 16 floats of `a` and stores the total in b[0].
//
// Parameters:
//   a - device pointer to at least 16 readable floats.
//   b - device pointer to at least 1 writable float; only b[0] is written.
//
// Launch layout: a one-dimensional block (only threadIdx.x / blockDim.x are
// used). The intended launch is <<<1, 16>>>, but any blockDim.x >= 1 is
// handled: loads and pair-wise additions are distributed over the threads
// with a block-stride loop, so no thread indexes outside the 16-slot buffer
// and no slot is left unloaded. Every block of the grid computes the same
// total and writes it to b[0].
//
// Shared memory: 16 floats, statically allocated.
__global__ void sum(float *a, float *b) {
    const int kCount = 16;              // number of elements reduced
    const int t_id = threadIdx.x;
    const int nThreads = blockDim.x;

    __shared__ float sData[kCount];

    // Stage the input in shared memory. With 16 threads each thread loads
    // exactly one element; with fewer threads each loads several; threads
    // beyond 16 load nothing.
    for (int i = t_id; i < kCount; i += nThreads) {
        sData[i] = a[i];
    }
    __syncthreads();

    // Tree reduction: each pass folds the upper half of the live range onto
    // the lower half. The pairing (i, i + stride) is fixed by `stride`, so
    // the order of additions does not depend on the number of threads.
    for (int stride = kCount / 2; stride > 0; stride /= 2) {
        for (int i = t_id; i < stride; i += nThreads) {
            sData[i] = sData[i] + sData[i + stride];
        }
        // Reached by every thread of the block on every pass: the barrier
        // sits outside the per-thread work loop.
        __syncthreads();
    }

    if (t_id == 0) {
        b[0] = sData[0];
    }
}

// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t status, const char *what) {
    if (status == cudaSuccess) {
        return true;
    }
    fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
    return false;
}

// Fills a 16-element host array with i * (i + 1), copies it to the device,
// reduces it with the `sum` kernel (one block of 16 threads) and prints the
// total as "b: <value>".
//
// Returns 0 on success, 1 if any CUDA runtime call or the kernel launch
// fails. Device buffers are released on every path.
int main(int argc, char *argv[])
{
    (void)argc;
    (void)argv;

    const int kCount = 16;
    float a[kCount];
    for (int i = 0; i < kCount; i++) {
        a[i] = (float)(i * (i + 1));
    }

    float *aGpu = NULL;
    float *bGpu = NULL;
    float b[1] = {0.0f};

    // Short-circuit evaluation stops at the first failing call.
    bool ok =
        cudaOk(cudaMalloc((void **)&aGpu, kCount * sizeof(float)), "cudaMalloc(aGpu)") &&
        cudaOk(cudaMemcpy(aGpu, a, kCount * sizeof(float), cudaMemcpyHostToDevice),
               "cudaMemcpy(aGpu <- a)") &&
        cudaOk(cudaMalloc((void **)&bGpu, 1 * sizeof(float)), "cudaMalloc(bGpu)");

    if (ok) {
        sum<<<1, kCount>>>(aGpu, bGpu);
        // A launch does not return a status; configuration errors are read
        // back with cudaGetLastError(). The blocking cudaMemcpy that follows
        // waits for the kernel and reports any error raised while it ran.
        ok = cudaOk(cudaGetLastError(), "sum kernel launch") &&
             cudaOk(cudaMemcpy(b, bGpu, 1 * sizeof(float), cudaMemcpyDeviceToHost),
                    "cudaMemcpy(b <- bGpu)");
    }

    if (ok) {
        printf("b: %f\n", b[0]);
    }

    // cudaFree performs no operation on a null pointer, so this is safe even
    // when an allocation above failed.
    cudaFree(aGpu);
    cudaFree(bGpu);

    return ok ? 0 : 1;
}

结果:

b: 1360.000000