GPU运算简单步骤

main.cu

#include <iostream>

using namespace std;

/**
 * Element-wise vector addition: vec_out[i] = vec1[i] + vec2[i] for 0 <= i < length.
 *
 * Launch layout: 1D grid of 1D blocks. Any grid/block size is valid: each
 * thread starts at its flat global index and advances by the total number of
 * launched threads, so every element is processed exactly once even when the
 * launch has fewer threads than elements or spans several blocks.
 * No shared memory is used.
 *
 * @param vec1     first input array (device pointer, at least `length` floats)
 * @param vec2     second input array (device pointer, at least `length` floats)
 * @param vec_out  output array (device pointer, at least `length` floats)
 * @param length   number of elements to process; nothing is written when <= 0
 */
__global__ void vector_add(float *vec1, float *vec2, float *vec_out, int length) {
    // 64-bit arithmetic keeps blockIdx.x * blockDim.x and the stride from
    // overflowing on very large launches.
    const long long stride = static_cast<long long>(gridDim.x) * blockDim.x;
    const long long first  = static_cast<long long>(blockIdx.x) * blockDim.x + threadIdx.x;

    // The loop condition doubles as the bounds check for surplus threads.
    for (long long i = first; i < length; i += stride) {
        vec_out[i] = vec1[i] + vec2[i];
    }
}

// Returns true when `status` is cudaSuccess; otherwise prints the failing
// step and the CUDA error string to stderr and returns false.
static bool cuda_ok(cudaError_t status, const char *step) {
    if (status == cudaSuccess) {
        return true;
    }
    std::cerr << "CUDA error during " << step << ": "
              << cudaGetErrorString(status) << std::endl;
    return false;
}

/**
 * Demo driver for the basic GPU workflow: allocate device buffers, copy the
 * inputs host -> device, launch vector_add, copy the result device -> host,
 * print it, and release the device buffers.
 *
 * Returns 0 on success and 1 if any CUDA call or the kernel launch fails.
 * The command-line arguments are not used.
 */
int main(int argc, char *argv[])
{
    (void)argc;
    (void)argv;

    const int length = 16;                        // number of elements in each array
    const size_t bytes = length * sizeof(float);  // size of one array in bytes

    float a[length], b[length], c[length];        // host arrays
    for (int i = 0; i < length; i++) {            // initial values: a[i] = b[i] = i
        a[i] = b[i] = static_cast<float>(i);
    }

    // Device arrays. Initialised to null so the cleanup below is safe even if
    // an allocation fails part-way through (cudaFree on a null pointer is a no-op).
    float *a_device = 0, *b_device = 0, *c_device = 0;

    // Allocate device memory and copy the inputs host -> device. The && chain
    // short-circuits, so nothing further runs after the first failure.
    bool ok = cuda_ok(cudaMalloc((void**)&a_device, bytes), "cudaMalloc(a_device)")
           && cuda_ok(cudaMalloc((void**)&b_device, bytes), "cudaMalloc(b_device)")
           && cuda_ok(cudaMalloc((void**)&c_device, bytes), "cudaMalloc(c_device)")
           && cuda_ok(cudaMemcpy(a_device, a, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(a -> device)")
           && cuda_ok(cudaMemcpy(b_device, b, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(b -> device)");

    if (ok) {
        // Step 1: launch configuration. A single block with one thread per
        // element; this requires `length` to stay within the device's
        // per-block thread limit.
        dim3 grid(1, 1, 1), block(length, 1, 1);
        vector_add<<<grid, block>>>(a_device, b_device, c_device, length);  // launch the kernel

        // A kernel launch returns no status itself: configuration errors are
        // reported by cudaGetLastError(), and the blocking copy below reports
        // errors raised while the kernel was running.
        ok = cuda_ok(cudaGetLastError(), "vector_add launch")
          && cuda_ok(cudaMemcpy(c, c_device, bytes, cudaMemcpyDeviceToHost), "cudaMemcpy(c -> host)");
    }

    if (ok) {
        for (int i = 0; i < length; i++) {        // print the result for inspection
            std::cout << c[i] << " ";
        }
        std::cout << std::endl;                   // terminate the output line
    }

    // Release device memory on every path, success or failure.
    cudaFree(a_device);
    cudaFree(b_device);
    cudaFree(c_device);

    return ok ? 0 : 1;
}

结果:

0 2 4 6 8 10 12 14 16 18 20 22 24 26 28 30