#include <iostream>
using namespace std;
// Element-wise vector sum: vec_out[i] = vec1[i] + vec2[i] for i in [0, length).
//
// Launch layout: any 1D grid of 1D blocks. Each thread starts at its global
// index (blockIdx.x * blockDim.x + threadIdx.x) and then advances by the total
// number of launched threads, so every element is processed no matter how the
// grid and block sizes relate to `length`. A single-block launch with
// `length` threads behaves exactly as before.
//
// Parameters:
//   vec1, vec2 - input buffers, each read at indices 0 .. length-1
//   vec_out    - output buffer, written at indices 0 .. length-1
//   length     - number of elements to process; nothing is done when <= 0
//
// No shared memory is used and no synchronization is required.
__global__ void vector_add(float *vec1, float *vec2, float *vec_out, int length) {
    // 64-bit index arithmetic so that neither the global index nor the
    // `i += stride` step can overflow for very large grids or lengths.
    const long long stride = (long long)gridDim.x * blockDim.x;
    for (long long i = (long long)blockIdx.x * blockDim.x + threadIdx.x;
         i < length; i += stride) {
        vec_out[i] = vec1[i] + vec2[i];
    }
}
// Evaluates a CUDA runtime call from inside main(). On failure it prints the
// source line and the runtime's error string to stderr and makes main()
// return 1, instead of letting later calls fail for no visible reason.
#define CUDA_CHECK(call)                                                  \
    do {                                                                  \
        cudaError_t err_ = (call);                                        \
        if (err_ != cudaSuccess) {                                        \
            std::cerr << "CUDA error at line " << __LINE__ << ": "        \
                      << cudaGetErrorString(err_) << std::endl;           \
            return 1;                                                     \
        }                                                                 \
    } while (0)

// Demo driver: fills two 16-element host arrays with 0..15, adds them on the
// GPU with vector_add, and prints the 16 sums separated by spaces.
//
// Returns 0 on success, 1 if any CUDA runtime call or the kernel launch
// reports an error. The command-line arguments are not used.
int main(int argc, char *argv[]) {
    (void)argc;
    (void)argv;

    const int length = 16;
    const size_t bytes = length * sizeof(float);

    // Host-side inputs (a, b) and result (c).
    float a[length], b[length], c[length];
    for (int i = 0; i < length; i++) {
        a[i] = b[i] = i;
    }

    // Device-side buffers mirroring a, b and c.
    float *a_device = 0, *b_device = 0, *c_device = 0;
    CUDA_CHECK(cudaMalloc((void**)&a_device, bytes));
    CUDA_CHECK(cudaMalloc((void**)&b_device, bytes));
    CUDA_CHECK(cudaMalloc((void**)&c_device, bytes));

    CUDA_CHECK(cudaMemcpy(a_device, a, bytes, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(b_device, b, bytes, cudaMemcpyHostToDevice));

    // One block with one thread per element.
    dim3 grid(1, 1, 1), block(length, 1, 1);
    vector_add<<<grid, block>>>(a_device, b_device, c_device, length);
    // A kernel launch returns nothing; configuration errors are only visible
    // through cudaGetLastError().
    CUDA_CHECK(cudaGetLastError());

    // Blocking copy on the default stream: waits for the kernel to finish and
    // reports any error raised while it was executing.
    CUDA_CHECK(cudaMemcpy(c, c_device, bytes, cudaMemcpyDeviceToHost));

    for (int i = 0; i < length; i++) {
        std::cout << c[i] << " ";
    }
    // Terminate the line and flush the output.
    std::cout << std::endl;

    CUDA_CHECK(cudaFree(a_device));
    CUDA_CHECK(cudaFree(b_device));
    CUDA_CHECK(cudaFree(c_device));
    return 0;
}