Convolution

main.cu

#include <stdio.h>
#include <stdlib.h>

#define HANDLE_ERROR(err) (handleError(err, __FILE__, __LINE__))

void handleError(cudaError_t err, const char *file, int line) {
    if (err != cudaSuccess) {
        printf("%s in %s at line %d\n", cudaGetErrorString(err), file, line);
        exit(EXIT_FAILURE);
    }
}

// Query the device and return the maximum number of threads per block,
// which is used below to size the launch configuration.
int getThreadNum() {
    cudaDeviceProp prop;
    int count;

    HANDLE_ERROR(cudaGetDeviceCount(&count));
    printf("gpu num: %d\n", count);
    HANDLE_ERROR(cudaGetDeviceProperties(&prop, 0));
    printf("max thread num: %d\n", prop.maxThreadsPerBlock);
    printf("max grid dimensions: %d %d %d\n", prop.maxGridSize[0],
           prop.maxGridSize[1], prop.maxGridSize[2]);
    return prop.maxThreadsPerBlock;
}

// Each thread computes one output pixel of a zero-padded 2D convolution.
__global__ void conv(const float *img, const float *kernel, float *result,
                     int width, int height, int kernelSize) {
    int id = blockIdx.x * blockDim.x + threadIdx.x;
    if (id >= width * height) {
        return;
    }

    int row = id / width;
    int col = id % width;

    // Accumulate in a register. cudaMalloc does not zero device memory,
    // so accumulating directly into result[] with += would read garbage.
    float sum = 0.0f;
    for (int i = 0; i < kernelSize; i++) {
        for (int j = 0; j < kernelSize; j++) {
            int curRow = row - kernelSize / 2 + i;
            int curCol = col - kernelSize / 2 + j;
            // Pixels outside the image are treated as zero (zero padding).
            float imgValue = 0.0f;
            if (curRow >= 0 && curCol >= 0 && curRow < height &&
                curCol < width) {
                imgValue = img[curRow * width + curCol];
            }
            sum += kernel[i * kernelSize + j] * imgValue;
        }
    }
    result[id] = sum;
}

int main(int argc, char *argv[]) {
    int width = 800;
    int height = 600;
    float *img = new float[width * height];
    for (int row = 0; row < height; row++) {
        for (int col = 0; col < width; col++) {
            img[col + row * width] = (col + row) % 256;
        }
    }

    // A 3x3 kernel whose columns are -1, 0, 1: a Prewitt-style
    // horizontal gradient filter.
    int kernelSize = 3;
    float *kernel = new float[kernelSize * kernelSize];
    for (int i = 0; i < kernelSize * kernelSize; i++) {
        kernel[i] = i % kernelSize - 1;
    }

    float *imgGpu;
    float *kernelGpu;
    float *resultGpu;

    HANDLE_ERROR(cudaMalloc((void **)&imgGpu, width * height * sizeof(float)));
    HANDLE_ERROR(cudaMalloc((void **)&kernelGpu,
                            kernelSize * kernelSize * sizeof(float)));
    HANDLE_ERROR(
        cudaMalloc((void **)&resultGpu, width * height * sizeof(float)));

    HANDLE_ERROR(cudaMemcpy(imgGpu, img, width * height * sizeof(float),
                            cudaMemcpyHostToDevice));
    HANDLE_ERROR(cudaMemcpy(kernelGpu, kernel,
                            kernelSize * kernelSize * sizeof(float),
                            cudaMemcpyHostToDevice));

    // One thread per pixel; round the block count up with integer
    // ceiling division so every pixel is covered.
    int threadNum = getThreadNum();
    int blockNum = (width * height + threadNum - 1) / threadNum;
    conv<<<blockNum, threadNum>>>(imgGpu, kernelGpu, resultGpu, width, height,
                                  kernelSize);
    HANDLE_ERROR(cudaGetLastError());

    float *result = new float[width * height];
    HANDLE_ERROR(cudaMemcpy(result, resultGpu, width * height * sizeof(float),
                            cudaMemcpyDeviceToHost));

    // Visualization: print the top-left 10x10 corner of image and result.
    printf("img:\n");
    for (int row = 0; row < 10; row++) {
        for (int col = 0; col < 10; col++) {
            printf("%2.0f ", img[col + row * width]);
        }
        printf("\n");
    }

    printf("kernel:\n");
    for (int row = 0; row < kernelSize; row++) {
        for (int col = 0; col < kernelSize; col++) {
            printf("%2.0f ", kernel[col + row * kernelSize]);
        }
        printf("\n");
    }

    printf("result:\n");
    for (int row = 0; row < 10; row++) {
        for (int col = 0; col < 10; col++) {
            printf("%2.0f ", result[col + row * width]);
        }
        printf("\n");
    }

    HANDLE_ERROR(cudaFree(imgGpu));
    HANDLE_ERROR(cudaFree(kernelGpu));
    HANDLE_ERROR(cudaFree(resultGpu));
    delete[] img;
    delete[] kernel;
    delete[] result;

    return 0;
}
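
Zero-padding and accumulation bugs are easy to introduce on the GPU side, so a plain CPU reference of the same convolution is handy for cross-checking. The sketch below is not part of the original program; convCpu is a hypothetical helper added only for verification, following the same zero-padding convention as the kernel above.

// CPU reference for the same zero-padded convolution (hypothetical helper,
// added here only to cross-check the GPU output).
void convCpu(const float *img, const float *kernel, float *result,
             int width, int height, int kernelSize) {
    for (int row = 0; row < height; row++) {
        for (int col = 0; col < width; col++) {
            float sum = 0.0f;
            for (int i = 0; i < kernelSize; i++) {
                for (int j = 0; j < kernelSize; j++) {
                    int curRow = row - kernelSize / 2 + i;
                    int curCol = col - kernelSize / 2 + j;
                    if (curRow >= 0 && curCol >= 0 &&
                        curRow < height && curCol < width) {
                        sum += kernel[i * kernelSize + j] *
                               img[curRow * width + curCol];
                    }
                }
            }
            result[col + row * width] = sum;
        }
    }
}

Running convCpu over img and comparing element-wise with the copied-back GPU result should show exact matches, since both versions apply the same float operations in the same order for each pixel.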

Output:

gpu num: 1
max thread num: 1024
max grid dimensions: 2147483647 65535 65535
img:
0 1 2 3 4 5 6 7 8 9
1 2 3 4 5 6 7 8 9 10
2 3 4 5 6 7 8 9 10 11
3 4 5 6 7 8 9 10 11 12
4 5 6 7 8 9 10 11 12 13
5 6 7 8 9 10 11 12 13 14
6 7 8 9 10 11 12 13 14 15
7 8 9 10 11 12 13 14 15 16
8 9 10 11 12 13 14 15 16 17
9 10 11 12 13 14 15 16 17 18
kernel:
-1 0 1
-1 0 1
-1 0 1
result:
3 4 4 4 4 4 4 4 4 4
6 6 6 6 6 6 6 6 6 6
9 6 6 6 6 6 6 6 6 6
12 6 6 6 6 6 6 6 6 6
15 6 6 6 6 6 6 6 6 6
18 6 6 6 6 6 6 6 6 6
21 6 6 6 6 6 6 6 6 6
24 6 6 6 6 6 6 6 6 6
27 6 6 6 6 6 6 6 6 6
30 6 6 6 6 6 6 6 6 6
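
These numbers are easy to verify by hand. In this corner of the image img[row][col] = row + col (the values are well below 256), and the kernel's middle column is zero, so each of the three covered rows contributes (right neighbor) − (left neighbor) = 2, giving 3 × 2 = 6 for interior pixels. Along the top row only two image rows are in bounds, giving 2 × 2 = 4. In the first column the left neighbors fall outside the image and count as zero, so the result is img[row−1][1] + img[row][1] + img[row+1][1] = 3·row + 3 (and 3 at the top-left corner, where only two rows contribute), which matches the 3, 6, 9, …, 30 column above.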