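// 2D image convolution on the GPU: one CUDA thread per output pixel,
// with zero padding at the image border.
// Assumed build/run with the standard CUDA toolchain: nvcc conv.cu -o conv && ./conv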
#include <stdio.h>
#include <cuda_runtime.h>

// Report CUDA errors together with the file and line where they occurred.
#define HANDLE_ERROR(err) (handleError(err, __FILE__, __LINE__))

void handleError(cudaError_t err, const char *file, int line) {
    if (err != cudaSuccess) {
        printf("%s in %s at line %d\n", cudaGetErrorString(err), file, line);
    }
}
// Query device 0 and return its maximum number of threads per block.
int getThreadNum() {
    cudaDeviceProp prop;
    int count;
    HANDLE_ERROR(cudaGetDeviceCount(&count));
    printf("gpu num: %d\n", count);
    HANDLE_ERROR(cudaGetDeviceProperties(&prop, 0));
    printf("max thread num: %d\n", prop.maxThreadsPerBlock);
    printf("max grid dimensions: %d %d %d\n",
           prop.maxGridSize[0], prop.maxGridSize[1], prop.maxGridSize[2]);
    return prop.maxThreadsPerBlock;
}
// Each thread computes one output pixel of a 2D convolution.
// Pixels outside the image are treated as zero (zero padding).
__global__ void conv(const float *img, const float *kernel, float *result,
                     int width, int height, int kernelSize) {
    int id = blockIdx.x * blockDim.x + threadIdx.x;
    if (id >= width * height) {
        return;
    }
    int row = id / width;
    int col = id % width;

    // Accumulate into a local variable so the kernel never reads the
    // uninitialized result buffer (cudaMalloc does not zero memory).
    float sum = 0.0f;
    for (int i = 0; i < kernelSize; i++) {
        for (int j = 0; j < kernelSize; j++) {
            int curRow = row - kernelSize / 2 + i;
            int curCol = col - kernelSize / 2 + j;
            float imgValue = 0.0f;
            if (curRow >= 0 && curCol >= 0 && curRow < height && curCol < width) {
                imgValue = img[curRow * width + curCol];
            }
            sum += kernel[i * kernelSize + j] * imgValue;
        }
    }
    result[id] = sum;
}
int main(int argc, char *argv[]) {
    // Build a test image whose pixel value is (row + col) mod 256.
    int width = 800;
    int height = 600;
    float *img = new float[width * height];
    for (int row = 0; row < height; row++) {
        for (int col = 0; col < width; col++) {
            img[col + row * width] = (col + row) % 256;
        }
    }

    // 3x3 kernel whose rows are -1, 0, 1 (a simple horizontal gradient filter).
    int kernelSize = 3;
    float *kernel = new float[kernelSize * kernelSize];
    for (int i = 0; i < kernelSize * kernelSize; i++) {
        kernel[i] = i % kernelSize - 1;
    }
    // Allocate device buffers and copy the image and kernel to the GPU.
    float *imgGpu;
    float *kernelGpu;
    float *resultGpu;
    HANDLE_ERROR(cudaMalloc((void **)&imgGpu, width * height * sizeof(float)));
    HANDLE_ERROR(cudaMalloc((void **)&kernelGpu, kernelSize * kernelSize * sizeof(float)));
    HANDLE_ERROR(cudaMalloc((void **)&resultGpu, width * height * sizeof(float)));
    HANDLE_ERROR(cudaMemcpy(imgGpu, img, width * height * sizeof(float),
                            cudaMemcpyHostToDevice));
    HANDLE_ERROR(cudaMemcpy(kernelGpu, kernel, kernelSize * kernelSize * sizeof(float),
                            cudaMemcpyHostToDevice));
    // Launch one thread per pixel, rounding the block count up with integer
    // ceiling division, and check the launch for errors.
    int threadNum = getThreadNum();
    int blockNum = (width * height + threadNum - 1) / threadNum;
    conv<<<blockNum, threadNum>>>(imgGpu, kernelGpu, resultGpu, width, height, kernelSize);
    HANDLE_ERROR(cudaGetLastError());
    // Copy the result back; this cudaMemcpy waits for the kernel to finish.
    float *result = new float[width * height];
    HANDLE_ERROR(cudaMemcpy(result, resultGpu, width * height * sizeof(float),
                            cudaMemcpyDeviceToHost));
printf("img:\n"); for (int row = 0; row < 10; row++) { for (int col = 0; col < 10; col++) { printf("%2.0f ", img[col + row * width]); } printf("\n"); }
printf("kernel:\n"); for (int row = 0; row < kernelSize; row++) { for (int col = 0; col < kernelSize; col++) { printf("%2.0f ", kernel[col + row * kernelSize]); } printf("\n"); }
printf("result:\n"); for (int row = 0; row < 10; row++) { for (int col = 0; col < 10; col++) { printf("%2.0f ", result[col + row * width]); } printf("\n"); }
    // Release device and host memory.
    cudaFree(imgGpu);
    cudaFree(kernelGpu);
    cudaFree(resultGpu);
    delete[] img;
    delete[] kernel;
    delete[] result;

    return 0;
}