Convolution

main.cu

#include <stdio.h>
#include <stdlib.h>

#define HANDLE_ERROR(err) (handleError(err, __FILE__, __LINE__))

void handleError(cudaError_t err, const char *file, int line) {
    if (err != cudaSuccess) {
        printf("%s in %s at line %d\n", cudaGetErrorString(err), file, line);
        exit(EXIT_FAILURE);
    }
}

// Query the device and return the maximum number of threads per block,
// which is used below to size the launch configuration.
int getThreadNum() {
    cudaDeviceProp prop;
    int count;

    HANDLE_ERROR(cudaGetDeviceCount(&count));
    printf("gpu num: %d\n", count);
    HANDLE_ERROR(cudaGetDeviceProperties(&prop, 0));
    printf("max thread num: %d\n", prop.maxThreadsPerBlock);
    printf("max grid dimensions: %d %d %d\n", prop.maxGridSize[0],
           prop.maxGridSize[1], prop.maxGridSize[2]);
    return prop.maxThreadsPerBlock;
}

// Each thread computes one output pixel of a zero-padded 2D convolution.
__global__ void conv(const float *img, const float *kernel, float *result,
                     int width, int height, int kernelSize) {
    int id = blockIdx.x * blockDim.x + threadIdx.x;
    if (id >= width * height) {
        return;
    }

    int row = id / width;
    int col = id % width;

    // Accumulate in a register. cudaMalloc does not zero device memory,
    // so accumulating directly into result[] with += would read garbage.
    float sum = 0.0f;
    for (int i = 0; i < kernelSize; i++) {
        for (int j = 0; j < kernelSize; j++) {
            int curRow = row - kernelSize / 2 + i;
            int curCol = col - kernelSize / 2 + j;
            // Pixels outside the image are treated as zero (zero padding).
            float imgValue = 0.0f;
            if (curRow >= 0 && curCol >= 0 && curRow < height &&
                curCol < width) {
                imgValue = img[curRow * width + curCol];
            }
            sum += kernel[i * kernelSize + j] * imgValue;
        }
    }
    result[id] = sum;
}

int main(int argc, char *argv[]) {
    int width = 800;
    int height = 600;
    float *img = new float[width * height];
    for (int row = 0; row < height; row++) {
        for (int col = 0; col < width; col++) {
            img[col + row * width] = (col + row) % 256;
        }
    }

    // A 3x3 kernel whose columns are -1, 0, 1: a Prewitt-style
    // horizontal gradient filter.
    int kernelSize = 3;
    float *kernel = new float[kernelSize * kernelSize];
    for (int i = 0; i < kernelSize * kernelSize; i++) {
        kernel[i] = i % kernelSize - 1;
    }

    float *imgGpu;
    float *kernelGpu;
    float *resultGpu;

    HANDLE_ERROR(cudaMalloc((void **)&imgGpu, width * height * sizeof(float)));
    HANDLE_ERROR(cudaMalloc((void **)&kernelGpu,
                            kernelSize * kernelSize * sizeof(float)));
    HANDLE_ERROR(
        cudaMalloc((void **)&resultGpu, width * height * sizeof(float)));

    HANDLE_ERROR(cudaMemcpy(imgGpu, img, width * height * sizeof(float),
                            cudaMemcpyHostToDevice));
    HANDLE_ERROR(cudaMemcpy(kernelGpu, kernel,
                            kernelSize * kernelSize * sizeof(float),
                            cudaMemcpyHostToDevice));

    // One thread per pixel; round the block count up with integer
    // ceiling division so every pixel is covered.
    int threadNum = getThreadNum();
    int blockNum = (width * height + threadNum - 1) / threadNum;
    conv<<<blockNum, threadNum>>>(imgGpu, kernelGpu, resultGpu, width, height,
                                  kernelSize);
    HANDLE_ERROR(cudaGetLastError());

    float *result = new float[width * height];
    HANDLE_ERROR(cudaMemcpy(result, resultGpu, width * height * sizeof(float),
                            cudaMemcpyDeviceToHost));

    // Visualization: print the top-left 10x10 corner of image and result.
    printf("img:\n");
    for (int row = 0; row < 10; row++) {
        for (int col = 0; col < 10; col++) {
            printf("%2.0f ", img[col + row * width]);
        }
        printf("\n");
    }

    printf("kernel:\n");
    for (int row = 0; row < kernelSize; row++) {
        for (int col = 0; col < kernelSize; col++) {
            printf("%2.0f ", kernel[col + row * kernelSize]);
        }
        printf("\n");
    }

    printf("result:\n");
    for (int row = 0; row < 10; row++) {
        for (int col = 0; col < 10; col++) {
            printf("%2.0f ", result[col + row * width]);
        }
        printf("\n");
    }

    HANDLE_ERROR(cudaFree(imgGpu));
    HANDLE_ERROR(cudaFree(kernelGpu));
    HANDLE_ERROR(cudaFree(resultGpu));
    delete[] img;
    delete[] kernel;
    delete[] result;

    return 0;
}
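
Zero-padding and accumulation bugs are easy to introduce on the GPU side, so a plain CPU reference of the same convolution is handy for cross-checking. The sketch below is not part of the original program; convCpu is a hypothetical helper added only for verification, following the same zero-padding convention as the kernel above.

// CPU reference for the same zero-padded convolution (hypothetical helper,
// added here only to cross-check the GPU output).
void convCpu(const float *img, const float *kernel, float *result,
             int width, int height, int kernelSize) {
    for (int row = 0; row < height; row++) {
        for (int col = 0; col < width; col++) {
            float sum = 0.0f;
            for (int i = 0; i < kernelSize; i++) {
                for (int j = 0; j < kernelSize; j++) {
                    int curRow = row - kernelSize / 2 + i;
                    int curCol = col - kernelSize / 2 + j;
                    if (curRow >= 0 && curCol >= 0 &&
                        curRow < height && curCol < width) {
                        sum += kernel[i * kernelSize + j] *
                               img[curRow * width + curCol];
                    }
                }
            }
            result[col + row * width] = sum;
        }
    }
}

Running convCpu over img and comparing element-wise with the copied-back GPU result should show exact matches, since both versions apply the same float operations in the same order for each pixel.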

Output:

gpu num: 1
max thread num: 1024
max grid dimensions: 2147483647 65535 65535
img:
0 1 2 3 4 5 6 7 8 9
1 2 3 4 5 6 7 8 9 10
2 3 4 5 6 7 8 9 10 11
3 4 5 6 7 8 9 10 11 12
4 5 6 7 8 9 10 11 12 13
5 6 7 8 9 10 11 12 13 14
6 7 8 9 10 11 12 13 14 15
7 8 9 10 11 12 13 14 15 16
8 9 10 11 12 13 14 15 16 17
9 10 11 12 13 14 15 16 17 18
kernel:
-1 0 1
-1 0 1
-1 0 1
result:
3 4 4 4 4 4 4 4 4 4
6 6 6 6 6 6 6 6 6 6
9 6 6 6 6 6 6 6 6 6
12 6 6 6 6 6 6 6 6 6
15 6 6 6 6 6 6 6 6 6
18 6 6 6 6 6 6 6 6 6
21 6 6 6 6 6 6 6 6 6
24 6 6 6 6 6 6 6 6 6
27 6 6 6 6 6 6 6 6 6
30 6 6 6 6 6 6 6 6 6
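
These numbers are easy to verify by hand. In this corner of the image img[row][col] = row + col (the values are well below 256), and the kernel's middle column is zero, so each of the three covered rows contributes (right neighbor) − (left neighbor) = 2, giving 3 × 2 = 6 for interior pixels. Along the top row only two image rows are in bounds, giving 2 × 2 = 4. In the first column the left neighbors fall outside the image and count as zero, so the result is img[row−1][1] + img[row][1] + img[row+1][1] = 3·row + 3 (and 3 at the top-left corner, where only two rows contribute), which matches the 3, 6, 9, …, 30 column above.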