CUDA 图像处理之 Sobel 边缘检测

贴上源代码吧，已在 Linux 下调试通过，包含源码和 CMakeLists.txt。

环境配置

配置好cuda pkg_config

配置好opencv pkg_config

后面的 CMakeLists.txt 会用到以上两项配置。

源码

#include "cuda_runtime.h"
#include <cuda.h>
#include <device_functions.h>
#include <opencv2/opencv.hpp>
#include <iostream>
#include <chrono>

using namespace std;
using namespace cv;

// GPU Sobel edge detector for a single-channel 8-bit image stored row-major
// with no row padding (row stride == imgWidth bytes).
//
// Neighbourhood naming used below:
//   x0 x1 x2
//   x3 x4 x5
//   x6 x7 x8
//
// Launch layout: 2D grid of 2D blocks, one thread per pixel. The grid must
// cover at least imgWidth x imgHeight threads; surplus threads exit early.
// No shared memory is used.
//
// Parameters:
//   in        - device pointer to imgHeight * imgWidth input bytes (read-only)
//   out       - device pointer to imgHeight * imgWidth output bytes; must not
//               overlap `in` (neighbouring threads read pixels this thread writes)
//   imgHeight - number of rows
//   imgWidth  - number of columns
//
// Output: out = min(255, (|Gx| + |Gy|) / 2) for interior pixels, 0 on the
// one-pixel border where the 3x3 window does not fit.
__global__ void sobel_gpu(const unsigned char *__restrict__ in, unsigned char *__restrict__ out, int imgHeight, int imgWidth)
{
    const int x = threadIdx.x + blockDim.x * blockIdx.x;
    const int y = threadIdx.y + blockDim.y * blockIdx.y;

    // The grid is rounded up to whole blocks, so some threads lie outside the image.
    if (x >= imgWidth || y >= imgHeight)
        return;

    // Widen before multiplying so very large images cannot overflow int.
    const size_t rowStart = (size_t)y * (size_t)imgWidth;
    const size_t index = rowStart + (size_t)x;

    // Border pixels have no full 3x3 neighbourhood. Write an explicit 0 so the
    // result never depends on the previous contents of the output buffer.
    if (x == 0 || y == 0 || x == imgWidth - 1 || y == imgHeight - 1)
    {
        out[index] = 0;
        return;
    }

    const unsigned char *rowUp = in + (rowStart - (size_t)imgWidth);
    const unsigned char *rowMid = in + rowStart;
    const unsigned char *rowDown = in + (rowStart + (size_t)imgWidth);

    // The centre pixel (x4) has zero weight in both Sobel kernels, so it is not loaded.
    const int x0 = rowUp[x - 1];
    const int x1 = rowUp[x];
    const int x2 = rowUp[x + 1];

    const int x3 = rowMid[x - 1];
    const int x5 = rowMid[x + 1];

    const int x6 = rowDown[x - 1];
    const int x7 = rowDown[x];
    const int x8 = rowDown[x + 1];

    const int Gx = x0 + 2 * x3 + x6 - x2 - 2 * x5 - x8;
    const int Gy = x0 + 2 * x1 + x2 - x6 - 2 * x7 - x8;

    // (|Gx| + |Gy|) / 2 can reach 765; clamp instead of letting the narrowing
    // conversion wrap strong edges around to dark values.
    const int magnitude = (abs(Gx) + abs(Gy)) / 2;
    out[index] = static_cast<unsigned char>(magnitude > 255 ? 255 : magnitude);
}

// CPU Sobel edge detector (reference implementation for the GPU kernel).
//
// Parameters:
//   srcImg    - single-channel 8-bit input image (CV_8UC1)
//   dstImg    - single-channel 8-bit output image (CV_8UC1); cv::Mat copies
//               share pixel storage, so writes here are visible to the caller
//   imgHeight - number of rows to process; must not exceed either Mat's rows
//   imgWidth  - number of columns to process; must not exceed either Mat's cols
//
// Writes min(255, (|Gx| + |Gy|) / 2) to every interior pixel. The one-pixel
// border is left untouched, so the caller decides its value (main zero-fills it).
// Throws cv::Exception (via CV_Assert) when the arguments are inconsistent.
void sobel_cpu(Mat srcImg, Mat dstImg, int imgHeight, int imgWidth)
{
    // Row pointers are indexed as raw bytes below, so reject anything that is
    // not 8-bit single-channel or is smaller than the requested region.
    CV_Assert(srcImg.type() == CV_8UC1 && dstImg.type() == CV_8UC1);
    CV_Assert(imgHeight <= srcImg.rows && imgHeight <= dstImg.rows);
    CV_Assert(imgWidth <= srcImg.cols && imgWidth <= dstImg.cols);

    for (int i = 1; i < imgHeight - 1; i++)
    {
        const unsigned char *dataUp = srcImg.ptr<unsigned char>(i - 1);
        const unsigned char *data = srcImg.ptr<unsigned char>(i);
        const unsigned char *dataDown = srcImg.ptr<unsigned char>(i + 1);
        unsigned char *out = dstImg.ptr<unsigned char>(i);

        for (int j = 1; j < imgWidth - 1; j++)
        {
            // Horizontal gradient: right column minus left column, weights 1-2-1.
            const int Gx = (dataUp[j + 1] + 2 * data[j + 1] + dataDown[j + 1])
                         - (dataUp[j - 1] + 2 * data[j - 1] + dataDown[j - 1]);
            // Vertical gradient: top row minus bottom row, weights 1-2-1.
            const int Gy = (dataUp[j - 1] + 2 * dataUp[j] + dataUp[j + 1])
                         - (dataDown[j - 1] + 2 * dataDown[j] + dataDown[j + 1]);

            // The averaged magnitude can reach 765; saturate to 255 instead of
            // wrapping when it is stored in 8 bits.
            out[j] = saturate_cast<unsigned char>((abs(Gx) + abs(Gy)) / 2);
        }
    }
}

// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess so callers can bail out on false.
static bool cuda_ok(cudaError_t status, const char *what)
{
    if (status == cudaSuccess)
        return true;
    std::cerr << what << " failed: " << cudaGetErrorString(status) << std::endl;
    return false;
}

// Milliseconds elapsed between two steady_clock readings.
static double elapsed_ms(std::chrono::steady_clock::time_point from,
                         std::chrono::steady_clock::time_point to)
{
    return std::chrono::duration<double, std::milli>(to - from).count();
}

// Loads "1.jpg" as grayscale, blurs it, runs the Sobel filter on the CPU and on
// the GPU, prints the time of each stage, and writes both results as BMP files.
// Returns 0 on success and 1 when the image cannot be read/written or a CUDA
// call fails.
int main()
{
    // Read the input image as single-channel grayscale.
    Mat grayImg = imread("1.jpg", IMREAD_GRAYSCALE);
    if (grayImg.empty())
    {
        std::cerr << "failed to read image: 1.jpg" << std::endl;
        return 1;
    }

    const int imgWidth = grayImg.cols;
    const int imgHeight = grayImg.rows;

    // Denoise the grayscale image before taking gradients.
    Mat gaussImg;
    GaussianBlur(grayImg, gaussImg, Size(3, 3), 0, 0, BORDER_DEFAULT);
    // The upload below copies gaussImg.data as one flat buffer, which is only
    // valid when the rows are stored back to back.
    if (!gaussImg.isContinuous())
        gaussImg = gaussImg.clone();

    // Zero-filled outputs: the border pixels are never touched by sobel_cpu.
    Mat dst_cpu(imgHeight, imgWidth, CV_8UC1, Scalar(0));
    Mat dst_gpu(imgHeight, imgWidth, CV_8UC1, Scalar(0));

    // ---- CPU reference ----
    // steady_clock is monotonic, so intervals are immune to wall-clock adjustments.
    auto start = std::chrono::steady_clock::now();
    sobel_cpu(gaussImg, dst_cpu, imgHeight, imgWidth);
    auto end = std::chrono::steady_clock::now();
    std::cout << "cpu process time: " << elapsed_ms(start, end) << "ms" << std::endl;

    // ---- Device buffers ----
    // Widen before multiplying so large images cannot overflow int.
    const size_t num = static_cast<size_t>(imgHeight) * static_cast<size_t>(imgWidth) * sizeof(unsigned char);
    unsigned char *in_gpu = nullptr;
    unsigned char *out_gpu = nullptr;
    // cudaFree(nullptr) performs no operation, so this is safe on every exit path.
    auto release = [&in_gpu, &out_gpu]() {
        cudaFree(in_gpu);
        cudaFree(out_gpu);
    };

    // cudaMalloc does not clear memory; zero the output so pixels the kernel
    // does not write (the image border) match the zero-filled CPU result.
    if (!cuda_ok(cudaMalloc((void **)&in_gpu, num), "cudaMalloc(in_gpu)") ||
        !cuda_ok(cudaMalloc((void **)&out_gpu, num), "cudaMalloc(out_gpu)") ||
        !cuda_ok(cudaMemset(out_gpu, 0, num), "cudaMemset(out_gpu)"))
    {
        release();
        return 1;
    }

    // Grid/block dimensions: 32x32 = 1024 threads per block, which is the
    // per-block limit. Round the grid up so it covers the whole image.
    const dim3 threadsPerBlock(32, 32);
    const dim3 blocksPerGrid((imgWidth + threadsPerBlock.x - 1) / threadsPerBlock.x,
                             (imgHeight + threadsPerBlock.y - 1) / threadsPerBlock.y);

    // ---- Host -> device copy ----
    start = std::chrono::steady_clock::now();
    if (!cuda_ok(cudaMemcpy(in_gpu, gaussImg.data, num, cudaMemcpyHostToDevice), "cudaMemcpy(host->device)"))
    {
        release();
        return 1;
    }
    end = std::chrono::steady_clock::now();
    std::cout << "cpu->gpu copy time: " << elapsed_ms(start, end) << "ms ";

    // ---- Kernel ----
    start = std::chrono::steady_clock::now();
    sobel_gpu<<<blocksPerGrid, threadsPerBlock>>>(in_gpu, out_gpu, imgHeight, imgWidth);
    // A launch returns immediately: cudaGetLastError catches a bad launch
    // configuration, and the synchronize both surfaces execution errors and
    // makes the timer measure the kernel itself rather than only the launch.
    if (!cuda_ok(cudaGetLastError(), "sobel_gpu launch") ||
        !cuda_ok(cudaDeviceSynchronize(), "sobel_gpu execution"))
    {
        release();
        return 1;
    }
    end = std::chrono::steady_clock::now();
    std::cout << " gpu process time: " << elapsed_ms(start, end) << "ms ";

    // ---- Device -> host copy ----
    start = std::chrono::steady_clock::now();
    if (!cuda_ok(cudaMemcpy(dst_gpu.data, out_gpu, num, cudaMemcpyDeviceToHost), "cudaMemcpy(device->host)"))
    {
        release();
        return 1;
    }
    end = std::chrono::steady_clock::now();
    std::cout << " gpu->cpu copy time: " << elapsed_ms(start, end) << "ms" << std::endl;

    release();

    // Save the results (uncomment imshow to display them instead).
    //imshow("gpu", dst_gpu);
    //imshow("cpu", dst_cpu);
    const bool gpuSaved = imwrite("gpu_process.bmp", dst_gpu);
    const bool cpuSaved = imwrite("cpu_process.bmp", dst_cpu);
    if (!gpuSaved || !cpuSaved)
    {
        std::cerr << "failed to write output image(s)" << std::endl;
        return 1;
    }

    return 0;
}

CMakeLists.txt

# Build script for the gpu_sobel demo: one CUDA source file linked against the
# CUDA runtime and OpenCV.
#
# CMAKE_CXX_STANDARD needs CMake >= 3.1, and current CMake releases refuse
# projects that request compatibility with versions older than 3.5.
cmake_minimum_required(VERSION 3.5)

project(gpu_sobel)

add_definitions(-std=c++11)
add_definitions(-DAPI_EXPORTS)
# option() takes (<variable> "<help text>" [value]); the help text is mandatory.
# Must be declared before find_package(CUDA) so FindCUDA picks up this value.
option(CUDA_USE_STATIC_CUDA_RUNTIME "Link against the static CUDA runtime" OFF)
set(CMAKE_CXX_STANDARD 11)
# Default to Debug, but let the user override it with -DCMAKE_BUILD_TYPE=Release.
if(NOT CMAKE_BUILD_TYPE)
  set(CMAKE_BUILD_TYPE Debug)
endif()

find_package(CUDA REQUIRED)

include_directories(${PROJECT_SOURCE_DIR}/include)
# Use the toolkit FindCUDA located instead of a hard-coded /usr/local/cuda-11.1,
# so headers, libraries and nvcc always come from the same installation.
include_directories(${CUDA_INCLUDE_DIRS})

# Fall back to the author's local OpenCV 4.4.0 install only when the caller did
# not supply -DOpenCV_DIR=<dir containing OpenCVConfig.cmake>.
if(NOT OpenCV_DIR)
  set(OpenCV_DIR /home/szfhy/lib/opencv4.4.0/lib/cmake/opencv4)
endif()
# REQUIRED: stop at configure time rather than failing later with missing
# headers and an empty OpenCV_LIBS.
find_package(OpenCV REQUIRED)
include_directories(${OpenCV_INCLUDE_DIRS})

# NOTE: the target name "gpu_soble" (sic) is kept so the produced binary name
# does not change for existing users.
cuda_add_executable(gpu_soble gpu_sobel.cu)

# CUDA_LIBRARIES holds the full path of the CUDA runtime chosen by FindCUDA.
target_link_libraries(gpu_soble ${CUDA_LIBRARIES})
target_link_libraries(gpu_soble ${OpenCV_LIBS})