OpenCV Inpaint Using a CUDA Backend

OpenCV 只提供了 CPU 接口的 inpaint 函数。当需要使用 CUDA 进行图片处理时，数据在内存和显存之间反复迁移会影响计算速度。如果对填充效果的要求不是特别高，可以使用下面的 CUDA 算法进行简单的填充。

算法来源

算法主要参考了https://github.com/Po-Ting-lin/HairRemoval.git 中的填充代码。该项目主要是去除皮肤上的毛发。对于其如何寻找需要填充的区域就不讨论了，直接使用其分析出的mask图进行填充。

该填充算法对细线和小面积区域的填充效果还可以，对大面积区域的填充效果就很一般了。

1.需要填充的图片和 mask图

2.inpaint类

使用 InpaintCUDA 类实现 CUDA inpaint 算法，包括 inpaintcuda.h、inpaintcuda.cpp 和 Inpaintkernel.cu 3 个文件。

说明：
类初始化时填入图片的大小和通道。支持单通道或3通道。
函数 inpaint 进行图片填充
src – 需要填充的原图，支持 8 bit 图或 16 bit 图
mask – 要填充的区域 8bit图
dst – 填充好的图片是float类型
iters – 迭代次数。算法里每执行一次就会迭代 500 次，一般填 1 就够了；填 2 就是 1000 次，以此类推。

inpaintcuda.h

//inpaintcuda.h
#ifndef INPAINTCUDA_H
#define INPAINTCUDA_H

#include <opencv2/core.hpp>

#define INPAINT_TILE_X 32
#define INPAINT_TILE_Y 16
#define INPAINT_UNROLL_Y 1

/**
 * GPU inpainting helper based on an iterative heat-diffusion kernel.
 *
 * The constructor allocates three device work buffers sized from
 * cols * rows * channels and the destructor releases them. Because the
 * object owns raw device pointers, copying is disabled: a copied instance
 * would free the same device memory a second time.
 */
class InpaintCUDA
{
public:
    // Records the image geometry and allocates the device work buffers.
    InpaintCUDA(int cols,int rows, int channels);

    // Fills the region selected by mask in src and stores the float result in dst.
    // iters is the number of times the diffusion pass (_pdeHeatDiffusion) is run.
    void inpaint(cv::cuda::GpuMat& src, cv::cuda::GpuMat& mask, cv::cuda::GpuMat& dst ,int iters);

    // Runs one diffusion pass over every channel plane; all pointers are device pointers.
    void _pdeHeatDiffusion(uint8_t* d_normalized_mask, float* d_normalized_src, float* d_dst);

    ~InpaintCUDA();

    // Non-copyable: the destructor frees the device buffers exactly once.
    InpaintCUDA(const InpaintCUDA&) = delete;
    InpaintCUDA& operator=(const InpaintCUDA&) = delete;

private:
    int cols;      // image width in pixels
    int rows;      // image height in pixels
    int channels;  // number of channel planes processed
    int iters;     // iteration count passed to the most recent inpaint() call

    uint8_t* d_normalized_mask;           // device buffer: mask normalized to 0/1
    float* d_normalized_src;              // device buffer: normalized source, one plane per channel
    float* d_normalized_masked_src_temp;  // device buffer: working copy updated by the diffusion kernel

    // Integer division of a by b, rounded up when there is a remainder.
    int iDivUp(int a, int b);
};

#endif // INPAINTCUDA_H

inpaintcuda.cpp

//inpaintcuda.cpp
#include "inpaintcuda.h"

#include "cuda_runtime.h"
#include <opencv2/cudaarithm.hpp>
#include <opencv2/imgproc.hpp>
#include <opencv2/imgcodecs.hpp>
#include <iostream>

using namespace cv;

/**
 * Stores the image geometry and allocates the device work buffers.
 *
 * @param cols      image width in pixels (must be > 0)
 * @param rows      image height in pixels (must be > 0)
 * @param channels  number of channel planes (must be > 0)
 *
 * Throws cv::Exception if an argument is not positive or if a device
 * allocation fails; buffers allocated before the failure are released first.
 */
InpaintCUDA::InpaintCUDA(int cols, int rows, int channels)
    : cols(cols), rows(rows), channels(channels), iters(0),
      d_normalized_mask(nullptr), d_normalized_src(nullptr), d_normalized_masked_src_temp(nullptr)
{
    CV_Assert(cols > 0 && rows > 0 && channels > 0);

    // Compute the element count in size_t so large images cannot overflow int arithmetic.
    const size_t elementCount = static_cast<size_t>(cols) * static_cast<size_t>(rows) * static_cast<size_t>(channels);

    cudaError_t err = cudaMalloc(reinterpret_cast<void**>(&d_normalized_mask), elementCount * sizeof(uint8_t));
    if (err == cudaSuccess)
        err = cudaMalloc(reinterpret_cast<void**>(&d_normalized_src), elementCount * sizeof(float));
    if (err == cudaSuccess)
        err = cudaMalloc(reinterpret_cast<void**>(&d_normalized_masked_src_temp), elementCount * sizeof(float));

    if (err != cudaSuccess) {
        // The destructor does not run when a constructor throws, so clean up here.
        // Pointers that were never allocated are still nullptr, which cudaFree accepts.
        cudaFree(d_normalized_mask);
        cudaFree(d_normalized_src);
        cudaFree(d_normalized_masked_src_temp);
        d_normalized_mask = nullptr;
        d_normalized_src = nullptr;
        d_normalized_masked_src_temp = nullptr;
        CV_Error(cv::Error::GpuApiCallError, cudaGetErrorString(err));
    }
}

// Releases the three device buffers that the constructor allocated.
InpaintCUDA::~InpaintCUDA()
{
    void* const deviceBuffers[] = {
        d_normalized_mask,
        d_normalized_src,
        d_normalized_masked_src_temp
    };
    for (int idx = 0; idx < 3; ++idx) {
        cudaFree(deviceBuffers[idx]);
    }
}

// Integer division of a by b, increased by one whenever the division leaves a remainder.
int InpaintCUDA::iDivUp(int a, int b)
{
    const int quotient = a / b;
    if (a % b == 0) {
        return quotient;
    }
    return quotient + 1;
}

/**
 * Inpaints the masked region of src on the GPU.
 *
 * @param src    image to repair; must have the cols/rows/channels given to the constructor
 * @param mask   CV_8UC1 image of the same size; pixels with value 255 are the ones to fill
 * @param dst    receives the result as a 32-bit float image with the same channel count as src
 * @param iters  number of diffusion passes; each pass is one call to _pdeHeatDiffusion
 *
 * Throws cv::Exception when the inputs do not match the geometry the device
 * buffers were allocated for, or when the device-to-device copy fails.
 */
void InpaintCUDA::inpaint(cv::cuda::GpuMat& src, cv::cuda::GpuMat& mask, cv::cuda::GpuMat& dst, int iters)
{
    // The device buffers were sized in the constructor. A differently shaped input would
    // overrun them, or make OpenCV reallocate the GpuMat headers that wrap them so the
    // data never reaches the buffers the kernel reads.
    CV_Assert(src.rows == rows && src.cols == cols && src.channels() == channels);
    CV_Assert(mask.rows == rows && mask.cols == cols && mask.type() == CV_8UC1);

    this->iters = iters;
    const size_t planeSize = static_cast<size_t>(cols) * static_cast<size_t>(rows);

    // Normalize the mask to 0/1: with THRESH_BINARY_INV and threshold 254,
    // pixels equal to 255 become 0 (to be filled) and all others become 1 (kept).
    cuda::GpuMat normalized_mask_mat(rows, cols, CV_8UC1, d_normalized_mask);
    cuda::threshold(mask, normalized_mask_mat, 254, 1, THRESH_BINARY_INV);

    // Scale every channel so that its maximum becomes 1, remembering the original maximum
    // so the result can be scaled back afterwards.
    std::vector<cuda::GpuMat> m_list;
    cuda::split(src, m_list);
    std::vector<double> maxvalues;
    maxvalues.reserve(m_list.size());
    for (size_t n = 0; n < m_list.size(); n++) {
        cuda::GpuMat t(rows, cols, CV_32FC1, d_normalized_src + n * planeSize);
        m_list[n].convertTo(t, CV_32F);
        cuda::normalize(t, t, 1.0, 0.0, cv::NORM_INF, -1);

        double minval, maxval;
        cuda::minMax(m_list[n], &minval, &maxval);
        maxvalues.push_back(maxval);
    }

    // The working buffer starts as a copy of the normalized source.
    const cudaError_t copyErr = cudaMemcpy(d_normalized_masked_src_temp, d_normalized_src,
                                           planeSize * channels * sizeof(float), cudaMemcpyDeviceToDevice);
    if (copyErr != cudaSuccess) {
        CV_Error(cv::Error::GpuApiCallError, cudaGetErrorString(copyErr));
    }

    // Run the diffusion passes.
    for (int n = 0; n < iters; n++) {
        _pdeHeatDiffusion(d_normalized_mask, d_normalized_src, d_normalized_masked_src_temp);
    }

    // Scale the normalized result back to the original value range.
    if (channels == 1) {
        cuda::GpuMat temp_nor;
        cuda::multiply(cuda::GpuMat(rows, cols, CV_32FC1, d_normalized_masked_src_temp), maxvalues[0], temp_nor);
        temp_nor.copyTo(dst);
    }
    else {
        std::vector<cuda::GpuMat> temp_list;
        temp_list.reserve(channels);
        for (int n = 0; n < channels; n++) {
            cuda::GpuMat temp_nor;
            cuda::multiply(cuda::GpuMat(rows, cols, CV_32FC1, d_normalized_masked_src_temp + n * planeSize), maxvalues[n], temp_nor);
            temp_list.push_back(temp_nor);
        }
        cuda::GpuMat temp;
        cuda::merge(temp_list, temp);
        temp.copyTo(dst);
    }
}

Inpaintkernel.cu

//Inpaintkernel.cu
#include "InpaintCUDA.h"
#include <opencv2/imgcodecs.hpp>
#include <opencv2/cudaarithm.hpp>
#include <iostream>

__global__ void _pdeHeatDiffusionKernel(uint8_t* mask, float* src, float* tempSrc, int width, int height);

/**
 * Runs one diffusion pass: 500 launches of _pdeHeatDiffusionKernel per channel plane,
 * each channel on its own stream, followed by a device-wide synchronization.
 *
 * @param d_normalized_mask  device pointer to the 0/1 mask (one plane, shared by all channels)
 * @param d_normalized_src   device pointer to the normalized source planes
 * @param d_dst              device pointer to the working planes that are updated in place
 *
 * Launch layout: 2D blocks of INPAINT_TILE_X x (INPAINT_TILE_Y / INPAINT_UNROLL_Y) threads,
 * with a grid that covers cols x (rows / INPAINT_UNROLL_Y). No dynamic shared memory is used.
 * CUDA errors are reported on std::cerr; the function itself does not throw.
 */
void InpaintCUDA::_pdeHeatDiffusion(uint8_t* d_normalized_mask, float* d_normalized_src, float* d_dst) {

    // Value-initialize so that a stream whose creation fails stays 0 (the default stream)
    // instead of holding an indeterminate handle.
    cudaStream_t* streams = new cudaStream_t[channels]();
    for (int i = 0; i < channels; i++) {
        const cudaError_t createErr = cudaStreamCreate(&streams[i]);
        if (createErr != cudaSuccess) {
            std::cerr << "InpaintCUDA: cudaStreamCreate failed: " << cudaGetErrorString(createErr) << std::endl;
            streams[i] = 0;
        }
    }

    const dim3 block(INPAINT_TILE_X, INPAINT_TILE_Y / INPAINT_UNROLL_Y);
    const dim3 grid(iDivUp(cols, INPAINT_TILE_X), iDivUp(rows / INPAINT_UNROLL_Y, INPAINT_TILE_Y / INPAINT_UNROLL_Y));

    for (int k = 0; k < channels; k++) {
        // size_t keeps the plane offset from overflowing int on large images.
        const size_t offset = static_cast<size_t>(k) * cols * rows;

        // Launches on the same stream execute in order, so each launch sees the previous result.
        for (int i = 0; i < 500; i++) {
            _pdeHeatDiffusionKernel<<<grid, block, 0, streams[k]>>>(d_normalized_mask, d_normalized_src + offset, d_dst + offset, cols, rows);
        }
    }

    // Launch-configuration errors show up in cudaGetLastError; execution errors at the sync.
    const cudaError_t launchErr = cudaGetLastError();
    const cudaError_t syncErr = cudaDeviceSynchronize();
    if (launchErr != cudaSuccess) {
        std::cerr << "InpaintCUDA: kernel launch failed: " << cudaGetErrorString(launchErr) << std::endl;
    }
    if (syncErr != cudaSuccess) {
        std::cerr << "InpaintCUDA: kernel execution failed: " << cudaGetErrorString(syncErr) << std::endl;
    }

    for (int i = 0; i < channels; i++) {
        if (streams[i] != 0)
            cudaStreamDestroy(streams[i]);
    }
    // The original never released this array, leaking it on every call.
    delete[] streams;
}

/**
 * One heat-diffusion step over a single image plane.
 *
 * Expects a 2D launch where x indexes columns and y indexes rows; each thread handles
 * column x of row y and, when INPAINT_UNROLL_Y > 1, further rows spaced
 * height / INPAINT_UNROLL_Y apart. Planes are indexed as y * width + x (no row pitch).
 *
 * @param mask     per-pixel flag; non-zero pixels are left untouched, zero pixels are updated
 * @param src      original plane; not read, because the data-fidelity term it fed is
 *                 multiplied by mask[i], which is always 0 for the pixels that get updated
 * @param tempSrc  working plane, updated in place
 * @param width    plane width in pixels
 * @param height   plane height in pixels
 *
 * NOTE(review): tempSrc is read and written in the same launch without synchronization,
 * so a neighbour value may or may not already be updated by another thread. The original
 * code behaves the same way; double-buffering would make each step deterministic.
 */
__global__ void _pdeHeatDiffusionKernel(uint8_t* mask, float* src, float* tempSrc, int width, int height) {
    (void)src;  // kept only so the signature matches the forward declaration

    const int x = threadIdx.x + blockDim.x * blockIdx.x;
    const int yStart = threadIdx.y + blockDim.y * blockIdx.y;
    const int yStride = height / INPAINT_UNROLL_Y;
    if (x >= width || yStart >= yStride) return;

#pragma unroll
    for (int y = yStart; y < height; y += yStride) {
        const int i = y * width + x;

        // Pixels outside the region to fill keep their value. Use continue, not return,
        // so the remaining rows of this thread are still processed when INPAINT_UNROLL_Y > 1.
        if (mask[i]) {
            continue;
        }

        const float center = tempSrc[i];

        // Discrete Laplacian with neighbour indices clamped to the image border.
        tempSrc[i] = center
            + 0.2f
            * (tempSrc[max(0, y - 1) * width + x]
                + tempSrc[min(height - 1, y + 1) * width + x]
                + tempSrc[y * width + max(0, x - 1)]
                + tempSrc[y * width + min(width - 1, x + 1)]
                - 4.0f * center);
    }
}

3. main 函数调用方法。这里只写核心代码，包含的头文件和函数入口就省略了。

// Load the image to repair and the mask whose 255-valued pixels mark the area to fill.
Mat src = imread("demo1280.png", cv::ImreadModes::IMREAD_COLOR);
Mat mask = imread("mask_demo.png", cv::ImreadModes::IMREAD_GRAYSCALE);

// imread returns an empty Mat when a file cannot be read; stop before uploading garbage.
CV_Assert(!src.empty() && !mask.empty());
// The mask is applied per pixel, so it has to match the image size.
CV_Assert(mask.size() == src.size());

// Upload both images to the GPU; the result is float with the same channel count as src.
cuda::GpuMat src_g(src);
cuda::GpuMat mask_g(mask);
cuda::GpuMat dst_g(src.size(), CV_MAKETYPE(CV_32F, src.channels()));

// One call with iters = 1 runs a single diffusion pass.
InpaintCUDA ic(src.cols, src.rows , src.channels());
ic.inpaint(src_g, mask_g, dst_g, 1);

// Download the float result and convert it back to 8 bit for saving.
Mat dst_cpu;
dst_g.download(dst_cpu);
dst_cpu.convertTo(dst_cpu,CV_8U);
imwrite("dst_cpu.png", dst_cpu);

4.效果

一	二	三	四	五	六	日
1	2	3	4	5	6	7
8	9	10	11	12	13	14
15	16	17	18	19	20	21
22	23	24	25	26	27	28
29	30

YangYouji's WebSite

IT,VISION

发表回复取消回复

发表回复 取消回复

发表回复取消回复