OpenCV Inpaint with a CUDA Backend

OpenCV只提供了CPU接口的inpaint函数。当整个图片处理流程都在CUDA上进行时,为了调用它而在内存和显存之间反复拷贝数据会影响计算速度。在对填充效果要求不是特别高的情况下,可以使用如下CUDA算法进行简单的填充。

算法来源

算法主要参考了https://github.com/Po-Ting-lin/HairRemoval.git 中的填充代码。该项目主要是去除皮肤上的毛发。对于其如何寻找需要填充的区域就不讨论了,直接使用其分析出的mask图进行填充。

该填充算法对细线和小面积区域的填充效果还可以,对大面积区域的填充效果就很一般了。

1.需要填充的图片 和 mask图

2.inpaint类

使用InpaintCUDA类实现CUDA inpaint算法,包括inpaintcuda.h、inpaintcuda.cpp 和 Inpaintkernel.cu 3个文件。

说明:
类初始化时 填入图片的大小 和 通道。支持单通道或3通道。
函数 inpaint 进行图片填充
src- 需要填充的原图 支持8BIT图 或 16BIT图
mask – 要填充的区域 8bit图
dst – 填充好的图片 是float类型
iters – 迭代次数 算法里每次执行就会有500次 一般填1就够了。填2就是1000次 以此类推

inpaintcuda.h

//inpaintcuda.h
#ifndef INPAINTCUDA_H
#define INPAINTCUDA_H

#include <opencv2/core.hpp>

#define INPAINT_TILE_X 32
#define INPAINT_TILE_Y 16
#define INPAINT_UNROLL_Y 1

/**
 * Diffusion-based image inpainting on the GPU.
 *
 * The constructor allocates three device buffers sized for a fixed
 * cols x rows x channels image; they are released in the destructor.
 * Because the object owns raw device pointers, it is non-copyable: an
 * implicit copy would free the same buffers twice.
 */
class InpaintCUDA
{
public:
    /// Allocates the device work buffers for images of the given geometry.
    InpaintCUDA(int cols,int rows, int channels);

    /**
     * Fills the masked region of src and writes the float result to dst.
     * iters is the number of calls to _pdeHeatDiffusion(); each call
     * launches 500 diffusion sweeps per channel.
     */
    void inpaint(cv::cuda::GpuMat& src, cv::cuda::GpuMat& mask, cv::cuda::GpuMat& dst ,int iters);

    /// Runs one batch of diffusion sweeps over every channel plane of d_dst.
    void _pdeHeatDiffusion(uint8_t* d_normalized_mask, float* d_normalized_src, float* d_dst);

    ~InpaintCUDA();

    // The destructor calls cudaFree on the members below, so copying the
    // object would lead to a double free. Forbid it explicitly.
    InpaintCUDA(const InpaintCUDA&) = delete;
    InpaintCUDA& operator=(const InpaintCUDA&) = delete;

private:
    int cols;      // image width in pixels
    int rows;      // image height in pixels
    int channels;  // number of planes processed
    int iters;     // value passed to the most recent inpaint() call

    uint8_t* d_normalized_mask;           // device: thresholded mask (0 or 1 per pixel)
    float* d_normalized_src;              // device: planar normalized source
    float* d_normalized_masked_src_temp;  // device: planar working/result buffer

    /// Integer division of a by b, rounded up when there is a remainder.
    int iDivUp(int a, int b);
};

#endif // INPAINTCUDA_H

inpaintcuda.cpp

//inpaintcuda.cpp
#include "inpaintcuda.h"

#include "cuda_runtime.h"
#include <opencv2/cudaarithm.hpp>
#include <opencv2/imgproc.hpp>
#include <opencv2/imgcodecs.hpp>
#include <iostream>

using namespace cv;

/**
 * Stores the image geometry and allocates the three device work buffers.
 *
 * @param cols     image width in pixels (must be > 0)
 * @param rows     image height in pixels (must be > 0)
 * @param channels number of channels (must be > 0)
 *
 * Raises a cv::Exception if an argument is invalid or a device allocation
 * fails; buffers allocated before the failure are released first.
 */
InpaintCUDA::InpaintCUDA(int cols, int rows, int channels)
    : cols(cols), rows(rows), channels(channels), iters(0),
      d_normalized_mask(nullptr),
      d_normalized_src(nullptr),
      d_normalized_masked_src_temp(nullptr)
{
    CV_Assert(cols > 0 && rows > 0 && channels > 0);

    // Widen before multiplying so large images cannot overflow int.
    const size_t elements = static_cast<size_t>(cols) * static_cast<size_t>(rows) * static_cast<size_t>(channels);

    cudaError_t err = cudaMalloc((void**)&d_normalized_mask, elements * sizeof(uint8_t));
    if (err == cudaSuccess)
        err = cudaMalloc((void**)&d_normalized_src, elements * sizeof(float));
    if (err == cudaSuccess)
        err = cudaMalloc((void**)&d_normalized_masked_src_temp, elements * sizeof(float));

    if (err != cudaSuccess) {
        // The destructor does not run when a constructor throws, so clean up here.
        // cudaFree accepts null pointers, so buffers never allocated are harmless.
        cudaFree(d_normalized_mask);
        cudaFree(d_normalized_src);
        cudaFree(d_normalized_masked_src_temp);
        d_normalized_mask = nullptr;
        d_normalized_src = nullptr;
        d_normalized_masked_src_temp = nullptr;
        CV_Error(cv::Error::GpuApiCallError, cudaGetErrorString(err));
    }
}

/// Releases the three device buffers owned by this object.
InpaintCUDA::~InpaintCUDA()
{
    // Freed in the same order as they are declared in the class.
    void* const owned[3] = {
        d_normalized_mask,
        d_normalized_src,
        d_normalized_masked_src_temp
    };
    for (int idx = 0; idx < 3; ++idx) {
        cudaFree(owned[idx]);
    }
}

/// Returns a / b, plus one when the division leaves a remainder.
int InpaintCUDA::iDivUp(int a, int b)
{
    const int quotient = a / b;
    if (a % b == 0) {
        return quotient;
    }
    return quotient + 1;
}

/**
 * Inpaints the masked region of an image on the GPU.
 *
 * @param src   image to fill; must match the cols/rows/channels given to the
 *              constructor.
 * @param mask  CV_8UC1 image of the same size. Pixels with value 255 are
 *              filled; every other pixel is kept unchanged.
 * @param dst   receives the result as a 32-bit float image with the same
 *              channel count as src (reallocated by copyTo if necessary).
 * @param iters number of diffusion batches; each batch is one call to
 *              _pdeHeatDiffusion().
 *
 * Raises a cv::Exception when the inputs do not match the configured
 * geometry or when the device copy fails.
 */
void InpaintCUDA::inpaint(cv::cuda::GpuMat& src, cv::cuda::GpuMat& mask, cv::cuda::GpuMat& dst, int iters)
{
    // The GpuMat headers below wrap the pre-allocated device buffers. OpenCV
    // only writes into them in place when size and type already match; with a
    // mismatching input it would allocate fresh storage and the buffers used
    // by the diffusion would never be filled. More channels than configured
    // would also run past the end of the plane buffer.
    CV_Assert(src.rows == rows && src.cols == cols && src.channels() == channels);
    CV_Assert(mask.rows == rows && mask.cols == cols && mask.type() == CV_8UC1);

    this->iters = iters;
    const size_t planeSize = static_cast<size_t>(cols) * static_cast<size_t>(rows);
    std::vector<double> maxvalues;

    // Normalize the mask to 0/1: value 255 becomes 0 (fill), anything else 1 (keep).
    cuda::GpuMat normalized_mask_mat(rows, cols, CV_8UC1, d_normalized_mask);
    cuda::threshold(mask, normalized_mask_mat, 254, 1, THRESH_BINARY_INV);

    // Convert each channel to float and scale it by its infinity norm, storing
    // the planes back to back in d_normalized_src.
    std::vector<cuda::GpuMat> m_list;
    cuda::split(src, m_list);
    for (size_t n = 0; n < m_list.size(); n++) {
        cuda::GpuMat t(rows, cols, CV_32FC1, d_normalized_src + n * planeSize);
        m_list[n].convertTo(t, CV_32F);
        cuda::normalize(t, t, 1.0, 0.0, cv::NORM_INF, -1);

        // Remember the channel maximum so the result can be scaled back.
        // NOTE(review): this equals the infinity norm only for non-negative
        // data (e.g. 8/16-bit unsigned images) - confirm for signed inputs.
        double minval, maxval;
        cuda::minMax(m_list[n], &minval, &maxval);
        maxvalues.push_back(maxval);
    }

    // The working buffer starts as a copy of the normalized source.
    const cudaError_t copyErr = cudaMemcpy(d_normalized_masked_src_temp, d_normalized_src,
                                           planeSize * channels * sizeof(float), cudaMemcpyDeviceToDevice);
    if (copyErr != cudaSuccess) {
        CV_Error(cv::Error::GpuApiCallError, cudaGetErrorString(copyErr));
    }

    // Run the iterative fill.
    for (int n = 0; n < iters; n++) {
        _pdeHeatDiffusion(d_normalized_mask, d_normalized_src, d_normalized_masked_src_temp);
    }

    // Scale every plane back to the original value range.
    std::vector<cuda::GpuMat> temp_list;
    for (int n = 0; n < channels; n++) {
        cuda::GpuMat plane(rows, cols, CV_32FC1, d_normalized_masked_src_temp + n * planeSize);
        cuda::GpuMat temp_nor;
        cuda::multiply(plane, maxvalues[n], temp_nor);
        temp_list.push_back(temp_nor);
    }

    if (channels == 1) {
        temp_list[0].copyTo(dst);
    }
    else {
        cuda::GpuMat temp;
        cuda::merge(temp_list, temp);
        temp.copyTo(dst);
    }
}

Inpaintkernel.cu

//Inpaintkernel.cu
#include "InpaintCUDA.h"
#include <opencv2/imgcodecs.hpp>
#include <opencv2/cudaarithm.hpp>
#include <iostream>

__global__ void _pdeHeatDiffusionKernel(uint8_t* mask, float* src, float* tempSrc, int width, int height);

/**
 * Runs one batch of 500 diffusion sweeps on every channel plane.
 *
 * Each channel gets its own CUDA stream; the sweeps of one channel are
 * queued on that stream, and the function waits for the whole device before
 * returning.
 *
 * @param d_normalized_mask device pointer to the 0/1 mask (one plane, shared
 *                          by all channels)
 * @param d_normalized_src  device pointer to the planar normalized source
 * @param d_dst             device pointer to the planar buffer updated in place
 *
 * Raises a cv::Exception if stream creation, a kernel launch, or the final
 * synchronization reports a CUDA error.
 */
void InpaintCUDA::_pdeHeatDiffusion(uint8_t* d_normalized_mask, float* d_normalized_src, float* d_dst) {
    const int sweepsPerCall = 500;

    cudaStream_t* streams = new cudaStream_t[channels];
    int createdStreams = 0;
    cudaError_t err = cudaSuccess;

    for (; createdStreams < channels; createdStreams++) {
        err = cudaStreamCreate(&streams[createdStreams]);
        if (err != cudaSuccess) break;
    }

    if (err == cudaSuccess) {
        dim3 block(INPAINT_TILE_X, INPAINT_TILE_Y / INPAINT_UNROLL_Y);
        dim3 grid(iDivUp(cols, INPAINT_TILE_X), iDivUp(rows / INPAINT_UNROLL_Y, INPAINT_TILE_Y / INPAINT_UNROLL_Y));

        for (int k = 0; k < channels; k++) {
            // Planes are stored back to back; widen to avoid int overflow.
            const size_t offset = static_cast<size_t>(k) * cols * rows;

            for (int i = 0; i < sweepsPerCall; i++) {
                _pdeHeatDiffusionKernel<<<grid, block, 0, streams[k]>>>(d_normalized_mask, d_normalized_src + offset, d_dst + offset, cols, rows);
            }
        }

        // Launch-configuration errors are reported by cudaGetLastError,
        // execution errors by the synchronizing call.
        err = cudaGetLastError();
        const cudaError_t syncErr = cudaDeviceSynchronize();
        if (err == cudaSuccess) err = syncErr;
    }

    for (int i = 0; i < createdStreams; i++)
        cudaStreamDestroy(streams[i]);
    // The stream array was allocated with new[] and must be released on every call.
    delete[] streams;

    if (err != cudaSuccess) {
        CV_Error(cv::Error::GpuApiCallError, cudaGetErrorString(err));
    }
}

/**
 * One in-place diffusion sweep over a single image plane.
 *
 * Launch layout: 2D grid of 2D blocks; thread (x, y) handles pixel column x
 * and the rows y, y + height / INPAINT_UNROLL_Y, ... below `height`.
 * No shared memory is used.
 *
 * @param mask    per-pixel flag; non-zero pixels are left untouched,
 *                zero pixels are updated
 * @param src     unused by the update; kept so the signature matches the
 *                forward declaration and existing launches
 * @param tempSrc plane that is read and written in place
 * @param width   plane width in pixels
 * @param height  plane height in pixels
 *
 * Neighbour values are read from the same buffer other threads are writing,
 * so a sweep may mix values from before and after the current update.
 */
__global__ void _pdeHeatDiffusionKernel(uint8_t* mask, float* src, float* tempSrc, int width, int height) {
    const int x = threadIdx.x + blockDim.x * blockIdx.x;
    int y = threadIdx.y + blockDim.y * blockIdx.y;
    const int rowStride = height / INPAINT_UNROLL_Y;
    if (x >= width || y >= rowStride) return;

#pragma unroll
    for (; y < height; y += rowStride) {
        const int i = y * width + x;

        // Pixels outside the fill region keep their value. Use `continue`
        // rather than `return` so the remaining rows handled by this thread
        // are still processed when INPAINT_UNROLL_Y > 1.
        if (mask[i]) {
            continue;
        }

        const float center = tempSrc[i];

        // Explicit diffusion step with step size 0.2 on the 4-neighbourhood;
        // neighbour indices are clamped at the image border. The data-fidelity
        // term of the original formula is scaled by mask[i], which is always 0
        // here, so it is omitted.
        tempSrc[i] = center
            + 0.2f
            * (tempSrc[max(0, y - 1) * width + x]
                + tempSrc[min(height - 1, y + 1) * width + x]
                + tempSrc[y * width + max(0, x - 1)]
                + tempSrc[y * width + min(width - 1, x + 1)]
                - 4.0f * center);
    }
}

3.main函数调用方法 我只写核心代码 包含文件和函数入口就省略了

// Load the image to repair and the mask that marks the pixels to fill.
Mat src = imread("demo1280.png", cv::ImreadModes::IMREAD_COLOR);
Mat mask = imread("mask_demo.png", cv::ImreadModes::IMREAD_GRAYSCALE);

// imread returns an empty Mat when the file cannot be read; stop here
// instead of handing empty images to the GPU code.
CV_Assert(!src.empty() && !mask.empty());
CV_Assert(mask.size() == src.size());

// Upload both images to the device.
cuda::GpuMat src_g(src);
cuda::GpuMat mask_g(mask);
// The result is float with the same channel count as the source.
cuda::GpuMat dst_g(src.size(), CV_MAKETYPE(CV_32F, src.channels()));

InpaintCUDA ic(src.cols, src.rows , src.channels());
ic.inpaint(src_g, mask_g, dst_g, 1);

// Download the float result, convert it back to 8-bit and save it.
Mat dst_cpu;
dst_g.download(dst_cpu);
dst_cpu.convertTo(dst_cpu,CV_8U);
imwrite("dst_cpu.png", dst_cpu);

4.效果

发表回复

您的电子邮箱地址不会被公开。 必填项已用*标注